From 6b82f3cb2d480b7714eb0ff61aee99c22160389e Mon Sep 17 00:00:00 2001
From: Chuck Ebbert <cebbert@redhat.com>
Date: Tue, 14 Apr 2009 07:37:40 -0400
Subject: ext4: really print the find_group_flex fallback warning only once

Missing braces caused the warning to print more than once.

Signed-Off-By: Chuck Ebbert <cebbert@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ialloc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 47b84e8..cbce5aa 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -831,11 +831,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
 		ret2 = find_group_flex(sb, dir, &group);
 		if (ret2 == -1) {
 			ret2 = find_group_other(sb, dir, &group, mode);
-			if (ret2 == 0 && once)
+			if (ret2 == 0 && once) {
 				once = 0;
 				printk(KERN_NOTICE "ext4: find_group_flex "
 				       "failed, fallback succeeded dir %lu\n",
 				       dir->i_ino);
+			}
 		}
 		goto got_group;
 	}
-- 
cgit v1.1


From 67c457a8c378a006a34d92f9bd3078a80a92f250 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 14 Apr 2009 07:50:56 -0400
Subject: jbd2: use SWRITE_SYNC_PLUG when writing synchronous revoke records

The revoke records must be written using the same way as the rest of
the blocks during the commit process; that is, either marked as
synchronous writes or as asynchornous writes.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/commit.c |  3 ++-
 fs/jbd2/revoke.c | 21 ++++++++++++---------
 2 files changed, 14 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 073c8c3..0b7d3b8 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -506,7 +506,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	if (err)
 		jbd2_journal_abort(journal, err);
 
-	jbd2_journal_write_revoke_records(journal, commit_transaction);
+	jbd2_journal_write_revoke_records(journal, commit_transaction,
+					  write_op);
 
 	jbd_debug(3, "JBD: commit phase 2\n");
 
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index bbe6d59..a360b06 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -86,6 +86,7 @@
 #include <linux/slab.h>
 #include <linux/list.h>
 #include <linux/init.h>
+#include <linux/bio.h>
 #endif
 #include <linux/log2.h>
 
@@ -118,8 +119,8 @@ struct jbd2_revoke_table_s
 #ifdef __KERNEL__
 static void write_one_revoke_record(journal_t *, transaction_t *,
 				    struct journal_head **, int *,
-				    struct jbd2_revoke_record_s *);
-static void flush_descriptor(journal_t *, struct journal_head *, int);
+				    struct jbd2_revoke_record_s *, int);
+static void flush_descriptor(journal_t *, struct journal_head *, int, int);
 #endif
 
 /* Utility functions to maintain the revoke table */
@@ -499,7 +500,8 @@ void jbd2_journal_switch_revoke_table(journal_t *journal)
  * revoke hash, deleting the entries as we go.
  */
 void jbd2_journal_write_revoke_records(journal_t *journal,
-				  transaction_t *transaction)
+				       transaction_t *transaction,
+				       int write_op)
 {
 	struct journal_head *descriptor;
 	struct jbd2_revoke_record_s *record;
@@ -523,14 +525,14 @@ void jbd2_journal_write_revoke_records(journal_t *journal,
 				hash_list->next;
 			write_one_revoke_record(journal, transaction,
 						&descriptor, &offset,
-						record);
+						record, write_op);
 			count++;
 			list_del(&record->hash);
 			kmem_cache_free(jbd2_revoke_record_cache, record);
 		}
 	}
 	if (descriptor)
-		flush_descriptor(journal, descriptor, offset);
+		flush_descriptor(journal, descriptor, offset, write_op);
 	jbd_debug(1, "Wrote %d revoke records\n", count);
 }
 
@@ -543,7 +545,8 @@ static void write_one_revoke_record(journal_t *journal,
 				    transaction_t *transaction,
 				    struct journal_head **descriptorp,
 				    int *offsetp,
-				    struct jbd2_revoke_record_s *record)
+				    struct jbd2_revoke_record_s *record,
+				    int write_op)
 {
 	struct journal_head *descriptor;
 	int offset;
@@ -562,7 +565,7 @@ static void write_one_revoke_record(journal_t *journal,
 	/* Make sure we have a descriptor with space left for the record */
 	if (descriptor) {
 		if (offset == journal->j_blocksize) {
-			flush_descriptor(journal, descriptor, offset);
+			flush_descriptor(journal, descriptor, offset, write_op);
 			descriptor = NULL;
 		}
 	}
@@ -607,7 +610,7 @@ static void write_one_revoke_record(journal_t *journal,
 
 static void flush_descriptor(journal_t *journal,
 			     struct journal_head *descriptor,
-			     int offset)
+			     int offset, int write_op)
 {
 	jbd2_journal_revoke_header_t *header;
 	struct buffer_head *bh = jh2bh(descriptor);
@@ -622,7 +625,7 @@ static void flush_descriptor(journal_t *journal,
 	set_buffer_jwrite(bh);
 	BUFFER_TRACE(bh, "write");
 	set_buffer_dirty(bh);
-	ll_rw_block(SWRITE, 1, &bh);
+	ll_rw_block((write_op == WRITE) ? SWRITE : SWRITE_SYNC_PLUG, 1, &bh);
 }
 #endif
 
-- 
cgit v1.1


From 38d726d153cfe5efe5fe22d28d36ab382dda3a5c Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 14 Apr 2009 10:10:47 -0400
Subject: jbd: use SWRITE_SYNC_PLUG when writing synchronous revoke records

The revoke records must be written using the same way as the rest of
the blocks during the commit process; that is, either marked as
synchronous writes or as asynchornous writes.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd/commit.c |  2 +-
 fs/jbd/revoke.c | 20 +++++++++++---------
 2 files changed, 12 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index a8e8513..06560c5 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -502,7 +502,7 @@ void journal_commit_transaction(journal_t *journal)
 		err = 0;
 	}
 
-	journal_write_revoke_records(journal, commit_transaction);
+	journal_write_revoke_records(journal, commit_transaction, write_op);
 
 	/*
 	 * If we found any dirty or locked buffers, then we should have
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index c7bd649..1b1a06e 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -67,6 +67,7 @@
 #include <linux/slab.h>
 #include <linux/list.h>
 #include <linux/init.h>
+#include <linux/bio.h>
 #endif
 #include <linux/log2.h>
 
@@ -99,8 +100,8 @@ struct jbd_revoke_table_s
 #ifdef __KERNEL__
 static void write_one_revoke_record(journal_t *, transaction_t *,
 				    struct journal_head **, int *,
-				    struct jbd_revoke_record_s *);
-static void flush_descriptor(journal_t *, struct journal_head *, int);
+				    struct jbd_revoke_record_s *, int);
+static void flush_descriptor(journal_t *, struct journal_head *, int, int);
 #endif
 
 /* Utility functions to maintain the revoke table */
@@ -486,7 +487,7 @@ void journal_switch_revoke_table(journal_t *journal)
  */
 
 void journal_write_revoke_records(journal_t *journal,
-				  transaction_t *transaction)
+				  transaction_t *transaction, int write_op)
 {
 	struct journal_head *descriptor;
 	struct jbd_revoke_record_s *record;
@@ -510,14 +511,14 @@ void journal_write_revoke_records(journal_t *journal,
 				hash_list->next;
 			write_one_revoke_record(journal, transaction,
 						&descriptor, &offset,
-						record);
+						record, write_op);
 			count++;
 			list_del(&record->hash);
 			kmem_cache_free(revoke_record_cache, record);
 		}
 	}
 	if (descriptor)
-		flush_descriptor(journal, descriptor, offset);
+		flush_descriptor(journal, descriptor, offset, write_op);
 	jbd_debug(1, "Wrote %d revoke records\n", count);
 }
 
@@ -530,7 +531,8 @@ static void write_one_revoke_record(journal_t *journal,
 				    transaction_t *transaction,
 				    struct journal_head **descriptorp,
 				    int *offsetp,
-				    struct jbd_revoke_record_s *record)
+				    struct jbd_revoke_record_s *record,
+				    int write_op)
 {
 	struct journal_head *descriptor;
 	int offset;
@@ -549,7 +551,7 @@ static void write_one_revoke_record(journal_t *journal,
 	/* Make sure we have a descriptor with space left for the record */
 	if (descriptor) {
 		if (offset == journal->j_blocksize) {
-			flush_descriptor(journal, descriptor, offset);
+			flush_descriptor(journal, descriptor, offset, write_op);
 			descriptor = NULL;
 		}
 	}
@@ -586,7 +588,7 @@ static void write_one_revoke_record(journal_t *journal,
 
 static void flush_descriptor(journal_t *journal,
 			     struct journal_head *descriptor,
-			     int offset)
+			     int offset, int write_op)
 {
 	journal_revoke_header_t *header;
 	struct buffer_head *bh = jh2bh(descriptor);
@@ -601,7 +603,7 @@ static void flush_descriptor(journal_t *journal,
 	set_buffer_jwrite(bh);
 	BUFFER_TRACE(bh, "write");
 	set_buffer_dirty(bh);
-	ll_rw_block(SWRITE, 1, &bh);
+	ll_rw_block((write_op == WRITE) ? SWRITE : SWRITE_SYNC_PLUG, 1, &bh);
 }
 #endif
 
-- 
cgit v1.1


From 52fcd11c0900b0cbc584eeda12a6e27dd6c9d046 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 20 Apr 2009 08:58:45 +0100
Subject: GFS2: Clear dirty bit at end of inode glock sync

The dirty bit can get set during the inode glock sync. Its too
complicated to change that at the moment, so this is the quick
fix - to clear the bit again at the end of the function.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glops.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index bf23a62..70f87f4 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -156,6 +156,12 @@ static void inode_go_sync(struct gfs2_glock *gl)
 	error = filemap_fdatawait(metamapping);
 	mapping_set_error(metamapping, error);
 	gfs2_ail_empty_gl(gl);
+	/*
+	 * Writeback of the data mapping may cause the dirty flag to be set
+	 * so we have to clear it again here.
+	 */
+	smp_mb__before_clear_bit();
+	clear_bit(GLF_DIRTY, &gl->gl_flags);
 }
 
 /**
-- 
cgit v1.1


From e56985da455b9dc0591b8cb2006cc94b6f4fb0f4 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 20 Apr 2009 09:45:54 +0100
Subject: GFS2: Fix page_mkwrite() return code

This allows for the possibility of returning VM_FAULT_OOM as
well as VM_FAULT_SIGBUS. This ensures that the correct action
is taken.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/ops_file.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 101caf3..5d82e91 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -413,7 +413,9 @@ out_unlock:
 	gfs2_glock_dq(&gh);
 out:
 	gfs2_holder_uninit(&gh);
-	if (ret)
+	if (ret == -ENOMEM)
+		ret = VM_FAULT_OOM;
+	else if (ret)
 		ret = VM_FAULT_SIGBUS;
 	return ret;
 }
-- 
cgit v1.1


From ffbd517d5a8c8e93ddd11046434fb029f3df73aa Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 20 Apr 2009 15:50:09 -0400
Subject: Btrfs: use WRITE_SYNC for synchronous writes

Part of reducing fsync/O_SYNC/O_DIRECT latencies is using WRITE_SYNC for
writes we plan on waiting on in the near future.  This patch
mirrors recent changes in other filesystems and the generic code to
use WRITE_SYNC when WB_SYNC_ALL is passed and to use WRITE_SYNC for
other latency critical writes.

Btrfs uses async worker threads for checksumming before the write is done,
and then again to actually submit the bios.  The bio submission code just
runs a per-device list of bios that need to be sent down the pipe.

This list is split into low priority and high priority lists so the
WRITE_SYNC IO happens first.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c      |   4 +-
 fs/btrfs/extent_io.c    |  44 +++++++++++------
 fs/btrfs/ordered-data.c |   2 +-
 fs/btrfs/volumes.c      | 124 +++++++++++++++++++++++++++++++++++++-----------
 fs/btrfs/volumes.h      |  13 ++++-
 5 files changed, 141 insertions(+), 46 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 92caa80..fec18b4 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2095,10 +2095,10 @@ static int write_dev_supers(struct btrfs_device *device,
 				device->barriers = 0;
 				get_bh(bh);
 				lock_buffer(bh);
-				ret = submit_bh(WRITE, bh);
+				ret = submit_bh(WRITE_SYNC, bh);
 			}
 		} else {
-			ret = submit_bh(WRITE, bh);
+			ret = submit_bh(WRITE_SYNC, bh);
 		}
 
 		if (!ret && wait) {
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index eb2bee8..483b672 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -50,7 +50,10 @@ struct extent_page_data {
 	/* tells writepage not to lock the state bits for this range
 	 * it still does the unlocking
 	 */
-	int extent_locked;
+	unsigned int extent_locked:1;
+
+	/* tells the submit_bio code to use a WRITE_SYNC */
+	unsigned int sync_io:1;
 };
 
 int __init extent_io_init(void)
@@ -2136,8 +2139,14 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	u64 delalloc_end;
 	int page_started;
 	int compressed;
+	int write_flags;
 	unsigned long nr_written = 0;
 
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		write_flags = WRITE_SYNC_PLUG;
+	else
+		write_flags = WRITE;
+
 	WARN_ON(!PageLocked(page));
 	pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
 	if (page->index > end_index ||
@@ -2314,9 +2323,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 				       (unsigned long long)end);
 			}
 
-			ret = submit_extent_page(WRITE, tree, page, sector,
-						 iosize, pg_offset, bdev,
-						 &epd->bio, max_nr,
+			ret = submit_extent_page(write_flags, tree, page,
+						 sector, iosize, pg_offset,
+						 bdev, &epd->bio, max_nr,
 						 end_bio_extent_writepage,
 						 0, 0, 0);
 			if (ret)
@@ -2460,15 +2469,23 @@ retry:
 	return ret;
 }
 
-static noinline void flush_write_bio(void *data)
+static void flush_epd_write_bio(struct extent_page_data *epd)
 {
-	struct extent_page_data *epd = data;
 	if (epd->bio) {
-		submit_one_bio(WRITE, epd->bio, 0, 0);
+		if (epd->sync_io)
+			submit_one_bio(WRITE_SYNC, epd->bio, 0, 0);
+		else
+			submit_one_bio(WRITE, epd->bio, 0, 0);
 		epd->bio = NULL;
 	}
 }
 
+static noinline void flush_write_bio(void *data)
+{
+	struct extent_page_data *epd = data;
+	flush_epd_write_bio(epd);
+}
+
 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 			  get_extent_t *get_extent,
 			  struct writeback_control *wbc)
@@ -2480,6 +2497,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 		.tree = tree,
 		.get_extent = get_extent,
 		.extent_locked = 0,
+		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
 	};
 	struct writeback_control wbc_writepages = {
 		.bdi		= wbc->bdi,
@@ -2490,13 +2508,11 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 		.range_end	= (loff_t)-1,
 	};
 
-
 	ret = __extent_writepage(page, wbc, &epd);
 
 	extent_write_cache_pages(tree, mapping, &wbc_writepages,
 				 __extent_writepage, &epd, flush_write_bio);
-	if (epd.bio)
-		submit_one_bio(WRITE, epd.bio, 0, 0);
+	flush_epd_write_bio(&epd);
 	return ret;
 }
 
@@ -2515,6 +2531,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
 		.tree = tree,
 		.get_extent = get_extent,
 		.extent_locked = 1,
+		.sync_io = mode == WB_SYNC_ALL,
 	};
 	struct writeback_control wbc_writepages = {
 		.bdi		= inode->i_mapping->backing_dev_info,
@@ -2540,8 +2557,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
 		start += PAGE_CACHE_SIZE;
 	}
 
-	if (epd.bio)
-		submit_one_bio(WRITE, epd.bio, 0, 0);
+	flush_epd_write_bio(&epd);
 	return ret;
 }
 
@@ -2556,13 +2572,13 @@ int extent_writepages(struct extent_io_tree *tree,
 		.tree = tree,
 		.get_extent = get_extent,
 		.extent_locked = 0,
+		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
 	};
 
 	ret = extent_write_cache_pages(tree, mapping, wbc,
 				       __extent_writepage, &epd,
 				       flush_write_bio);
-	if (epd.bio)
-		submit_one_bio(WRITE, epd.bio, 0, 0);
+	flush_epd_write_bio(&epd);
 	return ret;
 }
 
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 53c87b1..d6f0806 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -489,7 +489,7 @@ again:
 	/* start IO across the range first to instantiate any delalloc
 	 * extents
 	 */
-	btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE);
+	btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL);
 
 	/* The compression code will leave pages locked but return from
 	 * writepage without setting the page writeback.  Starting again
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e0913e4..e53835b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -125,6 +125,20 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
 	return NULL;
 }
 
+static void requeue_list(struct btrfs_pending_bios *pending_bios,
+			struct bio *head, struct bio *tail)
+{
+
+	struct bio *old_head;
+
+	old_head = pending_bios->head;
+	pending_bios->head = head;
+	if (pending_bios->tail)
+		tail->bi_next = old_head;
+	else
+		pending_bios->tail = tail;
+}
+
 /*
  * we try to collect pending bios for a device so we don't get a large
  * number of procs sending bios down to the same device.  This greatly
@@ -141,10 +155,12 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
 	struct bio *pending;
 	struct backing_dev_info *bdi;
 	struct btrfs_fs_info *fs_info;
+	struct btrfs_pending_bios *pending_bios;
 	struct bio *tail;
 	struct bio *cur;
 	int again = 0;
-	unsigned long num_run = 0;
+	unsigned long num_run;
+	unsigned long num_sync_run;
 	unsigned long limit;
 	unsigned long last_waited = 0;
 
@@ -153,20 +169,30 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
 	limit = btrfs_async_submit_limit(fs_info);
 	limit = limit * 2 / 3;
 
+	/* we want to make sure that every time we switch from the sync
+	 * list to the normal list, we unplug
+	 */
+	num_sync_run = 0;
+
 loop:
 	spin_lock(&device->io_lock);
+	num_run = 0;
 
 loop_lock:
+
 	/* take all the bios off the list at once and process them
 	 * later on (without the lock held).  But, remember the
 	 * tail and other pointers so the bios can be properly reinserted
 	 * into the list if we hit congestion
 	 */
-	pending = device->pending_bios;
-	tail = device->pending_bio_tail;
+	if (device->pending_sync_bios.head)
+		pending_bios = &device->pending_sync_bios;
+	else
+		pending_bios = &device->pending_bios;
+
+	pending = pending_bios->head;
+	tail = pending_bios->tail;
 	WARN_ON(pending && !tail);
-	device->pending_bios = NULL;
-	device->pending_bio_tail = NULL;
 
 	/*
 	 * if pending was null this time around, no bios need processing
@@ -176,16 +202,41 @@ loop_lock:
 	 * device->running_pending is used to synchronize with the
 	 * schedule_bio code.
 	 */
-	if (pending) {
-		again = 1;
-		device->running_pending = 1;
-	} else {
+	if (device->pending_sync_bios.head == NULL &&
+	    device->pending_bios.head == NULL) {
 		again = 0;
 		device->running_pending = 0;
+	} else {
+		again = 1;
+		device->running_pending = 1;
 	}
+
+	pending_bios->head = NULL;
+	pending_bios->tail = NULL;
+
 	spin_unlock(&device->io_lock);
 
+	/*
+	 * if we're doing the regular priority list, make sure we unplug
+	 * for any high prio bios we've sent down
+	 */
+	if (pending_bios == &device->pending_bios && num_sync_run > 0) {
+		num_sync_run = 0;
+		blk_run_backing_dev(bdi, NULL);
+	}
+
 	while (pending) {
+
+		rmb();
+		if (pending_bios != &device->pending_sync_bios &&
+		    device->pending_sync_bios.head &&
+		    num_run > 16) {
+			cond_resched();
+			spin_lock(&device->io_lock);
+			requeue_list(pending_bios, pending, tail);
+			goto loop_lock;
+		}
+
 		cur = pending;
 		pending = pending->bi_next;
 		cur->bi_next = NULL;
@@ -196,10 +247,18 @@ loop_lock:
 			wake_up(&fs_info->async_submit_wait);
 
 		BUG_ON(atomic_read(&cur->bi_cnt) == 0);
-		bio_get(cur);
 		submit_bio(cur->bi_rw, cur);
-		bio_put(cur);
 		num_run++;
+		if (bio_sync(cur))
+			num_sync_run++;
+
+		if (need_resched()) {
+			if (num_sync_run) {
+				blk_run_backing_dev(bdi, NULL);
+				num_sync_run = 0;
+			}
+			cond_resched();
+		}
 
 		/*
 		 * we made progress, there is more work to do and the bdi
@@ -208,7 +267,6 @@ loop_lock:
 		 */
 		if (pending && bdi_write_congested(bdi) && num_run > 16 &&
 		    fs_info->fs_devices->open_devices > 1) {
-			struct bio *old_head;
 			struct io_context *ioc;
 
 			ioc = current->io_context;
@@ -233,17 +291,17 @@ loop_lock:
 				 * against it before looping
 				 */
 				last_waited = ioc->last_waited;
+				if (need_resched()) {
+					if (num_sync_run) {
+						blk_run_backing_dev(bdi, NULL);
+						num_sync_run = 0;
+					}
+					cond_resched();
+				}
 				continue;
 			}
 			spin_lock(&device->io_lock);
-
-			old_head = device->pending_bios;
-			device->pending_bios = pending;
-			if (device->pending_bio_tail)
-				tail->bi_next = old_head;
-			else
-				device->pending_bio_tail = tail;
-
+			requeue_list(pending_bios, pending, tail);
 			device->running_pending = 1;
 
 			spin_unlock(&device->io_lock);
@@ -251,11 +309,18 @@ loop_lock:
 			goto done;
 		}
 	}
+
+	if (num_sync_run) {
+		num_sync_run = 0;
+		blk_run_backing_dev(bdi, NULL);
+	}
+
+	cond_resched();
 	if (again)
 		goto loop;
 
 	spin_lock(&device->io_lock);
-	if (device->pending_bios)
+	if (device->pending_bios.head || device->pending_sync_bios.head)
 		goto loop_lock;
 	spin_unlock(&device->io_lock);
 
@@ -2497,7 +2562,7 @@ again:
 			max_errors = 1;
 		}
 	}
-	if (multi_ret && rw == WRITE &&
+	if (multi_ret && (rw & (1 << BIO_RW)) &&
 	    stripes_allocated < stripes_required) {
 		stripes_allocated = map->num_stripes;
 		free_extent_map(em);
@@ -2762,6 +2827,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
 				 int rw, struct bio *bio)
 {
 	int should_queue = 1;
+	struct btrfs_pending_bios *pending_bios;
 
 	/* don't bother with additional async steps for reads, right now */
 	if (!(rw & (1 << BIO_RW))) {
@@ -2783,13 +2849,17 @@ static noinline int schedule_bio(struct btrfs_root *root,
 	bio->bi_rw |= rw;
 
 	spin_lock(&device->io_lock);
+	if (bio_sync(bio))
+		pending_bios = &device->pending_sync_bios;
+	else
+		pending_bios = &device->pending_bios;
 
-	if (device->pending_bio_tail)
-		device->pending_bio_tail->bi_next = bio;
+	if (pending_bios->tail)
+		pending_bios->tail->bi_next = bio;
 
-	device->pending_bio_tail = bio;
-	if (!device->pending_bios)
-		device->pending_bios = bio;
+	pending_bios->tail = bio;
+	if (!pending_bios->head)
+		pending_bios->head = bio;
 	if (device->running_pending)
 		should_queue = 0;
 
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 2185de7..5836327 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -23,13 +23,22 @@
 #include "async-thread.h"
 
 struct buffer_head;
+struct btrfs_pending_bios {
+	struct bio *head;
+	struct bio *tail;
+};
+
 struct btrfs_device {
 	struct list_head dev_list;
 	struct list_head dev_alloc_list;
 	struct btrfs_fs_devices *fs_devices;
 	struct btrfs_root *dev_root;
-	struct bio *pending_bios;
-	struct bio *pending_bio_tail;
+
+	/* regular prio bios */
+	struct btrfs_pending_bios pending_bios;
+	/* WRITE_SYNC bios */
+	struct btrfs_pending_bios pending_sync_bios;
+
 	int running_pending;
 	u64 generation;
 
-- 
cgit v1.1


From d313d7a31a752c88f7288692bd98e66d0789779b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 20 Apr 2009 15:50:09 -0400
Subject: Btrfs: add a priority queue to the async thread helpers

Btrfs is using WRITE_SYNC_PLUG to send down synchronous IOs with a
higher priority.  But, the checksumming helper threads prevent it
from being fully effective.

There are two problems.  First, a big queue of pending checksumming
will delay the synchronous IO behind other lower priority writes.  Second,
the checksumming uses an ordered async work queue.  The ordering makes sure
that IOs are sent to the block layer in the same order they are sent
to the checksumming threads.  Usually this gives us less seeky IO.

But, when we start mixing IO priorities, the lower priority IO can delay
the higher priority IO.

This patch solves both problems by adding a high priority list to the async
helper threads, and a new btrfs_set_work_high_prio(), which is used
to make put a new async work item onto the higher priority list.

The ordering is still done on high priority IO, but all of the high
priority bios are ordered separately from the low priority bios.  This
ordering is purely an IO optimization, it is not involved in data
or metadata integrity.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/async-thread.c | 60 ++++++++++++++++++++++++++++++++++++++-----------
 fs/btrfs/async-thread.h |  2 ++
 fs/btrfs/disk-io.c      |  5 +++++
 fs/btrfs/extent_io.c    |  2 +-
 fs/btrfs/file.c         |  2 +-
 5 files changed, 56 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 51bfdfc..502c3d6 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -25,6 +25,7 @@
 #define WORK_QUEUED_BIT 0
 #define WORK_DONE_BIT 1
 #define WORK_ORDER_DONE_BIT 2
+#define WORK_HIGH_PRIO_BIT 3
 
 /*
  * container for the kthread task pointer and the list of pending work
@@ -36,6 +37,7 @@ struct btrfs_worker_thread {
 
 	/* list of struct btrfs_work that are waiting for service */
 	struct list_head pending;
+	struct list_head prio_pending;
 
 	/* list of worker threads from struct btrfs_workers */
 	struct list_head worker_list;
@@ -103,10 +105,16 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers,
 
 	spin_lock_irqsave(&workers->lock, flags);
 
-	while (!list_empty(&workers->order_list)) {
-		work = list_entry(workers->order_list.next,
-				  struct btrfs_work, order_list);
-
+	while (1) {
+		if (!list_empty(&workers->prio_order_list)) {
+			work = list_entry(workers->prio_order_list.next,
+					  struct btrfs_work, order_list);
+		} else if (!list_empty(&workers->order_list)) {
+			work = list_entry(workers->order_list.next,
+					  struct btrfs_work, order_list);
+		} else {
+			break;
+		}
 		if (!test_bit(WORK_DONE_BIT, &work->flags))
 			break;
 
@@ -143,8 +151,14 @@ static int worker_loop(void *arg)
 	do {
 		spin_lock_irq(&worker->lock);
 again_locked:
-		while (!list_empty(&worker->pending)) {
-			cur = worker->pending.next;
+		while (1) {
+			if (!list_empty(&worker->prio_pending))
+				cur = worker->prio_pending.next;
+			else if (!list_empty(&worker->pending))
+				cur = worker->pending.next;
+			else
+				break;
+
 			work = list_entry(cur, struct btrfs_work, list);
 			list_del(&work->list);
 			clear_bit(WORK_QUEUED_BIT, &work->flags);
@@ -163,7 +177,6 @@ again_locked:
 
 			spin_lock_irq(&worker->lock);
 			check_idle_worker(worker);
-
 		}
 		if (freezing(current)) {
 			worker->working = 0;
@@ -178,7 +191,8 @@ again_locked:
 				 * jump_in?
 				 */
 				smp_mb();
-				if (!list_empty(&worker->pending))
+				if (!list_empty(&worker->pending) ||
+				    !list_empty(&worker->prio_pending))
 					continue;
 
 				/*
@@ -191,7 +205,8 @@ again_locked:
 				 */
 				schedule_timeout(1);
 				smp_mb();
-				if (!list_empty(&worker->pending))
+				if (!list_empty(&worker->pending) ||
+				    !list_empty(&worker->prio_pending))
 					continue;
 
 				if (kthread_should_stop())
@@ -200,7 +215,8 @@ again_locked:
 				/* still no more work?, sleep for real */
 				spin_lock_irq(&worker->lock);
 				set_current_state(TASK_INTERRUPTIBLE);
-				if (!list_empty(&worker->pending))
+				if (!list_empty(&worker->pending) ||
+				    !list_empty(&worker->prio_pending))
 					goto again_locked;
 
 				/*
@@ -248,6 +264,7 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
 	INIT_LIST_HEAD(&workers->worker_list);
 	INIT_LIST_HEAD(&workers->idle_list);
 	INIT_LIST_HEAD(&workers->order_list);
+	INIT_LIST_HEAD(&workers->prio_order_list);
 	spin_lock_init(&workers->lock);
 	workers->max_workers = max;
 	workers->idle_thresh = 32;
@@ -273,6 +290,7 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
 		}
 
 		INIT_LIST_HEAD(&worker->pending);
+		INIT_LIST_HEAD(&worker->prio_pending);
 		INIT_LIST_HEAD(&worker->worker_list);
 		spin_lock_init(&worker->lock);
 		atomic_set(&worker->num_pending, 0);
@@ -396,7 +414,10 @@ int btrfs_requeue_work(struct btrfs_work *work)
 		goto out;
 
 	spin_lock_irqsave(&worker->lock, flags);
-	list_add_tail(&work->list, &worker->pending);
+	if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
+		list_add_tail(&work->list, &worker->prio_pending);
+	else
+		list_add_tail(&work->list, &worker->pending);
 	atomic_inc(&worker->num_pending);
 
 	/* by definition we're busy, take ourselves off the idle
@@ -422,6 +443,11 @@ out:
 	return 0;
 }
 
+void btrfs_set_work_high_prio(struct btrfs_work *work)
+{
+	set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
+}
+
 /*
  * places a struct btrfs_work into the pending queue of one of the kthreads
  */
@@ -438,7 +464,12 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
 	worker = find_worker(workers);
 	if (workers->ordered) {
 		spin_lock_irqsave(&workers->lock, flags);
-		list_add_tail(&work->order_list, &workers->order_list);
+		if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) {
+			list_add_tail(&work->order_list,
+				      &workers->prio_order_list);
+		} else {
+			list_add_tail(&work->order_list, &workers->order_list);
+		}
 		spin_unlock_irqrestore(&workers->lock, flags);
 	} else {
 		INIT_LIST_HEAD(&work->order_list);
@@ -446,7 +477,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
 
 	spin_lock_irqsave(&worker->lock, flags);
 
-	list_add_tail(&work->list, &worker->pending);
+	if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
+		list_add_tail(&work->list, &worker->prio_pending);
+	else
+		list_add_tail(&work->list, &worker->pending);
 	atomic_inc(&worker->num_pending);
 	check_busy_worker(worker);
 
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 31be4ed..1b511c1 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -85,6 +85,7 @@ struct btrfs_workers {
 	 * of work items waiting for completion
 	 */
 	struct list_head order_list;
+	struct list_head prio_order_list;
 
 	/* lock for finding the next worker thread to queue on */
 	spinlock_t lock;
@@ -98,4 +99,5 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
 int btrfs_stop_workers(struct btrfs_workers *workers);
 void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max);
 int btrfs_requeue_work(struct btrfs_work *work);
+void btrfs_set_work_high_prio(struct btrfs_work *work);
 #endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index fec18b4..a6b8374 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -579,6 +579,10 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 	async->bio_flags = bio_flags;
 
 	atomic_inc(&fs_info->nr_async_submits);
+
+	if (rw & (1 << BIO_RW_SYNCIO))
+		btrfs_set_work_high_prio(&async->work);
+
 	btrfs_queue_worker(&fs_info->workers, &async->work);
 #if 0
 	int limit = btrfs_async_submit_limit(fs_info);
@@ -656,6 +660,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 		return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
 				     mirror_num, 0);
 	}
+
 	/*
 	 * kthread helpers are used to submit writes so that checksumming
 	 * can happen in parallel across all CPUs
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 483b672..5d66cb2 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2501,7 +2501,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 	};
 	struct writeback_control wbc_writepages = {
 		.bdi		= wbc->bdi,
-		.sync_mode	= WB_SYNC_NONE,
+		.sync_mode	= wbc->sync_mode,
 		.older_than_this = NULL,
 		.nr_to_write	= 64,
 		.range_start	= page_offset(page) + PAGE_CACHE_SIZE,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9c9fb46..e21c006 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1131,7 +1131,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		if (will_write) {
 			btrfs_fdatawrite_range(inode->i_mapping, pos,
 					       pos + write_bytes - 1,
-					       WB_SYNC_NONE);
+					       WB_SYNC_ALL);
 		} else {
 			balance_dirty_pages_ratelimited_nr(inode->i_mapping,
 							   num_pages);
-- 
cgit v1.1


From 11c8349b4eb68f2b04cd8ece577377e6c0e5dd4b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 20 Apr 2009 15:50:09 -0400
Subject: Btrfs: fix oops on page->mapping->host during writepage

The extent_io writepage call updates the writepage index in the inode
as it makes progress.  But, it was doing the update after unlocking the page,
which isn't legal because page->mapping can't be trusted once the page
is unlocked.

This lead to an oops, especially common with compression turned on.  The
fix here is to update the writeback index before unlocking the page.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c | 40 ++++++++++++++++++++++++++++++++--------
 1 file changed, 32 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 5d66cb2..05a1c42 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2104,6 +2104,16 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
 	return ret;
 }
 
+static noinline void update_nr_written(struct page *page,
+				      struct writeback_control *wbc,
+				      unsigned long nr_written)
+{
+	wbc->nr_to_write -= nr_written;
+	if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
+	    wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
+		page->mapping->writeback_index = page->index + nr_written;
+}
+
 /*
  * the writepage semantics are similar to regular writepage.  extent
  * records are inserted to lock ranges in the tree, and as dirty areas
@@ -2173,6 +2183,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	delalloc_end = 0;
 	page_started = 0;
 	if (!epd->extent_locked) {
+		/*
+		 * make sure the wbc mapping index is at least updated
+		 * to this page.
+		 */
+		update_nr_written(page, wbc, 0);
+
 		while (delalloc_end < page_end) {
 			nr_delalloc = find_lock_delalloc_range(inode, tree,
 						       page,
@@ -2194,7 +2210,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 		 */
 		if (page_started) {
 			ret = 0;
-			goto update_nr_written;
+			/*
+			 * we've unlocked the page, so we can't update
+			 * the mapping's writeback index, just update
+			 * nr_to_write.
+			 */
+			wbc->nr_to_write -= nr_written;
+			goto done_unlocked;
 		}
 	}
 	lock_extent(tree, start, page_end, GFP_NOFS);
@@ -2207,13 +2229,18 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 		if (ret == -EAGAIN) {
 			unlock_extent(tree, start, page_end, GFP_NOFS);
 			redirty_page_for_writepage(wbc, page);
+			update_nr_written(page, wbc, nr_written);
 			unlock_page(page);
 			ret = 0;
-			goto update_nr_written;
+			goto done_unlocked;
 		}
 	}
 
-	nr_written++;
+	/*
+	 * we don't want to touch the inode after unlocking the page,
+	 * so we update the mapping writeback index now
+	 */
+	update_nr_written(page, wbc, nr_written + 1);
 
 	end = page_end;
 	if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0))
@@ -2345,11 +2372,8 @@ done:
 		unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
 	unlock_page(page);
 
-update_nr_written:
-	wbc->nr_to_write -= nr_written;
-	if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
-	    wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
-		page->mapping->writeback_index = page->index + nr_written;
+done_unlocked:
+
 	return 0;
 }
 
-- 
cgit v1.1


From 8c594ea81d7abbbffdda447b127f8ba8d76f319d Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 20 Apr 2009 15:50:10 -0400
Subject: Btrfs: use the right node in reada_for_balance

reada_for_balance was using the wrong index into the path node array,
so it wasn't reading the right blocks.  We never directly used the
results of the read done by this function because the btree search is
started over at the end.

This fixes reada_for_balance to reada in the correct node and to
avoid searching past the last slot in the node.  It also makes sure to
hold the parent lock while we are finding the nodes to read.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index e5b2533..a99f1c2 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1325,12 +1325,12 @@ static noinline int reada_for_balance(struct btrfs_root *root,
 	int ret = 0;
 	int blocksize;
 
-	parent = path->nodes[level - 1];
+	parent = path->nodes[level + 1];
 	if (!parent)
 		return 0;
 
 	nritems = btrfs_header_nritems(parent);
-	slot = path->slots[level];
+	slot = path->slots[level + 1];
 	blocksize = btrfs_level_size(root, level);
 
 	if (slot > 0) {
@@ -1341,7 +1341,7 @@ static noinline int reada_for_balance(struct btrfs_root *root,
 			block1 = 0;
 		free_extent_buffer(eb);
 	}
-	if (slot < nritems) {
+	if (slot + 1 < nritems) {
 		block2 = btrfs_node_blockptr(parent, slot + 1);
 		gen = btrfs_node_ptr_generation(parent, slot + 1);
 		eb = btrfs_find_tree_block(root, block2, blocksize);
@@ -1351,7 +1351,11 @@ static noinline int reada_for_balance(struct btrfs_root *root,
 	}
 	if (block1 || block2) {
 		ret = -EAGAIN;
+
+		/* release the whole path */
 		btrfs_release_path(root, path);
+
+		/* read the blocks */
 		if (block1)
 			readahead_tree_block(root, block1, blocksize, 0);
 		if (block2)
@@ -1361,7 +1365,7 @@ static noinline int reada_for_balance(struct btrfs_root *root,
 			eb = read_tree_block(root, block1, blocksize, 0);
 			free_extent_buffer(eb);
 		}
-		if (block1) {
+		if (block2) {
 			eb = read_tree_block(root, block2, blocksize, 0);
 			free_extent_buffer(eb);
 		}
@@ -1481,12 +1485,15 @@ read_block_for_search(struct btrfs_trans_handle *trans,
 	 * of the btree by dropping locks before
 	 * we read.
 	 */
-	btrfs_release_path(NULL, p);
+	btrfs_unlock_up_safe(p, level + 1);
+	btrfs_set_path_blocking(p);
+
 	if (tmp)
 		free_extent_buffer(tmp);
 	if (p->reada)
 		reada_for_search(root, p, level, slot, key->objectid);
 
+	btrfs_release_path(NULL, p);
 	tmp = read_tree_block(root, blocknr, blocksize, gen);
 	if (tmp)
 		free_extent_buffer(tmp);
-- 
cgit v1.1


From cf2706a340ae98616d4e2a54900393e0e0b6b72c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 7 Apr 2009 09:03:30 -0400
Subject: Fix AUTOFS_DEV_IOCTL_REQUESTER_CMD

Missing conversion from kernel to userland dev_t; this sucker
breaks as soon as we get sufficiently many autofs mounts for
new_encode_dev(s_dev) != s_dev.

Note: this is the minimal fix.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs4/dev-ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 9e5ae8a..463f798 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -488,7 +488,7 @@ static int autofs_dev_ioctl_requester(struct file *fp,
 	}
 
 	path = param->path;
-	devid = sbi->sb->s_dev;
+	devid = new_encode_dev(sbi->sb->s_dev);
 
 	param->requester.uid = param->requester.gid = -1;
 
-- 
cgit v1.1


From e5d67f0715bc60f6c19abdd86d007d7bb29c9951 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 7 Apr 2009 12:15:39 -0400
Subject: Touch all affected namespaces on propagation of mount

We shouldn't just touch the namespace of current process

Caught-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index d9138f8..4119620 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1377,7 +1377,7 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
 	if (parent_path) {
 		detach_mnt(source_mnt, parent_path);
 		attach_mnt(source_mnt, path);
-		touch_mnt_namespace(current->nsproxy->mnt_ns);
+		touch_mnt_namespace(parent_path->mnt->mnt_ns);
 	} else {
 		mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt);
 		commit_tree(source_mnt);
-- 
cgit v1.1


From 1644ccc8a99ae73859c39372f96afdbf03c9f80d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 18 Apr 2009 02:32:31 -0400
Subject: Safer nfsd_cross_mnt()

AFAICS, we have a subtle bug there: if we have crossed mountpoint
*and* it got mount --move'd away, we'll be holding only one
reference to fs containing dentry - exp->ex_path.mnt.  IOW, we
ought to dput() before exp_put().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nfsd/vfs.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index ab93fcf..46e6bd2 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -116,10 +116,15 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
 	}
 	if ((exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) {
 		/* successfully crossed mount point */
-		exp_put(exp);
-		*expp = exp2;
+		/*
+		 * This is subtle: dentry is *not* under mnt at this point.
+		 * The only reason we are safe is that original mnt is pinned
+		 * down by exp, so we should dput before putting exp.
+		 */
 		dput(dentry);
 		*dpp = mounts;
+		exp_put(exp);
+		*expp = exp2;
 	} else {
 		exp_put(exp2);
 		dput(mounts);
-- 
cgit v1.1


From 24b6f16ecf37f918a1934d590e9e71c100d6388f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 18 Apr 2009 03:25:41 -0400
Subject: No need for crossing to mountpoint in audit_tag_tree()

is_under() will DTRT anyway.  And yes, is_subdir() behaviour
is intentional.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 761d30b..1fcffeb 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2149,7 +2149,6 @@ int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
 	int result;
 	unsigned long seq;
 
-	/* FIXME: This is old behavior, needed? Please check callers. */
 	if (new_dentry == old_dentry)
 		return 1;
 
-- 
cgit v1.1


From 117aff744a20a2a04ccdb36cd5978316e1af0c3a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 18 Apr 2009 11:19:26 -0400
Subject: Fix autofs_expire()

mnt should remain the same for all iterations through the list;
as it is, if we have a busy mount, mnt follows into it and isn't
restored for the next iteration.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs/dirhash.c | 34 ++++++++++++++++++----------------
 1 file changed, 18 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs/dirhash.c b/fs/autofs/dirhash.c
index bf8c8af..4eb4d8d 100644
--- a/fs/autofs/dirhash.c
+++ b/fs/autofs/dirhash.c
@@ -39,10 +39,12 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb,
 {
 	struct autofs_dirhash *dh = &sbi->dirhash;
 	struct autofs_dir_ent *ent;
-	struct dentry *dentry;
 	unsigned long timeout = sbi->exp_timeout;
 
 	while (1) {
+		struct path path;
+		int umount_ok;
+
 		if ( list_empty(&dh->expiry_head) || sbi->catatonic )
 			return NULL;	/* No entries */
 		/* We keep the list sorted by last_usage and want old stuff */
@@ -57,17 +59,17 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb,
 			return ent; /* Symlinks are always expirable */
 
 		/* Get the dentry for the autofs subdirectory */
-		dentry = ent->dentry;
+		path.dentry = ent->dentry;
 
-		if ( !dentry ) {
+		if (!path.dentry) {
 			/* Should only happen in catatonic mode */
 			printk("autofs: dentry == NULL but inode range is directory, entry %s\n", ent->name);
 			autofs_delete_usage(ent);
 			continue;
 		}
 
-		if ( !dentry->d_inode ) {
-			dput(dentry);
+		if (!path.dentry->d_inode) {
+			dput(path.dentry);
 			printk("autofs: negative dentry on expiry queue: %s\n",
 			       ent->name);
 			autofs_delete_usage(ent);
@@ -76,29 +78,29 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb,
 
 		/* Make sure entry is mounted and unused; note that dentry will
 		   point to the mounted-on-top root. */
-		if (!S_ISDIR(dentry->d_inode->i_mode)||!d_mountpoint(dentry)) {
+		if (!S_ISDIR(path.dentry->d_inode->i_mode) ||
+		    !d_mountpoint(path.dentry)) {
 			DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name));
 			continue;
 		}
-		mntget(mnt);
-		dget(dentry);
-		if (!follow_down(&mnt, &dentry)) {
-			dput(dentry);
-			mntput(mnt);
+		path.mnt = mnt;
+		path_get(&path);
+		if (!follow_down(&path.mnt, &path.dentry)) {
+			path_put(&path);
 			DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name));
 			continue;
 		}
-		while (d_mountpoint(dentry) && follow_down(&mnt, &dentry))
+		while (d_mountpoint(path.dentry) &&
+		       follow_down(&path.mnt, &path.dentry))
 			;
-		dput(dentry);
+		umount_ok = may_umount(path.mnt);
+		path_put(&path);
 
-		if ( may_umount(mnt) ) {
-			mntput(mnt);
+		if (umount_ok) {
 			DPRINTK(("autofs: signaling expire on %s\n", ent->name));
 			return ent; /* Expirable! */
 		}
 		DPRINTK(("autofs: didn't expire due to may_umount: %s\n", ent->name));
-		mntput(mnt);
 	}
 	return NULL;		/* No expirable entries */
 }
-- 
cgit v1.1


From 1ba0c7dbbbc24230394100c5f0d0df38cb400cff Mon Sep 17 00:00:00 2001
From: Alexander Beregalov <a.beregalov@gmail.com>
Date: Mon, 20 Apr 2009 12:23:02 +0400
Subject: fs/compat_ioctl: fix build when !BLOCK

In file included from fs/compat_ioctl.c:61:
include/linux/loop.h:59: error: field 'lo_bio_list' has incomplete type

Signed-off-by: Alexander Beregalov <a.beregalov@gmail.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/compat_ioctl.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 3e87ce4..b83f6bc 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -58,7 +58,6 @@
 #include <linux/i2c.h>
 #include <linux/i2c-dev.h>
 #include <linux/atalk.h>
-#include <linux/loop.h>
 
 #include <net/bluetooth/bluetooth.h>
 #include <net/bluetooth/hci.h>
@@ -68,6 +67,7 @@
 #include <linux/gigaset_dev.h>
 
 #ifdef CONFIG_BLOCK
+#include <linux/loop.h>
 #include <scsi/scsi.h>
 #include <scsi/scsi_ioctl.h>
 #include <scsi/sg.h>
@@ -2660,6 +2660,8 @@ HANDLE_IOCTL(SONET_GETFRAMING, do_atm_ioctl)
 HANDLE_IOCTL(SONET_GETFRSENSE, do_atm_ioctl)
 /* block stuff */
 #ifdef CONFIG_BLOCK
+/* loop */
+IGNORE_IOCTL(LOOP_CLR_FD)
 /* Raw devices */
 HANDLE_IOCTL(RAW_SETBIND, raw_ioctl)
 HANDLE_IOCTL(RAW_GETBIND, raw_ioctl)
@@ -2728,9 +2730,6 @@ HANDLE_IOCTL(LPSETTIMEOUT, lp_timeout_trans)
 IGNORE_IOCTL(VFAT_IOCTL_READDIR_BOTH32)
 IGNORE_IOCTL(VFAT_IOCTL_READDIR_SHORT32)
 
-/* loop */
-IGNORE_IOCTL(LOOP_CLR_FD)
-
 #ifdef CONFIG_SPARC
 /* Sparc framebuffers, handled in sbusfb_compat_ioctl() */
 IGNORE_IOCTL(FBIOGTYPE)
-- 
cgit v1.1


From 2f9092e1020246168b1309b35e085ecd7ff9ff72 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Mon, 20 Apr 2009 23:18:37 +0100
Subject: Fix i_mutex vs. readdir handling in nfsd

Commit 14f7dd63 ("Copy XFS readdir hack into nfsd code") introduced a
bug to generic code which had been extant for a long time in the XFS
version -- it started to call through into lookup_one_len() and hence
into the file systems' ->lookup() methods without i_mutex held on the
directory.

This patch fixes it by locking the directory's i_mutex again before
calling the filldir functions. The original deadlocks which commit
14f7dd63 was designed to avoid are still avoided, because they were due
to fs-internal locking, not i_mutex.

While we're at it, fix the return type of nfsd_buffered_readdir() which
should be a __be32 not an int -- it's an NFS errno, not a Linux errno.
And return nfserrno(-ENOMEM) when allocation fails, not just -ENOMEM.
Sparse would have caught that, if it wasn't so busy bitching about
__cold__.

Commit 05f4f678 ("nfsd4: don't do lookup within readdir in recovery
code") introduced a similar problem with calling lookup_one_len()
without i_mutex, which this patch also addresses. To fix that, it was
necessary to fix the called functions so that they expect i_mutex to be
held; that part was done by J. Bruce Fields.

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Umm-I-can-live-with-that-by: Al Viro <viro@zeniv.linux.org.uk>
Reported-by: J. R. Okajima <hooanon05@yahoo.co.jp>
Tested-by: J. Bruce Fields <bfields@citi.umich.edu>
LKML-Reference: <8036.1237474444@jrobl>
Cc: stable@kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c            |  2 ++
 fs/nfsd/nfs4recover.c | 46 +++++++++-------------------------------------
 fs/nfsd/vfs.c         | 25 +++++++++++++++++++------
 3 files changed, 30 insertions(+), 43 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index b8433eb..78f253c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1248,6 +1248,8 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 	int err;
 	struct qstr this;
 
+	WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
+
 	err = __lookup_one_len(name, &this, base, len);
 	if (err)
 		return ERR_PTR(err);
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 3444c00..5275097 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -229,21 +229,23 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
 		goto out;
 	status = vfs_readdir(filp, nfsd4_build_namelist, &names);
 	fput(filp);
+	mutex_lock(&dir->d_inode->i_mutex);
 	while (!list_empty(&names)) {
 		entry = list_entry(names.next, struct name_list, list);
 
 		dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1);
 		if (IS_ERR(dentry)) {
 			status = PTR_ERR(dentry);
-			goto out;
+			break;
 		}
 		status = f(dir, dentry);
 		dput(dentry);
 		if (status)
-			goto out;
+			break;
 		list_del(&entry->list);
 		kfree(entry);
 	}
+	mutex_unlock(&dir->d_inode->i_mutex);
 out:
 	while (!list_empty(&names)) {
 		entry = list_entry(names.next, struct name_list, list);
@@ -255,36 +257,6 @@ out:
 }
 
 static int
-nfsd4_remove_clid_file(struct dentry *dir, struct dentry *dentry)
-{
-	int status;
-
-	if (!S_ISREG(dir->d_inode->i_mode)) {
-		printk("nfsd4: non-file found in client recovery directory\n");
-		return -EINVAL;
-	}
-	mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
-	status = vfs_unlink(dir->d_inode, dentry);
-	mutex_unlock(&dir->d_inode->i_mutex);
-	return status;
-}
-
-static int
-nfsd4_clear_clid_dir(struct dentry *dir, struct dentry *dentry)
-{
-	int status;
-
-	/* For now this directory should already be empty, but we empty it of
-	 * any regular files anyway, just in case the directory was created by
-	 * a kernel from the future.... */
-	nfsd4_list_rec_dir(dentry, nfsd4_remove_clid_file);
-	mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
-	status = vfs_rmdir(dir->d_inode, dentry);
-	mutex_unlock(&dir->d_inode->i_mutex);
-	return status;
-}
-
-static int
 nfsd4_unlink_clid_dir(char *name, int namlen)
 {
 	struct dentry *dentry;
@@ -294,18 +266,18 @@ nfsd4_unlink_clid_dir(char *name, int namlen)
 
 	mutex_lock(&rec_dir.dentry->d_inode->i_mutex);
 	dentry = lookup_one_len(name, rec_dir.dentry, namlen);
-	mutex_unlock(&rec_dir.dentry->d_inode->i_mutex);
 	if (IS_ERR(dentry)) {
 		status = PTR_ERR(dentry);
-		return status;
+		goto out_unlock;
 	}
 	status = -ENOENT;
 	if (!dentry->d_inode)
 		goto out;
-
-	status = nfsd4_clear_clid_dir(rec_dir.dentry, dentry);
+	status = vfs_rmdir(rec_dir.dentry->d_inode, dentry);
 out:
 	dput(dentry);
+out_unlock:
+	mutex_unlock(&rec_dir.dentry->d_inode->i_mutex);
 	return status;
 }
 
@@ -348,7 +320,7 @@ purge_old(struct dentry *parent, struct dentry *child)
 	if (nfs4_has_reclaimed_state(child->d_name.name, false))
 		return 0;
 
-	status = nfsd4_clear_clid_dir(parent, child);
+	status = vfs_rmdir(parent->d_inode, child);
 	if (status)
 		printk("failed to remove client recovery directory %s\n",
 				child->d_name.name);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 46e6bd2..6c68ffd 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1890,8 +1890,8 @@ static int nfsd_buffered_filldir(void *__buf, const char *name, int namlen,
 	return 0;
 }
 
-static int nfsd_buffered_readdir(struct file *file, filldir_t func,
-				 struct readdir_cd *cdp, loff_t *offsetp)
+static __be32 nfsd_buffered_readdir(struct file *file, filldir_t func,
+				    struct readdir_cd *cdp, loff_t *offsetp)
 {
 	struct readdir_data buf;
 	struct buffered_dirent *de;
@@ -1901,11 +1901,12 @@ static int nfsd_buffered_readdir(struct file *file, filldir_t func,
 
 	buf.dirent = (void *)__get_free_page(GFP_KERNEL);
 	if (!buf.dirent)
-		return -ENOMEM;
+		return nfserrno(-ENOMEM);
 
 	offset = *offsetp;
 
 	while (1) {
+		struct inode *dir_inode = file->f_path.dentry->d_inode;
 		unsigned int reclen;
 
 		cdp->err = nfserr_eof; /* will be cleared on successful read */
@@ -1924,26 +1925,38 @@ static int nfsd_buffered_readdir(struct file *file, filldir_t func,
 		if (!size)
 			break;
 
+		/*
+		 * Various filldir functions may end up calling back into
+		 * lookup_one_len() and the file system's ->lookup() method.
+		 * These expect i_mutex to be held, as it would within readdir.
+		 */
+		host_err = mutex_lock_killable(&dir_inode->i_mutex);
+		if (host_err)
+			break;
+
 		de = (struct buffered_dirent *)buf.dirent;
 		while (size > 0) {
 			offset = de->offset;
 
 			if (func(cdp, de->name, de->namlen, de->offset,
 				 de->ino, de->d_type))
-				goto done;
+				break;
 
 			if (cdp->err != nfs_ok)
-				goto done;
+				break;
 
 			reclen = ALIGN(sizeof(*de) + de->namlen,
 				       sizeof(u64));
 			size -= reclen;
 			de = (struct buffered_dirent *)((char *)de + reclen);
 		}
+		mutex_unlock(&dir_inode->i_mutex);
+		if (size > 0) /* We bailed out early */
+			break;
+
 		offset = vfs_llseek(file, 0, SEEK_CUR);
 	}
 
- done:
 	free_page((unsigned long)(buf.dirent));
 
 	if (host_err)
-- 
cgit v1.1


From 3eac8778a237d83a1e553eba0c6f4fd4b39eeec0 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 7 Apr 2009 11:12:46 -0400
Subject: autofs4: use memchr() in invalid_string()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs4/dev-ioctl.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 463f798..84168c0 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -54,11 +54,10 @@ static int check_name(const char *name)
  * Check a string doesn't overrun the chunk of
  * memory we copied from user land.
  */
-static int invalid_str(char *str, void *end)
+static int invalid_str(char *str, size_t size)
 {
-	while ((void *) str <= end)
-		if (!*str++)
-			return 0;
+	if (memchr(str, 0, size))
+		return 0;
 	return -EINVAL;
 }
 
@@ -138,8 +137,7 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
 	}
 
 	if (param->size > sizeof(*param)) {
-		err = invalid_str(param->path,
-				 (void *) ((size_t) param + param->size));
+		err = invalid_str(param->path, param->size - sizeof(*param));
 		if (err) {
 			AUTOFS_WARN(
 			  "path string terminator missing for cmd(0x%08x)",
-- 
cgit v1.1


From 3939fcde24473dc09ce16e922c88df9b3bee45d9 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 8 Apr 2009 15:06:12 +0800
Subject: xattr: use memdup_user()

Remove open-coded memdup_user()

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/xattr.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/xattr.c b/fs/xattr.c
index 197c4fc..d51b8f9 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -237,13 +237,9 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value,
 	if (size) {
 		if (size > XATTR_SIZE_MAX)
 			return -E2BIG;
-		kvalue = kmalloc(size, GFP_KERNEL);
-		if (!kvalue)
-			return -ENOMEM;
-		if (copy_from_user(kvalue, value, size)) {
-			kfree(kvalue);
-			return -EFAULT;
-		}
+		kvalue = memdup_user(value, size);
+		if (IS_ERR(kvalue))
+			return PTR_ERR(kvalue);
 	}
 
 	error = vfs_setxattr(d, kname, kvalue, size, flags);
-- 
cgit v1.1


From dae7b665cf6d6e6e733f1c9c16cf55547dd37e33 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 8 Apr 2009 15:06:54 +0800
Subject: btrfs: use memdup_user()

Remove open-coded memdup_user().

Note this changes some GFP_NOFS to GFP_KERNEL, since copy_from_user() may
cause pagefault, it's pointless to pass GFP_NOFS to kmalloc().

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/ioctl.c | 49 ++++++++++++-------------------------------------
 fs/btrfs/super.c | 13 ++++---------
 2 files changed, 16 insertions(+), 46 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 7594bec..9f135e8 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -461,15 +461,9 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
-
-	if (!vol_args)
-		return -ENOMEM;
-
-	if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
-		ret = -EFAULT;
-		goto out;
-	}
+	vol_args = memdup_user(arg, sizeof(*vol_args));
+	if (IS_ERR(vol_args))
+		return PTR_ERR(vol_args);
 
 	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
 	namelen = strlen(vol_args->name);
@@ -545,7 +539,6 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
 
 out_unlock:
 	mutex_unlock(&root->fs_info->volume_mutex);
-out:
 	kfree(vol_args);
 	return ret;
 }
@@ -565,15 +558,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
 	if (root->fs_info->sb->s_flags & MS_RDONLY)
 		return -EROFS;
 
-	vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
-
-	if (!vol_args)
-		return -ENOMEM;
-
-	if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
-		ret = -EFAULT;
-		goto out;
-	}
+	vol_args = memdup_user(arg, sizeof(*vol_args));
+	if (IS_ERR(vol_args))
+		return PTR_ERR(vol_args);
 
 	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
 	namelen = strlen(vol_args->name);
@@ -675,19 +662,13 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
+	vol_args = memdup_user(arg, sizeof(*vol_args));
+	if (IS_ERR(vol_args))
+		return PTR_ERR(vol_args);
 
-	if (!vol_args)
-		return -ENOMEM;
-
-	if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
-		ret = -EFAULT;
-		goto out;
-	}
 	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
 	ret = btrfs_init_new_device(root, vol_args->name);
 
-out:
 	kfree(vol_args);
 	return ret;
 }
@@ -703,19 +684,13 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
 	if (root->fs_info->sb->s_flags & MS_RDONLY)
 		return -EROFS;
 
-	vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
+	vol_args = memdup_user(arg, sizeof(*vol_args));
+	if (IS_ERR(vol_args))
+		return PTR_ERR(vol_args);
 
-	if (!vol_args)
-		return -ENOMEM;
-
-	if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
-		ret = -EFAULT;
-		goto out;
-	}
 	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
 	ret = btrfs_rm_device(root, vol_args->name);
 
-out:
 	kfree(vol_args);
 	return ret;
 }
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9744af9..a7acfe6 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -635,14 +635,9 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	vol = kmalloc(sizeof(*vol), GFP_KERNEL);
-	if (!vol)
-		return -ENOMEM;
-
-	if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) {
-		ret = -EFAULT;
-		goto out;
-	}
+	vol = memdup_user((void __user *)arg, sizeof(*vol));
+	if (IS_ERR(vol))
+		return PTR_ERR(vol);
 
 	switch (cmd) {
 	case BTRFS_IOC_SCAN_DEV:
@@ -650,7 +645,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
 					    &btrfs_fs_type, &fs_devices);
 		break;
 	}
-out:
+
 	kfree(vol);
 	return ret;
 }
-- 
cgit v1.1


From 1c8542c7bb239ef02fe21477acd9cdac04c1b640 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 8 Apr 2009 15:07:30 +0800
Subject: sysfs: use memdup_user()

Remove open-coded memdup_user().

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/sysfs/bin.c | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 93e0c02..9345806 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -157,14 +157,9 @@ static ssize_t write(struct file *file, const char __user *userbuf,
 			count = size - offs;
 	}
 
-	temp = kmalloc(count, GFP_KERNEL);
-	if (!temp)
-		return -ENOMEM;
-
-	if (copy_from_user(temp, userbuf, count)) {
-		count = -EFAULT;
-		goto out_free;
-	}
+	temp = memdup_user(userbuf, count);
+	if (IS_ERR(temp))
+		return PTR_ERR(temp);
 
 	mutex_lock(&bb->mutex);
 
@@ -176,8 +171,6 @@ static ssize_t write(struct file *file, const char __user *userbuf,
 	if (count > 0)
 		*off = offs + count;
 
-out_free:
-	kfree(temp);
 	return count;
 }
 
-- 
cgit v1.1


From 0e639bdeef26faf287db77a15530f3f295a4ae04 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 8 Apr 2009 15:08:04 +0800
Subject: xfs: use memdup_user()

Remove open-coded memdup_user()

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/xfs/linux-2.6/xfs_ioctl.c   | 23 +++++++----------------
 fs/xfs/linux-2.6/xfs_ioctl32.c | 12 ++++--------
 2 files changed, 11 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index d0b4994..34eaab6 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -489,17 +489,12 @@ xfs_attrmulti_attr_set(
 	if (len > XATTR_SIZE_MAX)
 		return EINVAL;
 
-	kbuf = kmalloc(len, GFP_KERNEL);
-	if (!kbuf)
-		return ENOMEM;
-
-	if (copy_from_user(kbuf, ubuf, len))
-		goto out_kfree;
+	kbuf = memdup_user(ubuf, len);
+	if (IS_ERR(kbuf))
+		return PTR_ERR(kbuf);
 
 	error = xfs_attr_set(XFS_I(inode), name, kbuf, len, flags);
 
- out_kfree:
-	kfree(kbuf);
 	return error;
 }
 
@@ -540,20 +535,16 @@ xfs_attrmulti_by_handle(
 	if (!size || size > 16 * PAGE_SIZE)
 		goto out_dput;
 
-	error = ENOMEM;
-	ops = kmalloc(size, GFP_KERNEL);
-	if (!ops)
+	ops = memdup_user(am_hreq.ops, size);
+	if (IS_ERR(ops)) {
+		error = PTR_ERR(ops);
 		goto out_dput;
-
-	error = EFAULT;
-	if (copy_from_user(ops, am_hreq.ops, size))
-		goto out_kfree_ops;
+	}
 
 	attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
 	if (!attr_name)
 		goto out_kfree_ops;
 
-
 	error = 0;
 	for (i = 0; i < am_hreq.opcount; i++) {
 		ops[i].am_error = strncpy_from_user(attr_name,
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index c70c4e3..0882d16 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -427,20 +427,16 @@ xfs_compat_attrmulti_by_handle(
 	if (!size || size > 16 * PAGE_SIZE)
 		goto out_dput;
 
-	error = ENOMEM;
-	ops = kmalloc(size, GFP_KERNEL);
-	if (!ops)
+	ops = memdup_user(compat_ptr(am_hreq.ops), size);
+	if (IS_ERR(ops)) {
+		error = PTR_ERR(ops);
 		goto out_dput;
-
-	error = EFAULT;
-	if (copy_from_user(ops, compat_ptr(am_hreq.ops), size))
-		goto out_kfree_ops;
+	}
 
 	attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
 	if (!attr_name)
 		goto out_kfree_ops;
 
-
 	error = 0;
 	for (i = 0; i < am_hreq.opcount; i++) {
 		ops[i].am_error = strncpy_from_user(attr_name,
-- 
cgit v1.1


From a9482ebcdedbc5872ed34a266e6a45c35116f264 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 8 Apr 2009 15:08:53 +0800
Subject: ncpfs: use memdup_user()

Remove open-coded memdup_user()

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ncpfs/ioctl.c | 21 +++++++--------------
 1 file changed, 7 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index f54360f..fa038df 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -660,13 +660,10 @@ outrel:
 			if (user.object_name_len > NCP_OBJECT_NAME_MAX_LEN)
 				return -ENOMEM;
 			if (user.object_name_len) {
-				newname = kmalloc(user.object_name_len, GFP_USER);
-				if (!newname)
-					return -ENOMEM;
-				if (copy_from_user(newname, user.object_name, user.object_name_len)) {
-					kfree(newname);
-					return -EFAULT;
-				}
+				newname = memdup_user(user.object_name,
+						      user.object_name_len);
+				if (IS_ERR(newname))
+					return PTR_ERR(newname);
 			} else {
 				newname = NULL;
 			}
@@ -760,13 +757,9 @@ outrel:
 			if (user.len > NCP_PRIVATE_DATA_MAX_LEN)
 				return -ENOMEM;
 			if (user.len) {
-				new = kmalloc(user.len, GFP_USER);
-				if (!new)
-					return -ENOMEM;
-				if (copy_from_user(new, user.data, user.len)) {
-					kfree(new);
-					return -EFAULT;
-				}
+				new = memdup_user(user.data, user.len);
+				if (IS_ERR(new))
+					return PTR_ERR(new);
 			} else {
 				new = NULL;
 			}
-- 
cgit v1.1


From fd56d242b3b80b6f2ca174272b20029aae61df75 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 8 Apr 2009 15:09:29 +0800
Subject: ecryptfs: use memdup_user()

Remove open-coded memdup_user().

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ecryptfs/miscdev.c | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index a67fea6..dda3c58 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -418,18 +418,13 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
 
 	if (count == 0)
 		goto out;
-	data = kmalloc(count, GFP_KERNEL);
-	if (!data) {
-		printk(KERN_ERR "%s: Out of memory whilst attempting to "
-		       "kmalloc([%zd], GFP_KERNEL)\n", __func__, count);
+
+	data = memdup_user(buf, count);
+	if (IS_ERR(data)) {
+		printk(KERN_ERR "%s: memdup_user returned error [%ld]\n",
+		       __func__, PTR_ERR(data));
 		goto out;
 	}
-	rc = copy_from_user(data, buf, count);
-	if (rc) {
-		printk(KERN_ERR "%s: copy_from_user returned error [%d]\n",
-		       __func__, rc);
-		goto out_free;
-	}
 	sz = count;
 	i = 0;
 	switch (data[i++]) {
-- 
cgit v1.1


From 0112fc2229847feb6c4eb011e6833d8f1742a375 Mon Sep 17 00:00:00 2001
From: Oleg Drokin <green@linuxhacker.ru>
Date: Wed, 8 Apr 2009 20:05:42 +0400
Subject: Separate out common fstatat code into vfs_fstatat

This is a version incorporating Christoph's suggestion.

Separate out common *fstatat functionality into a single function
instead of duplicating it all over the code.

Signed-off-by: Oleg Drokin <green@linuxhacker.ru>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/compat.c | 19 +++++--------------
 fs/stat.c   | 56 ++++++++++++++++++++++++++++----------------------------
 2 files changed, 33 insertions(+), 42 deletions(-)

(limited to 'fs')

diff --git a/fs/compat.c b/fs/compat.c
index 3f84d5f..dda72e2 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -204,21 +204,12 @@ asmlinkage long compat_sys_newfstatat(unsigned int dfd, char __user *filename,
 		struct compat_stat __user *statbuf, int flag)
 {
 	struct kstat stat;
-	int error = -EINVAL;
-
-	if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
-		goto out;
-
-	if (flag & AT_SYMLINK_NOFOLLOW)
-		error = vfs_lstat_fd(dfd, filename, &stat);
-	else
-		error = vfs_stat_fd(dfd, filename, &stat);
-
-	if (!error)
-		error = cp_compat_stat(&stat, statbuf);
+	int error;
 
-out:
-	return error;
+	error = vfs_fstatat(dfd, filename, &stat, flag);
+	if (error)
+		return error;
+	return cp_compat_stat(&stat, statbuf);
 }
 #endif
 
diff --git a/fs/stat.c b/fs/stat.c
index 2db740a..5471166 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -109,6 +109,24 @@ int vfs_fstat(unsigned int fd, struct kstat *stat)
 
 EXPORT_SYMBOL(vfs_fstat);
 
+int vfs_fstatat(int dfd, char __user *filename, struct kstat *stat, int flag)
+{
+	int error = -EINVAL;
+
+	if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
+		goto out;
+
+	if (flag & AT_SYMLINK_NOFOLLOW)
+		error = vfs_lstat_fd(dfd, filename, stat);
+	else
+		error = vfs_stat_fd(dfd, filename, stat);
+out:
+	return error;
+}
+
+EXPORT_SYMBOL(vfs_fstatat);
+
+
 #ifdef __ARCH_WANT_OLD_STAT
 
 /*
@@ -264,21 +282,12 @@ SYSCALL_DEFINE4(newfstatat, int, dfd, char __user *, filename,
 		struct stat __user *, statbuf, int, flag)
 {
 	struct kstat stat;
-	int error = -EINVAL;
-
-	if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
-		goto out;
-
-	if (flag & AT_SYMLINK_NOFOLLOW)
-		error = vfs_lstat_fd(dfd, filename, &stat);
-	else
-		error = vfs_stat_fd(dfd, filename, &stat);
-
-	if (!error)
-		error = cp_new_stat(&stat, statbuf);
+	int error;
 
-out:
-	return error;
+	error = vfs_fstatat(dfd, filename, &stat, flag);
+	if (error)
+		return error;
+	return cp_new_stat(&stat, statbuf);
 }
 #endif
 
@@ -404,21 +413,12 @@ SYSCALL_DEFINE4(fstatat64, int, dfd, char __user *, filename,
 		struct stat64 __user *, statbuf, int, flag)
 {
 	struct kstat stat;
-	int error = -EINVAL;
-
-	if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
-		goto out;
-
-	if (flag & AT_SYMLINK_NOFOLLOW)
-		error = vfs_lstat_fd(dfd, filename, &stat);
-	else
-		error = vfs_stat_fd(dfd, filename, &stat);
-
-	if (!error)
-		error = cp_new_stat64(&stat, statbuf);
+	int error;
 
-out:
-	return error;
+	error = vfs_fstatat(dfd, filename, &stat, flag);
+	if (error)
+		return error;
+	return cp_new_stat64(&stat, statbuf);
 }
 #endif /* __ARCH_WANT_STAT64 */
 
-- 
cgit v1.1


From 2eae7a1874ca5be3232765d89e0250a449f1bc90 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 8 Apr 2009 16:34:03 -0400
Subject: kill vfs_stat_fd / vfs_lstat_fd

There's really no reason to keep vfs_stat_fd and vfs_lstat_fd with
Oleg's vfs_fstatat.  Use vfs_fstatat for the few cases having the
directory fd, and switch all others to vfs_stat / vfs_lstat.

Reviewed-by: Christoph Hellwig <hch@lst.de>

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/compat.c |  18 ++++++-----
 fs/stat.c   | 105 ++++++++++++++++++++++++------------------------------------
 2 files changed, 52 insertions(+), 71 deletions(-)

(limited to 'fs')

diff --git a/fs/compat.c b/fs/compat.c
index dda72e2..379a399 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -181,22 +181,24 @@ asmlinkage long compat_sys_newstat(char __user * filename,
 		struct compat_stat __user *statbuf)
 {
 	struct kstat stat;
-	int error = vfs_stat_fd(AT_FDCWD, filename, &stat);
+	int error;
 
-	if (!error)
-		error = cp_compat_stat(&stat, statbuf);
-	return error;
+	error = vfs_stat(filename, &stat);
+	if (error)
+		return error;
+	return cp_compat_stat(&stat, statbuf);
 }
 
 asmlinkage long compat_sys_newlstat(char __user * filename,
 		struct compat_stat __user *statbuf)
 {
 	struct kstat stat;
-	int error = vfs_lstat_fd(AT_FDCWD, filename, &stat);
+	int error;
 
-	if (!error)
-		error = cp_compat_stat(&stat, statbuf);
-	return error;
+	error = vfs_lstat(filename, &stat);
+	if (error)
+		return error;
+	return cp_compat_stat(&stat, statbuf);
 }
 
 #ifndef __ARCH_WANT_STAT64
diff --git a/fs/stat.c b/fs/stat.c
index 5471166..075694e 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -55,46 +55,6 @@ int vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 
 EXPORT_SYMBOL(vfs_getattr);
 
-int vfs_stat_fd(int dfd, char __user *name, struct kstat *stat)
-{
-	struct path path;
-	int error;
-
-	error = user_path_at(dfd, name, LOOKUP_FOLLOW, &path);
-	if (!error) {
-		error = vfs_getattr(path.mnt, path.dentry, stat);
-		path_put(&path);
-	}
-	return error;
-}
-
-int vfs_stat(char __user *name, struct kstat *stat)
-{
-	return vfs_stat_fd(AT_FDCWD, name, stat);
-}
-
-EXPORT_SYMBOL(vfs_stat);
-
-int vfs_lstat_fd(int dfd, char __user *name, struct kstat *stat)
-{
-	struct path path;
-	int error;
-
-	error = user_path_at(dfd, name, 0, &path);
-	if (!error) {
-		error = vfs_getattr(path.mnt, path.dentry, stat);
-		path_put(&path);
-	}
-	return error;
-}
-
-int vfs_lstat(char __user *name, struct kstat *stat)
-{
-	return vfs_lstat_fd(AT_FDCWD, name, stat);
-}
-
-EXPORT_SYMBOL(vfs_lstat);
-
 int vfs_fstat(unsigned int fd, struct kstat *stat)
 {
 	struct file *f = fget(fd);
@@ -106,26 +66,43 @@ int vfs_fstat(unsigned int fd, struct kstat *stat)
 	}
 	return error;
 }
-
 EXPORT_SYMBOL(vfs_fstat);
 
 int vfs_fstatat(int dfd, char __user *filename, struct kstat *stat, int flag)
 {
+	struct path path;
 	int error = -EINVAL;
+	int lookup_flags = 0;
 
 	if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
 		goto out;
 
-	if (flag & AT_SYMLINK_NOFOLLOW)
-		error = vfs_lstat_fd(dfd, filename, stat);
-	else
-		error = vfs_stat_fd(dfd, filename, stat);
+	if (!(flag & AT_SYMLINK_NOFOLLOW))
+		lookup_flags |= LOOKUP_FOLLOW;
+
+	error = user_path_at(dfd, filename, lookup_flags, &path);
+	if (error)
+		goto out;
+
+	error = vfs_getattr(path.mnt, path.dentry, stat);
+	path_put(&path);
 out:
 	return error;
 }
-
 EXPORT_SYMBOL(vfs_fstatat);
 
+int vfs_stat(char __user *name, struct kstat *stat)
+{
+	return vfs_fstatat(AT_FDCWD, name, stat, 0);
+}
+EXPORT_SYMBOL(vfs_stat);
+
+int vfs_lstat(char __user *name, struct kstat *stat)
+{
+	return vfs_fstatat(AT_FDCWD, name, stat, AT_SYMLINK_NOFOLLOW);
+}
+EXPORT_SYMBOL(vfs_lstat);
+
 
 #ifdef __ARCH_WANT_OLD_STAT
 
@@ -173,23 +150,25 @@ static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * sta
 SYSCALL_DEFINE2(stat, char __user *, filename, struct __old_kernel_stat __user *, statbuf)
 {
 	struct kstat stat;
-	int error = vfs_stat_fd(AT_FDCWD, filename, &stat);
+	int error;
 
-	if (!error)
-		error = cp_old_stat(&stat, statbuf);
+	error = vfs_stat(filename, &stat);
+	if (error)
+		return error;
 
-	return error;
+	return cp_old_stat(&stat, statbuf);
 }
 
 SYSCALL_DEFINE2(lstat, char __user *, filename, struct __old_kernel_stat __user *, statbuf)
 {
 	struct kstat stat;
-	int error = vfs_lstat_fd(AT_FDCWD, filename, &stat);
+	int error;
 
-	if (!error)
-		error = cp_old_stat(&stat, statbuf);
+	error = vfs_lstat(filename, &stat);
+	if (error)
+		return error;
 
-	return error;
+	return cp_old_stat(&stat, statbuf);
 }
 
 SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, statbuf)
@@ -258,23 +237,23 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
 SYSCALL_DEFINE2(newstat, char __user *, filename, struct stat __user *, statbuf)
 {
 	struct kstat stat;
-	int error = vfs_stat_fd(AT_FDCWD, filename, &stat);
-
-	if (!error)
-		error = cp_new_stat(&stat, statbuf);
+	int error = vfs_stat(filename, &stat);
 
-	return error;
+	if (error)
+		return error;
+	return cp_new_stat(&stat, statbuf);
 }
 
 SYSCALL_DEFINE2(newlstat, char __user *, filename, struct stat __user *, statbuf)
 {
 	struct kstat stat;
-	int error = vfs_lstat_fd(AT_FDCWD, filename, &stat);
+	int error;
 
-	if (!error)
-		error = cp_new_stat(&stat, statbuf);
+	error = vfs_lstat(filename, &stat);
+	if (error)
+		return error;
 
-	return error;
+	return cp_new_stat(&stat, statbuf);
 }
 
 #if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT)
-- 
cgit v1.1


From 38e23c95f92a84fb8505a9f572b8a209c9c372c1 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Thu, 9 Apr 2009 20:17:52 +0900
Subject: fs: Mark get_filesystem_list() as __init function.

"int get_filesystem_list(char * buf)" is called by only
"static void __init get_fs_names(char *page)".
We can mark get_filesystem_list() as "__init".

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/filesystems.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/filesystems.c b/fs/filesystems.c
index 1aa7026..a24c58e 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -199,7 +199,7 @@ SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2)
 	return retval;
 }
 
-int get_filesystem_list(char * buf)
+int __init get_filesystem_list(char *buf)
 {
 	int len = 0;
 	struct file_system_type * tmp;
-- 
cgit v1.1


From 8340437210390676f687633a80e3748c40885dc8 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 20 Apr 2009 14:58:35 -0400
Subject: NFS: Fix the XDR iovec calculation in nfs3_xdr_setaclargs

Commit ae46141ff08f1965b17c531b571953c39ce8b9e2 (NFSv3: Fix posix ACL code)
introduces a bug in the calculation of the XDR header iovec. In the case
where we are inlining the acls, we need to adjust the length of the iovec
req->rq_svec, in addition to adjusting the total buffer length.

Tested-by: Leonardo Chiquitto <leonardo.lists@gmail.com>
Tested-by: Suresh Jayaraman <sjayaraman@suse.de>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nfs/nfs3xdr.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index e6a1932..35869a4 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -713,7 +713,8 @@ nfs3_xdr_setaclargs(struct rpc_rqst *req, __be32 *p,
 	if (args->npages != 0)
 		xdr_encode_pages(buf, args->pages, 0, args->len);
 	else
-		req->rq_slen += args->len;
+		req->rq_slen = xdr_adjust_iovec(req->rq_svec,
+				p + XDR_QUADLEN(args->len));
 
 	err = nfsacl_encode(buf, base, args->inode,
 			    (args->mask & NFS_ACL) ?
-- 
cgit v1.1


From 546888da82082555a56528730a83f0afd12f33bf Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 21 Apr 2009 11:53:38 -0400
Subject: Btrfs: fix btrfs fallocate oops and deadlock

Btrfs fallocate was incorrectly starting a transaction with a lock held
on the extent_io tree for the file, which could deadlock.  Strictly
speaking it was using join_transaction which would be safe, but it is better
to move the transaction outside of the lock.

When preallocated extents are overwritten, btrfs_mark_buffer_dirty was
being called on an unlocked buffer.  This was triggering an assertion and
oops because the lock is supposed to be held.

The bug was calling btrfs_mark_buffer_dirty on a leaf after btrfs_del_item had
been run.  btrfs_del_item takes care of dirtying things, so the solution is a
to skip the btrfs_mark_buffer_dirty call in this case.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c  |  4 +++-
 fs/btrfs/inode.c | 36 ++++++++++++++++++++++++++++--------
 2 files changed, 31 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e21c006..482f8db 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -830,7 +830,7 @@ again:
 
 		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
 		BUG_ON(ret);
-		goto done;
+		goto release;
 	} else if (split == start) {
 		if (locked_end < extent_end) {
 			ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
@@ -926,6 +926,8 @@ again:
 	}
 done:
 	btrfs_mark_buffer_dirty(leaf);
+
+release:
 	btrfs_release_path(root, path);
 	if (split_end && split == start) {
 		split = end;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a0d1dd4..65219f6 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4970,10 +4970,10 @@ out_fail:
 	return err;
 }
 
-static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
+static int prealloc_file_range(struct btrfs_trans_handle *trans,
+			       struct inode *inode, u64 start, u64 end,
 			       u64 alloc_hint, int mode)
 {
-	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_key ins;
 	u64 alloc_size;
@@ -4981,10 +4981,6 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
 	u64 num_bytes = end - start;
 	int ret = 0;
 
-	trans = btrfs_join_transaction(root, 1);
-	BUG_ON(!trans);
-	btrfs_set_trans_block_group(trans, inode);
-
 	while (num_bytes > 0) {
 		alloc_size = min(num_bytes, root->fs_info->max_extent);
 		ret = btrfs_reserve_extent(trans, root, alloc_size,
@@ -5015,7 +5011,6 @@ out:
 		BUG_ON(ret);
 	}
 
-	btrfs_end_transaction(trans, root);
 	return ret;
 }
 
@@ -5029,11 +5024,18 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 	u64 alloc_hint = 0;
 	u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
 	struct extent_map *em;
+	struct btrfs_trans_handle *trans;
 	int ret;
 
 	alloc_start = offset & ~mask;
 	alloc_end =  (offset + len + mask) & ~mask;
 
+	/*
+	 * wait for ordered IO before we have any locks.  We'll loop again
+	 * below with the locks held.
+	 */
+	btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
+
 	mutex_lock(&inode->i_mutex);
 	if (alloc_start > inode->i_size) {
 		ret = btrfs_cont_expand(inode, alloc_start);
@@ -5043,6 +5045,16 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 
 	while (1) {
 		struct btrfs_ordered_extent *ordered;
+
+		trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
+		if (!trans) {
+			ret = -EIO;
+			goto out;
+		}
+
+		/* the extent lock is ordered inside the running
+		 * transaction
+		 */
 		lock_extent(&BTRFS_I(inode)->io_tree, alloc_start,
 			    alloc_end - 1, GFP_NOFS);
 		ordered = btrfs_lookup_first_ordered_extent(inode,
@@ -5053,6 +5065,12 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 			btrfs_put_ordered_extent(ordered);
 			unlock_extent(&BTRFS_I(inode)->io_tree,
 				      alloc_start, alloc_end - 1, GFP_NOFS);
+			btrfs_end_transaction(trans, BTRFS_I(inode)->root);
+
+			/*
+			 * we can't wait on the range with the transaction
+			 * running or with the extent lock held
+			 */
 			btrfs_wait_ordered_range(inode, alloc_start,
 						 alloc_end - alloc_start);
 		} else {
@@ -5070,7 +5088,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 		last_byte = min(extent_map_end(em), alloc_end);
 		last_byte = (last_byte + mask) & ~mask;
 		if (em->block_start == EXTENT_MAP_HOLE) {
-			ret = prealloc_file_range(inode, cur_offset,
+			ret = prealloc_file_range(trans, inode, cur_offset,
 					last_byte, alloc_hint, mode);
 			if (ret < 0) {
 				free_extent_map(em);
@@ -5089,6 +5107,8 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 	}
 	unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1,
 		      GFP_NOFS);
+
+	btrfs_end_transaction(trans, BTRFS_I(inode)->root);
 out:
 	mutex_unlock(&inode->i_mutex);
 	return ret;
-- 
cgit v1.1


From c12ddba09394c60e1120e6997794fa6ed52da884 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Tue, 21 Apr 2009 12:24:05 -0700
Subject: hugetlbfs: return negative error code for bad mount option

This fixes the following BUG:

  # mount -o size=MM -t hugetlbfs none /huge
  hugetlbfs: Bad value 'MM' for mount option 'size=MM'
  ------------[ cut here ]------------
  kernel BUG at fs/super.c:996!

Due to

	BUG_ON(!mnt->mnt_sb);

in vfs_kern_mount().

Also, remove unused #include <linux/quotaops.h>

Cc: William Irwin <wli@holomorphy.com>
Cc: <stable@kernel.org>
Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hugetlbfs/inode.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 23a3c76..153d968 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -26,7 +26,6 @@
 #include <linux/pagevec.h>
 #include <linux/parser.h>
 #include <linux/mman.h>
-#include <linux/quotaops.h>
 #include <linux/slab.h>
 #include <linux/dnotify.h>
 #include <linux/statfs.h>
@@ -842,7 +841,7 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
 bad_val:
  	printk(KERN_ERR "hugetlbfs: Bad value '%s' for mount option '%s'\n",
 	       args[0].from, p);
- 	return 1;
+ 	return -EINVAL;
 }
 
 static int
-- 
cgit v1.1


From 451a9ebf653d28337ba53ed5b4b70b0b9543cca1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 15 Apr 2009 19:50:51 +0200
Subject: bio: fix bio_kmalloc()

Impact: fix bio_kmalloc() and its destruction path

bio_kmalloc() was broken in two ways.

* bvec_alloc_bs() first allocates bvec using kmalloc() and then
  ignores it and allocates again like non-kmalloc bvecs.

* bio_kmalloc_destructor() didn't check for and free bio integrity
  data.

This patch fixes the above problems.  kmalloc patch is separated out
from bio_alloc_bioset() and allocates the requested number of bvecs as
inline bvecs.

* bio_alloc_bioset() no longer takes NULL @bs.  None other than
  bio_kmalloc() used it and outside users can't know how it was
  allocated anyway.

* Define and use BIO_POOL_NONE so that pool index check in
  bvec_free_bs() triggers if inline or kmalloc allocated bvec gets
  there.

* Relocate destructors on top of each allocation function so that how
  they're used is more clear.

Jens Axboe suggested allocating bvecs inline.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/bio.c | 118 +++++++++++++++++++++++++++++----------------------------------
 1 file changed, 54 insertions(+), 64 deletions(-)

(limited to 'fs')

diff --git a/fs/bio.c b/fs/bio.c
index cd42bb8..d35588f 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -175,14 +175,6 @@ struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx,
 	struct bio_vec *bvl;
 
 	/*
-	 * If 'bs' is given, lookup the pool and do the mempool alloc.
-	 * If not, this is a bio_kmalloc() allocation and just do a
-	 * kzalloc() for the exact number of vecs right away.
-	 */
-	if (!bs)
-		bvl = kmalloc(nr * sizeof(struct bio_vec), gfp_mask);
-
-	/*
 	 * see comment near bvec_array define!
 	 */
 	switch (nr) {
@@ -260,21 +252,6 @@ void bio_free(struct bio *bio, struct bio_set *bs)
 	mempool_free(p, bs->bio_pool);
 }
 
-/*
- * default destructor for a bio allocated with bio_alloc_bioset()
- */
-static void bio_fs_destructor(struct bio *bio)
-{
-	bio_free(bio, fs_bio_set);
-}
-
-static void bio_kmalloc_destructor(struct bio *bio)
-{
-	if (bio_has_allocated_vec(bio))
-		kfree(bio->bi_io_vec);
-	kfree(bio);
-}
-
 void bio_init(struct bio *bio)
 {
 	memset(bio, 0, sizeof(*bio));
@@ -301,21 +278,15 @@ void bio_init(struct bio *bio)
  **/
 struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 {
+	unsigned long idx = BIO_POOL_NONE;
 	struct bio_vec *bvl = NULL;
-	struct bio *bio = NULL;
-	unsigned long idx = 0;
-	void *p = NULL;
-
-	if (bs) {
-		p = mempool_alloc(bs->bio_pool, gfp_mask);
-		if (!p)
-			goto err;
-		bio = p + bs->front_pad;
-	} else {
-		bio = kmalloc(sizeof(*bio), gfp_mask);
-		if (!bio)
-			goto err;
-	}
+	struct bio *bio;
+	void *p;
+
+	p = mempool_alloc(bs->bio_pool, gfp_mask);
+	if (unlikely(!p))
+		return NULL;
+	bio = p + bs->front_pad;
 
 	bio_init(bio);
 
@@ -332,22 +303,50 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 
 		nr_iovecs = bvec_nr_vecs(idx);
 	}
+out_set:
 	bio->bi_flags |= idx << BIO_POOL_OFFSET;
 	bio->bi_max_vecs = nr_iovecs;
-out_set:
 	bio->bi_io_vec = bvl;
-
 	return bio;
 
 err_free:
-	if (bs)
-		mempool_free(p, bs->bio_pool);
-	else
-		kfree(bio);
-err:
+	mempool_free(p, bs->bio_pool);
 	return NULL;
 }
 
+static void bio_fs_destructor(struct bio *bio)
+{
+	bio_free(bio, fs_bio_set);
+}
+
+/**
+ *	bio_alloc - allocate a new bio, memory pool backed
+ *	@gfp_mask: allocation mask to use
+ *	@nr_iovecs: number of iovecs
+ *
+ *	Allocate a new bio with @nr_iovecs bvecs.  If @gfp_mask
+ *	contains __GFP_WAIT, the allocation is guaranteed to succeed.
+ *
+ *	RETURNS:
+ *	Pointer to new bio on success, NULL on failure.
+ */
+struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
+{
+	struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);
+
+	if (bio)
+		bio->bi_destructor = bio_fs_destructor;
+
+	return bio;
+}
+
+static void bio_kmalloc_destructor(struct bio *bio)
+{
+	if (bio_integrity(bio))
+		bio_integrity_free(bio);
+	kfree(bio);
+}
+
 /**
  * bio_alloc - allocate a bio for I/O
  * @gfp_mask:   the GFP_ mask given to the slab allocator
@@ -366,29 +365,20 @@ err:
  *   do so can cause livelocks under memory pressure.
  *
  **/
-struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
-{
-	struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);
-
-	if (bio)
-		bio->bi_destructor = bio_fs_destructor;
-
-	return bio;
-}
-
-/*
- * Like bio_alloc(), but doesn't use a mempool backing. This means that
- * it CAN fail, but while bio_alloc() can only be used for allocations
- * that have a short (finite) life span, bio_kmalloc() should be used
- * for more permanent bio allocations (like allocating some bio's for
- * initalization or setup purposes).
- */
 struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs)
 {
-	struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, NULL);
+	struct bio *bio;
 
-	if (bio)
-		bio->bi_destructor = bio_kmalloc_destructor;
+	bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec),
+		      gfp_mask);
+	if (unlikely(!bio))
+		return NULL;
+
+	bio_init(bio);
+	bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET;
+	bio->bi_max_vecs = nr_iovecs;
+	bio->bi_io_vec = bio->bi_inline_vecs;
+	bio->bi_destructor = bio_kmalloc_destructor;
 
 	return bio;
 }
-- 
cgit v1.1


From a9e9dc24bbc3e084450a22cf4fb82f5f5d4cbeea Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 15 Apr 2009 22:10:27 +0900
Subject: bio: use bio_kmalloc() in copy/map functions

Impact: remove possible deadlock condition

There is no reason to use mempool backed allocation for map functions.
Also, because kern mapping is used inside LLDs (e.g. for EH), using
mempool backed allocation can lead to deadlock under extreme
conditions (mempool already consumed by the time a request reached EH
and requests are blocked on EH).

Switch copy/map functions to bio_kmalloc().

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/bio.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/bio.c b/fs/bio.c
index d35588f..7bbc98f 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -822,7 +822,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
 		return ERR_PTR(-ENOMEM);
 
 	ret = -ENOMEM;
-	bio = bio_alloc(gfp_mask, nr_pages);
+	bio = bio_kmalloc(gfp_mask, nr_pages);
 	if (!bio)
 		goto out_bmd;
 
@@ -946,7 +946,7 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
 	if (!nr_pages)
 		return ERR_PTR(-EINVAL);
 
-	bio = bio_alloc(gfp_mask, nr_pages);
+	bio = bio_kmalloc(gfp_mask, nr_pages);
 	if (!bio)
 		return ERR_PTR(-ENOMEM);
 
@@ -1130,7 +1130,7 @@ static struct bio *__bio_map_kern(struct request_queue *q, void *data,
 	int offset, i;
 	struct bio *bio;
 
-	bio = bio_alloc(gfp_mask, nr_pages);
+	bio = bio_kmalloc(gfp_mask, nr_pages);
 	if (!bio)
 		return ERR_PTR(-ENOMEM);
 
-- 
cgit v1.1


From ae6e84596e7b321d9a08e81679c6a3f799634636 Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Date: Thu, 12 Mar 2009 00:19:46 -0500
Subject: eCryptfs: Copy lower inode attrs before dentry instantiation

Copies the lower inode attributes to the upper inode before passing the
upper inode to d_instantiate().  This is important for
security_d_instantiate().

The problem was discovered by a user seeing SELinux denials like so:

type=AVC msg=audit(1236812817.898:47): avc:  denied  { 0x100000 } for
pid=3584 comm="httpd" name="testdir" dev=ecryptfs ino=943872
scontext=root:system_r:httpd_t:s0
tcontext=root:object_r:httpd_sys_content_t:s0 tclass=file

Notice target class is file while testdir is really a directory,
confusing the permission translation (0x100000) due to the wrong i_mode.

Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/main.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index aed56c2..7638b0a 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -190,14 +190,14 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
 		init_special_inode(inode, lower_inode->i_mode,
 				   lower_inode->i_rdev);
 	dentry->d_op = &ecryptfs_dops;
-	if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD)
-		d_add(dentry, inode);
-	else
-		d_instantiate(dentry, inode);
 	fsstack_copy_attr_all(inode, lower_inode, NULL);
 	/* This size will be overwritten for real files w/ headers and
 	 * other metadata */
 	fsstack_copy_inode_size(inode, lower_inode);
+	if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD)
+		d_add(dentry, inode);
+	else
+		d_instantiate(dentry, inode);
 out:
 	return rc;
 }
-- 
cgit v1.1


From 57ea34d19963781d05eb12f9b31bd4f70d61ec16 Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Date: Sun, 15 Mar 2009 14:17:01 -0500
Subject: eCryptfs: NULL pointer dereference in ecryptfs_send_miscdev()

If data is NULL, msg_ctx->msg is set to NULL and then dereferenced
afterwards.  ecryptfs_send_raw_message() is the only place that
ecryptfs_send_miscdev() is called with data being NULL, but the only
caller of that function (ecryptfs_process_helo()) is never called.  In
short, there is currently no way to trigger the NULL pointer
dereference.

This patch removes the two unused functions and modifies
ecryptfs_send_miscdev() to remove the NULL dereferences.

Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/messaging.c | 82 -------------------------------------------------
 fs/ecryptfs/miscdev.c   | 28 +++++++----------
 2 files changed, 11 insertions(+), 99 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index 295e7fa5..f1c17e8 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -133,45 +133,6 @@ out:
 	return rc;
 }
 
-static int
-ecryptfs_send_message_locked(char *data, int data_len, u8 msg_type,
-			     struct ecryptfs_msg_ctx **msg_ctx);
-
-/**
- * ecryptfs_send_raw_message
- * @msg_type: Message type
- * @daemon: Daemon struct for recipient of message
- *
- * A raw message is one that does not include an ecryptfs_message
- * struct. It simply has a type.
- *
- * Must be called with ecryptfs_daemon_hash_mux held.
- *
- * Returns zero on success; non-zero otherwise
- */
-static int ecryptfs_send_raw_message(u8 msg_type,
-				     struct ecryptfs_daemon *daemon)
-{
-	struct ecryptfs_msg_ctx *msg_ctx;
-	int rc;
-
-	rc = ecryptfs_send_message_locked(NULL, 0, msg_type, &msg_ctx);
-	if (rc) {
-		printk(KERN_ERR "%s: Error whilst attempting to send "
-		       "message to ecryptfsd; rc = [%d]\n", __func__, rc);
-		goto out;
-	}
-	/* Raw messages are logically context-free (e.g., no
-	 * reply is expected), so we set the state of the
-	 * ecryptfs_msg_ctx object to indicate that it should
-	 * be freed as soon as the message is sent. */
-	mutex_lock(&msg_ctx->mux);
-	msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_NO_REPLY;
-	mutex_unlock(&msg_ctx->mux);
-out:
-	return rc;
-}
-
 /**
  * ecryptfs_spawn_daemon - Create and initialize a new daemon struct
  * @daemon: Pointer to set to newly allocated daemon struct
@@ -212,49 +173,6 @@ out:
 }
 
 /**
- * ecryptfs_process_helo
- * @euid: The user ID owner of the message
- * @user_ns: The namespace in which @euid applies
- * @pid: The process ID for the userspace program that sent the
- *       message
- *
- * Adds the euid and pid values to the daemon euid hash.  If an euid
- * already has a daemon pid registered, the daemon will be
- * unregistered before the new daemon is put into the hash list.
- * Returns zero after adding a new daemon to the hash list;
- * non-zero otherwise.
- */
-int ecryptfs_process_helo(uid_t euid, struct user_namespace *user_ns,
-			  struct pid *pid)
-{
-	struct ecryptfs_daemon *new_daemon;
-	struct ecryptfs_daemon *old_daemon;
-	int rc;
-
-	mutex_lock(&ecryptfs_daemon_hash_mux);
-	rc = ecryptfs_find_daemon_by_euid(&old_daemon, euid, user_ns);
-	if (rc != 0) {
-		printk(KERN_WARNING "Received request from user [%d] "
-		       "to register daemon [0x%p]; unregistering daemon "
-		       "[0x%p]\n", euid, pid, old_daemon->pid);
-		rc = ecryptfs_send_raw_message(ECRYPTFS_MSG_QUIT, old_daemon);
-		if (rc)
-			printk(KERN_WARNING "Failed to send QUIT "
-			       "message to daemon [0x%p]; rc = [%d]\n",
-			       old_daemon->pid, rc);
-		hlist_del(&old_daemon->euid_chain);
-		kfree(old_daemon);
-	}
-	rc = ecryptfs_spawn_daemon(&new_daemon, euid, user_ns, pid);
-	if (rc)
-		printk(KERN_ERR "%s: The gods are displeased with this attempt "
-		       "to create a new daemon object for euid [%d]; pid "
-		       "[0x%p]; rc = [%d]\n", __func__, euid, pid, rc);
-	mutex_unlock(&ecryptfs_daemon_hash_mux);
-	return rc;
-}
-
-/**
  * ecryptfs_exorcise_daemon - Destroy the daemon struct
  *
  * Must be called ceremoniously while in possession of
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index dda3c58..4ec8f61 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -193,26 +193,20 @@ int ecryptfs_send_miscdev(char *data, size_t data_size,
 	int rc = 0;
 
 	mutex_lock(&msg_ctx->mux);
-	if (data) {
-		msg_ctx->msg = kmalloc((sizeof(*msg_ctx->msg) + data_size),
-				       GFP_KERNEL);
-		if (!msg_ctx->msg) {
-			rc = -ENOMEM;
-			printk(KERN_ERR "%s: Out of memory whilst attempting "
-			       "to kmalloc(%zd, GFP_KERNEL)\n", __func__,
-			       (sizeof(*msg_ctx->msg) + data_size));
-			goto out_unlock;
-		}
-	} else
-		msg_ctx->msg = NULL;
+	msg_ctx->msg = kmalloc((sizeof(*msg_ctx->msg) + data_size),
+			       GFP_KERNEL);
+	if (!msg_ctx->msg) {
+		rc = -ENOMEM;
+		printk(KERN_ERR "%s: Out of memory whilst attempting "
+		       "to kmalloc(%zd, GFP_KERNEL)\n", __func__,
+		       (sizeof(*msg_ctx->msg) + data_size));
+		goto out_unlock;
+	}
 	msg_ctx->msg->index = msg_ctx->index;
 	msg_ctx->msg->data_len = data_size;
 	msg_ctx->type = msg_type;
-	if (data) {
-		memcpy(msg_ctx->msg->data, data, data_size);
-		msg_ctx->msg_size = (sizeof(*msg_ctx->msg) + data_size);
-	} else
-		msg_ctx->msg_size = 0;
+	memcpy(msg_ctx->msg->data, data, data_size);
+	msg_ctx->msg_size = (sizeof(*msg_ctx->msg) + data_size);
 	mutex_lock(&daemon->mux);
 	list_add_tail(&msg_ctx->daemon_out_list, &daemon->msg_ctx_out_queue);
 	daemon->num_queued_msg_ctx++;
-- 
cgit v1.1


From 3a5203ab3c0c31e0f1434c69e893bfb85c6e6657 Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Date: Mon, 16 Mar 2009 12:35:12 -0500
Subject: eCryptfs: Print FNEK sig properly in /proc/mounts

The filename encryption key signature is not properly displayed in
/proc/mounts.  The "ecryptfs_sig=" mount option name is displayed for
all global authentication tokens, included those for filename keys.

This patch checks the global authentication token flags to determine if
the key is a FEKEK or FNEK and prints the appropriate mount option name
before the signature.

Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/super.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index c27ac2b..b500302 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -170,7 +170,10 @@ static int ecryptfs_show_options(struct seq_file *m, struct vfsmount *mnt)
 	list_for_each_entry(walker,
 			    &mount_crypt_stat->global_auth_tok_list,
 			    mount_crypt_stat_list) {
-		seq_printf(m, ",ecryptfs_sig=%s", walker->sig);
+		if (walker->flags & ECRYPTFS_AUTH_TOK_FNEK)
+			seq_printf(m, ",ecryptfs_fnek_sig=%s", walker->sig);
+		else
+			seq_printf(m, ",ecryptfs_sig=%s", walker->sig);
 	}
 	mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex);
 
-- 
cgit v1.1


From 13a791b4e63eb0537a7f804a340d6527485983b4 Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Date: Mon, 13 Apr 2009 15:29:27 -0500
Subject: eCryptfs: Fix data corruption when using ecryptfs_passthrough

ecryptfs_passthrough is a mount option that allows eCryptfs to allow
data to be written to non-eCryptfs files in the lower filesystem.  The
passthrough option was causing data corruption due to it not always
being treated as a non-eCryptfs file.

The first 8 bytes of an eCryptfs file contains the decrypted file size.
This value was being written to the non-eCryptfs files, too.  Also,
extra 0x00 characters were being written to make the file size a
multiple of PAGE_CACHE_SIZE.

Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/crypto.c     | 21 ++-------------------
 fs/ecryptfs/inode.c      |  9 +++++++--
 fs/ecryptfs/mmap.c       | 11 +++++++++++
 fs/ecryptfs/read_write.c | 32 +++++++++++++++++++++-----------
 4 files changed, 41 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 8b65f28..b91851f 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -483,15 +483,7 @@ int ecryptfs_encrypt_page(struct page *page)
 	ecryptfs_inode = page->mapping->host;
 	crypt_stat =
 		&(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat);
-	if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
-		rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, page,
-						       0, PAGE_CACHE_SIZE);
-		if (rc)
-			printk(KERN_ERR "%s: Error attempting to copy "
-			       "page at index [%ld]\n", __func__,
-			       page->index);
-		goto out;
-	}
+	BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED));
 	enc_extent_page = alloc_page(GFP_USER);
 	if (!enc_extent_page) {
 		rc = -ENOMEM;
@@ -620,16 +612,7 @@ int ecryptfs_decrypt_page(struct page *page)
 	ecryptfs_inode = page->mapping->host;
 	crypt_stat =
 		&(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat);
-	if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
-		rc = ecryptfs_read_lower_page_segment(page, page->index, 0,
-						      PAGE_CACHE_SIZE,
-						      ecryptfs_inode);
-		if (rc)
-			printk(KERN_ERR "%s: Error attempting to copy "
-			       "page at index [%ld]\n", __func__,
-			       page->index);
-		goto out;
-	}
+	BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED));
 	enc_extent_page = alloc_page(GFP_USER);
 	if (!enc_extent_page) {
 		rc = -ENOMEM;
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 55b3145..5ed86e2 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -814,6 +814,13 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
 		size_t num_zeros = (PAGE_CACHE_SIZE
 				    - (new_length & ~PAGE_CACHE_MASK));
 
+		if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
+			rc = vmtruncate(inode, new_length);
+			if (rc)
+				goto out_free;
+			rc = vmtruncate(lower_dentry->d_inode, new_length);
+			goto out_free;
+		}
 		if (num_zeros) {
 			char *zeros_virt;
 
@@ -915,8 +922,6 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
 			}
 			rc = 0;
 			crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED);
-			mutex_unlock(&crypt_stat->cs_mutex);
-			goto out;
 		}
 	}
 	mutex_unlock(&crypt_stat->cs_mutex);
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 46cec2b..5c6bab9 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -449,6 +449,7 @@ int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode)
 	struct ecryptfs_crypt_stat *crypt_stat;
 
 	crypt_stat = &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
+	BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED));
 	if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
 		return ecryptfs_write_inode_size_to_xattr(ecryptfs_inode);
 	else
@@ -490,6 +491,16 @@ static int ecryptfs_write_end(struct file *file,
 		ecryptfs_printk(KERN_DEBUG, "Not a new file\n");
 	ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page"
 			"(page w/ index = [0x%.16x], to = [%d])\n", index, to);
+	if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
+		rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, page, 0,
+						       to);
+		if (!rc) {
+			rc = copied;
+			fsstack_copy_inode_size(ecryptfs_inode,
+				ecryptfs_inode_to_lower(ecryptfs_inode));
+		}
+		goto out;
+	}
 	/* Fills in zeros if 'to' goes beyond inode size */
 	rc = fill_zeros_to_end_of_page(page, to);
 	if (rc) {
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index 75c2ea9..a137c6e 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -117,13 +117,15 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset,
 		   size_t size)
 {
 	struct page *ecryptfs_page;
+	struct ecryptfs_crypt_stat *crypt_stat;
+	struct inode *ecryptfs_inode = ecryptfs_file->f_dentry->d_inode;
 	char *ecryptfs_page_virt;
-	loff_t ecryptfs_file_size =
-		i_size_read(ecryptfs_file->f_dentry->d_inode);
+	loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode);
 	loff_t data_offset = 0;
 	loff_t pos;
 	int rc = 0;
 
+	crypt_stat = &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
 	/*
 	 * if we are writing beyond current size, then start pos
 	 * at the current size - we'll fill in zeros from there.
@@ -184,7 +186,13 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset,
 		flush_dcache_page(ecryptfs_page);
 		SetPageUptodate(ecryptfs_page);
 		unlock_page(ecryptfs_page);
-		rc = ecryptfs_encrypt_page(ecryptfs_page);
+		if (crypt_stat->flags & ECRYPTFS_ENCRYPTED)
+			rc = ecryptfs_encrypt_page(ecryptfs_page);
+		else
+			rc = ecryptfs_write_lower_page_segment(ecryptfs_inode,
+						ecryptfs_page,
+						start_offset_in_page,
+						data_offset);
 		page_cache_release(ecryptfs_page);
 		if (rc) {
 			printk(KERN_ERR "%s: Error encrypting "
@@ -194,14 +202,16 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset,
 		pos += num_bytes;
 	}
 	if ((offset + size) > ecryptfs_file_size) {
-		i_size_write(ecryptfs_file->f_dentry->d_inode, (offset + size));
-		rc = ecryptfs_write_inode_size_to_metadata(
-			ecryptfs_file->f_dentry->d_inode);
-		if (rc) {
-			printk(KERN_ERR	"Problem with "
-			       "ecryptfs_write_inode_size_to_metadata; "
-			       "rc = [%d]\n", rc);
-			goto out;
+		i_size_write(ecryptfs_inode, (offset + size));
+		if (crypt_stat->flags & ECRYPTFS_ENCRYPTED) {
+			rc = ecryptfs_write_inode_size_to_metadata(
+								ecryptfs_inode);
+			if (rc) {
+				printk(KERN_ERR	"Problem with "
+				       "ecryptfs_write_inode_size_to_metadata; "
+				       "rc = [%d]\n", rc);
+				goto out;
+			}
 		}
 	}
 out:
-- 
cgit v1.1


From e77cc8d243f9f1e1d3f0799e23cc14e837ccc8c6 Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Date: Wed, 22 Apr 2009 04:08:46 -0500
Subject: eCryptfs: Remove ecryptfs_unlink_sigs warnings

A feature was added to the eCryptfs umount helper to automatically
unlink the keys used for an eCryptfs mount from the kernel keyring upon
umount.  This patch keeps the unrecognized mount option warnings for
ecryptfs_unlink_sigs out of the logs.

Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/ecryptfs_kernel.h | 1 +
 fs/ecryptfs/main.c            | 6 +++++-
 fs/ecryptfs/super.c           | 2 ++
 3 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 064c582..00b30a2 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -269,6 +269,7 @@ struct ecryptfs_crypt_stat {
 #define ECRYPTFS_ENCRYPT_FILENAMES    0x00000800
 #define ECRYPTFS_ENCFN_USE_MOUNT_FNEK 0x00001000
 #define ECRYPTFS_ENCFN_USE_FEK        0x00002000
+#define ECRYPTFS_UNLINK_SIGS	      0x00004000
 	u32 flags;
 	unsigned int file_version;
 	size_t iv_bytes;
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 7638b0a..ccabd5f 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -208,7 +208,7 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig,
        ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata,
        ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig,
        ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes,
-       ecryptfs_opt_err };
+       ecryptfs_opt_unlink_sigs, ecryptfs_opt_err };
 
 static const match_table_t tokens = {
 	{ecryptfs_opt_sig, "sig=%s"},
@@ -222,6 +222,7 @@ static const match_table_t tokens = {
 	{ecryptfs_opt_fnek_sig, "ecryptfs_fnek_sig=%s"},
 	{ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"},
 	{ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"},
+	{ecryptfs_opt_unlink_sigs, "ecryptfs_unlink_sigs"},
 	{ecryptfs_opt_err, NULL}
 };
 
@@ -402,6 +403,9 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
 				fn_cipher_key_bytes;
 			fn_cipher_key_bytes_set = 1;
 			break;
+		case ecryptfs_opt_unlink_sigs:
+			mount_crypt_stat->flags |= ECRYPTFS_UNLINK_SIGS;
+			break;
 		case ecryptfs_opt_err:
 		default:
 			printk(KERN_WARNING
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index b500302..fa4c7e7 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -189,6 +189,8 @@ static int ecryptfs_show_options(struct seq_file *m, struct vfsmount *mnt)
 		seq_printf(m, ",ecryptfs_xattr_metadata");
 	if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED)
 		seq_printf(m, ",ecryptfs_encrypted_view");
+	if (mount_crypt_stat->flags & ECRYPTFS_UNLINK_SIGS)
+		seq_printf(m, ",ecryptfs_unlink_sigs");
 
 	return 0;
 }
-- 
cgit v1.1


From ca8e34f2b05a8289b47907b083dc01dd654ecbde Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Date: Wed, 22 Apr 2009 16:27:12 -0500
Subject: eCryptfs: Lock lower directory inode mutex during lookup

This patch locks the lower directory inode's i_mutex before calling
lookup_one_len() to find the appropriate dentry in the lower filesystem.
This bug was found thanks to the warning set in commit 2f9092e1.

Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/inode.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 5ed86e2..d832526 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -379,9 +379,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
 		goto out_d_drop;
 	}
 	lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent);
+	mutex_lock(&lower_dir_dentry->d_inode->i_mutex);
 	lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name,
 				      lower_dir_dentry,
 				      ecryptfs_dentry->d_name.len);
+	mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
 	if (IS_ERR(lower_dentry)) {
 		rc = PTR_ERR(lower_dentry);
 		printk(KERN_ERR "%s: lookup_one_len() returned [%d] on "
@@ -406,9 +408,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
 		       "filename; rc = [%d]\n", __func__, rc);
 		goto out_d_drop;
 	}
+	mutex_lock(&lower_dir_dentry->d_inode->i_mutex);
 	lower_dentry = lookup_one_len(encrypted_and_encoded_name,
 				      lower_dir_dentry,
 				      encrypted_and_encoded_name_size - 1);
+	mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
 	if (IS_ERR(lower_dentry)) {
 		rc = PTR_ERR(lower_dentry);
 		printk(KERN_ERR "%s: lookup_one_len() returned [%d] on "
-- 
cgit v1.1


From 3a6b42cadc112b01daf0525e5fcd90bb333a5bb3 Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Date: Thu, 16 Apr 2009 18:35:37 -0500
Subject: eCryptfs: Larger buffer for encrypted symlink targets

When using filename encryption with eCryptfs, the value of the symlink
in the lower filesystem is encrypted and stored as a Tag 70 packet.
This results in a longer symlink target than if the target value wasn't
encrypted.

Users were reporting these messages in their syslog:

[ 45.653441] ecryptfs_parse_tag_70_packet: max_packet_size is [56]; real
packet size is [51]
[ 45.653444] ecryptfs_decode_and_decrypt_filename: Could not parse tag
70 packet from filename; copying through filename as-is

This was due to bufsiz, one the arguments in readlink(), being used to
when allocating the buffer passed to the lower inode's readlink().
That symlink target may be very large, but when decoded and decrypted,
could end up being smaller than bufsize.

To fix this, the buffer passed to the lower inode's readlink() will
always be PATH_MAX in size when filename encryption is enabled.  Any
necessary truncation occurs after the decoding and decrypting.

Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/inode.c | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index d832526..93bc0f8 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -640,8 +640,9 @@ static int
 ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
 {
 	char *lower_buf;
+	size_t lower_bufsiz;
 	struct dentry *lower_dentry;
-	struct ecryptfs_crypt_stat *crypt_stat;
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
 	char *plaintext_name;
 	size_t plaintext_name_size;
 	mm_segment_t old_fs;
@@ -652,12 +653,21 @@ ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
 		rc = -EINVAL;
 		goto out;
 	}
-	crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
+	mount_crypt_stat = &ecryptfs_superblock_to_private(
+						dentry->d_sb)->mount_crypt_stat;
+	/*
+	 * If the lower filename is encrypted, it will result in a significantly
+	 * longer name.  If needed, truncate the name after decode and decrypt.
+	 */
+	if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
+		lower_bufsiz = PATH_MAX;
+	else
+		lower_bufsiz = bufsiz;
 	/* Released in this function */
-	lower_buf = kmalloc(bufsiz, GFP_KERNEL);
+	lower_buf = kmalloc(lower_bufsiz, GFP_KERNEL);
 	if (lower_buf == NULL) {
 		printk(KERN_ERR "%s: Out of memory whilst attempting to "
-		       "kmalloc [%d] bytes\n", __func__, bufsiz);
+		       "kmalloc [%d] bytes\n", __func__, lower_bufsiz);
 		rc = -ENOMEM;
 		goto out;
 	}
@@ -665,7 +675,7 @@ ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
 	set_fs(get_ds());
 	rc = lower_dentry->d_inode->i_op->readlink(lower_dentry,
 						   (char __user *)lower_buf,
-						   bufsiz);
+						   lower_bufsiz);
 	set_fs(old_fs);
 	if (rc >= 0) {
 		rc = ecryptfs_decode_and_decrypt_filename(&plaintext_name,
@@ -678,7 +688,9 @@ ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
 				rc);
 			goto out_free_lower_buf;
 		}
-		rc = copy_to_user(buf, plaintext_name, plaintext_name_size);
+		/* Check for bufsiz <= 0 done in sys_readlinkat() */
+		rc = copy_to_user(buf, plaintext_name,
+				  min((unsigned) bufsiz, plaintext_name_size));
 		if (rc)
 			rc = -EFAULT;
 		else
-- 
cgit v1.1


From e84a26ce178345498a7eca0590852bcc36f1092f Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 22 Apr 2009 20:52:25 -0400
Subject: ext4: Make the extent validity check more paranoid

Instead of just checking that the extent block number is greater or
equal than s_first_data_block, make sure it it is not pointing into
the block group descriptors, since that is clearly wrong.  This helps
prevent filesystem from getting very badly corrupted in case an extent
block is corrupted.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 6132353..c28ffe2 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -326,11 +326,14 @@ ext4_ext_max_entries(struct inode *inode, int depth)
 
 static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
 {
-	ext4_fsblk_t block = ext_pblock(ext);
+	ext4_fsblk_t block = ext_pblock(ext), valid_block;
 	int len = ext4_ext_get_actual_len(ext);
 	struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
-	if (unlikely(block < le32_to_cpu(es->s_first_data_block) ||
-			((block + len) > ext4_blocks_count(es))))
+
+	valid_block = le32_to_cpu(es->s_first_data_block) +
+		EXT4_SB(inode->i_sb)->s_gdb_count;
+	if (unlikely(block <= valid_block ||
+		     ((block + len) > ext4_blocks_count(es))))
 		return 0;
 	else
 		return 1;
@@ -339,10 +342,13 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
 static int ext4_valid_extent_idx(struct inode *inode,
 				struct ext4_extent_idx *ext_idx)
 {
-	ext4_fsblk_t block = idx_pblock(ext_idx);
+	ext4_fsblk_t block = idx_pblock(ext_idx), valid_block;
 	struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
-	if (unlikely(block < le32_to_cpu(es->s_first_data_block) ||
-			(block >= ext4_blocks_count(es))))
+
+	valid_block = le32_to_cpu(es->s_first_data_block) +
+		EXT4_SB(inode->i_sb)->s_gdb_count;
+	if (unlikely(block <= valid_block ||
+		     (block >= ext4_blocks_count(es))))
 		return 0;
 	else
 		return 1;
-- 
cgit v1.1


From b5451f7b2694b04d9f912f6cf09db1729f291996 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 22 Apr 2009 21:00:36 -0400
Subject: ext4: Fix potential inode allocation soft lockup in Orlov allocator

If the Orlov allocator is having trouble finding an appropriate block
group, the fallback code could loop forever, causing a soft lockup
warning in find_group_orlov():

BUG: soft lockup - CPU#0 stuck for 61s! [cp:11728]
     ...
Pid: 11728, comm: cp Not tainted (2.6.30-rc1-dirty #77) Lenovo
EIP: 0060:[<c021650e>] EFLAGS: 00000246 CPU: 0
EIP is at ext4_get_group_desc+0x54/0x9d
    ...
Call Trace:
 [<c0218021>] find_group_orlov+0x2ee/0x334
 [<c0120a5f>] ? sched_clock+0x8/0xb
 [<c02188e3>] ext4_new_inode+0x2cf/0xb1a

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ialloc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index cbce5aa..f18e0a0 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -585,6 +585,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
 fallback:
 	ngroups = sbi->s_groups_count;
 	avefreei = freei / ngroups;
+fallback_retry:
 	parent_group = EXT4_I(parent)->i_block_group;
 	for (i = 0; i < ngroups; i++) {
 		grp = (parent_group + i) % ngroups;
@@ -602,7 +603,7 @@ fallback:
 		 * filesystems the above test can fail to find any blockgroups
 		 */
 		avefreei = 0;
-		goto fallback;
+		goto fallback_retry;
 	}
 
 	return -1;
-- 
cgit v1.1


From d8bd504ab800c8e9aadb983914a33e7166320bec Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Thu, 23 Apr 2009 08:54:02 +0100
Subject: GFS2: Fix bug in block allocation

The new bitfit algorithm was counting from the wrong end of
64 bit words in the bitfield. This fixes it by using __ffs64
instead of fls64

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/rgrp.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index f03d024..c9786a4 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -212,8 +212,7 @@ static u32 gfs2_bitfit(const u8 *buf, const unsigned int len,
 	if (tmp == 0)
 		return BFITNOENT;
 	ptr--;
-	bit = fls64(tmp);
-	bit--;		/* fls64 always adds one to the bit count */
+	bit = __ffs64(tmp);
 	bit /= 2;	/* two bits per entry in the bitmap */
 	return (((const unsigned char *)ptr - buf) * GFS2_NBBY) + bit;
 }
-- 
cgit v1.1


From d9ba7615bfd8bb06f79c853f9dfff9e93a837941 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Thu, 23 Apr 2009 08:59:41 +0100
Subject: GFS2: Ensure that the inode goal block settings are updated

GFS2 has a goal block associated with each inode indicating the
search start position for future block allocations (in fact there
are two, but thats for backward compatibility with GFS1 as they
are set to identical locations in GFS2).

In some circumstances, depending on the ordering of updates to
the inode it was possible for the goal block settings to not
be updated on disk. This patch ensures that the goal block will
always get updated, thus reducing the potential for searching
the same (already allocated) blocks again when looking for free
space during block allocation.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/rgrp.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index c9786a4..5650382 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1444,10 +1444,12 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
 u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct buffer_head *dibh;
 	struct gfs2_alloc *al = ip->i_alloc;
 	struct gfs2_rgrpd *rgd = al->al_rgd;
 	u32 goal, blk;
 	u64 block;
+	int error;
 
 	if (rgrp_contains_block(rgd, ip->i_goal))
 		goal = ip->i_goal - rgd->rd_data0;
@@ -1460,7 +1462,13 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n)
 	rgd->rd_last_alloc = blk;
 	block = rgd->rd_data0 + blk;
 	ip->i_goal = block;
-
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (error == 0) {
+		struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
+		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+		di->di_goal_meta = di->di_goal_data = cpu_to_be64(ip->i_goal);
+		brelse(dibh);
+	}
 	gfs2_assert_withdraw(sdp, rgd->rd_free >= *n);
 	rgd->rd_free -= *n;
 
-- 
cgit v1.1


From e1c805309d19c69d4ebeac38724076fa86feacdf Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Thu, 23 Apr 2009 13:58:08 +0200
Subject: [S390] /proc/stat idle field for idle cpus

The cpu idle field in the output of /proc/stat is too small for cpus
that have been idle for more than a tick. Add the architecture hook
arch_idle_time that allows to add the not accounted idle time of a
sleeping cpu without waking the cpu.

The s390 implementation of arch_idle_time uses the already existing
s390_idle_data per_cpu variable to find the sleep time of a neighboring
idle cpu.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 fs/proc/stat.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index f75efa2..81e4eb6 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -18,6 +18,9 @@
 #ifndef arch_irq_stat
 #define arch_irq_stat() 0
 #endif
+#ifndef arch_idle_time
+#define arch_idle_time(cpu) 0
+#endif
 
 static int show_stat(struct seq_file *p, void *v)
 {
@@ -40,6 +43,7 @@ static int show_stat(struct seq_file *p, void *v)
 		nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice);
 		system = cputime64_add(system, kstat_cpu(i).cpustat.system);
 		idle = cputime64_add(idle, kstat_cpu(i).cpustat.idle);
+		idle = cputime64_add(idle, arch_idle_time(i));
 		iowait = cputime64_add(iowait, kstat_cpu(i).cpustat.iowait);
 		irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq);
 		softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
@@ -69,6 +73,7 @@ static int show_stat(struct seq_file *p, void *v)
 		nice = kstat_cpu(i).cpustat.nice;
 		system = kstat_cpu(i).cpustat.system;
 		idle = kstat_cpu(i).cpustat.idle;
+		idle = cputime64_add(idle, arch_idle_time(i));
 		iowait = kstat_cpu(i).cpustat.iowait;
 		irq = kstat_cpu(i).cpustat.irq;
 		softirq = kstat_cpu(i).cpustat.softirq;
-- 
cgit v1.1


From 8c652f96d3852b97a49c331cd0bb02d22f3cb31b Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 24 Apr 2009 01:01:56 +0200
Subject: do_execve() must not clear fs->in_exec if it was set by another
 thread

If do_execve() fails after check_unsafe_exec(), it clears fs->in_exec
unconditionally. This is wrong if we race with our sub-thread which
also does do_execve:

	Two threads T1 and T2 and another process P, all share the same
	->fs.

	T1 starts do_execve(BAD_FILE). It calls check_unsafe_exec(), since
	->fs is shared, we set LSM_UNSAFE but not ->in_exec.

	P exits and decrements fs->users.

	T2 starts do_execve(), calls check_unsafe_exec(), now ->fs is not
	shared, we set fs->in_exec.

	T1 continues, open_exec(BAD_FILE) fails, we clear ->in_exec and
	return to the user-space.

	T1 does clone(CLONE_FS /* without CLONE_THREAD */).

	T2 continues without LSM_UNSAFE_SHARE while ->fs is shared with
	another process.

Change check_unsafe_exec() to return res = 1 if we set ->in_exec, and change
do_execve() to clear ->in_exec depending on res.

When do_execve() suceeds, it is safe to clear ->in_exec unconditionally.
It can be set only if we don't share ->fs with another process, and since
we already killed all sub-threads either ->in_exec == 0 or we are the
only user of this ->fs.

Also, we do not need fs->lock to clear fs->in_exec.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Roland McGrath <roland@redhat.com>
Acked-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/compat.c | 11 +++++------
 fs/exec.c   | 19 ++++++++++---------
 2 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/compat.c b/fs/compat.c
index 379a399..681ed81 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1476,6 +1476,7 @@ int compat_do_execve(char * filename,
 	struct linux_binprm *bprm;
 	struct file *file;
 	struct files_struct *displaced;
+	bool clear_in_exec;
 	int retval;
 
 	retval = unshare_files(&displaced);
@@ -1498,8 +1499,9 @@ int compat_do_execve(char * filename,
 		goto out_unlock;
 
 	retval = check_unsafe_exec(bprm);
-	if (retval)
+	if (retval < 0)
 		goto out_unlock;
+	clear_in_exec = retval;
 
 	file = open_exec(filename);
 	retval = PTR_ERR(file);
@@ -1546,9 +1548,7 @@ int compat_do_execve(char * filename,
 		goto out;
 
 	/* execve succeeded */
-	write_lock(&current->fs->lock);
 	current->fs->in_exec = 0;
-	write_unlock(&current->fs->lock);
 	current->in_execve = 0;
 	mutex_unlock(&current->cred_exec_mutex);
 	acct_update_integrals(current);
@@ -1568,9 +1568,8 @@ out_file:
 	}
 
 out_unmark:
-	write_lock(&current->fs->lock);
-	current->fs->in_exec = 0;
-	write_unlock(&current->fs->lock);
+	if (clear_in_exec)
+		current->fs->in_exec = 0;
 
 out_unlock:
 	current->in_execve = 0;
diff --git a/fs/exec.c b/fs/exec.c
index 052a961..a2e6989 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1077,9 +1077,11 @@ int check_unsafe_exec(struct linux_binprm *bprm)
 	if (p->fs->users > n_fs) {
 		bprm->unsafe |= LSM_UNSAFE_SHARE;
 	} else {
-		if (p->fs->in_exec)
-			res = -EAGAIN;
-		p->fs->in_exec = 1;
+		res = -EAGAIN;
+		if (!p->fs->in_exec) {
+			p->fs->in_exec = 1;
+			res = 1;
+		}
 	}
 
 	unlock_task_sighand(p, &flags);
@@ -1284,6 +1286,7 @@ int do_execve(char * filename,
 	struct linux_binprm *bprm;
 	struct file *file;
 	struct files_struct *displaced;
+	bool clear_in_exec;
 	int retval;
 
 	retval = unshare_files(&displaced);
@@ -1306,8 +1309,9 @@ int do_execve(char * filename,
 		goto out_unlock;
 
 	retval = check_unsafe_exec(bprm);
-	if (retval)
+	if (retval < 0)
 		goto out_unlock;
+	clear_in_exec = retval;
 
 	file = open_exec(filename);
 	retval = PTR_ERR(file);
@@ -1355,9 +1359,7 @@ int do_execve(char * filename,
 		goto out;
 
 	/* execve succeeded */
-	write_lock(&current->fs->lock);
 	current->fs->in_exec = 0;
-	write_unlock(&current->fs->lock);
 	current->in_execve = 0;
 	mutex_unlock(&current->cred_exec_mutex);
 	acct_update_integrals(current);
@@ -1377,9 +1379,8 @@ out_file:
 	}
 
 out_unmark:
-	write_lock(&current->fs->lock);
-	current->fs->in_exec = 0;
-	write_unlock(&current->fs->lock);
+	if (clear_in_exec)
+		current->fs->in_exec = 0;
 
 out_unlock:
 	current->in_execve = 0;
-- 
cgit v1.1


From 437f7fdb607f32b737e4da9f14bebcfdac2c90c3 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 24 Apr 2009 01:02:45 +0200
Subject: check_unsafe_exec: s/lock_task_sighand/rcu_read_lock/

write_lock(&current->fs->lock) guarantees we can't wrongly miss
LSM_UNSAFE_SHARE, this is what we care about. Use rcu_read_lock()
instead of ->siglock to iterate over the sub-threads. We must see
all CLONE_THREAD|CLONE_FS threads which didn't pass exit_fs(), it
takes fs->lock too.

With or without this patch we can miss the freshly cloned thread
and set LSM_UNSAFE_SHARE, we don't care.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Roland McGrath <roland@redhat.com>
[ Fixed lock/unlock typo  - Hugh ]
Acked-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index a2e6989..a3a8ce8 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1060,7 +1060,6 @@ EXPORT_SYMBOL(install_exec_creds);
 int check_unsafe_exec(struct linux_binprm *bprm)
 {
 	struct task_struct *p = current, *t;
-	unsigned long flags;
 	unsigned n_fs;
 	int res = 0;
 
@@ -1068,11 +1067,12 @@ int check_unsafe_exec(struct linux_binprm *bprm)
 
 	n_fs = 1;
 	write_lock(&p->fs->lock);
-	lock_task_sighand(p, &flags);
+	rcu_read_lock();
 	for (t = next_thread(p); t != p; t = next_thread(t)) {
 		if (t->fs == p->fs)
 			n_fs++;
 	}
+	rcu_read_unlock();
 
 	if (p->fs->users > n_fs) {
 		bprm->unsafe |= LSM_UNSAFE_SHARE;
@@ -1083,8 +1083,6 @@ int check_unsafe_exec(struct linux_binprm *bprm)
 			res = 1;
 		}
 	}
-
-	unlock_task_sighand(p, &flags);
 	write_unlock(&p->fs->lock);
 
 	return res;
-- 
cgit v1.1


From 485c26ec70f823f2a9cf45982b724893e53a859e Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 24 Apr 2009 13:43:20 -0400
Subject: ext4: Fix softlockup caused by illegal i_file_acl value in on-disk
 inode

If the block containing external extended attributes (which is stored
in i_file_acl and i_file_acl_high) is larger than the on-disk
filesystem, the process which tried to access the extended attributes
will endlessly issue kernel printks complaining that
"__find_get_block_slow() failed", locking up that CPU until the system
is forcibly rebooted.

So when we read in the inode, make sure the i_file_acl value is legal,
and if not, flag the filesystem as being corrupted.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c6bd6ce..cab75bb 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4409,7 +4409,17 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 			(__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
 	}
 
-	if (ei->i_flags & EXT4_EXTENTS_FL) {
+	if (ei->i_file_acl &&
+	    ((ei->i_file_acl < 
+	      (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
+	       EXT4_SB(sb)->s_gdb_count)) ||
+	     (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) {
+		ext4_error(sb, __func__,
+			   "bad extended attribute block %llu in inode #%lu",
+			   ei->i_file_acl, inode->i_ino);
+		ret = -EIO;
+		goto bad_inode;
+	} else if (ei->i_flags & EXT4_EXTENTS_FL) {
 		/* Validate extent which is part of inode */
 		ret = ext4_ext_check_inode(inode);
  	} else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
-- 
cgit v1.1


From 97e728d4353f38c87bf0804cdfd79a9b13fc2c3e Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Tue, 21 Apr 2009 17:40:57 -0400
Subject: Btrfs: try to keep a healthy ratio of metadata vs data block groups

This patch makes the chunk allocator keep a good ratio of metadata vs data
block groups.  By default for every 8 data block groups, we'll allocate 1
metadata chunk, or about 12% of the disk will be allocated for metadata.  This
can be changed by specifying the metadata_ratio mount option.

This is simply the number of data block groups that have to be allocated to
force a metadata chunk allocation.  By making sure we allocate metadata chunks
more often, we are less likely to get into situations where the whole disk
has been allocated as data block groups.

Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  3 +++
 fs/btrfs/disk-io.c     |  1 +
 fs/btrfs/extent-tree.c | 28 +++++++++++++++++++++++++++-
 fs/btrfs/super.c       | 12 +++++++++++-
 4 files changed, 42 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ad96495..213535f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -881,6 +881,9 @@ struct btrfs_fs_info {
 	u64 metadata_alloc_profile;
 	u64 system_alloc_profile;
 
+	unsigned data_chunk_allocations;
+	unsigned metadata_ratio;
+
 	void *bdev_holder;
 };
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a6b8374..44c94d8 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1604,6 +1604,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->btree_inode = new_inode(sb);
 	fs_info->btree_inode->i_ino = 1;
 	fs_info->btree_inode->i_nlink = 1;
+	fs_info->metadata_ratio = 8;
 
 	fs_info->thread_pool_size = min_t(unsigned long,
 					  num_online_cpus() + 2, 8);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 178df4c..2895a83 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1918,15 +1918,29 @@ void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
 	spin_unlock(&info->lock);
 }
 
+static void force_metadata_allocation(struct btrfs_fs_info *info)
+{
+	struct list_head *head = &info->space_info;
+	struct btrfs_space_info *found;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(found, head, list) {
+		if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
+			found->force_alloc = 1;
+	}
+	rcu_read_unlock();
+}
+
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *extent_root, u64 alloc_bytes,
 			  u64 flags, int force)
 {
 	struct btrfs_space_info *space_info;
+	struct btrfs_fs_info *fs_info = extent_root->fs_info;
 	u64 thresh;
 	int ret = 0;
 
-	mutex_lock(&extent_root->fs_info->chunk_mutex);
+	mutex_lock(&fs_info->chunk_mutex);
 
 	flags = btrfs_reduce_alloc_profile(extent_root, flags);
 
@@ -1958,6 +1972,18 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 	}
 	spin_unlock(&space_info->lock);
 
+	/*
+	 * if we're doing a data chunk, go ahead and make sure that
+	 * we keep a reasonable number of metadata chunks allocated in the
+	 * FS as well.
+	 */
+	if (flags & BTRFS_BLOCK_GROUP_DATA) {
+		fs_info->data_chunk_allocations++;
+		if (!(fs_info->data_chunk_allocations %
+		      fs_info->metadata_ratio))
+			force_metadata_allocation(fs_info);
+	}
+
 	ret = btrfs_alloc_chunk(trans, extent_root, flags);
 	if (ret)
 		space_info->full = 1;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9744af9..30c9a8c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -68,7 +68,7 @@ enum {
 	Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
 	Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
 	Opt_ssd, Opt_thread_pool, Opt_noacl,  Opt_compress, Opt_notreelog,
-	Opt_flushoncommit, Opt_err,
+	Opt_ratio, Opt_flushoncommit, Opt_err,
 };
 
 static match_table_t tokens = {
@@ -87,6 +87,7 @@ static match_table_t tokens = {
 	{Opt_noacl, "noacl"},
 	{Opt_notreelog, "notreelog"},
 	{Opt_flushoncommit, "flushoncommit"},
+	{Opt_ratio, "metadata_ratio=%d"},
 	{Opt_err, NULL},
 };
 
@@ -234,6 +235,15 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 			printk(KERN_INFO "btrfs: turning on flush-on-commit\n");
 			btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT);
 			break;
+		case Opt_ratio:
+			intarg = 0;
+			match_int(&args[0], &intarg);
+			if (intarg) {
+				info->metadata_ratio = intarg;
+				printk(KERN_INFO "btrfs: metadata ratio %d\n",
+				       info->metadata_ratio);
+			}
+			break;
 		default:
 			break;
 		}
-- 
cgit v1.1


From 2ea2544ef5dad5cac52f1e4c7b812631274fc1cb Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 13 Apr 2009 15:32:28 +0200
Subject: Btrfs: simplify makefile

Get rid of the hacks for building out of tree, and always use += for
assigning to the object lists.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 9adf5e4..9421284 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -1,25 +1,10 @@
-ifneq ($(KERNELRELEASE),)
-# kbuild part of makefile
 
 obj-$(CONFIG_BTRFS_FS) := btrfs.o
-btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
+
+btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   file-item.o inode-item.o inode-map.o disk-io.o \
 	   transaction.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
 	   ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \
 	   compression.o delayed-ref.o
-else
-
-# Normal Makefile
-
-KERNELDIR := /lib/modules/`uname -r`/build
-all:
-	$(MAKE) -C $(KERNELDIR) M=`pwd` CONFIG_BTRFS_FS=m modules
-
-modules_install:
-	$(MAKE) -C $(KERNELDIR) M=`pwd` modules_install
-clean:
-	$(MAKE) -C $(KERNELDIR) M=`pwd` clean
-
-endif
-- 
cgit v1.1


From 0d4bf11e5309eff64272a49e1ea55658372abc56 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 13 Apr 2009 15:33:56 +0200
Subject: Btrfs: don't export symbols

Currently the extent_map code is only for btrfs so don't export it's
symbols.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_map.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index b187917..9827fa1 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -43,7 +43,6 @@ void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
 	tree->map.rb_node = NULL;
 	spin_lock_init(&tree->lock);
 }
-EXPORT_SYMBOL(extent_map_tree_init);
 
 /**
  * alloc_extent_map - allocate new extent map structure
@@ -64,7 +63,6 @@ struct extent_map *alloc_extent_map(gfp_t mask)
 	atomic_set(&em->refs, 1);
 	return em;
 }
-EXPORT_SYMBOL(alloc_extent_map);
 
 /**
  * free_extent_map - drop reference count of an extent_map
@@ -83,7 +81,6 @@ void free_extent_map(struct extent_map *em)
 		kmem_cache_free(extent_map_cache, em);
 	}
 }
-EXPORT_SYMBOL(free_extent_map);
 
 static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
 				   struct rb_node *node)
@@ -264,7 +261,6 @@ int add_extent_mapping(struct extent_map_tree *tree,
 out:
 	return ret;
 }
-EXPORT_SYMBOL(add_extent_mapping);
 
 /* simple helper to do math around the end of an extent, handling wrap */
 static u64 range_end(u64 start, u64 len)
@@ -326,7 +322,6 @@ found:
 out:
 	return em;
 }
-EXPORT_SYMBOL(lookup_extent_mapping);
 
 /**
  * remove_extent_mapping - removes an extent_map from the extent tree
@@ -346,4 +341,3 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
 	em->in_tree = 0;
 	return ret;
 }
-EXPORT_SYMBOL(remove_extent_mapping);
-- 
cgit v1.1


From 9601e3f6336f6ca66929f451b1f66085e68e36e3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 13 Apr 2009 15:33:09 +0200
Subject: Btrfs: kill btrfs_cache_create

Just use kmem_cache_create directly.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c  | 18 ++++++------------
 fs/btrfs/extent_map.c | 11 +++--------
 fs/btrfs/inode.c      | 42 +++++++++++++++++++-----------------------
 3 files changed, 28 insertions(+), 43 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 05a1c42..c33b540 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -17,12 +17,6 @@
 #include "ctree.h"
 #include "btrfs_inode.h"
 
-/* temporary define until extent_map moves out of btrfs */
-struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
-				       unsigned long extra_flags,
-				       void (*ctor)(void *, struct kmem_cache *,
-						    unsigned long));
-
 static struct kmem_cache *extent_state_cache;
 static struct kmem_cache *extent_buffer_cache;
 
@@ -58,15 +52,15 @@ struct extent_page_data {
 
 int __init extent_io_init(void)
 {
-	extent_state_cache = btrfs_cache_create("extent_state",
-					    sizeof(struct extent_state), 0,
-					    NULL);
+	extent_state_cache = kmem_cache_create("extent_state",
+			sizeof(struct extent_state), 0,
+			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
 	if (!extent_state_cache)
 		return -ENOMEM;
 
-	extent_buffer_cache = btrfs_cache_create("extent_buffers",
-					    sizeof(struct extent_buffer), 0,
-					    NULL);
+	extent_buffer_cache = kmem_cache_create("extent_buffers",
+			sizeof(struct extent_buffer), 0,
+			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
 	if (!extent_buffer_cache)
 		goto free_state_cache;
 	return 0;
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 9827fa1..30c9365 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -6,19 +6,14 @@
 #include <linux/hardirq.h>
 #include "extent_map.h"
 
-/* temporary define until extent_map moves out of btrfs */
-struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
-				       unsigned long extra_flags,
-				       void (*ctor)(void *, struct kmem_cache *,
-						    unsigned long));
 
 static struct kmem_cache *extent_map_cache;
 
 int __init extent_map_init(void)
 {
-	extent_map_cache = btrfs_cache_create("extent_map",
-					    sizeof(struct extent_map), 0,
-					    NULL);
+	extent_map_cache = kmem_cache_create("extent_map",
+			sizeof(struct extent_map), 0,
+			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
 	if (!extent_map_cache)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 65219f6..176b6cc 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4640,39 +4640,35 @@ void btrfs_destroy_cachep(void)
 		kmem_cache_destroy(btrfs_path_cachep);
 }
 
-struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
-				       unsigned long extra_flags,
-				       void (*ctor)(void *))
-{
-	return kmem_cache_create(name, size, 0, (SLAB_RECLAIM_ACCOUNT |
-				 SLAB_MEM_SPREAD | extra_flags), ctor);
-}
-
 int btrfs_init_cachep(void)
 {
-	btrfs_inode_cachep = btrfs_cache_create("btrfs_inode_cache",
-					  sizeof(struct btrfs_inode),
-					  0, init_once);
+	btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache",
+			sizeof(struct btrfs_inode), 0,
+			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
 	if (!btrfs_inode_cachep)
 		goto fail;
-	btrfs_trans_handle_cachep =
-			btrfs_cache_create("btrfs_trans_handle_cache",
-					   sizeof(struct btrfs_trans_handle),
-					   0, NULL);
+
+	btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache",
+			sizeof(struct btrfs_trans_handle), 0,
+			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
 	if (!btrfs_trans_handle_cachep)
 		goto fail;
-	btrfs_transaction_cachep = btrfs_cache_create("btrfs_transaction_cache",
-					     sizeof(struct btrfs_transaction),
-					     0, NULL);
+
+	btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache",
+			sizeof(struct btrfs_transaction), 0,
+			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
 	if (!btrfs_transaction_cachep)
 		goto fail;
-	btrfs_path_cachep = btrfs_cache_create("btrfs_path_cache",
-					 sizeof(struct btrfs_path),
-					 0, NULL);
+
+	btrfs_path_cachep = kmem_cache_create("btrfs_path_cache",
+			sizeof(struct btrfs_path), 0,
+			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
 	if (!btrfs_path_cachep)
 		goto fail;
-	btrfs_bit_radix_cachep = btrfs_cache_create("btrfs_radix", 256,
-					      SLAB_DESTROY_BY_RCU, NULL);
+
+	btrfs_bit_radix_cachep = kmem_cache_create("btrfs_radix", 256, 0,
+			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD |
+			SLAB_DESTROY_BY_RCU, NULL);
 	if (!btrfs_bit_radix_cachep)
 		goto fail;
 	return 0;
-- 
cgit v1.1


From e980b50cda1610f1c17978d9b7fd311a9dd93877 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 24 Apr 2009 14:39:24 -0400
Subject: Btrfs: fix fallocate deadlock on inode extent lock

The btrfs fallocate call takes an extent lock on the entire range
being fallocated, and then runs through insert_reserved_extent on each
extent as they are allocated.

The problem with this is that btrfs_drop_extents may decide to try
and take the same extent lock fallocate was already holding.  The solution
used here is to push down knowledge of the range that is already locked
going into btrfs_drop_extents.

It turns out that at least one other caller had the same bug.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h    |  3 ++-
 fs/btrfs/file.c     | 11 ++++++-----
 fs/btrfs/inode.c    | 27 ++++++++++++++++++---------
 fs/btrfs/ioctl.c    |  3 ++-
 fs/btrfs/tree-log.c |  2 +-
 5 files changed, 29 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 213535f..4414a5d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2177,7 +2177,8 @@ int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
 extern struct file_operations btrfs_file_operations;
 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct inode *inode,
-		       u64 start, u64 end, u64 inline_limit, u64 *hint_block);
+		       u64 start, u64 end, u64 locked_end,
+		       u64 inline_limit, u64 *hint_block);
 int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      struct inode *inode, u64 start, u64 end);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 482f8db..da3ed96 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -363,15 +363,16 @@ out:
  */
 noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct inode *inode,
-		       u64 start, u64 end, u64 inline_limit, u64 *hint_byte)
+		       u64 start, u64 end, u64 locked_end,
+		       u64 inline_limit, u64 *hint_byte)
 {
 	u64 extent_end = 0;
-	u64 locked_end = end;
 	u64 search_start = start;
 	u64 leaf_start;
 	u64 ram_bytes = 0;
 	u64 orig_parent = 0;
 	u64 disk_bytenr = 0;
+	u64 orig_locked_end = locked_end;
 	u8 compression;
 	u8 encryption;
 	u16 other_encoding = 0;
@@ -684,9 +685,9 @@ next_slot:
 	}
 out:
 	btrfs_free_path(path);
-	if (locked_end > end) {
-		unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1,
-			      GFP_NOFS);
+	if (locked_end > orig_locked_end) {
+		unlock_extent(&BTRFS_I(inode)->io_tree, orig_locked_end,
+			      locked_end - 1, GFP_NOFS);
 	}
 	btrfs_check_file(root, inode);
 	return ret;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 176b6cc..2fdb299 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -234,7 +234,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
 	}
 
 	ret = btrfs_drop_extents(trans, root, inode, start,
-				 aligned_end, start, &hint_byte);
+				 aligned_end, aligned_end, start, &hint_byte);
 	BUG_ON(ret);
 
 	if (isize > actual_end)
@@ -1439,6 +1439,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 				       struct inode *inode, u64 file_pos,
 				       u64 disk_bytenr, u64 disk_num_bytes,
 				       u64 num_bytes, u64 ram_bytes,
+				       u64 locked_end,
 				       u8 compression, u8 encryption,
 				       u16 other_encoding, int extent_type)
 {
@@ -1455,7 +1456,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 
 	path->leave_spinning = 1;
 	ret = btrfs_drop_extents(trans, root, inode, file_pos,
-				 file_pos + num_bytes, file_pos, &hint);
+				 file_pos + num_bytes, locked_end,
+				 file_pos, &hint);
 	BUG_ON(ret);
 
 	ins.objectid = inode->i_ino;
@@ -1590,6 +1592,8 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 						ordered_extent->disk_len,
 						ordered_extent->len,
 						ordered_extent->len,
+						ordered_extent->file_offset +
+						ordered_extent->len,
 						compressed, 0, 0,
 						BTRFS_FILE_EXTENT_REG);
 		BUG_ON(ret);
@@ -2877,6 +2881,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
 			err = btrfs_drop_extents(trans, root, inode,
 						 cur_offset,
 						 cur_offset + hole_size,
+						 block_end,
 						 cur_offset, &hint_byte);
 			if (err)
 				break;
@@ -4968,7 +4973,7 @@ out_fail:
 
 static int prealloc_file_range(struct btrfs_trans_handle *trans,
 			       struct inode *inode, u64 start, u64 end,
-			       u64 alloc_hint, int mode)
+			       u64 locked_end, u64 alloc_hint, int mode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_key ins;
@@ -4989,7 +4994,8 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans,
 		ret = insert_reserved_file_extent(trans, inode,
 						  cur_offset, ins.objectid,
 						  ins.offset, ins.offset,
-						  ins.offset, 0, 0, 0,
+						  ins.offset, locked_end,
+						  0, 0, 0,
 						  BTRFS_FILE_EXTENT_PREALLOC);
 		BUG_ON(ret);
 		num_bytes -= ins.offset;
@@ -5018,6 +5024,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 	u64 alloc_start;
 	u64 alloc_end;
 	u64 alloc_hint = 0;
+	u64 locked_end;
 	u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
 	struct extent_map *em;
 	struct btrfs_trans_handle *trans;
@@ -5039,6 +5046,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 			goto out;
 	}
 
+	locked_end = alloc_end - 1;
 	while (1) {
 		struct btrfs_ordered_extent *ordered;
 
@@ -5051,8 +5059,8 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 		/* the extent lock is ordered inside the running
 		 * transaction
 		 */
-		lock_extent(&BTRFS_I(inode)->io_tree, alloc_start,
-			    alloc_end - 1, GFP_NOFS);
+		lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
+			    GFP_NOFS);
 		ordered = btrfs_lookup_first_ordered_extent(inode,
 							    alloc_end - 1);
 		if (ordered &&
@@ -5060,7 +5068,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 		    ordered->file_offset < alloc_end) {
 			btrfs_put_ordered_extent(ordered);
 			unlock_extent(&BTRFS_I(inode)->io_tree,
-				      alloc_start, alloc_end - 1, GFP_NOFS);
+				      alloc_start, locked_end, GFP_NOFS);
 			btrfs_end_transaction(trans, BTRFS_I(inode)->root);
 
 			/*
@@ -5085,7 +5093,8 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 		last_byte = (last_byte + mask) & ~mask;
 		if (em->block_start == EXTENT_MAP_HOLE) {
 			ret = prealloc_file_range(trans, inode, cur_offset,
-					last_byte, alloc_hint, mode);
+					last_byte, locked_end + 1,
+					alloc_hint, mode);
 			if (ret < 0) {
 				free_extent_map(em);
 				break;
@@ -5101,7 +5110,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 			break;
 		}
 	}
-	unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1,
+	unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
 		      GFP_NOFS);
 
 	btrfs_end_transaction(trans, BTRFS_I(inode)->root);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 7594bec..f4e5d2e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -830,7 +830,8 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 	BUG_ON(!trans);
 
 	/* punch hole in destination first */
-	btrfs_drop_extents(trans, root, inode, off, off+len, 0, &hint_byte);
+	btrfs_drop_extents(trans, root, inode, off, off + len,
+			   off + len, 0, &hint_byte);
 
 	/* clone data */
 	key.objectid = src->i_ino;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 25f20ea..db5e212 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -536,7 +536,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 	saved_nbytes = inode_get_bytes(inode);
 	/* drop any overlapping extents */
 	ret = btrfs_drop_extents(trans, root, inode,
-			 start, extent_end, start, &alloc_hint);
+			 start, extent_end, extent_end, start, &alloc_hint);
 	BUG_ON(ret);
 
 	if (found_type == BTRFS_FILE_EXTENT_REG ||
-- 
cgit v1.1


From 59bc5c758ece00fb0b2a170dd8fbbf31f1856c8a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 24 Apr 2009 14:39:25 -0400
Subject: Btrfs: fix deadlocks and stalls on dead root removal

After a transaction commit, the old root of the subvol btrees are sent through
snapshot removal.  This is what actually frees up any blocks replaced by
COW, and anything the old blocks pointed to.

Snapshot deletion will pause when a transaction commit has started, which
helps to avoid a huge amount of delayed reference count updates piling up
as the transaction is trying to close.

But, this pause happens after the snapshot deletion process has asked other
procs on the system to throttle back a bit so that it can make progress.

We don't want to throttle everyone while we're waiting for the transaction
commit, it leads to deadlocks in the user transaction ioctls used by Ceph
and makes things slower in general.

This patch changes things to avoid the throttling while we sleep.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/transaction.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 2869b33..01b1436 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -687,7 +687,13 @@ static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info)
 		prepare_to_wait(&info->transaction_wait, &wait,
 				TASK_UNINTERRUPTIBLE);
 		mutex_unlock(&info->trans_mutex);
+
+		atomic_dec(&info->throttles);
+		wake_up(&info->transaction_throttle);
+
 		schedule();
+
+		atomic_inc(&info->throttles);
 		mutex_lock(&info->trans_mutex);
 		finish_wait(&info->transaction_wait, &wait);
 	}
-- 
cgit v1.1


From a9e817425dc0baede8ebe5fbc9984a640257432b Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 24 Apr 2009 16:11:18 -0400
Subject: ext4: Ignore i_file_acl_high unless EXT4_FEATURE_INCOMPAT_64BIT is
 present

Don't try to look at i_file_acl_high unless the INCOMPAT_64BIT feature
bit is set.  The field is normally zero, but older versions of e2fsck
didn't automatically check to make sure of this, so in the spirit of
"be liberal in what you accept", don't look at i_file_acl_high unless
we are using a 64-bit filesystem.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index cab75bb..1146003 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4357,11 +4357,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	ei->i_flags = le32_to_cpu(raw_inode->i_flags);
 	inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
 	ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
-	if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
-	    cpu_to_le32(EXT4_OS_HURD)) {
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
 		ei->i_file_acl |=
 			((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
-	}
 	inode->i_size = ext4_isize(raw_inode);
 	ei->i_disksize = inode->i_size;
 	inode->i_generation = le32_to_cpu(raw_inode->i_generation);
-- 
cgit v1.1


From 84baf74bf23bbe9f3deafb5d2f27e2b5dc0bc052 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 23 Apr 2009 16:41:13 +0100
Subject: ROMFS: romfs_lookup() shouldn't be doing a partial name comparison

romfs_lookup() should be using a routine akin to strcmp() on the backing store,
rather than one akin to strncmp().  If it uses the latter, it's liable to match
/bin/shutdown when looking up /bin/sh.

Signed-off-by: David Howells <dhowells@redhat.com>
Tested-by: Michal Simek <monstr@monstr.eu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/romfs/internal.h |  4 ++--
 fs/romfs/storage.c  | 67 +++++++++++++++++++++++++++++++++++++++--------------
 fs/romfs/super.c    |  4 ++--
 3 files changed, 53 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/romfs/internal.h b/fs/romfs/internal.h
index 06044a9..95217b8 100644
--- a/fs/romfs/internal.h
+++ b/fs/romfs/internal.h
@@ -43,5 +43,5 @@ extern int romfs_dev_read(struct super_block *sb, unsigned long pos,
 			  void *buf, size_t buflen);
 extern ssize_t romfs_dev_strnlen(struct super_block *sb,
 				 unsigned long pos, size_t maxlen);
-extern int romfs_dev_strncmp(struct super_block *sb, unsigned long pos,
-			     const char *str, size_t size);
+extern int romfs_dev_strcmp(struct super_block *sb, unsigned long pos,
+			    const char *str, size_t size);
diff --git a/fs/romfs/storage.c b/fs/romfs/storage.c
index 7e3e1e1..66ce9dd 100644
--- a/fs/romfs/storage.c
+++ b/fs/romfs/storage.c
@@ -67,26 +67,35 @@ static ssize_t romfs_mtd_strnlen(struct super_block *sb,
  * compare a string to one in a romfs image on MTD
  * - return 1 if matched, 0 if differ, -ve if error
  */
-static int romfs_mtd_strncmp(struct super_block *sb, unsigned long pos,
-			     const char *str, size_t size)
+static int romfs_mtd_strcmp(struct super_block *sb, unsigned long pos,
+			    const char *str, size_t size)
 {
-	u_char buf[16];
+	u_char buf[17];
 	size_t len, segment;
 	int ret;
 
-	/* scan the string up to 16 bytes at a time */
+	/* scan the string up to 16 bytes at a time, and attempt to grab the
+	 * trailing NUL whilst we're at it */
+	buf[0] = 0xff;
+
 	while (size > 0) {
-		segment = min_t(size_t, size, 16);
+		segment = min_t(size_t, size + 1, 17);
 		ret = ROMFS_MTD_READ(sb, pos, segment, &len, buf);
 		if (ret < 0)
 			return ret;
+		len--;
 		if (memcmp(buf, str, len) != 0)
 			return 0;
+		buf[0] = buf[len];
 		size -= len;
 		pos += len;
 		str += len;
 	}
 
+	/* check the trailing NUL was */
+	if (buf[0])
+		return 0;
+
 	return 1;
 }
 #endif /* CONFIG_ROMFS_ON_MTD */
@@ -154,28 +163,48 @@ static ssize_t romfs_blk_strnlen(struct super_block *sb,
  * compare a string to one in a romfs image on a block device
  * - return 1 if matched, 0 if differ, -ve if error
  */
-static int romfs_blk_strncmp(struct super_block *sb, unsigned long pos,
-			     const char *str, size_t size)
+static int romfs_blk_strcmp(struct super_block *sb, unsigned long pos,
+			    const char *str, size_t size)
 {
 	struct buffer_head *bh;
 	unsigned long offset;
 	size_t segment;
-	bool x;
+	bool matched, terminated = false;
 
-	/* scan the string up to 16 bytes at a time */
+	/* compare string up to a block at a time */
 	while (size > 0) {
 		offset = pos & (ROMBSIZE - 1);
 		segment = min_t(size_t, size, ROMBSIZE - offset);
 		bh = sb_bread(sb, pos >> ROMBSBITS);
 		if (!bh)
 			return -EIO;
-		x = (memcmp(bh->b_data + offset, str, segment) != 0);
-		brelse(bh);
-		if (x)
-			return 0;
+		matched = (memcmp(bh->b_data + offset, str, segment) == 0);
+
 		size -= segment;
 		pos += segment;
 		str += segment;
+		if (matched && size == 0 && offset + segment < ROMBSIZE) {
+			if (!bh->b_data[offset + segment])
+				terminated = true;
+			else
+				matched = false;
+		}
+		brelse(bh);
+		if (!matched)
+			return 0;
+	}
+
+	if (!terminated) {
+		/* the terminating NUL must be on the first byte of the next
+		 * block */
+		BUG_ON((pos & (ROMBSIZE - 1)) != 0);
+		bh = sb_bread(sb, pos >> ROMBSBITS);
+		if (!bh)
+			return -EIO;
+		matched = !bh->b_data[0];
+		brelse(bh);
+		if (!matched)
+			return 0;
 	}
 
 	return 1;
@@ -234,10 +263,12 @@ ssize_t romfs_dev_strnlen(struct super_block *sb,
 
 /*
  * compare a string to one in romfs
+ * - the string to be compared to, str, may not be NUL-terminated; instead the
+ *   string is of the specified size
  * - return 1 if matched, 0 if differ, -ve if error
  */
-int romfs_dev_strncmp(struct super_block *sb, unsigned long pos,
-		      const char *str, size_t size)
+int romfs_dev_strcmp(struct super_block *sb, unsigned long pos,
+		     const char *str, size_t size)
 {
 	size_t limit;
 
@@ -246,16 +277,16 @@ int romfs_dev_strncmp(struct super_block *sb, unsigned long pos,
 		return -EIO;
 	if (size > ROMFS_MAXFN)
 		return -ENAMETOOLONG;
-	if (size > limit - pos)
+	if (size + 1 > limit - pos)
 		return -EIO;
 
 #ifdef CONFIG_ROMFS_ON_MTD
 	if (sb->s_mtd)
-		return romfs_mtd_strncmp(sb, pos, str, size);
+		return romfs_mtd_strcmp(sb, pos, str, size);
 #endif
 #ifdef CONFIG_ROMFS_ON_BLOCK
 	if (sb->s_bdev)
-		return romfs_blk_strncmp(sb, pos, str, size);
+		return romfs_blk_strcmp(sb, pos, str, size);
 #endif
 	return -EIO;
 }
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 10ca7d9..c53b5ef 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -240,8 +240,8 @@ static struct dentry *romfs_lookup(struct inode *dir, struct dentry *dentry,
 			goto error;
 
 		/* try to match the first 16 bytes of name */
-		ret = romfs_dev_strncmp(dir->i_sb, offset + ROMFH_SIZE, name,
-					len);
+		ret = romfs_dev_strcmp(dir->i_sb, offset + ROMFH_SIZE, name,
+				       len);
 		if (ret < 0)
 			goto error;
 		if (ret == 1)
-- 
cgit v1.1


From 4b2b0b9753194cad44d7295c32044b89710efd70 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 23 Apr 2009 16:41:18 +0100
Subject: ROMFS: Advance destination buffer pointer when reading from a
 blockdev

RomFS should advance the destination buffer pointer when reading data from a
blockdev source (the data may be split over multiple blocks, each requiring its
own sb_read() call).  Without this, all the data is copied to the beginning of
the output buffer.

Signed-off-by: David Howells <dhowells@redhat.com>
Tested-by: Michal Simek <monstr@monstr.eu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/romfs/storage.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/romfs/storage.c b/fs/romfs/storage.c
index 66ce9dd..b3208ad 100644
--- a/fs/romfs/storage.c
+++ b/fs/romfs/storage.c
@@ -120,6 +120,7 @@ static int romfs_blk_read(struct super_block *sb, unsigned long pos,
 			return -EIO;
 		memcpy(buf, bh->b_data + offset, segment);
 		brelse(bh);
+		buf += segment;
 		buflen -= segment;
 		pos += segment;
 	}
-- 
cgit v1.1


From c4b5a614316c505922a522b2e35ba05ea3e08a7c Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 24 Apr 2009 18:45:35 -0400
Subject: ext4: Do not try to validate extents on special files

The EXTENTS_FL flag should never be set on special files, but if it
is, don't bother trying to validate that the extents tree is valid,
since only files, directories, and non-fast symlinks will ever have an
extent data structure.  We perhaps should flag the filesystem as being
corrupted if we see a special file (named pipes, device nodes, Unix
domain sockets, etc.) with the EXTENTS_FL flag, but e2fsck doesn't
currently check this case, so we'll just ignore this for now, since
it's harmless.

Without this fix, a special device with the extents flag is flagged as
an error by the kernel, so it is impossible to access or delete the
inode, but e2fsck doesn't see it as a problem, leading to
confused/frustrated users.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 1146003..e91f978 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4407,6 +4407,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 			(__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
 	}
 
+	ret = 0;
 	if (ei->i_file_acl &&
 	    ((ei->i_file_acl < 
 	      (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
@@ -4418,8 +4419,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 		ret = -EIO;
 		goto bad_inode;
 	} else if (ei->i_flags & EXT4_EXTENTS_FL) {
-		/* Validate extent which is part of inode */
-		ret = ext4_ext_check_inode(inode);
+		if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+		    (S_ISLNK(inode->i_mode) &&
+		     !ext4_inode_is_fast_symlink(inode)))
+			/* Validate extent which is part of inode */
+			ret = ext4_ext_check_inode(inode);
  	} else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
 		   (S_ISLNK(inode->i_mode) &&
 		    !ext4_inode_is_fast_symlink(inode))) {
-- 
cgit v1.1


From d6397baee468809ef311e763dfc6e9f73418f8a6 Mon Sep 17 00:00:00 2001
From: Chris Ball <cjb@laptop.org>
Date: Mon, 27 Apr 2009 07:29:03 -0400
Subject: Btrfs: When shrinking, only update disk size on success

Previously, we updated a device's size prior to attempting a shrink
operation.  This patch moves the device resizing logic to only happen if
the shrink completes successfully.  In the process, it introduces a new
field to btrfs_device -- disk_total_bytes -- to track the on-disk size.

Signed-off-by: Chris Ball <cjb@laptop.org>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 35 ++++++++++++++++++++++++-----------
 fs/btrfs/volumes.h |  3 +++
 2 files changed, 27 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e53835b..5f01dad 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1543,7 +1543,7 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
 	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
 	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
 	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
-	btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
+	btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes);
 	btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
 	btrfs_mark_buffer_dirty(leaf);
 
@@ -1940,14 +1940,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 	device->total_bytes = new_size;
 	if (device->writeable)
 		device->fs_devices->total_rw_bytes -= diff;
-	ret = btrfs_update_device(trans, device);
-	if (ret) {
-		unlock_chunks(root);
-		btrfs_end_transaction(trans, root);
-		goto done;
-	}
-	WARN_ON(diff > old_total);
-	btrfs_set_super_total_bytes(super_copy, old_total - diff);
 	unlock_chunks(root);
 	btrfs_end_transaction(trans, root);
 
@@ -1979,7 +1971,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 		length = btrfs_dev_extent_length(l, dev_extent);
 
 		if (key.offset + length <= new_size)
-			goto done;
+			break;
 
 		chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
 		chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
@@ -1992,6 +1984,26 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 			goto done;
 	}
 
+	/* Shrinking succeeded, else we would be at "done". */
+	trans = btrfs_start_transaction(root, 1);
+	if (!trans) {
+		ret = -ENOMEM;
+		goto done;
+	}
+	lock_chunks(root);
+
+	device->disk_total_bytes = new_size;
+	/* Now btrfs_update_device() will change the on-disk size. */
+	ret = btrfs_update_device(trans, device);
+	if (ret) {
+		unlock_chunks(root);
+		btrfs_end_transaction(trans, root);
+		goto done;
+	}
+	WARN_ON(diff > old_total);
+	btrfs_set_super_total_bytes(super_copy, old_total - diff);
+	unlock_chunks(root);
+	btrfs_end_transaction(trans, root);
 done:
 	btrfs_free_path(path);
 	return ret;
@@ -3076,7 +3088,8 @@ static int fill_device_from_item(struct extent_buffer *leaf,
 	unsigned long ptr;
 
 	device->devid = btrfs_device_id(leaf, dev_item);
-	device->total_bytes = btrfs_device_total_bytes(leaf, dev_item);
+	device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
+	device->total_bytes = device->disk_total_bytes;
 	device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
 	device->type = btrfs_device_type(leaf, dev_item);
 	device->io_align = btrfs_device_io_align(leaf, dev_item);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 5836327..5c3ff6d 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -61,6 +61,9 @@ struct btrfs_device {
 	/* size of the device */
 	u64 total_bytes;
 
+	/* size of the disk */
+	u64 disk_total_bytes;
+
 	/* bytes used */
 	u64 bytes_used;
 
-- 
cgit v1.1


From b7967db75a38df4891b22efe1b0969b9357eb946 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 27 Apr 2009 07:29:04 -0400
Subject: Btrfs: remove #if 0 code

Btrfs had some old code sitting around under #if 0, this drops it.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c   | 48 +-------------------------------
 fs/btrfs/extent_io.c | 63 ------------------------------------------
 fs/btrfs/file.c      | 78 ----------------------------------------------------
 3 files changed, 1 insertion(+), 188 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 44c94d8..77f9a3b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -584,18 +584,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 		btrfs_set_work_high_prio(&async->work);
 
 	btrfs_queue_worker(&fs_info->workers, &async->work);
-#if 0
-	int limit = btrfs_async_submit_limit(fs_info);
-	if (atomic_read(&fs_info->nr_async_submits) > limit) {
-		wait_event_timeout(fs_info->async_submit_wait,
-			   (atomic_read(&fs_info->nr_async_submits) < limit),
-			   HZ/10);
 
-		wait_event_timeout(fs_info->async_submit_wait,
-			   (atomic_read(&fs_info->nr_async_bios) < limit),
-			   HZ/10);
-	}
-#endif
 	while (atomic_read(&fs_info->async_submit_draining) &&
 	      atomic_read(&fs_info->nr_async_submits)) {
 		wait_event(fs_info->async_submit_wait,
@@ -770,27 +759,6 @@ static void btree_invalidatepage(struct page *page, unsigned long offset)
 	}
 }
 
-#if 0
-static int btree_writepage(struct page *page, struct writeback_control *wbc)
-{
-	struct buffer_head *bh;
-	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
-	struct buffer_head *head;
-	if (!page_has_buffers(page)) {
-		create_empty_buffers(page, root->fs_info->sb->s_blocksize,
-					(1 << BH_Dirty)|(1 << BH_Uptodate));
-	}
-	head = page_buffers(page);
-	bh = head;
-	do {
-		if (buffer_dirty(bh))
-			csum_tree_block(root, bh, 0);
-		bh = bh->b_this_page;
-	} while (bh != head);
-	return block_write_full_page(page, btree_get_block, wbc);
-}
-#endif
-
 static struct address_space_operations btree_aops = {
 	.readpage	= btree_readpage,
 	.writepage	= btree_writepage,
@@ -1278,11 +1246,7 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
 	int ret = 0;
 	struct btrfs_device *device;
 	struct backing_dev_info *bdi;
-#if 0
-	if ((bdi_bits & (1 << BDI_write_congested)) &&
-	    btrfs_congested_async(info, 0))
-		return 1;
-#endif
+
 	list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
 		if (!device->bdev)
 			continue;
@@ -2334,16 +2298,6 @@ int close_ctree(struct btrfs_root *root)
 	btrfs_stop_workers(&fs_info->endio_write_workers);
 	btrfs_stop_workers(&fs_info->submit_workers);
 
-#if 0
-	while (!list_empty(&fs_info->hashers)) {
-		struct btrfs_hasher *hasher;
-		hasher = list_entry(fs_info->hashers.next, struct btrfs_hasher,
-				    hashers);
-		list_del(&hasher->hashers);
-		crypto_free_hash(&fs_info->hash_tfm);
-		kfree(hasher);
-	}
-#endif
 	btrfs_close_devices(fs_info->fs_devices);
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
 
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c33b540..fe9eb99 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1401,69 +1401,6 @@ out:
 	return total_bytes;
 }
 
-#if 0
-/*
- * helper function to lock both pages and extents in the tree.
- * pages must be locked first.
- */
-static int lock_range(struct extent_io_tree *tree, u64 start, u64 end)
-{
-	unsigned long index = start >> PAGE_CACHE_SHIFT;
-	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
-	struct page *page;
-	int err;
-
-	while (index <= end_index) {
-		page = grab_cache_page(tree->mapping, index);
-		if (!page) {
-			err = -ENOMEM;
-			goto failed;
-		}
-		if (IS_ERR(page)) {
-			err = PTR_ERR(page);
-			goto failed;
-		}
-		index++;
-	}
-	lock_extent(tree, start, end, GFP_NOFS);
-	return 0;
-
-failed:
-	/*
-	 * we failed above in getting the page at 'index', so we undo here
-	 * up to but not including the page at 'index'
-	 */
-	end_index = index;
-	index = start >> PAGE_CACHE_SHIFT;
-	while (index < end_index) {
-		page = find_get_page(tree->mapping, index);
-		unlock_page(page);
-		page_cache_release(page);
-		index++;
-	}
-	return err;
-}
-
-/*
- * helper function to unlock both pages and extents in the tree.
- */
-static int unlock_range(struct extent_io_tree *tree, u64 start, u64 end)
-{
-	unsigned long index = start >> PAGE_CACHE_SHIFT;
-	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
-	struct page *page;
-
-	while (index <= end_index) {
-		page = find_get_page(tree->mapping, index);
-		unlock_page(page);
-		page_cache_release(page);
-		index++;
-	}
-	unlock_extent(tree, start, end, GFP_NOFS);
-	return 0;
-}
-#endif
-
 /*
  * set the private field for a given byte offset in the tree.  If there isn't
  * an extent_state there already, this does nothing.
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index da3ed96..1d51dc3 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -272,83 +272,6 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 	return 0;
 }
 
-int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
-{
-	return 0;
-#if 0
-	struct btrfs_path *path;
-	struct btrfs_key found_key;
-	struct extent_buffer *leaf;
-	struct btrfs_file_extent_item *extent;
-	u64 last_offset = 0;
-	int nritems;
-	int slot;
-	int found_type;
-	int ret;
-	int err = 0;
-	u64 extent_end = 0;
-
-	path = btrfs_alloc_path();
-	ret = btrfs_lookup_file_extent(NULL, root, path, inode->i_ino,
-				       last_offset, 0);
-	while (1) {
-		nritems = btrfs_header_nritems(path->nodes[0]);
-		if (path->slots[0] >= nritems) {
-			ret = btrfs_next_leaf(root, path);
-			if (ret)
-				goto out;
-			nritems = btrfs_header_nritems(path->nodes[0]);
-		}
-		slot = path->slots[0];
-		leaf = path->nodes[0];
-		btrfs_item_key_to_cpu(leaf, &found_key, slot);
-		if (found_key.objectid != inode->i_ino)
-			break;
-		if (found_key.type != BTRFS_EXTENT_DATA_KEY)
-			goto out;
-
-		if (found_key.offset < last_offset) {
-			WARN_ON(1);
-			btrfs_print_leaf(root, leaf);
-			printk(KERN_ERR "inode %lu found offset %llu "
-			       "expected %llu\n", inode->i_ino,
-			       (unsigned long long)found_key.offset,
-			       (unsigned long long)last_offset);
-			err = 1;
-			goto out;
-		}
-		extent = btrfs_item_ptr(leaf, slot,
-					struct btrfs_file_extent_item);
-		found_type = btrfs_file_extent_type(leaf, extent);
-		if (found_type == BTRFS_FILE_EXTENT_REG) {
-			extent_end = found_key.offset +
-			     btrfs_file_extent_num_bytes(leaf, extent);
-		} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
-			struct btrfs_item *item;
-			item = btrfs_item_nr(leaf, slot);
-			extent_end = found_key.offset +
-			     btrfs_file_extent_inline_len(leaf, extent);
-			extent_end = (extent_end + root->sectorsize - 1) &
-				~((u64)root->sectorsize - 1);
-		}
-		last_offset = extent_end;
-		path->slots[0]++;
-	}
-	if (0 && last_offset < inode->i_size) {
-		WARN_ON(1);
-		btrfs_print_leaf(root, leaf);
-		printk(KERN_ERR "inode %lu found offset %llu size %llu\n",
-		       inode->i_ino, (unsigned long long)last_offset,
-		       (unsigned long long)inode->i_size);
-		err = 1;
-
-	}
-out:
-	btrfs_free_path(path);
-	return err;
-#endif
-}
-
 /*
  * this is very complex, but the basic idea is to drop all extents
  * in the range start - end.  hint_block is filled in with a block number
@@ -689,7 +612,6 @@ out:
 		unlock_extent(&BTRFS_I(inode)->io_tree, orig_locked_end,
 			      locked_end - 1, GFP_NOFS);
 	}
-	btrfs_check_file(root, inode);
 	return ret;
 }
 
-- 
cgit v1.1


From 193f284d4985db0370a8a1bbdfb20df548cf9ffb Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 27 Apr 2009 07:29:05 -0400
Subject: Btrfs: ratelimit IO error printks

Btrfs has printks for various IO errors, including bad checksums and
mismatches between what we expect the block headers to contain and what
we actually find on the disk.

Longer term we need a real reporting mechanism for this, but for now
printk is going to have to do.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 38 +++++++++++++++++++++++++-------------
 fs/btrfs/inode.c   | 10 ++++++----
 2 files changed, 31 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 77f9a3b..aa0c259 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -232,10 +232,14 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 			memcpy(&found, result, csum_size);
 
 			read_extent_buffer(buf, &val, 0, csum_size);
-			printk(KERN_INFO "btrfs: %s checksum verify failed "
-			       "on %llu wanted %X found %X level %d\n",
-			       root->fs_info->sb->s_id,
-			       buf->start, val, found, btrfs_header_level(buf));
+			if (printk_ratelimit()) {
+				printk(KERN_INFO "btrfs: %s checksum verify "
+				       "failed on %llu wanted %X found %X "
+				       "level %d\n",
+				       root->fs_info->sb->s_id,
+				       (unsigned long long)buf->start, val, found,
+				       btrfs_header_level(buf));
+			}
 			if (result != (char *)&inline_result)
 				kfree(result);
 			return 1;
@@ -268,10 +272,13 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
 		ret = 0;
 		goto out;
 	}
-	printk("parent transid verify failed on %llu wanted %llu found %llu\n",
-	       (unsigned long long)eb->start,
-	       (unsigned long long)parent_transid,
-	       (unsigned long long)btrfs_header_generation(eb));
+	if (printk_ratelimit()) {
+		printk("parent transid verify failed on %llu wanted %llu "
+		       "found %llu\n",
+		       (unsigned long long)eb->start,
+		       (unsigned long long)parent_transid,
+		       (unsigned long long)btrfs_header_generation(eb));
+	}
 	ret = 1;
 	clear_extent_buffer_uptodate(io_tree, eb);
 out:
@@ -415,9 +422,12 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 
 	found_start = btrfs_header_bytenr(eb);
 	if (found_start != start) {
-		printk(KERN_INFO "btrfs bad tree block start %llu %llu\n",
-		       (unsigned long long)found_start,
-		       (unsigned long long)eb->start);
+		if (printk_ratelimit()) {
+			printk(KERN_INFO "btrfs bad tree block start "
+			       "%llu %llu\n",
+			       (unsigned long long)found_start,
+			       (unsigned long long)eb->start);
+		}
 		ret = -EIO;
 		goto err;
 	}
@@ -429,8 +439,10 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 		goto err;
 	}
 	if (check_tree_block_fsid(root, eb)) {
-		printk(KERN_INFO "btrfs bad fsid on block %llu\n",
-		       (unsigned long long)eb->start);
+		if (printk_ratelimit()) {
+			printk(KERN_INFO "btrfs bad fsid on block %llu\n",
+			       (unsigned long long)eb->start);
+		}
 		ret = -EIO;
 		goto err;
 	}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2fdb299..552e08a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1823,10 +1823,12 @@ good:
 	return 0;
 
 zeroit:
-	printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u "
-	       "private %llu\n", page->mapping->host->i_ino,
-	       (unsigned long long)start, csum,
-	       (unsigned long long)private);
+	if (printk_ratelimit()) {
+		printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u "
+		       "private %llu\n", page->mapping->host->i_ino,
+		       (unsigned long long)start, csum,
+		       (unsigned long long)private);
+	}
 	memset(kaddr + offset, 1, end - start + 1);
 	flush_dcache_page(page);
 	kunmap_atomic(kaddr, KM_USER0);
-- 
cgit v1.1


From 45c06543afe2772c02f21efee0e2138b4e1c911e Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 27 Apr 2009 07:49:10 -0400
Subject: Btrfs: remove unused btrfs_bit_radix slab

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 552e08a..98bd506 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -70,7 +70,6 @@ static struct extent_io_ops btrfs_extent_io_ops;
 static struct kmem_cache *btrfs_inode_cachep;
 struct kmem_cache *btrfs_trans_handle_cachep;
 struct kmem_cache *btrfs_transaction_cachep;
-struct kmem_cache *btrfs_bit_radix_cachep;
 struct kmem_cache *btrfs_path_cachep;
 
 #define S_SHIFT 12
@@ -4641,8 +4640,6 @@ void btrfs_destroy_cachep(void)
 		kmem_cache_destroy(btrfs_trans_handle_cachep);
 	if (btrfs_transaction_cachep)
 		kmem_cache_destroy(btrfs_transaction_cachep);
-	if (btrfs_bit_radix_cachep)
-		kmem_cache_destroy(btrfs_bit_radix_cachep);
 	if (btrfs_path_cachep)
 		kmem_cache_destroy(btrfs_path_cachep);
 }
@@ -4673,11 +4670,6 @@ int btrfs_init_cachep(void)
 	if (!btrfs_path_cachep)
 		goto fail;
 
-	btrfs_bit_radix_cachep = kmem_cache_create("btrfs_radix", 256, 0,
-			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD |
-			SLAB_DESTROY_BY_RCU, NULL);
-	if (!btrfs_bit_radix_cachep)
-		goto fail;
 	return 0;
 fail:
 	btrfs_destroy_cachep();
-- 
cgit v1.1


From e63b6a6c0ffa2ebd8617cc1a10969000296831aa Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Tue, 21 Apr 2009 12:38:30 -0700
Subject: Btrfs: Fix a trivial warning using max() of u64 vs ULL.

A small warning popped up on ia64 because inode-map.c was comparing a
u64 object id with the ULL FIRST_FREE_OBJECTID.  My first thought was
that all the OBJECTID constants should contain the u64 cast because
btrfs code deals entirely in u64s.  But then I saw how large that was,
and figured I'd just fix the max() call.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode-map.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index cc7334d..9abbced 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -79,7 +79,7 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
 	}
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	search_start = max(search_start, BTRFS_FIRST_FREE_OBJECTID);
+	search_start = max(search_start, (u64)BTRFS_FIRST_FREE_OBJECTID);
 	search_key.objectid = search_start;
 	search_key.type = 0;
 	search_key.offset = 0;
-- 
cgit v1.1


From 21380931eb4da4e29ac663d0221581282cbba208 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Tue, 21 Apr 2009 12:38:29 -0700
Subject: Btrfs: Fix a bunch of printk() warnings.

Just happened to notice a bunch of %llu vs u64 warnings.  Here's a patch
to cast them all.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c          |  6 +++---
 fs/btrfs/extent-tree.c      | 21 ++++++++++++++-------
 fs/btrfs/free-space-cache.c | 15 ++++++++++-----
 fs/btrfs/ioctl.c            |  6 ++++--
 fs/btrfs/super.c            | 15 +++++++++------
 5 files changed, 40 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index aa0c259..0ff16d3 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1671,7 +1671,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (features) {
 		printk(KERN_ERR "BTRFS: couldn't mount because of "
 		       "unsupported optional features (%Lx).\n",
-		       features);
+		       (unsigned long long)features);
 		err = -EINVAL;
 		goto fail_iput;
 	}
@@ -1681,7 +1681,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (!(sb->s_flags & MS_RDONLY) && features) {
 		printk(KERN_ERR "BTRFS: couldn't mount RDWR because of "
 		       "unsupported option features (%Lx).\n",
-		       features);
+		       (unsigned long long)features);
 		err = -EINVAL;
 		goto fail_iput;
 	}
@@ -2273,7 +2273,7 @@ int close_ctree(struct btrfs_root *root)
 
 	if (fs_info->delalloc_bytes) {
 		printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
-		       fs_info->delalloc_bytes);
+		       (unsigned long long)fs_info->delalloc_bytes);
 	}
 	if (fs_info->total_ref_cache_size) {
 		printk(KERN_INFO "btrfs: at umount reference cache size %llu\n",
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 2895a83..e496644 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1844,10 +1844,14 @@ again:
 		printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
 		       ", %llu bytes_used, %llu bytes_reserved, "
 		       "%llu bytes_pinned, %llu bytes_readonly, %llu may use"
-		       "%llu total\n", bytes, data_sinfo->bytes_delalloc,
-		       data_sinfo->bytes_used, data_sinfo->bytes_reserved,
-		       data_sinfo->bytes_pinned, data_sinfo->bytes_readonly,
-		       data_sinfo->bytes_may_use, data_sinfo->total_bytes);
+		       "%llu total\n", (unsigned long long)bytes,
+		       (unsigned long long)data_sinfo->bytes_delalloc,
+		       (unsigned long long)data_sinfo->bytes_used,
+		       (unsigned long long)data_sinfo->bytes_reserved,
+		       (unsigned long long)data_sinfo->bytes_pinned,
+		       (unsigned long long)data_sinfo->bytes_readonly,
+		       (unsigned long long)data_sinfo->bytes_may_use,
+		       (unsigned long long)data_sinfo->total_bytes);
 		return -ENOSPC;
 	}
 	data_sinfo->bytes_may_use += bytes;
@@ -2824,9 +2828,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
 				    info->bytes_pinned - info->bytes_reserved),
 	       (info->full) ? "" : "not ");
 	printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
-	       " may_use=%llu, used=%llu\n", info->total_bytes,
-	       info->bytes_pinned, info->bytes_delalloc, info->bytes_may_use,
-	       info->bytes_used);
+	       " may_use=%llu, used=%llu\n",
+	       (unsigned long long)info->total_bytes,
+	       (unsigned long long)info->bytes_pinned,
+	       (unsigned long long)info->bytes_delalloc,
+	       (unsigned long long)info->bytes_may_use,
+	       (unsigned long long)info->bytes_used);
 
 	down_read(&info->groups_sem);
 	list_for_each_entry(cache, &info->block_groups, list) {
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 768b952..0bc9365 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -332,13 +332,17 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
 			printk(KERN_ERR "couldn't find space %llu to free\n",
 			       (unsigned long long)offset);
 			printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n",
-			       block_group->cached, block_group->key.objectid,
-			       block_group->key.offset);
+			       block_group->cached,
+			       (unsigned long long)block_group->key.objectid,
+			       (unsigned long long)block_group->key.offset);
 			btrfs_dump_free_space(block_group, bytes);
 		} else if (info) {
 			printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, "
 			       "but wanted offset=%llu bytes=%llu\n",
-			       info->offset, info->bytes, offset, bytes);
+			       (unsigned long long)info->offset,
+			       (unsigned long long)info->bytes,
+			       (unsigned long long)offset,
+			       (unsigned long long)bytes);
 		}
 		WARN_ON(1);
 	}
@@ -357,8 +361,9 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
 		info = rb_entry(n, struct btrfs_free_space, offset_index);
 		if (info->bytes >= bytes)
 			count++;
-		printk(KERN_ERR "entry offset %llu, bytes %llu\n", info->offset,
-		       info->bytes);
+		printk(KERN_ERR "entry offset %llu, bytes %llu\n",
+		       (unsigned long long)info->offset,
+		       (unsigned long long)info->bytes);
 	}
 	printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
 	       "\n", count);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index f4e5d2e..48762aa 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -483,11 +483,13 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
 		*devstr = '\0';
 		devstr = vol_args->name;
 		devid = simple_strtoull(devstr, &end, 10);
-		printk(KERN_INFO "resizing devid %llu\n", devid);
+		printk(KERN_INFO "resizing devid %llu\n",
+		       (unsigned long long)devid);
 	}
 	device = btrfs_find_device(root, devid, NULL, NULL);
 	if (!device) {
-		printk(KERN_INFO "resizer unable to find device %llu\n", devid);
+		printk(KERN_INFO "resizer unable to find device %llu\n",
+		       (unsigned long long)devid);
 		ret = -EINVAL;
 		goto out_unlock;
 	}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 30c9a8c..bf0e84c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -196,7 +196,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 				info->max_extent = max_t(u64,
 					info->max_extent, root->sectorsize);
 				printk(KERN_INFO "btrfs: max_extent at %llu\n",
-				       info->max_extent);
+				       (unsigned long long)info->max_extent);
 			}
 			break;
 		case Opt_max_inline:
@@ -211,7 +211,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 						root->sectorsize);
 				}
 				printk(KERN_INFO "btrfs: max_inline at %llu\n",
-					info->max_inline);
+					(unsigned long long)info->max_inline);
 			}
 			break;
 		case Opt_alloc_start:
@@ -221,7 +221,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 				kfree(num);
 				printk(KERN_INFO
 					"btrfs: allocations start at %llu\n",
-					info->alloc_start);
+					(unsigned long long)info->alloc_start);
 			}
 			break;
 		case Opt_noacl:
@@ -420,11 +420,14 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	if (btrfs_test_opt(root, NOBARRIER))
 		seq_puts(seq, ",nobarrier");
 	if (info->max_extent != (u64)-1)
-		seq_printf(seq, ",max_extent=%llu", info->max_extent);
+		seq_printf(seq, ",max_extent=%llu",
+			   (unsigned long long)info->max_extent);
 	if (info->max_inline != 8192 * 1024)
-		seq_printf(seq, ",max_inline=%llu", info->max_inline);
+		seq_printf(seq, ",max_inline=%llu",
+			   (unsigned long long)info->max_inline);
 	if (info->alloc_start != 0)
-		seq_printf(seq, ",alloc_start=%llu", info->alloc_start);
+		seq_printf(seq, ",alloc_start=%llu",
+			   (unsigned long long)info->alloc_start);
 	if (info->thread_pool_size !=  min_t(unsigned long,
 					     num_online_cpus() + 2, 8))
 		seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
-- 
cgit v1.1


From fd1b52435a6d9663de896e8437ef067372916ef3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 7 Apr 2009 18:10:06 +0200
Subject: quota: remove obsolete comments in fs/quota/Makefile

Get rid of useless comments and the equally useless obj-y
initialization.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/quota/Makefile | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'fs')

diff --git a/fs/quota/Makefile b/fs/quota/Makefile
index 385a083..68d4f6d 100644
--- a/fs/quota/Makefile
+++ b/fs/quota/Makefile
@@ -1,12 +1,3 @@
-#
-# Makefile for the Linux filesystems.
-#
-# 14 Sep 2000, Christoph Hellwig <hch@infradead.org>
-# Rewritten to use lists instead of if-statements.
-#
-
-obj-y :=
-
 obj-$(CONFIG_QUOTA)		+= dquot.o
 obj-$(CONFIG_QFMT_V1)		+= quota_v1.o
 obj-$(CONFIG_QFMT_V2)		+= quota_v2.o
-- 
cgit v1.1


From a069e9cee1dba2f847839d325f46ce6976ed1b76 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Thu, 9 Apr 2009 18:07:10 +0200
Subject: ext2: missing unlock in ext2_quota_write()

The inode->i_mutex should be unlocked.

Found by smatch (http://repo.or.cz/w/smatch.git).  Compile tested.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext2/super.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index f983225..5c4afe6 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1395,8 +1395,10 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type,
 		blk++;
 	}
 out:
-	if (len == towrite)
+	if (len == towrite) {
+		mutex_unlock(&inode->i_mutex);
 		return err;
+	}
 	if (inode->i_size < off+len-towrite)
 		i_size_write(inode, off+len-towrite);
 	inode->i_version++;
-- 
cgit v1.1


From 7b1a14bbb0e547aaa4d30cc376e6c8c12539ab0f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 27 Apr 2009 10:49:53 -0400
Subject: Btrfs: fix acl caching

Linus noticed the btrfs code to cache acls wasn't properly caching
a NULL acl when the inode didn't have any acls.  This meant the common
case of no acls resulted in expensive btree searches every time the
kernel checked permissions (which is quite often).

This is a modified version of Linus' original patch:

Properly set initial acl fields to BTRFS_ACL_NOT_CACHED in the inode.
This forces an acl lookup when permission checks are done.

Fix btrfs_get_acl to avoid lookups and locking when the inode acls fields
are set to null.

Fix btrfs_get_acl to use the right return value from __btrfs_getxattr
when deciding to cache a NULL acl.  It was storing a NULL acl when
__btrfs_getxattr return -ENOENT, but __btrfs_getxattr was actually returning
-ENODATA for this case.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/acl.c   | 18 +++++++++++++-----
 fs/btrfs/inode.c |  4 ++--
 2 files changed, 15 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 7fdd184..cbba000 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -60,15 +60,20 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 		return ERR_PTR(-EINVAL);
 	}
 
+	/* Handle the cached NULL acl case without locking */
+	acl = ACCESS_ONCE(*p_acl);
+	if (!acl)
+		return acl;
+
 	spin_lock(&inode->i_lock);
-	if (*p_acl != BTRFS_ACL_NOT_CACHED)
-		acl = posix_acl_dup(*p_acl);
+	acl = *p_acl;
+	if (acl != BTRFS_ACL_NOT_CACHED)
+		acl = posix_acl_dup(acl);
 	spin_unlock(&inode->i_lock);
 
-	if (acl)
+	if (acl != BTRFS_ACL_NOT_CACHED)
 		return acl;
 
-
 	size = __btrfs_getxattr(inode, name, "", 0);
 	if (size > 0) {
 		value = kzalloc(size, GFP_NOFS);
@@ -80,9 +85,12 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 			btrfs_update_cached_acl(inode, p_acl, acl);
 		}
 		kfree(value);
-	} else if (size == -ENOENT) {
+	} else if (size == -ENOENT || size == -ENODATA || size == 0) {
+		/* FIXME, who returns -ENOENT?  I think nobody */
 		acl = NULL;
 		btrfs_update_cached_acl(inode, p_acl, acl);
+	} else {
+		acl = ERR_PTR(-EIO);
 	}
 
 	return acl;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 98bd506..dc812ec 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3047,8 +3047,8 @@ static noinline void init_btrfs_i(struct inode *inode)
 {
 	struct btrfs_inode *bi = BTRFS_I(inode);
 
-	bi->i_acl = NULL;
-	bi->i_default_acl = NULL;
+	bi->i_acl = BTRFS_ACL_NOT_CACHED;
+	bi->i_default_acl = BTRFS_ACL_NOT_CACHED;
 
 	bi->generation = 0;
 	bi->sequence = 0;
-- 
cgit v1.1


From 46a53cca826e71effe59e3cb4f383622c33ebdcb Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 27 Apr 2009 11:47:50 -0400
Subject: Btrfs: look for acls during btrfs_read_locked_inode

This changes btrfs_read_locked_inode() to peek ahead in the btree for acl items.
If it is certain a given inode has no acls, it will set the in memory acl
fields to null to avoid acl lookups completely.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index dc812ec..90c23eb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2016,6 +2016,57 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
 }
 
 /*
+ * very simple check to peek ahead in the leaf looking for xattrs.  If we
+ * don't find any xattrs, we know there can't be any acls.
+ *
+ * slot is the slot the inode is in, objectid is the objectid of the inode
+ */
+static noinline int acls_after_inode_item(struct extent_buffer *leaf,
+					  int slot, u64 objectid)
+{
+	u32 nritems = btrfs_header_nritems(leaf);
+	struct btrfs_key found_key;
+	int scanned = 0;
+
+	slot++;
+	while (slot < nritems) {
+		btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+		/* we found a different objectid, there must not be acls */
+		if (found_key.objectid != objectid)
+			return 0;
+
+		/* we found an xattr, assume we've got an acl */
+		if (found_key.type == BTRFS_XATTR_ITEM_KEY)
+			return 1;
+
+		/*
+		 * we found a key greater than an xattr key, there can't
+		 * be any acls later on
+		 */
+		if (found_key.type > BTRFS_XATTR_ITEM_KEY)
+			return 0;
+
+		slot++;
+		scanned++;
+
+		/*
+		 * it goes inode, inode backrefs, xattrs, extents,
+		 * so if there are a ton of hard links to an inode there can
+		 * be a lot of backrefs.  Don't waste time searching too hard,
+		 * this is just an optimization
+		 */
+		if (scanned >= 8)
+			break;
+	}
+	/* we hit the end of the leaf before we found an xattr or
+	 * something larger than an xattr.  We have to assume the inode
+	 * has acls
+	 */
+	return 1;
+}
+
+/*
  * read an inode from the btree into the in-memory inode
  */
 void btrfs_read_locked_inode(struct inode *inode)
@@ -2026,6 +2077,7 @@ void btrfs_read_locked_inode(struct inode *inode)
 	struct btrfs_timespec *tspec;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_key location;
+	int maybe_acls;
 	u64 alloc_group_block;
 	u32 rdev;
 	int ret;
@@ -2072,6 +2124,16 @@ void btrfs_read_locked_inode(struct inode *inode)
 
 	alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
 
+	/*
+	 * try to precache a NULL acl entry for files that don't have
+	 * any xattrs or acls
+	 */
+	maybe_acls = acls_after_inode_item(leaf, path->slots[0], inode->i_ino);
+	if (!maybe_acls) {
+		BTRFS_I(inode)->i_acl = NULL;
+		BTRFS_I(inode)->i_default_acl = NULL;
+	}
+
 	BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0,
 						alloc_group_block, 0);
 	btrfs_free_path(path);
-- 
cgit v1.1


From 69838727bcd819a8fd73a88447801221788b0c6d Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Tue, 28 Apr 2009 20:24:29 +0200
Subject: bio: fix memcpy corruption in bio_copy_user_iov()

st driver uses blk_rq_map_user() in order to just build a request out
of page frames. In this case, map_data->offset is a non zero value and
iov[0].iov_base is NULL. We need to increase nr_pages for that.

Cc: stable@kernel.org
Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/bio.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/bio.c b/fs/bio.c
index 7bbc98f..9871164 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -817,6 +817,9 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
 		len += iov[i].iov_len;
 	}
 
+	if (offset)
+		nr_pages++;
+
 	bmd = bio_alloc_map_data(nr_pages, iov_count, gfp_mask);
 	if (!bmd)
 		return ERR_PTR(-ENOMEM);
-- 
cgit v1.1


From 802b352f2934f799ec2e159f61db6506094a936e Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Mon, 27 Apr 2009 21:24:28 -0700
Subject: ecryptfs: fix printk format warning

fs/ecryptfs/inode.c:670: warning: format '%d' expects type 'int', but argument 3 has type 'size_t'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Cc: Dustin Kirkland <kirkland@canonical.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/ecryptfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 93bc0f8..1940edd 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -667,7 +667,7 @@ ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
 	lower_buf = kmalloc(lower_bufsiz, GFP_KERNEL);
 	if (lower_buf == NULL) {
 		printk(KERN_ERR "%s: Out of memory whilst attempting to "
-		       "kmalloc [%d] bytes\n", __func__, lower_bufsiz);
+		       "kmalloc [%zd] bytes\n", __func__, lower_bufsiz);
 		rc = -ENOMEM;
 		goto out;
 	}
-- 
cgit v1.1


From ac20100df7a7a042423dcb8847f42d9f6ddb8d00 Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Date: Mon, 27 Apr 2009 13:31:12 -0500
Subject: eCryptfs: Fix min function comparison warning

This warning shows up on 64 bit builds:

fs/ecryptfs/inode.c:693: warning: comparison of distinct pointer types
lacks a cast

Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 1940edd..2f0945d 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -690,7 +690,7 @@ ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
 		}
 		/* Check for bufsiz <= 0 done in sys_readlinkat() */
 		rc = copy_to_user(buf, plaintext_name,
-				  min((unsigned) bufsiz, plaintext_name_size));
+				  min((size_t) bufsiz, plaintext_name_size));
 		if (rc)
 			rc = -EFAULT;
 		else
-- 
cgit v1.1