From 0176260fc30842e358cf34afa7dcd9413db44822 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sat, 10 Jan 2009 06:09:52 -0800
Subject: btrfs: fix for write_super_lockfs/unlockfs error handling

Commit c4be0c1dc4cdc37b175579be1460f15ac6495e9a added the ability for
write_super_lockfs to return errors, and renamed them to match.  But
btrfs didn't get converted.

Do the minimal conversion to make it compile again.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/btrfs/super.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index b4c101d9..0a14b49 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -605,18 +605,20 @@ out:
 	return ret;
 }
 
-static void btrfs_write_super_lockfs(struct super_block *sb)
+static int btrfs_freeze(struct super_block *sb)
 {
 	struct btrfs_root *root = btrfs_sb(sb);
 	mutex_lock(&root->fs_info->transaction_kthread_mutex);
 	mutex_lock(&root->fs_info->cleaner_mutex);
+	return 0;
 }
 
-static void btrfs_unlockfs(struct super_block *sb)
+static int btrfs_unfreeze(struct super_block *sb)
 {
 	struct btrfs_root *root = btrfs_sb(sb);
 	mutex_unlock(&root->fs_info->cleaner_mutex);
 	mutex_unlock(&root->fs_info->transaction_kthread_mutex);
+	return 0;
 }
 
 static struct super_operations btrfs_super_ops = {
@@ -631,8 +633,8 @@ static struct super_operations btrfs_super_ops = {
 	.destroy_inode	= btrfs_destroy_inode,
 	.statfs		= btrfs_statfs,
 	.remount_fs	= btrfs_remount,
-	.write_super_lockfs = btrfs_write_super_lockfs,
-	.unlockfs	= btrfs_unlockfs,
+	.freeze_fs	= btrfs_freeze,
+	.unfreeze_fs	= btrfs_unfreeze,
 };
 
 static const struct file_operations btrfs_ctl_fops = {
-- 
cgit v1.1


From 1bcbf31337391a2f54ef6c1e8871c2de5944a7dc Mon Sep 17 00:00:00 2001
From: Qinghuang Feng <qhfeng.kernel@gmail.com>
Date: Thu, 15 Jan 2009 13:51:03 -0800
Subject: btrfs & squashfs: Move btrfs and squashfsto's magic number to
 <linux/magic.h>

Use the standard magic.h for btrfs and squashfs.

Signed-off-by: Qinghuang Feng <qhfeng.kernel@gmail.com>
Cc: Phillip Lougher <phillip@lougher.demon.co.uk>
Cc: Chris Mason <chris.mason@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/btrfs/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 0a14b49..7256cf2 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -38,6 +38,7 @@
 #include <linux/namei.h>
 #include <linux/miscdevice.h>
 #include <linux/version.h>
+#include <linux/magic.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -51,7 +52,6 @@
 #include "export.h"
 #include "compression.h"
 
-#define BTRFS_SUPER_MAGIC 0x9123683E
 
 static struct super_operations btrfs_super_ops;
 
-- 
cgit v1.1


From 1d9e2ae949411c2f329f30e01ea0355cd02c4296 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 16 Jan 2009 11:58:19 -0500
Subject: Btrfs: Clear the device->running_pending flag before bailing on
 congestion

Btrfs maintains a queue of async bio submissions so the checksumming
threads don't have to wait on get_request_wait.  In order to avoid
extra wakeups, this code has a running_pending flag that is used
to tell new submissions they don't need to wake the thread.

When the threads notice congestion on a single device, they
may decide to requeue the job and move on to other devices.  This
makes sure the running_pending flag is cleared before the
job is requeued.

It should help avoid IO stalls by making sure the task is woken up
when new submissions come in.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b187b53..3451e1c 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -220,6 +220,7 @@ loop:
 				tail->bi_next = old_head;
 			else
 				device->pending_bio_tail = tail;
+			device->running_pending = 0;
 
 			spin_unlock(&device->io_lock);
 			btrfs_requeue_work(&device->work);
-- 
cgit v1.1


From c071fcfdb60e7abbe95e02460005d6bca165bf24 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 16 Jan 2009 11:59:08 -0500
Subject: Btrfs: fix ioctl arg size (userland incompatible change!)

The structure used to send device in btrfs ioctl calls was not
properly aligned, and so 32 bit ioctls would not work properly on
64 bit kernels.

We could fix this with compat ioctls, but we're just one byte away
and it doesn't make sense at this stage to carry about the compat ioctls
forever at this stage in the project.

This patch brings the ioctl arg up to an evenly aligned 4k.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.h | 14 ++++++++------
 fs/btrfs/super.c |  3 ++-
 2 files changed, 10 insertions(+), 7 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 78049ea..b320b10 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -22,13 +22,20 @@
 
 #define BTRFS_IOCTL_MAGIC 0x94
 #define BTRFS_VOL_NAME_MAX 255
-#define BTRFS_PATH_NAME_MAX 3072
+#define BTRFS_PATH_NAME_MAX 4087
 
+/* this should be 4k */
 struct btrfs_ioctl_vol_args {
 	__s64 fd;
 	char name[BTRFS_PATH_NAME_MAX + 1];
 };
 
+struct btrfs_ioctl_clone_range_args {
+  __s64 src_fd;
+  __u64 src_offset, src_length;
+  __u64 dest_offset;
+};
+
 #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
 				   struct btrfs_ioctl_vol_args)
 #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@@ -52,11 +59,6 @@ struct btrfs_ioctl_vol_args {
 				   struct btrfs_ioctl_vol_args)
 #define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \
 				   struct btrfs_ioctl_vol_args)
-struct btrfs_ioctl_clone_range_args {
-  __s64 src_fd;
-  __u64 src_offset, src_length;
-  __u64 dest_offset;
-};
 
 #define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \
 				  struct btrfs_ioctl_clone_range_args)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index b4c101d9..92c9b54 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -582,7 +582,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
 {
 	struct btrfs_ioctl_vol_args *vol;
 	struct btrfs_fs_devices *fs_devices;
-	int ret = 0;
+	int ret = -ENOTTY;
 	int len;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -594,6 +594,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
 		goto out;
 	}
 	len = strnlen(vol->name, BTRFS_PATH_NAME_MAX);
+
 	switch (cmd) {
 	case BTRFS_IOC_SCAN_DEV:
 		ret = btrfs_scan_one_device(vol->name, FMODE_READ,
-- 
cgit v1.1


From 19d00cc196a3a66fd074f62b39d219f743b92338 Mon Sep 17 00:00:00 2001
From: Wang Cong <xiyou.wangcong@gmail.com>
Date: Wed, 21 Jan 2009 10:49:16 -0500
Subject: Btrfs: cleanup fs/btrfs/super.c::btrfs_control_ioctl()

- Remove the unused local variable 'len';
- Check return value of kmalloc().

Signed-off-by: Wang Cong <wangcong@zeuux.org>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/super.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 92c9b54..795b624 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -583,17 +583,18 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
 	struct btrfs_ioctl_vol_args *vol;
 	struct btrfs_fs_devices *fs_devices;
 	int ret = -ENOTTY;
-	int len;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
 	vol = kmalloc(sizeof(*vol), GFP_KERNEL);
+	if (!vol)
+		return -ENOMEM;
+
 	if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) {
 		ret = -EFAULT;
 		goto out;
 	}
-	len = strnlen(vol->name, BTRFS_PATH_NAME_MAX);
 
 	switch (cmd) {
 	case BTRFS_IOC_SCAN_DEV:
-- 
cgit v1.1


From 070604040b86511cc2df0f25f98e26c5529bd928 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Wed, 21 Jan 2009 10:49:16 -0500
Subject: Btrfs: cleanup xattr code

Andrew's review of the xattr code revealed some minor issues that this patch
addresses.  Just an error return fix, got rid of a useless statement and
commented one of the trickier parts of __btrfs_getxattr.

Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/xattr.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 7f332e2..b4fa5f4 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -45,9 +45,12 @@ ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
 	/* lookup the xattr by name */
 	di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name,
 				strlen(name), 0);
-	if (!di || IS_ERR(di)) {
+	if (!di) {
 		ret = -ENODATA;
 		goto out;
+	} else if (IS_ERR(di)) {
+		ret = PTR_ERR(di);
+		goto out;
 	}
 
 	leaf = path->nodes[0];
@@ -62,6 +65,14 @@ ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
 		ret = -ERANGE;
 		goto out;
 	}
+
+	/*
+	 * The way things are packed into the leaf is like this
+	 * |struct btrfs_dir_item|name|data|
+	 * where name is the xattr name, so security.foo, and data is the
+	 * content of the xattr.  data_ptr points to the location in memory
+	 * where the data starts in the in memory leaf
+	 */
 	data_ptr = (unsigned long)((char *)(di + 1) +
 				   btrfs_dir_name_len(leaf, di));
 	read_extent_buffer(leaf, buffer, data_ptr,
@@ -176,7 +187,6 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	if (ret < 0)
 		goto err;
-	ret = 0;
 	advance = 0;
 	while (1) {
 		leaf = path->nodes[0];
-- 
cgit v1.1


From 7eaebe7d503c3ef240ac7b3efc5433fe647c0298 Mon Sep 17 00:00:00 2001
From: Huang Weiyi <weiyi.huang@gmail.com>
Date: Wed, 21 Jan 2009 10:49:16 -0500
Subject: Btrfs: removed unused #include <version.h>'s

Removed unused #include <version.h>'s in btrfs

Signed-off-by: Huang Weiyi <weiyi.huang@gmail.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/async-thread.c | 1 -
 fs/btrfs/compression.c  | 1 -
 fs/btrfs/disk-io.c      | 1 -
 fs/btrfs/extent-tree.c  | 1 -
 fs/btrfs/extent_io.c    | 1 -
 fs/btrfs/extent_map.c   | 1 -
 fs/btrfs/file.c         | 1 -
 fs/btrfs/inode.c        | 1 -
 fs/btrfs/ioctl.c        | 1 -
 fs/btrfs/super.c        | 1 -
 fs/btrfs/volumes.c      | 1 -
 11 files changed, 11 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 8e2fec0..d5f4e94 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -16,7 +16,6 @@
  * Boston, MA 021110-1307, USA.
  */
 
-#include <linux/version.h>
 #include <linux/kthread.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index ee848d8..ab07627 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -32,7 +32,6 @@
 #include <linux/swap.h>
 #include <linux/writeback.h>
 #include <linux/bit_spinlock.h>
-#include <linux/version.h>
 #include <linux/pagevec.h>
 #include "compat.h"
 #include "ctree.h"
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 81a3138..37e12f6 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -16,7 +16,6 @@
  * Boston, MA 021110-1307, USA.
  */
 
-#include <linux/version.h>
 #include <linux/fs.h>
 #include <linux/blkdev.h>
 #include <linux/scatterlist.h>
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 293da65..cdc961e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -19,7 +19,6 @@
 #include <linux/pagemap.h>
 #include <linux/writeback.h>
 #include <linux/blkdev.h>
-#include <linux/version.h>
 #include "compat.h"
 #include "hash.h"
 #include "crc32c.h"
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index e086d40..c9446d4 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -9,7 +9,6 @@
 #include <linux/spinlock.h>
 #include <linux/blkdev.h>
 #include <linux/swap.h>
-#include <linux/version.h>
 #include <linux/writeback.h>
 #include <linux/pagevec.h>
 #include "extent_io.h"
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 4a83e33..50da69d 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -3,7 +3,6 @@
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/spinlock.h>
-#include <linux/version.h>
 #include <linux/hardirq.h>
 #include "extent_map.h"
 
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9026833..fbcbf43 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -29,7 +29,6 @@
 #include <linux/writeback.h>
 #include <linux/statfs.h>
 #include <linux/compat.h>
-#include <linux/version.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8adfe05..44dbd55 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -34,7 +34,6 @@
 #include <linux/statfs.h>
 #include <linux/compat.h>
 #include <linux/bit_spinlock.h>
-#include <linux/version.h>
 #include <linux/xattr.h>
 #include <linux/posix_acl.h>
 #include <linux/falloc.h>
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index c2aa33e..988fdc8 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -38,7 +38,6 @@
 #include <linux/compat.h>
 #include <linux/bit_spinlock.h>
 #include <linux/security.h>
-#include <linux/version.h>
 #include <linux/xattr.h>
 #include <linux/vmalloc.h>
 #include "compat.h"
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 795b624..4c0b756 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -37,7 +37,6 @@
 #include <linux/ctype.h>
 #include <linux/namei.h>
 #include <linux/miscdevice.h>
-#include <linux/version.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3451e1c..f6e1fc5 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -20,7 +20,6 @@
 #include <linux/buffer_head.h>
 #include <linux/blkdev.h>
 #include <linux/random.h>
-#include <linux/version.h>
 #include <asm/div64.h>
 #include "compat.h"
 #include "ctree.h"
-- 
cgit v1.1


From 119e10cf1b2f6a6cafff74f32373d631489f54c2 Mon Sep 17 00:00:00 2001
From: Roland Dreier <rdreier@cisco.com>
Date: Wed, 21 Jan 2009 10:49:16 -0500
Subject: Btrfs: Remove extra KERN_INFO in the middle of a line

The "devid <xxx> transid <xxx>" printk in btrfs_scan_one_device()
actually follows another printk that doesn't end in a newline (since the
intention is for the two printks to make one line of output), so the
KERN_INFO just ends up messing up the output:

    device label exp <6>devid 1 transid 9 /dev/sda5

Fix this by changing the extra KERN_INFO to KERN_CONT.

Signed-off-by: Roland Dreier <rolandd@cisco.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f6e1fc5..397c8db 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -577,7 +577,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 		       *(unsigned long long *)disk_super->fsid,
 		       *(unsigned long long *)(disk_super->fsid + 8));
 	}
-	printk(KERN_INFO "devid %llu transid %llu %s\n",
+	printk(KERN_CONT "devid %llu transid %llu %s\n",
 	       (unsigned long long)devid, (unsigned long long)transid, path);
 	ret = device_list_add(path, disk_super, devid, fs_devices_ret);
 
-- 
cgit v1.1


From 57506d50ed6db7b0e7ddc9845e86e81f140983d5 Mon Sep 17 00:00:00 2001
From: Qinghuang Feng <qhfeng.kernel@gmail.com>
Date: Wed, 21 Jan 2009 10:49:16 -0500
Subject: Btrfs: check return value for kthread_run() correctly

kthread_run() returns the kthread or ERR_PTR(-ENOMEM), not NULL.

Signed-off-by: Qinghuang Feng <qhfeng.kernel@gmail.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 37e12f6..0d8ccd6 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1739,13 +1739,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
 	fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
 					       "btrfs-cleaner");
-	if (!fs_info->cleaner_kthread)
+	if (IS_ERR(fs_info->cleaner_kthread))
 		goto fail_csum_root;
 
 	fs_info->transaction_kthread = kthread_run(transaction_kthread,
 						   tree_root,
 						   "btrfs-transaction");
-	if (!fs_info->transaction_kthread)
+	if (IS_ERR(fs_info->transaction_kthread))
 		goto fail_cleaner;
 
 	if (btrfs_super_log_root(disk_super) != 0) {
-- 
cgit v1.1


From c6e308713a47527f88a277ee95b7c5d1db80f77b Mon Sep 17 00:00:00 2001
From: Qinghuang Feng <qhfeng.kernel@gmail.com>
Date: Wed, 21 Jan 2009 10:59:08 -0500
Subject: Btrfs: simplify iteration codes

Merge list_for_each* and list_entry to list_for_each_entry*

Signed-off-by: Qinghuang Feng <qhfeng.kernel@gmail.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c      | 15 ++++-----------
 fs/btrfs/extent-tree.c  |  8 ++------
 fs/btrfs/inode.c        |  5 ++---
 fs/btrfs/ordered-data.c |  4 +---
 fs/btrfs/transaction.c  |  4 +---
 fs/btrfs/volumes.c      | 35 +++++++++--------------------------
 6 files changed, 19 insertions(+), 52 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0d8ccd6..26a1877 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1135,7 +1135,6 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
 {
 	struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
 	int ret = 0;
-	struct list_head *cur;
 	struct btrfs_device *device;
 	struct backing_dev_info *bdi;
 #if 0
@@ -1143,8 +1142,7 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
 	    btrfs_congested_async(info, 0))
 		return 1;
 #endif
-	list_for_each(cur, &info->fs_devices->devices) {
-		device = list_entry(cur, struct btrfs_device, dev_list);
+	list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
 		if (!device->bdev)
 			continue;
 		bdi = blk_get_backing_dev_info(device->bdev);
@@ -1162,13 +1160,11 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
  */
 static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 {
-	struct list_head *cur;
 	struct btrfs_device *device;
 	struct btrfs_fs_info *info;
 
 	info = (struct btrfs_fs_info *)bdi->unplug_io_data;
-	list_for_each(cur, &info->fs_devices->devices) {
-		device = list_entry(cur, struct btrfs_device, dev_list);
+	list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
 		if (!device->bdev)
 			continue;
 
@@ -1994,7 +1990,6 @@ static int write_dev_supers(struct btrfs_device *device,
 
 int write_all_supers(struct btrfs_root *root, int max_mirrors)
 {
-	struct list_head *cur;
 	struct list_head *head = &root->fs_info->fs_devices->devices;
 	struct btrfs_device *dev;
 	struct btrfs_super_block *sb;
@@ -2010,8 +2005,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
 
 	sb = &root->fs_info->super_for_commit;
 	dev_item = &sb->dev_item;
-	list_for_each(cur, head) {
-		dev = list_entry(cur, struct btrfs_device, dev_list);
+	list_for_each_entry(dev, head, dev_list) {
 		if (!dev->bdev) {
 			total_errors++;
 			continue;
@@ -2044,8 +2038,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
 	}
 
 	total_errors = 0;
-	list_for_each(cur, head) {
-		dev = list_entry(cur, struct btrfs_device, dev_list);
+	list_for_each_entry(dev, head, dev_list) {
 		if (!dev->bdev)
 			continue;
 		if (!dev->in_fs_metadata || !dev->writeable)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index cdc961e..a4e36c3 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -325,10 +325,8 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
 						  u64 flags)
 {
 	struct list_head *head = &info->space_info;
-	struct list_head *cur;
 	struct btrfs_space_info *found;
-	list_for_each(cur, head) {
-		found = list_entry(cur, struct btrfs_space_info, list);
+	list_for_each_entry(found, head, list) {
 		if (found->flags == flags)
 			return found;
 	}
@@ -3013,7 +3011,6 @@ loop_check:
 static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
 {
 	struct btrfs_block_group_cache *cache;
-	struct list_head *l;
 
 	printk(KERN_INFO "space_info has %llu free, is %sfull\n",
 	       (unsigned long long)(info->total_bytes - info->bytes_used -
@@ -3021,8 +3018,7 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
 	       (info->full) ? "" : "not ");
 
 	down_read(&info->groups_sem);
-	list_for_each(l, &info->block_groups) {
-		cache = list_entry(l, struct btrfs_block_group_cache, list);
+	list_for_each_entry(cache, &info->block_groups, list) {
 		spin_lock(&cache->lock);
 		printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
 		       "%llu pinned %llu reserved\n",
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 44dbd55..45cf03e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1323,12 +1323,11 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
 			     struct inode *inode, u64 file_offset,
 			     struct list_head *list)
 {
-	struct list_head *cur;
 	struct btrfs_ordered_sum *sum;
 
 	btrfs_set_trans_block_group(trans, inode);
-	list_for_each(cur, list) {
-		sum = list_entry(cur, struct btrfs_ordered_sum, list);
+
+	list_for_each_entry(sum, list, list) {
 		btrfs_csum_file_blocks(trans,
 		       BTRFS_I(inode)->root->fs_info->csum_root, sum);
 	}
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index a209401..77c2411 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -613,7 +613,6 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
 	struct btrfs_sector_sum *sector_sums;
 	struct btrfs_ordered_extent *ordered;
 	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
-	struct list_head *cur;
 	unsigned long num_sectors;
 	unsigned long i;
 	u32 sectorsize = BTRFS_I(inode)->root->sectorsize;
@@ -624,8 +623,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
 		return 1;
 
 	mutex_lock(&tree->mutex);
-	list_for_each_prev(cur, &ordered->list) {
-		ordered_sum = list_entry(cur, struct btrfs_ordered_sum, list);
+	list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
 		if (disk_bytenr >= ordered_sum->bytenr) {
 			num_sectors = ordered_sum->len / sectorsize;
 			sector_sums = ordered_sum->sums;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 8a08f94..919172de 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -852,11 +852,9 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_pending_snapshot *pending;
 	struct list_head *head = &trans->transaction->pending_snapshots;
-	struct list_head *cur;
 	int ret;
 
-	list_for_each(cur, head) {
-		pending = list_entry(cur, struct btrfs_pending_snapshot, list);
+	list_for_each_entry(pending, head, list) {
 		ret = create_pending_snapshot(trans, fs_info, pending);
 		BUG_ON(ret);
 	}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 397c8db..fd0bedb 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -103,10 +103,8 @@ static noinline struct btrfs_device *__find_device(struct list_head *head,
 						   u64 devid, u8 *uuid)
 {
 	struct btrfs_device *dev;
-	struct list_head *cur;
 
-	list_for_each(cur, head) {
-		dev = list_entry(cur, struct btrfs_device, dev_list);
+	list_for_each_entry(dev, head, dev_list) {
 		if (dev->devid == devid &&
 		    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
 			return dev;
@@ -117,11 +115,9 @@ static noinline struct btrfs_device *__find_device(struct list_head *head,
 
 static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
 {
-	struct list_head *cur;
 	struct btrfs_fs_devices *fs_devices;
 
-	list_for_each(cur, &fs_uuids) {
-		fs_devices = list_entry(cur, struct btrfs_fs_devices, list);
+	list_for_each_entry(fs_devices, &fs_uuids, list) {
 		if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
 			return fs_devices;
 	}
@@ -344,14 +340,11 @@ error:
 
 int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
 {
-	struct list_head *tmp;
-	struct list_head *cur;
-	struct btrfs_device *device;
+	struct btrfs_device *device, *next;
 
 	mutex_lock(&uuid_mutex);
 again:
-	list_for_each_safe(cur, tmp, &fs_devices->devices) {
-		device = list_entry(cur, struct btrfs_device, dev_list);
+	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
 		if (device->in_fs_metadata)
 			continue;
 
@@ -382,14 +375,12 @@ again:
 
 static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 {
-	struct list_head *cur;
 	struct btrfs_device *device;
 
 	if (--fs_devices->opened > 0)
 		return 0;
 
-	list_for_each(cur, &fs_devices->devices) {
-		device = list_entry(cur, struct btrfs_device, dev_list);
+	list_for_each_entry(device, &fs_devices->devices, dev_list) {
 		if (device->bdev) {
 			close_bdev_exclusive(device->bdev, device->mode);
 			fs_devices->open_devices--;
@@ -438,7 +429,6 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 {
 	struct block_device *bdev;
 	struct list_head *head = &fs_devices->devices;
-	struct list_head *cur;
 	struct btrfs_device *device;
 	struct block_device *latest_bdev = NULL;
 	struct buffer_head *bh;
@@ -449,8 +439,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 	int seeding = 1;
 	int ret = 0;
 
-	list_for_each(cur, head) {
-		device = list_entry(cur, struct btrfs_device, dev_list);
+	list_for_each_entry(device, head, dev_list) {
 		if (device->bdev)
 			continue;
 		if (!device->name)
@@ -1016,14 +1005,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	}
 
 	if (strcmp(device_path, "missing") == 0) {
-		struct list_head *cur;
 		struct list_head *devices;
 		struct btrfs_device *tmp;
 
 		device = NULL;
 		devices = &root->fs_info->fs_devices->devices;
-		list_for_each(cur, devices) {
-			tmp = list_entry(cur, struct btrfs_device, dev_list);
+		list_for_each_entry(tmp, devices, dev_list) {
 			if (tmp->in_fs_metadata && !tmp->bdev) {
 				device = tmp;
 				break;
@@ -1279,7 +1266,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	struct btrfs_trans_handle *trans;
 	struct btrfs_device *device;
 	struct block_device *bdev;
-	struct list_head *cur;
 	struct list_head *devices;
 	struct super_block *sb = root->fs_info->sb;
 	u64 total_bytes;
@@ -1303,8 +1289,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	mutex_lock(&root->fs_info->volume_mutex);
 
 	devices = &root->fs_info->fs_devices->devices;
-	list_for_each(cur, devices) {
-		device = list_entry(cur, struct btrfs_device, dev_list);
+	list_for_each_entry(device, devices, dev_list) {
 		if (device->bdev == bdev) {
 			ret = -EEXIST;
 			goto error;
@@ -1703,7 +1688,6 @@ static u64 div_factor(u64 num, int factor)
 int btrfs_balance(struct btrfs_root *dev_root)
 {
 	int ret;
-	struct list_head *cur;
 	struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
 	struct btrfs_device *device;
 	u64 old_size;
@@ -1722,8 +1706,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
 	dev_root = dev_root->fs_info->dev_root;
 
 	/* step one make some room on all the devices */
-	list_for_each(cur, devices) {
-		device = list_entry(cur, struct btrfs_device, dev_list);
+	list_for_each_entry(device, devices, dev_list) {
 		old_size = device->total_bytes;
 		size_to_free = div_factor(old_size, 1);
 		size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
-- 
cgit v1.1


From 3dfdb9348ada18c74c39b9ae7b115e0594792281 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Wed, 21 Jan 2009 10:49:16 -0500
Subject: Btrfs: fix locking issue in btrfs_remove_block_group

We should hold the block_group_cache_lock while modifying the
block groups red-black tree. Thank you,

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/extent-tree.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a4e36c3..3bed6a7 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5952,9 +5952,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 
-	btrfs_remove_free_space_cache(block_group);
+	spin_lock(&root->fs_info->block_group_cache_lock);
 	rb_erase(&block_group->cache_node,
 		 &root->fs_info->block_group_cache_tree);
+	spin_unlock(&root->fs_info->block_group_cache_lock);
+	btrfs_remove_free_space_cache(block_group);
 	down_write(&block_group->space_info->groups_sem);
 	list_del(&block_group->list);
 	up_write(&block_group->space_info->groups_sem);
-- 
cgit v1.1


From 5a7be515b1f4569aac601170fc681741434cca92 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Wed, 21 Jan 2009 10:49:16 -0500
Subject: Btrfs: Fix infinite loop in btrfs_extent_post_op

btrfs_extent_post_op calls finish_current_insert and del_pending_extents. They
both may enter infinite loops.

finish_current_insert enters infinite loop if it only finds some backrefs to
update.  The fix is to check for pending backref updates before restarting the
loop.

The infinite loop in del_pending_extents is due to a the skipped variable
not being properly reset before looping around.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/extent-tree.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3bed6a7..aeaec84 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2156,7 +2156,8 @@ again:
 		ret = find_first_extent_bit(&info->extent_ins, search, &start,
 					    &end, EXTENT_WRITEBACK);
 		if (ret) {
-			if (skipped && all && !num_inserts) {
+			if (skipped && all && !num_inserts &&
+			    list_empty(&update_list)) {
 				skipped = 0;
 				search = 0;
 				continue;
@@ -2544,6 +2545,7 @@ again:
 		if (ret) {
 			if (all && skipped && !nr) {
 				search = 0;
+				skipped = 0;
 				continue;
 			}
 			mutex_unlock(&info->extent_ins_mutex);
-- 
cgit v1.1


From 653249ff9aea51e1ace6bd437389f06e2b84393f Mon Sep 17 00:00:00 2001
From: Huang Weiyi <weiyi.huang@gmail.com>
Date: Wed, 21 Jan 2009 10:49:16 -0500
Subject: Btrfs: remove duplicated #include

Removed duplicated #include "compat.h"in
fs/btrfs/extent-tree.c

Signed-off-by: Huang Weiyi <weiyi.huang@gmail.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index aeaec84..c643433 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -29,7 +29,6 @@
 #include "volumes.h"
 #include "locking.h"
 #include "ref-cache.h"
-#include "compat.h"
 
 #define PENDING_EXTENT_INSERT 0
 #define PENDING_EXTENT_DELETE 1
-- 
cgit v1.1


From 95029d7d598babf62276d9006e575992b1333ba5 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Wed, 21 Jan 2009 10:49:16 -0500
Subject: Btrfs: change/remove typedef

Change one typedef to a regular enum, and remove an unused one.

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index eee060f..e1fec63 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -454,17 +454,11 @@ struct btrfs_timespec {
 	__le32 nsec;
 } __attribute__ ((__packed__));
 
-typedef enum {
+enum btrfs_compression_type {
 	BTRFS_COMPRESS_NONE = 0,
 	BTRFS_COMPRESS_ZLIB = 1,
 	BTRFS_COMPRESS_LAST = 2,
-} btrfs_compression_type;
-
-/* we don't understand any encryption methods right now */
-typedef enum {
-	BTRFS_ENCRYPTION_NONE = 0,
-	BTRFS_ENCRYPTION_LAST = 1,
-} btrfs_encryption_type;
+};
 
 struct btrfs_inode_item {
 	/* nfs style generation number */
-- 
cgit v1.1


From 86288a198d8e4e8411ff02f9ab848245e8f11257 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Wed, 21 Jan 2009 10:49:16 -0500
Subject: Btrfs: fix stop searching test in replace_one_extent

replace_one_extent searches tree leaves for references to a given extent. It
stops searching if it goes beyond the last possible position.

The last possible position is computed by adding the starting offset of a found
file extent to the full size of the extent. The code uses physical size of the
extent as the full size. This is incorrect when compression is used.

The fix is get the full size from ram_bytes field of file extent item.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/extent-tree.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c643433..1d7f043 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4440,7 +4440,7 @@ static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
 	u64 lock_end = 0;
 	u64 num_bytes;
 	u64 ext_offset;
-	u64 first_pos;
+	u64 search_end = (u64)-1;
 	u32 nritems;
 	int nr_scaned = 0;
 	int extent_locked = 0;
@@ -4448,7 +4448,6 @@ static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
 	int ret;
 
 	memcpy(&key, leaf_key, sizeof(key));
-	first_pos = INT_LIMIT(loff_t) - extent_key->offset;
 	if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
 		if (key.objectid < ref_path->owner_objectid ||
 		    (key.objectid == ref_path->owner_objectid &&
@@ -4497,7 +4496,7 @@ next:
 			if ((key.objectid > ref_path->owner_objectid) ||
 			    (key.objectid == ref_path->owner_objectid &&
 			     key.type > BTRFS_EXTENT_DATA_KEY) ||
-			    (key.offset >= first_pos + extent_key->offset))
+			    key.offset >= search_end)
 				break;
 		}
 
@@ -4530,8 +4529,10 @@ next:
 		num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
 		ext_offset = btrfs_file_extent_offset(leaf, fi);
 
-		if (first_pos > key.offset - ext_offset)
-			first_pos = key.offset - ext_offset;
+		if (search_end == (u64)-1) {
+			search_end = key.offset - ext_offset +
+				btrfs_file_extent_ram_bytes(leaf, fi);
+		}
 
 		if (!extent_locked) {
 			lock_start = key.offset;
@@ -4720,7 +4721,7 @@ next:
 		}
 skip:
 		if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
-		    key.offset >= first_pos + extent_key->offset)
+		    key.offset >= search_end)
 			break;
 
 		cond_resched();
-- 
cgit v1.1


From 7e6628544abad773222d8b177f738ac2db1859de Mon Sep 17 00:00:00 2001
From: Qinghuang Feng <qhfeng.kernel@gmail.com>
Date: Wed, 21 Jan 2009 10:49:16 -0500
Subject: Btrfs: open_ctree() error handling can oops on fs_info

a bug in open_ctree:

struct btrfs_root *open_ctree(..)
{
....
	if (!extent_root || !tree_root || !fs_info ||
	    !chunk_root || !dev_root || !csum_root) {
		err = -ENOMEM;
		goto fail;
//When code flow goes to "fail", fs_info may be NULL or uninitialized.
	}
....

fail:
	btrfs_close_devices(fs_info->fs_devices);// !
	btrfs_mapping_tree_free(&fs_info->mapping_tree);// !

	kfree(extent_root);
	kfree(tree_root);
	bdi_destroy(&fs_info->bdi);// !
...
)

Signed-off-by: Qinghuang Feng <qhfeng.kernel@gmail.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 26a1877..3cf1725 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1823,13 +1823,14 @@ fail_sb_buffer:
 fail_iput:
 	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 	iput(fs_info->btree_inode);
-fail:
+
 	btrfs_close_devices(fs_info->fs_devices);
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
+	bdi_destroy(&fs_info->bdi);
 
+fail:
 	kfree(extent_root);
 	kfree(tree_root);
-	bdi_destroy(&fs_info->bdi);
 	kfree(fs_info);
 	kfree(chunk_root);
 	kfree(dev_root);
-- 
cgit v1.1


From 7237f1833601dcc435a64176c2c347ec4bd959f9 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Wed, 21 Jan 2009 12:54:03 -0500
Subject: Btrfs: fix tree logs parallel sync

To improve performance, btrfs_sync_log merges tree log sync
requests. But it wrongly merges sync requests for different
tree logs. If multiple tree logs are synced at the same time,
only one of them actually gets synced.

This patch has following changes to fix the bug:

Move most tree log related fields in btrfs_fs_info to
btrfs_root. This allows merging sync requests separately
for each tree log.

Don't insert root item into the log root tree immediately
after log tree is allocated. Root item for log tree is
inserted when log tree get synced for the first time. This
allows syncing the log root tree without first syncing all
log trees.

At tree-log sync, btrfs_sync_log first sync the log tree;
then updates corresponding root item in the log root tree;
sync the log root tree; then update the super block.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/ctree.h       |  13 +-
 fs/btrfs/disk-io.c     |  79 +++++++++--
 fs/btrfs/disk-io.h     |   2 +
 fs/btrfs/extent-tree.c |  10 +-
 fs/btrfs/file.c        |   4 +-
 fs/btrfs/tree-log.c    | 350 +++++++++++++++++++++++--------------------------
 6 files changed, 248 insertions(+), 210 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index e1fec63..de103a8 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -695,9 +695,7 @@ struct btrfs_fs_info {
 	struct btrfs_transaction *running_transaction;
 	wait_queue_head_t transaction_throttle;
 	wait_queue_head_t transaction_wait;
-
 	wait_queue_head_t async_submit_wait;
-	wait_queue_head_t tree_log_wait;
 
 	struct btrfs_super_block super_copy;
 	struct btrfs_super_block super_for_commit;
@@ -724,10 +722,6 @@ struct btrfs_fs_info {
 	atomic_t async_submit_draining;
 	atomic_t nr_async_bios;
 	atomic_t async_delalloc_pages;
-	atomic_t tree_log_writers;
-	atomic_t tree_log_commit;
-	unsigned long tree_log_batch;
-	u64 tree_log_transid;
 
 	/*
 	 * this is used by the balancing code to wait for all the pending
@@ -827,7 +821,14 @@ struct btrfs_root {
 	struct kobject root_kobj;
 	struct completion kobj_unregister;
 	struct mutex objectid_mutex;
+
 	struct mutex log_mutex;
+	wait_queue_head_t log_writer_wait;
+	wait_queue_head_t log_commit_wait[2];
+	atomic_t log_writers;
+	atomic_t log_commit[2];
+	unsigned long log_transid;
+	unsigned long log_batch;
 
 	u64 objectid;
 	u64 last_trans;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 3cf1725..7feac5a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -849,6 +849,14 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	spin_lock_init(&root->list_lock);
 	mutex_init(&root->objectid_mutex);
 	mutex_init(&root->log_mutex);
+	init_waitqueue_head(&root->log_writer_wait);
+	init_waitqueue_head(&root->log_commit_wait[0]);
+	init_waitqueue_head(&root->log_commit_wait[1]);
+	atomic_set(&root->log_commit[0], 0);
+	atomic_set(&root->log_commit[1], 0);
+	atomic_set(&root->log_writers, 0);
+	root->log_batch = 0;
+	root->log_transid = 0;
 	extent_io_tree_init(&root->dirty_log_pages,
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
 
@@ -933,15 +941,16 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
-			     struct btrfs_fs_info *fs_info)
+static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
+					 struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_root *root;
 	struct btrfs_root *tree_root = fs_info->tree_root;
+	struct extent_buffer *leaf;
 
 	root = kzalloc(sizeof(*root), GFP_NOFS);
 	if (!root)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 
 	__setup_root(tree_root->nodesize, tree_root->leafsize,
 		     tree_root->sectorsize, tree_root->stripesize,
@@ -950,12 +959,23 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
 	root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
 	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
 	root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
+	/*
+	 * log trees do not get reference counted because they go away
+	 * before a real commit is actually done.  They do store pointers
+	 * to file data extents, and those reference counts still get
+	 * updated (along with back refs to the log tree).
+	 */
 	root->ref_cows = 0;
 
-	root->node = btrfs_alloc_free_block(trans, root, root->leafsize,
-					    0, BTRFS_TREE_LOG_OBJECTID,
-					    trans->transid, 0, 0, 0);
+	leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
+				      0, BTRFS_TREE_LOG_OBJECTID,
+				      trans->transid, 0, 0, 0);
+	if (IS_ERR(leaf)) {
+		kfree(root);
+		return ERR_CAST(leaf);
+	}
 
+	root->node = leaf;
 	btrfs_set_header_nritems(root->node, 0);
 	btrfs_set_header_level(root->node, 0);
 	btrfs_set_header_bytenr(root->node, root->node->start);
@@ -967,7 +987,48 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
 			    BTRFS_FSID_SIZE);
 	btrfs_mark_buffer_dirty(root->node);
 	btrfs_tree_unlock(root->node);
-	fs_info->log_root_tree = root;
+	return root;
+}
+
+int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
+			     struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_root *log_root;
+
+	log_root = alloc_log_tree(trans, fs_info);
+	if (IS_ERR(log_root))
+		return PTR_ERR(log_root);
+	WARN_ON(fs_info->log_root_tree);
+	fs_info->log_root_tree = log_root;
+	return 0;
+}
+
+int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root)
+{
+	struct btrfs_root *log_root;
+	struct btrfs_inode_item *inode_item;
+
+	log_root = alloc_log_tree(trans, root->fs_info);
+	if (IS_ERR(log_root))
+		return PTR_ERR(log_root);
+
+	log_root->last_trans = trans->transid;
+	log_root->root_key.offset = root->root_key.objectid;
+
+	inode_item = &log_root->root_item.inode;
+	inode_item->generation = cpu_to_le64(1);
+	inode_item->size = cpu_to_le64(3);
+	inode_item->nlink = cpu_to_le32(1);
+	inode_item->nbytes = cpu_to_le64(root->leafsize);
+	inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
+
+	btrfs_set_root_bytenr(&log_root->root_item, log_root->node->start);
+	btrfs_set_root_generation(&log_root->root_item, trans->transid);
+
+	WARN_ON(root->log_root);
+	root->log_root = log_root;
+	root->log_transid = 0;
 	return 0;
 }
 
@@ -1530,10 +1591,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	init_waitqueue_head(&fs_info->transaction_throttle);
 	init_waitqueue_head(&fs_info->transaction_wait);
 	init_waitqueue_head(&fs_info->async_submit_wait);
-	init_waitqueue_head(&fs_info->tree_log_wait);
-	atomic_set(&fs_info->tree_log_commit, 0);
-	atomic_set(&fs_info->tree_log_writers, 0);
-	fs_info->tree_log_transid = 0;
 
 	__setup_root(4096, 4096, 4096, 4096, tree_root,
 		     fs_info, BTRFS_ROOT_TREE_OBJECTID);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c0ff404..494a56e 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -98,5 +98,7 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
 			     struct btrfs_fs_info *fs_info);
 int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
 			     struct btrfs_fs_info *fs_info);
+int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root);
 int btree_lock_page_hook(struct page *page);
 #endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1d7f043..3b26f09 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2698,13 +2698,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 	/* if metadata always pin */
 	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
 		if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
-			struct btrfs_block_group_cache *cache;
-
-			/* btrfs_free_reserved_extent */
-			cache = btrfs_lookup_block_group(root->fs_info, bytenr);
-			BUG_ON(!cache);
-			btrfs_add_free_space(cache, bytenr, num_bytes);
-			put_block_group(cache);
+			mutex_lock(&root->fs_info->pinned_mutex);
+			btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
+			mutex_unlock(&root->fs_info->pinned_mutex);
 			update_reserved_extents(root, bytenr, num_bytes, 0);
 			return 0;
 		}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index fbcbf43..3e8023e 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1214,10 +1214,10 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 	}
 	mutex_unlock(&root->fs_info->trans_mutex);
 
-	root->fs_info->tree_log_batch++;
+	root->log_batch++;
 	filemap_fdatawrite(inode->i_mapping);
 	btrfs_wait_ordered_range(inode, 0, (u64)-1);
-	root->fs_info->tree_log_batch++;
+	root->log_batch++;
 
 	/*
 	 * ok we haven't committed the transaction yet, lets do a commit
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d81cda2..4f26f3e 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -78,104 +78,6 @@ static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
  */
 
 /*
- * btrfs_add_log_tree adds a new per-subvolume log tree into the
- * tree of log tree roots.  This must be called with a tree log transaction
- * running (see start_log_trans).
- */
-static int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
-		      struct btrfs_root *root)
-{
-	struct btrfs_key key;
-	struct btrfs_root_item root_item;
-	struct btrfs_inode_item *inode_item;
-	struct extent_buffer *leaf;
-	struct btrfs_root *new_root = root;
-	int ret;
-	u64 objectid = root->root_key.objectid;
-
-	leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
-				      BTRFS_TREE_LOG_OBJECTID,
-				      trans->transid, 0, 0, 0);
-	if (IS_ERR(leaf)) {
-		ret = PTR_ERR(leaf);
-		return ret;
-	}
-
-	btrfs_set_header_nritems(leaf, 0);
-	btrfs_set_header_level(leaf, 0);
-	btrfs_set_header_bytenr(leaf, leaf->start);
-	btrfs_set_header_generation(leaf, trans->transid);
-	btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
-
-	write_extent_buffer(leaf, root->fs_info->fsid,
-			    (unsigned long)btrfs_header_fsid(leaf),
-			    BTRFS_FSID_SIZE);
-	btrfs_mark_buffer_dirty(leaf);
-
-	inode_item = &root_item.inode;
-	memset(inode_item, 0, sizeof(*inode_item));
-	inode_item->generation = cpu_to_le64(1);
-	inode_item->size = cpu_to_le64(3);
-	inode_item->nlink = cpu_to_le32(1);
-	inode_item->nbytes = cpu_to_le64(root->leafsize);
-	inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
-
-	btrfs_set_root_bytenr(&root_item, leaf->start);
-	btrfs_set_root_generation(&root_item, trans->transid);
-	btrfs_set_root_level(&root_item, 0);
-	btrfs_set_root_refs(&root_item, 0);
-	btrfs_set_root_used(&root_item, 0);
-
-	memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
-	root_item.drop_level = 0;
-
-	btrfs_tree_unlock(leaf);
-	free_extent_buffer(leaf);
-	leaf = NULL;
-
-	btrfs_set_root_dirid(&root_item, 0);
-
-	key.objectid = BTRFS_TREE_LOG_OBJECTID;
-	key.offset = objectid;
-	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
-	ret = btrfs_insert_root(trans, root->fs_info->log_root_tree, &key,
-				&root_item);
-	if (ret)
-		goto fail;
-
-	new_root = btrfs_read_fs_root_no_radix(root->fs_info->log_root_tree,
-					       &key);
-	BUG_ON(!new_root);
-
-	WARN_ON(root->log_root);
-	root->log_root = new_root;
-
-	/*
-	 * log trees do not get reference counted because they go away
-	 * before a real commit is actually done.  They do store pointers
-	 * to file data extents, and those reference counts still get
-	 * updated (along with back refs to the log tree).
-	 */
-	new_root->ref_cows = 0;
-	new_root->last_trans = trans->transid;
-
-	/*
-	 * we need to make sure the root block for this new tree
-	 * is marked as dirty in the dirty_log_pages tree.  This
-	 * is how it gets flushed down to disk at tree log commit time.
-	 *
-	 * the tree logging mutex keeps others from coming in and changing
-	 * the new_root->node, so we can safely access it here
-	 */
-	set_extent_dirty(&new_root->dirty_log_pages, new_root->node->start,
-			 new_root->node->start + new_root->node->len - 1,
-			 GFP_NOFS);
-
-fail:
-	return ret;
-}
-
-/*
  * start a sub transaction and setup the log tree
  * this increments the log tree writer count to make the people
  * syncing the tree wait for us to finish
@@ -184,6 +86,14 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root)
 {
 	int ret;
+
+	mutex_lock(&root->log_mutex);
+	if (root->log_root) {
+		root->log_batch++;
+		atomic_inc(&root->log_writers);
+		mutex_unlock(&root->log_mutex);
+		return 0;
+	}
 	mutex_lock(&root->fs_info->tree_log_mutex);
 	if (!root->fs_info->log_root_tree) {
 		ret = btrfs_init_log_root_tree(trans, root->fs_info);
@@ -193,9 +103,10 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
 		ret = btrfs_add_log_tree(trans, root);
 		BUG_ON(ret);
 	}
-	atomic_inc(&root->fs_info->tree_log_writers);
-	root->fs_info->tree_log_batch++;
 	mutex_unlock(&root->fs_info->tree_log_mutex);
+	root->log_batch++;
+	atomic_inc(&root->log_writers);
+	mutex_unlock(&root->log_mutex);
 	return 0;
 }
 
@@ -212,13 +123,12 @@ static int join_running_log_trans(struct btrfs_root *root)
 	if (!root->log_root)
 		return -ENOENT;
 
-	mutex_lock(&root->fs_info->tree_log_mutex);
+	mutex_lock(&root->log_mutex);
 	if (root->log_root) {
 		ret = 0;
-		atomic_inc(&root->fs_info->tree_log_writers);
-		root->fs_info->tree_log_batch++;
+		atomic_inc(&root->log_writers);
 	}
-	mutex_unlock(&root->fs_info->tree_log_mutex);
+	mutex_unlock(&root->log_mutex);
 	return ret;
 }
 
@@ -228,10 +138,11 @@ static int join_running_log_trans(struct btrfs_root *root)
  */
 static int end_log_trans(struct btrfs_root *root)
 {
-	atomic_dec(&root->fs_info->tree_log_writers);
-	smp_mb();
-	if (waitqueue_active(&root->fs_info->tree_log_wait))
-		wake_up(&root->fs_info->tree_log_wait);
+	if (atomic_dec_and_test(&root->log_writers)) {
+		smp_mb();
+		if (waitqueue_active(&root->log_writer_wait))
+			wake_up(&root->log_writer_wait);
+	}
 	return 0;
 }
 
@@ -1902,26 +1813,65 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
 		}
 	}
 	btrfs_free_path(path);
-	if (wc->free)
-		free_extent_buffer(log->node);
 	return ret;
 }
 
-static int wait_log_commit(struct btrfs_root *log)
+/*
+ * helper function to update the item for a given subvolumes log root
+ * in the tree of log roots
+ */
+static int update_log_root(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *log)
+{
+	int ret;
+
+	if (log->log_transid == 1) {
+		/* insert root item on the first sync */
+		ret = btrfs_insert_root(trans, log->fs_info->log_root_tree,
+				&log->root_key, &log->root_item);
+	} else {
+		ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
+				&log->root_key, &log->root_item);
+	}
+	return ret;
+}
+
+static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
 {
 	DEFINE_WAIT(wait);
-	u64 transid = log->fs_info->tree_log_transid;
+	int index = transid % 2;
 
+	/*
+	 * we only allow two pending log transactions at a time,
+	 * so we know that if ours is more than 2 older than the
+	 * current transaction, we're done
+	 */
 	do {
-		prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
-				TASK_UNINTERRUPTIBLE);
-		mutex_unlock(&log->fs_info->tree_log_mutex);
-		if (atomic_read(&log->fs_info->tree_log_commit))
+		prepare_to_wait(&root->log_commit_wait[index],
+				&wait, TASK_UNINTERRUPTIBLE);
+		mutex_unlock(&root->log_mutex);
+		if (root->log_transid < transid + 2 &&
+		    atomic_read(&root->log_commit[index]))
 			schedule();
-		finish_wait(&log->fs_info->tree_log_wait, &wait);
-		mutex_lock(&log->fs_info->tree_log_mutex);
-	} while (transid == log->fs_info->tree_log_transid &&
-		atomic_read(&log->fs_info->tree_log_commit));
+		finish_wait(&root->log_commit_wait[index], &wait);
+		mutex_lock(&root->log_mutex);
+	} while (root->log_transid < transid + 2 &&
+		 atomic_read(&root->log_commit[index]));
+	return 0;
+}
+
+static int wait_for_writer(struct btrfs_root *root)
+{
+	DEFINE_WAIT(wait);
+	while (atomic_read(&root->log_writers)) {
+		prepare_to_wait(&root->log_writer_wait,
+				&wait, TASK_UNINTERRUPTIBLE);
+		mutex_unlock(&root->log_mutex);
+		if (atomic_read(&root->log_writers))
+			schedule();
+		mutex_lock(&root->log_mutex);
+		finish_wait(&root->log_writer_wait, &wait);
+	}
 	return 0;
 }
 
@@ -1933,57 +1883,114 @@ static int wait_log_commit(struct btrfs_root *log)
 int btrfs_sync_log(struct btrfs_trans_handle *trans,
 		   struct btrfs_root *root)
 {
+	int index1;
+	int index2;
 	int ret;
-	unsigned long batch;
 	struct btrfs_root *log = root->log_root;
+	struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
 
-	mutex_lock(&log->fs_info->tree_log_mutex);
-	if (atomic_read(&log->fs_info->tree_log_commit)) {
-		wait_log_commit(log);
-		goto out;
+	mutex_lock(&root->log_mutex);
+	index1 = root->log_transid % 2;
+	if (atomic_read(&root->log_commit[index1])) {
+		wait_log_commit(root, root->log_transid);
+		mutex_unlock(&root->log_mutex);
+		return 0;
 	}
-	atomic_set(&log->fs_info->tree_log_commit, 1);
+	atomic_set(&root->log_commit[index1], 1);
+
+	/* wait for previous tree log sync to complete */
+	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
+		wait_log_commit(root, root->log_transid - 1);
 
 	while (1) {
-		batch = log->fs_info->tree_log_batch;
-		mutex_unlock(&log->fs_info->tree_log_mutex);
+		unsigned long batch = root->log_batch;
+		mutex_unlock(&root->log_mutex);
 		schedule_timeout_uninterruptible(1);
-		mutex_lock(&log->fs_info->tree_log_mutex);
-
-		while (atomic_read(&log->fs_info->tree_log_writers)) {
-			DEFINE_WAIT(wait);
-			prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
-					TASK_UNINTERRUPTIBLE);
-			mutex_unlock(&log->fs_info->tree_log_mutex);
-			if (atomic_read(&log->fs_info->tree_log_writers))
-				schedule();
-			mutex_lock(&log->fs_info->tree_log_mutex);
-			finish_wait(&log->fs_info->tree_log_wait, &wait);
-		}
-		if (batch == log->fs_info->tree_log_batch)
+		mutex_lock(&root->log_mutex);
+		wait_for_writer(root);
+		if (batch == root->log_batch)
 			break;
 	}
 
 	ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
 	BUG_ON(ret);
-	ret = btrfs_write_and_wait_marked_extents(root->fs_info->log_root_tree,
-			       &root->fs_info->log_root_tree->dirty_log_pages);
+
+	btrfs_set_root_bytenr(&log->root_item, log->node->start);
+	btrfs_set_root_generation(&log->root_item, trans->transid);
+	btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
+
+	root->log_batch = 0;
+	root->log_transid++;
+	log->log_transid = root->log_transid;
+	smp_mb();
+	/*
+	 * log tree has been flushed to disk, new modifications of
+	 * the log will be written to new positions. so it's safe to
+	 * allow log writers to go in.
+	 */
+	mutex_unlock(&root->log_mutex);
+
+	mutex_lock(&log_root_tree->log_mutex);
+	log_root_tree->log_batch++;
+	atomic_inc(&log_root_tree->log_writers);
+	mutex_unlock(&log_root_tree->log_mutex);
+
+	ret = update_log_root(trans, log);
+	BUG_ON(ret);
+
+	mutex_lock(&log_root_tree->log_mutex);
+	if (atomic_dec_and_test(&log_root_tree->log_writers)) {
+		smp_mb();
+		if (waitqueue_active(&log_root_tree->log_writer_wait))
+			wake_up(&log_root_tree->log_writer_wait);
+	}
+
+	index2 = log_root_tree->log_transid % 2;
+	if (atomic_read(&log_root_tree->log_commit[index2])) {
+		wait_log_commit(log_root_tree, log_root_tree->log_transid);
+		mutex_unlock(&log_root_tree->log_mutex);
+		goto out;
+	}
+	atomic_set(&log_root_tree->log_commit[index2], 1);
+
+	if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2]))
+		wait_log_commit(log_root_tree, log_root_tree->log_transid - 1);
+
+	wait_for_writer(log_root_tree);
+
+	ret = btrfs_write_and_wait_marked_extents(log_root_tree,
+				&log_root_tree->dirty_log_pages);
 	BUG_ON(ret);
 
 	btrfs_set_super_log_root(&root->fs_info->super_for_commit,
-				 log->fs_info->log_root_tree->node->start);
+				log_root_tree->node->start);
 	btrfs_set_super_log_root_level(&root->fs_info->super_for_commit,
-		       btrfs_header_level(log->fs_info->log_root_tree->node));
+				btrfs_header_level(log_root_tree->node));
+
+	log_root_tree->log_batch = 0;
+	log_root_tree->log_transid++;
+	smp_mb();
+
+	mutex_unlock(&log_root_tree->log_mutex);
+
+	/*
+	 * nobody else is going to jump in and write the the ctree
+	 * super here because the log_commit atomic below is protecting
+	 * us.  We must be called with a transaction handle pinning
+	 * the running transaction open, so a full commit can't hop
+	 * in and cause problems either.
+	 */
+	write_ctree_super(trans, root->fs_info->tree_root, 2);
 
-	write_ctree_super(trans, log->fs_info->tree_root, 2);
-	log->fs_info->tree_log_transid++;
-	log->fs_info->tree_log_batch = 0;
-	atomic_set(&log->fs_info->tree_log_commit, 0);
+	atomic_set(&log_root_tree->log_commit[index2], 0);
 	smp_mb();
-	if (waitqueue_active(&log->fs_info->tree_log_wait))
-		wake_up(&log->fs_info->tree_log_wait);
+	if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
+		wake_up(&log_root_tree->log_commit_wait[index2]);
 out:
-	mutex_unlock(&log->fs_info->tree_log_mutex);
+	atomic_set(&root->log_commit[index1], 0);
+	smp_mb();
+	if (waitqueue_active(&root->log_commit_wait[index1]))
+		wake_up(&root->log_commit_wait[index1]);
 	return 0;
 }
 
@@ -2019,38 +2026,18 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
 				   start, end, GFP_NOFS);
 	}
 
-	log = root->log_root;
-	ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
-			     &log->root_key);
-	BUG_ON(ret);
+	if (log->log_transid > 0) {
+		ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
+				     &log->root_key);
+		BUG_ON(ret);
+	}
 	root->log_root = NULL;
-	kfree(root->log_root);
+	free_extent_buffer(log->node);
+	kfree(log);
 	return 0;
 }
 
 /*
- * helper function to update the item for a given subvolumes log root
- * in the tree of log roots
- */
-static int update_log_root(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *log)
-{
-	u64 bytenr = btrfs_root_bytenr(&log->root_item);
-	int ret;
-
-	if (log->node->start == bytenr)
-		return 0;
-
-	btrfs_set_root_bytenr(&log->root_item, log->node->start);
-	btrfs_set_root_generation(&log->root_item, trans->transid);
-	btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
-	ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
-				&log->root_key, &log->root_item);
-	BUG_ON(ret);
-	return ret;
-}
-
-/*
  * If both a file and directory are logged, and unlinks or renames are
  * mixed in, we have a few interesting corners:
  *
@@ -2711,11 +2698,6 @@ next_slot:
 
 	btrfs_free_path(path);
 	btrfs_free_path(dst_path);
-
-	mutex_lock(&root->fs_info->tree_log_mutex);
-	ret = update_log_root(trans, log);
-	BUG_ON(ret);
-	mutex_unlock(&root->fs_info->tree_log_mutex);
 out:
 	return 0;
 }
-- 
cgit v1.1


From 35054394c4b3cecd52577c2662c84da1f3e73525 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 21 Jan 2009 13:11:13 -0500
Subject: Btrfs: stop providing a bmap operation to avoid swapfile corruptions

Swapfiles use bmap to build a list of extents belonging to the file,
and they assume these extents won't change over the life of the file.
They also use resulting list to do IO directly to the block device.

This causes problems for btrfs in a few ways:

btrfs returns logical block numbers through bmap, and these are not suitable
for IO.  They might translate to different devices, raid etc.

COW means that file block mappings are going to change frequently.

Using swapfiles on btrfs will lead to corruption, so we're avoiding the
problem for now by dropping bmap support entirely.  A later commit
will add fiemap support for people that really want to know how
a file is laid out.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 45cf03e..2e25d69 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4156,11 +4156,6 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 	return -EINVAL;
 }
 
-static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock)
-{
-	return extent_bmap(mapping, iblock, btrfs_get_extent);
-}
-
 int btrfs_readpage(struct file *file, struct page *page)
 {
 	struct extent_io_tree *tree;
@@ -4985,13 +4980,24 @@ static struct extent_io_ops btrfs_extent_io_ops = {
 	.clear_bit_hook = btrfs_clear_bit_hook,
 };
 
+/*
+ * btrfs doesn't support the bmap operation because swapfiles
+ * use bmap to make a mapping of extents in the file.  They assume
+ * these extents won't change over the life of the file and they
+ * use the bmap result to do IO directly to the drive.
+ *
+ * the btrfs bmap call would return logical addresses that aren't
+ * suitable for IO and they also will change frequently as COW
+ * operations happen.  So, swapfile + btrfs == corruption.
+ *
+ * For now we're avoiding this by dropping bmap.
+ */
 static struct address_space_operations btrfs_aops = {
 	.readpage	= btrfs_readpage,
 	.writepage	= btrfs_writepage,
 	.writepages	= btrfs_writepages,
 	.readpages	= btrfs_readpages,
 	.sync_page	= block_sync_page,
-	.bmap		= btrfs_bmap,
 	.direct_IO	= btrfs_direct_IO,
 	.invalidatepage = btrfs_invalidatepage,
 	.releasepage	= btrfs_releasepage,
-- 
cgit v1.1


From 1506fcc8189cdd4b95e06df7845a09f18b4526a6 Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@hq.newdream.net>
Date: Wed, 21 Jan 2009 14:39:14 -0500
Subject: Btrfs: fiemap support

Now that bmap support is gone, this is the only way to get extent
mappings for userland.  These are still not valid for IO, but they
can tell us if a file has holes or how much fragmentation there is.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
---
 fs/btrfs/extent_io.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/extent_io.h |  2 ++
 fs/btrfs/inode.c     |  7 ++++
 3 files changed, 101 insertions(+)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c9446d4..a3b0676 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2854,6 +2854,98 @@ out:
 	return sector;
 }
 
+int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		__u64 start, __u64 len, get_extent_t *get_extent)
+{
+	int ret;
+	u64 off = start;
+	u64 max = start + len;
+	u32 flags = 0;
+	u64 disko = 0;
+	struct extent_map *em = NULL;
+	int end = 0;
+	u64 em_start = 0, em_len = 0;
+	unsigned long emflags;
+	ret = 0;
+
+	if (len == 0)
+		return -EINVAL;
+
+	lock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
+		GFP_NOFS);
+	em = get_extent(inode, NULL, 0, off, max - off, 0);
+	if (!em)
+		goto out;
+	if (IS_ERR(em)) {
+		ret = PTR_ERR(em);
+		goto out;
+	}
+	while (!end) {
+		off = em->start + em->len;
+		if (off >= max)
+			end = 1;
+
+		em_start = em->start;
+		em_len = em->len;
+
+		disko = 0;
+		flags = 0;
+
+		switch (em->block_start) {
+		case EXTENT_MAP_LAST_BYTE:
+			end = 1;
+			flags |= FIEMAP_EXTENT_LAST;
+			break;
+		case EXTENT_MAP_HOLE:
+			flags |= FIEMAP_EXTENT_UNWRITTEN;
+			break;
+		case EXTENT_MAP_INLINE:
+			flags |= (FIEMAP_EXTENT_DATA_INLINE |
+				  FIEMAP_EXTENT_NOT_ALIGNED);
+			break;
+		case EXTENT_MAP_DELALLOC:
+			flags |= (FIEMAP_EXTENT_DELALLOC |
+				  FIEMAP_EXTENT_UNKNOWN);
+			break;
+		default:
+			disko = em->block_start;
+			break;
+		}
+		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+			flags |= FIEMAP_EXTENT_ENCODED;
+
+		emflags = em->flags;
+		free_extent_map(em);
+		em = NULL;
+
+		if (!end) {
+			em = get_extent(inode, NULL, 0, off, max - off, 0);
+			if (!em)
+				goto out;
+			if (IS_ERR(em)) {
+				ret = PTR_ERR(em);
+				goto out;
+			}
+			emflags = em->flags;
+		}
+		if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) {
+			flags |= FIEMAP_EXTENT_LAST;
+			end = 1;
+		}
+
+		ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
+					em_len, flags);
+		if (ret)
+			goto out_free;
+	}
+out_free:
+	free_extent_map(em);
+out:
+	unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
+			GFP_NOFS);
+	return ret;
+}
+
 static inline struct page *extent_buffer_page(struct extent_buffer *eb,
 					      unsigned long i)
 {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index c5b483a..e80c6d9 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -193,6 +193,8 @@ int extent_commit_write(struct extent_io_tree *tree,
 			unsigned from, unsigned to);
 sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
 		get_extent_t *get_extent);
+int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		__u64 start, __u64 len, get_extent_t *get_extent);
 int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end);
 int set_state_private(struct extent_io_tree *tree, u64 start, u64 private);
 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2e25d69..288c2cd 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4156,6 +4156,12 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 	return -EINVAL;
 }
 
+static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		__u64 start, __u64 len)
+{
+	return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent);
+}
+
 int btrfs_readpage(struct file *file, struct page *page)
 {
 	struct extent_io_tree *tree;
@@ -5021,6 +5027,7 @@ static struct inode_operations btrfs_file_inode_operations = {
 	.removexattr	= btrfs_removexattr,
 	.permission	= btrfs_permission,
 	.fallocate	= btrfs_fallocate,
+	.fiemap		= btrfs_fiemap,
 };
 static struct inode_operations btrfs_special_inode_operations = {
 	.getattr	= btrfs_getattr,
-- 
cgit v1.1


From 335debee07f2d4187a6073d7764ed56bb2ae52f4 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 10:27:30 +0300
Subject: fs/Kconfig: move btrfs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/btrfs/Kconfig | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)
 create mode 100644 fs/btrfs/Kconfig

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
new file mode 100644
index 0000000..f8fcf99
--- /dev/null
+++ b/fs/btrfs/Kconfig
@@ -0,0 +1,18 @@
+config BTRFS_FS
+	tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format"
+	depends on EXPERIMENTAL
+	select LIBCRC32C
+	select ZLIB_INFLATE
+	select ZLIB_DEFLATE
+	help
+	  Btrfs is a new filesystem with extents, writable snapshotting,
+	  support for multiple devices and many more features.
+
+	  Btrfs is highly experimental, and THE DISK FORMAT IS NOT YET
+	  FINALIZED.  You should say N here unless you are interested in
+	  testing Btrfs with non-critical data.
+
+	  To compile this file system support as a module, choose M here. The
+	  module will be called btrfs.
+
+	  If unsure, say N.
-- 
cgit v1.1


From a717531942f488209dded30f6bc648167bcefa72 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 22 Jan 2009 09:23:10 -0500
Subject: Btrfs: do less aggressive btree readahead

Just before reading a leaf, btrfs scans the node for blocks that are
close by and reads them too.  It tries to build up a large window
of IO looking for blocks that are within a max distance from the top
and bottom of the IO window.

This patch changes things to just look for blocks within 64k of the
target block.  It will trigger less IO and make for lower latencies on
the read size.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 21 +++++----------------
 1 file changed, 5 insertions(+), 16 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 9e46c07..2603ee5 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1210,8 +1210,7 @@ static noinline void reada_for_search(struct btrfs_root *root,
 	struct btrfs_disk_key disk_key;
 	u32 nritems;
 	u64 search;
-	u64 lowest_read;
-	u64 highest_read;
+	u64 target;
 	u64 nread = 0;
 	int direction = path->reada;
 	struct extent_buffer *eb;
@@ -1235,8 +1234,7 @@ static noinline void reada_for_search(struct btrfs_root *root,
 		return;
 	}
 
-	highest_read = search;
-	lowest_read = search;
+	target = search;
 
 	nritems = btrfs_header_nritems(node);
 	nr = slot;
@@ -1256,24 +1254,15 @@ static noinline void reada_for_search(struct btrfs_root *root,
 				break;
 		}
 		search = btrfs_node_blockptr(node, nr);
-		if ((search >= lowest_read && search <= highest_read) ||
-		    (search < lowest_read && lowest_read - search <= 16384) ||
-		    (search > highest_read && search - highest_read <= 16384)) {
+		if ((search <= target && target - search <= 65536) ||
+		    (search > target && search - target <= 65536)) {
 			readahead_tree_block(root, search, blocksize,
 				     btrfs_node_ptr_generation(node, nr));
 			nread += blocksize;
 		}
 		nscan++;
-		if (path->reada < 2 && (nread > (64 * 1024) || nscan > 32))
+		if ((nread > 65536 || nscan > 32))
 			break;
-
-		if (nread > (256 * 1024) || nscan > 128)
-			break;
-
-		if (search < lowest_read)
-			lowest_read = search;
-		if (search > highest_read)
-			highest_read = search;
 	}
 }
 
-- 
cgit v1.1


From 89f135d8b53bcccafd91a075366d2704ba257cf3 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 28 Jan 2009 15:34:27 -0500
Subject: Btrfs: fix readdir on 32 bit machines

After btrfs_readdir has gone through all the directory items, it
sets the directory f_pos to the largest possible int.  This way
applications that mix readdir with creating new files don't
end up in an endless loop finding the new directory items as they go.

It was a workaround for a bug in git, but the assumption was that if git
could make this looping mistake than it would be a common problem.

The largest possible int chosen was INT_LIMIT(typeof(file->f_pos),
and it is possible for that to be a larger number than 32 bit glibc
expects to come out of readdir.

This patches switches that to INT_LIMIT(off_t), which should keep
applications happy on 32 and 64 bit machines.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 288c2cd..2bb65e9 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3263,7 +3263,7 @@ skip:
 
 	/* Reached end of directory/root. Bump pos past the last item. */
 	if (key_type == BTRFS_DIR_INDEX_KEY)
-		filp->f_pos = INT_LIMIT(typeof(filp->f_pos));
+		filp->f_pos = INT_LIMIT(off_t);
 	else
 		filp->f_pos++;
 nopos:
-- 
cgit v1.1


From a68370515356a3eddbfaf7f56418b3cf85d76c2c Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 4 Feb 2009 09:19:41 -0500
Subject: Btrfs: Catch missed bios in the async bio submission thread

The async bio submission thread was missing some bios that were
added after it had decided there was no work left to do.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/async-thread.c | 10 +++++++++-
 fs/btrfs/volumes.c      | 11 +++++++++--
 2 files changed, 18 insertions(+), 3 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index d5f4e94..f2e80f3 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -349,6 +349,7 @@ int btrfs_requeue_work(struct btrfs_work *work)
 {
 	struct btrfs_worker_thread *worker = work->worker;
 	unsigned long flags;
+	int wake = 0;
 
 	if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
 		goto out;
@@ -367,10 +368,16 @@ int btrfs_requeue_work(struct btrfs_work *work)
 			       &worker->workers->worker_list);
 		spin_unlock_irqrestore(&worker->workers->lock, flags);
 	}
+	if (!worker->working) {
+		wake = 1;
+		worker->working = 1;
+	}
 
 	spin_unlock_irqrestore(&worker->lock, flags);
-
+	if (wake)
+		wake_up_process(worker->task);
 out:
+
 	return 0;
 }
 
@@ -397,6 +404,7 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
 	}
 
 	spin_lock_irqsave(&worker->lock, flags);
+
 	atomic_inc(&worker->num_pending);
 	check_busy_worker(worker);
 	list_add_tail(&work->list, &worker->pending);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index fd0bedb..bcd14eb 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -154,6 +154,7 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
 loop:
 	spin_lock(&device->io_lock);
 
+loop_lock:
 	/* take all the bios off the list at once and process them
 	 * later on (without the lock held).  But, remember the
 	 * tail and other pointers so the bios can be properly reinserted
@@ -203,7 +204,7 @@ loop:
 		 * is now congested.  Back off and let other work structs
 		 * run instead
 		 */
-		if (pending && bdi_write_congested(bdi) &&
+		if (pending && bdi_write_congested(bdi) && num_run > 16 &&
 		    fs_info->fs_devices->open_devices > 1) {
 			struct bio *old_head;
 
@@ -215,7 +216,8 @@ loop:
 				tail->bi_next = old_head;
 			else
 				device->pending_bio_tail = tail;
-			device->running_pending = 0;
+
+			device->running_pending = 1;
 
 			spin_unlock(&device->io_lock);
 			btrfs_requeue_work(&device->work);
@@ -224,6 +226,11 @@ loop:
 	}
 	if (again)
 		goto loop;
+
+	spin_lock(&device->io_lock);
+	if (device->pending_bios)
+		goto loop_lock;
+	spin_unlock(&device->io_lock);
 done:
 	return 0;
 }
-- 
cgit v1.1


From bef62ef339c15d7721da88958b03f7b544464722 Mon Sep 17 00:00:00 2001
From: Christian Hesse <mail@earthworm.de>
Date: Wed, 4 Feb 2009 09:28:28 -0500
Subject: Btrfs: make btrfs acls selectable This patch adds a menu entry to
 kconfig to enable acls for btrfs. This allows you to enable FS_POSIX_ACL at
 kernel compile time.

(updated by Jeff Mahoney to make the changes in fs/btrfs/Kconfig instead)

Signed-off-by: Christian Hesse <mail@earthworm.de>
Signed-off-by: Jeff Mahoney <jeffm@suse.com>
---
 fs/btrfs/Kconfig | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index f8fcf99..7bb3c02 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -16,3 +16,16 @@ config BTRFS_FS
 	  module will be called btrfs.
 
 	  If unsure, say N.
+
+config BTRFS_FS_POSIX_ACL
+	bool "Btrfs POSIX Access Control Lists"
+	depends on BTRFS_FS
+	select FS_POSIX_ACL
+	help
+	  POSIX Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the POSIX ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+	  If you don't know what Access Control Lists are, say N
-- 
cgit v1.1


From 0279b4cd86685b5eea467c1b74ce94f0add2c0a3 Mon Sep 17 00:00:00 2001
From: Jim Owens <jowens@hp.com>
Date: Wed, 4 Feb 2009 09:29:13 -0500
Subject: Btrfs: selinux support Add call to LSM security initialization and
 save resulting security xattr for new inodes.

Add xattr support to symlink inode ops.

Set inode->i_op for existing special files.

Signed-off-by: jim owens <jowens@hp.com>
---
 fs/btrfs/inode.c | 23 +++++++++++++++++++----
 fs/btrfs/xattr.c | 32 ++++++++++++++++++++++++++++++++
 fs/btrfs/xattr.h |  2 ++
 3 files changed, 53 insertions(+), 4 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2bb65e9..4a79e1c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -90,6 +90,16 @@ static noinline int cow_file_range(struct inode *inode,
 				   u64 start, u64 end, int *page_started,
 				   unsigned long *nr_written, int unlock);
 
+static int btrfs_init_inode_security(struct inode *inode,  struct inode *dir)
+{
+	int err;
+
+	err = btrfs_init_acl(inode, dir);
+	if (!err)
+		err = btrfs_xattr_security_init(inode, dir);
+	return err;
+}
+
 /*
  * a very lame attempt at stopping writes when the FS is 85% full.  There
  * are countless ways this is incorrect, but it is better than nothing.
@@ -2037,6 +2047,7 @@ void btrfs_read_locked_inode(struct inode *inode)
 		inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
 		break;
 	default:
+		inode->i_op = &btrfs_special_inode_operations;
 		init_special_inode(inode, inode->i_mode, rdev);
 		break;
 	}
@@ -3584,7 +3595,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 	if (IS_ERR(inode))
 		goto out_unlock;
 
-	err = btrfs_init_acl(inode, dir);
+	err = btrfs_init_inode_security(inode, dir);
 	if (err) {
 		drop_inode = 1;
 		goto out_unlock;
@@ -3647,7 +3658,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 	if (IS_ERR(inode))
 		goto out_unlock;
 
-	err = btrfs_init_acl(inode, dir);
+	err = btrfs_init_inode_security(inode, dir);
 	if (err) {
 		drop_inode = 1;
 		goto out_unlock;
@@ -3770,7 +3781,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 
 	drop_on_err = 1;
 
-	err = btrfs_init_acl(inode, dir);
+	err = btrfs_init_inode_security(inode, dir);
 	if (err)
 		goto out_fail;
 
@@ -4732,7 +4743,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	if (IS_ERR(inode))
 		goto out_unlock;
 
-	err = btrfs_init_acl(inode, dir);
+	err = btrfs_init_inode_security(inode, dir);
 	if (err) {
 		drop_inode = 1;
 		goto out_unlock;
@@ -5043,4 +5054,8 @@ static struct inode_operations btrfs_symlink_inode_operations = {
 	.follow_link	= page_follow_link_light,
 	.put_link	= page_put_link,
 	.permission	= btrfs_permission,
+	.setxattr	= btrfs_setxattr,
+	.getxattr	= btrfs_getxattr,
+	.listxattr	= btrfs_listxattr,
+	.removexattr	= btrfs_removexattr,
 };
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index b4fa5f4..312b943 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -21,6 +21,7 @@
 #include <linux/slab.h>
 #include <linux/rwsem.h>
 #include <linux/xattr.h>
+#include <linux/security.h>
 #include "ctree.h"
 #include "btrfs_inode.h"
 #include "transaction.h"
@@ -330,3 +331,34 @@ int btrfs_removexattr(struct dentry *dentry, const char *name)
 		return -EOPNOTSUPP;
 	return __btrfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE);
 }
+
+int btrfs_xattr_security_init(struct inode *inode, struct inode *dir)
+{
+	int err;
+	size_t len;
+	void *value;
+	char *suffix;
+	char *name;
+
+	err = security_inode_init_security(inode, dir, &suffix, &value, &len);
+	if (err) {
+		if (err == -EOPNOTSUPP)
+			return 0;
+		return err;
+	}
+
+	name = kmalloc(XATTR_SECURITY_PREFIX_LEN + strlen(suffix) + 1,
+		       GFP_NOFS);
+	if (!name) {
+		err = -ENOMEM;
+	} else {
+		strcpy(name, XATTR_SECURITY_PREFIX);
+		strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix);
+		err = __btrfs_setxattr(inode, name, value, len, 0);
+		kfree(name);
+	}
+
+	kfree(suffix);
+	kfree(value);
+	return err;
+}
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 5b1d08f..c71e9c3 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -36,4 +36,6 @@ extern int btrfs_setxattr(struct dentry *dentry, const char *name,
 		const void *value, size_t size, int flags);
 extern int btrfs_removexattr(struct dentry *dentry, const char *name);
 
+extern int btrfs_xattr_security_init(struct inode *inode, struct inode *dir);
+
 #endif /* __XATTR__ */
-- 
cgit v1.1


From b51912c91fcf7581cc7b4550f1bb96422809d9ed Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 4 Feb 2009 09:23:24 -0500
Subject: Btrfs: async threads should try harder to find work

Tracing shows the delay between when an async thread goes to sleep
and when more work is added is often very short.  This commit adds
a little bit of delay and extra checking to the code right before
we schedule out.

It allows more work to be added to the worker
without requiring notifications from other procs.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/async-thread.c | 50 +++++++++++++++++++++++++++++++++++++++++++------
 fs/btrfs/disk-io.c      |  2 ++
 2 files changed, 46 insertions(+), 6 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index f2e80f3..c84ca1f 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -19,7 +19,8 @@
 #include <linux/kthread.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
-# include <linux/freezer.h>
+#include <linux/freezer.h>
+#include <linux/ftrace.h>
 #include "async-thread.h"
 
 #define WORK_QUEUED_BIT 0
@@ -142,6 +143,7 @@ static int worker_loop(void *arg)
 	struct btrfs_work *work;
 	do {
 		spin_lock_irq(&worker->lock);
+again_locked:
 		while (!list_empty(&worker->pending)) {
 			cur = worker->pending.next;
 			work = list_entry(cur, struct btrfs_work, list);
@@ -164,14 +166,50 @@ static int worker_loop(void *arg)
 			check_idle_worker(worker);
 
 		}
-		worker->working = 0;
 		if (freezing(current)) {
+			worker->working = 0;
+			spin_unlock_irq(&worker->lock);
 			refrigerator();
 		} else {
-			set_current_state(TASK_INTERRUPTIBLE);
 			spin_unlock_irq(&worker->lock);
-			if (!kthread_should_stop())
+			if (!kthread_should_stop()) {
+				cpu_relax();
+				/*
+				 * we've dropped the lock, did someone else
+				 * jump_in?
+				 */
+				smp_mb();
+				if (!list_empty(&worker->pending))
+					continue;
+
+				/*
+				 * this short schedule allows more work to
+				 * come in without the queue functions
+				 * needing to go through wake_up_process()
+				 *
+				 * worker->working is still 1, so nobody
+				 * is going to try and wake us up
+				 */
+				schedule_timeout(1);
+				smp_mb();
+				if (!list_empty(&worker->pending))
+					continue;
+
+				/* still no more work?, sleep for real */
+				spin_lock_irq(&worker->lock);
+				set_current_state(TASK_INTERRUPTIBLE);
+				if (!list_empty(&worker->pending))
+					goto again_locked;
+
+				/*
+				 * this makes sure we get a wakeup when someone
+				 * adds something new to the queue
+				 */
+				worker->working = 0;
+				spin_unlock_irq(&worker->lock);
+
 				schedule();
+			}
 			__set_current_state(TASK_RUNNING);
 		}
 	} while (!kthread_should_stop());
@@ -355,8 +393,8 @@ int btrfs_requeue_work(struct btrfs_work *work)
 		goto out;
 
 	spin_lock_irqsave(&worker->lock, flags);
-	atomic_inc(&worker->num_pending);
 	list_add_tail(&work->list, &worker->pending);
+	atomic_inc(&worker->num_pending);
 
 	/* by definition we're busy, take ourselves off the idle
 	 * list
@@ -405,9 +443,9 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
 
 	spin_lock_irqsave(&worker->lock, flags);
 
+	list_add_tail(&work->list, &worker->pending);
 	atomic_inc(&worker->num_pending);
 	check_busy_worker(worker);
-	list_add_tail(&work->list, &worker->pending);
 
 	/*
 	 * avoid calling into wake_up_process if this thread has already
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7feac5a..9c38100 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1679,6 +1679,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	 * low idle thresh
 	 */
 	fs_info->endio_workers.idle_thresh = 4;
+	fs_info->endio_meta_workers.idle_thresh = 4;
+
 	fs_info->endio_write_workers.idle_thresh = 64;
 	fs_info->endio_meta_write_workers.idle_thresh = 64;
 
-- 
cgit v1.1


From b7a9f29fcf4e53e9ca7982331649fa2013e69c99 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 4 Feb 2009 09:23:45 -0500
Subject: Btrfs: sort references by byte number during btrfs_inc_ref

When a block goes through cow, we update the reference counts of
everything that block points to.  The internal pointers of the block
can be in just about any order, and it is likely to have clusters of
things that are close together and clusters of things that are not.

To help reduce the seeks that come with updating all of these reference
counts, sort them by byte number before actual updates are done.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 79 insertions(+), 6 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3b26f09..7a22f2e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -19,6 +19,7 @@
 #include <linux/pagemap.h>
 #include <linux/writeback.h>
 #include <linux/blkdev.h>
+#include <linux/sort.h>
 #include "compat.h"
 #include "hash.h"
 #include "crc32c.h"
@@ -1521,15 +1522,50 @@ out:
 	return ret;
 }
 
-int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		  struct extent_buffer *orig_buf, struct extent_buffer *buf,
-		  u32 *nr_extents)
+/* when a block goes through cow, we update the reference counts of
+ * everything that block points to.  The internal pointers of the block
+ * can be in just about any order, and it is likely to have clusters of
+ * things that are close together and clusters of things that are not.
+ *
+ * To help reduce the seeks that come with updating all of these reference
+ * counts, sort them by byte number before actual updates are done.
+ *
+ * struct refsort is used to match byte number to slot in the btree block.
+ * we sort based on the byte number and then use the slot to actually
+ * find the item.
+ */
+struct refsort {
+	u64 bytenr;
+	u32 slot;
+};
+
+/*
+ * for passing into sort()
+ */
+static int refsort_cmp(const void *a_void, const void *b_void)
+{
+	const struct refsort *a = a_void;
+	const struct refsort *b = b_void;
+
+	if (a->bytenr < b->bytenr)
+		return -1;
+	if (a->bytenr > b->bytenr)
+		return 1;
+	return 0;
+}
+
+
+noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root,
+			   struct extent_buffer *orig_buf,
+			   struct extent_buffer *buf, u32 *nr_extents)
 {
 	u64 bytenr;
 	u64 ref_root;
 	u64 orig_root;
 	u64 ref_generation;
 	u64 orig_generation;
+	struct refsort *sorted;
 	u32 nritems;
 	u32 nr_file_extents = 0;
 	struct btrfs_key key;
@@ -1538,6 +1574,8 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	int level;
 	int ret = 0;
 	int faili = 0;
+	int refi = 0;
+	int slot;
 	int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
 			    u64, u64, u64, u64, u64, u64, u64, u64);
 
@@ -1549,6 +1587,9 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	nritems = btrfs_header_nritems(buf);
 	level = btrfs_header_level(buf);
 
+	sorted = kmalloc(sizeof(struct refsort) * nritems, GFP_NOFS);
+	BUG_ON(!sorted);
+
 	if (root->ref_cows) {
 		process_func = __btrfs_inc_extent_ref;
 	} else {
@@ -1561,6 +1602,11 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		process_func = __btrfs_update_extent_ref;
 	}
 
+	/*
+	 * we make two passes through the items.  In the first pass we
+	 * only record the byte number and slot.  Then we sort based on
+	 * byte number and do the actual work based on the sorted results
+	 */
 	for (i = 0; i < nritems; i++) {
 		cond_resched();
 		if (level == 0) {
@@ -1577,6 +1623,32 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 				continue;
 
 			nr_file_extents++;
+			sorted[refi].bytenr = bytenr;
+			sorted[refi].slot = i;
+			refi++;
+		} else {
+			bytenr = btrfs_node_blockptr(buf, i);
+			sorted[refi].bytenr = bytenr;
+			sorted[refi].slot = i;
+			refi++;
+		}
+	}
+	/*
+	 * if refi == 0, we didn't actually put anything into the sorted
+	 * array and we're done
+	 */
+	if (refi == 0)
+		goto out;
+
+	sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
+
+	for (i = 0; i < refi; i++) {
+		cond_resched();
+		slot = sorted[i].slot;
+		bytenr = sorted[i].bytenr;
+
+		if (level == 0) {
+			btrfs_item_key_to_cpu(buf, &key, slot);
 
 			ret = process_func(trans, root, bytenr,
 					   orig_buf->start, buf->start,
@@ -1585,25 +1657,25 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 					   key.objectid);
 
 			if (ret) {
-				faili = i;
+				faili = slot;
 				WARN_ON(1);
 				goto fail;
 			}
 		} else {
-			bytenr = btrfs_node_blockptr(buf, i);
 			ret = process_func(trans, root, bytenr,
 					   orig_buf->start, buf->start,
 					   orig_root, ref_root,
 					   orig_generation, ref_generation,
 					   level - 1);
 			if (ret) {
-				faili = i;
+				faili = slot;
 				WARN_ON(1);
 				goto fail;
 			}
 		}
 	}
 out:
+	kfree(sorted);
 	if (nr_extents) {
 		if (level == 0)
 			*nr_extents = nr_file_extents;
@@ -1612,6 +1684,7 @@ out:
 	}
 	return 0;
 fail:
+	kfree(sorted);
 	WARN_ON(1);
 	return ret;
 }
-- 
cgit v1.1


From 3935127c50c84106d654ef14962cff28c660bc62 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 4 Feb 2009 09:24:05 -0500
Subject: Btrfs: disable leak debugging checks in extent_io.c

extent_io.c has debugging code to report and free leaked extent_state
and extent_buffer objects at rmmod time.  This helps track down
leaks and it saves you from rebooting just to properly remove the
kmem_cache object.

But, the code runs under a fairly expensive spinlock and the checks to
see if it is currently enabled are not entirely consistent.  Some use
#ifdef and some #if.

This changes everything to #if and disables the leak checking.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index a3b0676..2ea7f05 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -30,7 +30,7 @@ static LIST_HEAD(buffers);
 static LIST_HEAD(states);
 
 #define LEAK_DEBUG 0
-#ifdef LEAK_DEBUG
+#if LEAK_DEBUG
 static DEFINE_SPINLOCK(leak_lock);
 #endif
 
@@ -119,7 +119,7 @@ void extent_io_tree_init(struct extent_io_tree *tree,
 static struct extent_state *alloc_extent_state(gfp_t mask)
 {
 	struct extent_state *state;
-#ifdef LEAK_DEBUG
+#if LEAK_DEBUG
 	unsigned long flags;
 #endif
 
@@ -129,7 +129,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
 	state->state = 0;
 	state->private = 0;
 	state->tree = NULL;
-#ifdef LEAK_DEBUG
+#if LEAK_DEBUG
 	spin_lock_irqsave(&leak_lock, flags);
 	list_add(&state->leak_list, &states);
 	spin_unlock_irqrestore(&leak_lock, flags);
@@ -144,11 +144,11 @@ static void free_extent_state(struct extent_state *state)
 	if (!state)
 		return;
 	if (atomic_dec_and_test(&state->refs)) {
-#ifdef LEAK_DEBUG
+#if LEAK_DEBUG
 		unsigned long flags;
 #endif
 		WARN_ON(state->tree);
-#ifdef LEAK_DEBUG
+#if LEAK_DEBUG
 		spin_lock_irqsave(&leak_lock, flags);
 		list_del(&state->leak_list);
 		spin_unlock_irqrestore(&leak_lock, flags);
@@ -2983,7 +2983,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
 						   gfp_t mask)
 {
 	struct extent_buffer *eb = NULL;
-#ifdef LEAK_DEBUG
+#if LEAK_DEBUG
 	unsigned long flags;
 #endif
 
@@ -2991,7 +2991,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
 	eb->start = start;
 	eb->len = len;
 	mutex_init(&eb->mutex);
-#ifdef LEAK_DEBUG
+#if LEAK_DEBUG
 	spin_lock_irqsave(&leak_lock, flags);
 	list_add(&eb->leak_list, &buffers);
 	spin_unlock_irqrestore(&leak_lock, flags);
@@ -3003,7 +3003,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
 
 static void __free_extent_buffer(struct extent_buffer *eb)
 {
-#ifdef LEAK_DEBUG
+#if LEAK_DEBUG
 	unsigned long flags;
 	spin_lock_irqsave(&leak_lock, flags);
 	list_del(&eb->leak_list);
-- 
cgit v1.1


From c487685d7c18a8481900755aa5c56a7a74193101 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 4 Feb 2009 09:24:25 -0500
Subject: Btrfs: hash_lock is no longer needed

Before metadata is written to disk, it is updated to reflect that writeout
has begun.  Once this update is done, the block must be cow'd before it
can be modified again.

This update was originally synchronized by using a per-fs spinlock.  Today
the buffers for the metadata blocks are locked before writeout begins,
and everyone that tests the flag has the buffer locked as well.

So, the per-fs spinlock (called hash_lock for no good reason) is no
longer required.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c   | 7 +------
 fs/btrfs/ctree.h   | 1 -
 fs/btrfs/disk-io.c | 4 ----
 3 files changed, 1 insertion(+), 11 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 2603ee5..3b6e35a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -388,16 +388,14 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
 		WARN_ON(1);
 	}
 
-	spin_lock(&root->fs_info->hash_lock);
 	if (btrfs_header_generation(buf) == trans->transid &&
 	    btrfs_header_owner(buf) == root->root_key.objectid &&
 	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
 		*cow_ret = buf;
-		spin_unlock(&root->fs_info->hash_lock);
 		WARN_ON(prealloc_dest);
 		return 0;
 	}
-	spin_unlock(&root->fs_info->hash_lock);
+
 	search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
 	ret = __btrfs_cow_block(trans, root, buf, parent,
 				 parent_slot, cow_ret, search_start, 0,
@@ -1376,14 +1374,11 @@ again:
 			int wret;
 
 			/* is a cow on this block not required */
-			spin_lock(&root->fs_info->hash_lock);
 			if (btrfs_header_generation(b) == trans->transid &&
 			    btrfs_header_owner(b) == root->root_key.objectid &&
 			    !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
-				spin_unlock(&root->fs_info->hash_lock);
 				goto cow_done;
 			}
-			spin_unlock(&root->fs_info->hash_lock);
 
 			/* ok, we have to cow, is our old prealloc the right
 			 * size?
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index de103a8..f2b8d26 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -703,7 +703,6 @@ struct btrfs_fs_info {
 	struct super_block *sb;
 	struct inode *btree_inode;
 	struct backing_dev_info bdi;
-	spinlock_t hash_lock;
 	struct mutex trans_mutex;
 	struct mutex tree_log_mutex;
 	struct mutex transaction_kthread_mutex;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 9c38100..5492716 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1503,7 +1503,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	INIT_LIST_HEAD(&fs_info->dead_roots);
 	INIT_LIST_HEAD(&fs_info->hashers);
 	INIT_LIST_HEAD(&fs_info->delalloc_inodes);
-	spin_lock_init(&fs_info->hash_lock);
 	spin_lock_init(&fs_info->delalloc_lock);
 	spin_lock_init(&fs_info->new_trans_lock);
 	spin_lock_init(&fs_info->ref_cache_lock);
@@ -2361,7 +2360,6 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
 int btree_lock_page_hook(struct page *page)
 {
 	struct inode *inode = page->mapping->host;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct extent_buffer *eb;
 	unsigned long len;
@@ -2376,9 +2374,7 @@ int btree_lock_page_hook(struct page *page)
 		goto out;
 
 	btrfs_tree_lock(eb);
-	spin_lock(&root->fs_info->hash_lock);
 	btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
-	spin_unlock(&root->fs_info->hash_lock);
 	btrfs_tree_unlock(eb);
 	free_extent_buffer(eb);
 out:
-- 
cgit v1.1


From b4ce94de9b4d64e8ab3cf155d13653c666e22b9b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 4 Feb 2009 09:25:08 -0500
Subject: Btrfs: Change btree locking to use explicit blocking points

Most of the btrfs metadata operations can be protected by a spinlock,
but some operations still need to schedule.

So far, btrfs has been using a mutex along with a trylock loop,
most of the time it is able to avoid going for the full mutex, so
the trylock loop is a big performance gain.

This commit is step one for getting rid of the blocking locks entirely.
btrfs_tree_lock takes a spinlock, and the code explicitly switches
to a blocking lock when it starts an operation that can schedule.

We'll be able get rid of the blocking locks in smaller pieces over time.
Tracing allows us to find the most common cause of blocking, so we
can start with the hot spots first.

The basic idea is:

btrfs_tree_lock() returns with the spin lock held

btrfs_set_lock_blocking() sets the EXTENT_BUFFER_BLOCKING bit in
the extent buffer flags, and then drops the spin lock.  The buffer is
still considered locked by all of the btrfs code.

If btrfs_tree_lock gets the spinlock but finds the blocking bit set, it drops
the spin lock and waits on a wait queue for the blocking bit to go away.

Much of the code that needs to set the blocking bit finishes without actually
blocking a good percentage of the time.  So, an adaptive spin is still
used against the blocking bit to avoid very high context switch rates.

btrfs_clear_lock_blocking() clears the blocking bit and returns
with the spinlock held again.

btrfs_tree_unlock() can be called on either blocking or spinning locks,
it does the right thing based on the blocking bit.

ctree.c has a helper function to set/clear all the locked buffers in a
path as blocking.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 234 +++++++++++++++++++++++++++++++++++++++++++++++--
 fs/btrfs/ctree.h       |   4 +
 fs/btrfs/disk-io.c     |  10 ++-
 fs/btrfs/extent-tree.c |   5 ++
 fs/btrfs/extent_io.c   |  18 ++--
 fs/btrfs/extent_io.h   |  16 +++-
 fs/btrfs/inode.c       |   3 +
 fs/btrfs/locking.c     | 208 +++++++++++++++++++++++++++++++++++++++----
 fs/btrfs/locking.h     |   6 ++
 fs/btrfs/tree-defrag.c |   1 +
 fs/btrfs/tree-log.c    |   4 +
 11 files changed, 470 insertions(+), 39 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 3b6e35a..3af7773 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -54,6 +54,31 @@ struct btrfs_path *btrfs_alloc_path(void)
 	return path;
 }
 
+/*
+ * set all locked nodes in the path to blocking locks.  This should
+ * be done before scheduling
+ */
+noinline void btrfs_set_path_blocking(struct btrfs_path *p)
+{
+	int i;
+	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+		if (p->nodes[i] && p->locks[i])
+			btrfs_set_lock_blocking(p->nodes[i]);
+	}
+}
+
+/*
+ * reset all the locked nodes in the patch to spinning locks.
+ */
+noinline void btrfs_clear_path_blocking(struct btrfs_path *p)
+{
+	int i;
+	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+		if (p->nodes[i] && p->locks[i])
+			btrfs_clear_lock_blocking(p->nodes[i]);
+	}
+}
+
 /* this also releases the path */
 void btrfs_free_path(struct btrfs_path *p)
 {
@@ -272,6 +297,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	if (IS_ERR(cow))
 		return PTR_ERR(cow);
 
+	/* cow is set to blocking by btrfs_init_new_buffer */
+
 	copy_extent_buffer(cow, buf, 0, 0, cow->len);
 	btrfs_set_header_bytenr(cow, cow->start);
 	btrfs_set_header_generation(cow, trans->transid);
@@ -397,6 +424,11 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
 	}
 
 	search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
+
+	if (parent)
+		btrfs_set_lock_blocking(parent);
+	btrfs_set_lock_blocking(buf);
+
 	ret = __btrfs_cow_block(trans, root, buf, parent,
 				 parent_slot, cow_ret, search_start, 0,
 				 prealloc_dest);
@@ -502,6 +534,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 	if (parent_nritems == 1)
 		return 0;
 
+	btrfs_set_lock_blocking(parent);
+
 	for (i = start_slot; i < end_slot; i++) {
 		int close = 1;
 
@@ -562,6 +596,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 			search_start = last_block;
 
 		btrfs_tree_lock(cur);
+		btrfs_set_lock_blocking(cur);
 		err = __btrfs_cow_block(trans, root, cur, parent, i,
 					&cur, search_start,
 					min(16 * blocksize,
@@ -860,6 +895,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		return 0;
 
 	mid = path->nodes[level];
+
 	WARN_ON(!path->locks[level]);
 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
 
@@ -882,6 +918,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		/* promote the child to a root */
 		child = read_node_slot(root, mid, 0);
 		btrfs_tree_lock(child);
+		btrfs_set_lock_blocking(child);
 		BUG_ON(!child);
 		ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
 		BUG_ON(ret);
@@ -898,6 +935,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 
 		add_root_to_dirty_list(root);
 		btrfs_tree_unlock(child);
+
 		path->locks[level] = 0;
 		path->nodes[level] = NULL;
 		clean_tree_block(trans, root, mid);
@@ -922,6 +960,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 	left = read_node_slot(root, parent, pslot - 1);
 	if (left) {
 		btrfs_tree_lock(left);
+		btrfs_set_lock_blocking(left);
 		wret = btrfs_cow_block(trans, root, left,
 				       parent, pslot - 1, &left, 0);
 		if (wret) {
@@ -932,6 +971,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 	right = read_node_slot(root, parent, pslot + 1);
 	if (right) {
 		btrfs_tree_lock(right);
+		btrfs_set_lock_blocking(right);
 		wret = btrfs_cow_block(trans, root, right,
 				       parent, pslot + 1, &right, 0);
 		if (wret) {
@@ -1107,6 +1147,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 		u32 left_nr;
 
 		btrfs_tree_lock(left);
+		btrfs_set_lock_blocking(left);
+
 		left_nr = btrfs_header_nritems(left);
 		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
 			wret = 1;
@@ -1153,7 +1195,10 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 	 */
 	if (right) {
 		u32 right_nr;
+
 		btrfs_tree_lock(right);
+		btrfs_set_lock_blocking(right);
+
 		right_nr = btrfs_header_nritems(right);
 		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
 			wret = 1;
@@ -1265,6 +1310,68 @@ static noinline void reada_for_search(struct btrfs_root *root,
 }
 
 /*
+ * returns -EAGAIN if it had to drop the path, or zero if everything was in
+ * cache
+ */
+static noinline int reada_for_balance(struct btrfs_root *root,
+				      struct btrfs_path *path, int level)
+{
+	int slot;
+	int nritems;
+	struct extent_buffer *parent;
+	struct extent_buffer *eb;
+	u64 gen;
+	u64 block1 = 0;
+	u64 block2 = 0;
+	int ret = 0;
+	int blocksize;
+
+	parent = path->nodes[level - 1];
+	if (!parent)
+		return 0;
+
+	nritems = btrfs_header_nritems(parent);
+	slot = path->slots[level];
+	blocksize = btrfs_level_size(root, level);
+
+	if (slot > 0) {
+		block1 = btrfs_node_blockptr(parent, slot - 1);
+		gen = btrfs_node_ptr_generation(parent, slot - 1);
+		eb = btrfs_find_tree_block(root, block1, blocksize);
+		if (eb && btrfs_buffer_uptodate(eb, gen))
+			block1 = 0;
+		free_extent_buffer(eb);
+	}
+	if (slot < nritems) {
+		block2 = btrfs_node_blockptr(parent, slot + 1);
+		gen = btrfs_node_ptr_generation(parent, slot + 1);
+		eb = btrfs_find_tree_block(root, block2, blocksize);
+		if (eb && btrfs_buffer_uptodate(eb, gen))
+			block2 = 0;
+		free_extent_buffer(eb);
+	}
+	if (block1 || block2) {
+		ret = -EAGAIN;
+		btrfs_release_path(root, path);
+		if (block1)
+			readahead_tree_block(root, block1, blocksize, 0);
+		if (block2)
+			readahead_tree_block(root, block2, blocksize, 0);
+
+		if (block1) {
+			eb = read_tree_block(root, block1, blocksize, 0);
+			free_extent_buffer(eb);
+		}
+		if (block1) {
+			eb = read_tree_block(root, block2, blocksize, 0);
+			free_extent_buffer(eb);
+		}
+	}
+	return ret;
+}
+
+
+/*
  * when we walk down the tree, it is usually safe to unlock the higher layers
  * in the tree.  The exceptions are when our path goes through slot 0, because
  * operations on the tree might require changing key pointers higher up in the
@@ -1315,6 +1422,32 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
 }
 
 /*
+ * This releases any locks held in the path starting at level and
+ * going all the way up to the root.
+ *
+ * btrfs_search_slot will keep the lock held on higher nodes in a few
+ * corner cases, such as COW of the block at slot zero in the node.  This
+ * ignores those rules, and it should only be called when there are no
+ * more updates to be done higher up in the tree.
+ */
+noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
+{
+	int i;
+
+	if (path->keep_locks || path->lowest_level)
+		return;
+
+	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
+		if (!path->nodes[i])
+			break;
+		if (!path->locks[i])
+			break;
+		btrfs_tree_unlock(path->nodes[i]);
+		path->locks[i] = 0;
+	}
+}
+
+/*
  * look for key in the tree.  path is filled in with nodes along the way
  * if key is found, we return zero and you can find the item in the leaf
  * level of the path (level 0)
@@ -1385,6 +1518,7 @@ again:
 			 */
 			if (prealloc_block.objectid &&
 			    prealloc_block.offset != b->len) {
+				btrfs_set_path_blocking(p);
 				btrfs_free_reserved_extent(root,
 					   prealloc_block.objectid,
 					   prealloc_block.offset);
@@ -1409,6 +1543,8 @@ again:
 				goto again;
 			}
 
+			btrfs_set_path_blocking(p);
+
 			wret = btrfs_cow_block(trans, root, b,
 					       p->nodes[level + 1],
 					       p->slots[level + 1],
@@ -1430,6 +1566,22 @@ cow_done:
 		if (!p->skip_locking)
 			p->locks[level] = 1;
 
+		btrfs_clear_path_blocking(p);
+
+		/*
+		 * we have a lock on b and as long as we aren't changing
+		 * the tree, there is no way to for the items in b to change.
+		 * It is safe to drop the lock on our parent before we
+		 * go through the expensive btree search on b.
+		 *
+		 * If cow is true, then we might be changing slot zero,
+		 * which may require changing the parent.  So, we can't
+		 * drop the lock until after we know which slot we're
+		 * operating on.
+		 */
+		if (!cow)
+			btrfs_unlock_up_safe(p, level + 1);
+
 		ret = check_block(root, p, level);
 		if (ret) {
 			ret = -1;
@@ -1437,6 +1589,7 @@ cow_done:
 		}
 
 		ret = bin_search(b, key, level, &slot);
+
 		if (level != 0) {
 			if (ret && slot > 0)
 				slot -= 1;
@@ -1444,7 +1597,16 @@ cow_done:
 			if ((p->search_for_split || ins_len > 0) &&
 			    btrfs_header_nritems(b) >=
 			    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
-				int sret = split_node(trans, root, p, level);
+				int sret;
+
+				sret = reada_for_balance(root, p, level);
+				if (sret)
+					goto again;
+
+				btrfs_set_path_blocking(p);
+				sret = split_node(trans, root, p, level);
+				btrfs_clear_path_blocking(p);
+
 				BUG_ON(sret > 0);
 				if (sret) {
 					ret = sret;
@@ -1453,8 +1615,16 @@ cow_done:
 				b = p->nodes[level];
 				slot = p->slots[level];
 			} else if (ins_len < 0) {
-				int sret = balance_level(trans, root, p,
-							 level);
+				int sret;
+
+				sret = reada_for_balance(root, p, level);
+				if (sret)
+					goto again;
+
+				btrfs_set_path_blocking(p);
+				sret = balance_level(trans, root, p, level);
+				btrfs_clear_path_blocking(p);
+
 				if (sret) {
 					ret = sret;
 					goto done;
@@ -1488,7 +1658,7 @@ cow_done:
 				 * of the btree by dropping locks before
 				 * we read.
 				 */
-				if (level > 1) {
+				if (level > 0) {
 					btrfs_release_path(NULL, p);
 					if (tmp)
 						free_extent_buffer(tmp);
@@ -1503,6 +1673,7 @@ cow_done:
 						free_extent_buffer(tmp);
 					goto again;
 				} else {
+					btrfs_set_path_blocking(p);
 					if (tmp)
 						free_extent_buffer(tmp);
 					if (should_reada)
@@ -1512,14 +1683,29 @@ cow_done:
 					b = read_node_slot(root, b, slot);
 				}
 			}
-			if (!p->skip_locking)
-				btrfs_tree_lock(b);
+			if (!p->skip_locking) {
+				int lret;
+
+				btrfs_clear_path_blocking(p);
+				lret = btrfs_try_spin_lock(b);
+
+				if (!lret) {
+					btrfs_set_path_blocking(p);
+					btrfs_tree_lock(b);
+					btrfs_clear_path_blocking(p);
+				}
+			}
 		} else {
 			p->slots[level] = slot;
 			if (ins_len > 0 &&
 			    btrfs_leaf_free_space(root, b) < ins_len) {
-				int sret = split_leaf(trans, root, key,
+				int sret;
+
+				btrfs_set_path_blocking(p);
+				sret = split_leaf(trans, root, key,
 						      p, ins_len, ret == 0);
+				btrfs_clear_path_blocking(p);
+
 				BUG_ON(sret > 0);
 				if (sret) {
 					ret = sret;
@@ -1533,12 +1719,16 @@ cow_done:
 	}
 	ret = 1;
 done:
+	/*
+	 * we don't really know what they plan on doing with the path
+	 * from here on, so for now just mark it as blocking
+	 */
+	btrfs_set_path_blocking(p);
 	if (prealloc_block.objectid) {
 		btrfs_free_reserved_extent(root,
 			   prealloc_block.objectid,
 			   prealloc_block.offset);
 	}
-
 	return ret;
 }
 
@@ -1562,6 +1752,8 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
 	ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0);
 	BUG_ON(ret);
 
+	btrfs_set_lock_blocking(eb);
+
 	parent = eb;
 	while (1) {
 		level = btrfs_header_level(parent);
@@ -1586,6 +1778,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
 			eb = read_tree_block(root, bytenr, blocksize,
 					     generation);
 			btrfs_tree_lock(eb);
+			btrfs_set_lock_blocking(eb);
 		}
 
 		/*
@@ -1610,6 +1803,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
 				eb = read_tree_block(root, bytenr, blocksize,
 						generation);
 				btrfs_tree_lock(eb);
+				btrfs_set_lock_blocking(eb);
 			}
 
 			ret = btrfs_cow_block(trans, root, eb, parent, slot,
@@ -2156,6 +2350,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	right = read_node_slot(root, upper, slot + 1);
 	btrfs_tree_lock(right);
+	btrfs_set_lock_blocking(right);
+
 	free_space = btrfs_leaf_free_space(root, right);
 	if (free_space < data_size)
 		goto out_unlock;
@@ -2351,6 +2547,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	left = read_node_slot(root, path->nodes[1], slot - 1);
 	btrfs_tree_lock(left);
+	btrfs_set_lock_blocking(left);
+
 	free_space = btrfs_leaf_free_space(root, left);
 	if (free_space < data_size) {
 		ret = 1;
@@ -2809,6 +3007,12 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
 	path->keep_locks = 0;
 	BUG_ON(ret);
 
+	/*
+	 * make sure any changes to the path from split_leaf leave it
+	 * in a blocking state
+	 */
+	btrfs_set_path_blocking(path);
+
 	leaf = path->nodes[0];
 	BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
 
@@ -3338,6 +3542,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
 		BUG();
 	}
 out:
+	btrfs_unlock_up_safe(path, 1);
 	return ret;
 }
 
@@ -3705,12 +3910,14 @@ find_next_key:
 		 */
 		if (slot >= nritems) {
 			path->slots[level] = slot;
+			btrfs_set_path_blocking(path);
 			sret = btrfs_find_next_key(root, path, min_key, level,
 						  cache_only, min_trans);
 			if (sret == 0) {
 				btrfs_release_path(root, path);
 				goto again;
 			} else {
+				btrfs_clear_path_blocking(path);
 				goto out;
 			}
 		}
@@ -3722,16 +3929,20 @@ find_next_key:
 			unlock_up(path, level, 1);
 			goto out;
 		}
+		btrfs_set_path_blocking(path);
 		cur = read_node_slot(root, cur, slot);
 
 		btrfs_tree_lock(cur);
+
 		path->locks[level - 1] = 1;
 		path->nodes[level - 1] = cur;
 		unlock_up(path, level, 1);
+		btrfs_clear_path_blocking(path);
 	}
 out:
 	if (ret == 0)
 		memcpy(min_key, &found_key, sizeof(found_key));
+	btrfs_set_path_blocking(path);
 	return ret;
 }
 
@@ -3827,6 +4038,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 	if (ret < 0)
 		return ret;
 
+	btrfs_set_path_blocking(path);
 	nritems = btrfs_header_nritems(path->nodes[0]);
 	/*
 	 * by releasing the path above we dropped all our locks.  A balance
@@ -3857,6 +4069,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 			free_extent_buffer(next);
 		}
 
+		/* the path was set to blocking above */
 		if (level == 1 && (path->locks[1] || path->skip_locking) &&
 		    path->reada)
 			reada_for_search(root, path, level, slot, 0);
@@ -3865,6 +4078,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 		if (!path->skip_locking) {
 			WARN_ON(!btrfs_tree_locked(c));
 			btrfs_tree_lock(next);
+			btrfs_set_lock_blocking(next);
 		}
 		break;
 	}
@@ -3881,12 +4095,15 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 			path->locks[level] = 1;
 		if (!level)
 			break;
+
+		btrfs_set_path_blocking(path);
 		if (level == 1 && path->locks[1] && path->reada)
 			reada_for_search(root, path, level, slot, 0);
 		next = read_node_slot(root, next, 0);
 		if (!path->skip_locking) {
 			WARN_ON(!btrfs_tree_locked(path->nodes[level]));
 			btrfs_tree_lock(next);
+			btrfs_set_lock_blocking(next);
 		}
 	}
 done:
@@ -3911,6 +4128,7 @@ int btrfs_previous_item(struct btrfs_root *root,
 
 	while (1) {
 		if (path->slots[0] == 0) {
+			btrfs_set_path_blocking(path);
 			ret = btrfs_prev_leaf(root, path);
 			if (ret != 0)
 				return ret;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f2b8d26..531db11 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1835,6 +1835,10 @@ void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
 struct btrfs_path *btrfs_alloc_path(void);
 void btrfs_free_path(struct btrfs_path *p);
 void btrfs_init_path(struct btrfs_path *p);
+void btrfs_set_path_blocking(struct btrfs_path *p);
+void btrfs_clear_path_blocking(struct btrfs_path *p);
+void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
+
 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		   struct btrfs_path *path, int slot, int nr);
 int btrfs_del_leaf(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 5492716..5aebddd 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -799,7 +799,7 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 	ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
 
 	if (ret == 0)
-		buf->flags |= EXTENT_UPTODATE;
+		set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
 	else
 		WARN_ON(1);
 	return buf;
@@ -813,6 +813,10 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	if (btrfs_header_generation(buf) ==
 	    root->fs_info->running_transaction->transid) {
 		WARN_ON(!btrfs_tree_locked(buf));
+
+		/* ugh, clear_extent_buffer_dirty can be expensive */
+		btrfs_set_lock_blocking(buf);
+
 		clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
 					  buf);
 	}
@@ -2311,6 +2315,8 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 	u64 transid = btrfs_header_generation(buf);
 	struct inode *btree_inode = root->fs_info->btree_inode;
 
+	btrfs_set_lock_blocking(buf);
+
 	WARN_ON(!btrfs_tree_locked(buf));
 	if (transid != root->fs_info->generation) {
 		printk(KERN_CRIT "btrfs transid mismatch buffer %llu, "
@@ -2353,7 +2359,7 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
 	int ret;
 	ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
 	if (ret == 0)
-		buf->flags |= EXTENT_UPTODATE;
+		set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
 	return ret;
 }
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7a22f2e..ed1e25d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3407,7 +3407,10 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
 	btrfs_set_header_generation(buf, trans->transid);
 	btrfs_tree_lock(buf);
 	clean_tree_block(trans, root, buf);
+
+	btrfs_set_lock_blocking(buf);
 	btrfs_set_buffer_uptodate(buf);
+
 	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
 		set_extent_dirty(&root->dirty_log_pages, buf->start,
 			 buf->start + buf->len - 1, GFP_NOFS);
@@ -3416,6 +3419,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
 			 buf->start + buf->len - 1, GFP_NOFS);
 	}
 	trans->blocks_used++;
+	/* this returns a buffer locked for blocking */
 	return buf;
 }
 
@@ -3752,6 +3756,7 @@ static noinline int walk_down_subtree(struct btrfs_trans_handle *trans,
 
 		next = read_tree_block(root, bytenr, blocksize, ptr_gen);
 		btrfs_tree_lock(next);
+		btrfs_set_lock_blocking(next);
 
 		ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize,
 					      &refs);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 2ea7f05..dd5df53 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2990,7 +2990,9 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
 	eb = kmem_cache_zalloc(extent_buffer_cache, mask);
 	eb->start = start;
 	eb->len = len;
-	mutex_init(&eb->mutex);
+	spin_lock_init(&eb->lock);
+	init_waitqueue_head(&eb->lock_wq);
+
 #if LEAK_DEBUG
 	spin_lock_irqsave(&leak_lock, flags);
 	list_add(&eb->leak_list, &buffers);
@@ -3071,8 +3073,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 		unlock_page(p);
 	}
 	if (uptodate)
-		eb->flags |= EXTENT_UPTODATE;
-	eb->flags |= EXTENT_BUFFER_FILLED;
+		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 
 	spin_lock(&tree->buffer_lock);
 	exists = buffer_tree_insert(tree, start, &eb->rb_node);
@@ -3226,7 +3227,7 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
 	unsigned long num_pages;
 
 	num_pages = num_extent_pages(eb->start, eb->len);
-	eb->flags &= ~EXTENT_UPTODATE;
+	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 
 	clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
 			      GFP_NOFS);
@@ -3297,7 +3298,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
 	struct page *page;
 	int pg_uptodate = 1;
 
-	if (eb->flags & EXTENT_UPTODATE)
+	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
 		return 1;
 
 	ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
@@ -3333,7 +3334,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	struct bio *bio = NULL;
 	unsigned long bio_flags = 0;
 
-	if (eb->flags & EXTENT_UPTODATE)
+	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
 		return 0;
 
 	if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
@@ -3364,7 +3365,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	}
 	if (all_uptodate) {
 		if (start_i == 0)
-			eb->flags |= EXTENT_UPTODATE;
+			set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 		goto unlock_exit;
 	}
 
@@ -3400,7 +3401,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	}
 
 	if (!ret)
-		eb->flags |= EXTENT_UPTODATE;
+		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 	return ret;
 
 unlock_exit:
@@ -3497,7 +3498,6 @@ int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
 		unmap_extent_buffer(eb, eb->map_token, km);
 		eb->map_token = NULL;
 		save = 1;
-		WARN_ON(!mutex_is_locked(&eb->mutex));
 	}
 	err = map_private_extent_buffer(eb, start, min_len, token, map,
 				       map_start, map_len, km);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index e80c6d9..1f9df88 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -22,6 +22,10 @@
 /* flags for bio submission */
 #define EXTENT_BIO_COMPRESSED 1
 
+/* these are bit numbers for test/set bit */
+#define EXTENT_BUFFER_UPTODATE 0
+#define EXTENT_BUFFER_BLOCKING 1
+
 /*
  * page->private values.  Every page that is controlled by the extent
  * map has page->private set to one.
@@ -95,11 +99,19 @@ struct extent_buffer {
 	unsigned long map_start;
 	unsigned long map_len;
 	struct page *first_page;
+	unsigned long bflags;
 	atomic_t refs;
-	int flags;
 	struct list_head leak_list;
 	struct rb_node rb_node;
-	struct mutex mutex;
+
+	/* the spinlock is used to protect most operations */
+	spinlock_t lock;
+
+	/*
+	 * when we keep the lock held while blocking, waiters go onto
+	 * the wq
+	 */
+	wait_queue_head_t lock_wq;
 };
 
 struct extent_map_tree;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4a79e1c..ebd7d6c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -50,6 +50,7 @@
 #include "tree-log.h"
 #include "ref-cache.h"
 #include "compression.h"
+#include "locking.h"
 
 struct btrfs_iget_args {
 	u64 ino;
@@ -2021,6 +2022,7 @@ void btrfs_read_locked_inode(struct inode *inode)
 	BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
 
 	alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
+
 	BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0,
 						alloc_group_block, 0);
 	btrfs_free_path(path);
@@ -2117,6 +2119,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
 		goto failed;
 	}
 
+	btrfs_unlock_up_safe(path, 1);
 	leaf = path->nodes[0];
 	inode_item = btrfs_item_ptr(leaf, path->slots[0],
 				  struct btrfs_inode_item);
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 39bae77..68fd9cc 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -26,45 +26,215 @@
 #include "locking.h"
 
 /*
- * locks the per buffer mutex in an extent buffer.  This uses adaptive locks
- * and the spin is not tuned very extensively.  The spinning does make a big
- * difference in almost every workload, but spinning for the right amount of
- * time needs some help.
- *
- * In general, we want to spin as long as the lock holder is doing btree
- * searches, and we should give up if they are in more expensive code.
+ * btrfs_header_level() isn't free, so don't call it when lockdep isn't
+ * on
  */
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static inline void spin_nested(struct extent_buffer *eb)
+{
+	spin_lock_nested(&eb->lock, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
+}
+#else
+static inline void spin_nested(struct extent_buffer *eb)
+{
+	spin_lock(&eb->lock);
+}
+#endif
 
-int btrfs_tree_lock(struct extent_buffer *eb)
+/*
+ * Setting a lock to blocking will drop the spinlock and set the
+ * flag that forces other procs who want the lock to wait.  After
+ * this you can safely schedule with the lock held.
+ */
+void btrfs_set_lock_blocking(struct extent_buffer *eb)
 {
-	int i;
+	if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
+		set_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
+		spin_unlock(&eb->lock);
+	}
+	/* exit with the spin lock released and the bit set */
+}
 
-	if (mutex_trylock(&eb->mutex))
-		return 0;
+/*
+ * clearing the blocking flag will take the spinlock again.
+ * After this you can't safely schedule
+ */
+void btrfs_clear_lock_blocking(struct extent_buffer *eb)
+{
+	if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
+		spin_nested(eb);
+		clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
+		smp_mb__after_clear_bit();
+	}
+	/* exit with the spin lock held */
+}
+
+/*
+ * unfortunately, many of the places that currently set a lock to blocking
+ * don't end up blocking for every long, and often they don't block
+ * at all.  For a dbench 50 run, if we don't spin one the blocking bit
+ * at all, the context switch rate can jump up to 400,000/sec or more.
+ *
+ * So, we're still stuck with this crummy spin on the blocking bit,
+ * at least until the most common causes of the short blocks
+ * can be dealt with.
+ */
+static int btrfs_spin_on_block(struct extent_buffer *eb)
+{
+	int i;
 	for (i = 0; i < 512; i++) {
 		cpu_relax();
-		if (mutex_trylock(&eb->mutex))
+		if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+			return 1;
+		if (need_resched())
+			break;
+	}
+	return 0;
+}
+
+/*
+ * This is somewhat different from trylock.  It will take the
+ * spinlock but if it finds the lock is set to blocking, it will
+ * return without the lock held.
+ *
+ * returns 1 if it was able to take the lock and zero otherwise
+ *
+ * After this call, scheduling is not safe without first calling
+ * btrfs_set_lock_blocking()
+ */
+int btrfs_try_spin_lock(struct extent_buffer *eb)
+{
+	int i;
+
+	spin_nested(eb);
+	if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+		return 1;
+	spin_unlock(&eb->lock);
+
+	/* spin for a bit on the BLOCKING flag */
+	for (i = 0; i < 2; i++) {
+		if (!btrfs_spin_on_block(eb))
+			break;
+
+		spin_nested(eb);
+		if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+			return 1;
+		spin_unlock(&eb->lock);
+	}
+	return 0;
+}
+
+/*
+ * the autoremove wake function will return 0 if it tried to wake up
+ * a process that was already awake, which means that process won't
+ * count as an exclusive wakeup.  The waitq code will continue waking
+ * procs until it finds one that was actually sleeping.
+ *
+ * For btrfs, this isn't quite what we want.  We want a single proc
+ * to be notified that the lock is ready for taking.  If that proc
+ * already happen to be awake, great, it will loop around and try for
+ * the lock.
+ *
+ * So, btrfs_wake_function always returns 1, even when the proc that we
+ * tried to wake up was already awake.
+ */
+static int btrfs_wake_function(wait_queue_t *wait, unsigned mode,
+			       int sync, void *key)
+{
+	autoremove_wake_function(wait, mode, sync, key);
+	return 1;
+}
+
+/*
+ * returns with the extent buffer spinlocked.
+ *
+ * This will spin and/or wait as required to take the lock, and then
+ * return with the spinlock held.
+ *
+ * After this call, scheduling is not safe without first calling
+ * btrfs_set_lock_blocking()
+ */
+int btrfs_tree_lock(struct extent_buffer *eb)
+{
+	DEFINE_WAIT(wait);
+	wait.func = btrfs_wake_function;
+
+	while(1) {
+		spin_nested(eb);
+
+		/* nobody is blocking, exit with the spinlock held */
+		if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
 			return 0;
+
+		/*
+		 * we have the spinlock, but the real owner is blocking.
+		 * wait for them
+		 */
+		spin_unlock(&eb->lock);
+
+		/*
+		 * spin for a bit, and if the blocking flag goes away,
+		 * loop around
+		 */
+		if (btrfs_spin_on_block(eb))
+			continue;
+
+		prepare_to_wait_exclusive(&eb->lock_wq, &wait,
+					  TASK_UNINTERRUPTIBLE);
+
+		if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+			schedule();
+
+		finish_wait(&eb->lock_wq, &wait);
 	}
-	cpu_relax();
-	mutex_lock_nested(&eb->mutex, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
 	return 0;
 }
 
+/*
+ * Very quick trylock, this does not spin or schedule.  It returns
+ * 1 with the spinlock held if it was able to take the lock, or it
+ * returns zero if it was unable to take the lock.
+ *
+ * After this call, scheduling is not safe without first calling
+ * btrfs_set_lock_blocking()
+ */
 int btrfs_try_tree_lock(struct extent_buffer *eb)
 {
-	return mutex_trylock(&eb->mutex);
+	if (spin_trylock(&eb->lock)) {
+		if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
+			/*
+			 * we've got the spinlock, but the real owner is
+			 * blocking.  Drop the spinlock and return failure
+			 */
+			spin_unlock(&eb->lock);
+			return 0;
+		}
+		return 1;
+	}
+	/* someone else has the spinlock giveup */
+	return 0;
 }
 
 int btrfs_tree_unlock(struct extent_buffer *eb)
 {
-	mutex_unlock(&eb->mutex);
+	/*
+	 * if we were a blocking owner, we don't have the spinlock held
+	 * just clear the bit and look for waiters
+	 */
+	if (test_and_clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+		smp_mb__after_clear_bit();
+	else
+		spin_unlock(&eb->lock);
+
+	if (waitqueue_active(&eb->lock_wq))
+		wake_up(&eb->lock_wq);
 	return 0;
 }
 
 int btrfs_tree_locked(struct extent_buffer *eb)
 {
-	return mutex_is_locked(&eb->mutex);
+	return test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags) ||
+			spin_is_locked(&eb->lock);
 }
 
 /*
@@ -75,12 +245,14 @@ int btrfs_path_lock_waiting(struct btrfs_path *path, int level)
 {
 	int i;
 	struct extent_buffer *eb;
+
 	for (i = level; i <= level + 1 && i < BTRFS_MAX_LEVEL; i++) {
 		eb = path->nodes[i];
 		if (!eb)
 			break;
 		smp_mb();
-		if (!list_empty(&eb->mutex.wait_list))
+		if (spin_is_contended(&eb->lock) ||
+		    waitqueue_active(&eb->lock_wq))
 			return 1;
 	}
 	return 0;
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index bc1faef..d92e707 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -22,6 +22,12 @@
 int btrfs_tree_lock(struct extent_buffer *eb);
 int btrfs_tree_unlock(struct extent_buffer *eb);
 int btrfs_tree_locked(struct extent_buffer *eb);
+
 int btrfs_try_tree_lock(struct extent_buffer *eb);
+int btrfs_try_spin_lock(struct extent_buffer *eb);
+
 int btrfs_path_lock_waiting(struct btrfs_path *path, int level);
+
+void btrfs_set_lock_blocking(struct extent_buffer *eb);
+void btrfs_clear_lock_blocking(struct extent_buffer *eb);
 #endif
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 3e8358c..98d25fa 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -74,6 +74,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 		u32 nritems;
 
 		root_node = btrfs_lock_root_node(root);
+		btrfs_set_lock_blocking(root_node);
 		nritems = btrfs_header_nritems(root_node);
 		root->defrag_max.objectid = 0;
 		/* from above we know this is not a leaf */
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 4f26f3e..2079429 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1615,6 +1615,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 
 				btrfs_tree_lock(next);
 				clean_tree_block(trans, root, next);
+				btrfs_set_lock_blocking(next);
 				btrfs_wait_tree_block_writeback(next);
 				btrfs_tree_unlock(next);
 
@@ -1661,6 +1662,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 		next = path->nodes[*level];
 		btrfs_tree_lock(next);
 		clean_tree_block(trans, root, next);
+		btrfs_set_lock_blocking(next);
 		btrfs_wait_tree_block_writeback(next);
 		btrfs_tree_unlock(next);
 
@@ -1718,6 +1720,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
 
 				btrfs_tree_lock(next);
 				clean_tree_block(trans, root, next);
+				btrfs_set_lock_blocking(next);
 				btrfs_wait_tree_block_writeback(next);
 				btrfs_tree_unlock(next);
 
@@ -1790,6 +1793,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
 
 			btrfs_tree_lock(next);
 			clean_tree_block(trans, log, next);
+			btrfs_set_lock_blocking(next);
 			btrfs_wait_tree_block_writeback(next);
 			btrfs_tree_unlock(next);
 
-- 
cgit v1.1


From bd56b30205bc09da0beb80d4ba3d4c7309792da5 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 4 Feb 2009 09:27:02 -0500
Subject: Btrfs: Make btrfs_drop_snapshot work in larger and more efficient
 chunks

Every transaction in btrfs creates a new snapshot, and then schedules the
snapshot from the last transaction for deletion.  Snapshot deletion
works by walking down the btree and dropping the reference counts
on each btree block during the walk.

If if a given leaf or node has a reference count greater than one,
the reference count is decremented and the subtree pointed to by that
node is ignored.

If the reference count is one, walking continues down into that node
or leaf, and the references of everything it points to are decremented.

The old code would try to work in small pieces, walking down the tree
until it found the lowest leaf or node to free and then returning.  This
was very friendly to the rest of the FS because it didn't have a huge
impact on other operations.

But it wouldn't always keep up with the rate that new commits added new
snapshots for deletion, and it wasn't very optimal for the extent
allocation tree because it wasn't finding leaves that were close together
on disk and processing them at the same time.

This changes things to walk down to a level 1 node and then process it
in bulk.  All the leaf pointers are sorted and the leaves are dropped
in order based on their extent number.

The extent allocation tree and commit code are now fast enough for
this kind of bulk processing to work without slowing the rest of the FS
down.  Overall it does less IO and is better able to keep up with
snapshot deletions under high load.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 306 ++++++++++++++++++++++++++++++++++++++++++-------
 fs/btrfs/inode.c       |   2 +
 fs/btrfs/ref-cache.c   |   1 +
 fs/btrfs/ref-cache.h   |   1 -
 4 files changed, 265 insertions(+), 45 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ed1e25d..1d3e926 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1533,6 +1533,11 @@ out:
  * struct refsort is used to match byte number to slot in the btree block.
  * we sort based on the byte number and then use the slot to actually
  * find the item.
+ *
+ * struct refsort is smaller than strcut btrfs_item and smaller than
+ * struct btrfs_key_ptr.  Since we're currently limited to the page size
+ * for a btree block, there's no way for a kmalloc of refsorts for a
+ * single node to be bigger than a page.
  */
 struct refsort {
 	u64 bytenr;
@@ -3457,36 +3462,73 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 {
 	u64 leaf_owner;
 	u64 leaf_generation;
+	struct refsort *sorted;
 	struct btrfs_key key;
 	struct btrfs_file_extent_item *fi;
 	int i;
 	int nritems;
 	int ret;
+	int refi = 0;
+	int slot;
 
 	BUG_ON(!btrfs_is_leaf(leaf));
 	nritems = btrfs_header_nritems(leaf);
 	leaf_owner = btrfs_header_owner(leaf);
 	leaf_generation = btrfs_header_generation(leaf);
 
+	sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
+	/* we do this loop twice.  The first time we build a list
+	 * of the extents we have a reference on, then we sort the list
+	 * by bytenr.  The second time around we actually do the
+	 * extent freeing.
+	 */
 	for (i = 0; i < nritems; i++) {
 		u64 disk_bytenr;
 		cond_resched();
 
 		btrfs_item_key_to_cpu(leaf, &key, i);
+
+		/* only extents have references, skip everything else */
 		if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
 			continue;
+
 		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
+
+		/* inline extents live in the btree, they don't have refs */
 		if (btrfs_file_extent_type(leaf, fi) ==
 		    BTRFS_FILE_EXTENT_INLINE)
 			continue;
-		/*
-		 * FIXME make sure to insert a trans record that
-		 * repeats the snapshot del on crash
-		 */
+
 		disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+
+		/* holes don't have refs */
 		if (disk_bytenr == 0)
 			continue;
 
+		sorted[refi].bytenr = disk_bytenr;
+		sorted[refi].slot = i;
+		refi++;
+	}
+
+	if (refi == 0)
+		goto out;
+
+	sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
+
+	for (i = 0; i < refi; i++) {
+		u64 disk_bytenr;
+
+		disk_bytenr = sorted[i].bytenr;
+		slot = sorted[i].slot;
+
+		cond_resched();
+
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+		if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
+			continue;
+
+		fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+
 		ret = __btrfs_free_extent(trans, root, disk_bytenr,
 				btrfs_file_extent_disk_num_bytes(leaf, fi),
 				leaf->start, leaf_owner, leaf_generation,
@@ -3497,6 +3539,8 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 		wake_up(&root->fs_info->transaction_throttle);
 		cond_resched();
 	}
+out:
+	kfree(sorted);
 	return 0;
 }
 
@@ -3506,9 +3550,25 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
 {
 	int i;
 	int ret;
-	struct btrfs_extent_info *info = ref->extents;
+	struct btrfs_extent_info *info;
+	struct refsort *sorted;
+
+	if (ref->nritems == 0)
+		return 0;
 
+	sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS);
+	for (i = 0; i < ref->nritems; i++) {
+		sorted[i].bytenr = ref->extents[i].bytenr;
+		sorted[i].slot = i;
+	}
+	sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL);
+
+	/*
+	 * the items in the ref were sorted when the ref was inserted
+	 * into the ref cache, so this is already in order
+	 */
 	for (i = 0; i < ref->nritems; i++) {
+		info = ref->extents + sorted[i].slot;
 		ret = __btrfs_free_extent(trans, root, info->bytenr,
 					  info->num_bytes, ref->bytenr,
 					  ref->owner, ref->generation,
@@ -3566,6 +3626,152 @@ static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start,
 }
 
 /*
+ * this is used while deleting old snapshots, and it drops the refs
+ * on a whole subtree starting from a level 1 node.
+ *
+ * The idea is to sort all the leaf pointers, and then drop the
+ * ref on all the leaves in order.  Most of the time the leaves
+ * will have ref cache entries, so no leaf IOs will be required to
+ * find the extents they have references on.
+ *
+ * For each leaf, any references it has are also dropped in order
+ *
+ * This ends up dropping the references in something close to optimal
+ * order for reading and modifying the extent allocation tree.
+ */
+static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root,
+					struct btrfs_path *path)
+{
+	u64 bytenr;
+	u64 root_owner;
+	u64 root_gen;
+	struct extent_buffer *eb = path->nodes[1];
+	struct extent_buffer *leaf;
+	struct btrfs_leaf_ref *ref;
+	struct refsort *sorted = NULL;
+	int nritems = btrfs_header_nritems(eb);
+	int ret;
+	int i;
+	int refi = 0;
+	int slot = path->slots[1];
+	u32 blocksize = btrfs_level_size(root, 0);
+	u32 refs;
+
+	if (nritems == 0)
+		goto out;
+
+	root_owner = btrfs_header_owner(eb);
+	root_gen = btrfs_header_generation(eb);
+	sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
+
+	/*
+	 * step one, sort all the leaf pointers so we don't scribble
+	 * randomly into the extent allocation tree
+	 */
+	for (i = slot; i < nritems; i++) {
+		sorted[refi].bytenr = btrfs_node_blockptr(eb, i);
+		sorted[refi].slot = i;
+		refi++;
+	}
+
+	/*
+	 * nritems won't be zero, but if we're picking up drop_snapshot
+	 * after a crash, slot might be > 0, so double check things
+	 * just in case.
+	 */
+	if (refi == 0)
+		goto out;
+
+	sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
+
+	/*
+	 * the first loop frees everything the leaves point to
+	 */
+	for (i = 0; i < refi; i++) {
+		u64 ptr_gen;
+
+		bytenr = sorted[i].bytenr;
+
+		/*
+		 * check the reference count on this leaf.  If it is > 1
+		 * we just decrement it below and don't update any
+		 * of the refs the leaf points to.
+		 */
+		ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
+		BUG_ON(ret);
+		if (refs != 1)
+			continue;
+
+		ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot);
+
+		/*
+		 * the leaf only had one reference, which means the
+		 * only thing pointing to this leaf is the snapshot
+		 * we're deleting.  It isn't possible for the reference
+		 * count to increase again later
+		 *
+		 * The reference cache is checked for the leaf,
+		 * and if found we'll be able to drop any refs held by
+		 * the leaf without needing to read it in.
+		 */
+		ref = btrfs_lookup_leaf_ref(root, bytenr);
+		if (ref && ref->generation != ptr_gen) {
+			btrfs_free_leaf_ref(root, ref);
+			ref = NULL;
+		}
+		if (ref) {
+			ret = cache_drop_leaf_ref(trans, root, ref);
+			BUG_ON(ret);
+			btrfs_remove_leaf_ref(root, ref);
+			btrfs_free_leaf_ref(root, ref);
+		} else {
+			/*
+			 * the leaf wasn't in the reference cache, so
+			 * we have to read it.
+			 */
+			leaf = read_tree_block(root, bytenr, blocksize,
+					       ptr_gen);
+			ret = btrfs_drop_leaf_ref(trans, root, leaf);
+			BUG_ON(ret);
+			free_extent_buffer(leaf);
+		}
+		atomic_inc(&root->fs_info->throttle_gen);
+		wake_up(&root->fs_info->transaction_throttle);
+		cond_resched();
+	}
+
+	/*
+	 * run through the loop again to free the refs on the leaves.
+	 * This is faster than doing it in the loop above because
+	 * the leaves are likely to be clustered together.  We end up
+	 * working in nice chunks on the extent allocation tree.
+	 */
+	for (i = 0; i < refi; i++) {
+		bytenr = sorted[i].bytenr;
+		ret = __btrfs_free_extent(trans, root, bytenr,
+					blocksize, eb->start,
+					root_owner, root_gen, 0, 1);
+		BUG_ON(ret);
+
+		atomic_inc(&root->fs_info->throttle_gen);
+		wake_up(&root->fs_info->transaction_throttle);
+		cond_resched();
+	}
+out:
+	kfree(sorted);
+
+	/*
+	 * update the path to show we've processed the entire level 1
+	 * node.  This will get saved into the root's drop_snapshot_progress
+	 * field so these drops are not repeated again if this transaction
+	 * commits.
+	 */
+	path->slots[1] = nritems;
+	return 0;
+}
+
+/*
  * helper function for drop_snapshot, this walks down the tree dropping ref
  * counts as it goes.
  */
@@ -3580,7 +3786,6 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 	struct extent_buffer *next;
 	struct extent_buffer *cur;
 	struct extent_buffer *parent;
-	struct btrfs_leaf_ref *ref;
 	u32 blocksize;
 	int ret;
 	u32 refs;
@@ -3607,17 +3812,46 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 		if (path->slots[*level] >=
 		    btrfs_header_nritems(cur))
 			break;
+
+		/* the new code goes down to level 1 and does all the
+		 * leaves pointed to that node in bulk.  So, this check
+		 * for level 0 will always be false.
+		 *
+		 * But, the disk format allows the drop_snapshot_progress
+		 * field in the root to leave things in a state where
+		 * a leaf will need cleaning up here.  If someone crashes
+		 * with the old code and then boots with the new code,
+		 * we might find a leaf here.
+		 */
 		if (*level == 0) {
 			ret = btrfs_drop_leaf_ref(trans, root, cur);
 			BUG_ON(ret);
 			break;
 		}
+
+		/*
+		 * once we get to level one, process the whole node
+		 * at once, including everything below it.
+		 */
+		if (*level == 1) {
+			ret = drop_level_one_refs(trans, root, path);
+			BUG_ON(ret);
+			break;
+		}
+
 		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
 		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
 		blocksize = btrfs_level_size(root, *level - 1);
 
 		ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
 		BUG_ON(ret);
+
+		/*
+		 * if there is more than one reference, we don't need
+		 * to read that node to drop any references it has.  We
+		 * just drop the ref we hold on that node and move on to the
+		 * next slot in this level.
+		 */
 		if (refs != 1) {
 			parent = path->nodes[*level];
 			root_owner = btrfs_header_owner(parent);
@@ -3636,46 +3870,12 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 
 			continue;
 		}
+
 		/*
-		 * at this point, we have a single ref, and since the
-		 * only place referencing this extent is a dead root
-		 * the reference count should never go higher.
-		 * So, we don't need to check it again
+		 * we need to keep freeing things in the next level down.
+		 * read the block and loop around to process it
 		 */
-		if (*level == 1) {
-			ref = btrfs_lookup_leaf_ref(root, bytenr);
-			if (ref && ref->generation != ptr_gen) {
-				btrfs_free_leaf_ref(root, ref);
-				ref = NULL;
-			}
-			if (ref) {
-				ret = cache_drop_leaf_ref(trans, root, ref);
-				BUG_ON(ret);
-				btrfs_remove_leaf_ref(root, ref);
-				btrfs_free_leaf_ref(root, ref);
-				*level = 0;
-				break;
-			}
-		}
-		next = btrfs_find_tree_block(root, bytenr, blocksize);
-		if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
-			free_extent_buffer(next);
-
-			next = read_tree_block(root, bytenr, blocksize,
-					       ptr_gen);
-			cond_resched();
-#if 0
-			/*
-			 * this is a debugging check and can go away
-			 * the ref should never go all the way down to 1
-			 * at this point
-			 */
-			ret = lookup_extent_ref(NULL, root, bytenr, blocksize,
-						&refs);
-			BUG_ON(ret);
-			WARN_ON(refs != 1);
-#endif
-		}
+		next = read_tree_block(root, bytenr, blocksize, ptr_gen);
 		WARN_ON(*level <= 0);
 		if (path->nodes[*level-1])
 			free_extent_buffer(path->nodes[*level-1]);
@@ -3700,11 +3900,16 @@ out:
 	root_owner = btrfs_header_owner(parent);
 	root_gen = btrfs_header_generation(parent);
 
+	/*
+	 * cleanup and free the reference on the last node
+	 * we processed
+	 */
 	ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
 				  parent->start, root_owner, root_gen,
 				  *level, 1);
 	free_extent_buffer(path->nodes[*level]);
 	path->nodes[*level] = NULL;
+
 	*level += 1;
 	BUG_ON(ret);
 
@@ -3824,6 +4029,13 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
 		if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
 			struct extent_buffer *node;
 			struct btrfs_disk_key disk_key;
+
+			/*
+			 * there is more work to do in this level.
+			 * Update the drop_progress marker to reflect
+			 * the work we've done so far, and then bump
+			 * the slot number
+			 */
 			node = path->nodes[i];
 			path->slots[i]++;
 			*level = i;
@@ -3835,6 +4047,11 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
 			return 0;
 		} else {
 			struct extent_buffer *parent;
+
+			/*
+			 * this whole node is done, free our reference
+			 * on it and go up one level
+			 */
 			if (path->nodes[*level] == root->node)
 				parent = path->nodes[*level];
 			else
@@ -4849,6 +5066,7 @@ int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
 		ref->bytenr = buf->start;
 		ref->owner = btrfs_header_owner(buf);
 		ref->generation = btrfs_header_generation(buf);
+
 		ret = btrfs_add_leaf_ref(root, ref, 0);
 		WARN_ON(ret);
 		btrfs_free_leaf_ref(root, ref);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ebd7d6c..95ea58c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2441,6 +2441,8 @@ next_node:
 			ref->generation = leaf_gen;
 			ref->nritems = 0;
 
+			btrfs_sort_leaf_ref(ref);
+
 			ret = btrfs_add_leaf_ref(root, ref, 0);
 			WARN_ON(ret);
 			btrfs_free_leaf_ref(root, ref);
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
index 6f0acc4..d0cc62b 100644
--- a/fs/btrfs/ref-cache.c
+++ b/fs/btrfs/ref-cache.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/sort.h>
 #include "ctree.h"
 #include "ref-cache.h"
 #include "transaction.h"
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
index 16f3183..bc283ad 100644
--- a/fs/btrfs/ref-cache.h
+++ b/fs/btrfs/ref-cache.h
@@ -73,5 +73,4 @@ int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
 int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
 			   int shared);
 int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
-
 #endif
-- 
cgit v1.1


From 8c087b5183adab186a298f2d6ed39aefdcae413c Mon Sep 17 00:00:00 2001
From: Chris Ball <cjb@laptop.org>
Date: Wed, 4 Feb 2009 09:29:54 -0500
Subject: Btrfs: Handle SGID bit when creating inodes Before this patch, new
 files/dirs would ignore the SGID bit on their parent directory and always be
 owned by the creating user's uid/gid.

Signed-off-by: Chris Ball <cjb@laptop.org>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 95ea58c..5792816d4 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3472,7 +3472,14 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 		root->highest_inode = objectid;
 
 	inode->i_uid = current_fsuid();
-	inode->i_gid = current_fsgid();
+
+	if (dir->i_mode & S_ISGID) {
+		inode->i_gid = dir->i_gid;
+		if (S_ISDIR(mode))
+			mode |= S_ISGID;
+	} else
+		inode->i_gid = current_fsgid();
+
 	inode->i_mode = mode;
 	inode->i_ino = objectid;
 	inode_set_bytes(inode, 0);
-- 
cgit v1.1


From 811449496b3e3caa9a8cf43feacbade0153324f2 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Wed, 4 Feb 2009 09:18:33 -0500
Subject: Btrfs: join the transaction in __btrfs_setxattr

With selinux on we end up calling __btrfs_setxattr when we create an inode,
which calls btrfs_start_transaction().  The problem is we've already called
that in btrfs_new_inode, and in btrfs_start_transaction we end up doing a
wait_current_trans().  If btrfs-transaction has started committing it will wait
for all handles to finish, while the other process is waiting for the
transaction to commit.  This is fixed by using btrfs_join_transaction, which
won't wait for the transaction to commit.  Thanks,

Signed-off-by: Josef Bacik <jbacik@redhat.com>
---
 fs/btrfs/xattr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 312b943..a9d3bf4 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -98,7 +98,7 @@ int __btrfs_setxattr(struct inode *inode, const char *name,
 	if (!path)
 		return -ENOMEM;
 
-	trans = btrfs_start_transaction(root, 1);
+	trans = btrfs_join_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, inode);
 
 	/* first lets see if we already have this xattr */
-- 
cgit v1.1


From f03d9301f15fb69cdf1eb59d53c9fb72f68ecccc Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 4 Feb 2009 09:31:06 -0500
Subject: Btrfs: Don't try to compress pages past i_size

The compression code had some checks to make sure we were only
compressing bytes inside of i_size, but it wasn't catching every
case.  To make things worse, some incorrect math about the number
of bytes remaining would make it try to compress more pages than the
file really had.

The fix used here is to fall back to the non-compression code in this
case, which does all the proper cleanup of delalloc and other accounting.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5792816d4..9b43a6f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -360,6 +360,19 @@ again:
 	nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
 	nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
 
+	/*
+	 * we don't want to send crud past the end of i_size through
+	 * compression, that's just a waste of CPU time.  So, if the
+	 * end of the file is before the start of our current
+	 * requested range of bytes, we bail out to the uncompressed
+	 * cleanup code that can deal with all of this.
+	 *
+	 * It isn't really the fastest way to fix things, but this is a
+	 * very uncommon corner.
+	 */
+	if (actual_end <= start)
+		goto cleanup_and_bail_uncompressed;
+
 	total_compressed = actual_end - start;
 
 	/* we want to make sure that amount of ram required to uncompress
@@ -504,6 +517,7 @@ again:
 			goto again;
 		}
 	} else {
+cleanup_and_bail_uncompressed:
 		/*
 		 * No compression, but we still need to write the pages in
 		 * the file we've been given so far.  redirty the locked
-- 
cgit v1.1


From 06d9a8d7c24fe22836bf0b0f82db59d6f98e271e Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 4 Feb 2009 09:30:58 -0500
Subject: Btrfs: Change btrfs_truncate_inode_items to stop when it hits the
 inode

btrfs_truncate_inode_items is setup to stop doing btree searches when
it has finished removing the items for the inode.  It used to detect the
end of the inode by looking for an objectid that didn't match the
one we were searching for.

But, this would result in an extra search through the btree, which
adds extra balancing and cow costs to the operation.

This commit adds a check to see if we found the inode item, which means
we can stop searching early.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9b43a6f..ddb0f0e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2504,7 +2504,7 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	struct btrfs_path *path;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
-	u32 found_type;
+	u32 found_type = (u8)-1;
 	struct extent_buffer *leaf;
 	struct btrfs_file_extent_item *fi;
 	u64 extent_start = 0;
@@ -2691,6 +2691,8 @@ next:
 			if (pending_del_nr)
 				goto del_pending;
 			btrfs_release_path(root, path);
+			if (found_type == BTRFS_INODE_ITEM_KEY)
+				break;
 			goto search_again;
 		}
 
@@ -2707,6 +2709,8 @@ del_pending:
 			BUG_ON(ret);
 			pending_del_nr = 0;
 			btrfs_release_path(root, path);
+			if (found_type == BTRFS_INODE_ITEM_KEY)
+				break;
 			goto search_again;
 		}
 	}
-- 
cgit v1.1


From 4d081c41a4f98aecb5e86ef7d3e644cc7b52131f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 4 Feb 2009 09:31:28 -0500
Subject: Btrfs: change btrfs_del_leaf to drop locks earlier

btrfs_del_leaf does two things.  First it removes the pointer in the
parent, and then it frees the block that has the leaf.  It has the
parent node locked for both operations.

But, it only needs the parent locked while it is deleting the pointer.
After that it can safely free the block without the parent locked.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 3af7773..f6916ce 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -3630,15 +3630,22 @@ noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
 {
 	int ret;
 	u64 root_gen = btrfs_header_generation(path->nodes[1]);
+	u64 parent_start = path->nodes[1]->start;
+	u64 parent_owner = btrfs_header_owner(path->nodes[1]);
 
 	ret = del_ptr(trans, root, path, 1, path->slots[1]);
 	if (ret)
 		return ret;
 
+	/*
+	 * btrfs_free_extent is expensive, we want to make sure we
+	 * aren't holding any locks when we call it
+	 */
+	btrfs_unlock_up_safe(path, 0);
+
 	ret = btrfs_free_extent(trans, root, bytenr,
 				btrfs_level_size(root, 0),
-				path->nodes[1]->start,
-				btrfs_header_owner(path->nodes[1]),
+				parent_start, parent_owner,
 				root_gen, 0, 1);
 	return ret;
 }
-- 
cgit v1.1


From 12f4daccfc3732280debba8f9ba49720372de831 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 4 Feb 2009 09:31:42 -0500
Subject: Btrfs: fix btrfs_unlock_up_safe to walk the entire path

btrfs_unlock_up_safe would break out at the first NULL node entry or
unlocked node it found in the path.

Some of the callers have missing nodes at the lower levels of the path, so this
commit fixes things to check all the nodes in the path before returning.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index f6916ce..0d1e3b9 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1439,9 +1439,9 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
 
 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
 		if (!path->nodes[i])
-			break;
+			continue;
 		if (!path->locks[i])
-			break;
+			continue;
 		btrfs_tree_unlock(path->nodes[i]);
 		path->locks[i] = 0;
 	}
-- 
cgit v1.1


From 7b78c170dc4f538cc7ee66f47b3aac3f3974a36c Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 4 Feb 2009 09:12:46 -0500
Subject: Btrfs: Only prep for btree deletion balances when nodes are mostly
 empty

Whenever an item deletion is done, we need to balance all the nodes
in the tree to make sure we don't end up with an empty node if a pointer
is deleted.  This balance prep happens from the root of the tree down
so we can drop our locks as we go.

reada_for_balance was triggering read-ahead on neighboring nodes even
when no balancing was required.  This adds an extra check to avoid
calling balance_level() and avoid reada_for_balance() when a balance
won't be required.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 0d1e3b9..551177c 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1518,18 +1518,19 @@ again:
 			 */
 			if (prealloc_block.objectid &&
 			    prealloc_block.offset != b->len) {
-				btrfs_set_path_blocking(p);
+				btrfs_release_path(root, p);
 				btrfs_free_reserved_extent(root,
 					   prealloc_block.objectid,
 					   prealloc_block.offset);
 				prealloc_block.objectid = 0;
+				goto again;
 			}
 
 			/*
 			 * for higher level blocks, try not to allocate blocks
 			 * with the block and the parent locks held.
 			 */
-			if (level > 1 && !prealloc_block.objectid &&
+			if (level > 0 && !prealloc_block.objectid &&
 			    btrfs_path_lock_waiting(p, level)) {
 				u32 size = b->len;
 				u64 hint = b->start;
@@ -1614,7 +1615,9 @@ cow_done:
 				}
 				b = p->nodes[level];
 				slot = p->slots[level];
-			} else if (ins_len < 0) {
+			} else if (ins_len < 0 &&
+				   btrfs_header_nritems(b) <
+				   BTRFS_NODEPTRS_PER_BLOCK(root) / 4) {
 				int sret;
 
 				sret = reada_for_balance(root, p, level);
-- 
cgit v1.1


From 9b0d3ace33ef225690a86d7130feadb22367f496 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 4 Feb 2009 09:33:00 -0500
Subject: Btrfs: don't return congestion in write_cache_pages as often

On fast devices that go from congested to uncongested very quickly, pdflush
is waiting too often in congestion_wait, and the FS is backing off to
easily in write_cache_pages.

For now, fix this on the btrfs side by only checking congestion after
some bios have already gone down.  Longer term a real fix is needed
for pdflush, but that is a larger project.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index dd5df53..37d43b5 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2377,11 +2377,6 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
 	int scanned = 0;
 	int range_whole = 0;
 
-	if (wbc->nonblocking && bdi_write_congested(bdi)) {
-		wbc->encountered_congestion = 1;
-		return 0;
-	}
-
 	pagevec_init(&pvec, 0);
 	if (wbc->range_cyclic) {
 		index = mapping->writeback_index; /* Start from prev offset */
-- 
cgit v1.1


From 806638bce99f51deccbfedbe86ab3c5cf55a1d35 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 5 Feb 2009 09:08:14 -0500
Subject: Btrfs: Fix memory leak in cache_drop_leaf_ref

The code wasn't doing a kfree on the sorted array

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1d3e926..7527523 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3582,6 +3582,7 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
 		info++;
 	}
 
+	kfree(sorted);
 	return 0;
 }
 
-- 
cgit v1.1


From 42f15d77df8a7e8a2feb15041d5d30710ee7f951 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 6 Feb 2009 11:35:57 -0500
Subject: Btrfs: Make sure dir is non-null before doing S_ISGID checks

The S_ISGID check in btrfs_new_inode caused an oops during subvol creation
because sometimes the dir is null.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ddb0f0e..8f07062 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3491,7 +3491,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 
 	inode->i_uid = current_fsuid();
 
-	if (dir->i_mode & S_ISGID) {
+	if (dir && (dir->i_mode & S_ISGID)) {
 		inode->i_gid = dir->i_gid;
 		if (S_ISDIR(mode))
 			mode |= S_ISGID;
-- 
cgit v1.1


From 284b066af41579f62649048fdec5c5e7091703e6 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 9 Feb 2009 16:22:03 -0500
Subject: Btrfs: don't use spin_is_contended

Btrfs was using spin_is_contended to see if it should drop locks before
doing extent allocations during btrfs_search_slot.  The idea was to avoid
expensive searches in the tree unless the lock was actually contended.

But, spin_is_contended is specific to the ticket spinlocks on x86, so this
is causing compile errors everywhere else.

In practice, the contention could easily appear some time after we started
doing the extent allocation, and it makes more sense to always drop the lock
instead.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c   |  3 +--
 fs/btrfs/locking.c | 22 ----------------------
 fs/btrfs/locking.h |  2 --
 3 files changed, 1 insertion(+), 26 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 551177c..35443cc 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1530,8 +1530,7 @@ again:
 			 * for higher level blocks, try not to allocate blocks
 			 * with the block and the parent locks held.
 			 */
-			if (level > 0 && !prealloc_block.objectid &&
-			    btrfs_path_lock_waiting(p, level)) {
+			if (level > 0 && !prealloc_block.objectid) {
 				u32 size = b->len;
 				u64 hint = b->start;
 
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 68fd9cc..9ebe938 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -236,25 +236,3 @@ int btrfs_tree_locked(struct extent_buffer *eb)
 	return test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags) ||
 			spin_is_locked(&eb->lock);
 }
-
-/*
- * btrfs_search_slot uses this to decide if it should drop its locks
- * before doing something expensive like allocating free blocks for cow.
- */
-int btrfs_path_lock_waiting(struct btrfs_path *path, int level)
-{
-	int i;
-	struct extent_buffer *eb;
-
-	for (i = level; i <= level + 1 && i < BTRFS_MAX_LEVEL; i++) {
-		eb = path->nodes[i];
-		if (!eb)
-			break;
-		smp_mb();
-		if (spin_is_contended(&eb->lock) ||
-		    waitqueue_active(&eb->lock_wq))
-			return 1;
-	}
-	return 0;
-}
-
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index d92e707..6bb0afb 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -26,8 +26,6 @@ int btrfs_tree_locked(struct extent_buffer *eb);
 int btrfs_try_tree_lock(struct extent_buffer *eb);
 int btrfs_try_spin_lock(struct extent_buffer *eb);
 
-int btrfs_path_lock_waiting(struct btrfs_path *path, int level);
-
 void btrfs_set_lock_blocking(struct extent_buffer *eb);
 void btrfs_clear_lock_blocking(struct extent_buffer *eb);
 #endif
-- 
cgit v1.1


From eb099670895f22970cd143875467c2768d6d87e5 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Thu, 12 Feb 2009 09:27:38 -0500
Subject: Btrfs: make sure all pending extent operations are complete

Theres a slight problem with finish_current_insert, if we set all to 1 and then
go through and don't actually skip any of the extents on the pending list, we
could exit right after we've added new extents.

This is a problem because by inserting the new extents we could have gotten new
COW's to happen and such, so we may have some pending updates to do or even
more inserts to do after that.

So this patch will only exit if we have never skipped any of the extents in the
pending list, and we have no extents to insert, this will make sure that all of
the pending work is truly done before we return.  I've been running with this
patch for a few days with all of my other testing and have not seen issues.
Thanks,

Signed-off-by: Josef Bacik <jbacik@redhat.com>
---
 fs/btrfs/extent-tree.c | 71 +++++++++++++++++++++++++++++---------------------
 1 file changed, 42 insertions(+), 29 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7527523..376656f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1323,8 +1323,25 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root)
 {
-	finish_current_insert(trans, root->fs_info->extent_root, 1);
-	del_pending_extents(trans, root->fs_info->extent_root, 1);
+	u64 start;
+	u64 end;
+	int ret;
+
+	while(1) {
+		finish_current_insert(trans, root->fs_info->extent_root, 1);
+		del_pending_extents(trans, root->fs_info->extent_root, 1);
+
+		/* is there more work to do? */
+		ret = find_first_extent_bit(&root->fs_info->pending_del,
+					    0, &start, &end, EXTENT_WRITEBACK);
+		if (!ret)
+			continue;
+		ret = find_first_extent_bit(&root->fs_info->extent_ins,
+					    0, &start, &end, EXTENT_WRITEBACK);
+		if (!ret)
+			continue;
+		break;
+	}
 	return 0;
 }
 
@@ -2211,13 +2228,12 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
 	u64 end;
 	u64 priv;
 	u64 search = 0;
-	u64 skipped = 0;
 	struct btrfs_fs_info *info = extent_root->fs_info;
 	struct btrfs_path *path;
 	struct pending_extent_op *extent_op, *tmp;
 	struct list_head insert_list, update_list;
 	int ret;
-	int num_inserts = 0, max_inserts;
+	int num_inserts = 0, max_inserts, restart = 0;
 
 	path = btrfs_alloc_path();
 	INIT_LIST_HEAD(&insert_list);
@@ -2233,19 +2249,19 @@ again:
 		ret = find_first_extent_bit(&info->extent_ins, search, &start,
 					    &end, EXTENT_WRITEBACK);
 		if (ret) {
-			if (skipped && all && !num_inserts &&
+			if (restart && !num_inserts &&
 			    list_empty(&update_list)) {
-				skipped = 0;
+				restart = 0;
 				search = 0;
 				continue;
 			}
-			mutex_unlock(&info->extent_ins_mutex);
 			break;
 		}
 
 		ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS);
 		if (!ret) {
-			skipped = 1;
+			if (all)
+				restart = 1;
 			search = end + 1;
 			if (need_resched()) {
 				mutex_unlock(&info->extent_ins_mutex);
@@ -2264,7 +2280,7 @@ again:
 			list_add_tail(&extent_op->list, &insert_list);
 			search = end + 1;
 			if (num_inserts == max_inserts) {
-				mutex_unlock(&info->extent_ins_mutex);
+				restart = 1;
 				break;
 			}
 		} else if (extent_op->type == PENDING_BACKREF_UPDATE) {
@@ -2280,7 +2296,6 @@ again:
 	 * somebody marked this thing for deletion then just unlock it and be
 	 * done, the free_extents will handle it
 	 */
-	mutex_lock(&info->extent_ins_mutex);
 	list_for_each_entry_safe(extent_op, tmp, &update_list, list) {
 		clear_extent_bits(&info->extent_ins, extent_op->bytenr,
 				  extent_op->bytenr + extent_op->num_bytes - 1,
@@ -2302,6 +2317,10 @@ again:
 	if (!list_empty(&update_list)) {
 		ret = update_backrefs(trans, extent_root, path, &update_list);
 		BUG_ON(ret);
+
+		/* we may have COW'ed new blocks, so lets start over */
+		if (all)
+			restart = 1;
 	}
 
 	/*
@@ -2309,9 +2328,9 @@ again:
 	 * need to make sure everything is cleaned then reset everything and
 	 * go back to the beginning
 	 */
-	if (!num_inserts && all && skipped) {
+	if (!num_inserts && restart) {
 		search = 0;
-		skipped = 0;
+		restart = 0;
 		INIT_LIST_HEAD(&update_list);
 		INIT_LIST_HEAD(&insert_list);
 		goto again;
@@ -2368,27 +2387,19 @@ again:
 	BUG_ON(ret);
 
 	/*
-	 * if we broke out of the loop in order to insert stuff because we hit
-	 * the maximum number of inserts at a time we can handle, then loop
-	 * back and pick up where we left off
+	 * if restart is set for whatever reason we need to go back and start
+	 * searching through the pending list again.
+	 *
+	 * We just inserted some extents, which could have resulted in new
+	 * blocks being allocated, which would result in new blocks needing
+	 * updates, so if all is set we _must_ restart to get the updated
+	 * blocks.
 	 */
-	if (num_inserts == max_inserts) {
-		INIT_LIST_HEAD(&insert_list);
-		INIT_LIST_HEAD(&update_list);
-		num_inserts = 0;
-		goto again;
-	}
-
-	/*
-	 * again, if we need to make absolutely sure there are no more pending
-	 * extent operations left and we know that we skipped some, go back to
-	 * the beginning and do it all again
-	 */
-	if (all && skipped) {
+	if (restart || all) {
 		INIT_LIST_HEAD(&insert_list);
 		INIT_LIST_HEAD(&update_list);
 		search = 0;
-		skipped = 0;
+		restart = 0;
 		num_inserts = 0;
 		goto again;
 	}
@@ -2709,6 +2720,8 @@ again:
 		goto again;
 	}
 
+	if (!err)
+		finish_current_insert(trans, extent_root, 0);
 	return err;
 }
 
-- 
cgit v1.1


From b288052e1779261ae80138074989ef50358c4e58 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 12 Feb 2009 09:37:35 -0500
Subject: Btrfs: process mount options on mount -o remount,

Btrfs wasn't parsing any new mount options during remount, making it
difficult to set mount options on a root drive.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/super.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f3fd7e2..66b8341 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -511,6 +511,10 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 	struct btrfs_root *root = btrfs_sb(sb);
 	int ret;
 
+	ret = btrfs_parse_options(root, data);
+	if (ret)
+		return -EINVAL;
+
 	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
 		return 0;
 
-- 
cgit v1.1


From 536ac8ae86e68bb5574d7cc81c7d229a86b82601 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 12 Feb 2009 09:41:38 -0500
Subject: Btrfs: use larger metadata clusters in ssd mode

Larger metadata clusters can significantly improve writeback performance
on ssd drives with large erasure blocks.  The larger clusters make it
more likely a given IO will completely overwrite the ssd block, so it
doesn't have to do an internal rwm cycle.

On spinning media, lager metadata clusters end up spreading out the
metadata more over time, which makes fsck slower, so we don't want this
to be the default.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 376656f..c59e120 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2872,7 +2872,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 
 	if (data & BTRFS_BLOCK_GROUP_METADATA) {
 		last_ptr = &root->fs_info->last_alloc;
-		empty_cluster = 64 * 1024;
+		if (!btrfs_test_opt(root, SSD))
+			empty_cluster = 64 * 1024;
 	}
 
 	if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD))
-- 
cgit v1.1


From e1df36d2f18254d0690a0fbe036cece74ec311b8 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 12 Feb 2009 09:45:08 -0500
Subject: Btrfs: don't clean old snapshots on sync(1)

Cleaning old snapshots can make sync(1) somewhat slow, and some users
and applications still use it in a global fsync kind of workload.

This patch changes btrfs not to clean old snapshots during sync, which is
safe from a FS consistency point of view.  The major downside is that it
makes it difficult to tell when old snapshots have been reaped and
the space they were using has been reclaimed.  A new ioctl will be added
for this purpose instead.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/super.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 66b8341..19a4daf 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -379,7 +379,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 	btrfs_start_delalloc_inodes(root);
 	btrfs_wait_ordered_extents(root, 0);
 
-	btrfs_clean_old_snapshots(root);
 	trans = btrfs_start_transaction(root, 1);
 	ret = btrfs_commit_transaction(trans, root);
 	sb->s_dirt = 0;
-- 
cgit v1.1


From b335b0034e252e79ec2e9c6697f5d663c4627bec Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Thu, 12 Feb 2009 10:06:04 -0500
Subject: Btrfs: Avoid using __GFP_HIGHMEM with slab allocator

btrfs_releasepage may call kmem_cache_alloc indirectly,
and provide same GFP flags it gets to kmem_cache_alloc.
So it's possible to use __GFP_HIGHMEM with the slab
allocator.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8f07062..638bcb5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4263,7 +4263,7 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
 {
 	if (PageWriteback(page) || PageDirty(page))
 		return 0;
-	return __btrfs_releasepage(page, gfp_flags);
+	return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
 }
 
 static void btrfs_invalidatepage(struct page *page, unsigned long offset)
-- 
cgit v1.1


From 7951f3cefbd711f4429a0cd014aa83a844c399a0 Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Thu, 12 Feb 2009 10:06:15 -0500
Subject: Btrfs: balance_level checks !child after access

The BUG_ON() is in the wrong spot.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 35443cc..6674692 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -917,9 +917,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 
 		/* promote the child to a root */
 		child = read_node_slot(root, mid, 0);
+		BUG_ON(!child);
 		btrfs_tree_lock(child);
 		btrfs_set_lock_blocking(child);
-		BUG_ON(!child);
 		ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
 		BUG_ON(ret);
 
-- 
cgit v1.1


From e00f7308658622fbd483cb0d9fe41165bf9050d0 Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Thu, 12 Feb 2009 14:11:25 -0500
Subject: Btrfs: remove btrfs_init_path

btrfs_init_path was initially used when the path objects were on the
stack.  Now all the work is done by btrfs_alloc_path and btrfs_init_path
isn't required.

This patch removes it, and just uses kmem_cache_zalloc to zero out the object.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c     | 11 ++---------
 fs/btrfs/ctree.h     |  1 -
 fs/btrfs/inode-map.c |  1 -
 fs/btrfs/inode.c     |  2 --
 4 files changed, 2 insertions(+), 13 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 6674692..c8f4c54 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -38,19 +38,12 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
 static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		   struct btrfs_path *path, int level, int slot);
 
-inline void btrfs_init_path(struct btrfs_path *p)
-{
-	memset(p, 0, sizeof(*p));
-}
-
 struct btrfs_path *btrfs_alloc_path(void)
 {
 	struct btrfs_path *path;
-	path = kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS);
-	if (path) {
-		btrfs_init_path(path);
+	path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
+	if (path)
 		path->reada = 1;
-	}
 	return path;
 }
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 531db11..3f7a805 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1834,7 +1834,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
 struct btrfs_path *btrfs_alloc_path(void);
 void btrfs_free_path(struct btrfs_path *p);
-void btrfs_init_path(struct btrfs_path *p);
 void btrfs_set_path_blocking(struct btrfs_path *p);
 void btrfs_clear_path_blocking(struct btrfs_path *p);
 void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 2aa7987..cc7334d 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -84,7 +84,6 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
 	search_key.type = 0;
 	search_key.offset = 0;
 
-	btrfs_init_path(path);
 	start_found = 0;
 	ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0);
 	if (ret < 0)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 638bcb5..3cee77a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2531,8 +2531,6 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	key.offset = (u64)-1;
 	key.type = (u8)-1;
 
-	btrfs_init_path(path);
-
 search_again:
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 	if (ret < 0)
-- 
cgit v1.1


From a48ddf08ba9bab91efd95e458737afa9d7699623 Mon Sep 17 00:00:00 2001
From: Qinghuang Feng <qhfeng.kernel@gmail.com>
Date: Thu, 12 Feb 2009 14:25:23 -0500
Subject: Btrfs: remove unused code in split_state()

These two lines are not used, remove them.

Signed-off-by: Qinghuang Feng <qhfeng.kernel@gmail.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 37d43b5..ebe6b29 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -415,8 +415,6 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
 
 	node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
 	if (node) {
-		struct extent_state *found;
-		found = rb_entry(node, struct extent_state, rb_node);
 		free_extent_state(prealloc);
 		return -EEXIST;
 	}
-- 
cgit v1.1


From 3f3420df505e47751ef76a652b5cb660e5360d6f Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Thu, 12 Feb 2009 10:16:03 -0500
Subject: Btrfs: fs/btrfs/volumes.c: remove useless kzalloc

The call to kzalloc is followed by a kmalloc whose result is stored in the
same variable.

The semantic match that finds the problem is as follows:
(http://www.emn.fr/x-info/coccinelle/)

// <smpl>
@r exists@
local idexpression x;
statement S;
expression E;
identifier f,l;
position p1,p2;
expression *ptr != NULL;
@@

(
if ((x@p1 = \(kmalloc\|kzalloc\|kcalloc\)(...)) == NULL) S
|
x@p1 = \(kmalloc\|kzalloc\|kcalloc\)(...);
...
if (x == NULL) S
)
<... when != x
     when != if (...) { <+...x...+> }
x->f = E
...>
(
 return \(0\|<+...x...+>\|ptr\);
|
 return@p2 ...;
)

@script:python@
p1 << r.p1;
p2 << r.p2;
@@

print "* file: %s kmalloc %s return %s" % (p1[0].file,p1[0].line,p2[0].line)
// </smpl>

Signed-off-by: Julia Lawall <julia@diku.dk>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index bcd14eb..c793b6f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2894,10 +2894,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 		free_extent_map(em);
 	}
 
-	map = kzalloc(sizeof(*map), GFP_NOFS);
-	if (!map)
-		return -ENOMEM;
-
 	em = alloc_extent_map(GFP_NOFS);
 	if (!em)
 		return -ENOMEM;
-- 
cgit v1.1


From 4008c04a07c73ec3cb1be4c1391d2159a8f75d6d Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 12 Feb 2009 14:09:45 -0500
Subject: Btrfs: make a lockdep class for the extent buffer locks

Btrfs is currently using spin_lock_nested with a nested value based
on the tree depth of the block.  But, this doesn't quite work because
the max tree depth is bigger than what spin_lock_nested can deal with,
and because locks are sometimes taken before the level field is filled in.

The solution here is to use lockdep_set_class_and_name instead, and to
set the class before unlocking the pages when the block is read from the
disk and just after init of a freshly allocated tree block.

btrfs_clear_path_blocking is also changed to take the locks in the proper
order, and it also makes sure all the locks currently held are properly
set to blocking before it tries to retake the spinlocks.  Otherwise, lockdep
gets upset about bad lock orderin.

The lockdep magic cam from Peter Zijlstra <peterz@infradead.org>

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 45 ++++++++++++++++++++++++++++++++++-----------
 fs/btrfs/ctree.h       | 10 +++-------
 fs/btrfs/disk-io.c     | 46 +++++++++++++++++++++++++++++++++++++++++++++-
 fs/btrfs/disk-io.h     | 10 ++++++++++
 fs/btrfs/extent-tree.c |  7 +++++--
 fs/btrfs/locking.c     | 11 -----------
 fs/btrfs/volumes.c     |  2 ++
 7 files changed, 99 insertions(+), 32 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index c8f4c54..42491d7 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -62,14 +62,38 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p)
 
 /*
  * reset all the locked nodes in the patch to spinning locks.
+ *
+ * held is used to keep lockdep happy, when lockdep is enabled
+ * we set held to a blocking lock before we go around and
+ * retake all the spinlocks in the path.  You can safely use NULL
+ * for held
  */
-noinline void btrfs_clear_path_blocking(struct btrfs_path *p)
+noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
+					struct extent_buffer *held)
 {
 	int i;
-	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	/* lockdep really cares that we take all of these spinlocks
+	 * in the right order.  If any of the locks in the path are not
+	 * currently blocking, it is going to complain.  So, make really
+	 * really sure by forcing the path to blocking before we clear
+	 * the path blocking.
+	 */
+	if (held)
+		btrfs_set_lock_blocking(held);
+	btrfs_set_path_blocking(p);
+#endif
+
+	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
 		if (p->nodes[i] && p->locks[i])
 			btrfs_clear_lock_blocking(p->nodes[i]);
 	}
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	if (held)
+		btrfs_clear_lock_blocking(held);
+#endif
 }
 
 /* this also releases the path */
@@ -279,7 +303,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 						  trans->transid, level, &ins);
 		BUG_ON(ret);
 		cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
-					    buf->len);
+					    buf->len, level);
 	} else {
 		cow = btrfs_alloc_free_block(trans, root, buf->len,
 					     parent_start,
@@ -1559,7 +1583,7 @@ cow_done:
 		if (!p->skip_locking)
 			p->locks[level] = 1;
 
-		btrfs_clear_path_blocking(p);
+		btrfs_clear_path_blocking(p, NULL);
 
 		/*
 		 * we have a lock on b and as long as we aren't changing
@@ -1598,7 +1622,7 @@ cow_done:
 
 				btrfs_set_path_blocking(p);
 				sret = split_node(trans, root, p, level);
-				btrfs_clear_path_blocking(p);
+				btrfs_clear_path_blocking(p, NULL);
 
 				BUG_ON(sret > 0);
 				if (sret) {
@@ -1618,7 +1642,7 @@ cow_done:
 
 				btrfs_set_path_blocking(p);
 				sret = balance_level(trans, root, p, level);
-				btrfs_clear_path_blocking(p);
+				btrfs_clear_path_blocking(p, NULL);
 
 				if (sret) {
 					ret = sret;
@@ -1681,13 +1705,13 @@ cow_done:
 			if (!p->skip_locking) {
 				int lret;
 
-				btrfs_clear_path_blocking(p);
+				btrfs_clear_path_blocking(p, NULL);
 				lret = btrfs_try_spin_lock(b);
 
 				if (!lret) {
 					btrfs_set_path_blocking(p);
 					btrfs_tree_lock(b);
-					btrfs_clear_path_blocking(p);
+					btrfs_clear_path_blocking(p, b);
 				}
 			}
 		} else {
@@ -1699,7 +1723,7 @@ cow_done:
 				btrfs_set_path_blocking(p);
 				sret = split_leaf(trans, root, key,
 						      p, ins_len, ret == 0);
-				btrfs_clear_path_blocking(p);
+				btrfs_clear_path_blocking(p, NULL);
 
 				BUG_ON(sret > 0);
 				if (sret) {
@@ -3919,7 +3943,6 @@ find_next_key:
 				btrfs_release_path(root, path);
 				goto again;
 			} else {
-				btrfs_clear_path_blocking(path);
 				goto out;
 			}
 		}
@@ -3939,7 +3962,7 @@ find_next_key:
 		path->locks[level - 1] = 1;
 		path->nodes[level - 1] = cur;
 		unlock_up(path, level, 1);
-		btrfs_clear_path_blocking(path);
+		btrfs_clear_path_blocking(path, NULL);
 	}
 out:
 	if (ret == 0)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 3f7a805..766b31a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -43,11 +43,7 @@ struct btrfs_ordered_sum;
 
 #define BTRFS_ACL_NOT_CACHED    ((void *)-1)
 
-#ifdef CONFIG_LOCKDEP
-# define BTRFS_MAX_LEVEL 7
-#else
-# define BTRFS_MAX_LEVEL 8
-#endif
+#define BTRFS_MAX_LEVEL 8
 
 /* holds pointers to all of the tree roots */
 #define BTRFS_ROOT_TREE_OBJECTID 1ULL
@@ -1715,7 +1711,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 					     u64 empty_size);
 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
 					    struct btrfs_root *root,
-					    u64 bytenr, u32 blocksize);
+					    u64 bytenr, u32 blocksize,
+					    int level);
 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root,
 		       u64 num_bytes, u64 parent, u64 min_bytes,
@@ -1835,7 +1832,6 @@ void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
 struct btrfs_path *btrfs_alloc_path(void);
 void btrfs_free_path(struct btrfs_path *p);
 void btrfs_set_path_blocking(struct btrfs_path *p);
-void btrfs_clear_path_blocking(struct btrfs_path *p);
 void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
 
 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 5aebddd..adda739 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -75,6 +75,40 @@ struct async_submit_bio {
 	struct btrfs_work work;
 };
 
+/* These are used to set the lockdep class on the extent buffer locks.
+ * The class is set by the readpage_end_io_hook after the buffer has
+ * passed csum validation but before the pages are unlocked.
+ *
+ * The lockdep class is also set by btrfs_init_new_buffer on freshly
+ * allocated blocks.
+ *
+ * The class is based on the level in the tree block, which allows lockdep
+ * to know that lower nodes nest inside the locks of higher nodes.
+ *
+ * We also add a check to make sure the highest level of the tree is
+ * the same as our lockdep setup here.  If BTRFS_MAX_LEVEL changes, this
+ * code needs update as well.
+ */
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+# if BTRFS_MAX_LEVEL != 8
+#  error
+# endif
+static struct lock_class_key btrfs_eb_class[BTRFS_MAX_LEVEL + 1];
+static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = {
+	/* leaf */
+	"btrfs-extent-00",
+	"btrfs-extent-01",
+	"btrfs-extent-02",
+	"btrfs-extent-03",
+	"btrfs-extent-04",
+	"btrfs-extent-05",
+	"btrfs-extent-06",
+	"btrfs-extent-07",
+	/* highest possible level */
+	"btrfs-extent-08",
+};
+#endif
+
 /*
  * extents on the btree inode are pretty simple, there's one extent
  * that covers the entire device
@@ -347,6 +381,15 @@ static int check_tree_block_fsid(struct btrfs_root *root,
 	return ret;
 }
 
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level)
+{
+	lockdep_set_class_and_name(&eb->lock,
+			   &btrfs_eb_class[level],
+			   btrfs_eb_name[level]);
+}
+#endif
+
 static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 			       struct extent_state *state)
 {
@@ -392,6 +435,8 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	}
 	found_level = btrfs_header_level(eb);
 
+	btrfs_set_buffer_lockdep_class(eb, found_level);
+
 	ret = csum_tree_block(root, eb, 1);
 	if (ret)
 		ret = -EIO;
@@ -1777,7 +1822,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	ret = find_and_setup_root(tree_root, fs_info,
 				  BTRFS_DEV_TREE_OBJECTID, dev_root);
 	dev_root->track_dirty = 1;
-
 	if (ret)
 		goto fail_extent_root;
 
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 494a56e..95029db 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -101,4 +101,14 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
 int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root);
 int btree_lock_page_hook(struct page *page);
+
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level);
+#else
+static inline void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb,
+						 int level)
+{
+}
+#endif
 #endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c59e120..cd86bff 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3416,7 +3416,8 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 
 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
 					    struct btrfs_root *root,
-					    u64 bytenr, u32 blocksize)
+					    u64 bytenr, u32 blocksize,
+					    int level)
 {
 	struct extent_buffer *buf;
 
@@ -3424,6 +3425,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
 	if (!buf)
 		return ERR_PTR(-ENOMEM);
 	btrfs_set_header_generation(buf, trans->transid);
+	btrfs_set_buffer_lockdep_class(buf, level);
 	btrfs_tree_lock(buf);
 	clean_tree_block(trans, root, buf);
 
@@ -3467,7 +3469,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 		return ERR_PTR(ret);
 	}
 
-	buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize);
+	buf = btrfs_init_new_buffer(trans, root, ins.objectid,
+				    blocksize, level);
 	return buf;
 }
 
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 9ebe938..85506c4 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -25,21 +25,10 @@
 #include "extent_io.h"
 #include "locking.h"
 
-/*
- * btrfs_header_level() isn't free, so don't call it when lockdep isn't
- * on
- */
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-static inline void spin_nested(struct extent_buffer *eb)
-{
-	spin_lock_nested(&eb->lock, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
-}
-#else
 static inline void spin_nested(struct extent_buffer *eb)
 {
 	spin_lock(&eb->lock);
 }
-#endif
 
 /*
  * Setting a lock to blocking will drop the spinlock and set the
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c793b6f..1316139 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3102,6 +3102,8 @@ int btrfs_read_sys_array(struct btrfs_root *root)
 	if (!sb)
 		return -ENOMEM;
 	btrfs_set_buffer_uptodate(sb);
+	btrfs_set_buffer_lockdep_class(sb, 0);
+
 	write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
 	array_size = btrfs_super_sys_array_size(super_copy);
 
-- 
cgit v1.1


From 2456242530a21cfee82646ebeeda65d3f74faa4c Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Thu, 12 Feb 2009 14:14:53 -0500
Subject: Btrfs: hold trans_mutex when using btrfs_record_root_in_trans

btrfs_record_root_in_trans needs the trans_mutex held to make sure two
callers don't race to setup the root in a given transaction.  This adds
it to all the places that were missing it.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/extent-tree.c | 2 ++
 fs/btrfs/transaction.c | 2 ++
 fs/btrfs/tree-log.c    | 2 ++
 3 files changed, 6 insertions(+)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index cd86bff..0a5d796 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5658,7 +5658,9 @@ static noinline int relocate_one_extent(struct btrfs_root *extent_root,
 			prev_block = block_start;
 		}
 
+		mutex_lock(&extent_root->fs_info->trans_mutex);
 		btrfs_record_root_in_trans(found_root);
+		mutex_unlock(&extent_root->fs_info->trans_mutex);
 		if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
 			/*
 			 * try to update data extent references while
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 919172de..4112d53 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -688,7 +688,9 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 		num_bytes -= btrfs_root_used(&dirty->root->root_item);
 		bytes_used = btrfs_root_used(&root->root_item);
 		if (num_bytes) {
+			mutex_lock(&root->fs_info->trans_mutex);
 			btrfs_record_root_in_trans(root);
+			mutex_unlock(&root->fs_info->trans_mutex);
 			btrfs_set_root_used(&root->root_item,
 					    bytes_used - num_bytes);
 		}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 2079429..9c462fb 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2832,7 +2832,9 @@ again:
 		BUG_ON(!wc.replay_dest);
 
 		wc.replay_dest->log_root = log;
+		mutex_lock(&fs_info->trans_mutex);
 		btrfs_record_root_in_trans(wc.replay_dest);
+		mutex_unlock(&fs_info->trans_mutex);
 		ret = walk_log_tree(trans, log, &wc);
 		BUG_ON(ret);
 
-- 
cgit v1.1


From 2cfbd50b536c878e58ab3681c4e944fa3d99b415 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 20 Feb 2009 10:55:10 -0500
Subject: Btrfs: check file pointer in btrfs_sync_file

fsync can be called by NFS with a null file pointer, and btrfs was
oopsing in this case.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 3e8023e..872f104 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1222,7 +1222,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 	/*
 	 * ok we haven't committed the transaction yet, lets do a commit
 	 */
-	if (file->private_data)
+	if (file && file->private_data)
 		btrfs_ioctl_trans_end(file);
 
 	trans = btrfs_start_transaction(root, 1);
@@ -1231,7 +1231,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 		goto out;
 	}
 
-	ret = btrfs_log_dentry_safe(trans, root, file->f_dentry);
+	ret = btrfs_log_dentry_safe(trans, root, dentry);
 	if (ret < 0)
 		goto out;
 
@@ -1245,7 +1245,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 	 * file again, but that will end up using the synchronization
 	 * inside btrfs_sync_log to keep things safe.
 	 */
-	mutex_unlock(&file->f_dentry->d_inode->i_mutex);
+	mutex_unlock(&dentry->d_inode->i_mutex);
 
 	if (ret > 0) {
 		ret = btrfs_commit_transaction(trans, root);
@@ -1253,7 +1253,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 		btrfs_sync_log(trans, root);
 		ret = btrfs_end_transaction(trans, root);
 	}
-	mutex_lock(&file->f_dentry->d_inode->i_mutex);
+	mutex_lock(&dentry->d_inode->i_mutex);
 out:
 	return ret > 0 ? EIO : ret;
 }
-- 
cgit v1.1


From 6a63209fc02d5483371f07e4913ee8abad608051 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Fri, 20 Feb 2009 11:00:09 -0500
Subject: Btrfs: add better -ENOSPC handling

This is a step in the direction of better -ENOSPC handling.  Instead of
checking the global bytes counter we check the space_info bytes counters to
make sure we have enough space.

If we don't we go ahead and try to allocate a new chunk, and then if that fails
we return -ENOSPC.  This patch adds two counters to btrfs_space_info,
bytes_delalloc and bytes_may_use.

bytes_delalloc account for extents we've actually setup for delalloc and will
be allocated at some point down the line.

bytes_may_use is to keep track of how many bytes we may use for delalloc at
some point.  When we actually set the extent_bit for the delalloc bytes we
subtract the reserved bytes from the bytes_may_use counter.  This keeps us from
not actually being able to allocate space for any delalloc bytes.

Signed-off-by: Josef Bacik <jbacik@redhat.com>
---
 fs/btrfs/btrfs_inode.h |   8 ++
 fs/btrfs/ctree.h       |  40 ++++++---
 fs/btrfs/extent-tree.c | 215 +++++++++++++++++++++++++++++++++++++++++++++----
 fs/btrfs/file.c        |  16 +++-
 fs/btrfs/inode.c       |  62 ++++----------
 fs/btrfs/ioctl.c       |   6 +-
 6 files changed, 271 insertions(+), 76 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index a8c9693..72677ce 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -66,6 +66,9 @@ struct btrfs_inode {
 	 */
 	struct list_head delalloc_inodes;
 
+	/* the space_info for where this inode's data allocations are done */
+	struct btrfs_space_info *space_info;
+
 	/* full 64 bit generation number, struct vfs_inode doesn't have a big
 	 * enough field for this.
 	 */
@@ -94,6 +97,11 @@ struct btrfs_inode {
 	 */
 	u64 delalloc_bytes;
 
+	/* total number of bytes that may be used for this inode for
+	 * delalloc
+	 */
+	u64 reserved_bytes;
+
 	/*
 	 * the size of the file stored in the metadata on disk.  data=ordered
 	 * means the in-memory i_size might be larger than the size on disk
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 766b31a..82491ba 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -596,13 +596,27 @@ struct btrfs_block_group_item {
 
 struct btrfs_space_info {
 	u64 flags;
-	u64 total_bytes;
-	u64 bytes_used;
-	u64 bytes_pinned;
-	u64 bytes_reserved;
-	u64 bytes_readonly;
-	int full;
-	int force_alloc;
+
+	u64 total_bytes;	/* total bytes in the space */
+	u64 bytes_used;		/* total bytes used on disk */
+	u64 bytes_pinned;	/* total bytes pinned, will be freed when the
+				   transaction finishes */
+	u64 bytes_reserved;	/* total bytes the allocator has reserved for
+				   current allocations */
+	u64 bytes_readonly;	/* total bytes that are read only */
+
+	/* delalloc accounting */
+	u64 bytes_delalloc;	/* number of bytes reserved for allocation,
+				   this space is not necessarily reserved yet
+				   by the allocator */
+	u64 bytes_may_use;	/* number of bytes that may be used for
+				   delalloc */
+
+	int full;		/* indicates that we cannot allocate any more
+				   chunks for this space */
+	int force_alloc;	/* set if we need to force a chunk alloc for
+				   this space */
+
 	struct list_head list;
 
 	/* for block groups in our same type */
@@ -1782,6 +1796,16 @@ int btrfs_add_dead_reloc_root(struct btrfs_root *root);
 int btrfs_cleanup_reloc_trees(struct btrfs_root *root);
 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
+void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
+int btrfs_check_metadata_free_space(struct btrfs_root *root);
+int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
+				u64 bytes);
+void btrfs_free_reserved_data_space(struct btrfs_root *root,
+				    struct inode *inode, u64 bytes);
+void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
+				 u64 bytes);
+void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
+			      u64 bytes);
 /* ctree.c */
 int btrfs_previous_item(struct btrfs_root *root,
 			struct btrfs_path *path, u64 min_objectid,
@@ -2027,8 +2051,6 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 unsigned long btrfs_force_ra(struct address_space *mapping,
 			      struct file_ra_state *ra, struct file *file,
 			      pgoff_t offset, pgoff_t last_index);
-int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
-			   int for_del);
 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page);
 int btrfs_readpage(struct file *file, struct page *page);
 void btrfs_delete_inode(struct inode *inode);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0a5d796..e11875e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -60,6 +60,10 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 			      u64 bytenr, u64 num_bytes, int alloc,
 			      int mark_free);
 
+static int do_chunk_alloc(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *extent_root, u64 alloc_bytes,
+			  u64 flags, int force);
+
 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
 {
 	return (cache->flags & bits) == bits;
@@ -1909,6 +1913,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	found->bytes_pinned = 0;
 	found->bytes_reserved = 0;
 	found->bytes_readonly = 0;
+	found->bytes_delalloc = 0;
 	found->full = 0;
 	found->force_alloc = 0;
 	*space_info = found;
@@ -1972,6 +1977,196 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 	return flags;
 }
 
+static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data)
+{
+	struct btrfs_fs_info *info = root->fs_info;
+	u64 alloc_profile;
+
+	if (data) {
+		alloc_profile = info->avail_data_alloc_bits &
+			info->data_alloc_profile;
+		data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
+	} else if (root == root->fs_info->chunk_root) {
+		alloc_profile = info->avail_system_alloc_bits &
+			info->system_alloc_profile;
+		data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
+	} else {
+		alloc_profile = info->avail_metadata_alloc_bits &
+			info->metadata_alloc_profile;
+		data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
+	}
+
+	return btrfs_reduce_alloc_profile(root, data);
+}
+
+void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
+{
+	u64 alloc_target;
+
+	alloc_target = btrfs_get_alloc_profile(root, 1);
+	BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
+						       alloc_target);
+}
+
+/*
+ * for now this just makes sure we have at least 5% of our metadata space free
+ * for use.
+ */
+int btrfs_check_metadata_free_space(struct btrfs_root *root)
+{
+	struct btrfs_fs_info *info = root->fs_info;
+	struct btrfs_space_info *meta_sinfo;
+	u64 alloc_target, thresh;
+
+	/* get the space info for where the metadata will live */
+	alloc_target = btrfs_get_alloc_profile(root, 0);
+	meta_sinfo = __find_space_info(info, alloc_target);
+
+	/*
+	 * if the metadata area isn't maxed out then there is no sense in
+	 * checking how much is used, since we can always allocate a new chunk
+	 */
+	if (!meta_sinfo->full)
+		return 0;
+
+	spin_lock(&meta_sinfo->lock);
+	thresh = meta_sinfo->total_bytes * 95;
+
+	do_div(thresh, 100);
+
+	if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
+	    meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) {
+		spin_unlock(&meta_sinfo->lock);
+		return -ENOSPC;
+	}
+	spin_unlock(&meta_sinfo->lock);
+
+	return 0;
+}
+
+/*
+ * This will check the space that the inode allocates from to make sure we have
+ * enough space for bytes.
+ */
+int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
+				u64 bytes)
+{
+	struct btrfs_space_info *data_sinfo;
+	int ret = 0;
+
+	/* make sure bytes are sectorsize aligned */
+	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
+
+	data_sinfo = BTRFS_I(inode)->space_info;
+again:
+	/* make sure we have enough space to handle the data first */
+	spin_lock(&data_sinfo->lock);
+	if (data_sinfo->total_bytes - data_sinfo->bytes_used -
+	    data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
+	    data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
+	    data_sinfo->bytes_may_use < bytes) {
+		/*
+		 * if we don't have enough free bytes in this space then we need
+		 * to alloc a new chunk.
+		 */
+		if (!data_sinfo->full) {
+			u64 alloc_target;
+			struct btrfs_trans_handle *trans;
+
+			data_sinfo->force_alloc = 1;
+			spin_unlock(&data_sinfo->lock);
+
+			alloc_target = btrfs_get_alloc_profile(root, 1);
+			trans = btrfs_start_transaction(root, 1);
+			if (!trans)
+				return -ENOMEM;
+
+			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+					     bytes + 2 * 1024 * 1024,
+					     alloc_target, 0);
+			btrfs_end_transaction(trans, root);
+			if (ret)
+				return ret;
+			goto again;
+		}
+		spin_unlock(&data_sinfo->lock);
+		printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
+		       ", %llu bytes_used, %llu bytes_reserved, "
+		       "%llu bytes_pinned, %llu bytes_readonly, %llu may use"
+		       "%llu total\n", bytes, data_sinfo->bytes_delalloc,
+		       data_sinfo->bytes_used, data_sinfo->bytes_reserved,
+		       data_sinfo->bytes_pinned, data_sinfo->bytes_readonly,
+		       data_sinfo->bytes_may_use, data_sinfo->total_bytes);
+		return -ENOSPC;
+	}
+	data_sinfo->bytes_may_use += bytes;
+	BTRFS_I(inode)->reserved_bytes += bytes;
+	spin_unlock(&data_sinfo->lock);
+
+	return btrfs_check_metadata_free_space(root);
+}
+
+/*
+ * if there was an error for whatever reason after calling
+ * btrfs_check_data_free_space, call this so we can cleanup the counters.
+ */
+void btrfs_free_reserved_data_space(struct btrfs_root *root,
+				    struct inode *inode, u64 bytes)
+{
+	struct btrfs_space_info *data_sinfo;
+
+	/* make sure bytes are sectorsize aligned */
+	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
+
+	data_sinfo = BTRFS_I(inode)->space_info;
+	spin_lock(&data_sinfo->lock);
+	data_sinfo->bytes_may_use -= bytes;
+	BTRFS_I(inode)->reserved_bytes -= bytes;
+	spin_unlock(&data_sinfo->lock);
+}
+
+/* called when we are adding a delalloc extent to the inode's io_tree */
+void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
+				  u64 bytes)
+{
+	struct btrfs_space_info *data_sinfo;
+
+	/* get the space info for where this inode will be storing its data */
+	data_sinfo = BTRFS_I(inode)->space_info;
+
+	/* make sure we have enough space to handle the data first */
+	spin_lock(&data_sinfo->lock);
+	data_sinfo->bytes_delalloc += bytes;
+
+	/*
+	 * we are adding a delalloc extent without calling
+	 * btrfs_check_data_free_space first.  This happens on a weird
+	 * writepage condition, but shouldn't hurt our accounting
+	 */
+	if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) {
+		data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes;
+		BTRFS_I(inode)->reserved_bytes = 0;
+	} else {
+		data_sinfo->bytes_may_use -= bytes;
+		BTRFS_I(inode)->reserved_bytes -= bytes;
+	}
+
+	spin_unlock(&data_sinfo->lock);
+}
+
+/* called when we are clearing an delalloc extent from the inode's io_tree */
+void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
+			      u64 bytes)
+{
+	struct btrfs_space_info *info;
+
+	info = BTRFS_I(inode)->space_info;
+
+	spin_lock(&info->lock);
+	info->bytes_delalloc -= bytes;
+	spin_unlock(&info->lock);
+}
+
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *extent_root, u64 alloc_bytes,
 			  u64 flags, int force)
@@ -3105,6 +3300,10 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
 	       (unsigned long long)(info->total_bytes - info->bytes_used -
 				    info->bytes_pinned - info->bytes_reserved),
 	       (info->full) ? "" : "not ");
+	printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
+	       " may_use=%llu, used=%llu\n", info->total_bytes,
+	       info->bytes_pinned, info->bytes_delalloc, info->bytes_may_use,
+	       info->bytes_used);
 
 	down_read(&info->groups_sem);
 	list_for_each_entry(cache, &info->block_groups, list) {
@@ -3131,24 +3330,10 @@ static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 {
 	int ret;
 	u64 search_start = 0;
-	u64 alloc_profile;
 	struct btrfs_fs_info *info = root->fs_info;
 
-	if (data) {
-		alloc_profile = info->avail_data_alloc_bits &
-			info->data_alloc_profile;
-		data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
-	} else if (root == root->fs_info->chunk_root) {
-		alloc_profile = info->avail_system_alloc_bits &
-			info->system_alloc_profile;
-		data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
-	} else {
-		alloc_profile = info->avail_metadata_alloc_bits &
-			info->metadata_alloc_profile;
-		data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
-	}
+	data = btrfs_get_alloc_profile(root, data);
 again:
-	data = btrfs_reduce_alloc_profile(root, data);
 	/*
 	 * the only place that sets empty_size is btrfs_realloc_node, which
 	 * is not called recursively on allocations
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 872f104..dc78954 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1091,19 +1091,24 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		WARN_ON(num_pages > nrptrs);
 		memset(pages, 0, sizeof(struct page *) * nrptrs);
 
-		ret = btrfs_check_free_space(root, write_bytes, 0);
+		ret = btrfs_check_data_free_space(root, inode, write_bytes);
 		if (ret)
 			goto out;
 
 		ret = prepare_pages(root, file, pages, num_pages,
 				    pos, first_index, last_index,
 				    write_bytes);
-		if (ret)
+		if (ret) {
+			btrfs_free_reserved_data_space(root, inode,
+						       write_bytes);
 			goto out;
+		}
 
 		ret = btrfs_copy_from_user(pos, num_pages,
 					   write_bytes, pages, buf);
 		if (ret) {
+			btrfs_free_reserved_data_space(root, inode,
+						       write_bytes);
 			btrfs_drop_pages(pages, num_pages);
 			goto out;
 		}
@@ -1111,8 +1116,11 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		ret = dirty_and_release_pages(NULL, root, file, pages,
 					      num_pages, pos, write_bytes);
 		btrfs_drop_pages(pages, num_pages);
-		if (ret)
+		if (ret) {
+			btrfs_free_reserved_data_space(root, inode,
+						       write_bytes);
 			goto out;
+		}
 
 		if (will_write) {
 			btrfs_fdatawrite_range(inode->i_mapping, pos,
@@ -1136,6 +1144,8 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	}
 out:
 	mutex_unlock(&inode->i_mutex);
+	if (ret)
+		err = ret;
 
 out_nolock:
 	kfree(pages);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3cee77a..7d4f948 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -102,34 +102,6 @@ static int btrfs_init_inode_security(struct inode *inode,  struct inode *dir)
 }
 
 /*
- * a very lame attempt at stopping writes when the FS is 85% full.  There
- * are countless ways this is incorrect, but it is better than nothing.
- */
-int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
-			   int for_del)
-{
-	u64 total;
-	u64 used;
-	u64 thresh;
-	int ret = 0;
-
-	spin_lock(&root->fs_info->delalloc_lock);
-	total = btrfs_super_total_bytes(&root->fs_info->super_copy);
-	used = btrfs_super_bytes_used(&root->fs_info->super_copy);
-	if (for_del)
-		thresh = total * 90;
-	else
-		thresh = total * 85;
-
-	do_div(thresh, 100);
-
-	if (used + root->fs_info->delalloc_bytes + num_required > thresh)
-		ret = -ENOSPC;
-	spin_unlock(&root->fs_info->delalloc_lock);
-	return ret;
-}
-
-/*
  * this does all the hard work for inserting an inline extent into
  * the btree.  The caller should have done a btrfs_drop_extents so that
  * no overlapping inline items exist in the btree
@@ -1190,6 +1162,7 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
 	 */
 	if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
 		struct btrfs_root *root = BTRFS_I(inode)->root;
+		btrfs_delalloc_reserve_space(root, inode, end - start + 1);
 		spin_lock(&root->fs_info->delalloc_lock);
 		BTRFS_I(inode)->delalloc_bytes += end - start + 1;
 		root->fs_info->delalloc_bytes += end - start + 1;
@@ -1223,9 +1196,12 @@ static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
 			       (unsigned long long)end - start + 1,
 			       (unsigned long long)
 			       root->fs_info->delalloc_bytes);
+			btrfs_delalloc_free_space(root, inode, (u64)-1);
 			root->fs_info->delalloc_bytes = 0;
 			BTRFS_I(inode)->delalloc_bytes = 0;
 		} else {
+			btrfs_delalloc_free_space(root, inode,
+						  end - start + 1);
 			root->fs_info->delalloc_bytes -= end - start + 1;
 			BTRFS_I(inode)->delalloc_bytes -= end - start + 1;
 		}
@@ -2245,10 +2221,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 
 	root = BTRFS_I(dir)->root;
 
-	ret = btrfs_check_free_space(root, 1, 1);
-	if (ret)
-		goto fail;
-
 	trans = btrfs_start_transaction(root, 1);
 
 	btrfs_set_trans_block_group(trans, dir);
@@ -2261,7 +2233,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 	nr = trans->blocks_used;
 
 	btrfs_end_transaction_throttle(trans, root);
-fail:
 	btrfs_btree_balance_dirty(root, nr);
 	return ret;
 }
@@ -2284,10 +2255,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 		return -ENOTEMPTY;
 	}
 
-	ret = btrfs_check_free_space(root, 1, 1);
-	if (ret)
-		goto fail;
-
 	trans = btrfs_start_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, dir);
 
@@ -2304,7 +2271,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 fail_trans:
 	nr = trans->blocks_used;
 	ret = btrfs_end_transaction_throttle(trans, root);
-fail:
 	btrfs_btree_balance_dirty(root, nr);
 
 	if (ret && !err)
@@ -2818,7 +2784,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
 	if (size <= hole_start)
 		return 0;
 
-	err = btrfs_check_free_space(root, 1, 0);
+	err = btrfs_check_metadata_free_space(root);
 	if (err)
 		return err;
 
@@ -3014,6 +2980,7 @@ static noinline void init_btrfs_i(struct inode *inode)
 	bi->last_trans = 0;
 	bi->logged_trans = 0;
 	bi->delalloc_bytes = 0;
+	bi->reserved_bytes = 0;
 	bi->disk_i_size = 0;
 	bi->flags = 0;
 	bi->index_cnt = (u64)-1;
@@ -3035,6 +3002,7 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
 	inode->i_ino = args->ino;
 	init_btrfs_i(inode);
 	BTRFS_I(inode)->root = args->root;
+	btrfs_set_inode_space_info(args->root, inode);
 	return 0;
 }
 
@@ -3455,6 +3423,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	BTRFS_I(inode)->index_cnt = 2;
 	BTRFS_I(inode)->root = root;
 	BTRFS_I(inode)->generation = trans->transid;
+	btrfs_set_inode_space_info(root, inode);
 
 	if (mode & S_IFDIR)
 		owner = 0;
@@ -3602,7 +3571,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 	if (!new_valid_dev(rdev))
 		return -EINVAL;
 
-	err = btrfs_check_free_space(root, 1, 0);
+	err = btrfs_check_metadata_free_space(root);
 	if (err)
 		goto fail;
 
@@ -3665,7 +3634,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 	u64 objectid;
 	u64 index = 0;
 
-	err = btrfs_check_free_space(root, 1, 0);
+	err = btrfs_check_metadata_free_space(root);
 	if (err)
 		goto fail;
 	trans = btrfs_start_transaction(root, 1);
@@ -3733,7 +3702,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 		return -ENOENT;
 
 	btrfs_inc_nlink(inode);
-	err = btrfs_check_free_space(root, 1, 0);
+	err = btrfs_check_metadata_free_space(root);
 	if (err)
 		goto fail;
 	err = btrfs_set_inode_index(dir, &index);
@@ -3779,7 +3748,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	u64 index = 0;
 	unsigned long nr = 1;
 
-	err = btrfs_check_free_space(root, 1, 0);
+	err = btrfs_check_metadata_free_space(root);
 	if (err)
 		goto out_unlock;
 
@@ -4336,7 +4305,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 	u64 page_start;
 	u64 page_end;
 
-	ret = btrfs_check_free_space(root, PAGE_CACHE_SIZE, 0);
+	ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
 	if (ret)
 		goto out;
 
@@ -4349,6 +4318,7 @@ again:
 
 	if ((page->mapping != inode->i_mapping) ||
 	    (page_start >= size)) {
+		btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
 		/* page got truncated out from underneath us */
 		goto out_unlock;
 	}
@@ -4631,7 +4601,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
 		return -EXDEV;
 
-	ret = btrfs_check_free_space(root, 1, 0);
+	ret = btrfs_check_metadata_free_space(root);
 	if (ret)
 		goto out_unlock;
 
@@ -4749,7 +4719,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
 		return -ENAMETOOLONG;
 
-	err = btrfs_check_free_space(root, 1, 0);
+	err = btrfs_check_metadata_free_space(root);
 	if (err)
 		goto out_fail;
 
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 988fdc8..bca729f 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -70,7 +70,7 @@ static noinline int create_subvol(struct btrfs_root *root,
 	u64 index = 0;
 	unsigned long nr = 1;
 
-	ret = btrfs_check_free_space(root, 1, 0);
+	ret = btrfs_check_metadata_free_space(root);
 	if (ret)
 		goto fail_commit;
 
@@ -203,7 +203,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
 	if (!root->ref_cows)
 		return -EINVAL;
 
-	ret = btrfs_check_free_space(root, 1, 0);
+	ret = btrfs_check_metadata_free_space(root);
 	if (ret)
 		goto fail_unlock;
 
@@ -374,7 +374,7 @@ static int btrfs_defrag_file(struct file *file)
 	unsigned long i;
 	int ret;
 
-	ret = btrfs_check_free_space(root, inode->i_size, 0);
+	ret = btrfs_check_data_free_space(root, inode, inode->i_size);
 	if (ret)
 		return -ENOSPC;
 
-- 
cgit v1.1


From 4e06bdd6cbd5105376e7caf4e683ed131e777389 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Fri, 20 Feb 2009 10:59:53 -0500
Subject: Btrfs: try committing transaction before returning ENOSPC

This fixes a problem where we could return -ENOSPC when we may actually have
plenty of space, the space is just pinned.  Instead of returning -ENOSPC
immediately, commit the transaction first and then try and do the allocation
again.

This patch also does chunk allocation for metadata if we pass the 80%
threshold for metadata space.  This will help with stack usage since the chunk
allocation will happen early on, instead of when the allocation is happening.

Signed-off-by: Josef Bacik <jbacik@redhat.com>
---
 fs/btrfs/extent-tree.c | 57 +++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 47 insertions(+), 10 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e11875e..6b5966a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2017,26 +2017,49 @@ int btrfs_check_metadata_free_space(struct btrfs_root *root)
 	struct btrfs_fs_info *info = root->fs_info;
 	struct btrfs_space_info *meta_sinfo;
 	u64 alloc_target, thresh;
+	int committed = 0, ret;
 
 	/* get the space info for where the metadata will live */
 	alloc_target = btrfs_get_alloc_profile(root, 0);
 	meta_sinfo = __find_space_info(info, alloc_target);
 
-	/*
-	 * if the metadata area isn't maxed out then there is no sense in
-	 * checking how much is used, since we can always allocate a new chunk
-	 */
-	if (!meta_sinfo->full)
-		return 0;
-
+again:
 	spin_lock(&meta_sinfo->lock);
-	thresh = meta_sinfo->total_bytes * 95;
+	if (!meta_sinfo->full)
+		thresh = meta_sinfo->total_bytes * 80;
+	else
+		thresh = meta_sinfo->total_bytes * 95;
 
 	do_div(thresh, 100);
 
 	if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
 	    meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) {
+		struct btrfs_trans_handle *trans;
+		if (!meta_sinfo->full) {
+			meta_sinfo->force_alloc = 1;
+			spin_unlock(&meta_sinfo->lock);
+
+			trans = btrfs_start_transaction(root, 1);
+			if (!trans)
+				return -ENOMEM;
+
+			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+					     2 * 1024 * 1024, alloc_target, 0);
+			btrfs_end_transaction(trans, root);
+			goto again;
+		}
 		spin_unlock(&meta_sinfo->lock);
+
+		if (!committed) {
+			committed = 1;
+			trans = btrfs_join_transaction(root, 1);
+			if (!trans)
+				return -ENOMEM;
+			ret = btrfs_commit_transaction(trans, root);
+			if (ret)
+				return ret;
+			goto again;
+		}
 		return -ENOSPC;
 	}
 	spin_unlock(&meta_sinfo->lock);
@@ -2052,7 +2075,7 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
 				u64 bytes)
 {
 	struct btrfs_space_info *data_sinfo;
-	int ret = 0;
+	int ret = 0, committed = 0;
 
 	/* make sure bytes are sectorsize aligned */
 	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
@@ -2065,13 +2088,14 @@ again:
 	    data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
 	    data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
 	    data_sinfo->bytes_may_use < bytes) {
+		struct btrfs_trans_handle *trans;
+
 		/*
 		 * if we don't have enough free bytes in this space then we need
 		 * to alloc a new chunk.
 		 */
 		if (!data_sinfo->full) {
 			u64 alloc_target;
-			struct btrfs_trans_handle *trans;
 
 			data_sinfo->force_alloc = 1;
 			spin_unlock(&data_sinfo->lock);
@@ -2090,6 +2114,19 @@ again:
 			goto again;
 		}
 		spin_unlock(&data_sinfo->lock);
+
+		/* commit the current transaction and try again */
+		if (!committed) {
+			committed = 1;
+			trans = btrfs_join_transaction(root, 1);
+			if (!trans)
+				return -ENOMEM;
+			ret = btrfs_commit_transaction(trans, root);
+			if (ret)
+				return ret;
+			goto again;
+		}
+
 		printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
 		       ", %llu bytes_used, %llu bytes_reserved, "
 		       "%llu bytes_pinned, %llu bytes_readonly, %llu may use"
-- 
cgit v1.1


From b9447ef80bd301b932ac4d85c9622e929de5fd62 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 9 Mar 2009 11:45:38 -0400
Subject: Btrfs: fix spinlock assertions on UP systems

btrfs_tree_locked was being used to make sure a given extent_buffer was
properly locked in a few places.  But, it wasn't correct for UP compiled
kernels.

This switches it to using assert_spin_locked instead, and renames it to
btrfs_assert_tree_locked to better reflect how it was really being used.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 10 +++++-----
 fs/btrfs/disk-io.c     |  4 ++--
 fs/btrfs/extent-tree.c |  4 ++--
 fs/btrfs/locking.c     |  6 +++---
 fs/btrfs/locking.h     |  2 +-
 5 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 42491d7..37f31b5 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -277,7 +277,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	if (*cow_ret == buf)
 		unlock_orig = 1;
 
-	WARN_ON(!btrfs_tree_locked(buf));
+	btrfs_assert_tree_locked(buf);
 
 	if (parent)
 		parent_start = parent->start;
@@ -2365,7 +2365,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (slot >= btrfs_header_nritems(upper) - 1)
 		return 1;
 
-	WARN_ON(!btrfs_tree_locked(path->nodes[1]));
+	btrfs_assert_tree_locked(path->nodes[1]);
 
 	right = read_node_slot(root, upper, slot + 1);
 	btrfs_tree_lock(right);
@@ -2562,7 +2562,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (right_nritems == 0)
 		return 1;
 
-	WARN_ON(!btrfs_tree_locked(path->nodes[1]));
+	btrfs_assert_tree_locked(path->nodes[1]);
 
 	left = read_node_slot(root, path->nodes[1], slot - 1);
 	btrfs_tree_lock(left);
@@ -4101,7 +4101,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 
 		next = read_node_slot(root, c, slot);
 		if (!path->skip_locking) {
-			WARN_ON(!btrfs_tree_locked(c));
+			btrfs_assert_tree_locked(c);
 			btrfs_tree_lock(next);
 			btrfs_set_lock_blocking(next);
 		}
@@ -4126,7 +4126,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 			reada_for_search(root, path, level, slot, 0);
 		next = read_node_slot(root, next, 0);
 		if (!path->skip_locking) {
-			WARN_ON(!btrfs_tree_locked(path->nodes[level]));
+			btrfs_assert_tree_locked(path->nodes[level]);
 			btrfs_tree_lock(next);
 			btrfs_set_lock_blocking(next);
 		}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index adda739..3e18175 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -857,7 +857,7 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	struct inode *btree_inode = root->fs_info->btree_inode;
 	if (btrfs_header_generation(buf) ==
 	    root->fs_info->running_transaction->transid) {
-		WARN_ON(!btrfs_tree_locked(buf));
+		btrfs_assert_tree_locked(buf);
 
 		/* ugh, clear_extent_buffer_dirty can be expensive */
 		btrfs_set_lock_blocking(buf);
@@ -2361,7 +2361,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 
 	btrfs_set_lock_blocking(buf);
 
-	WARN_ON(!btrfs_tree_locked(buf));
+	btrfs_assert_tree_locked(buf);
 	if (transid != root->fs_info->generation) {
 		printk(KERN_CRIT "btrfs transid mismatch buffer %llu, "
 		       "found %llu running %llu\n",
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 6b5966a..9abf81f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4418,13 +4418,13 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 
-	BUG_ON(!btrfs_tree_locked(parent));
+	btrfs_assert_tree_locked(parent);
 	parent_level = btrfs_header_level(parent);
 	extent_buffer_get(parent);
 	path->nodes[parent_level] = parent;
 	path->slots[parent_level] = btrfs_header_nritems(parent);
 
-	BUG_ON(!btrfs_tree_locked(node));
+	btrfs_assert_tree_locked(node);
 	level = btrfs_header_level(node);
 	extent_buffer_get(node);
 	path->nodes[level] = node;
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 85506c4..47b0a88 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -220,8 +220,8 @@ int btrfs_tree_unlock(struct extent_buffer *eb)
 	return 0;
 }
 
-int btrfs_tree_locked(struct extent_buffer *eb)
+void btrfs_assert_tree_locked(struct extent_buffer *eb)
 {
-	return test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags) ||
-			spin_is_locked(&eb->lock);
+	if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+		assert_spin_locked(&eb->lock);
 }
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 6bb0afb..6c4ce45 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -21,11 +21,11 @@
 
 int btrfs_tree_lock(struct extent_buffer *eb);
 int btrfs_tree_unlock(struct extent_buffer *eb);
-int btrfs_tree_locked(struct extent_buffer *eb);
 
 int btrfs_try_tree_lock(struct extent_buffer *eb);
 int btrfs_try_spin_lock(struct extent_buffer *eb);
 
 void btrfs_set_lock_blocking(struct extent_buffer *eb);
 void btrfs_clear_lock_blocking(struct extent_buffer *eb);
+void btrfs_assert_tree_locked(struct extent_buffer *eb);
 #endif
-- 
cgit v1.1


From 4184ea7f908d95f329febc3665cf66da8568b467 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 10 Mar 2009 12:39:20 -0400
Subject: Btrfs: Fix locking around adding new space_info

Storage allocated to different raid levels in btrfs is tracked by
a btrfs_space_info structure, and all of the current space_infos are
collected into a list_head.

Most filesystems have 3 or 4 of these structs total, and the list is
only changed when new raid levels are added or at unmount time.

This commit adds rcu locking on the list head, and properly frees
things at unmount time.  It also clears the space_info->full flag
whenever new space is added to the FS.

The locking for the space info list goes like this:

reads: protected by rcu_read_lock()
writes: protected by the chunk_mutex

At unmount time we don't need special locking because all the readers
are gone.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  9 +++++++++
 fs/btrfs/extent-tree.c | 45 ++++++++++++++++++++++++++++++++++++++++++---
 fs/btrfs/volumes.c     |  2 ++
 3 files changed, 53 insertions(+), 3 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 82491ba..5e1d4e3 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -784,7 +784,14 @@ struct btrfs_fs_info {
 	struct list_head dirty_cowonly_roots;
 
 	struct btrfs_fs_devices *fs_devices;
+
+	/*
+	 * the space_info list is almost entirely read only.  It only changes
+	 * when we add a new raid type to the FS, and that happens
+	 * very rarely.  RCU is used to protect it.
+	 */
 	struct list_head space_info;
+
 	spinlock_t delalloc_lock;
 	spinlock_t new_trans_lock;
 	u64 delalloc_bytes;
@@ -1797,6 +1804,8 @@ int btrfs_cleanup_reloc_trees(struct btrfs_root *root);
 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
+void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
+
 int btrfs_check_metadata_free_space(struct btrfs_root *root);
 int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
 				u64 bytes);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9abf81f..fefe83a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -20,6 +20,7 @@
 #include <linux/writeback.h>
 #include <linux/blkdev.h>
 #include <linux/sort.h>
+#include <linux/rcupdate.h>
 #include "compat.h"
 #include "hash.h"
 #include "crc32c.h"
@@ -330,13 +331,33 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
 {
 	struct list_head *head = &info->space_info;
 	struct btrfs_space_info *found;
-	list_for_each_entry(found, head, list) {
-		if (found->flags == flags)
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(found, head, list) {
+		if (found->flags == flags) {
+			rcu_read_unlock();
 			return found;
+		}
 	}
+	rcu_read_unlock();
 	return NULL;
 }
 
+/*
+ * after adding space to the filesystem, we need to clear the full flags
+ * on all the space infos.
+ */
+void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
+{
+	struct list_head *head = &info->space_info;
+	struct btrfs_space_info *found;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(found, head, list)
+		found->full = 0;
+	rcu_read_unlock();
+}
+
 static u64 div_factor(u64 num, int factor)
 {
 	if (factor == 10)
@@ -1903,7 +1924,6 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	if (!found)
 		return -ENOMEM;
 
-	list_add(&found->list, &info->space_info);
 	INIT_LIST_HEAD(&found->block_groups);
 	init_rwsem(&found->groups_sem);
 	spin_lock_init(&found->lock);
@@ -1917,6 +1937,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	found->full = 0;
 	found->force_alloc = 0;
 	*space_info = found;
+	list_add_rcu(&found->list, &info->space_info);
 	return 0;
 }
 
@@ -6320,6 +6341,7 @@ out:
 int btrfs_free_block_groups(struct btrfs_fs_info *info)
 {
 	struct btrfs_block_group_cache *block_group;
+	struct btrfs_space_info *space_info;
 	struct rb_node *n;
 
 	spin_lock(&info->block_group_cache_lock);
@@ -6341,6 +6363,23 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 		spin_lock(&info->block_group_cache_lock);
 	}
 	spin_unlock(&info->block_group_cache_lock);
+
+	/* now that all the block groups are freed, go through and
+	 * free all the space_info structs.  This is only called during
+	 * the final stages of unmount, and so we know nobody is
+	 * using them.  We call synchronize_rcu() once before we start,
+	 * just to be on the safe side.
+	 */
+	synchronize_rcu();
+
+	while(!list_empty(&info->space_info)) {
+		space_info = list_entry(info->space_info.next,
+					struct btrfs_space_info,
+					list);
+
+		list_del(&space_info->list);
+		kfree(space_info);
+	}
 	return 0;
 }
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 1316139..7aa3810 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1459,6 +1459,8 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
 	device->fs_devices->total_rw_bytes += diff;
 
 	device->total_bytes = new_size;
+	btrfs_clear_space_info_full(device->dev_root->fs_info);
+
 	return btrfs_update_device(trans, device);
 }
 
-- 
cgit v1.1


From 913d952eb573c3d1f7487e83b5590e13e7cae2bd Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 10 Mar 2009 13:17:18 -0400
Subject: Btrfs: Clear space_info full when adding new devices

The full flag on the space info structs tells the allocator not to try
and allocate more chunks because the devices in the FS are fully allocated.

When more devices are added, we need to clear the full flag so the allocator
knows it has more space available.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 7aa3810..dd06e18 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1374,6 +1374,12 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 		ret = btrfs_add_device(trans, root, device);
 	}
 
+	/*
+	 * we've got more storage, clear any full flags on the space
+	 * infos
+	 */
+	btrfs_clear_space_info_full(root->fs_info);
+
 	unlock_chunks(root);
 	btrfs_commit_transaction(trans, root);
 
-- 
cgit v1.1