From 26bdef541d26fd6a5ddffdf8949ace22f94f809f Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Wed, 16 Nov 2011 11:28:01 +0300
Subject: btrfs scrub: handle -ENOMEM from init_ipath()

init_ipath() can return an ERR_PTR(-ENOMEM).

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
---
 fs/btrfs/scrub.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index fab420d..c27bcb6 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -256,6 +256,11 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
 	btrfs_release_path(swarn->path);
 
 	ipath = init_ipath(4096, local_root, swarn->path);
+	if (IS_ERR(ipath)) {
+		ret = PTR_ERR(ipath);
+		ipath = NULL;
+		goto err;
+	}
 	ret = paths_from_inode(inum, ipath);
 
 	if (ret < 0)
-- 
cgit v1.1


From aa38a711a893accf5b5192f3d705a120deaa81e0 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Fri, 18 Nov 2011 17:43:00 +0800
Subject: Btrfs: fix deadlock on metadata reservation when evicting a inode

When I ran the xfstests, I found the test tasks was blocked on meta-data
reservation.

By debugging, I found the reason of this bug:
   start transaction
        |
	v
   reserve meta-data space
	|
	v
   flush delay allocation -> iput inode -> evict inode
	^					|
	|					v
   wait for delay allocation flush <- reserve meta-data space

And besides that, the flush on evicting inode will block the thread, which
is reclaiming the memory, and make oom happen easily.

Fix this bug by skipping the flush step when evicting inode.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 fs/btrfs/ctree.h       |  3 +++
 fs/btrfs/extent-tree.c | 22 ++++++++++++++++++----
 fs/btrfs/inode.c       |  2 +-
 3 files changed, 22 insertions(+), 5 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 04a5dfc..50634abe 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2369,6 +2369,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root,
 int btrfs_block_rsv_refill(struct btrfs_root *root,
 			  struct btrfs_block_rsv *block_rsv,
 			  u64 min_reserved);
+int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
+				   struct btrfs_block_rsv *block_rsv,
+				   u64 min_reserved);
 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
 			    struct btrfs_block_rsv *dst_rsv,
 			    u64 num_bytes);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5d86877..b7e5f68 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3887,9 +3887,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root,
 	return ret;
 }
 
-int btrfs_block_rsv_refill(struct btrfs_root *root,
-			  struct btrfs_block_rsv *block_rsv,
-			  u64 min_reserved)
+static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
+					   struct btrfs_block_rsv *block_rsv,
+					   u64 min_reserved, int flush)
 {
 	u64 num_bytes = 0;
 	int ret = -ENOSPC;
@@ -3908,7 +3908,7 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
 	if (!ret)
 		return 0;
 
-	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1);
+	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
 	if (!ret) {
 		block_rsv_add_bytes(block_rsv, num_bytes, 0);
 		return 0;
@@ -3917,6 +3917,20 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
 	return ret;
 }
 
+int btrfs_block_rsv_refill(struct btrfs_root *root,
+			   struct btrfs_block_rsv *block_rsv,
+			   u64 min_reserved)
+{
+	return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1);
+}
+
+int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
+				   struct btrfs_block_rsv *block_rsv,
+				   u64 min_reserved)
+{
+	return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0);
+}
+
 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
 			    struct btrfs_block_rsv *dst_rsv,
 			    u64 num_bytes)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8ad26b1..c5ccec2 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3490,7 +3490,7 @@ void btrfs_evict_inode(struct inode *inode)
 	 * doing the truncate.
 	 */
 	while (1) {
-		ret = btrfs_block_rsv_refill(root, rsv, min_size);
+		ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size);
 
 		/*
 		 * Try and steal from the global reserve since we will
-- 
cgit v1.1


From ece7d20e8be6730fbb29f4550de6b19b1a3a9387 Mon Sep 17 00:00:00 2001
From: Mike Fleetwood <mike.fleetwood@googlemail.com>
Date: Fri, 18 Nov 2011 18:55:01 +0000
Subject: Btrfs: Don't error on resizing FS to same size

It seems overly harsh to fail a resize of a btrfs file system to the
same size when a shrink or grow would succeed.  User app GParted trips
over this error.  Allow it by bypassing the shrink or grow operation.

Signed-off-by: Mike Fleetwood <mike.fleetwood@googlemail.com>
---
 fs/btrfs/ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a90e749..72d4616 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1278,7 +1278,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
 		}
 		ret = btrfs_grow_device(trans, device, new_size);
 		btrfs_commit_transaction(trans, root);
-	} else {
+	} else if (new_size < old_size) {
 		ret = btrfs_shrink_device(device, new_size);
 	}
 
-- 
cgit v1.1


From b772a86ea6d932ac29d5e50e67c977653c832f8a Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 28 Nov 2011 16:43:00 +0800
Subject: Btrfs: fix oops when calling statfs on readonly device

To reproduce this bug:

  # dd if=/dev/zero of=img bs=1M count=256
  # mkfs.btrfs img
  # losetup -r /dev/loop1 img
  # mount /dev/loop1 /mnt
  OOPS!!

It triggered BUG_ON(!nr_devices) in btrfs_calc_avail_data_space().

To fix this, instead of checking write-only devices, we check all open
deivces:

  # df -h /dev/loop1
  Filesystem            Size  Used Avail Use% Mounted on
  /dev/loop1            250M   28K  238M   1% /mnt

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/super.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8bd9d6d..1a3ce9e 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1083,7 +1083,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
 	int i = 0, nr_devices;
 	int ret;
 
-	nr_devices = fs_info->fs_devices->rw_devices;
+	nr_devices = fs_info->fs_devices->open_devices;
 	BUG_ON(!nr_devices);
 
 	devices_info = kmalloc(sizeof(*devices_info) * nr_devices,
@@ -1105,8 +1105,8 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
 	else
 		min_stripe_size = BTRFS_STRIPE_LEN;
 
-	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
-		if (!device->in_fs_metadata)
+	list_for_each_entry(device, &fs_devices->devices, dev_list) {
+		if (!device->in_fs_metadata || !device->bdev)
 			continue;
 
 		avail_space = device->total_bytes - device->bytes_used;
-- 
cgit v1.1


From f2d0f6765d6332f9be732965a0c6f3b8a55082b4 Mon Sep 17 00:00:00 2001
From: Alexandre Oliva <oliva@lsd.ic.unicamp.br>
Date: Mon, 28 Nov 2011 12:04:43 -0200
Subject: Btrfs: initialize new bitmaps' list

We're failing to create clusters with bitmaps because
setup_cluster_no_bitmap checks that the list is empty before inserting
the bitmap entry in the list for setup_cluster_bitmap, but the list
field is only initialized when it is restored from the on-disk free
space cache, or when it is written out to disk.

Besides a potential race condition due to the multiple use of the list
field, filesystem performance severely degrades over time: as we use
up all non-bitmap free extents, the try-to-set-up-cluster dance is
done at every metadata block allocation.  For every block group, we
fail to set up a cluster, and after failing on them all up to twice,
we fall back to the much slower unclustered allocation.

To make matters worse, before the unclustered allocation, we try to
create new block groups until we reach the 1% threshold, which
introduces additional bitmaps and thus block groups that we'll iterate
over at each metadata block request.
---
 fs/btrfs/free-space-cache.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 6e5b7e4..ff179b1 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1470,6 +1470,7 @@ static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
 {
 	info->offset = offset_to_bitmap(ctl, offset);
 	info->bytes = 0;
+	INIT_LIST_HEAD(&info->list);
 	link_free_space(ctl, info);
 	ctl->total_bitmaps++;
 
-- 
cgit v1.1


From b78d09bceb524ee6481c21b77bda22d766b10e6a Mon Sep 17 00:00:00 2001
From: Alexandre Oliva <oliva@lsd.ic.unicamp.br>
Date: Wed, 30 Nov 2011 13:43:00 -0500
Subject: Btrfs: reset cluster's max_size when creating bitmap

The field that indicates the size of the largest contiguous chunk of
free space in the cluster is not initialized when setting up bitmaps,
it's only increased when we find a larger contiguous chunk.  We end up
retaining a larger value than appropriate for highly-fragmented
clusters, which may cause pointless searches for large contiguous
groups, and even cause clusters that do not meet the density
requirements to be set up.

Signed-off-by: Alexandre Oliva <oliva@lsd.ic.unicamp.br>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/free-space-cache.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index ff179b1..ec23d43 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -2320,6 +2320,7 @@ again:
 
 	if (!found) {
 		start = i;
+		cluster->max_size = 0;
 		found = true;
 	}
 
-- 
cgit v1.1


From 1b22bad779be7fe07242be04749ec969164528b8 Mon Sep 17 00:00:00 2001
From: Alexandre Oliva <oliva@lsd.ic.unicamp.br>
Date: Wed, 30 Nov 2011 13:43:00 -0500
Subject: Btrfs: start search for new cluster at the beginning

Instead of starting at zero (offset is always zero), request a cluster
starting at search_start, that denotes the beginning of the current
block group.

Signed-off-by: Alexandre Oliva <oliva@lsd.ic.unicamp.br>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b7e5f68..97c1206 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5301,10 +5301,8 @@ alloc:
 			spin_lock(&last_ptr->refill_lock);
 			if (last_ptr->block_group &&
 			    (last_ptr->block_group->ro ||
-			    !block_group_bits(last_ptr->block_group, data))) {
-				offset = 0;
+			    !block_group_bits(last_ptr->block_group, data)))
 				goto refill_cluster;
-			}
 
 			offset = btrfs_alloc_from_cluster(block_group, last_ptr,
 						 num_bytes, search_start);
@@ -5355,7 +5353,7 @@ refill_cluster:
 			/* allocate a cluster in this block group */
 			ret = btrfs_find_space_cluster(trans, root,
 					       block_group, last_ptr,
-					       offset, num_bytes,
+					       search_start, num_bytes,
 					       empty_cluster + empty_size);
 			if (ret == 0) {
 				/*
-- 
cgit v1.1


From 425d83156ca27f74e2cc3f370138038c3c8947f8 Mon Sep 17 00:00:00 2001
From: Alexandre Oliva <oliva@lsd.ic.unicamp.br>
Date: Wed, 30 Nov 2011 13:43:00 -0500
Subject: Btrfs: skip block groups without enough space for a cluster

We test whether a block group has enough free space to hold the
requested block, but when we're doing clustered allocation, we can
save some cycles by testing whether it has enough room for the cluster
upfront, otherwise we end up attempting to set up a cluster and
failing.  Only in the NO_EMPTY_SIZE loop do we attempt an unclustered
allocation, and by then we'll have zeroed the cluster size, so this
patch won't stop us from using the block group as a last resort.

Signed-off-by: Alexandre Oliva <oliva@lsd.ic.unicamp.br>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 97c1206..71c8e70 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5278,7 +5278,7 @@ alloc:
 		spin_lock(&block_group->free_space_ctl->tree_lock);
 		if (cached &&
 		    block_group->free_space_ctl->free_space <
-		    num_bytes + empty_size) {
+		    num_bytes + empty_cluster + empty_size) {
 			spin_unlock(&block_group->free_space_ctl->tree_lock);
 			goto loop;
 		}
-- 
cgit v1.1


From be064d113906f04ea13088a8260e1e68ae0a4050 Mon Sep 17 00:00:00 2001
From: Alexandre Oliva <oliva@lsd.ic.unicamp.br>
Date: Wed, 30 Nov 2011 13:43:00 -0500
Subject: Btrfs: skip allocation attempt from empty cluster

If we don't have a cluster, don't bother trying to allocate from it,
jumping right away to the attempt to allocate a new cluster.

Signed-off-by: Alexandre Oliva <oliva@lsd.ic.unicamp.br>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 71c8e70..813c6bb 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5299,9 +5299,9 @@ alloc:
 			 * people trying to start a new cluster
 			 */
 			spin_lock(&last_ptr->refill_lock);
-			if (last_ptr->block_group &&
-			    (last_ptr->block_group->ro ||
-			    !block_group_bits(last_ptr->block_group, data)))
+			if (!last_ptr->block_group ||
+			    last_ptr->block_group->ro ||
+			    !block_group_bits(last_ptr->block_group, data))
 				goto refill_cluster;
 
 			offset = btrfs_alloc_from_cluster(block_group, last_ptr,
-- 
cgit v1.1


From f4a8e6563ea5366f563cb741a27fe90c5fa7f0fc Mon Sep 17 00:00:00 2001
From: Jan Schmidt <list.btrfs@jan-o-sch.net>
Date: Thu, 1 Dec 2011 09:30:36 -0500
Subject: Btrfs: fix meta data raid-repair merge problem

Commit 4a54c8c16 introduced raid-repair, killing the individual
readpage_io_failed_hook entries from inode.c and disk-io.c. Commit
4bb31e92 introduced new readahead code, adding a readpage_io_failed_hook to
disk-io.c.

The raid-repair commit had logic to disable raid-repair, if
readpage_io_failed_hook is set. Thus, the readahead commit effectively
disabled raid-repair for meta data.

This commit changes the logic to always attempt raid-repair when needed and
call the readpage_io_failed_hook in case raid-repair fails. This is much
more straight forward and should have been like that from the beginning.

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Reported-by: Stefan Behrens <sbehrens@giantdisaster.de>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 9472d3d..be1bf62 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2287,14 +2287,20 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 		if (!uptodate) {
 			int failed_mirror;
 			failed_mirror = (int)(unsigned long)bio->bi_bdev;
-			if (tree->ops && tree->ops->readpage_io_failed_hook)
-				ret = tree->ops->readpage_io_failed_hook(
-						bio, page, start, end,
-						failed_mirror, state);
-			else
-				ret = bio_readpage_error(bio, page, start, end,
-							 failed_mirror, NULL);
+			/*
+			 * The generic bio_readpage_error handles errors the
+			 * following way: If possible, new read requests are
+			 * created and submitted and will end up in
+			 * end_bio_extent_readpage as well (if we're lucky, not
+			 * in the !uptodate case). In that case it returns 0 and
+			 * we just go on with the next page in our bio. If it
+			 * can't handle the error it will return -EIO and we
+			 * remain responsible for that page.
+			 */
+			ret = bio_readpage_error(bio, page, start, end,
+							failed_mirror, NULL);
 			if (ret == 0) {
+error_handled:
 				uptodate =
 					test_bit(BIO_UPTODATE, &bio->bi_flags);
 				if (err)
@@ -2302,6 +2308,13 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 				uncache_state(&cached);
 				continue;
 			}
+			if (tree->ops && tree->ops->readpage_io_failed_hook) {
+				ret = tree->ops->readpage_io_failed_hook(
+							bio, page, start, end,
+							failed_mirror, state);
+				if (ret == 0)
+					goto error_handled;
+			}
 		}
 
 		if (uptodate) {
-- 
cgit v1.1


From 062c05c46bd4358aad7a0e0cb5ffeb98ab935286 Mon Sep 17 00:00:00 2001
From: Alexandre Oliva <oliva@lsd.ic.unicamp.br>
Date: Wed, 7 Dec 2011 19:50:42 -0500
Subject: Btrfs: try to allocate from cluster even at LOOP_NO_EMPTY_SIZE

If we reach LOOP_NO_EMPTY_SIZE, we won't even try to use a cluster that
others might have set up.  Odds are that there won't be one, but if
someone else succeeded in setting it up, we might as well use it, even
if we don't try to set up a cluster again.

Signed-off-by: Alexandre Oliva <oliva@lsd.ic.unicamp.br>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 26 ++++++++++++++++++--------
 1 file changed, 18 insertions(+), 8 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 813c6bb..db0b23b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5285,15 +5285,10 @@ alloc:
 		spin_unlock(&block_group->free_space_ctl->tree_lock);
 
 		/*
-		 * Ok we want to try and use the cluster allocator, so lets look
-		 * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will
-		 * have tried the cluster allocator plenty of times at this
-		 * point and not have found anything, so we are likely way too
-		 * fragmented for the clustering stuff to find anything, so lets
-		 * just skip it and let the allocator find whatever block it can
-		 * find
+		 * Ok we want to try and use the cluster allocator, so
+		 * lets look there
 		 */
-		if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) {
+		if (last_ptr) {
 			/*
 			 * the refill lock keeps out other
 			 * people trying to start a new cluster
@@ -5342,6 +5337,20 @@ alloc:
 			}
 			spin_unlock(&last_ptr->lock);
 refill_cluster:
+			/* If we are on LOOP_NO_EMPTY_SIZE, we can't
+			 * set up a new clusters, so lets just skip it
+			 * and let the allocator find whatever block
+			 * it can find.  If we reach this point, we
+			 * will have tried the cluster allocator
+			 * plenty of times and not have found
+			 * anything, so we are likely way too
+			 * fragmented for the clustering stuff to find
+			 * anything.  */
+			if (loop >= LOOP_NO_EMPTY_SIZE) {
+				spin_unlock(&last_ptr->refill_lock);
+				goto unclustered_alloc;
+			}
+
 			/*
 			 * this cluster didn't work out, free it and
 			 * start over
@@ -5389,6 +5398,7 @@ refill_cluster:
 			goto loop;
 		}
 
+unclustered_alloc:
 		offset = btrfs_find_space_for_alloc(block_group, search_start,
 						    num_bytes, empty_size);
 		/*
-- 
cgit v1.1


From 274bd4fb3ed6b72c1d77ef8850511f09fc6b8e4d Mon Sep 17 00:00:00 2001
From: Alexandre Oliva <oliva@lsd.ic.unicamp.br>
Date: Wed, 7 Dec 2011 20:08:40 -0500
Subject: Btrfs: try cluster but don't advance in search list

When we find an existing cluster, we switch to its block group as the
current block group, possibly skipping multiple blocks in the process.
Furthermore, under heavy contention, multiple threads may fail to
allocate from a cluster and then release just-created clusters just to
proceed to create new ones in a different block group.

This patch tries to allocate from an existing cluster regardless of its
block group, and doesn't switch to that group, instead proceeding to
try to allocate a cluster from the group it was iterating before the
attempt.

Signed-off-by: Alexandre Oliva <oliva@lsd.ic.unicamp.br>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 74 +++++++++++++++++++++-----------------------------
 1 file changed, 31 insertions(+), 43 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index db0b23b..05e1386 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5106,11 +5106,11 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_root *root = orig_root->fs_info->extent_root;
 	struct btrfs_free_cluster *last_ptr = NULL;
 	struct btrfs_block_group_cache *block_group = NULL;
+	struct btrfs_block_group_cache *used_block_group;
 	int empty_cluster = 2 * 1024 * 1024;
 	int allowed_chunk_alloc = 0;
 	int done_chunk_alloc = 0;
 	struct btrfs_space_info *space_info;
-	int last_ptr_loop = 0;
 	int loop = 0;
 	int index = 0;
 	int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
@@ -5172,6 +5172,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 ideal_cache:
 		block_group = btrfs_lookup_block_group(root->fs_info,
 						       search_start);
+		used_block_group = block_group;
 		/*
 		 * we don't want to use the block group if it doesn't match our
 		 * allocation bits, or if its not cached.
@@ -5209,6 +5210,7 @@ search:
 		u64 offset;
 		int cached;
 
+		used_block_group = block_group;
 		btrfs_get_block_group(block_group);
 		search_start = block_group->key.objectid;
 
@@ -5294,49 +5296,33 @@ alloc:
 			 * people trying to start a new cluster
 			 */
 			spin_lock(&last_ptr->refill_lock);
-			if (!last_ptr->block_group ||
-			    last_ptr->block_group->ro ||
-			    !block_group_bits(last_ptr->block_group, data))
+			used_block_group = last_ptr->block_group;
+			if (used_block_group != block_group &&
+			    (!used_block_group ||
+			     used_block_group->ro ||
+			     !block_group_bits(used_block_group, data))) {
+				used_block_group = block_group;
 				goto refill_cluster;
+			}
+
+			if (used_block_group != block_group)
+				btrfs_get_block_group(used_block_group);
 
-			offset = btrfs_alloc_from_cluster(block_group, last_ptr,
-						 num_bytes, search_start);
+			offset = btrfs_alloc_from_cluster(used_block_group,
+			  last_ptr, num_bytes, used_block_group->key.objectid);
 			if (offset) {
 				/* we have a block, we're done */
 				spin_unlock(&last_ptr->refill_lock);
 				goto checks;
 			}
 
-			spin_lock(&last_ptr->lock);
-			/*
-			 * whoops, this cluster doesn't actually point to
-			 * this block group.  Get a ref on the block
-			 * group is does point to and try again
-			 */
-			if (!last_ptr_loop && last_ptr->block_group &&
-			    last_ptr->block_group != block_group &&
-			    index <=
-				 get_block_group_index(last_ptr->block_group)) {
-
-				btrfs_put_block_group(block_group);
-				block_group = last_ptr->block_group;
-				btrfs_get_block_group(block_group);
-				spin_unlock(&last_ptr->lock);
-				spin_unlock(&last_ptr->refill_lock);
-
-				last_ptr_loop = 1;
-				search_start = block_group->key.objectid;
-				/*
-				 * we know this block group is properly
-				 * in the list because
-				 * btrfs_remove_block_group, drops the
-				 * cluster before it removes the block
-				 * group from the list
-				 */
-				goto have_block_group;
+			WARN_ON(last_ptr->block_group != used_block_group);
+			if (used_block_group != block_group) {
+				btrfs_put_block_group(used_block_group);
+				used_block_group = block_group;
 			}
-			spin_unlock(&last_ptr->lock);
 refill_cluster:
+			BUG_ON(used_block_group != block_group);
 			/* If we are on LOOP_NO_EMPTY_SIZE, we can't
 			 * set up a new clusters, so lets just skip it
 			 * and let the allocator find whatever block
@@ -5357,8 +5343,6 @@ refill_cluster:
 			 */
 			btrfs_return_cluster_to_free_space(NULL, last_ptr);
 
-			last_ptr_loop = 0;
-
 			/* allocate a cluster in this block group */
 			ret = btrfs_find_space_cluster(trans, root,
 					       block_group, last_ptr,
@@ -5425,14 +5409,14 @@ checks:
 		search_start = stripe_align(root, offset);
 		/* move on to the next group */
 		if (search_start + num_bytes >= search_end) {
-			btrfs_add_free_space(block_group, offset, num_bytes);
+			btrfs_add_free_space(used_block_group, offset, num_bytes);
 			goto loop;
 		}
 
 		/* move on to the next group */
 		if (search_start + num_bytes >
-		    block_group->key.objectid + block_group->key.offset) {
-			btrfs_add_free_space(block_group, offset, num_bytes);
+		    used_block_group->key.objectid + used_block_group->key.offset) {
+			btrfs_add_free_space(used_block_group, offset, num_bytes);
 			goto loop;
 		}
 
@@ -5440,14 +5424,14 @@ checks:
 		ins->offset = num_bytes;
 
 		if (offset < search_start)
-			btrfs_add_free_space(block_group, offset,
+			btrfs_add_free_space(used_block_group, offset,
 					     search_start - offset);
 		BUG_ON(offset > search_start);
 
-		ret = btrfs_update_reserved_bytes(block_group, num_bytes,
+		ret = btrfs_update_reserved_bytes(used_block_group, num_bytes,
 						  alloc_type);
 		if (ret == -EAGAIN) {
-			btrfs_add_free_space(block_group, offset, num_bytes);
+			btrfs_add_free_space(used_block_group, offset, num_bytes);
 			goto loop;
 		}
 
@@ -5456,15 +5440,19 @@ checks:
 		ins->offset = num_bytes;
 
 		if (offset < search_start)
-			btrfs_add_free_space(block_group, offset,
+			btrfs_add_free_space(used_block_group, offset,
 					     search_start - offset);
 		BUG_ON(offset > search_start);
+		if (used_block_group != block_group)
+			btrfs_put_block_group(used_block_group);
 		btrfs_put_block_group(block_group);
 		break;
 loop:
 		failed_cluster_refill = false;
 		failed_alloc = false;
 		BUG_ON(index != get_block_group_index(block_group));
+		if (used_block_group != block_group)
+			btrfs_put_block_group(used_block_group);
 		btrfs_put_block_group(block_group);
 	}
 	up_read(&space_info->groups_sem);
-- 
cgit v1.1


From a5d16333612718569ffd26064270e535cb9c3928 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 7 Dec 2011 20:08:40 -0500
Subject: Btrfs: check if the to-be-added device is writable

If we call ioctl(BTRFS_IOC_ADD_DEV) directly, we'll succeed in adding
a readonly device to a btrfs filesystem, and btrfs will write to
that device, emitting kernel errors:

[ 3109.833692] lost page write due to I/O error on loop2
[ 3109.833720] lost page write due to I/O error on loop2
...

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c37433d..0a8c8f8 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1611,7 +1611,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
 		return -EINVAL;
 
-	bdev = blkdev_get_by_path(device_path, FMODE_EXCL,
+	bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
 				  root->fs_info->bdev_holder);
 	if (IS_ERR(bdev))
 		return PTR_ERR(bdev);
-- 
cgit v1.1


From 1cf4ffdb3289624a6462c94f2ce05545b32ef736 Mon Sep 17 00:00:00 2001
From: Liu Bo <liubo2009@cn.fujitsu.com>
Date: Wed, 7 Dec 2011 20:08:40 -0500
Subject: Btrfs: drop spin lock when memory alloc fails

Drop spin lock in convert_extent_bit() when memory alloc fails,
otherwise, it will be a deadlock.

Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index be1bf62..49f3c9d 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -935,8 +935,10 @@ again:
 	node = tree_search(tree, start);
 	if (!node) {
 		prealloc = alloc_extent_state_atomic(prealloc);
-		if (!prealloc)
-			return -ENOMEM;
+		if (!prealloc) {
+			err = -ENOMEM;
+			goto out;
+		}
 		err = insert_state(tree, prealloc, start, end, &bits);
 		prealloc = NULL;
 		BUG_ON(err == -EEXIST);
@@ -992,8 +994,10 @@ hit_next:
 	 */
 	if (state->start < start) {
 		prealloc = alloc_extent_state_atomic(prealloc);
-		if (!prealloc)
-			return -ENOMEM;
+		if (!prealloc) {
+			err = -ENOMEM;
+			goto out;
+		}
 		err = split_state(tree, state, prealloc, start);
 		BUG_ON(err == -EEXIST);
 		prealloc = NULL;
@@ -1024,8 +1028,10 @@ hit_next:
 			this_end = last_start - 1;
 
 		prealloc = alloc_extent_state_atomic(prealloc);
-		if (!prealloc)
-			return -ENOMEM;
+		if (!prealloc) {
+			err = -ENOMEM;
+			goto out;
+		}
 
 		/*
 		 * Avoid to free 'prealloc' if it can be merged with
@@ -1051,8 +1057,10 @@ hit_next:
 	 */
 	if (state->start <= end && state->end > end) {
 		prealloc = alloc_extent_state_atomic(prealloc);
-		if (!prealloc)
-			return -ENOMEM;
+		if (!prealloc) {
+			err = -ENOMEM;
+			goto out;
+		}
 
 		err = split_state(tree, state, prealloc, end + 1);
 		BUG_ON(err == -EEXIST);
-- 
cgit v1.1


From 5dbc8fca8ef5d719014f22345d990e957dcfc692 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 9 Dec 2011 11:07:37 -0500
Subject: Btrfs: fix btrfs_end_bio to deal with write errors to a single mirror

btrfs_end_bio checks the number of errors on a bio against the max
number of errors allowed before sending any EIOs up to the higher
levels.

If we got enough copies of the bio done for a given raid level, it is
supposed to clear the bio error flag and return success.

We have pointers to the original bio sent down by the higher layers and
pointers to any cloned bios we made for raid purposes.  If the original
bio happens to be the one that got an io error, but not the last one to
finish, it might not have the BIO_UPTODATE bit set.

Then, when the last bio does finish, we'll call bio_end_io on the
original bio.  It won't have the uptodate bit set and we'll end up
sending EIO to the higher layers.

We already had a check for this, it just was conditional on getting the
IO error on the very last bio.  Make the check unconditional so we eat
the EIOs properly.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 0a8c8f8..91ea57a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3258,7 +3258,7 @@ static void btrfs_end_bio(struct bio *bio, int err)
 		 */
 		if (atomic_read(&bbio->error) > bbio->max_errors) {
 			err = -EIO;
-		} else if (err) {
+		} else {
 			/*
 			 * this bio is actually up to date, we didn't
 			 * go over the max number of errors
-- 
cgit v1.1


From f4a2f4c548296168832ad4ab7e7f7b0cd0bf1214 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Wed, 14 Dec 2011 20:12:01 -0500
Subject: Btrfs: fix wrong i_size when truncating a file to a larger size

Btrfsck report error 100 after the 83th case of xfstests was run, it means
the i_size of the file is wrong.

The reason of this bug is that:
Btrfs increased i_size of the file at the beginning, but it failed to expand
the file, and failed to update the i_size to the old size because there is no
enough space in the file system, so we found a wrong i_size.

This patch fixes this bug by updating the i_size just when we pass the file
expanding and get enough space to update i-node.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c5ccec2..4bbceb9 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3370,6 +3370,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 
 static int btrfs_setsize(struct inode *inode, loff_t newsize)
 {
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
 	loff_t oldsize = i_size_read(inode);
 	int ret;
 
@@ -3377,16 +3379,20 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize)
 		return 0;
 
 	if (newsize > oldsize) {
-		i_size_write(inode, newsize);
-		btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
 		truncate_pagecache(inode, oldsize, newsize);
 		ret = btrfs_cont_expand(inode, oldsize, newsize);
-		if (ret) {
-			btrfs_setsize(inode, oldsize);
+		if (ret)
 			return ret;
-		}
 
-		mark_inode_dirty(inode);
+		trans = btrfs_start_transaction(root, 1);
+		if (IS_ERR(trans))
+			return PTR_ERR(trans);
+
+		i_size_write(inode, newsize);
+		btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
+		ret = btrfs_update_inode(trans, root, inode);
+
+		btrfs_end_transaction_throttle(trans, root);
 	} else {
 
 		/*
-- 
cgit v1.1


From 3642320e07444cc46327b24977d752f99706dac2 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Wed, 14 Dec 2011 20:12:02 -0500
Subject: Btrfs: fix wrong disk space information of the files

Btrfsck report errors after the 83th case of xfstests was run, The error
number is 400, it means the used disk space of the file is wrong.

The reason of this bug is that:
The file truncation may fail when the space of the file system is not enough,
and leave some file extents, whose offset are beyond the end of the files.
When we want to expand those files, we will drop those file extents, and
put in dummy file extents, and then we should update the i-node. But btrfs
forgets to do it.

This patch adds the forgotten i-node update.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4bbceb9..f1c4bce 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3327,7 +3327,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 			u64 hint_byte = 0;
 			hole_size = last_byte - cur_offset;
 
-			trans = btrfs_start_transaction(root, 2);
+			trans = btrfs_start_transaction(root, 3);
 			if (IS_ERR(trans)) {
 				err = PTR_ERR(trans);
 				break;
@@ -3337,6 +3337,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 						 cur_offset + hole_size,
 						 &hint_byte, 1);
 			if (err) {
+				btrfs_update_inode(trans, root, inode);
 				btrfs_end_transaction(trans, root);
 				break;
 			}
@@ -3346,6 +3347,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 					0, hole_size, 0, hole_size,
 					0, 0, 0);
 			if (err) {
+				btrfs_update_inode(trans, root, inode);
 				btrfs_end_transaction(trans, root);
 				break;
 			}
@@ -3353,6 +3355,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 			btrfs_drop_extent_cache(inode, hole_start,
 					last_byte - 1, 0);
 
+			btrfs_update_inode(trans, root, inode);
 			btrfs_end_transaction(trans, root);
 		}
 		free_extent_map(em);
-- 
cgit v1.1


From 39fb26c398ddf8d7794a85e896cfe1a42e55524b Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Wed, 14 Dec 2011 20:12:02 -0500
Subject: Btrfs: fix inaccurate available space on raid0 profile

When we use raid0 as the data profile, df command may show us a very
inaccurate value of the available space, which may be much less than the
real one. It may make the users puzzled. Fix it by changing the calculation
of the available space, and making it be more similar to a fake chunk
allocation.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/super.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 1a3ce9e..41495dce 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1079,7 +1079,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
 	u64 avail_space;
 	u64 used_space;
 	u64 min_stripe_size;
-	int min_stripes = 1;
+	int min_stripes = 1, num_stripes = 1;
 	int i = 0, nr_devices;
 	int ret;
 
@@ -1093,12 +1093,16 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
 
 	/* calc min stripe number for data space alloction */
 	type = btrfs_get_alloc_profile(root, 1);
-	if (type & BTRFS_BLOCK_GROUP_RAID0)
+	if (type & BTRFS_BLOCK_GROUP_RAID0) {
 		min_stripes = 2;
-	else if (type & BTRFS_BLOCK_GROUP_RAID1)
+		num_stripes = nr_devices;
+	} else if (type & BTRFS_BLOCK_GROUP_RAID1) {
 		min_stripes = 2;
-	else if (type & BTRFS_BLOCK_GROUP_RAID10)
+		num_stripes = 2;
+	} else if (type & BTRFS_BLOCK_GROUP_RAID10) {
 		min_stripes = 4;
+		num_stripes = 4;
+	}
 
 	if (type & BTRFS_BLOCK_GROUP_DUP)
 		min_stripe_size = 2 * BTRFS_STRIPE_LEN;
@@ -1167,13 +1171,16 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
 	i = nr_devices - 1;
 	avail_space = 0;
 	while (nr_devices >= min_stripes) {
+		if (num_stripes > nr_devices)
+			num_stripes = nr_devices;
+
 		if (devices_info[i].max_avail >= min_stripe_size) {
 			int j;
 			u64 alloc_size;
 
-			avail_space += devices_info[i].max_avail * min_stripes;
+			avail_space += devices_info[i].max_avail * num_stripes;
 			alloc_size = devices_info[i].max_avail;
-			for (j = i + 1 - min_stripes; j <= i; j++)
+			for (j = i + 1 - num_stripes; j <= i; j++)
 				devices_info[j].max_avail -= alloc_size;
 		}
 		i--;
-- 
cgit v1.1


From f8e9e0b07be0464e12366631da3da73a1a62449c Mon Sep 17 00:00:00 2001
From: Arne Jansen <sensille@gmx.net>
Date: Wed, 14 Dec 2011 20:12:02 -0500
Subject: btrfs: keep orphans for subvolume deletion

Since we have the free space caches, btrfs_orphan_cleanup also runs for
the tree_root. Unfortunately this also cleans up the orphans used to mark
subvol deletions in progress.

Currently if a subvol deletion gets interrupted twice by umount/mount, the
deletion will not be continued and the space permanently lost, though it
would be possible to write a tool to recover those lost subvol deletions.
This patch checks if the orphan belongs to a subvol (dead root) and skips
the deletion.

Signed-off-by: Arne Jansen <sensille@gmx.net>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f1c4bce..4a31493 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2158,6 +2158,38 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 		if (ret && ret != -ESTALE)
 			goto out;
 
+		if (ret == -ESTALE && root == root->fs_info->tree_root) {
+			struct btrfs_root *dead_root;
+			struct btrfs_fs_info *fs_info = root->fs_info;
+			int is_dead_root = 0;
+
+			/*
+			 * this is an orphan in the tree root. Currently these
+			 * could come from 2 sources:
+			 *  a) a snapshot deletion in progress
+			 *  b) a free space cache inode
+			 * We need to distinguish those two, as the snapshot
+			 * orphan must not get deleted.
+			 * find_dead_roots already ran before us, so if this
+			 * is a snapshot deletion, we should find the root
+			 * in the dead_roots list
+			 */
+			spin_lock(&fs_info->trans_lock);
+			list_for_each_entry(dead_root, &fs_info->dead_roots,
+					    root_list) {
+				if (dead_root->root_key.objectid ==
+				    found_key.objectid) {
+					is_dead_root = 1;
+					break;
+				}
+			}
+			spin_unlock(&fs_info->trans_lock);
+			if (is_dead_root) {
+				/* prevent this orphan from being found again */
+				key.offset = found_key.objectid - 1;
+				continue;
+			}
+		}
 		/*
 		 * Inode is already gone but the orphan item is still there,
 		 * kill the orphan item.
-- 
cgit v1.1


From 306424cc880a0fbbdc99eee1f43d056a301a180f Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 14 Dec 2011 20:12:02 -0500
Subject: Btrfs: fix ctime update of on-disk inode

To reproduce the bug:

    # touch /mnt/tmp
    # stat /mnt/tmp | grep Change
    Change: 2011-12-09 09:32:23.412105981 +0800
    # chattr +i /mnt/tmp
    # stat /mnt/tmp | grep Change
    Change: 2011-12-09 09:32:43.198105295 +0800
    # umount /mnt
    # mount /dev/loop1 /mnt
    # stat /mnt/tmp | grep Change
    Change: 2011-12-09 09:32:23.412105981 +0800

We should update ctime of in-memory inode before calling
btrfs_update_inode().

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 72d4616..40eaa9f 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -252,11 +252,11 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
 	trans = btrfs_join_transaction(root);
 	BUG_ON(IS_ERR(trans));
 
+	btrfs_update_iflags(inode);
+	inode->i_ctime = CURRENT_TIME;
 	ret = btrfs_update_inode(trans, root, inode);
 	BUG_ON(ret);
 
-	btrfs_update_iflags(inode);
-	inode->i_ctime = CURRENT_TIME;
 	btrfs_end_transaction(trans, root);
 
 	mnt_drop_write(file->f_path.mnt);
-- 
cgit v1.1


From 8f3b65a3d66bcc086e1eb040b7545e70681f2ed1 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 15 Dec 2011 09:29:43 -0500
Subject: Btrfs: add a cond_resched() into the worker loop

If we have a constant stream of end_io completions or crc work,
we can hit softlockup messages from the async helper threads.  This
adds a cond_resched() into the loop to avoid them.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/async-thread.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 7ec1409..203e849 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -331,7 +331,7 @@ again:
 			run_ordered_completions(worker->workers, work);
 
 			check_pending_worker_creates(worker);
-
+			cond_resched();
 		}
 
 		spin_lock_irq(&worker->lock);
-- 
cgit v1.1


From ad19db71f498fd858dd84ce603efcf97e321f184 Mon Sep 17 00:00:00 2001
From: Casey Schaufler <casey@schaufler-ca.com>
Date: Thu, 15 Dec 2011 10:09:07 -0500
Subject: BTRFS: Establish i_ops before calling d_instantiate

The Smack LSM hook for security_d_instantiate checks
the inode's i_op->getxattr value to determine if the
containing filesystem supports extended attributes.
The BTRFS filesystem sets the inode's i_op value only
after it has instantiated the inode. This results in
Smack incorrectly giving new BTRFS inodes attributes
from the filesystem defaults on the assumption that
values can't be stored on the filesystem. This patch
moves the assignment of inode operation vectors ahead
of the calls to d_instantiate, letting Smack know that
the filesystem supports extended attributes. There
should be no impact on the performance or behavior of
BTRFS.

Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 31 ++++++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4a31493..d4a9195 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4596,11 +4596,18 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 		goto out_unlock;
 	}
 
+	/*
+	* If the active LSM wants to access the inode during
+	* d_instantiate it needs these. Smack checks to see
+	* if the filesystem supports xattrs by looking at the
+	* ops vector.
+	*/
+
+	inode->i_op = &btrfs_special_inode_operations;
 	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
 	if (err)
 		drop_inode = 1;
 	else {
-		inode->i_op = &btrfs_special_inode_operations;
 		init_special_inode(inode, inode->i_mode, rdev);
 		btrfs_update_inode(trans, root, inode);
 	}
@@ -4654,14 +4661,21 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 		goto out_unlock;
 	}
 
+	/*
+	* If the active LSM wants to access the inode during
+	* d_instantiate it needs these. Smack checks to see
+	* if the filesystem supports xattrs by looking at the
+	* ops vector.
+	*/
+	inode->i_fop = &btrfs_file_operations;
+	inode->i_op = &btrfs_file_inode_operations;
+
 	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
 	if (err)
 		drop_inode = 1;
 	else {
 		inode->i_mapping->a_ops = &btrfs_aops;
 		inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
-		inode->i_fop = &btrfs_file_operations;
-		inode->i_op = &btrfs_file_inode_operations;
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 	}
 out_unlock:
@@ -7117,14 +7131,21 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 		goto out_unlock;
 	}
 
+	/*
+	* If the active LSM wants to access the inode during
+	* d_instantiate it needs these. Smack checks to see
+	* if the filesystem supports xattrs by looking at the
+	* ops vector.
+	*/
+	inode->i_fop = &btrfs_file_operations;
+	inode->i_op = &btrfs_file_inode_operations;
+
 	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
 	if (err)
 		drop_inode = 1;
 	else {
 		inode->i_mapping->a_ops = &btrfs_aops;
 		inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
-		inode->i_fop = &btrfs_file_operations;
-		inode->i_op = &btrfs_file_inode_operations;
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 	}
 	if (drop_inode)
-- 
cgit v1.1


From 0dc3b84a73267f47a75468f924f5d58a840e3152 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 18 Nov 2011 14:37:27 -0500
Subject: Btrfs: fix num_workers_starting bug and other bugs in async thread

Al pointed out we have some random problems with the way we account for
num_workers_starting in the async thread stuff.  First of all we need to make
sure to decrement num_workers_starting if we fail to start the worker, so make
__btrfs_start_workers do this.  Also fix __btrfs_start_workers so that it
doesn't call btrfs_stop_workers(), there is no point in stopping everybody if we
failed to create a worker.  Also check_pending_worker_creates needs to call
__btrfs_start_work in it's work function since it already increments
num_workers_starting.

People only start one worker at a time, so get rid of the num_workers argument
everywhere, and make btrfs_queue_worker a void since it will always succeed.
Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/async-thread.c | 115 +++++++++++++++++++++++-------------------------
 fs/btrfs/async-thread.h |   4 +-
 fs/btrfs/disk-io.c      |  34 ++++++++------
 fs/btrfs/scrub.c        |   8 +++-
 4 files changed, 83 insertions(+), 78 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 7ec1409..af8e117 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -64,6 +64,8 @@ struct btrfs_worker_thread {
 	int idle;
 };
 
+static int __btrfs_start_workers(struct btrfs_workers *workers);
+
 /*
  * btrfs_start_workers uses kthread_run, which can block waiting for memory
  * for a very long time.  It will actually throttle on page writeback,
@@ -88,27 +90,10 @@ static void start_new_worker_func(struct btrfs_work *work)
 {
 	struct worker_start *start;
 	start = container_of(work, struct worker_start, work);
-	btrfs_start_workers(start->queue, 1);
+	__btrfs_start_workers(start->queue);
 	kfree(start);
 }
 
-static int start_new_worker(struct btrfs_workers *queue)
-{
-	struct worker_start *start;
-	int ret;
-
-	start = kzalloc(sizeof(*start), GFP_NOFS);
-	if (!start)
-		return -ENOMEM;
-
-	start->work.func = start_new_worker_func;
-	start->queue = queue;
-	ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work);
-	if (ret)
-		kfree(start);
-	return ret;
-}
-
 /*
  * helper function to move a thread onto the idle list after it
  * has finished some requests.
@@ -153,12 +138,20 @@ static void check_busy_worker(struct btrfs_worker_thread *worker)
 static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
 {
 	struct btrfs_workers *workers = worker->workers;
+	struct worker_start *start;
 	unsigned long flags;
 
 	rmb();
 	if (!workers->atomic_start_pending)
 		return;
 
+	start = kzalloc(sizeof(*start), GFP_NOFS);
+	if (!start)
+		return;
+
+	start->work.func = start_new_worker_func;
+	start->queue = workers;
+
 	spin_lock_irqsave(&workers->lock, flags);
 	if (!workers->atomic_start_pending)
 		goto out;
@@ -170,10 +163,11 @@ static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
 
 	workers->num_workers_starting += 1;
 	spin_unlock_irqrestore(&workers->lock, flags);
-	start_new_worker(workers);
+	btrfs_queue_worker(workers->atomic_worker_start, &start->work);
 	return;
 
 out:
+	kfree(start);
 	spin_unlock_irqrestore(&workers->lock, flags);
 }
 
@@ -462,56 +456,55 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
  * starts new worker threads.  This does not enforce the max worker
  * count in case you need to temporarily go past it.
  */
-static int __btrfs_start_workers(struct btrfs_workers *workers,
-				 int num_workers)
+static int __btrfs_start_workers(struct btrfs_workers *workers)
 {
 	struct btrfs_worker_thread *worker;
 	int ret = 0;
-	int i;
 
-	for (i = 0; i < num_workers; i++) {
-		worker = kzalloc(sizeof(*worker), GFP_NOFS);
-		if (!worker) {
-			ret = -ENOMEM;
-			goto fail;
-		}
+	worker = kzalloc(sizeof(*worker), GFP_NOFS);
+	if (!worker) {
+		ret = -ENOMEM;
+		goto fail;
+	}
 
-		INIT_LIST_HEAD(&worker->pending);
-		INIT_LIST_HEAD(&worker->prio_pending);
-		INIT_LIST_HEAD(&worker->worker_list);
-		spin_lock_init(&worker->lock);
-
-		atomic_set(&worker->num_pending, 0);
-		atomic_set(&worker->refs, 1);
-		worker->workers = workers;
-		worker->task = kthread_run(worker_loop, worker,
-					   "btrfs-%s-%d", workers->name,
-					   workers->num_workers + i);
-		if (IS_ERR(worker->task)) {
-			ret = PTR_ERR(worker->task);
-			kfree(worker);
-			goto fail;
-		}
-		spin_lock_irq(&workers->lock);
-		list_add_tail(&worker->worker_list, &workers->idle_list);
-		worker->idle = 1;
-		workers->num_workers++;
-		workers->num_workers_starting--;
-		WARN_ON(workers->num_workers_starting < 0);
-		spin_unlock_irq(&workers->lock);
+	INIT_LIST_HEAD(&worker->pending);
+	INIT_LIST_HEAD(&worker->prio_pending);
+	INIT_LIST_HEAD(&worker->worker_list);
+	spin_lock_init(&worker->lock);
+
+	atomic_set(&worker->num_pending, 0);
+	atomic_set(&worker->refs, 1);
+	worker->workers = workers;
+	worker->task = kthread_run(worker_loop, worker,
+				   "btrfs-%s-%d", workers->name,
+				   workers->num_workers + 1);
+	if (IS_ERR(worker->task)) {
+		ret = PTR_ERR(worker->task);
+		kfree(worker);
+		goto fail;
 	}
+	spin_lock_irq(&workers->lock);
+	list_add_tail(&worker->worker_list, &workers->idle_list);
+	worker->idle = 1;
+	workers->num_workers++;
+	workers->num_workers_starting--;
+	WARN_ON(workers->num_workers_starting < 0);
+	spin_unlock_irq(&workers->lock);
+
 	return 0;
 fail:
-	btrfs_stop_workers(workers);
+	spin_lock_irq(&workers->lock);
+	workers->num_workers_starting--;
+	spin_unlock_irq(&workers->lock);
 	return ret;
 }
 
-int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
+int btrfs_start_workers(struct btrfs_workers *workers)
 {
 	spin_lock_irq(&workers->lock);
-	workers->num_workers_starting += num_workers;
+	workers->num_workers_starting++;
 	spin_unlock_irq(&workers->lock);
-	return __btrfs_start_workers(workers, num_workers);
+	return __btrfs_start_workers(workers);
 }
 
 /*
@@ -568,6 +561,7 @@ static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
 	struct btrfs_worker_thread *worker;
 	unsigned long flags;
 	struct list_head *fallback;
+	int ret;
 
 again:
 	spin_lock_irqsave(&workers->lock, flags);
@@ -584,7 +578,9 @@ again:
 			workers->num_workers_starting++;
 			spin_unlock_irqrestore(&workers->lock, flags);
 			/* we're below the limit, start another worker */
-			__btrfs_start_workers(workers, 1);
+			ret = __btrfs_start_workers(workers);
+			if (ret)
+				goto fallback;
 			goto again;
 		}
 	}
@@ -665,7 +661,7 @@ void btrfs_set_work_high_prio(struct btrfs_work *work)
 /*
  * places a struct btrfs_work into the pending queue of one of the kthreads
  */
-int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
+void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
 {
 	struct btrfs_worker_thread *worker;
 	unsigned long flags;
@@ -673,7 +669,7 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
 
 	/* don't requeue something already on a list */
 	if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
-		goto out;
+		return;
 
 	worker = find_worker(workers);
 	if (workers->ordered) {
@@ -712,7 +708,4 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
 	if (wake)
 		wake_up_process(worker->task);
 	spin_unlock_irqrestore(&worker->lock, flags);
-
-out:
-	return 0;
 }
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 5077746..f34cc31 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -109,8 +109,8 @@ struct btrfs_workers {
 	char *name;
 };
 
-int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
-int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
+void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
+int btrfs_start_workers(struct btrfs_workers *workers);
 int btrfs_stop_workers(struct btrfs_workers *workers);
 void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
 			struct btrfs_workers *async_starter);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 94abc25..3f9d555 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2194,19 +2194,27 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->endio_meta_write_workers.idle_thresh = 2;
 	fs_info->readahead_workers.idle_thresh = 2;
 
-	btrfs_start_workers(&fs_info->workers, 1);
-	btrfs_start_workers(&fs_info->generic_worker, 1);
-	btrfs_start_workers(&fs_info->submit_workers, 1);
-	btrfs_start_workers(&fs_info->delalloc_workers, 1);
-	btrfs_start_workers(&fs_info->fixup_workers, 1);
-	btrfs_start_workers(&fs_info->endio_workers, 1);
-	btrfs_start_workers(&fs_info->endio_meta_workers, 1);
-	btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
-	btrfs_start_workers(&fs_info->endio_write_workers, 1);
-	btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
-	btrfs_start_workers(&fs_info->delayed_workers, 1);
-	btrfs_start_workers(&fs_info->caching_workers, 1);
-	btrfs_start_workers(&fs_info->readahead_workers, 1);
+	/*
+	 * btrfs_start_workers can really only fail because of ENOMEM so just
+	 * return -ENOMEM if any of these fail.
+	 */
+	ret = btrfs_start_workers(&fs_info->workers);
+	ret |= btrfs_start_workers(&fs_info->generic_worker);
+	ret |= btrfs_start_workers(&fs_info->submit_workers);
+	ret |= btrfs_start_workers(&fs_info->delalloc_workers);
+	ret |= btrfs_start_workers(&fs_info->fixup_workers);
+	ret |= btrfs_start_workers(&fs_info->endio_workers);
+	ret |= btrfs_start_workers(&fs_info->endio_meta_workers);
+	ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers);
+	ret |= btrfs_start_workers(&fs_info->endio_write_workers);
+	ret |= btrfs_start_workers(&fs_info->endio_freespace_worker);
+	ret |= btrfs_start_workers(&fs_info->delayed_workers);
+	ret |= btrfs_start_workers(&fs_info->caching_workers);
+	ret |= btrfs_start_workers(&fs_info->readahead_workers);
+	if (ret) {
+		ret = -ENOMEM;
+		goto fail_sb_buffer;
+	}
 
 	fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
 	fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index c27bcb6..ddf2c90 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1535,18 +1535,22 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
 static noinline_for_stack int scrub_workers_get(struct btrfs_root *root)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
+	int ret = 0;
 
 	mutex_lock(&fs_info->scrub_lock);
 	if (fs_info->scrub_workers_refcnt == 0) {
 		btrfs_init_workers(&fs_info->scrub_workers, "scrub",
 			   fs_info->thread_pool_size, &fs_info->generic_worker);
 		fs_info->scrub_workers.idle_thresh = 4;
-		btrfs_start_workers(&fs_info->scrub_workers, 1);
+		ret = btrfs_start_workers(&fs_info->scrub_workers);
+		if (ret)
+			goto out;
 	}
 	++fs_info->scrub_workers_refcnt;
+out:
 	mutex_unlock(&fs_info->scrub_lock);
 
-	return 0;
+	return ret;
 }
 
 static noinline_for_stack void scrub_workers_put(struct btrfs_root *root)
-- 
cgit v1.1


From 22c44fe65adacd20a174f3f54686509ee94ef7be Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Wed, 30 Nov 2011 10:45:38 -0500
Subject: Btrfs: deal with enospc from dirtying inodes properly

Now that we're properly keeping track of delayed inode space we've been getting
a lot of warnings out of btrfs_dirty_inode() when running xfstest 83.  This is
because a bunch of people call mark_inode_dirty, which is void so we can't
return ENOSPC.  This needs to be fixed in a few areas

1) file_update_time - this updates the mtime and such when writing to a file,
which will call mark_inode_dirty.  So copy file_update_time into btrfs so we can
call btrfs_dirty_inode directly and return an error if we get one appropriately.

2) fix symlinks to use btrfs_setattr for ->setattr.  For some reason we weren't
setting ->setattr for symlinks, even though we should have been.  This catches
one of the cases where we were getting errors in mark_inode_dirty.

3) Fix btrfs_setattr and btrfs_setsize to call btrfs_dirty_inode directly
instead of mark_inode_dirty.  This lets us return errors properly for truncate
and chown/anything related to setattr.

4) Add a new btrfs_fs_dirty_inode which will just call btrfs_dirty_inode and
print an error if we have one.  The only remaining user we can't control for
this is touch_atime(), but we don't really want to keep people from walking
down the tree if we don't have space to save the atime update, so just complain
but don't worry about it.

With this patch xfstests 83 complains a handful of times instead of hundreds of
times.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/ctree.h |  3 ++-
 fs/btrfs/file.c  |  6 ++++-
 fs/btrfs/inode.c | 80 ++++++++++++++++++++++++++++++++++++++++++--------------
 fs/btrfs/super.c | 13 ++++++++-
 4 files changed, 80 insertions(+), 22 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 50634abe..6738503 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2692,7 +2692,8 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 int btrfs_readpage(struct file *file, struct page *page);
 void btrfs_evict_inode(struct inode *inode);
 int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
-void btrfs_dirty_inode(struct inode *inode, int flags);
+int btrfs_dirty_inode(struct inode *inode);
+int btrfs_update_time(struct file *file);
 struct inode *btrfs_alloc_inode(struct super_block *sb);
 void btrfs_destroy_inode(struct inode *inode);
 int btrfs_drop_inode(struct inode *inode);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index f2e9282..cc7492c 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1387,7 +1387,11 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 		goto out;
 	}
 
-	file_update_time(file);
+	err = btrfs_update_time(file);
+	if (err) {
+		mutex_unlock(&inode->i_mutex);
+		goto out;
+	}
 	BTRFS_I(inode)->sequence++;
 
 	start_pos = round_down(pos, root->sectorsize);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c5ccec2..eec33b9 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -38,6 +38,7 @@
 #include <linux/falloc.h>
 #include <linux/slab.h>
 #include <linux/ratelimit.h>
+#include <linux/mount.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -3386,7 +3387,7 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize)
 			return ret;
 		}
 
-		mark_inode_dirty(inode);
+		ret = btrfs_dirty_inode(inode);
 	} else {
 
 		/*
@@ -3426,9 +3427,9 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 
 	if (attr->ia_valid) {
 		setattr_copy(inode, attr);
-		mark_inode_dirty(inode);
+		err = btrfs_dirty_inode(inode);
 
-		if (attr->ia_valid & ATTR_MODE)
+		if (!err && attr->ia_valid & ATTR_MODE)
 			err = btrfs_acl_chmod(inode);
 	}
 
@@ -4204,42 +4205,80 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
  * FIXME, needs more benchmarking...there are no reasons other than performance
  * to keep or drop this code.
  */
-void btrfs_dirty_inode(struct inode *inode, int flags)
+int btrfs_dirty_inode(struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
 	int ret;
 
 	if (BTRFS_I(inode)->dummy_inode)
-		return;
+		return 0;
 
 	trans = btrfs_join_transaction(root);
-	BUG_ON(IS_ERR(trans));
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 
 	ret = btrfs_update_inode(trans, root, inode);
 	if (ret && ret == -ENOSPC) {
 		/* whoops, lets try again with the full transaction */
 		btrfs_end_transaction(trans, root);
 		trans = btrfs_start_transaction(root, 1);
-		if (IS_ERR(trans)) {
-			printk_ratelimited(KERN_ERR "btrfs: fail to "
-				       "dirty  inode %llu error %ld\n",
-				       (unsigned long long)btrfs_ino(inode),
-				       PTR_ERR(trans));
-			return;
-		}
+		if (IS_ERR(trans))
+			return PTR_ERR(trans);
 
 		ret = btrfs_update_inode(trans, root, inode);
-		if (ret) {
-			printk_ratelimited(KERN_ERR "btrfs: fail to "
-				       "dirty  inode %llu error %d\n",
-				       (unsigned long long)btrfs_ino(inode),
-				       ret);
-		}
 	}
 	btrfs_end_transaction(trans, root);
 	if (BTRFS_I(inode)->delayed_node)
 		btrfs_balance_delayed_items(root);
+
+	return ret;
+}
+
+/*
+ * This is a copy of file_update_time.  We need this so we can return error on
+ * ENOSPC for updating the inode in the case of file write and mmap writes.
+ */
+int btrfs_update_time(struct file *file)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct timespec now;
+	int ret;
+	enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0;
+
+	/* First try to exhaust all avenues to not sync */
+	if (IS_NOCMTIME(inode))
+		return 0;
+
+	now = current_fs_time(inode->i_sb);
+	if (!timespec_equal(&inode->i_mtime, &now))
+		sync_it = S_MTIME;
+
+	if (!timespec_equal(&inode->i_ctime, &now))
+		sync_it |= S_CTIME;
+
+	if (IS_I_VERSION(inode))
+		sync_it |= S_VERSION;
+
+	if (!sync_it)
+		return 0;
+
+	/* Finally allowed to write? Takes lock. */
+	if (mnt_want_write_file(file))
+		return 0;
+
+	/* Only change inode inside the lock region */
+	if (sync_it & S_VERSION)
+		inode_inc_iversion(inode);
+	if (sync_it & S_CTIME)
+		inode->i_ctime = now;
+	if (sync_it & S_MTIME)
+		inode->i_mtime = now;
+	ret = btrfs_dirty_inode(inode);
+	if (!ret)
+		mark_inode_dirty_sync(inode);
+	mnt_drop_write(file->f_path.mnt);
+	return ret;
 }
 
 /*
@@ -6304,6 +6343,8 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	u64 page_end;
 
 	ret  = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+	if (!ret)
+		ret = btrfs_update_time(vma->vm_file);
 	if (ret) {
 		if (ret == -ENOMEM)
 			ret = VM_FAULT_OOM;
@@ -7353,6 +7394,7 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
 	.follow_link	= page_follow_link_light,
 	.put_link	= page_put_link,
 	.getattr	= btrfs_getattr,
+	.setattr	= btrfs_setattr,
 	.permission	= btrfs_permission,
 	.setxattr	= btrfs_setxattr,
 	.getxattr	= btrfs_getxattr,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 1a3ce9e..05b4045 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -41,6 +41,7 @@
 #include <linux/slab.h>
 #include <linux/cleancache.h>
 #include <linux/mnt_namespace.h>
+#include <linux/ratelimit.h>
 #include "compat.h"
 #include "delayed-inode.h"
 #include "ctree.h"
@@ -1290,6 +1291,16 @@ static int btrfs_unfreeze(struct super_block *sb)
 	return 0;
 }
 
+static void btrfs_fs_dirty_inode(struct inode *inode, int flags)
+{
+	int ret;
+
+	ret = btrfs_dirty_inode(inode);
+	if (ret)
+		printk_ratelimited(KERN_ERR "btrfs: fail to dirty inode %Lu "
+				   "error %d\n", btrfs_ino(inode), ret);
+}
+
 static const struct super_operations btrfs_super_ops = {
 	.drop_inode	= btrfs_drop_inode,
 	.evict_inode	= btrfs_evict_inode,
@@ -1297,7 +1308,7 @@ static const struct super_operations btrfs_super_ops = {
 	.sync_fs	= btrfs_sync_fs,
 	.show_options	= btrfs_show_options,
 	.write_inode	= btrfs_write_inode,
-	.dirty_inode	= btrfs_dirty_inode,
+	.dirty_inode	= btrfs_fs_dirty_inode,
 	.alloc_inode	= btrfs_alloc_inode,
 	.destroy_inode	= btrfs_destroy_inode,
 	.statfs		= btrfs_statfs,
-- 
cgit v1.1


From 660d3f6cde552323578b85fc5a09a6742f1fe804 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 9 Dec 2011 11:18:51 -0500
Subject: Btrfs: fix how we do delalloc reservations and how we free
 reservations on error

Running xfstests 269 with some tracing my scripts kept spitting out errors about
releasing bytes that we didn't actually have reserved.  This took me down a huge
rabbit hole and it turns out the way we deal with reserved_extents is wrong,
we need to only be setting it if the reservation succeeds, otherwise the free()
method will come in and unreserve space that isn't actually reserved yet, which
can lead to other warnings and such.  The math was all working out right in the
end, but it caused all sorts of other issues in addition to making my scripts
yell and scream and generally make it impossible for me to track down the
original issue I was looking for.  The other problem is with our error handling
in the reservation code.  There are two cases that we need to deal with

1) We raced with free.  In this case free won't free anything because csum_bytes
is modified before we dro the lock in our reservation path, so free rightly
doesn't release any space because the reservation code may be depending on that
reservation.  However if we fail, we need the reservation side to do the free at
that point since that space is no longer in use.  So as it stands the code was
doing this fine and it worked out, except in case #2

2) We don't race with free.  Nobody comes in and changes anything, and our
reservation fails.  In this case we didn't reserve anything anyway and we just
need to clean up csum_bytes but not free anything.  So we keep track of
csum_bytes before we drop the lock and if it hasn't changed we know we can just
decrement csum_bytes and carry on.

Because of the case where we can race with free()'s since we have to drop our
spin_lock to do the reservation, I'm going to serialize all reservations with
the i_mutex.  We already get this for free in the heavy use paths, truncate and
file write all hold the i_mutex, just needed to add it to page_mkwrite and
various ioctl/balance things.  With this patch my space leak scripts no longer
scream bloody murder.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/extent-tree.c | 43 ++++++++++++++++++++++++++++++-------------
 fs/btrfs/inode.c       | 10 ++++++++++
 fs/btrfs/ioctl.c       |  2 ++
 fs/btrfs/relocation.c  |  2 ++
 4 files changed, 44 insertions(+), 13 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 05e1386..4eb4d27 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4203,12 +4203,17 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
 	u64 to_reserve = 0;
+	u64 csum_bytes;
 	unsigned nr_extents = 0;
+	int extra_reserve = 0;
 	int flush = 1;
 	int ret;
 
+	/* Need to be holding the i_mutex here if we aren't free space cache */
 	if (btrfs_is_free_space_inode(root, inode))
 		flush = 0;
+	else
+		WARN_ON(!mutex_is_locked(&inode->i_mutex));
 
 	if (flush && btrfs_transaction_in_commit(root->fs_info))
 		schedule_timeout(1);
@@ -4219,11 +4224,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 	BTRFS_I(inode)->outstanding_extents++;
 
 	if (BTRFS_I(inode)->outstanding_extents >
-	    BTRFS_I(inode)->reserved_extents) {
+	    BTRFS_I(inode)->reserved_extents)
 		nr_extents = BTRFS_I(inode)->outstanding_extents -
 			BTRFS_I(inode)->reserved_extents;
-		BTRFS_I(inode)->reserved_extents += nr_extents;
-	}
 
 	/*
 	 * Add an item to reserve for updating the inode when we complete the
@@ -4231,11 +4234,12 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 	 */
 	if (!BTRFS_I(inode)->delalloc_meta_reserved) {
 		nr_extents++;
-		BTRFS_I(inode)->delalloc_meta_reserved = 1;
+		extra_reserve = 1;
 	}
 
 	to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
 	to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
+	csum_bytes = BTRFS_I(inode)->csum_bytes;
 	spin_unlock(&BTRFS_I(inode)->lock);
 
 	ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
@@ -4245,22 +4249,35 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 
 		spin_lock(&BTRFS_I(inode)->lock);
 		dropped = drop_outstanding_extent(inode);
-		to_free = calc_csum_metadata_size(inode, num_bytes, 0);
-		spin_unlock(&BTRFS_I(inode)->lock);
-		to_free += btrfs_calc_trans_metadata_size(root, dropped);
-
 		/*
-		 * Somebody could have come in and twiddled with the
-		 * reservation, so if we have to free more than we would have
-		 * reserved from this reservation go ahead and release those
-		 * bytes.
+		 * If the inodes csum_bytes is the same as the original
+		 * csum_bytes then we know we haven't raced with any free()ers
+		 * so we can just reduce our inodes csum bytes and carry on.
+		 * Otherwise we have to do the normal free thing to account for
+		 * the case that the free side didn't free up its reserve
+		 * because of this outstanding reservation.
 		 */
-		to_free -= to_reserve;
+		if (BTRFS_I(inode)->csum_bytes == csum_bytes)
+			calc_csum_metadata_size(inode, num_bytes, 0);
+		else
+			to_free = calc_csum_metadata_size(inode, num_bytes, 0);
+		spin_unlock(&BTRFS_I(inode)->lock);
+		if (dropped)
+			to_free += btrfs_calc_trans_metadata_size(root, dropped);
+
 		if (to_free)
 			btrfs_block_rsv_release(root, block_rsv, to_free);
 		return ret;
 	}
 
+	spin_lock(&BTRFS_I(inode)->lock);
+	if (extra_reserve) {
+		BTRFS_I(inode)->delalloc_meta_reserved = 1;
+		nr_extents--;
+	}
+	BTRFS_I(inode)->reserved_extents += nr_extents;
+	spin_unlock(&BTRFS_I(inode)->lock);
+
 	block_rsv_add_bytes(block_rsv, to_reserve, 1);
 
 	return 0;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index eec33b9..8938174 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2192,7 +2192,14 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 				continue;
 			}
 			nr_truncate++;
+			/*
+			 * Need to hold the imutex for reservation purposes, not
+			 * a huge deal here but I have a WARN_ON in
+			 * btrfs_delalloc_reserve_space to catch offenders.
+			 */
+			mutex_lock(&inode->i_mutex);
 			ret = btrfs_truncate(inode);
+			mutex_unlock(&inode->i_mutex);
 		} else {
 			nr_unlink++;
 		}
@@ -6342,7 +6349,10 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	u64 page_start;
 	u64 page_end;
 
+	/* Need this to keep space reservations serialized */
+	mutex_lock(&inode->i_mutex);
 	ret  = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+	mutex_unlock(&inode->i_mutex);
 	if (!ret)
 		ret = btrfs_update_time(vma->vm_file);
 	if (ret) {
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 72d4616..dd88916 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -858,8 +858,10 @@ static int cluster_pages_for_defrag(struct inode *inode,
 		return 0;
 	file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
 
+	mutex_lock(&inode->i_mutex);
 	ret = btrfs_delalloc_reserve_space(inode,
 					   num_pages << PAGE_CACHE_SHIFT);
+	mutex_unlock(&inode->i_mutex);
 	if (ret)
 		return ret;
 again:
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index dff29d5..cfb5543 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2947,7 +2947,9 @@ static int relocate_file_extent_cluster(struct inode *inode,
 	index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
 	last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
 	while (index <= last_index) {
+		mutex_lock(&inode->i_mutex);
 		ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
+		mutex_unlock(&inode->i_mutex);
 		if (ret)
 			goto out;
 
-- 
cgit v1.1


From 7041ee97281c30a78658904140c7bd9373a36142 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 9 Dec 2011 13:26:22 -0500
Subject: Btrfs: fix leaked space in truncate

We were occasionaly leaking space when running xfstest 269.  This is because if
we failed to start the transaction in the truncate loop we'd just goto out, but
we need to break so that the inode is removed from the orphan list and the space
is properly freed.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/inode.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8938174..6349c63 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6566,8 +6566,9 @@ static int btrfs_truncate(struct inode *inode)
 			/* Just need the 1 for updating the inode */
 			trans = btrfs_start_transaction(root, 1);
 			if (IS_ERR(trans)) {
-				err = PTR_ERR(trans);
-				goto out;
+				ret = err = PTR_ERR(trans);
+				trans = NULL;
+				break;
 			}
 		}
 
-- 
cgit v1.1


From ee4d89f0c4967c624c92516fcc37b41069bfdc23 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Tue, 13 Dec 2011 12:55:58 -0500
Subject: Btrfs: don't panic if orphan item already exists

I've been hitting this BUG_ON() in btrfs_orphan_add when running xfstest 269 in
a loop.  This is because we will add an orphan item, do the truncate, the
truncate will fail for whatever reason (*cough*ENOSPC*cough*) and then we're
left with an orphan item still in the fs.  Then we come back later to do another
truncate and it blows up because we already have an orphan item.  This is ok so
just fix the BUG_ON() to only BUG() if ret is not EEXIST.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6349c63..b212f39 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2032,7 +2032,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
 	/* insert an orphan item to track this unlinked/truncated file */
 	if (insert >= 1) {
 		ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
-		BUG_ON(ret);
+		BUG_ON(ret && ret != -EEXIST);
 	}
 
 	/* insert an orphan item to track subvolume contains orphan files */
-- 
cgit v1.1


From e65cbb94e036058128a5dec6398be2fd64cf88ba Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Tue, 13 Dec 2011 16:04:54 -0500
Subject: Btrfs: only set cache_generation if we setup the block group

A user reported a problem booting into a new kernel with the old format inodes.
He was panicing in cow_file_range while writing out the inode cache.  This is
because if the block group is not cached we'll just skip writing out the cache,
however if it gets dirtied again in the same transaction and it finished caching
we'd go ahead and write it out, but since we set cache_generation to the transid
we think we've already truncated it and will just carry on, running into
cow_file_range and blowing up.  We need to make sure we only set
cache_generation if we've done the truncate.  The user tested this patch and
verified that the panic no longer occured.  Thanks,

Reported-and-Tested-by: Klaus Bitto <klaus.bitto@gmail.com>
Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/extent-tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4eb4d27..8603ee4 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2822,7 +2822,7 @@ out_free:
 	btrfs_release_path(path);
 out:
 	spin_lock(&block_group->lock);
-	if (!ret)
+	if (!ret && dcs == BTRFS_DC_SETUP)
 		block_group->cache_generation = trans->transid;
 	block_group->disk_cache_state = dcs;
 	spin_unlock(&block_group->lock);
-- 
cgit v1.1


From e755d9ab387c4c2401dce5b6f435432a829f41cb Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 15 Dec 2011 13:36:29 -0500
Subject: Btrfs: deal with NULL srv_rsv in the delalloc inode reservation code

btrfs_update_inode is sometimes called with a null reservation.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/delayed-inode.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 6a1a680..c7ddf8a 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -640,8 +640,8 @@ static int btrfs_delayed_inode_reserve_metadata(
 	 * Now if src_rsv == delalloc_block_rsv we'll let it just steal since
 	 * we're accounted for.
 	 */
-	if (!trans->bytes_reserved &&
-	    src_rsv != &root->fs_info->delalloc_block_rsv) {
+	if (!src_rsv || (!trans->bytes_reserved &&
+	    src_rsv != &root->fs_info->delalloc_block_rsv)) {
 		ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
 		/*
 		 * Since we're under a transaction reserve_metadata_bytes could
-- 
cgit v1.1


From d85c8a6f1bc083279215ff6e79b7c292bf3ec905 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 15 Dec 2011 15:38:41 -0500
Subject: Btrfs: unplug every once and a while

The btrfs io submission threads can build up massive plug lists.  This
keeps things more reasonable so we don't hand over huge dumps of IO at
once.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 91ea57a..f4b839f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -295,6 +295,12 @@ loop_lock:
 			btrfs_requeue_work(&device->work);
 			goto done;
 		}
+		/* unplug every 64 requests just for good measure */
+		if (batch_run % 64 == 0) {
+			blk_finish_plug(&plug);
+			blk_start_plug(&plug);
+			sync_pending = 0;
+		}
 	}
 
 	cond_resched();
-- 
cgit v1.1


From 142349f541d0bb6bc3e0d4563268105aada42b0b Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Fri, 16 Dec 2011 12:32:57 -0500
Subject: btrfs: lower the dirty balance poll interval

Tests show that the original large intervals can easily make the dirty
limit exceeded on 100 concurrent dd's. So adapt to as large as the
next check point selected by the dirty throttling algorithm.

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index dafdfa0..52305a8 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1167,6 +1167,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 	nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
 		     PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
 		     (sizeof(struct page *)));
+	nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
+	nrptrs = max(nrptrs, 8);
 	pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
 	if (!pages)
 		return -ENOMEM;
-- 
cgit v1.1