From 96f3136e51580ed68a2952341c8b9e2d7f853472 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Fri, 14 Aug 2015 18:32:46 +0800 Subject: Btrfs: rename btrfs_sysfs_add_one to btrfs_sysfs_add_mounted Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/disk-io.c | 2 +- fs/btrfs/sysfs.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 938efe3..afce306 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -4004,7 +4004,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, /* sysfs.c */ int btrfs_init_sysfs(void); void btrfs_exit_sysfs(void); -int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info); +int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info); void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info); /* xattr.c */ diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 295795a..4622d38 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2933,7 +2933,7 @@ retry_root_backup: goto fail_fsdev_sysfs; } - ret = btrfs_sysfs_add_one(fs_info); + ret = btrfs_sysfs_add_mounted(fs_info); if (ret) { pr_err("BTRFS: failed to init sysfs interface: %d\n", ret); goto fail_fsdev_sysfs; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 603b0cc..cabf840 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -736,7 +736,7 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs, return error; } -int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) +int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) { int error; struct btrfs_fs_devices *fs_devs = fs_info->fs_devices; -- cgit v1.1 From 6618a59bfc0a0490f2a51df37c67878e23285670 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Fri, 14 Aug 2015 18:32:47 +0800 Subject: Btrfs: rename btrfs_sysfs_remove_one to btrfs_sysfs_remove_mounted Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/disk-io.c | 4 ++-- fs/btrfs/sysfs.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index afce306..4484063 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -4005,7 +4005,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, int btrfs_init_sysfs(void); void btrfs_exit_sysfs(void); int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info); -void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info); +void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info); /* xattr.c */ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4622d38..7673473 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3113,7 +3113,7 @@ fail_cleaner: filemap_write_and_wait(fs_info->btree_inode->i_mapping); fail_sysfs: - btrfs_sysfs_remove_one(fs_info); + btrfs_sysfs_remove_mounted(fs_info); fail_fsdev_sysfs: btrfs_sysfs_remove_fsid(fs_info->fs_devices); @@ -3788,7 +3788,7 @@ void close_ctree(struct btrfs_root *root) percpu_counter_sum(&fs_info->delalloc_bytes)); } - btrfs_sysfs_remove_one(fs_info); + btrfs_sysfs_remove_mounted(fs_info); btrfs_sysfs_remove_fsid(fs_info->fs_devices); btrfs_free_fs_roots(fs_info); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index cabf840..095a302 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -545,7 +545,7 @@ void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) } } -void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info) +void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) { 
btrfs_reset_fs_info_ptr(fs_info); @@ -776,7 +776,7 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) return 0; failure: - btrfs_sysfs_remove_one(fs_info); + btrfs_sysfs_remove_mounted(fs_info); return error; } -- cgit v1.1 From e3bd6973bcf134a56786a8bd248d1740249352ec Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Fri, 14 Aug 2015 18:32:48 +0800 Subject: Btrfs: rename btrfs_kobj_add_device to btrfs_sysfs_add_device_link Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 2 +- fs/btrfs/sysfs.c | 4 ++-- fs/btrfs/sysfs.h | 2 +- fs/btrfs/volumes.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index e54dd59..200c972 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -375,7 +375,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, WARN_ON(!tgt_device); dev_replace->tgtdev = tgt_device; - ret = btrfs_kobj_add_device(tgt_device->fs_devices, tgt_device); + ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device); if (ret) btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 095a302..df67f6b 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -683,7 +683,7 @@ int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs) return 0; } -int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices, +int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device) { int error = 0; @@ -744,7 +744,7 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) btrfs_set_fs_info_ptr(fs_info); - error = btrfs_kobj_add_device(fs_devs, NULL); + error = btrfs_sysfs_add_device_link(fs_devs, NULL); if (error) return error; diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 6392527..6529680 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -82,7 +82,7 @@ char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); extern const char * const btrfs_feature_set_names[3]; extern struct kobj_type space_info_ktype; extern struct kobj_type btrfs_raid_ktype; -int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices, +int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 6fc73586..d22029a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2309,7 +2309,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) tmp + 1); /* add sysfs device entry */ - btrfs_kobj_add_device(root->fs_info->fs_devices, device); + btrfs_sysfs_add_device_link(root->fs_info->fs_devices, device); /* * we've got more storage, clear any full flags on the space -- cgit v1.1 From 325760404820e070bb20be0ce57e8d684d69a2ac Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Fri, 14 Aug 2015 18:32:49 +0800 Subject: Btrfs: rename btrfs_kobj_rm_device to btrfs_sysfs_rm_device_link Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 2 +- fs/btrfs/sysfs.c | 6 +++--- fs/btrfs/sysfs.h | 2 +- fs/btrfs/volumes.c | 6 +++--- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 200c972..7aed8a8 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -586,7 +586,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&uuid_mutex); /* replace 
the sysfs entry */ - btrfs_kobj_rm_device(fs_info->fs_devices, src_device); + btrfs_sysfs_rm_device_link(fs_info->fs_devices, src_device); btrfs_rm_dev_replace_free_srcdev(fs_info, src_device); /* write back the superblocks */ diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index df67f6b..52319d1 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -557,7 +557,7 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) addrm_unknown_feature_attrs(fs_info, false); sysfs_remove_group(&fs_info->fs_devices->super_kobj, &btrfs_feature_attr_group); sysfs_remove_files(&fs_info->fs_devices->super_kobj, btrfs_attrs); - btrfs_kobj_rm_device(fs_info->fs_devices, NULL); + btrfs_sysfs_rm_device_link(fs_info->fs_devices, NULL); } const char * const btrfs_feature_set_names[3] = { @@ -637,7 +637,7 @@ static void init_feature_attrs(void) /* when one_device is NULL, it removes all device links */ -int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices, +int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device) { struct hd_struct *disk; @@ -750,7 +750,7 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) error = sysfs_create_files(super_kobj, btrfs_attrs); if (error) { - btrfs_kobj_rm_device(fs_devs, NULL); + btrfs_sysfs_rm_device_link(fs_devs, NULL); return error; } diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 6529680..9c09522 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -84,7 +84,7 @@ extern struct kobj_type space_info_ktype; extern struct kobj_type btrfs_raid_ktype; int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); -int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices, +int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs, struct kobject *parent); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d22029a..003d3dd 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1801,7 +1801,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) if (device->bdev) { device->fs_devices->open_devices--; /* remove sysfs entry */ - btrfs_kobj_rm_device(root->fs_info->fs_devices, device); + btrfs_sysfs_rm_device_link(root->fs_info->fs_devices, device); } call_rcu(&device->rcu, free_device); @@ -1971,7 +1971,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, WARN_ON(!tgtdev); mutex_lock(&fs_info->fs_devices->device_list_mutex); - btrfs_kobj_rm_device(fs_info->fs_devices, tgtdev); + btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev); if (tgtdev->bdev) { btrfs_scratch_superblock(tgtdev); @@ -2388,7 +2388,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) error_trans: btrfs_end_transaction(trans, root); rcu_string_free(device->name); - btrfs_kobj_rm_device(root->fs_info->fs_devices, device); + btrfs_sysfs_rm_device_link(root->fs_info->fs_devices, device); kfree(device); error: blkdev_put(bdev, FMODE_EXCL); -- cgit v1.1 From c1b7e474595b1b3a4463450cc22aa18b926093a3 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Fri, 14 Aug 2015 18:32:50 +0800 Subject: Btrfs: rename super_kobj to fsid_kobj Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 36 ++++++++++++++++++------------------ fs/btrfs/volumes.c | 2 +- fs/btrfs/volumes.h | 2 +- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 52319d1..e0ac859 100644 --- 
a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -437,24 +437,24 @@ static const struct attribute *btrfs_attrs[] = { NULL, }; -static void btrfs_release_super_kobj(struct kobject *kobj) +static void btrfs_release_fsid_kobj(struct kobject *kobj) { struct btrfs_fs_devices *fs_devs = to_fs_devs(kobj); - memset(&fs_devs->super_kobj, 0, sizeof(struct kobject)); + memset(&fs_devs->fsid_kobj, 0, sizeof(struct kobject)); complete(&fs_devs->kobj_unregister); } static struct kobj_type btrfs_ktype = { .sysfs_ops = &kobj_sysfs_ops, - .release = btrfs_release_super_kobj, + .release = btrfs_release_fsid_kobj, }; static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj) { if (kobj->ktype != &btrfs_ktype) return NULL; - return container_of(kobj, struct btrfs_fs_devices, super_kobj); + return container_of(kobj, struct btrfs_fs_devices, fsid_kobj); } static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj) @@ -502,12 +502,12 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add) attrs[0] = &fa->kobj_attr.attr; if (add) { int ret; - ret = sysfs_merge_group(&fs_info->fs_devices->super_kobj, + ret = sysfs_merge_group(&fs_info->fs_devices->fsid_kobj, &agroup); if (ret) return ret; } else - sysfs_unmerge_group(&fs_info->fs_devices->super_kobj, + sysfs_unmerge_group(&fs_info->fs_devices->fsid_kobj, &agroup); } @@ -523,9 +523,9 @@ static void __btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) fs_devs->device_dir_kobj = NULL; } - if (fs_devs->super_kobj.state_initialized) { - kobject_del(&fs_devs->super_kobj); - kobject_put(&fs_devs->super_kobj); + if (fs_devs->fsid_kobj.state_initialized) { + kobject_del(&fs_devs->fsid_kobj); + kobject_put(&fs_devs->fsid_kobj); wait_for_completion(&fs_devs->kobj_unregister); } } @@ -555,8 +555,8 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) kobject_put(fs_info->space_info_kobj); } addrm_unknown_feature_attrs(fs_info, false); - sysfs_remove_group(&fs_info->fs_devices->super_kobj, &btrfs_feature_attr_group); - sysfs_remove_files(&fs_info->fs_devices->super_kobj, btrfs_attrs); + sysfs_remove_group(&fs_info->fs_devices->fsid_kobj, &btrfs_feature_attr_group); + sysfs_remove_files(&fs_info->fs_devices->fsid_kobj, btrfs_attrs); btrfs_sysfs_rm_device_link(fs_info->fs_devices, NULL); } @@ -675,7 +675,7 @@ int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs) { if (!fs_devs->device_dir_kobj) fs_devs->device_dir_kobj = kobject_create_and_add("devices", - &fs_devs->super_kobj); + &fs_devs->fsid_kobj); if (!fs_devs->device_dir_kobj) return -ENOMEM; @@ -730,8 +730,8 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs, int error; init_completion(&fs_devs->kobj_unregister); - fs_devs->super_kobj.kset = btrfs_kset; - error = kobject_init_and_add(&fs_devs->super_kobj, + fs_devs->fsid_kobj.kset = btrfs_kset; + error = kobject_init_and_add(&fs_devs->fsid_kobj, &btrfs_ktype, parent, "%pU", fs_devs->fsid); return error; } @@ -740,7 +740,7 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) { int error; struct btrfs_fs_devices *fs_devs = fs_info->fs_devices; - struct kobject *super_kobj = &fs_devs->super_kobj; + struct kobject *fsid_kobj = &fs_devs->fsid_kobj; btrfs_set_fs_info_ptr(fs_info); @@ -748,13 +748,13 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) if (error) return error; - error = sysfs_create_files(super_kobj, btrfs_attrs); + error = sysfs_create_files(fsid_kobj, btrfs_attrs); if (error) { btrfs_sysfs_rm_device_link(fs_devs, NULL); return error; } - error = 
sysfs_create_group(super_kobj, + error = sysfs_create_group(fsid_kobj, &btrfs_feature_attr_group); if (error) goto failure; @@ -764,7 +764,7 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) goto failure; fs_info->space_info_kobj = kobject_create_and_add("allocation", - super_kobj); + fsid_kobj); if (!fs_info->space_info_kobj) { error = -ENOMEM; goto failure; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 003d3dd..6438792 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2350,7 +2350,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) */ snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", root->fs_info->fsid); - if (kobject_rename(&root->fs_info->fs_devices->super_kobj, + if (kobject_rename(&root->fs_info->fs_devices->fsid_kobj, fsid_buf)) pr_warn("BTRFS: sysfs: failed to create fsid for sprout\n"); } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 2ca784a..413a70c 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -256,7 +256,7 @@ struct btrfs_fs_devices { struct btrfs_fs_info *fs_info; /* sysfs kobjects */ - struct kobject super_kobj; + struct kobject fsid_kobj; struct kobject *device_dir_kobj; struct completion kobj_unregister; }; -- cgit v1.1 From 92fc03fbdcbe2523be3f7e6b8e95fee9563a10d2 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Fri, 14 Aug 2015 18:32:51 +0800 Subject: Btrfs: SB read failure should return EIO for __bread failure This will return EIO when __bread() fails to read SB, instead of EINVAL. Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 18 +++++++++++++++--- fs/btrfs/volumes.c | 8 ++++---- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 7673473..357d9f9 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2653,8 +2653,8 @@ int open_ctree(struct super_block *sb, * Read super block and check the signature bytes only */ bh = btrfs_read_dev_super(fs_devices->latest_bdev); - if (!bh) { - err = -EINVAL; + if (IS_ERR(bh)) { + err = PTR_ERR(bh); goto fail_alloc; } @@ -3196,6 +3196,7 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) int i; u64 transid = 0; u64 bytenr; + int ret = -EINVAL; /* we would like to check all the supers, but that would make * a btrfs mount succeed after a mkfs from a different FS. @@ -3209,13 +3210,20 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) break; bh = __bread(bdev, bytenr / 4096, BTRFS_SUPER_INFO_SIZE); - if (!bh) + /* + * If we fail to read from the underlying devices, as of now + * the best option we have is to mark it EIO. 
+ */ + if (!bh) { + ret = -EIO; continue; + } super = (struct btrfs_super_block *)bh->b_data; if (btrfs_super_bytenr(super) != bytenr || btrfs_super_magic(super) != BTRFS_MAGIC) { brelse(bh); + ret = -EINVAL; continue; } @@ -3227,6 +3235,10 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) brelse(bh); } } + + if (!latest) + return ERR_PTR(ret); + return latest; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 6438792..e8a6182 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -211,8 +211,8 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, } invalidate_bdev(*bdev); *bh = btrfs_read_dev_super(*bdev); - if (!*bh) { - ret = -EINVAL; + if (IS_ERR(*bh)) { + ret = PTR_ERR(*bh); blkdev_put(*bdev, flags); goto error; } @@ -6746,8 +6746,8 @@ int btrfs_scratch_superblock(struct btrfs_device *device) struct btrfs_super_block *disk_super; bh = btrfs_read_dev_super(device->bdev); - if (!bh) - return -EINVAL; + if (IS_ERR(bh)) + return PTR_ERR(bh); disk_super = (struct btrfs_super_block *)bh->b_data; memset(&disk_super->magic, 0, sizeof(disk_super->magic)); -- cgit v1.1 From 57d816a15ba2c2690c57635134bc01cf4da4623c Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Fri, 14 Aug 2015 18:32:52 +0800 Subject: Btrfs: __btrfs_std_error() logic should be consistent w/out CONFIG_PRINTK defined The error handling logic behaves differently with and without CONFIG_PRINTK defined, since there are two copies of the same function with slightly different logic. One, when CONFIG_PRINTK is defined, the code is __btrfs_std_error(..) { :: save_error_info(fs_info); if (sb->s_flags & MS_BORN) btrfs_handle_error(fs_info); } and two, when CONFIG_PRINTK is not defined, the code is __btrfs_std_error(..) { :: if (sb->s_flags & MS_BORN) { save_error_info(fs_info); btrfs_handle_error(fs_info); } } It is doubtful that this divergence was intentional; it appears to have happened because we maintain two copies of the same function and they diverged over successive commits. To decide which logic is correct, the following changes were reviewed: commit 533574c6bc30cf526cc1c41bde050c854a945efb added the two copies of this function, and commit cf79ffb5b79e8a2b587fbf218809e691bb396c98 changed only one copy, the one compiled when CONFIG_PRINTK is defined. To fix this, instead of maintaining two copies of the same function, maintain a single function and put only the printing portion of the code under the CONFIG_PRINTK define, keeping the logic of the CONFIG_PRINTK version. This patch does exactly that. Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/super.c | 27 +++++---------------------- 1 file changed, 5 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 11d1eab..b23d49d 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -130,7 +130,6 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) } } -#ifdef CONFIG_PRINTK /* * __btrfs_std_error decodes expected errors from the caller and * invokes the approciate error response. */ @@ -140,7 +139,9 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...) 
{ struct super_block *sb = fs_info->sb; +#ifdef CONFIG_PRINTK const char *errstr; +#endif /* * Special case: if the error is EROFS, and we're already * under MS_RDONLY, then it is safe here. */ if (errno == -EROFS && (sb->s_flags & MS_RDONLY)) return; +#ifdef CONFIG_PRINTK errstr = btrfs_decode_error(errno); if (fmt) { struct va_format vaf; @@ -166,6 +168,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, printk(KERN_CRIT "BTRFS: error (device %s) in %s:%d: errno=%d %s\n", sb->s_id, function, line, errno, errstr); } +#endif /* Don't go through full error handling during mount */ save_error_info(fs_info); @@ -173,6 +176,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, btrfs_handle_error(fs_info); } +#ifdef CONFIG_PRINTK static const char * const logtypes[] = { "emergency", "alert", @@ -212,27 +216,6 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) va_end(args); } - -#else - -void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...) -{ - struct super_block *sb = fs_info->sb; - - /* - * Special case: if the error is EROFS, and we're already - * under MS_RDONLY, then it is safe here. - */ - if (errno == -EROFS && (sb->s_flags & MS_RDONLY)) - return; - - /* Don't go through full error handling during mount */ - if (sb->s_flags & MS_BORN) { - save_error_info(fs_info); - btrfs_handle_error(fs_info); - } -} #endif /* -- cgit v1.1 From a4553fefb59cb0336f543fa567170b47e90142a9 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Fri, 25 Sep 2015 14:43:01 +0800 Subject: Btrfs: consolidate btrfs_error() to btrfs_std_error() btrfs_error() and btrfs_std_error() do the same thing and call _btrfs_std_error(), so consolidate them. The main motivation is that btrfs_error() is too closely named to btrfs_err(): one performs the error handling action while the other only logs the error, so they should not be named so similarly. 
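To make the consolidated calling convention concrete, here is a minimal userspace sketch of the same pattern: a single variadic error helper behind one macro, where callers pass NULL when they have no extra detail to report. All names in it (struct fs_info, report_error, std_error) are hypothetical stand-ins for illustration, not the kernel API:

#include <stdio.h>
#include <stdarg.h>

struct fs_info { const char *id; };

/* Print the error context; the format string is optional. */
static void report_error(struct fs_info *info, const char *function,
			 unsigned int line, int errno_, const char *fmt, ...)
{
	va_list args;

	fprintf(stderr, "error (device %s) in %s:%u: errno=%d",
		info->id, function, line, errno_);
	if (fmt) {	/* callers may pass NULL when there is no detail */
		va_start(args, fmt);
		fprintf(stderr, ": ");
		vfprintf(stderr, fmt, args);
		va_end(args);
	}
	fputc('\n', stderr);
}

/* One macro covers both the plain and the formatted call sites, the
 * same shape as btrfs_std_error(fs_info, errno, NULL) after this patch. */
#define std_error(info, errno_, fmt, args...) \
	report_error((info), __func__, __LINE__, (errno_), fmt, ##args)

int main(void)
{
	struct fs_info info = { "sda1" };

	std_error(&info, -30, NULL);	/* no extra detail */
	std_error(&info, -5, "%d errors while writing supers", 3);
	return 0;
}

With fmt as an explicit, possibly-NULL parameter, the two former entry points collapse into a single macro, which is what allows this patch to keep only btrfs_std_error().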
Signed-off-by: Anand Jain Suggested-by: David Sterba Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 6 +++--- fs/btrfs/ctree.h | 9 +-------- fs/btrfs/disk-io.c | 8 ++++---- fs/btrfs/extent-tree.c | 2 +- fs/btrfs/inode-item.c | 2 +- fs/btrfs/ioctl.c | 2 +- fs/btrfs/relocation.c | 2 +- fs/btrfs/root-tree.c | 4 ++-- fs/btrfs/transaction.c | 2 +- fs/btrfs/tree-log.c | 8 ++++---- fs/btrfs/volumes.c | 14 +++++++------- 11 files changed, 26 insertions(+), 33 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 5f745ea..1063315 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1011,7 +1011,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, return ret; if (refs == 0) { ret = -EROFS; - btrfs_std_error(root->fs_info, ret); + btrfs_std_error(root->fs_info, ret, NULL); return ret; } } else { @@ -1927,7 +1927,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, child = read_node_slot(root, mid, 0); if (!child) { ret = -EROFS; - btrfs_std_error(root->fs_info, ret); + btrfs_std_error(root->fs_info, ret, NULL); goto enospc; } @@ -2030,7 +2030,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, */ if (!left) { ret = -EROFS; - btrfs_std_error(root->fs_info, ret); + btrfs_std_error(root->fs_info, ret, NULL); goto enospc; } wret = balance_node_right(trans, root, mid, left); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4484063..a86051e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -4127,14 +4127,7 @@ do { \ __LINE__, (errno)); \ } while (0) -#define btrfs_std_error(fs_info, errno) \ -do { \ - if ((errno)) \ - __btrfs_std_error((fs_info), __func__, \ - __LINE__, (errno), NULL); \ -} while (0) - -#define btrfs_error(fs_info, errno, fmt, args...) \ +#define btrfs_std_error(fs_info, errno, fmt, args...) 
\ do { \ __btrfs_std_error((fs_info), __func__, __LINE__, \ (errno), fmt, ##args); \ diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 357d9f9..0f8e33f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2377,7 +2377,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, /* returns with log_tree_root freed on success */ ret = btrfs_recover_log_trees(log_tree_root); if (ret) { - btrfs_error(tree_root->fs_info, ret, + btrfs_std_error(tree_root->fs_info, ret, "Failed to recover log tree"); free_extent_buffer(log_tree_root->node); kfree(log_tree_root); @@ -3556,7 +3556,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) if (ret) { mutex_unlock( &root->fs_info->fs_devices->device_list_mutex); - btrfs_error(root->fs_info, ret, + btrfs_std_error(root->fs_info, ret, "errors while submitting device barriers."); return ret; } @@ -3596,7 +3596,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); /* FUA is masked off if unsupported and can't be the reason */ - btrfs_error(root->fs_info, -EIO, + btrfs_std_error(root->fs_info, -EIO, "%d errors while writing supers", total_errors); return -EIO; } @@ -3614,7 +3614,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) } mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); if (total_errors > max_errors) { - btrfs_error(root->fs_info, -EIO, + btrfs_std_error(root->fs_info, -EIO, "%d errors while writing supers", total_errors); return -EIO; } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9f96042..5069f1b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -8688,7 +8688,7 @@ out: if (!for_reloc && root_dropped == false) btrfs_add_dead_root(root); if (err && err != -EAGAIN) - btrfs_std_error(root->fs_info, err); + btrfs_std_error(root->fs_info, err, NULL); return err; } diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 265e03c..be4d22a 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -157,7 +157,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, */ if (!btrfs_find_name_in_ext_backref(path, ref_objectid, name, name_len, &extref)) { - btrfs_std_error(root->fs_info, -ENOENT); + btrfs_std_error(root->fs_info, -ENOENT, NULL); ret = -EROFS; goto out; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0adf542..f704d1c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4806,7 +4806,7 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) /* update qgroup status and info */ err = btrfs_run_qgroups(trans, root->fs_info); if (err < 0) - btrfs_error(root->fs_info, ret, + btrfs_std_error(root->fs_info, ret, "failed to update qgroup status and info\n"); err = btrfs_end_transaction(trans, root); if (err && !ret) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 303babe..58ede0a 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2418,7 +2418,7 @@ again: } out: if (ret) { - btrfs_std_error(root->fs_info, ret); + btrfs_std_error(root->fs_info, ret, NULL); if (!list_empty(&reloc_roots)) free_reloc_roots(&reloc_roots); diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 360a728..6d90851 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -283,7 +283,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) trans = btrfs_join_transaction(tree_root); if (IS_ERR(trans)) { err = PTR_ERR(trans); - btrfs_error(tree_root->fs_info, err, + 
btrfs_std_error(tree_root->fs_info, err, "Failed to start trans to delete " "orphan item"); break; @@ -292,7 +292,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) root_key.objectid); btrfs_end_transaction(trans, tree_root); if (err) { - btrfs_error(tree_root->fs_info, err, + btrfs_std_error(tree_root->fs_info, err, "Failed to delete root orphan " "item"); break; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 74bc333..3482d9e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2135,7 +2135,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ret = btrfs_write_and_wait_transaction(trans, root); if (ret) { - btrfs_error(root->fs_info, ret, + btrfs_std_error(root->fs_info, ret, "Error while writing out transaction"); mutex_unlock(&root->fs_info->tree_log_mutex); goto scrub_continue; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 1bbaace..c3f9a9c 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -5314,7 +5314,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) ret = walk_log_tree(trans, log_root_tree, &wc); if (ret) { - btrfs_error(fs_info, ret, "Failed to pin buffers while " + btrfs_std_error(fs_info, ret, "Failed to pin buffers while " "recovering log root tree."); goto error; } @@ -5328,7 +5328,7 @@ again: ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); if (ret < 0) { - btrfs_error(fs_info, ret, + btrfs_std_error(fs_info, ret, "Couldn't find tree log root."); goto error; } @@ -5346,7 +5346,7 @@ again: log = btrfs_read_fs_root(log_root_tree, &found_key); if (IS_ERR(log)) { ret = PTR_ERR(log); - btrfs_error(fs_info, ret, + btrfs_std_error(fs_info, ret, "Couldn't read tree log root."); goto error; } @@ -5361,7 +5361,7 @@ again: free_extent_buffer(log->node); free_extent_buffer(log->commit_root); kfree(log); - btrfs_error(fs_info, ret, "Couldn't read target root " + btrfs_std_error(fs_info, ret, "Couldn't read target root " "for tree log recovery."); goto error; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e8a6182..99f294c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1402,7 +1402,7 @@ again: extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); } else { - btrfs_error(root->fs_info, ret, "Slot search failed"); + btrfs_std_error(root->fs_info, ret, "Slot search failed"); goto out; } @@ -1410,7 +1410,7 @@ again: ret = btrfs_del_item(trans, root, path); if (ret) { - btrfs_error(root->fs_info, ret, + btrfs_std_error(root->fs_info, ret, "Failed to remove dev extent item"); } else { trans->transaction->have_free_bgs = 1; @@ -2368,7 +2368,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) ret = btrfs_relocate_sys_chunks(root); if (ret < 0) - btrfs_error(root->fs_info, ret, + btrfs_std_error(root->fs_info, ret, "Failed to relocate sys chunks after " "device initialization. 
This can be fixed " "using the \"btrfs balance\" command."); @@ -2613,7 +2613,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, if (ret < 0) goto out; else if (ret > 0) { /* Logic error or corruption */ - btrfs_error(root->fs_info, -ENOENT, + btrfs_std_error(root->fs_info, -ENOENT, "Failed lookup while freeing chunk."); ret = -ENOENT; goto out; @@ -2621,7 +2621,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, ret = btrfs_del_item(trans, root, path); if (ret < 0) - btrfs_error(root->fs_info, ret, + btrfs_std_error(root->fs_info, ret, "Failed to delete chunk item."); out: btrfs_free_path(path); @@ -2806,7 +2806,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset) trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - btrfs_std_error(root->fs_info, ret); + btrfs_std_error(root->fs_info, ret, NULL); return ret; } @@ -3461,7 +3461,7 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info) unset_balance_control(fs_info); ret = del_balance_item(fs_info->tree_root); if (ret) - btrfs_std_error(fs_info, ret); + btrfs_std_error(fs_info, ret, NULL); atomic_set(&fs_info->mutually_exclusive_operation_running, 0); } -- cgit v1.1 From d74a625987a134b00749e3912bdb6d3ac83fd71d Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Fri, 14 Aug 2015 18:32:56 +0800 Subject: Btrfs: use BTRFS_ERROR_DEV_MISSING_NOT_FOUND when missing device is not found Use the btrfs-specific error code BTRFS_ERROR_DEV_MISSING_NOT_FOUND instead of -ENOENT. This also removes the logging when the user specifies "missing" and we don't find it in the kernel device list. Logging is for system events, not for user input errors. Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 99f294c..c3e2591 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2041,10 +2041,8 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, } } - if (!*device) { - btrfs_err(root->fs_info, "no missing device found"); - return -ENOENT; - } + if (!*device) + return BTRFS_ERROR_DEV_MISSING_NOT_FOUND; return 0; } else { -- cgit v1.1 From 29c36d72535deb3d8961b3fb4b6a565190a6c63b Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Fri, 14 Aug 2015 18:32:58 +0800 Subject: Btrfs: add btrfs_read_dev_one_super() to read one specific SB This takes a chunk of code from btrfs_read_dev_super() and creates a function called btrfs_read_dev_one_super(), so that the next patch can use it for scratching superblocks. 
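As background for the new copy_num parameter: btrfs keeps up to three superblock copies at fixed byte offsets on each device, and btrfs_sb_offset() maps a copy number to its offset. The following small userspace sketch reproduces that math; the constants are copied here from the kernel headers as an assumption of this illustration, and the kernel's btrfs_sb_offset() remains authoritative:

#include <stdio.h>
#include <stdint.h>

/* Values as defined in the btrfs headers (assumed here for the sketch). */
#define BTRFS_SUPER_INFO_OFFSET		(64 * 1024ULL)	/* primary copy */
#define BTRFS_SUPER_MIRROR_MAX		3
#define BTRFS_SUPER_MIRROR_SHIFT	12

/* Copy 0 lives at 64KiB; further copies at 16KiB << (12 * copy_num). */
static uint64_t sb_offset(int copy_num)
{
	uint64_t start = 16 * 1024ULL;

	if (copy_num)
		return start << (BTRFS_SUPER_MIRROR_SHIFT * copy_num);
	return BTRFS_SUPER_INFO_OFFSET;
}

int main(void)
{
	/* The loop shape used by callers of btrfs_read_dev_one_super():
	 * try each copy, skipping those the device is too small to hold. */
	for (int i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++)
		printf("superblock copy %d at byte offset %llu\n",
		       i, (unsigned long long)sb_offset(i));
	return 0;
}

This prints offsets of 64KiB, 64MiB and 256GiB, which is why btrfs_read_dev_one_super() first checks that bytenr + BTRFS_SUPER_INFO_SIZE still fits within the device.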
Signed-off-by: Anand Jain [renamed bufhead to bh] Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 53 +++++++++++++++++++++++++++++++++-------------------- fs/btrfs/disk-io.h | 2 ++ 2 files changed, 35 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0f8e33f..c7c739f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3188,6 +3188,37 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) put_bh(bh); } +int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num, + struct buffer_head **bh_ret) +{ + struct buffer_head *bh; + struct btrfs_super_block *super; + u64 bytenr; + + bytenr = btrfs_sb_offset(copy_num); + if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode)) + return -EINVAL; + + bh = __bread(bdev, bytenr / 4096, BTRFS_SUPER_INFO_SIZE); + /* + * If we fail to read from the underlying devices, as of now + * the best option we have is to mark it EIO. + */ + if (!bh) + return -EIO; + + super = (struct btrfs_super_block *)bh->b_data; + if (btrfs_super_bytenr(super) != bytenr || + btrfs_super_magic(super) != BTRFS_MAGIC) { + brelse(bh); + return -EINVAL; + } + + *bh_ret = bh; + return 0; +} + + struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) { struct buffer_head *bh; @@ -3195,7 +3226,6 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) struct btrfs_super_block *super; int i; u64 transid = 0; - u64 bytenr; int ret = -EINVAL; /* we would like to check all the supers, but that would make @@ -3204,28 +3234,11 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) * later supers, using BTRFS_SUPER_MIRROR_MAX instead */ for (i = 0; i < 1; i++) { - bytenr = btrfs_sb_offset(i); - if (bytenr + BTRFS_SUPER_INFO_SIZE >= - i_size_read(bdev->bd_inode)) - break; - bh = __bread(bdev, bytenr / 4096, - BTRFS_SUPER_INFO_SIZE); - /* - * If we fail to read from the underlying devices, as of now - * the best option we have is to mark it EIO. - */ - if (!bh) { - ret = -EIO; + ret = btrfs_read_dev_one_super(bdev, i, &bh); + if (ret) continue; - } super = (struct btrfs_super_block *)bh->b_data; - if (btrfs_super_bytenr(super) != bytenr || - btrfs_super_magic(super) != BTRFS_MAGIC) { - brelse(bh); - ret = -EINVAL; - continue; - } if (!latest || btrfs_super_generation(super) > transid) { brelse(latest); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index bdfb479..adeb318 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -60,6 +60,8 @@ void close_ctree(struct btrfs_root *root); int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root, int max_mirrors); struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); +int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num, + struct buffer_head **bh_ret); int btrfs_commit_super(struct btrfs_root *root); struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr); -- cgit v1.1 From 12b1c2637b6e0763121648d637ffe603b422e986 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Fri, 14 Aug 2015 18:32:59 +0800 Subject: Btrfs: enhance btrfs_scratch_superblock to scratch all superblocks This patch updates and renames btrfs_scratch_superblock (which is used by the device replace thread) to btrfs_scratch_superblocks, with the fixes from the scratch superblock code section of btrfs_rm_device(). 
The fixes are: scratch all copies of the superblock, notify the kobject that the superblock has been changed, and update the time on the device. With these in one place, btrfs_rm_device() can use btrfs_scratch_superblocks() instead of its own scratch code, and the device replace code, which similarly releases a device back to the system, will pick up the same fixes from the btrfs device delete path. Signed-off-by: Anand Jain [renamed to btrfs_scratch_superblock] Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 38 ++++++++++++++++++++++++++------------ fs/btrfs/volumes.h | 2 +- 2 files changed, 27 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c3e2591..1cde784 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1924,7 +1924,8 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, if (srcdev->writeable) { fs_devices->rw_devices--; /* zero out the old super if it is writable */ - btrfs_scratch_superblock(srcdev); + btrfs_scratch_superblocks(srcdev->bdev, + rcu_str_deref(srcdev->name)); } if (srcdev->bdev) @@ -1974,7 +1975,8 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev); if (tgtdev->bdev) { - btrfs_scratch_superblock(tgtdev); + btrfs_scratch_superblocks(tgtdev->bdev, + rcu_str_deref(tgtdev->name)); fs_info->fs_devices->open_devices--; } fs_info->fs_devices->num_devices--; @@ -6738,22 +6740,34 @@ int btrfs_get_dev_stats(struct btrfs_root *root, return 0; } -int btrfs_scratch_superblock(struct btrfs_device *device) +void btrfs_scratch_superblocks(struct block_device *bdev, char *device_path) { struct buffer_head *bh; struct btrfs_super_block *disk_super; + int copy_num; - bh = btrfs_read_dev_super(device->bdev); - if (IS_ERR(bh)) - return PTR_ERR(bh); - disk_super = (struct btrfs_super_block *)bh->b_data; + if (!bdev) + return; - memset(&disk_super->magic, 0, sizeof(disk_super->magic)); - set_buffer_dirty(bh); - sync_dirty_buffer(bh); - brelse(bh); + for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; + copy_num++) { - return 0; + if (btrfs_read_dev_one_super(bdev, copy_num, &bh)) + continue; + + disk_super = (struct btrfs_super_block *)bh->b_data; + + memset(&disk_super->magic, 0, sizeof(disk_super->magic)); + set_buffer_dirty(bh); + sync_dirty_buffer(bh); + brelse(bh); + } + + /* Notify udev that device has changed */ + btrfs_kobject_uevent(bdev, KOBJ_CHANGE); + + /* Update ctime/mtime for device path for libblkid */ + update_dev_time(device_path); } /* diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 413a70c..8663bf7 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -474,7 +474,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, struct btrfs_device *tgtdev); void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, struct btrfs_device *tgtdev); -int btrfs_scratch_superblock(struct btrfs_device *device); +void btrfs_scratch_superblocks(struct block_device *bdev, char *device_path); int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len, int mirror_num); unsigned long btrfs_full_stripe_len(struct btrfs_root *root, -- cgit v1.1 From 9e271ae27e4407388c5b25c747abc25c07838b0b Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Fri, 14 Aug 2015 18:33:02 +0800 Subject: Btrfs: kernel operation should come after user input has been verified As a general rule of thumb, there shouldn't be any way for userland to trigger a kernel operation just by sending wrong arguments. 
Here, the transaction commit is done only after the user input has been verified. Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 7aed8a8..02031fc 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -327,19 +327,6 @@ int btrfs_dev_replace_start(struct btrfs_root *root, args->start.tgtdev_name[0] == '\0') return -EINVAL; - /* - * Here we commit the transaction to make sure commit_total_bytes - * of all the devices are updated. - */ - trans = btrfs_attach_transaction(root); - if (!IS_ERR(trans)) { - ret = btrfs_commit_transaction(trans, root); - if (ret) - return ret; - } else if (PTR_ERR(trans) != -ENOENT) { - return PTR_ERR(trans); - } - /* the disk copy procedure reuses the scrub code */ mutex_lock(&fs_info->volume_mutex); ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid, @@ -356,6 +343,19 @@ int btrfs_dev_replace_start(struct btrfs_root *root, if (ret) return ret; + /* + * Here we commit the transaction to make sure commit_total_bytes + * of all the devices are updated. + */ + trans = btrfs_attach_transaction(root); + if (!IS_ERR(trans)) { + ret = btrfs_commit_transaction(trans, root); + if (ret) + return ret; + } else if (PTR_ERR(trans) != -ENOENT) { + return PTR_ERR(trans); + } + btrfs_dev_replace_lock(dev_replace); switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: -- cgit v1.1 From 097efc966ab3e2c3677698f4ab5b229bfff101dd Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Fri, 14 Aug 2015 18:33:04 +0800 Subject: Btrfs: don't log error from btrfs_get_bdev_and_sb Originally the message was not in a helper but ended up there. We should print error messages from callers instead. 
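The principle behind this last patch, reporting failures at the call site that has context rather than inside a shared helper, can be sketched as follows (a userspace illustration with hypothetical names, not the kernel API):

#include <stdio.h>
#include <errno.h>

/* The shared helper stays silent and only returns an error code... */
static int open_device(const char *path, int *fd_out)
{
	(void)path;
	(void)fd_out;
	return -ENOENT;		/* pretend the open failed */
}

/* ...while each caller decides how, and whether, to report the failure,
 * since only the caller knows what it means for the operation at hand. */
int main(void)
{
	int fd;
	int ret = open_device("/dev/sdx", &fd);

	if (ret) {
		fprintf(stderr, "device replace: cannot open /dev/sdx: %d\n",
			ret);
		return 1;
	}
	return 0;
}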
Signed-off-by: Anand Jain [reworded subject and changelog] Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1cde784..694d82b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -198,7 +198,6 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, if (IS_ERR(*bdev)) { ret = PTR_ERR(*bdev); - printk(KERN_INFO "BTRFS: open %s failed\n", device_path); goto error; } -- cgit v1.1 From f190aa471a7a703444f6587ed79c24a04a22d848 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Fri, 14 Aug 2015 18:33:05 +0800 Subject: Btrfs: add helper for closing one device Signed-off-by: Anand Jain [reworded subject and changelog] Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 66 +++++++++++++++++++++++++++++------------------------- fs/btrfs/volumes.h | 1 + 2 files changed, 37 insertions(+), 30 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 694d82b..5688b78 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -764,36 +764,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) { - struct btrfs_device *new_device; - struct rcu_string *name; - - if (device->bdev) - fs_devices->open_devices--; - - if (device->writeable && - device->devid != BTRFS_DEV_REPLACE_DEVID) { - list_del_init(&device->dev_alloc_list); - fs_devices->rw_devices--; - } - - if (device->missing) - fs_devices->missing_devices--; - - new_device = btrfs_alloc_device(NULL, &device->devid, - device->uuid); - BUG_ON(IS_ERR(new_device)); /* -ENOMEM */ - - /* Safe because we are under uuid_mutex */ - if (device->name) { - name = rcu_string_strdup(device->name->str, GFP_NOFS); - BUG_ON(!name); /* -ENOMEM */ - rcu_assign_pointer(new_device->name, name); - } - - list_replace_rcu(&device->dev_list, &new_device->dev_list); - new_device->fs_devices = device->fs_devices; - - call_rcu(&device->rcu, free_device); + btrfs_close_one_device(device); } mutex_unlock(&fs_devices->device_list_mutex); @@ -6834,3 +6805,38 @@ void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info) fs_devices = fs_devices->seed; } } + +void btrfs_close_one_device(struct btrfs_device *device) +{ + struct btrfs_fs_devices *fs_devices = device->fs_devices; + struct btrfs_device *new_device; + struct rcu_string *name; + + if (device->bdev) + fs_devices->open_devices--; + + if (device->writeable && + device->devid != BTRFS_DEV_REPLACE_DEVID) { + list_del_init(&device->dev_alloc_list); + fs_devices->rw_devices--; + } + + if (device->missing) + fs_devices->missing_devices--; + + new_device = btrfs_alloc_device(NULL, &device->devid, + device->uuid); + BUG_ON(IS_ERR(new_device)); /* -ENOMEM */ + + /* Safe because we are under uuid_mutex */ + if (device->name) { + name = rcu_string_strdup(device->name->str, GFP_NOFS); + BUG_ON(!name); /* -ENOMEM */ + rcu_assign_pointer(new_device->name, name); + } + + list_replace_rcu(&device->dev_list, &new_device->dev_list); + new_device->fs_devices = device->fs_devices; + + call_rcu(&device->rcu, free_device); +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 8663bf7..38450dc 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -547,5 +547,6 @@ static inline void unlock_chunks(struct btrfs_root *root) struct list_head *btrfs_get_fs_uuids(void); void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info); void btrfs_reset_fs_info_ptr(struct btrfs_fs_info 
*fs_info); +void btrfs_close_one_device(struct btrfs_device *device); #endif -- cgit v1.1 From 73416dab235e5ff030e3b3c61da0cd6ced324fc9 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 14 Aug 2015 18:33:07 +0800 Subject: Btrfs: move kobj stuff out of dev_replace lock range To avoid the deadlock described in commit 084b6e7c7607 ("btrfs: Fix a lockdep warning when running xfstest."), we should move the kobj stuff out of the dev_replace lock range. "It is because the btrfs_kobj_{add/rm}_device() will call memory allocation with GFP_KERNEL, which may flush fs page cache to free space, waiting for itself to do the commit, causing the deadlock. To solve the problem, move btrfs_kobj_{add/rm}_device() out of the dev_replace lock range, also involving splitting the btrfs_rm_dev_replace_srcdev() function into remove and free parts. Now only btrfs_rm_dev_replace_remove_srcdev() is called in the dev_replace lock range, and kobj_{add/rm} and btrfs_rm_dev_replace_free_srcdev() are called out of the lock range." Signed-off-by: Liu Bo Signed-off-by: Anand Jain [added lockup description] Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 02031fc..0978e52 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -375,10 +375,6 @@ int btrfs_dev_replace_start(struct btrfs_root *root, WARN_ON(!tgt_device); dev_replace->tgtdev = tgt_device; - ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device); - if (ret) - btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret); - printk_in_rcu(KERN_INFO "BTRFS: dev_replace from %s (devid %llu) to %s started\n", src_device->missing ? "" : rcu_str_deref(src_device->name), src_device->devid, @@ -401,6 +397,10 @@ int btrfs_dev_replace_start(struct btrfs_root *root, args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; btrfs_dev_replace_unlock(dev_replace); + ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device); + if (ret) + btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret); + btrfs_wait_ordered_roots(root->fs_info, -1); /* force writing the updated state information to disk */ -- cgit v1.1 From 08a84e25a80566cf3cb1903c2448123efedff132 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 8 Oct 2015 08:48:52 +0200 Subject: btrfs: introduce _in_rcu variants of message printing functions Due to the missing variants there are messages that lack the information printed by the btrfs_info etc. helpers. Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 938efe3..88acdff 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -4039,14 +4039,43 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) #define btrfs_info(fs_info, fmt, args...) \ btrfs_printk(fs_info, KERN_INFO fmt, ##args) +/* + * Wrappers that use printk_in_rcu + */ +#define btrfs_emerg_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_EMERG fmt, ##args) +#define btrfs_alert_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_ALERT fmt, ##args) +#define btrfs_crit_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_CRIT fmt, ##args) +#define btrfs_err_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_ERR fmt, ##args) +#define btrfs_warn_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_WARNING fmt, ##args) +#define btrfs_notice_in_rcu(fs_info, fmt, args...) 
\ + btrfs_printk_in_rcu(fs_info, KERN_NOTICE fmt, ##args) +#define btrfs_info_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args) + #ifdef DEBUG #define btrfs_debug(fs_info, fmt, args...) \ btrfs_printk(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) #else #define btrfs_debug(fs_info, fmt, args...) \ no_printk(KERN_DEBUG fmt, ##args) +#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ + no_printk(KERN_DEBUG fmt, ##args) #endif +#define btrfs_printk_in_rcu(fs_info, fmt, args...) \ +do { \ + rcu_read_lock(); \ + btrfs_printk(fs_info, fmt, ##args); \ + rcu_read_unlock(); \ +} while (0) + #ifdef CONFIG_BTRFS_ASSERT __cold -- cgit v1.1 From ecaeb14b912a3be55530aeec4e81c7243f5ceb5d Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 8 Oct 2015 09:01:03 +0200 Subject: btrfs: switch message printers to _in_rcu variants Signed-off-by: David Sterba --- fs/btrfs/check-integrity.c | 4 ++-- fs/btrfs/dev-replace.c | 16 ++++++++-------- fs/btrfs/ioctl.c | 2 +- fs/btrfs/scrub.c | 14 +++++++------- fs/btrfs/volumes.c | 18 +++++++++--------- 5 files changed, 27 insertions(+), 27 deletions(-) diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 541fbfa..3e20630 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -845,8 +845,8 @@ static int btrfsic_process_superblock_dev_mirror( superblock_tmp->never_written = 0; superblock_tmp->mirror_num = 1 + superblock_mirror_num; if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) - printk_in_rcu(KERN_INFO "New initial S-block (bdev %p, %s)" - " @%llu (%s/%llu/%d)\n", + btrfs_info_in_rcu(device->dev_root->fs_info, + "new initial S-block (bdev %p, %s) @%llu (%s/%llu/%d)", superblock_bdev, rcu_str_deref(device->name), dev_bytenr, dev_state->name, dev_bytenr, diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index e54dd59..ede1c1a 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -379,8 +379,8 @@ int btrfs_dev_replace_start(struct btrfs_root *root, if (ret) btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret); - printk_in_rcu(KERN_INFO - "BTRFS: dev_replace from %s (devid %llu) to %s started\n", + btrfs_info_in_rcu(root->fs_info, + "dev_replace from %s (devid %llu) to %s started", src_device->missing ? "" : rcu_str_deref(src_device->name), src_device->devid, @@ -523,8 +523,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, src_device, tgt_device); } else { - printk_in_rcu(KERN_ERR - "BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n", + btrfs_err_in_rcu(root->fs_info, + "btrfs_scrub_dev(%s, %llu, %s) failed %d", src_device->missing ? "" : rcu_str_deref(src_device->name), src_device->devid, @@ -540,8 +540,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, return scrub_ret; } - printk_in_rcu(KERN_INFO - "BTRFS: dev_replace from %s (devid %llu) to %s finished\n", + btrfs_info_in_rcu(root->fs_info, + "dev_replace from %s (devid %llu) to %s finished", src_device->missing ? "" : rcu_str_deref(src_device->name), src_device->devid, @@ -809,8 +809,8 @@ static int btrfs_dev_replace_kthread(void *data) progress = status_args->status.progress_1000; kfree(status_args); progress = div_u64(progress, 10); - printk_in_rcu(KERN_INFO - "BTRFS: continuing dev_replace from %s (devid %llu) to %s @%u%%\n", + btrfs_info_in_rcu(fs_info, + "continuing dev_replace from %s (devid %llu) to %s @%u%%", dev_replace->srcdev->missing ? 
"" : rcu_str_deref(dev_replace->srcdev->name), dev_replace->srcdev->devid, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0adf542..8e9105a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1579,7 +1579,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, new_size = div_u64(new_size, root->sectorsize); new_size *= root->sectorsize; - printk_in_rcu(KERN_INFO "BTRFS: new size for %s is %llu\n", + btrfs_info_in_rcu(root->fs_info, "new size for %s is %llu", rcu_str_deref(device->name), new_size); if (new_size > old_size) { diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a39f5d1..26cfbb0 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -580,9 +580,9 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, * hold all of the paths here */ for (i = 0; i < ipath->fspath->elem_cnt; ++i) - printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev " + btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev " "%s, sector %llu, root %llu, inode %llu, offset %llu, " - "length %llu, links %u (path: %s)\n", swarn->errstr, + "length %llu, links %u (path: %s)", swarn->errstr, swarn->logical, rcu_str_deref(swarn->dev->name), (unsigned long long)swarn->sector, root, inum, offset, min(isize - offset, (u64)PAGE_SIZE), nlink, @@ -592,9 +592,9 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, return 0; err: - printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev " + btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev " "%s, sector %llu, root %llu, inode %llu, offset %llu: path " - "resolving failed with ret=%d\n", swarn->errstr, + "resolving failed with ret=%d", swarn->errstr, swarn->logical, rcu_str_deref(swarn->dev->name), (unsigned long long)swarn->sector, root, inum, offset, ret); @@ -649,10 +649,10 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) ret = tree_backref_for_extent(&ptr, eb, &found_key, ei, item_size, &ref_root, &ref_level); - printk_in_rcu(KERN_WARNING - "BTRFS: %s at logical %llu on dev %s, " + btrfs_warn_in_rcu(fs_info, + "%s at logical %llu on dev %s, " "sector %llu: metadata %s (level %d) in tree " - "%llu\n", errstr, swarn.logical, + "%llu", errstr, swarn.logical, rcu_str_deref(dev->name), (unsigned long long)swarn.sector, ref_level ? 
"node" : "leaf", diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 6fc73586..b0dae14 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6594,8 +6594,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, BUG_ON(!path); ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); if (ret < 0) { - printk_in_rcu(KERN_WARNING "BTRFS: " - "error %d while searching for dev_stats item for device %s!\n", + btrfs_warn_in_rcu(dev_root->fs_info, + "error %d while searching for dev_stats item for device %s", ret, rcu_str_deref(device->name)); goto out; } @@ -6605,8 +6605,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, /* need to delete old one and insert a new one */ ret = btrfs_del_item(trans, dev_root, path); if (ret != 0) { - printk_in_rcu(KERN_WARNING "BTRFS: " - "delete too small dev_stats item for device %s failed %d!\n", + btrfs_warn_in_rcu(dev_root->fs_info, + "delete too small dev_stats item for device %s failed %d", rcu_str_deref(device->name), ret); goto out; } @@ -6619,9 +6619,9 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, dev_root, path, &key, sizeof(*ptr)); if (ret < 0) { - printk_in_rcu(KERN_WARNING "BTRFS: " - "insert dev_stats item for device %s failed %d!\n", - rcu_str_deref(device->name), ret); + btrfs_warn_in_rcu(dev_root->fs_info, + "insert dev_stats item for device %s failed %d", + rcu_str_deref(device->name), ret); goto out; } } @@ -6695,8 +6695,8 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) if (i == BTRFS_DEV_STAT_VALUES_MAX) return; /* all values == 0, suppress message */ - printk_in_rcu(KERN_INFO "BTRFS: " - "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", + btrfs_info_in_rcu(dev->dev_root->fs_info, + "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", rcu_str_deref(dev->name), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), -- cgit v1.1 From 24aa6b41d4611b14ad0fb89fa0b899ae9ab7005c Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 8 Oct 2015 10:27:02 +0200 Subject: btrfs: introduce ratelimited _in_rcu variants of message printing functions Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 88acdff..9c0fd90 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -4057,16 +4057,38 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) #define btrfs_info_in_rcu(fs_info, fmt, args...) \ btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args) +/* + * Wrappers that use a ratelimited printk_in_rcu + */ +#define btrfs_emerg_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_EMERG fmt, ##args) +#define btrfs_alert_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_ALERT fmt, ##args) +#define btrfs_crit_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args) +#define btrfs_err_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args) +#define btrfs_warn_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args) +#define btrfs_notice_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_NOTICE fmt, ##args) +#define btrfs_info_rl_in_rcu(fs_info, fmt, args...) 
\ + btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args) + #ifdef DEBUG #define btrfs_debug(fs_info, fmt, args...) \ btrfs_printk(fs_info, KERN_DEBUG fmt, ##args) #define btrfs_debug_in_rcu(fs_info, fmt, args...) \ btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args) #else #define btrfs_debug(fs_info, fmt, args...) \ no_printk(KERN_DEBUG fmt, ##args) #define btrfs_debug_in_rcu(fs_info, fmt, args...) \ no_printk(KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ + no_printk(KERN_DEBUG fmt, ##args) #endif #define btrfs_printk_in_rcu(fs_info, fmt, args...) \ @@ -4076,6 +4098,22 @@ do { \ rcu_read_unlock(); \ } while (0) +#define btrfs_printk_ratelimited(fs_info, fmt, args...) \ +do { \ + static DEFINE_RATELIMIT_STATE(_rs, \ + DEFAULT_RATELIMIT_INTERVAL, \ + DEFAULT_RATELIMIT_BURST); \ + if (__ratelimit(&_rs)) \ + btrfs_printk(fs_info, fmt, ##args); \ +} while (0) + +#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \ +do { \ + rcu_read_lock(); \ + btrfs_printk_ratelimited(fs_info, fmt, ##args); \ + rcu_read_unlock(); \ +} while (0) + #ifdef CONFIG_BTRFS_ASSERT __cold -- cgit v1.1 From b14af3b46f214c5e927dacf58a994fa3c4cb20dc Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 8 Oct 2015 10:43:10 +0200 Subject: btrfs: switch message printers to ratelimited _in_rcu variants Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 4 ++-- fs/btrfs/extent_io.c | 4 ++-- fs/btrfs/scrub.c | 20 ++++++++++---------- fs/btrfs/volumes.c | 4 ++-- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 295795a..ba41faf 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3175,8 +3175,8 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) struct btrfs_device *device = (struct btrfs_device *) bh->b_private; - printk_ratelimited_in_rcu(KERN_WARNING "BTRFS: lost page write due to " - "I/O error on %s\n", + btrfs_warn_rl_in_rcu(device->dev_root->fs_info, + "lost page write due to IO error on %s", rcu_str_deref(device->name)); /* note, we dont' set_buffer_write_io_error because we have * our own ways of dealing with the IO errors diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index e2357e3..58b9ccd 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2078,8 +2078,8 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, return -EIO; } - printk_ratelimited_in_rcu(KERN_INFO - "BTRFS: read error corrected: ino %llu off %llu (dev %s sector %llu)\n", + btrfs_info_rl_in_rcu(fs_info, + "read error corrected: ino %llu off %llu (dev %s sector %llu)", btrfs_ino(inode), start, rcu_str_deref(dev->name), sector); bio_put(bio); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 26cfbb0..bce86f8 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -850,8 +850,8 @@ out: btrfs_dev_replace_stats_inc( &sctx->dev_root->fs_info->dev_replace. 
num_uncorrectable_read_errors); - printk_ratelimited_in_rcu(KERN_ERR "BTRFS: " - "unable to fixup (nodatasum) error at logical %llu on dev %s\n", + btrfs_err_rl_in_rcu(sctx->dev_root->fs_info, + "unable to fixup (nodatasum) error at logical %llu on dev %s", fixup->logical, rcu_str_deref(fixup->dev->name)); } @@ -1230,8 +1230,8 @@ corrected_error: sctx->stat.corrected_errors++; sblock_to_check->data_corrected = 1; spin_unlock(&sctx->stat_lock); - printk_ratelimited_in_rcu(KERN_ERR - "BTRFS: fixed up error at logical %llu on dev %s\n", + btrfs_err_rl_in_rcu(fs_info, + "fixed up error at logical %llu on dev %s", logical, rcu_str_deref(dev->name)); } } else { @@ -1239,8 +1239,8 @@ did_not_correct_error: spin_lock(&sctx->stat_lock); sctx->stat.uncorrectable_errors++; spin_unlock(&sctx->stat_lock); - printk_ratelimited_in_rcu(KERN_ERR - "BTRFS: unable to fixup (regular) error at logical %llu on dev %s\n", + btrfs_err_rl_in_rcu(fs_info, + "unable to fixup (regular) error at logical %llu on dev %s", logical, rcu_str_deref(dev->name)); } @@ -2201,15 +2201,15 @@ static void scrub_missing_raid56_worker(struct btrfs_work *work) spin_lock(&sctx->stat_lock); sctx->stat.read_errors++; spin_unlock(&sctx->stat_lock); - printk_ratelimited_in_rcu(KERN_ERR - "BTRFS: I/O error rebulding logical %llu for dev %s\n", + btrfs_err_rl_in_rcu(fs_info, + "IO error rebuilding logical %llu for dev %s", logical, rcu_str_deref(dev->name)); } else if (sblock->header_error || sblock->checksum_error) { spin_lock(&sctx->stat_lock); sctx->stat.uncorrectable_errors++; spin_unlock(&sctx->stat_lock); - printk_ratelimited_in_rcu(KERN_ERR - "BTRFS: failed to rebuild valid logical %llu for dev %s\n", + btrfs_err_rl_in_rcu(fs_info, + "failed to rebuild valid logical %llu for dev %s", logical, rcu_str_deref(dev->name)); } else { scrub_write_block_to_dev_replace(sblock); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b0dae14..4f80986 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6675,8 +6675,8 @@ static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) { if (!dev->dev_stats_valid) return; - printk_ratelimited_in_rcu(KERN_ERR "BTRFS: " - "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", + btrfs_err_rl_in_rcu(dev->dev_root->fs_info, + "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", rcu_str_deref(dev->name), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), -- cgit v1.1 From 1dd6d7ca9d062721ae37388900a322a4b959e939 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 8 Oct 2015 10:51:11 +0200 Subject: btrfs: introduce ratelimited variants of message printing functions Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9c0fd90..bca42c5 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -4075,6 +4075,23 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) #define btrfs_info_rl_in_rcu(fs_info, fmt, args...) \ btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args) +/* + * Wrappers that use a ratelimited printk + */ +#define btrfs_emerg_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_EMERG fmt, ##args) +#define btrfs_alert_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_ALERT fmt, ##args) +#define btrfs_crit_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_CRIT fmt, ##args) +#define btrfs_err_rl(fs_info, fmt, args...) 
\ + btrfs_printk_ratelimited(fs_info, KERN_ERR fmt, ##args) +#define btrfs_warn_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_WARNING fmt, ##args) +#define btrfs_notice_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_NOTICE fmt, ##args) +#define btrfs_info_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_INFO fmt, ##args) #ifdef DEBUG #define btrfs_debug(fs_info, fmt, args...) \ btrfs_printk(fs_info, KERN_DEBUG fmt, ##args) @@ -4082,6 +4099,8 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) #define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, ##args) #else #define btrfs_debug(fs_info, fmt, args...) \ no_printk(KERN_DEBUG fmt, ##args) @@ -4089,6 +4108,8 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) no_printk(KERN_DEBUG fmt, ##args) #define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ no_printk(KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl(fs_info, fmt, args...) \ + no_printk(KERN_DEBUG fmt, ##args) #endif #define btrfs_printk_in_rcu(fs_info, fmt, args...) \ -- cgit v1.1 From 9464732266862f6044e4708dca6c4b2a83dd937b Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 8 Oct 2015 11:01:36 +0200 Subject: btrfs: switch message printers to ratelimited variants Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 21 ++++++++++----------- fs/btrfs/extent-tree.c | 14 ++++++++------ fs/btrfs/extent_io.c | 4 ++-- fs/btrfs/free-space-cache.c | 10 +++++----- fs/btrfs/inode.c | 7 ++----- fs/btrfs/scrub.c | 8 ++++---- 6 files changed, 31 insertions(+), 33 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ba41faf..72553cd 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -319,9 +319,9 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info, memcpy(&found, result, csum_size); read_extent_buffer(buf, &val, 0, csum_size); - printk_ratelimited(KERN_WARNING - "BTRFS: %s checksum verify failed on %llu wanted %X found %X " - "level %d\n", + btrfs_warn_rl(fs_info, + "%s checksum verify failed on %llu wanted %X found %X " + "level %d", fs_info->sb->s_id, buf->start, val, found, btrfs_header_level(buf)); if (result != (char *)&inline_result) @@ -368,9 +368,9 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, ret = 0; goto out; } - printk_ratelimited(KERN_ERR - "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n", - eb->fs_info->sb->s_id, eb->start, + btrfs_err_rl(eb->fs_info, + "parent transid verify failed on %llu wanted %llu found %llu", + eb->start, parent_transid, btrfs_header_generation(eb)); ret = 1; @@ -629,15 +629,14 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, found_start = btrfs_header_bytenr(eb); if (found_start != eb->start) { - printk_ratelimited(KERN_ERR "BTRFS (device %s): bad tree block start " - "%llu %llu\n", - eb->fs_info->sb->s_id, found_start, eb->start); + btrfs_err_rl(eb->fs_info, "bad tree block start %llu %llu", + found_start, eb->start); ret = -EIO; goto err; } if (check_tree_block_fsid(root->fs_info, eb)) { - printk_ratelimited(KERN_ERR "BTRFS (device %s): bad fsid on block %llu\n", - eb->fs_info->sb->s_id, eb->start); + btrfs_err_rl(eb->fs_info, "bad fsid on block %llu", + eb->start); ret = -EIO; goto err; } diff --git 
a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9f96042..913a583 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -8271,10 +8271,11 @@ skip: ret = account_shared_subtree(trans, root, next, generation, level - 1); if (ret) { - printk_ratelimited(KERN_ERR "BTRFS: %s Error " + btrfs_err_rl(root->fs_info, + "Error " "%d accounting shared subtree. Quota " - "is out of sync, rescan required.\n", - root->fs_info->sb->s_id, ret); + "is out of sync, rescan required.", + ret); } } ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, @@ -8363,10 +8364,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, BUG_ON(ret); /* -ENOMEM */ ret = account_leaf_items(trans, root, eb); if (ret) { - printk_ratelimited(KERN_ERR "BTRFS: %s Error " + btrfs_err_rl(root->fs_info, + "error " "%d accounting leaf items. Quota " - "is out of sync, rescan required.\n", - root->fs_info->sb->s_id, ret); + "is out of sync, rescan required.", + ret); } } /* make block locked assertion in clean_tree_block happy */ diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 58b9ccd..bb318b4 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -96,8 +96,8 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller, inode = tree->mapping->host; isize = i_size_read(inode); if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { - printk_ratelimited(KERN_DEBUG - "BTRFS: %s: ino %llu isize %llu odd range [%llu,%llu]\n", + btrfs_debug_rl(BTRFS_I(inode)->root->fs_info, + "%s: ino %llu isize %llu odd range [%llu,%llu]", caller, btrfs_ino(inode), isize, start, end); } } diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index abe3a66..afc1072 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -450,9 +450,9 @@ static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation) gen = io_ctl->cur; if (le64_to_cpu(*gen) != generation) { - printk_ratelimited(KERN_ERR "BTRFS: space cache generation " - "(%Lu) does not match inode (%Lu)\n", *gen, - generation); + btrfs_err_rl(io_ctl->root->fs_info, + "space cache generation (%llu) does not match inode (%llu)", + *gen, generation); io_ctl_unmap_page(io_ctl); return -EIO; } @@ -506,8 +506,8 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index) PAGE_CACHE_SIZE - offset); btrfs_csum_final(crc, (char *)&crc); if (val != crc) { - printk_ratelimited(KERN_ERR "BTRFS: csum mismatch on free " - "space cache\n"); + btrfs_err_rl(io_ctl->root->fs_info, + "csum mismatch on free space cache"); io_ctl_unmap_page(io_ctl); return -EIO; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 611b66d..25eb814 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3018,8 +3018,6 @@ static int __readpage_endio_check(struct inode *inode, char *kaddr; u32 csum_expected; u32 csum = ~(u32)0; - static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); csum_expected = *(((u32 *)io_bio->csum) + icsum); @@ -3032,9 +3030,8 @@ static int __readpage_endio_check(struct inode *inode, kunmap_atomic(kaddr); return 0; zeroit: - if (__ratelimit(&_rs)) - btrfs_warn(BTRFS_I(inode)->root->fs_info, - "csum failed ino %llu off %llu csum %u expected csum %u", + btrfs_warn_rl(BTRFS_I(inode)->root->fs_info, + "csum failed ino %llu off %llu csum %u expected csum %u", btrfs_ino(inode), start, csum, csum_expected); memset(kaddr + pgoff, 1, len); flush_dcache_page(page); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index bce86f8..550de89 
100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1626,9 +1626,9 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, int ret; if (!page_bad->dev->bdev) { - printk_ratelimited(KERN_WARNING "BTRFS: " + btrfs_warn_rl(sblock_bad->sctx->dev_root->fs_info, "scrub_repair_page_from_good_copy(bdev == NULL) " - "is unexpected!\n"); + "is unexpected"); return -EIO; } @@ -4375,8 +4375,8 @@ static int write_page_nocow(struct scrub_ctx *sctx, if (!dev) return -EIO; if (!dev->bdev) { - printk_ratelimited(KERN_WARNING - "BTRFS: scrub write_page_nocow(bdev == NULL) is unexpected!\n"); + btrfs_warn_rl(dev->dev_root->fs_info, + "scrub write_page_nocow(bdev == NULL) is unexpected"); return -EIO; } bio = btrfs_io_bio_alloc(GFP_NOFS, 1); -- cgit v1.1 From f14d104dbdb5044dac9acd0e983ffb60f706c746 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 8 Oct 2015 11:37:06 +0200 Subject: btrfs: switch more printks to our helpers Convert the simple cases, not all functions provide a way to reach the fs_info. Also skipped debugging messages (print-tree, integrity checker and pr_debug) and messages that are printed from possibly unfinished mount. Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 12 ++++++------ fs/btrfs/extent_io.c | 18 ++++++++++-------- fs/btrfs/ioctl.c | 6 +++--- fs/btrfs/root-tree.c | 5 +++-- fs/btrfs/send.c | 2 +- fs/btrfs/volumes.c | 3 ++- 6 files changed, 25 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 72553cd..5bc94d2 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2347,8 +2347,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, u64 bytenr = btrfs_super_log_root(disk_super); if (fs_devices->rw_devices == 0) { - printk(KERN_WARNING "BTRFS: log replay required " - "on RO media\n"); + btrfs_warn(fs_info, "log replay required on RO media"); return -EIO; } @@ -2363,12 +2362,12 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, log_tree_root->node = read_tree_block(tree_root, bytenr, fs_info->generation + 1); if (IS_ERR(log_tree_root->node)) { - printk(KERN_ERR "BTRFS: failed to read log tree\n"); + btrfs_warn(fs_info, "failed to read log tree"); ret = PTR_ERR(log_tree_root->node); kfree(log_tree_root); return ret; } else if (!extent_buffer_uptodate(log_tree_root->node)) { - printk(KERN_ERR "BTRFS: failed to read log tree\n"); + btrfs_err(fs_info, "failed to read log tree"); free_extent_buffer(log_tree_root->node); kfree(log_tree_root); return -EIO; @@ -3294,8 +3293,9 @@ static int write_dev_supers(struct btrfs_device *device, bh = __getblk(device->bdev, bytenr / 4096, BTRFS_SUPER_INFO_SIZE); if (!bh) { - printk(KERN_ERR "BTRFS: couldn't get super " - "buffer head for bytenr %Lu\n", bytenr); + btrfs_err(device->dev_root->fs_info, + "couldn't get super buffer head for bytenr %llu", + bytenr); errors++; continue; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index bb318b4..c7a3391 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5563,13 +5563,15 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_i; if (src_offset + len > dst->len) { - printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move " - "len %lu dst len %lu\n", src_offset, len, dst->len); + btrfs_err(dst->fs_info, + "memmove bogus src_offset %lu move " + "len %lu dst len %lu", src_offset, len, dst->len); BUG_ON(1); } if (dst_offset + len > dst->len) { - printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move " - "len %lu dst len %lu\n", dst_offset, 
len, dst->len); + btrfs_err(dst->fs_info, + "memmove bogus dst_offset %lu move " + "len %lu dst len %lu", dst_offset, len, dst->len); BUG_ON(1); } @@ -5609,13 +5611,13 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_i; if (src_offset + len > dst->len) { - printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move " - "len %lu len %lu\n", src_offset, len, dst->len); + btrfs_err(dst->fs_info, "memmove bogus src_offset %lu move " + "len %lu len %lu", src_offset, len, dst->len); BUG_ON(1); } if (dst_offset + len > dst->len) { - printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move " - "len %lu len %lu\n", dst_offset, len, dst->len); + btrfs_err(dst->fs_info, "memmove bogus dst_offset %lu move " + "len %lu len %lu", dst_offset, len, dst->len); BUG_ON(1); } if (dst_offset < src_offset) { diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 8e9105a..2e520c6 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1342,7 +1342,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, break; if (btrfs_defrag_cancelled(root->fs_info)) { - printk(KERN_DEBUG "BTRFS: defrag_file cancelled\n"); + btrfs_debug(root->fs_info, "defrag_file cancelled"); ret = -EAGAIN; break; } @@ -2081,7 +2081,7 @@ static noinline int search_ioctl(struct inode *inode, key.offset = (u64)-1; root = btrfs_read_fs_root_no_name(info, &key); if (IS_ERR(root)) { - printk(KERN_ERR "BTRFS: could not find root %llu\n", + btrfs_err(info, "could not find root %llu", sk->tree_id); btrfs_free_path(path); return -ENOENT; @@ -2221,7 +2221,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, key.offset = (u64)-1; root = btrfs_read_fs_root_no_name(info, &key); if (IS_ERR(root)) { - printk(KERN_ERR "BTRFS: could not find root %llu\n", tree_id); + btrfs_err(info, "could not find root %llu", tree_id); ret = -ENOENT; goto out; } diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 360a728..42267d3 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -45,12 +45,13 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot, if (!need_reset && btrfs_root_generation(item) != btrfs_root_generation_v2(item)) { if (btrfs_root_generation_v2(item) != 0) { - printk(KERN_WARNING "BTRFS: mismatching " + btrfs_warn(eb->fs_info, + "mismatching " "generation and generation_v2 " "found in root item. This root " "was probably mounted with an " "older kernel. 
Resetting all " - "new fields.\n"); + "new fields."); } need_reset = 1; } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index aa72bfd..4ea07c8 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -2562,7 +2562,7 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino); } else if (S_ISSOCK(mode)) { cmd = BTRFS_SEND_C_MKSOCK; } else { - printk(KERN_WARNING "btrfs: unexpected inode type %o", + btrfs_warn(sctx->send_root->fs_info, "unexpected inode type %o", (int)(mode & S_IFMT)); ret = -ENOTSUPP; goto out; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 4f80986..7ad310c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2352,7 +2352,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) root->fs_info->fsid); if (kobject_rename(&root->fs_info->fs_devices->super_kobj, fsid_buf)) - pr_warn("BTRFS: sysfs: failed to create fsid for sprout\n"); + btrfs_warn(root->fs_info, + "sysfs: failed to create fsid for sprout"); } root->fs_info->num_tolerated_disk_barrier_failures = -- cgit v1.1 From 730d9ec36bf27e87805aee6ebf462071735e31f9 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Sat, 10 Oct 2015 18:16:38 +0200 Subject: btrfs: remove waitqueue_active check from btrfs_rm_dev_replace_unblocked Normally the waitqueue_active would need a barrier, but this is not necessary here because it's not a performance sensitive context and we can call wake_up directly. Suggested-by: Chris Mason Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index e54dd59..733ff75 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -454,8 +454,7 @@ static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info) static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info) { clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state); - if (waitqueue_active(&fs_info->replace_wait)) - wake_up(&fs_info->replace_wait); + wake_up(&fs_info->replace_wait); } static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, -- cgit v1.1 From b666a9cd993c6a49c4bf23a1ed476359c978f60b Mon Sep 17 00:00:00 2001 From: David Sterba Date: Sat, 10 Oct 2015 18:24:48 +0200 Subject: btrfs: add barrier for waitqueue_active in clear_btree_io_tree waitqueue_active should be preceded by a barrier, in this function we don't need to call it all the time. Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 74bc333..e8e5b5a 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -82,6 +82,12 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) static void clear_btree_io_tree(struct extent_io_tree *tree) { spin_lock(&tree->lock); + /* + * Do a single barrier for the waitqueue_active check here, the state + * of the waitqueue should not change once clear_btree_io_tree is + * called. 
+ */ + smp_mb(); while (!RB_EMPTY_ROOT(&tree->state)) { struct rb_node *node; struct extent_state *state; -- cgit v1.1 From 33a9eca7e4a4c2c17aebbb11f0e506a48ebc30c9 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Sat, 10 Oct 2015 18:35:10 +0200 Subject: btrfs: comment waitqueue_active implied by locks Suggested-by: Chris Mason Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 6 +++++- fs/btrfs/tree-log.c | 6 ++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index fcf7265..1a33d3e 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -810,7 +810,11 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) } goto done_nolock; - } else if (waitqueue_active(&h->wait)) { + /* + * The barrier for this waitqueue_active is not needed, + * we're protected by h->lock and can't miss a wakeup. + */ + } else if (waitqueue_active(&h->wait)) { spin_unlock(&rbio->bio_list_lock); spin_unlock_irqrestore(&h->lock, flags); wake_up(&h->wait); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 1bbaace..d0deb46 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2950,6 +2950,9 @@ out_wake_log_root: atomic_set(&log_root_tree->log_commit[index2], 0); mutex_unlock(&log_root_tree->log_mutex); + /* + * The barrier before waitqueue_active is implied by mutex_unlock + */ if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) wake_up(&log_root_tree->log_commit_wait[index2]); out: @@ -2961,6 +2964,9 @@ out: atomic_set(&root->log_commit[index1], 0); mutex_unlock(&root->log_mutex); + /* + * The barrier before waitqueue_active is implied by mutex_unlock + */ if (waitqueue_active(&root->log_commit_wait[index1])) wake_up(&root->log_commit_wait[index1]); return ret; -- cgit v1.1 From a83342aa0c8f0ca90057d3837ae8d198186e5153 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 16 Feb 2015 19:36:47 +0100 Subject: btrfs: add comments to barriers before waitqueue_active Reduce number of undocumented barriers out there. Signed-off-by: David Sterba --- fs/btrfs/compression.c | 3 +++ fs/btrfs/extent-tree.c | 3 +-- fs/btrfs/locking.c | 3 +++ fs/btrfs/ordered-data.c | 6 ++++++ fs/btrfs/transaction.c | 3 +++ 5 files changed, 16 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 57ee8ca..3a9317c 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -839,6 +839,9 @@ static void free_workspace(int type, struct list_head *workspace) btrfs_compress_op[idx]->free_workspace(workspace); atomic_dec(alloc_workspace); wake: + /* + * Make sure counter is updated before we wake up waiters. + */ smp_mb(); if (waitqueue_active(workspace_wait)) wake_up(workspace_wait); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9f96042..59eb92f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -10363,8 +10363,7 @@ void btrfs_end_write_no_snapshoting(struct btrfs_root *root) { percpu_counter_dec(&root->subv_writers->counter); /* - * Make sure counter is updated before we wake up - * waiters. + * Make sure counter is updated before we wake up waiters. */ smp_mb(); if (waitqueue_active(&root->subv_writers->wait)) diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index d7e6baf..03f8630 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -280,6 +280,9 @@ void btrfs_tree_unlock(struct extent_buffer *eb) if (blockers) { WARN_ON(atomic_read(&eb->spinning_writers)); atomic_dec(&eb->blocking_writers); + /* + * Make sure counter is updated before we wake up waiters. 
+ */ smp_mb(); if (waitqueue_active(&eb->write_lock_wq)) wake_up(&eb->write_lock_wq); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 52170cf..071005f 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -345,6 +345,9 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode, if (entry->bytes_left == 0) { ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); + /* + * Implicit memory barrier after test_and_set_bit + */ if (waitqueue_active(&entry->wait)) wake_up(&entry->wait); } else { @@ -409,6 +412,9 @@ have_entry: if (entry->bytes_left == 0) { ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); + /* + * Implicit memory barrier after test_and_set_bit + */ if (waitqueue_active(&entry->wait)) wake_up(&entry->wait); } else { diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index e8e5b5a..3fd70f7 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -861,6 +861,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, atomic_dec(&cur_trans->num_writers); extwriter_counter_dec(cur_trans, trans->type); + /* + * Make sure counter is updated before we wake up waiters. + */ smp_mb(); if (waitqueue_active(&cur_trans->writer_wait)) wake_up(&cur_trans->writer_wait); -- cgit v1.1 From 779adf0f647651f5a45eeee3442c881300ce989e Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 16 Feb 2015 19:39:00 +0100 Subject: btrfs: remove extra barrier before waitqueue_active Removing barriers is scary, but a call to atomic_dec_and_test implies a barrier, so we don't need to issue another one. Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index d0deb46..6327559 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -229,7 +229,9 @@ int btrfs_pin_log_trans(struct btrfs_root *root) void btrfs_end_log_trans(struct btrfs_root *root) { if (atomic_dec_and_test(&root->log_writers)) { - smp_mb(); + /* + * Implicit memory barrier after atomic_dec_and_test + */ if (waitqueue_active(&root->log_writer_wait)) wake_up(&root->log_writer_wait); } @@ -2820,7 +2822,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_lock(&log_root_tree->log_mutex); if (atomic_dec_and_test(&log_root_tree->log_writers)) { - smp_mb(); + /* + * Implicit memory barrier after atomic_dec_and_test + */ if (waitqueue_active(&log_root_tree->log_writer_wait)) wake_up(&log_root_tree->log_writer_wait); } -- cgit v1.1 From ee86395458072760d62e66aad10a5e9e8902b8cf Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 16 Feb 2015 19:41:40 +0100 Subject: btrfs: comment the rest of implicit barriers before waitqueue_active There are atomic operations that imply the barrier for waitqueue_active mixed in an if-condition. 
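For context, the wake-up pattern that these comments annotate looks roughly like this (an illustrative sketch, not a verbatim copy of any one call site; the counter and waitqueue names are made up):

	/*
	 * A value-returning atomic op such as atomic_dec_and_test()
	 * implies a full memory barrier, so no explicit smp_mb() is
	 * needed between the counter update and the lockless
	 * waitqueue_active() check.
	 */
	if (atomic_dec_and_test(&counter) && waitqueue_active(&wq))
		wake_up(&wq);

The sleeper side gets the matching barrier from prepare_to_wait()/set_current_state(), which is what makes the lockless waitqueue_active() check safe to pair with it.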
Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 4 ++++ fs/btrfs/disk-io.c | 3 +++ fs/btrfs/inode.c | 3 +++ fs/btrfs/locking.c | 9 +++++++++ fs/btrfs/volumes.c | 3 +++ 5 files changed, 22 insertions(+) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index a2ae427..e0941fb 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -463,6 +463,10 @@ static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node, static void finish_one_item(struct btrfs_delayed_root *delayed_root) { int seq = atomic_inc_return(&delayed_root->items_seq); + + /* + * atomic_dec_return implies a barrier for waitqueue_active + */ if ((atomic_dec_return(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0) && waitqueue_active(&delayed_root->wait)) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 295795a..379526f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -802,6 +802,9 @@ static void run_one_async_done(struct btrfs_work *work) limit = btrfs_async_submit_limit(fs_info); limit = limit * 2 / 3; + /* + * atomic_dec_return implies a barrier for waitqueue_active + */ if (atomic_dec_return(&fs_info->nr_async_submits) < limit && waitqueue_active(&fs_info->async_submit_wait)) wake_up(&fs_info->async_submit_wait); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 611b66d..7be4abe 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1096,6 +1096,9 @@ static noinline void async_cow_submit(struct btrfs_work *work) nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >> PAGE_CACHE_SHIFT; + /* + * atomic_sub_return implies a barrier for waitqueue_active + */ if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) < 5 * 1024 * 1024 && waitqueue_active(&root->fs_info->async_submit_wait)) diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 03f8630..8077461 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -79,6 +79,9 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) write_lock(&eb->lock); WARN_ON(atomic_read(&eb->spinning_writers)); atomic_inc(&eb->spinning_writers); + /* + * atomic_dec_and_test implies a barrier for waitqueue_active + */ if (atomic_dec_and_test(&eb->blocking_writers) && waitqueue_active(&eb->write_lock_wq)) wake_up(&eb->write_lock_wq); @@ -86,6 +89,9 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) BUG_ON(atomic_read(&eb->blocking_readers) == 0); read_lock(&eb->lock); atomic_inc(&eb->spinning_readers); + /* + * atomic_dec_and_test implies a barrier for waitqueue_active + */ if (atomic_dec_and_test(&eb->blocking_readers) && waitqueue_active(&eb->read_lock_wq)) wake_up(&eb->read_lock_wq); @@ -229,6 +235,9 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) } btrfs_assert_tree_read_locked(eb); WARN_ON(atomic_read(&eb->blocking_readers) == 0); + /* + * atomic_dec_and_test implies a barrier for waitqueue_active + */ if (atomic_dec_and_test(&eb->blocking_readers) && waitqueue_active(&eb->read_lock_wq)) wake_up(&eb->read_lock_wq); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 6fc73586..ff35271 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -345,6 +345,9 @@ loop_lock: pending = pending->bi_next; cur->bi_next = NULL; + /* + * atomic_dec_return implies a barrier for waitqueue_active + */ if (atomic_dec_return(&fs_info->nr_async_bios) < limit && waitqueue_active(&fs_info->async_submit_wait)) wake_up(&fs_info->async_submit_wait); -- cgit v1.1 From 
d906d49fc5f49a0129527e8fbc13312f36b9b9ce Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 2 Oct 2015 10:47:34 +0100 Subject: Btrfs: send, fix file corruption due to incorrect cloning operations If we have a file that shares an extent with other files, when processing the extent item relative to a shared extent, we blindly issue a clone operation that will target a length matching the length in the extent item and uses as a source some other file the receiver already has and points to the same extent. However that range in the other file might not exclusively point only to the shared extent, and so using that length will result in the receiver getting a file with different data from the one in the send snapshot. This issue happened both for incremental and full send operations. So fix this by issuing clone operations with lengths that don't cover regions of the source file that point to different extents (or have holes). The following test case for fstests reproduces the problem. seq=`basename $0` seqres=$RESULT_DIR/$seq echo "QA output created by $seq" tmp=/tmp/$$ status=1 # failure is the default! trap "_cleanup; exit \$status" 0 1 2 3 15 _cleanup() { rm -fr $send_files_dir rm -f $tmp.* } # get standard environment, filters and checks . ./common/rc . ./common/filter # real QA test starts here _supported_fs btrfs _supported_os Linux _require_scratch _need_to_be_root _require_cp_reflink _require_xfs_io_command "fpunch" send_files_dir=$TEST_DIR/btrfs-test-$seq rm -f $seqres.full rm -fr $send_files_dir mkdir $send_files_dir _scratch_mkfs >>$seqres.full 2>&1 _scratch_mount # Create our test file with a single 100K extent. $XFS_IO_PROG -f -c "pwrite -S 0xaa 0K 100K" \ $SCRATCH_MNT/foo | _filter_xfs_io # Clone our file into a new file named bar. cp --reflink=always $SCRATCH_MNT/foo $SCRATCH_MNT/bar # Now overwrite parts of our foo file. $XFS_IO_PROG -c "pwrite -S 0xbb 50K 10K" \ -c "pwrite -S 0xcc 90K 10K" \ -c "fpunch 70K 10k" \ $SCRATCH_MNT/foo | _filter_xfs_io _run_btrfs_util_prog subvolume snapshot -r $SCRATCH_MNT \ $SCRATCH_MNT/snap echo "File digests in the original filesystem:" md5sum $SCRATCH_MNT/snap/foo | _filter_scratch md5sum $SCRATCH_MNT/snap/bar | _filter_scratch _run_btrfs_util_prog send $SCRATCH_MNT/snap -f $send_files_dir/1.snap # Now recreate the filesystem by receiving the send stream and verify # we get the same file contents that the original filesystem had. _scratch_unmount _scratch_mkfs >>$seqres.full 2>&1 _scratch_mount _run_btrfs_util_prog receive $SCRATCH_MNT -f $send_files_dir/1.snap # We expect the destination filesystem to have exactly the same file # data as the original filesystem. # The btrfs send implementation had a bug where it sent a clone # operation from file foo into file bar covering the whole [0, 100K[ # range after creating and writing the file foo. This was incorrect # because the file bar now included the updates done to file foo after # we cloned foo to bar, breaking the COW nature of reflink copies # (cloned extents). echo "File digests in the new filesystem:" md5sum $SCRATCH_MNT/snap/foo | _filter_scratch md5sum $SCRATCH_MNT/snap/bar | _filter_scratch status=0 exit Another test case that reproduces the problem when we have compressed extents: seq=`basename $0` seqres=$RESULT_DIR/$seq echo "QA output created by $seq" tmp=/tmp/$$ status=1 # failure is the default! trap "_cleanup; exit \$status" 0 1 2 3 15 _cleanup() { rm -fr $send_files_dir rm -f $tmp.* } # get standard environment, filters and checks . ./common/rc . 
./common/filter # real QA test starts here _supported_fs btrfs _supported_os Linux _require_scratch _need_to_be_root _require_cp_reflink send_files_dir=$TEST_DIR/btrfs-test-$seq rm -f $seqres.full rm -fr $send_files_dir mkdir $send_files_dir _scratch_mkfs >>$seqres.full 2>&1 _scratch_mount "-o compress" # Create our file with an extent of 100K starting at file offset 0K. $XFS_IO_PROG -f -c "pwrite -S 0xaa 0K 100K" \ -c "fsync" \ $SCRATCH_MNT/foo | _filter_xfs_io # Rewrite part of the previous extent (its first 40K) and write a new # 100K extent starting at file offset 100K. $XFS_IO_PROG -c "pwrite -S 0xbb 0K 40K" \ -c "pwrite -S 0xcc 100K 100K" \ $SCRATCH_MNT/foo | _filter_xfs_io # Our file foo now has 3 file extent items in its metadata: # # 1) One covering the file range 0 to 40K; # 2) One covering the file range 40K to 100K, which points to the first # extent we wrote to the file and has a data offset field with value # 40K (our file no longer uses the first 40K of data from that # extent); # 3) One covering the file range 100K to 200K. # Now clone our file foo into file bar. cp --reflink=always $SCRATCH_MNT/foo $SCRATCH_MNT/bar # Create our snapshot for the send operation. _run_btrfs_util_prog subvolume snapshot -r $SCRATCH_MNT \ $SCRATCH_MNT/snap echo "File digests in the original filesystem:" md5sum $SCRATCH_MNT/snap/foo | _filter_scratch md5sum $SCRATCH_MNT/snap/bar | _filter_scratch _run_btrfs_util_prog send $SCRATCH_MNT/snap -f $send_files_dir/1.snap # Now recreate the filesystem by receiving the send stream and verify we # get the same file contents that the original filesystem had. # Btrfs send used to issue a clone operation from foo's range # [80K, 140K[ to bar's range [40K, 100K[ when cloning the extent pointed # to by foo's second file extent item, this was incorrect because of bad # accounting of the file extent item's data offset field. The correct # range to clone from should have been [40K, 100K[. _scratch_unmount _scratch_mkfs >>$seqres.full 2>&1 _scratch_mount "-o compress" _run_btrfs_util_prog receive $SCRATCH_MNT -f $send_files_dir/1.snap echo "File digests in the new filesystem:" # Must match the digests we got in the original filesystem. md5sum $SCRATCH_MNT/snap/foo | _filter_scratch md5sum $SCRATCH_MNT/snap/bar | _filter_scratch status=0 exit Signed-off-by: Filipe Manana --- fs/btrfs/send.c | 200 ++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 173 insertions(+), 27 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index b5d47b9..ced935b 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1434,16 +1434,6 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, " } if (cur_clone_root) { - if (compressed != BTRFS_COMPRESS_NONE) { - /* - * Offsets given by iterate_extent_inodes() are relative - * to the start of the extent, we need to add logical - * offset from the file extent item. 
- * (See why at backref.c:check_extent_in_eb()) - */ - cur_clone_root->offset += btrfs_file_extent_offset(eb, - fi); - } *found = cur_clone_root; ret = 0; } else { @@ -4687,6 +4677,171 @@ tlv_put_failure: return ret; } +static int send_extent_data(struct send_ctx *sctx, + const u64 offset, + const u64 len) +{ + u64 sent = 0; + + if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) + return send_update_extent(sctx, offset, len); + + while (sent < len) { + u64 size = len - sent; + int ret; + + if (size > BTRFS_SEND_READ_SIZE) + size = BTRFS_SEND_READ_SIZE; + ret = send_write(sctx, offset + sent, size); + if (ret < 0) + return ret; + if (!ret) + break; + sent += ret; + } + return 0; +} + +static int clone_range(struct send_ctx *sctx, + struct clone_root *clone_root, + const u64 disk_byte, + u64 data_offset, + u64 offset, + u64 len) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + /* + * We can't send a clone operation for the entire range if we find + * extent items in the respective range in the source file that + * refer to different extents or if we find holes. + * So check for that and do a mix of clone and regular write/copy + * operations if needed. + * + * Example: + * + * mkfs.btrfs -f /dev/sda + * mount /dev/sda /mnt + * xfs_io -f -c "pwrite -S 0xaa 0K 100K" /mnt/foo + * cp --reflink=always /mnt/foo /mnt/bar + * xfs_io -c "pwrite -S 0xbb 50K 50K" /mnt/foo + * btrfs subvolume snapshot -r /mnt /mnt/snap + * + * If when we send the snapshot and we are processing file bar (which + * has a higher inode number than foo) we blindly send a clone operation + * for the [0, 100K[ range from foo to bar, the receiver ends up getting + * a file bar that matches the content of file foo - iow, doesn't match + * the content from bar in the original filesystem. + */ + key.objectid = clone_root->ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = clone_root->offset; + ret = btrfs_search_slot(NULL, clone_root->root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret > 0 && path->slots[0] > 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); + if (key.objectid == clone_root->ino && + key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; + } + + while (true) { + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + struct btrfs_file_extent_item *ei; + u8 type; + u64 ext_len; + u64 clone_len; + + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(clone_root->root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + + /* + * We might have an implicit trailing hole (NO_HOLES feature + * enabled). We deal with it after leaving this loop. + */ + if (key.objectid != clone_root->ino || + key.type != BTRFS_EXTENT_DATA_KEY) + break; + + ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + type = btrfs_file_extent_type(leaf, ei); + if (type == BTRFS_FILE_EXTENT_INLINE) { + ext_len = btrfs_file_extent_inline_len(leaf, slot, ei); + ext_len = PAGE_CACHE_ALIGN(ext_len); + } else { + ext_len = btrfs_file_extent_num_bytes(leaf, ei); + } + + if (key.offset + ext_len <= clone_root->offset) + goto next; + + if (key.offset > clone_root->offset) { + /* Implicit hole, NO_HOLES feature enabled. 
*/ + u64 hole_len = key.offset - clone_root->offset; + + if (hole_len > len) + hole_len = len; + ret = send_extent_data(sctx, offset, hole_len); + if (ret < 0) + goto out; + + len -= hole_len; + if (len == 0) + break; + offset += hole_len; + clone_root->offset += hole_len; + data_offset += hole_len; + } + + if (key.offset >= clone_root->offset + len) + break; + + clone_len = min_t(u64, ext_len, len); + + if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte && + btrfs_file_extent_offset(leaf, ei) == data_offset) + ret = send_clone(sctx, offset, clone_len, clone_root); + else + ret = send_extent_data(sctx, offset, clone_len); + + if (ret < 0) + goto out; + + len -= clone_len; + if (len == 0) + break; + offset += clone_len; + clone_root->offset += clone_len; + data_offset += clone_len; +next: + path->slots[0]++; + } + + if (len > 0) + ret = send_extent_data(sctx, offset, len); + else + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + static int send_write_or_clone(struct send_ctx *sctx, struct btrfs_path *path, struct btrfs_key *key, @@ -4695,9 +4850,7 @@ static int send_write_or_clone(struct send_ctx *sctx, int ret = 0; struct btrfs_file_extent_item *ei; u64 offset = key->offset; - u64 pos = 0; u64 len; - u32 l; u8 type; u64 bs = sctx->send_root->fs_info->sb->s_blocksize; @@ -4725,22 +4878,15 @@ static int send_write_or_clone(struct send_ctx *sctx, } if (clone_root && IS_ALIGNED(offset + len, bs)) { - ret = send_clone(sctx, offset, len, clone_root); - } else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) { - ret = send_update_extent(sctx, offset, len); + u64 disk_byte; + u64 data_offset; + + disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei); + data_offset = btrfs_file_extent_offset(path->nodes[0], ei); + ret = clone_range(sctx, clone_root, disk_byte, data_offset, + offset, len); } else { - while (pos < len) { - l = len - pos; - if (l > BTRFS_SEND_READ_SIZE) - l = BTRFS_SEND_READ_SIZE; - ret = send_write(sctx, pos + offset, l); - if (ret < 0) - goto out; - if (!ret) - break; - pos += ret; - } - ret = 0; + ret = send_extent_data(sctx, offset, len); } out: return ret; -- cgit v1.1 From b96b1db039ebc584d03a9933b279e0d3e704c528 Mon Sep 17 00:00:00 2001 From: Robin Ruede Date: Wed, 30 Sep 2015 21:23:33 +0200 Subject: btrfs: fix resending received snapshot with parent This fixes a regression introduced by 37b8d27d between v4.1 and v4.2. When a snapshot is received, its received_uuid is set to the original uuid of the subvolume. When that snapshot is then resent to a third filesystem, its received_uuid is set to the second uuid instead of the original one. The same was true for the parent_uuid. This behaviour was partially changed in 37b8d27d, but in that patch only the parent_uuid was taken from the real original, not the uuid itself, causing the search for the parent to fail in the case below. This happens for example when trying to send a series of linked snapshots (e.g. created by snapper) from the backup file system back to the original one.
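Conceptually, the fix below makes every hop in a chain of send/receive operations keep advertising the original subvolume's identity (an illustrative sketch of the metadata involved, not actual stream output):

	fs1: subvolume  uuid=U1  received_uuid=(empty)  -> send stream carries U1
	fs2: snapshot   uuid=U2  received_uuid=U1       -> send stream carries U1 (previously U2)
	fs3: receive can then locate the parent subvolume by U1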
The following commands reproduce the issue in v4.2.1 (no error in 4.1.6) # setup three test file systems for i in 1 2 3; do truncate -s 50M fs$i mkfs.btrfs fs$i mkdir $i mount fs$i $i done echo "content" > 1/testfile btrfs su snapshot -r 1/ 1/snap1 echo "changed content" > 1/testfile btrfs su snapshot -r 1/ 1/snap2 # works fine: btrfs send 1/snap1 | btrfs receive 2/ btrfs send -p 1/snap1 1/snap2 | btrfs receive 2/ # ERROR: could not find parent subvolume btrfs send 2/snap1 | btrfs receive 3/ btrfs send -p 2/snap1 2/snap2 | btrfs receive 3/ Signed-off-by: Robin Ruede Fixes: 37b8d27de5d0 ("Btrfs: use received_uuid of parent during send") Cc: stable@vger.kernel.org # v4.2+ Reviewed-by: Filipe Manana Tested-by: Ed Tomlinson --- fs/btrfs/send.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index ced935b..355a458 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -2343,8 +2343,14 @@ static int send_subvol_begin(struct send_ctx *sctx) } TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen); - TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID, - sctx->send_root->root_item.uuid); + + if (!btrfs_is_empty_uuid(sctx->send_root->root_item.received_uuid)) + TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID, + sctx->send_root->root_item.received_uuid); + else + TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID, + sctx->send_root->root_item.uuid); + TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID, le64_to_cpu(sctx->send_root->root_item.ctransid)); if (parent_root) { -- cgit v1.1 From 8039d87d9e473aeb740d4fdbd59b9d2f89b2ced9 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 13 Oct 2015 15:15:00 +0100 Subject: Btrfs: fix file corruption and data loss after cloning inline extents Currently the clone ioctl allows one to clone an inline extent from one file to another that already has other (non-inlined) extents. This is a problem because btrfs is not designed to deal with files having inline and regular extents: if a file has an inline extent then it must be the only extent in the file and must start at file offset 0. Having a file with an inline extent followed by regular extents results in EIO errors when doing reads or writes against the first 4K of the file. Also, the clone ioctl allows one to lose data if the source file consists of a single inline extent, with a size of N bytes, and the destination file consists of a single inline extent with a size of M bytes, where we have M > N. In this case the clone operation removes the inline extent from the destination file and then copies the inline extent from the source file into the destination file - we lose the M - N bytes from the destination file, a read operation will get the value 0x00 for any bytes in the range [N, M] (the destination inode's i_size remained as M, that's why we can read past N bytes). So fix this by not allowing such destructive operations to happen and return errno EOPNOTSUPP to user space. Currently the fstest btrfs/035 tests the data loss case but it totally ignores this - i.e. expects the operation to succeed and does not check that we got data loss. The following test case for fstests exercises all these cases that result in file corruption and data loss: seq=`basename $0` seqres=$RESULT_DIR/$seq echo "QA output created by $seq" tmp=/tmp/$$ status=1 # failure is the default! trap "_cleanup; exit \$status" 0 1 2 3 15 _cleanup() { rm -f $tmp.* } # get standard environment, filters and checks . ./common/rc . 
./common/filter # real QA test starts here _need_to_be_root _supported_fs btrfs _supported_os Linux _require_scratch _require_cloner _require_btrfs_fs_feature "no_holes" _require_btrfs_mkfs_feature "no-holes" rm -f $seqres.full test_cloning_inline_extents() { local mkfs_opts=$1 local mount_opts=$2 _scratch_mkfs $mkfs_opts >>$seqres.full 2>&1 _scratch_mount $mount_opts # File bar, the source for all the following clone operations, consists # of a single inline extent (50 bytes). $XFS_IO_PROG -f -c "pwrite -S 0xbb 0 50" $SCRATCH_MNT/bar \ | _filter_xfs_io # Test cloning into a file with an extent (non-inlined) where the # destination offset overlaps that extent. It should not be possible to # clone the inline extent from file bar into this file. $XFS_IO_PROG -f -c "pwrite -S 0xaa 0K 16K" $SCRATCH_MNT/foo \ | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo # Doing IO against any range in the first 4K of the file should work. # Due to a past clone ioctl bug which allowed cloning the inline extent, # these operations resulted in EIO errors. echo "File foo data after clone operation:" # All bytes should have the value 0xaa (clone operation failed and did # not modify our file). od -t x1 $SCRATCH_MNT/foo $XFS_IO_PROG -c "pwrite -S 0xcc 0 100" $SCRATCH_MNT/foo | _filter_xfs_io # Test cloning the inline extent against a file which has a hole in its # first 4K followed by a non-inlined extent. It should not be possible # as well to clone the inline extent from file bar into this file. $XFS_IO_PROG -f -c "pwrite -S 0xdd 4K 12K" $SCRATCH_MNT/foo2 \ | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo2 # Doing IO against any range in the first 4K of the file should work. # Due to a past clone ioctl bug which allowed cloning the inline extent, # these operations resulted in EIO errors. echo "File foo2 data after clone operation:" # All bytes should have the value 0x00 (clone operation failed and did # not modify our file). od -t x1 $SCRATCH_MNT/foo2 $XFS_IO_PROG -c "pwrite -S 0xee 0 90" $SCRATCH_MNT/foo2 | _filter_xfs_io # Test cloning the inline extent against a file which has a size of zero # but has a prealloc extent. It should not be possible as well to clone # the inline extent from file bar into this file. $XFS_IO_PROG -f -c "falloc -k 0 1M" $SCRATCH_MNT/foo3 | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo3 # Doing IO against any range in the first 4K of the file should work. # Due to a past clone ioctl bug which allowed cloning the inline extent, # these operations resulted in EIO errors. echo "First 50 bytes of foo3 after clone operation:" # Should not be able to read any bytes, file has 0 bytes i_size (the # clone operation failed and did not modify our file). od -t x1 $SCRATCH_MNT/foo3 $XFS_IO_PROG -c "pwrite -S 0xff 0 90" $SCRATCH_MNT/foo3 | _filter_xfs_io # Test cloning the inline extent against a file which consists of a # single inline extent that has a size not greater than the size of # bar's inline extent (40 < 50). # It should be possible to do the extent cloning from bar to this file. $XFS_IO_PROG -f -c "pwrite -S 0x01 0 40" $SCRATCH_MNT/foo4 \ | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo4 # Doing IO against any range in the first 4K of the file should work. echo "File foo4 data after clone operation:" # Must match file bar's content. 
od -t x1 $SCRATCH_MNT/foo4 $XFS_IO_PROG -c "pwrite -S 0x02 0 90" $SCRATCH_MNT/foo4 | _filter_xfs_io # Test cloning the inline extent against a file which consists of a # single inline extent that has a size greater than the size of bar's # inline extent (60 > 50). # It should not be possible to clone the inline extent from file bar # into this file. $XFS_IO_PROG -f -c "pwrite -S 0x03 0 60" $SCRATCH_MNT/foo5 \ | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo5 # Reading the file should not fail. echo "File foo5 data after clone operation:" # Must have a size of 60 bytes, with all bytes having a value of 0x03 # (the clone operation failed and did not modify our file). od -t x1 $SCRATCH_MNT/foo5 # Test cloning the inline extent against a file which has no extents but # has a size greater than bar's inline extent (16K > 50). # It should not be possible to clone the inline extent from file bar # into this file. $XFS_IO_PROG -f -c "truncate 16K" $SCRATCH_MNT/foo6 | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo6 # Reading the file should not fail. echo "File foo6 data after clone operation:" # Must have a size of 16K, with all bytes having a value of 0x00 (the # clone operation failed and did not modify our file). od -t x1 $SCRATCH_MNT/foo6 # Test cloning the inline extent against a file which has no extents but # has a size not greater than bar's inline extent (30 < 50). # It should be possible to clone the inline extent from file bar into # this file. $XFS_IO_PROG -f -c "truncate 30" $SCRATCH_MNT/foo7 | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo7 # Reading the file should not fail. echo "File foo7 data after clone operation:" # Must have a size of 50 bytes, with all bytes having a value of 0xbb. od -t x1 $SCRATCH_MNT/foo7 # Test cloning the inline extent against a file which has a size not # greater than the size of bar's inline extent (20 < 50) but has # a prealloc extent that goes beyond the file's size. It should not be # possible to clone the inline extent from bar into this file. $XFS_IO_PROG -f -c "falloc -k 0 1M" \ -c "pwrite -S 0x88 0 20" \ $SCRATCH_MNT/foo8 | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo8 echo "File foo8 data after clone operation:" # Must have a size of 20 bytes, with all bytes having a value of 0x88 # (the clone operation did not modify our file). od -t x1 $SCRATCH_MNT/foo8 _scratch_unmount } echo -e "\nTesting without compression and without the no-holes feature...\n" test_cloning_inline_extents echo -e "\nTesting with compression and without the no-holes feature...\n" test_cloning_inline_extents "" "-o compress" echo -e "\nTesting without compression and with the no-holes feature...\n" test_cloning_inline_extents "-O no-holes" "" echo -e "\nTesting with compression and with the no-holes feature...\n" test_cloning_inline_extents "-O no-holes" "-o compress" status=0 exit Cc: stable@vger.kernel.org Signed-off-by: Filipe Manana --- fs/btrfs/ioctl.c | 195 +++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 152 insertions(+), 43 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 80342d3..55a735a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3328,6 +3328,150 @@ static void clone_update_extent_map(struct inode *inode, &BTRFS_I(inode)->runtime_flags); } +/* + * Make sure we do not end up inserting an inline extent into a file that has + * already other (non-inline) extents. 
If a file has an inline extent it can + * not have any other extents and the (single) inline extent must start at the + * file offset 0. Failing to respect these rules will lead to file corruption, + * resulting in EIO errors on read/write operations, hitting BUG_ON's in mm, etc + * + * We can have extents that have been already written to disk or we can have + * dirty ranges still in delalloc, in which case the extent maps and items are + * created only when we run delalloc, and the delalloc ranges might fall outside + * the range we are currently locking in the inode's io tree. So we check the + * inode's i_size because of that (i_size updates are done while holding the + * i_mutex, which we are holding here). + * We also check to see if the inode has a size not greater than "datal" but has + * extents beyond it, due to an fallocate with FALLOC_FL_KEEP_SIZE (and we are + * protected against such concurrent fallocate calls by the i_mutex). + * + * If the file has no extents but a size greater than datal, do not allow the + * copy because we would need turn the inline extent into a non-inline one (even + * with NO_HOLES enabled). If we find our destination inode only has one inline + * extent, just overwrite it with the source inline extent if its size is less + * than the source extent's size, or we could copy the source inline extent's + * data into the destination inode's inline extent if the later is greater then + * the former. + */ +static int clone_copy_inline_extent(struct inode *src, + struct inode *dst, + struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_key *new_key, + const u64 drop_start, + const u64 datal, + const u64 skip, + const u64 size, + char *inline_data) +{ + struct btrfs_root *root = BTRFS_I(dst)->root; + const u64 aligned_end = ALIGN(new_key->offset + datal, + root->sectorsize); + int ret; + struct btrfs_key key; + + if (new_key->offset > 0) + return -EOPNOTSUPP; + + key.objectid = btrfs_ino(dst); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + return ret; + } else if (ret > 0) { + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + return ret; + else if (ret > 0) + goto copy_inline_extent; + } + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid == btrfs_ino(dst) && + key.type == BTRFS_EXTENT_DATA_KEY) { + ASSERT(key.offset > 0); + return -EOPNOTSUPP; + } + } else if (i_size_read(dst) <= datal) { + struct btrfs_file_extent_item *ei; + u64 ext_len; + + /* + * If the file size is <= datal, make sure there are no other + * extents following (can happen do to an fallocate call with + * the flag FALLOC_FL_KEEP_SIZE). + */ + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + /* + * If it's an inline extent, it can not have other extents + * following it. 
+ */ + if (btrfs_file_extent_type(path->nodes[0], ei) == + BTRFS_FILE_EXTENT_INLINE) + goto copy_inline_extent; + + ext_len = btrfs_file_extent_num_bytes(path->nodes[0], ei); + if (ext_len > aligned_end) + return -EOPNOTSUPP; + + ret = btrfs_next_item(root, path); + if (ret < 0) { + return ret; + } else if (ret == 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid == btrfs_ino(dst) && + key.type == BTRFS_EXTENT_DATA_KEY) + return -EOPNOTSUPP; + } + } + +copy_inline_extent: + /* + * We have no extent items, or we have an extent at offset 0 which may + * or may not be inlined. All these cases are dealt with in the same way. + */ + if (i_size_read(dst) > datal) { + /* + * If the destination inode has an inline extent... + * This would require copying the data from the source inline + * extent into the beginning of the destination's inline extent. + * But this is really complex, both extents can be compressed + * or just one of them, which would require decompressing and + * re-compressing data (which could increase the new compressed + * size, not allowing the compressed data to fit anymore in an + * inline extent). + * So just don't support this case for now (it should be rare, + * we are not really saving space when cloning inline extents). + */ + return -EOPNOTSUPP; + } + + btrfs_release_path(path); + ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1); + if (ret) + return ret; + ret = btrfs_insert_empty_item(trans, root, path, new_key, size); + if (ret) + return ret; + + if (skip) { + const u32 start = btrfs_file_extent_calc_inline_size(0); + + memmove(inline_data + start, inline_data + start + skip, datal); + } + + write_extent_buffer(path->nodes[0], inline_data, + btrfs_item_ptr_offset(path->nodes[0], + path->slots[0]), + size); + inode_add_bytes(dst, datal); + + return 0; +} + /** * btrfs_clone() - clone a range from inode file to another * @@ -3594,21 +3738,6 @@ process_slot: } else if (type == BTRFS_FILE_EXTENT_INLINE) { u64 skip = 0; u64 trim = 0; - u64 aligned_end = 0; - - /* - * Don't copy an inline extent into an offset - * greater than zero. Having an inline extent - * at such an offset results in chaos as btrfs - * isn't prepared for such cases. Just skip - * this case for the same reasons as commented - * at btrfs_ioctl_clone(). - */ - if (last_dest_end > 0) { - ret = -EOPNOTSUPP; - btrfs_end_transaction(trans, root); - goto out; - } if (off > key.offset) { skip = off - key.offset; @@ -3626,42 +3755,22 @@ process_slot: size -= skip + trim; datal -= skip + trim; - aligned_end = ALIGN(new_key.offset + datal, - root->sectorsize); - ret = btrfs_drop_extents(trans, root, inode, - drop_start, - aligned_end, - 1); + ret = clone_copy_inline_extent(src, inode, + trans, path, + &new_key, + drop_start, + datal, + skip, size, buf); if (ret) { if (ret != -EOPNOTSUPP) btrfs_abort_transaction(trans, - root, ret); - btrfs_end_transaction(trans, root); - goto out; - } - - ret = btrfs_insert_empty_item(trans, root, path, - &new_key, size); - if (ret) { - btrfs_abort_transaction(trans, root, - ret); + root, + ret); btrfs_end_transaction(trans, root); goto out; } - - if (skip) { - u32 start = - btrfs_file_extent_calc_inline_size(0); - memmove(buf+start, buf+start+skip, - datal); - } - leaf = path->nodes[0]; slot = path->slots[0]; - write_extent_buffer(leaf, buf, - btrfs_item_ptr_offset(leaf, slot), - size); - inode_add_bytes(inode, datal); } /* If we have an implicit hole (NO_HOLES feature).
*/ -- cgit v1.1 From 5e6ecb362bd5950a3d8ce19c32829e4f8c7917d9 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 13 Oct 2015 16:36:09 +0100 Subject: Btrfs: fix double range unlock of hole region when reading page If when reading a page we find a hole and our caller had already locked the range (bio flags has the bit EXTENT_BIO_PARENT_LOCKED set), we end up unlocking the hole's range and then later our caller unlocks it again, which might have already been locked by some other task once the first unlock happened. Currently this can only happen during a call to the extent_same ioctl, as it's the only caller of __do_readpage() that sets the bit EXTENT_BIO_PARENT_LOCKED for bio flags. Fix this by leaving the unlock exclusively to the caller. Signed-off-by: Filipe Manana --- fs/btrfs/extent_io.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ecb1204..6e6df34 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3070,8 +3070,12 @@ static int __do_readpage(struct extent_io_tree *tree, set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); - unlock_extent_cached(tree, cur, cur + iosize - 1, - &cached, GFP_NOFS); + if (parent_locked) + free_extent_state(cached); + else + unlock_extent_cached(tree, cur, + cur + iosize - 1, + &cached, GFP_NOFS); cur = cur + iosize; pg_offset += iosize; continue; -- cgit v1.1 From 0305cd5f7fca85dae392b9ba85b116896eb7c1c7 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 16 Oct 2015 12:34:25 +0100 Subject: Btrfs: fix truncation of compressed and inlined extents When truncating to a smaller size a file which consists of an inline extent that is compressed, we did not discard (or make unusable) the data between the new file size and the old file size, wasting metadata space and allowing for the truncated data to be leaked and the data corruption/loss mentioned below. We were also not correctly decrementing the number of bytes used by the inode; we were setting it to zero, giving a wrong report for callers of the stat(2) syscall. The fsck tool also reported an error about a mismatch between the nbytes of the file versus the real space used by the file. Now because we weren't discarding the truncated region of the file, it was possible for a caller of the clone ioctl to actually read the data that was truncated, allowing for a security breach without requiring root access to the system, using only standard filesystem operations. The scenario is the following: 1) User A creates a file which consists of an inline and compressed extent with a size of 2000 bytes - the file is not accessible to any other users (no read, write or execution permission for anyone else); 2) The user truncates the file to a size of 1000 bytes; 3) User A makes the file world readable; 4) User B creates a file consisting of an inline extent of 2000 bytes; 5) User B issues a clone operation from user A's file into its own file (using a length argument of 0, clone the whole range); 6) User B now gets to see the 1000 bytes that user A truncated from its file before it made its file world readable. User B also lost the bytes in the range [1000, 2000[ from its own file, but that might be ok if his/her intention was reading stale data from user A that was never supposed to be public. Note that this contrasts with the case where we truncate a file from 2000 bytes to 1000 bytes and then truncate it back from 1000 to 2000 bytes.
In this case reading any byte from the range [1000, 2000[ will return a value of 0x00, instead of the original data. This problem has existed since the clone ioctl was added and happens both with and without my recent data loss and file corruption fixes for the clone ioctl (patch "Btrfs: fix file corruption and data loss after cloning inline extents"). So fix this by truncating the compressed inline extents as we do for the non-compressed case, which involves decompressing, if the data isn't already in the page cache, compressing the truncated version of the extent, writing the compressed content into the inline extent and then truncating it. The following test case for fstests reproduces the problem. In order for the test to pass both this fix and my previous fix for the clone ioctl that forbids cloning a smaller inline extent into a larger one, which is titled "Btrfs: fix file corruption and data loss after cloning inline extents", are needed. Without that other fix the test fails in a different way that does not leak the truncated data; instead, part of the destination file gets replaced with zeroes (because the destination file has a larger inline extent than the source). seq=`basename $0` seqres=$RESULT_DIR/$seq echo "QA output created by $seq" tmp=/tmp/$$ status=1 # failure is the default! trap "_cleanup; exit \$status" 0 1 2 3 15 _cleanup() { rm -f $tmp.* } # get standard environment, filters and checks . ./common/rc . ./common/filter # real QA test starts here _need_to_be_root _supported_fs btrfs _supported_os Linux _require_scratch _require_cloner rm -f $seqres.full _scratch_mkfs >>$seqres.full 2>&1 _scratch_mount "-o compress" # Create our test files. File foo is going to be the source of a clone operation # and consists of a single inline extent with an uncompressed size of 512 bytes, # while file bar consists of a single inline extent with an uncompressed size of # 256 bytes. For our test's purpose, it's important that file bar has an inline # extent with a size smaller than foo's inline extent. $XFS_IO_PROG -f -c "pwrite -S 0xa1 0 128" \ -c "pwrite -S 0x2a 128 384" \ $SCRATCH_MNT/foo | _filter_xfs_io $XFS_IO_PROG -f -c "pwrite -S 0xbb 0 256" $SCRATCH_MNT/bar | _filter_xfs_io # Now durably persist all metadata and data. We do this to make sure that we get # on disk an inline extent with a size of 512 bytes for file foo. sync # Now truncate our file foo to a smaller size. Because it consists of a # compressed and inline extent, btrfs did not shrink the inline extent to the # new size (if the extent was not compressed, btrfs would shrink it to 128 # bytes); it only updates the inode's i_size to 128 bytes. $XFS_IO_PROG -c "truncate 128" $SCRATCH_MNT/foo # Now clone foo's inline extent into bar. # This clone operation should fail with errno EOPNOTSUPP because the source # file consists only of an inline extent and the file's size is smaller than # the inline extent of the destination (128 bytes < 256 bytes). However the # clone ioctl was not prepared to deal with a file that has a size smaller # than the size of its inline extent (something that happens only for compressed # inline extents), resulting in copying the full inline extent from the source # file into the destination file.
# # Note that btrfs' clone operation for inline extents consists of removing the # inline extent from the destination inode and copying the inline extent from the # source inode into the destination inode, meaning that if the destination # inode's inline extent is larger (N bytes) than the source inode's inline # extent (M bytes), some bytes (N - M bytes) will be lost from the destination # file. Btrfs could copy the source inline extent's data into the destination's # inline extent so that we would not lose any data, but that's currently not # done due to the complexity that would be needed to deal with such cases # (especially when one or both extents are compressed), returning EOPNOTSUPP, as # it's normally not a very common case to clone very small files (only case # where we get inline extents) and copying inline extents does not save any # space (unlike for normal, non-inlined extents). $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/foo $SCRATCH_MNT/bar # Now because the above clone operation used to succeed, and due to foo's inline # extent not being shrunk by the truncate operation, our file bar got the whole # inline extent copied from foo, making us lose the last 128 bytes from bar # which got replaced by the bytes in range [128, 256[ from foo before foo was # truncated - in other words, data loss from bar and being able to read old and # stale data from foo that should not be possible to read anymore through normal # filesystem operations. Contrast this with the case where we truncate a file from # a size N to a smaller size M and truncate it back to size N: when we then read # the range [M, N[, we should always get the value 0x00 for all the bytes in that # range. # We expected the clone operation to fail with errno EOPNOTSUPP and therefore # not modify our file bar's data/metadata. So its content should be 256 bytes # long with all bytes having the value 0xbb. # # Without the btrfs bug fix, the clone operation succeeded and resulted in # leaking truncated data from foo, the bytes that belonged to its range # [128, 256[, and losing data from bar in that same range.
So reading the # file gave us the following content: # # 0000000 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 # * # 0000200 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a # * # 0000400 echo "File bar's content after the clone operation:" od -t x1 $SCRATCH_MNT/bar # Also, because foo's inline extent was not shrunk by the truncate # operation, btrfs' fsck, which is run by the fstests framework every time a # test completes, failed, reporting the following error: # # root 5 inode 257 errors 400, nbytes wrong status=0 exit Cc: stable@vger.kernel.org Signed-off-by: Filipe Manana --- fs/btrfs/inode.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 68 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 208db4e..cbb4286 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4217,6 +4217,47 @@ static int truncate_space_check(struct btrfs_trans_handle *trans, } +static int truncate_inline_extent(struct inode *inode, + struct btrfs_path *path, + struct btrfs_key *found_key, + const u64 item_end, + const u64 new_size) +{ + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + struct btrfs_file_extent_item *fi; + u32 size = (u32)(new_size - found_key->offset); + struct btrfs_root *root = BTRFS_I(inode)->root; + + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + + if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE) { + loff_t offset = new_size; + loff_t page_end = ALIGN(offset, PAGE_CACHE_SIZE); + + /* + * Zero out the remainder of the last page of our inline extent, + * instead of directly truncating our inline extent here - that + * would be much more complex (decompressing all the data, then + * compressing the truncated data, which might be bigger than + * the size of the inline extent, resize the extent, etc). + * We release the path because to get the page we might need to + * read the extent item from disk (data not in the page cache). + */ + btrfs_release_path(path); + return btrfs_truncate_page(inode, offset, page_end - offset, 0); + } + + btrfs_set_file_extent_ram_bytes(leaf, fi, size); + size = btrfs_file_extent_calc_inline_size(size); + btrfs_truncate_item(root, path, size, 1); + + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) + inode_sub_bytes(inode, item_end + 1 - new_size); + + return 0; +} + /* * this can truncate away extent items, csum items and directory items. * It starts at a high offset and removes keys until it can't find @@ -4411,27 +4452,40 @@ search_again: * special encodings */ if (!del_item && - btrfs_file_extent_compression(leaf, fi) == 0 && btrfs_file_extent_encryption(leaf, fi) == 0 && btrfs_file_extent_other_encoding(leaf, fi) == 0) { - u32 size = new_size - found_key.offset; - - if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) - inode_sub_bytes(inode, item_end + 1 - - new_size); /* - * update the ram bytes to properly reflect - * the new size of our item + * Need to release path in order to truncate a + * compressed extent. So delete any accumulated + * extent items so far.
*/ - btrfs_set_file_extent_ram_bytes(leaf, fi, size); - size = - btrfs_file_extent_calc_inline_size(size); - btrfs_truncate_item(root, path, size, 1); + if (btrfs_file_extent_compression(leaf, fi) != + BTRFS_COMPRESS_NONE && pending_del_nr) { + err = btrfs_del_items(trans, root, path, + pending_del_slot, + pending_del_nr); + if (err) { + btrfs_abort_transaction(trans, + root, + err); + goto error; + } + pending_del_nr = 0; + } + + err = truncate_inline_extent(inode, path, + &found_key, + item_end, + new_size); + if (err) { + btrfs_abort_transaction(trans, + root, err); + goto error; + } } else if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { - inode_sub_bytes(inode, item_end + 1 - - found_key.offset); + inode_sub_bytes(inode, item_end + 1 - new_size); } } delete: -- cgit v1.1 From 0d51e28a11ad428ac8545a3ec8724395fa1c71a7 Mon Sep 17 00:00:00 2001 From: Chandan Rajendra Date: Mon, 27 Jul 2015 15:26:43 +0530 Subject: Btrfs: btrfs_submit_bio_hook: Use btrfs_wq_endio_type values instead of integer constants btrfs_submit_bio_hook() uses integer constants instead of values from "enum btrfs_wq_endio_type". Fix this. Reviewed-by: David Sterba Signed-off-by: Chandan Rajendra Signed-off-by: David Sterba --- fs/btrfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 611b66d..dc3a9fc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1861,15 +1861,15 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, u64 bio_offset) { struct btrfs_root *root = BTRFS_I(inode)->root; + enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA; int ret = 0; int skip_sum; - int metadata = 0; int async = !atomic_read(&BTRFS_I(inode)->sync_writers); skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; if (btrfs_is_free_space_inode(inode)) - metadata = 2; + metadata = BTRFS_WQ_ENDIO_FREE_SPACE; if (!(rw & REQ_WRITE)) { ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata); -- cgit v1.1 From ce0eac2a1d82eb841eae357bcef7ac39df131b21 Mon Sep 17 00:00:00 2001 From: Alexandru Moise <00moses.alexander00@gmail.com> Date: Sun, 23 Aug 2015 16:01:42 +0000 Subject: btrfs: Fixed dsize and last_off declarations The return values of btrfs_item_offset_nr and btrfs_item_size_nr are of type u32. To avoid mixing signed and unsigned integers we should also declare dsize and last_off to be of type u32. Reviewed-by: David Sterba Signed-off-by: Alexandru Moise <00moses.alexander00@gmail.com> Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 5f745ea..f553913 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4940,8 +4940,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, { struct extent_buffer *leaf; struct btrfs_item *item; - int last_off; - int dsize = 0; + u32 last_off; + u32 dsize = 0; int ret = 0; int wret; int i; -- cgit v1.1 From 0412e58c6d62168643a285f99b2354fa97df6cf3 Mon Sep 17 00:00:00 2001 From: Alexandru Moise <00moses.alexander00@gmail.com> Date: Mon, 24 Aug 2015 21:56:28 +0000 Subject: btrfs: Fixed declaration of old_len old_len is used to store the return value of btrfs_item_size_nr(). The return value of btrfs_item_size_nr() is of type u32. To improve code correctness and avoid mixing signed and unsigned integers I've changed old_len to be of type u32 as well. 
Reviewed-by: David Sterba Signed-off-by: Alexandru Moise <00moses.alexander00@gmail.com> Signed-off-by: David Sterba --- fs/btrfs/root-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 360a728..024ea4f 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -141,7 +141,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root int ret; int slot; unsigned long ptr; - int old_len; + u32 old_len; path = btrfs_alloc_path(); if (!path) -- cgit v1.1 From f2f767e7345dfe56102d6809f647ba38a238f718 Mon Sep 17 00:00:00 2001 From: Alexandru Moise <00moses.alexander00@gmail.com> Date: Thu, 27 Aug 2015 23:53:45 +0000 Subject: btrfs: trimming some start_transaction() code away Just call kmem_cache_zalloc() instead of calling kmem_cache_alloc(). We're just initializing most fields to 0, false and NULL later on _anyway_, so to make the code more readable and potentially gain a bit of performance (completely untested claim), we should fill our btrfs_trans_handle with zeros on allocation then just initialize those five remaining fields (not counting the list_heads) as normal. Reviewed-by: David Sterba Signed-off-by: Alexandru Moise <00moses.alexander00@gmail.com> Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index a5b0644..c191606 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -502,7 +502,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, goto reserve_fail; } again: - h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); + h = kmem_cache_zalloc(btrfs_trans_handle_cachep, GFP_NOFS); if (!h) { ret = -ENOMEM; goto alloc_fail; @@ -543,23 +543,10 @@ again: h->transid = cur_trans->transid; h->transaction = cur_trans; - h->blocks_used = 0; - h->bytes_reserved = 0; - h->chunk_bytes_reserved = 0; h->root = root; - h->delayed_ref_updates = 0; h->use_count = 1; - h->adding_csums = 0; - h->block_rsv = NULL; - h->orig_rsv = NULL; - h->aborted = 0; - h->qgroup_reserved = 0; - h->delayed_ref_elem.seq = 0; h->type = type; - h->allocating_chunk = false; h->can_flush_pending_bgs = true; - h->reloc_reserved = false; - h->sync = false; INIT_LIST_HEAD(&h->qgroup_ref_list); INIT_LIST_HEAD(&h->new_bgs); INIT_LIST_HEAD(&h->ordered); -- cgit v1.1 From d7641a49a54f66e1a323d0de6b42caeee6d33aa5 Mon Sep 17 00:00:00 2001 From: Byongho Lee Date: Tue, 1 Sep 2015 23:10:57 +0900 Subject: btrfs: replace unnecessary list_for_each_entry_safe to list_for_each_entry No list element is removed while iterating over the list, so replace list_for_each_entry_safe with list_for_each_entry.
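As a rough userspace illustration of the rule behind this change (a simplified singly-linked list rather than the kernel's struct list_head): the _safe variant caches the next pointer so the current node may be removed or freed mid-walk, while a plain walk is sufficient, and cheaper to read, when the loop body only inspects nodes, as in btrfs_ioctl_fs_info().

#include <stdlib.h>

struct node {
	struct node *next;
	int devid;
};

/* Read-only walk: nothing is unlinked or freed, so no lookahead
 * pointer is needed (the analogue of list_for_each_entry). */
static int max_devid(const struct node *head)
{
	int max = 0;
	const struct node *n;

	for (n = head; n; n = n->next)
		if (n->devid > max)
			max = n->devid;
	return max;
}

/* Destructive walk: the next pointer must be saved before freeing
 * the node, which is what the _safe iteration variant does. */
static void free_all(struct node *head)
{
	struct node *n = head, *next;

	while (n) {
		next = n->next; /* cache before the node is gone */
		free(n);
		n = next;
	}
}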
Reviewed-by: David Sterba Signed-off-by: Byongho Lee Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0adf542..9181e64 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2699,7 +2699,6 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg) { struct btrfs_ioctl_fs_info_args *fi_args; struct btrfs_device *device; - struct btrfs_device *next; struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; int ret = 0; @@ -2711,7 +2710,7 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg) fi_args->num_devices = fs_devices->num_devices; memcpy(&fi_args->fsid, root->fs_info->fsid, sizeof(fi_args->fsid)); - list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { + list_for_each_entry(device, &fs_devices->devices, dev_list) { if (device->devid > fi_args->max_id) fi_args->max_id = device->devid; } -- cgit v1.1 From 568b1c9cca82623af764ee6ea65dc41a7079171c Mon Sep 17 00:00:00 2001 From: Byongho Lee Date: Tue, 1 Sep 2015 23:36:28 +0900 Subject: btrfs: remove unnecessary list_del We only iterate over the list to free every item; there is no need to unlink each one with the list_del macro first, so remove the list_del call. Reviewed-by: David Sterba Signed-off-by: Byongho Lee Signed-off-by: David Sterba --- fs/btrfs/inode.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index dc3a9fc..c4a97dc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2599,7 +2599,6 @@ static void free_sa_defrag_extent(struct new_sa_defrag_extent *new) return; list_for_each_entry_safe(old, tmp, &new->head, list) { - list_del(&old->list); kfree(old); } kfree(new); -- cgit v1.1 From a099d0fdb34f00cee346703036a0a90d267e77d7 Mon Sep 17 00:00:00 2001 From: Alexandru Moise <00moses.alexander00@gmail.com> Date: Mon, 7 Sep 2015 17:24:37 +0300 Subject: btrfs: memset cur_trans->delayed_refs to zero Use memset() to null out the btrfs_delayed_ref_root of btrfs_transaction instead of setting all the members to 0 by hand. Reviewed-by: David Sterba Signed-off-by: Alexandru Moise <00moses.alexander00@gmail.com> Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index c191606..c2f84e2 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -236,15 +236,11 @@ loop: cur_trans->start_time = get_seconds(); cur_trans->dirty_bg_run = 0; + memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs)); + cur_trans->delayed_refs.href_root = RB_ROOT; cur_trans->delayed_refs.dirty_extent_root = RB_ROOT; atomic_set(&cur_trans->delayed_refs.num_entries, 0); - cur_trans->delayed_refs.num_heads_ready = 0; - cur_trans->delayed_refs.pending_csums = 0; - cur_trans->delayed_refs.num_heads = 0; - cur_trans->delayed_refs.flushing = 0; - cur_trans->delayed_refs.run_delayed_start = 0; - cur_trans->delayed_refs.qgroup_to_skip = 0; /* * although the tree mod log is per file system and not per transaction, -- cgit v1.1 From e9cf439f0dc824cca6b2dc0dbb14e17f3f80b7f1 Mon Sep 17 00:00:00 2001 From: Alexandru Moise <00moses.alexander00@gmail.com> Date: Wed, 9 Sep 2015 00:18:50 +0000 Subject: btrfs: use a single if() statement for one outcome in get_block_rsv() Rather than having three separate if() statements for the same outcome, we should just OR them together in one if() statement.
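The transformation relies on each branch assigning the same value with no side effects; a tiny standalone sketch of the pattern (illustrative names, not the btrfs API):

#include <stdbool.h>

static int pick(bool a, bool b, bool c, bool d)
{
	int rsv = 0;

	/* Before: three if() statements, each assigning rsv = 1.
	 * After: one condition; short-circuit evaluation preserves
	 * the behavior because the branches have no side effects. */
	if (a || (b && c) || d)
		rsv = 1;

	return rsv;
}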
Reviewed-by: David Sterba Signed-off-by: Alexandru Moise <00moses.alexander00@gmail.com> Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 601d7d4..8a1759b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4891,13 +4891,9 @@ static struct btrfs_block_rsv *get_block_rsv( { struct btrfs_block_rsv *block_rsv = NULL; - if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) - block_rsv = trans->block_rsv; - - if (root == root->fs_info->csum_root && trans->adding_csums) - block_rsv = trans->block_rsv; - - if (root == root->fs_info->uuid_root) + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || + (root == root->fs_info->csum_root && trans->adding_csums) || + (root == root->fs_info->uuid_root)) block_rsv = trans->block_rsv; if (!block_rsv) -- cgit v1.1 From af902047508d3619499cbbc2f7c4c5e16428b2e3 Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Tue, 15 Sep 2015 21:08:06 +0800 Subject: btrfs: Move btrfs_raid_array to public This array is used to record the attributes of each raid type; make it public, and many functions will benefit from this array. For example, in num_tolerated_disk_barrier_failures() we can avoid complex conditions and get raid attributes simply by accessing the array. It also simplifies the code logic, reduces duplicated code, and improves maintainability. Reviewed-by: David Sterba Signed-off-by: Zhao Lei Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 128 +++++++++++++++++++++++++++++------------------------ fs/btrfs/volumes.h | 4 ++ 2 files changed, 73 insertions(+), 59 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 6fc73586..2a65d2d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -42,6 +42,75 @@ #include "dev-replace.h" #include "sysfs.h" +const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { + [BTRFS_RAID_RAID10] = { + .sub_stripes = 2, + .dev_stripes = 1, + .devs_max = 0, /* 0 == as many as possible */ + .devs_min = 4, + .devs_increment = 2, + .ncopies = 2, + }, + [BTRFS_RAID_RAID1] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 2, + .devs_min = 2, + .devs_increment = 2, + .ncopies = 2, + }, + [BTRFS_RAID_DUP] = { + .sub_stripes = 1, + .dev_stripes = 2, + .devs_max = 1, + .devs_min = 1, + .devs_increment = 1, + .ncopies = 2, + }, + [BTRFS_RAID_RAID0] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 0, + .devs_min = 2, + .devs_increment = 1, + .ncopies = 1, + }, + [BTRFS_RAID_SINGLE] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 1, + .devs_min = 1, + .devs_increment = 1, + .ncopies = 1, + }, + [BTRFS_RAID_RAID5] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 0, + .devs_min = 2, + .devs_increment = 1, + .ncopies = 2, + }, + [BTRFS_RAID_RAID6] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 0, + .devs_min = 3, + .devs_increment = 1, + .ncopies = 3, + }, +}; + +const u64 const btrfs_raid_group[BTRFS_NR_RAID_TYPES] = { + [BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10, + [BTRFS_RAID_RAID1] = BTRFS_BLOCK_GROUP_RAID1, + [BTRFS_RAID_DUP] = BTRFS_BLOCK_GROUP_DUP, + [BTRFS_RAID_RAID0] = BTRFS_BLOCK_GROUP_RAID0, + [BTRFS_RAID_SINGLE] = 0, + [BTRFS_RAID_RAID5] = BTRFS_BLOCK_GROUP_RAID5, + [BTRFS_RAID_RAID6] = BTRFS_BLOCK_GROUP_RAID6, +}; + static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_device *device); @@ -4285,65 +4354,6 @@ static int
btrfs_cmp_device_info(const void *a, const void *b) return 0; } -static const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { - [BTRFS_RAID_RAID10] = { - .sub_stripes = 2, - .dev_stripes = 1, - .devs_max = 0, /* 0 == as many as possible */ - .devs_min = 4, - .devs_increment = 2, - .ncopies = 2, - }, - [BTRFS_RAID_RAID1] = { - .sub_stripes = 1, - .dev_stripes = 1, - .devs_max = 2, - .devs_min = 2, - .devs_increment = 2, - .ncopies = 2, - }, - [BTRFS_RAID_DUP] = { - .sub_stripes = 1, - .dev_stripes = 2, - .devs_max = 1, - .devs_min = 1, - .devs_increment = 1, - .ncopies = 2, - }, - [BTRFS_RAID_RAID0] = { - .sub_stripes = 1, - .dev_stripes = 1, - .devs_max = 0, - .devs_min = 2, - .devs_increment = 1, - .ncopies = 1, - }, - [BTRFS_RAID_SINGLE] = { - .sub_stripes = 1, - .dev_stripes = 1, - .devs_max = 1, - .devs_min = 1, - .devs_increment = 1, - .ncopies = 1, - }, - [BTRFS_RAID_RAID5] = { - .sub_stripes = 1, - .dev_stripes = 1, - .devs_max = 0, - .devs_min = 2, - .devs_increment = 1, - .ncopies = 2, - }, - [BTRFS_RAID_RAID6] = { - .sub_stripes = 1, - .dev_stripes = 1, - .devs_max = 0, - .devs_min = 3, - .devs_increment = 1, - .ncopies = 3, - }, -}; - static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target) { /* TODO allow them to set a preferred stripe size */ diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 2ca784a..75d6083 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -338,6 +338,10 @@ struct btrfs_raid_attr { int ncopies; /* how many copies to data has */ }; +extern const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES]; + +extern const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES]; + struct map_lookup { u64 type; int io_align; -- cgit v1.1 From 8789f4fe608922f484369382a8e507ca12df7f4e Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Tue, 15 Sep 2015 21:08:07 +0800 Subject: btrfs: use btrfs_raid_array for btrfs_get_num_tolerated_disk_barrier_failures() btrfs_raid_array[] is used to define all raid attributes; use it to get tolerated_failures in btrfs_get_num_tolerated_disk_barrier_failures() instead of the complex conditions in that function. This makes the code simpler and automatically supports any other raid type added in the future. Reviewed-by: David Sterba Signed-off-by: Zhao Lei Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 35 ++++++++++++++++++++++------------- fs/btrfs/volumes.c | 7 +++++++ fs/btrfs/volumes.h | 1 + 3 files changed, 30 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1e60d00..322fd6d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3449,22 +3449,31 @@ static int barrier_all_devices(struct btrfs_fs_info *info) int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags) { - if ((flags & (BTRFS_BLOCK_GROUP_DUP | - BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_AVAIL_ALLOC_BIT_SINGLE)) || - ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0)) - return 0; + int raid_type; + int min_tolerated = INT_MAX; - if (flags & (BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID10)) - return 1; + if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 || + (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE)) + min_tolerated = min(min_tolerated, + btrfs_raid_array[BTRFS_RAID_SINGLE]. + tolerated_failures); - if (flags & BTRFS_BLOCK_GROUP_RAID6) - return 2; + for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) { + if (raid_type == BTRFS_RAID_SINGLE) + continue; + if (!(flags & btrfs_raid_group[raid_type])) + continue; + min_tolerated = min(min_tolerated, + btrfs_raid_array[raid_type].
+ tolerated_failures); + } - pr_warn("BTRFS: unknown raid type: %llu\n", flags); - return 0; + if (min_tolerated == INT_MAX) { + pr_warn("BTRFS: unknown raid flag: %llu\n", flags); + min_tolerated = 0; + } + + return min_tolerated; } int btrfs_calc_num_tolerated_disk_barrier_failures( diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2a65d2d..7b53390 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -48,6 +48,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .dev_stripes = 1, .devs_max = 0, /* 0 == as many as possible */ .devs_min = 4, + .tolerated_failures = 1, .devs_increment = 2, .ncopies = 2, }, @@ -56,6 +57,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .dev_stripes = 1, .devs_max = 2, .devs_min = 2, + .tolerated_failures = 1, .devs_increment = 2, .ncopies = 2, }, @@ -64,6 +66,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .dev_stripes = 2, .devs_max = 1, .devs_min = 1, + .tolerated_failures = 0, .devs_increment = 1, .ncopies = 2, }, @@ -72,6 +75,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .dev_stripes = 1, .devs_max = 0, .devs_min = 2, + .tolerated_failures = 0, .devs_increment = 1, .ncopies = 1, }, @@ -80,6 +84,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .dev_stripes = 1, .devs_max = 1, .devs_min = 1, + .tolerated_failures = 0, .devs_increment = 1, .ncopies = 1, }, @@ -88,6 +93,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .dev_stripes = 1, .devs_max = 0, .devs_min = 2, + .tolerated_failures = 1, .devs_increment = 1, .ncopies = 2, }, @@ -96,6 +102,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .dev_stripes = 1, .devs_max = 0, .devs_min = 3, + .tolerated_failures = 2, .devs_increment = 1, .ncopies = 3, }, diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 75d6083..04ff0ef 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -334,6 +334,7 @@ struct btrfs_raid_attr { int dev_stripes; /* stripes per dev */ int devs_max; /* max devs to use */ int devs_min; /* min devs needed */ + int tolerated_failures; /* max tolerated fail devs */ int devs_increment; /* ndevs has to be a multiple of this */ int ncopies; /* how many copies to data has */ }; -- cgit v1.1 From 9c170b2644610a62b107c12a007e065d4456798d Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Tue, 15 Sep 2015 21:08:08 +0800 Subject: btrfs: use btrfs_raid_array in btrfs_reduce_alloc_profile btrfs_raid_array[] holds the attributes of all raid types. Using btrfs_raid_array[].devs_min is the best way to check the device requirements in btrfs_reduce_alloc_profile(), instead of using complex conditions for each raid type.
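A standalone sketch of the table-driven idea (illustrative constants, not the real block-group bits): deriving the allowed-profile mask from a per-type devs_min attribute replaces the per-profile if chain and automatically covers any raid type added to the table.

enum { R10, R1, DUP, R0, SINGLE, R5, R6, NR_TYPES };

static const unsigned devs_min[NR_TYPES] = { 4, 2, 1, 2, 1, 2, 3 };
static const unsigned long long group_bit[NR_TYPES] = {
	1ULL << 0, 1ULL << 1, 1ULL << 2, 1ULL << 3, 0, 1ULL << 4, 1ULL << 5,
};

/* Every raid type whose minimum device count is satisfied
 * contributes its block-group bit to the allowed mask. */
static unsigned long long allowed_profiles(unsigned num_devices)
{
	unsigned long long allowed = 0;
	int t;

	for (t = 0; t < NR_TYPES; t++)
		if (num_devices >= devs_min[t])
			allowed |= group_bit[t];
	return allowed;
}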
Reviewed-by: David Sterba Signed-off-by: Zhao Lei Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 48 ++++++++++++++++++++++-------------------------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 8a1759b..2b29800 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3822,7 +3822,8 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) { u64 num_devices = root->fs_info->fs_devices->rw_devices; u64 target; - u64 tmp; + u64 raid_type; + u64 allowed = 0; /* * see if restripe for this chunk_type is in progress, if so @@ -3840,31 +3841,26 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) spin_unlock(&root->fs_info->balance_lock); /* First, mask out the RAID levels which aren't possible */ - if (num_devices == 1) - flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID5); - if (num_devices < 3) - flags &= ~BTRFS_BLOCK_GROUP_RAID6; - if (num_devices < 4) - flags &= ~BTRFS_BLOCK_GROUP_RAID10; - - tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10); - flags &= ~tmp; - - if (tmp & BTRFS_BLOCK_GROUP_RAID6) - tmp = BTRFS_BLOCK_GROUP_RAID6; - else if (tmp & BTRFS_BLOCK_GROUP_RAID5) - tmp = BTRFS_BLOCK_GROUP_RAID5; - else if (tmp & BTRFS_BLOCK_GROUP_RAID10) - tmp = BTRFS_BLOCK_GROUP_RAID10; - else if (tmp & BTRFS_BLOCK_GROUP_RAID1) - tmp = BTRFS_BLOCK_GROUP_RAID1; - else if (tmp & BTRFS_BLOCK_GROUP_RAID0) - tmp = BTRFS_BLOCK_GROUP_RAID0; - - return extended_to_chunk(flags | tmp); + for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) { + if (num_devices >= btrfs_raid_array[raid_type].devs_min) + allowed |= btrfs_raid_group[raid_type]; + } + allowed &= flags; + + if (allowed & BTRFS_BLOCK_GROUP_RAID6) + allowed = BTRFS_BLOCK_GROUP_RAID6; + else if (allowed & BTRFS_BLOCK_GROUP_RAID5) + allowed = BTRFS_BLOCK_GROUP_RAID5; + else if (allowed & BTRFS_BLOCK_GROUP_RAID10) + allowed = BTRFS_BLOCK_GROUP_RAID10; + else if (allowed & BTRFS_BLOCK_GROUP_RAID1) + allowed = BTRFS_BLOCK_GROUP_RAID1; + else if (allowed & BTRFS_BLOCK_GROUP_RAID0) + allowed = BTRFS_BLOCK_GROUP_RAID0; + + flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK; + + return extended_to_chunk(flags | allowed); } static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags) -- cgit v1.1 From bb78915203c0c1b07096e50d12f6e8fe54666f4f Mon Sep 17 00:00:00 2001 From: Shan Hai Date: Mon, 21 Sep 2015 11:40:57 +0800 Subject: btrfs/file.c: remove an unused variable first_index The commit b37392ea86761 ("Btrfs: cleanup unnecessary parameter and variant of prepare_pages()") makes it redundant.
Reviewed-by: David Sterba Signed-off-by: Shan Hai Signed-off-by: David Sterba --- fs/btrfs/file.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index b823fac9..b6695c4 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1469,7 +1469,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, u64 release_bytes = 0; u64 lockstart; u64 lockend; - unsigned long first_index; size_t num_written = 0; int nrptrs; int ret = 0; @@ -1485,8 +1484,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, if (!pages) return -ENOMEM; - first_index = pos >> PAGE_CACHE_SHIFT; - while (iov_iter_count(i) > 0) { size_t offset = pos & (PAGE_CACHE_SIZE - 1); size_t write_bytes = min(iov_iter_count(i), -- cgit v1.1 From bdcd3c97d1cb4b8de8b57caab56eb0bb44d1f0f5 Mon Sep 17 00:00:00 2001 From: Alexandru Moise <00moses.alexander00@gmail.com> Date: Tue, 22 Sep 2015 20:02:25 +0000 Subject: btrfs: cleanup btrfs_balance profile validity checks Improve readability by generalizing the profile validity checks. Reviewed-by: David Sterba Signed-off-by: Alexandru Moise <00moses.alexander00@gmail.com> Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 7b53390..59538a78 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3542,6 +3542,15 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info) atomic_set(&fs_info->mutually_exclusive_operation_running, 0); } +/* Non-zero return value signifies invalidity */ +static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg, + u64 allowed) +{ + return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) && + (!alloc_profile_is_valid(bctl_arg->target, 1) || + (bctl_arg->target & ~allowed))); +} + /* * Should be called with both balance and volume mutexes held */ @@ -3599,27 +3608,21 @@ int btrfs_balance(struct btrfs_balance_control *bctl, if (num_devices > 3) allowed |= (BTRFS_BLOCK_GROUP_RAID10 | BTRFS_BLOCK_GROUP_RAID6); - if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (!alloc_profile_is_valid(bctl->data.target, 1) || - (bctl->data.target & ~allowed))) { + if (validate_convert_profile(&bctl->data, allowed)) { btrfs_err(fs_info, "unable to start balance with target " "data profile %llu", bctl->data.target); ret = -EINVAL; goto out; } - if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (!alloc_profile_is_valid(bctl->meta.target, 1) || - (bctl->meta.target & ~allowed))) { + if (validate_convert_profile(&bctl->meta, allowed)) { btrfs_err(fs_info, "unable to start balance with target metadata profile %llu", bctl->meta.target); ret = -EINVAL; goto out; } - if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (!alloc_profile_is_valid(bctl->sys.target, 1) || - (bctl->sys.target & ~allowed))) { + if (validate_convert_profile(&bctl->sys, allowed)) { btrfs_err(fs_info, "unable to start balance with target system profile %llu", bctl->sys.target); -- cgit v1.1 From 5aed1dd8b458aa63aa9b7c1c5dd78f54de143c6f Mon Sep 17 00:00:00 2001 From: Alexandru Moise <00moses.alexander00@gmail.com> Date: Tue, 22 Sep 2015 20:59:15 +0000 Subject: btrfs: change num_items type from u64 to unsigned int The value of num_items that start_transaction() ultimately always takes is a small one, so a 64 bit integer is overkill. Also change num_items for btrfs_start_transaction() and btrfs_start_transaction_lflush() as well. 
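A small standalone sketch (illustrative math, not the actual reservation formula) of why the narrower parameter is safe: the item count stays tiny, and the byte calculation promotes it to 64 bits before multiplying.

#include <stdint.h>

/* num_items counts a handful of btree items per transaction, so
 * unsigned int is plenty wide; promoting to u64 before the multiply
 * keeps the byte math 64-bit regardless of the parameter's width. */
static uint64_t calc_reserve_bytes(uint64_t nodesize, unsigned int num_items)
{
	return 2 * nodesize * (uint64_t)num_items;
}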
Reviewed-by: David Sterba Signed-off-by: Alexandru Moise <00moses.alexander00@gmail.com> Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 9 +++++---- fs/btrfs/transaction.h | 5 +++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index c2f84e2..0c04245 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -443,8 +443,8 @@ static inline bool need_reserve_reloc_root(struct btrfs_root *root) } static struct btrfs_trans_handle * -start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, - enum btrfs_reserve_flush_enum flush) +start_transaction(struct btrfs_root *root, unsigned int num_items, + unsigned int type, enum btrfs_reserve_flush_enum flush) { struct btrfs_trans_handle *h; struct btrfs_transaction *cur_trans; @@ -586,14 +586,15 @@ reserve_fail: } struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, - int num_items) + unsigned int num_items) { return start_transaction(root, num_items, TRANS_START, BTRFS_RESERVE_FLUSH_ALL); } struct btrfs_trans_handle *btrfs_start_transaction_lflush( - struct btrfs_root *root, int num_items) + struct btrfs_root *root, + unsigned int num_items) { return start_transaction(root, num_items, TRANS_START, BTRFS_RESERVE_FLUSH_LIMIT); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index a994bb0..db6bfd9 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -185,9 +185,10 @@ static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans) int btrfs_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, - int num_items); + unsigned int num_items); struct btrfs_trans_handle *btrfs_start_transaction_lflush( - struct btrfs_root *root, int num_items); + struct btrfs_root *root, + unsigned int num_items); struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root); -- cgit v1.1 From 6e4d6fa12ceb22f87d49e043e5a7636abdac970f Mon Sep 17 00:00:00 2001 From: Alexandru Moise <00moses.alexander00@gmail.com> Date: Tue, 22 Sep 2015 21:00:07 +0000 Subject: btrfs: declare rsv_count as unsigned int instead of int rsv_count ultimately gets passed to start_transaction() which now takes an unsigned int as its num_items parameter. The value of rsv_count should always be positive so declare it as being unsigned. Reviewed-by: David Sterba Signed-off-by: Alexandru Moise <00moses.alexander00@gmail.com> Signed-off-by: David Sterba --- fs/btrfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index b6695c4..7031e96 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2263,7 +2263,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) u64 drop_end; int ret = 0; int err = 0; - int rsv_count; + unsigned int rsv_count; bool same_page; bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES); u64 ino_size; -- cgit v1.1 From 8cd1e73111f95ca915f37ed534bbfbdf24fc1ca8 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Sun, 4 Oct 2015 17:05:32 +0800 Subject: btrfs: fix a comment typo Just fix a typo in the code comment. 
Reviewed-by: David Sterba Signed-off-by: Geliang Tang Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index abe3a66..7332995 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1215,7 +1215,7 @@ out: * @offset - the offset for the key we'll insert * * This function writes out a free space cache struct to disk for quick recovery - * on mount. This will return 0 if it was successfull in writing the cache out, + * on mount. This will return 0 if it was successful in writing the cache out, * or an errno if it was not. */ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, -- cgit v1.1 From 619ed39242d901339c09faa1b52dad47b0476f6f Mon Sep 17 00:00:00 2001 From: Byongho Lee Date: Thu, 8 Oct 2015 20:49:34 +0900 Subject: btrfs: cleanup iterating over prop_handlers array This patch eliminates the sentinel (last) item of the prop_handlers array, which was used to detect the end of the array, and instead uses the ARRAY_SIZE macro. Though this is a very tiny optimization, using the ARRAY_SIZE macro is good practice for iterating over an array. Reviewed-by: David Sterba Signed-off-by: Byongho Lee Signed-off-by: David Sterba --- fs/btrfs/props.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index dca137b..f9e6023 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -49,18 +49,16 @@ static struct prop_handler prop_handlers[] = { .extract = prop_compression_extract, .inheritable = 1 }, - { - .xattr_name = NULL - } }; void __init btrfs_props_init(void) { - struct prop_handler *p; + int i; hash_init(prop_handlers_ht); - for (p = &prop_handlers[0]; p->xattr_name; p++) { + for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) { + struct prop_handler *p = &prop_handlers[i]; u64 h = btrfs_name_hash(p->xattr_name, strlen(p->xattr_name)); hash_add(prop_handlers_ht, &p->node, h); @@ -301,15 +299,16 @@ static int inherit_props(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *parent) { - const struct prop_handler *h; struct btrfs_root *root = BTRFS_I(inode)->root; int ret; + int i; if (!test_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(parent)->runtime_flags)) return 0; - for (h = &prop_handlers[0]; h->xattr_name; h++) { + for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) { + const struct prop_handler *h = &prop_handlers[i]; const char *value; u64 num_bytes; -- cgit v1.1 From d91876496bcf2236efb75ef12378c964bbe8c970 Mon Sep 17 00:00:00 2001 From: Byongho Lee Date: Wed, 14 Oct 2015 14:05:24 +0900 Subject: btrfs: compress: put variables defined per compress type in struct to make cache friendly Each of the variables below is defined per compress type. - struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES] - spinlock_t comp_workspace_lock[BTRFS_COMPRESS_TYPES] - int comp_num_workspace[BTRFS_COMPRESS_TYPES] - atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES] - wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES] With these parallel arrays, the memory adjacent to one compress type's variable belongs to other compress types, not to the other variables used together with it. So this patch groups the variables in a per-type struct to make the accesses cache friendly.
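The layout change can be pictured with a standalone sketch (simplified int fields instead of the kernel types): with parallel arrays, the fields that one compress type uses together are scattered across five arrays; with an array of structs they sit side by side, typically on the same cache line.

#define NTYPES 2

/* Before: parallel arrays -- ws_count[0] is adjacent to ws_count[1],
 * i.e. to another compress type's data. */
static int idle_count[NTYPES];
static int ws_count[NTYPES];
static int alloc_count[NTYPES];

/* After: one struct per compress type -- all fields for type i are
 * contiguous, matching the access pattern of find/free_workspace(). */
static struct {
	int idle_count;
	int ws_count;
	int alloc_count;
} ws_state[NTYPES];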
Reviewed-by: David Sterba Signed-off-by: Byongho Lee Signed-off-by: David Sterba --- fs/btrfs/compression.c | 94 ++++++++++++++++++++++++++------------------------ 1 file changed, 48 insertions(+), 46 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 57ee8ca..b524e02 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -745,11 +745,13 @@ out: return ret; } -static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES]; -static spinlock_t comp_workspace_lock[BTRFS_COMPRESS_TYPES]; -static int comp_num_workspace[BTRFS_COMPRESS_TYPES]; -static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES]; -static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES]; +static struct { + struct list_head idle_ws; + spinlock_t ws_lock; + int num_ws; + atomic_t alloc_ws; + wait_queue_head_t ws_wait; +} btrfs_comp_ws[BTRFS_COMPRESS_TYPES]; static const struct btrfs_compress_op * const btrfs_compress_op[] = { &btrfs_zlib_compress, @@ -761,10 +763,10 @@ void __init btrfs_init_compress(void) int i; for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { - INIT_LIST_HEAD(&comp_idle_workspace[i]); - spin_lock_init(&comp_workspace_lock[i]); - atomic_set(&comp_alloc_workspace[i], 0); - init_waitqueue_head(&comp_workspace_wait[i]); + INIT_LIST_HEAD(&btrfs_comp_ws[i].idle_ws); + spin_lock_init(&btrfs_comp_ws[i].ws_lock); + atomic_set(&btrfs_comp_ws[i].alloc_ws, 0); + init_waitqueue_head(&btrfs_comp_ws[i].ws_wait); } } @@ -778,38 +780,38 @@ static struct list_head *find_workspace(int type) int cpus = num_online_cpus(); int idx = type - 1; - struct list_head *idle_workspace = &comp_idle_workspace[idx]; - spinlock_t *workspace_lock = &comp_workspace_lock[idx]; - atomic_t *alloc_workspace = &comp_alloc_workspace[idx]; - wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx]; - int *num_workspace = &comp_num_workspace[idx]; + struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws; + spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock; + atomic_t *alloc_ws = &btrfs_comp_ws[idx].alloc_ws; + wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait; + int *num_ws = &btrfs_comp_ws[idx].num_ws; again: - spin_lock(workspace_lock); - if (!list_empty(idle_workspace)) { - workspace = idle_workspace->next; + spin_lock(ws_lock); + if (!list_empty(idle_ws)) { + workspace = idle_ws->next; list_del(workspace); - (*num_workspace)--; - spin_unlock(workspace_lock); + (*num_ws)--; + spin_unlock(ws_lock); return workspace; } - if (atomic_read(alloc_workspace) > cpus) { + if (atomic_read(alloc_ws) > cpus) { DEFINE_WAIT(wait); - spin_unlock(workspace_lock); - prepare_to_wait(workspace_wait, &wait, TASK_UNINTERRUPTIBLE); - if (atomic_read(alloc_workspace) > cpus && !*num_workspace) + spin_unlock(ws_lock); + prepare_to_wait(ws_wait, &wait, TASK_UNINTERRUPTIBLE); + if (atomic_read(alloc_ws) > cpus && !*num_ws) schedule(); - finish_wait(workspace_wait, &wait); + finish_wait(ws_wait, &wait); goto again; } - atomic_inc(alloc_workspace); - spin_unlock(workspace_lock); + atomic_inc(alloc_ws); + spin_unlock(ws_lock); workspace = btrfs_compress_op[idx]->alloc_workspace(); if (IS_ERR(workspace)) { - atomic_dec(alloc_workspace); - wake_up(workspace_wait); + atomic_dec(alloc_ws); + wake_up(ws_wait); } return workspace; } @@ -821,27 +823,27 @@ again: static void free_workspace(int type, struct list_head *workspace) { int idx = type - 1; - struct list_head *idle_workspace = &comp_idle_workspace[idx]; - spinlock_t *workspace_lock = &comp_workspace_lock[idx]; - atomic_t *alloc_workspace = 
&comp_alloc_workspace[idx]; - wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx]; - int *num_workspace = &comp_num_workspace[idx]; - - spin_lock(workspace_lock); - if (*num_workspace < num_online_cpus()) { - list_add(workspace, idle_workspace); - (*num_workspace)++; - spin_unlock(workspace_lock); + struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws; + spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock; + atomic_t *alloc_ws = &btrfs_comp_ws[idx].alloc_ws; + wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait; + int *num_ws = &btrfs_comp_ws[idx].num_ws; + + spin_lock(ws_lock); + if (*num_ws < num_online_cpus()) { + list_add(workspace, idle_ws); + (*num_ws)++; + spin_unlock(ws_lock); goto wake; } - spin_unlock(workspace_lock); + spin_unlock(ws_lock); btrfs_compress_op[idx]->free_workspace(workspace); - atomic_dec(alloc_workspace); + atomic_dec(alloc_ws); wake: smp_mb(); - if (waitqueue_active(workspace_wait)) - wake_up(workspace_wait); + if (waitqueue_active(ws_wait)) + wake_up(ws_wait); } /* @@ -853,11 +855,11 @@ static void free_workspaces(void) int i; for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { - while (!list_empty(&comp_idle_workspace[i])) { - workspace = comp_idle_workspace[i].next; + while (!list_empty(&btrfs_comp_ws[i].idle_ws)) { + workspace = btrfs_comp_ws[i].idle_ws.next; list_del(workspace); btrfs_compress_op[i]->free_workspace(workspace); - atomic_dec(&comp_alloc_workspace[i]); + atomic_dec(&btrfs_comp_ws[i].alloc_ws); } } } -- cgit v1.1 From 0b8d8ce0297c4376d50318692e0aa53a3aaed90c Mon Sep 17 00:00:00 2001 From: Luis de Bethencourt Date: Tue, 20 Oct 2015 14:56:22 +0100 Subject: btrfs: check-integrity: Fix returned errno codes check-integrity is using -1 instead of the -ENOMEM defined macro to specify that a buffer allocation failed. Since the error number is propagated, the caller will get a -EPERM which is the wrong error condition. Also, the smatch tool complains with the following warnings: btrfsic_process_superblock() warn: returning -1 instead of -ENOMEM is sloppy btrfsic_read_block() warn: returning -1 instead of -ENOMEM is sloppy Reviewed-by: David Sterba Signed-off-by: Luis de Bethencourt Signed-off-by: David Sterba --- fs/btrfs/check-integrity.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 541fbfa..9cacd06 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -667,7 +667,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS); if (NULL == selected_super) { printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); - return -1; + return -ENOMEM; } list_for_each_entry(device, dev_head, dev_list) { @@ -1660,7 +1660,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, sizeof(*block_ctx->pagev)) * num_pages, GFP_NOFS); if (!block_ctx->mem_to_free) - return -1; + return -ENOMEM; block_ctx->datav = block_ctx->mem_to_free; block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages); for (i = 0; i < num_pages; i++) { -- cgit v1.1 From ddd664f4478a4aaf0ac67a4297e33f1992bc9be2 Mon Sep 17 00:00:00 2001 From: Luis de Bethencourt Date: Tue, 20 Oct 2015 14:56:23 +0100 Subject: btrfs: reada: Fix returned errno code reada is using -1 instead of the -ENOMEM defined macro to specify that a buffer allocation failed. Since the error number is propagated, the caller will get a -EPERM which is the wrong error condition. 
Also update the caller to return the exact value from reada_add_block(). Smatch tool warning: reada_add_block() warn: returning -1 instead of -ENOMEM is sloppy Reviewed-by: David Sterba Signed-off-by: Luis de Bethencourt Signed-off-by: David Sterba --- fs/btrfs/reada.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 4645cd1..619f929 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -569,7 +569,7 @@ static int reada_add_block(struct reada_control *rc, u64 logical, rec = kzalloc(sizeof(*rec), GFP_NOFS); if (!rec) { reada_extent_put(root->fs_info, re); - return -1; + return -ENOMEM; } rec->rc = rc; @@ -918,6 +918,7 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root, u64 start; u64 generation; int level; + int ret; struct extent_buffer *node; static struct btrfs_key max_key = { .objectid = (u64)-1, @@ -943,9 +944,10 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root, generation = btrfs_header_generation(node); free_extent_buffer(node); - if (reada_add_block(rc, start, &max_key, level, generation)) { + ret = reada_add_block(rc, start, &max_key, level, generation); + if (ret) { kfree(rc); - return ERR_PTR(-ENOMEM); + return ERR_PTR(ret); } reada_start_machine(root->fs_info); -- cgit v1.1 From ac46777213e00e26b9210060586f473368c54da2 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 12 Oct 2015 12:08:16 +0800 Subject: btrfs: extent_io: Introduce needed structure for recording set/clear bits Add a new structure, extent_changeset, to record how many bytes are changed in one set/clear_extent_bits() operation, with detailed changed ranges info. This provides the needed facilities for the later qgroup reserve framework. Signed-off-by: Qu Wenruo Signed-off-by: Chris Mason --- fs/btrfs/extent_io.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index c668f36..3107a6e 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -2,6 +2,7 @@ #define __EXTENTIO__ #include +#include "ulist.h" /* bits for the extent state */ #define EXTENT_DIRTY (1U << 0) @@ -161,6 +162,17 @@ struct extent_buffer { #endif }; +/* + * Structure to record how many bytes and which ranges are set/cleared + */ +struct extent_changeset { + /* How many bytes are set/cleared in this operation */ + u64 bytes_changed; + + /* Changed ranges */ + struct ulist *range_changed; +}; + static inline void extent_set_compress_type(unsigned long *bio_flags, int compress_type) { -- cgit v1.1 From d38ed27f0442c8cd520e093081127949d4bcf9bc Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 12 Oct 2015 14:53:37 +0800 Subject: btrfs: extent_io: Introduce new function set_record_extent_bits Introduce the new function set_record_extent_bits(), which will not only set the given bits, but also record how many bytes are changed, along with detailed range info. This is quite important for the later qgroup reserve framework. The number of bytes will be used to do the qgroup reserve, and the detailed range info will be used to clean up in the EDQUOT case.
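A minimal userspace analogue of what the changeset records (using a fixed-size array here where the kernel code uses a ulist): the total number of bytes whose bits actually changed, plus each changed [start, end] range kept for later cleanup.

struct changeset {
	unsigned long long bytes_changed;
	unsigned long long ranges[16][2]; /* [start, end] pairs, capped for the sketch */
	int nr;
};

/* Called once per extent state whose bits really changed. */
static void record_range(struct changeset *cs,
			 unsigned long long start, unsigned long long end)
{
	cs->bytes_changed += end - start + 1;
	if (cs->nr < 16) {
		cs->ranges[cs->nr][0] = start;
		cs->ranges[cs->nr][1] = end;
		cs->nr++;
	}
}

Keeping both the byte total and the ranges means a failed reservation can be undone range by range instead of re-walking the whole io tree.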
Signed-off-by: Qu Wenruo Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 71 +++++++++++++++++++++++++++++++++++++++------------- fs/btrfs/extent_io.h | 3 +++ 2 files changed, 56 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 6e6df34..9c066d6 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -131,6 +131,23 @@ struct extent_page_data { unsigned int sync_io:1; }; +static void add_extent_changeset(struct extent_state *state, unsigned bits, + struct extent_changeset *changeset, + int set) +{ + int ret; + + if (!changeset) + return; + if (set && (state->state & bits) == bits) + return; + changeset->bytes_changed += state->end - state->start + 1; + ret = ulist_add(changeset->range_changed, state->start, state->end, + GFP_ATOMIC); + /* ENOMEM */ + BUG_ON(ret < 0); +} + static noinline void flush_write_bio(void *data); static inline struct btrfs_fs_info * tree_fs_info(struct extent_io_tree *tree) @@ -410,7 +427,8 @@ static void clear_state_cb(struct extent_io_tree *tree, } static void set_state_bits(struct extent_io_tree *tree, - struct extent_state *state, unsigned *bits); + struct extent_state *state, unsigned *bits, + struct extent_changeset *changeset); /* * insert an extent_state struct into the tree. 'bits' are set on the @@ -426,7 +444,7 @@ static int insert_state(struct extent_io_tree *tree, struct extent_state *state, u64 start, u64 end, struct rb_node ***p, struct rb_node **parent, - unsigned *bits) + unsigned *bits, struct extent_changeset *changeset) { struct rb_node *node; @@ -436,7 +454,7 @@ static int insert_state(struct extent_io_tree *tree, state->start = start; state->end = end; - set_state_bits(tree, state, bits); + set_state_bits(tree, state, bits, changeset); node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent); if (node) { @@ -789,7 +807,7 @@ out: static void set_state_bits(struct extent_io_tree *tree, struct extent_state *state, - unsigned *bits) + unsigned *bits, struct extent_changeset *changeset) { unsigned bits_to_set = *bits & ~EXTENT_CTLBITS; @@ -798,6 +816,7 @@ static void set_state_bits(struct extent_io_tree *tree, u64 range = state->end - state->start + 1; tree->dirty_bytes += range; } + add_extent_changeset(state, bits_to_set, changeset, 1); state->state |= bits_to_set; } @@ -835,7 +854,7 @@ static int __must_check __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, unsigned exclusive_bits, u64 *failed_start, struct extent_state **cached_state, - gfp_t mask) + gfp_t mask, struct extent_changeset *changeset) { struct extent_state *state; struct extent_state *prealloc = NULL; @@ -873,7 +892,7 @@ again: prealloc = alloc_extent_state_atomic(prealloc); BUG_ON(!prealloc); err = insert_state(tree, prealloc, start, end, - &p, &parent, &bits); + &p, &parent, &bits, changeset); if (err) extent_io_tree_panic(tree, err); @@ -899,7 +918,7 @@ hit_next: goto out; } - set_state_bits(tree, state, &bits); + set_state_bits(tree, state, &bits, changeset); cache_state(state, cached_state); merge_state(tree, state); if (last_end == (u64)-1) @@ -945,7 +964,7 @@ hit_next: if (err) goto out; if (state->end <= end) { - set_state_bits(tree, state, &bits); + set_state_bits(tree, state, &bits, changeset); cache_state(state, cached_state); merge_state(tree, state); if (last_end == (u64)-1) @@ -980,7 +999,7 @@ hit_next: * the later extent. 
*/ err = insert_state(tree, prealloc, start, this_end, - NULL, NULL, &bits); + NULL, NULL, &bits, changeset); if (err) extent_io_tree_panic(tree, err); @@ -1008,7 +1027,7 @@ hit_next: if (err) extent_io_tree_panic(tree, err); - set_state_bits(tree, prealloc, &bits); + set_state_bits(tree, prealloc, &bits, changeset); cache_state(prealloc, cached_state); merge_state(tree, prealloc); prealloc = NULL; @@ -1038,7 +1057,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask) { return __set_extent_bit(tree, start, end, bits, 0, failed_start, - cached_state, mask); + cached_state, mask, NULL); } @@ -1111,7 +1130,7 @@ again: goto out; } err = insert_state(tree, prealloc, start, end, - &p, &parent, &bits); + &p, &parent, &bits, NULL); if (err) extent_io_tree_panic(tree, err); cache_state(prealloc, cached_state); @@ -1130,7 +1149,7 @@ hit_next: * Just lock what we found and keep going */ if (state->start == start && state->end <= end) { - set_state_bits(tree, state, &bits); + set_state_bits(tree, state, &bits, NULL); cache_state(state, cached_state); state = clear_state_bit(tree, state, &clear_bits, 0); if (last_end == (u64)-1) @@ -1171,7 +1190,7 @@ hit_next: if (err) goto out; if (state->end <= end) { - set_state_bits(tree, state, &bits); + set_state_bits(tree, state, &bits, NULL); cache_state(state, cached_state); state = clear_state_bit(tree, state, &clear_bits, 0); if (last_end == (u64)-1) @@ -1208,7 +1227,7 @@ hit_next: * the later extent. */ err = insert_state(tree, prealloc, start, this_end, - NULL, NULL, &bits); + NULL, NULL, &bits, NULL); if (err) extent_io_tree_panic(tree, err); cache_state(prealloc, cached_state); @@ -1233,7 +1252,7 @@ hit_next: if (err) extent_io_tree_panic(tree, err); - set_state_bits(tree, prealloc, &bits); + set_state_bits(tree, prealloc, &bits, NULL); cache_state(prealloc, cached_state); clear_state_bit(tree, prealloc, &clear_bits, 0); prealloc = NULL; @@ -1274,6 +1293,22 @@ int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, NULL, mask); } +int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, gfp_t mask, + struct extent_changeset *changeset) +{ + /* + * We don't support EXTENT_LOCKED yet, as current changeset will + * record any bits changed, so for EXTENT_LOCKED case, it will + * either fail with -EEXIST or changeset will record the whole + * range. 
+ */ + BUG_ON(bits & EXTENT_LOCKED); + + return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, mask, + changeset); +} + int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, gfp_t mask) { @@ -1343,7 +1378,7 @@ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, while (1) { err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, EXTENT_LOCKED, &failed_start, - cached_state, GFP_NOFS); + cached_state, GFP_NOFS, NULL); if (err == -EEXIST) { wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); start = failed_start; @@ -1365,7 +1400,7 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) u64 failed_start; err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, - &failed_start, NULL, GFP_NOFS); + &failed_start, NULL, GFP_NOFS, NULL); if (err == -EEXIST) { if (failed_start > start) clear_extent_bit(tree, start, failed_start - 1, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 3107a6e..4a7c9d9 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -227,6 +227,9 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached, gfp_t mask); int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, gfp_t mask); +int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, gfp_t mask, + struct extent_changeset *changeset); int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, u64 *failed_start, struct extent_state **cached_state, gfp_t mask); -- cgit v1.1 From fefdc55702a5f9f99778b6bdce4c4e1185ff943f Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 12 Oct 2015 15:35:38 +0800 Subject: btrfs: extent_io: Introduce new function clear_record_extent_bits() Introduce new function clear_record_extent_bits(), which will clear bits for a given range and record the details about which ranges are cleared and how many bytes in total it changes. This provides the basis for the later qgroup reserve code.
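A short sketch of the symmetry with the set side (assuming the changeset is initialized as in the earlier sketch, and that only [4096, 8191] currently carries EXTENT_DIRTY):

	/* records bytes_changed == 4096 and the single range [4096, 8191] */
	ret = clear_record_extent_bits(tree, 0, 16383, EXTENT_DIRTY,
				       GFP_NOFS, &changeset);

On clear, only states that actually carry the bit are recorded, just as on set only states that lack it are, so bytes_changed is always the exact delta the operation made.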
Signed-off-by: Qu Wenruo Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 50 +++++++++++++++++++++++++++++++++++++++----------- fs/btrfs/extent_io.h | 3 +++ 2 files changed, 42 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 9c066d6..33a01ea 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -141,6 +141,8 @@ static void add_extent_changeset(struct extent_state *state, unsigned bits, return; if (set && (state->state & bits) == bits) return; + if (!set && (state->state & bits) == 0) + return; changeset->bytes_changed += state->end - state->start + 1; ret = ulist_add(changeset->range_changed, state->start, state->end, GFP_ATOMIC); @@ -529,7 +531,8 @@ static struct extent_state *next_state(struct extent_state *state) */ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, struct extent_state *state, - unsigned *bits, int wake) + unsigned *bits, int wake, + struct extent_changeset *changeset) { struct extent_state *next; unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS; @@ -540,6 +543,7 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, tree->dirty_bytes -= range; } clear_state_cb(tree, state, bits); + add_extent_changeset(state, bits_to_clear, changeset, 0); state->state &= ~bits_to_clear; if (wake) wake_up(&state->wq); @@ -587,10 +591,10 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err) * * This takes the tree lock, and returns 0 on success and < 0 on error. */ -int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, int wake, int delete, - struct extent_state **cached_state, - gfp_t mask) +static int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, int wake, int delete, + struct extent_state **cached_state, + gfp_t mask, struct extent_changeset *changeset) { struct extent_state *state; struct extent_state *cached; @@ -689,7 +693,8 @@ hit_next: if (err) goto out; if (state->end <= end) { - state = clear_state_bit(tree, state, &bits, wake); + state = clear_state_bit(tree, state, &bits, wake, + changeset); goto next; } goto search_again; @@ -710,13 +715,13 @@ hit_next: if (wake) wake_up(&state->wq); - clear_state_bit(tree, prealloc, &bits, wake); + clear_state_bit(tree, prealloc, &bits, wake, changeset); prealloc = NULL; goto out; } - state = clear_state_bit(tree, state, &bits, wake); + state = clear_state_bit(tree, state, &bits, wake, changeset); next: if (last_end == (u64)-1) goto out; @@ -1151,7 +1156,7 @@ hit_next: if (state->start == start && state->end <= end) { set_state_bits(tree, state, &bits, NULL); cache_state(state, cached_state); - state = clear_state_bit(tree, state, &clear_bits, 0); + state = clear_state_bit(tree, state, &clear_bits, 0, NULL); if (last_end == (u64)-1) goto out; start = last_end + 1; @@ -1192,7 +1197,8 @@ hit_next: if (state->end <= end) { set_state_bits(tree, state, &bits, NULL); cache_state(state, cached_state); - state = clear_state_bit(tree, state, &clear_bits, 0); + state = clear_state_bit(tree, state, &clear_bits, 0, + NULL); if (last_end == (u64)-1) goto out; start = last_end + 1; @@ -1254,7 +1260,7 @@ hit_next: set_state_bits(tree, prealloc, &bits, NULL); cache_state(prealloc, cached_state); - clear_state_bit(tree, prealloc, &clear_bits, 0); + clear_state_bit(tree, prealloc, &clear_bits, 0, NULL); prealloc = NULL; goto out; } @@ -1309,6 +1315,14 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, changeset); } +int 
clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, int wake, int delete, + struct extent_state **cached, gfp_t mask) +{ + return __clear_extent_bit(tree, start, end, bits, wake, delete, + cached, mask, NULL); +} + int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, gfp_t mask) { @@ -1320,6 +1334,20 @@ int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask); } +int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, gfp_t mask, + struct extent_changeset *changeset) +{ + /* + * Don't support EXTENT_LOCKED case, same reason as + * set_record_extent_bits(). + */ + BUG_ON(bits & EXTENT_LOCKED); + + return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask, + changeset); +} + int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 4a7c9d9..51e1b71 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -222,6 +222,9 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state *cached_state); int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, gfp_t mask); +int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, gfp_t mask, + struct extent_changeset *changeset); int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, int wake, int delete, struct extent_state **cached, gfp_t mask); -- cgit v1.1 From 524725537023bb25a371722b1329446e5a2adcdb Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 12 Oct 2015 16:05:40 +0800 Subject: btrfs: qgroup: Introduce btrfs_qgroup_reserve_data function Introduce a new function, btrfs_qgroup_reserve_data(), which will use the io_tree to do accurate qgroup reservation and avoid leaking reserved space. Signed-off-by: Qu Wenruo Signed-off-by: Chris Mason --- fs/btrfs/extent_io.h | 1 + fs/btrfs/qgroup.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/qgroup.h | 2 ++ 3 files changed, 52 insertions(+) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 51e1b71..f4c1ae1 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -19,6 +19,7 @@ #define EXTENT_NEED_WAIT (1U << 13) #define EXTENT_DAMAGED (1U << 14) #define EXTENT_NORESERVE (1U << 15) +#define EXTENT_QGROUP_RESERVED (1U << 16) #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index d904ee1..5066274 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2486,3 +2486,52 @@ btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) btrfs_queue_work(fs_info->qgroup_rescan_workers, &fs_info->qgroup_rescan_work); } + +/* + * Reserve qgroup space for range [start, start + len). + * + * This function will either reserve space from related qgroups or do + * nothing if the range is already reserved. + * + * Return 0 for successful reserve + * Return <0 for error (including -EDQUOT) + * + * NOTE: this function may sleep for memory allocation.
+ */ +int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_changeset changeset; + struct ulist_node *unode; + struct ulist_iterator uiter; + int ret; + + if (!root->fs_info->quota_enabled || !is_fstree(root->objectid) || + len == 0) + return 0; + + changeset.bytes_changed = 0; + changeset.range_changed = ulist_alloc(GFP_NOFS); + + ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start, + start + len -1, EXTENT_QGROUP_RESERVED, GFP_NOFS, + &changeset); + if (ret < 0) + goto cleanup; + ret = btrfs_qgroup_reserve(root, changeset.bytes_changed); + if (ret < 0) + goto cleanup; + + ulist_free(changeset.range_changed); + return ret; + +cleanup: + /* cleanup already reserved ranges */ + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(changeset.range_changed, &uiter))) + clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val, + unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL, + GFP_NOFS); + ulist_free(changeset.range_changed); + return ret; +} diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 6387dcf..bd17cc2 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -81,4 +81,6 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, u64 rfer, u64 excl); #endif +/* New io_tree based accurate qgroup reserve API */ +int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len); #endif /* __BTRFS_QGROUP__ */ -- cgit v1.1 From f695fdcef83ac6972e0eda1d6588a56348f521a2 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 12 Oct 2015 16:28:06 +0800 Subject: btrfs: qgroup: Introduce functions to release/free qgroup reserve data space Introduce functions btrfs_qgroup_release/free_data() to release/free a reserved data range. Release means just remove the data range from the io_tree, but don't free the reserved space. This is for the normal buffered write case: when data is written to disk and its metadata is added into the tree, its reserved space should still be kept until commit_trans(). So in that case, we only release the dirty range, but keep the reserved space recorded elsewhere until commit_trans(). Free means not only remove the data range, but also free the reserved space. This is used for the cleanup and page invalidation cases. Signed-off-by: Qu Wenruo Signed-off-by: Chris Mason --- fs/btrfs/qgroup.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/qgroup.h | 2 ++ 2 files changed, 62 insertions(+) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 5066274..e05d1f6 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2535,3 +2535,63 @@ cleanup: ulist_free(changeset.range_changed); return ret; } + +static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len, + int free) +{ + struct extent_changeset changeset; + int ret; + + changeset.bytes_changed = 0; + changeset.range_changed = ulist_alloc(GFP_NOFS); + if (!changeset.range_changed) + return -ENOMEM; + + ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start, + start + len -1, EXTENT_QGROUP_RESERVED, GFP_NOFS, + &changeset); + if (ret < 0) + goto out; + + if (free) + btrfs_qgroup_free(BTRFS_I(inode)->root, + changeset.bytes_changed); +out: + ulist_free(changeset.range_changed); + return ret; +} + +/* + * Free a reserved space range from io_tree and related qgroups + * + * Should be called when a range of pages get invalidated before reaching disk. + * Or for error cleanup case. + * + * For data written to disk, use btrfs_qgroup_release_data().
+ * + * NOTE: This function may sleep for memory allocation. + */ +int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len) +{ + return __btrfs_qgroup_release_data(inode, start, len, 1); +} + +/* + * Release a reserved space range from io_tree only. + * + * Should be called when a range of pages get written to disk and corresponding + * FILE_EXTENT is inserted into corresponding root. + * + * Since new qgroup accounting framework will only update qgroup numbers at + * commit_transaction() time, its reserved space shouldn't be freed from + * related qgroups. + * + * But we should release the range from io_tree, to allow further write to be + * COWed. + * + * NOTE: This function may sleep for memory allocation. + */ +int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len) +{ + return __btrfs_qgroup_release_data(inode, start, len, 0); +} diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index bd17cc2..564eb21 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -83,4 +83,6 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, /* New io_tree based accurate qgroup reserve API */ int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len); +int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len); +int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len); #endif /* __BTRFS_QGROUP__ */ -- cgit v1.1 From f64d5ca868214f9beb60108c82fcb96940132bfb Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 8 Sep 2015 17:08:36 +0800 Subject: btrfs: delayed_ref: Add new function to record reserved space into delayed ref Add a new function, btrfs_add_delayed_qgroup_reserve(), to record how much space is reserved for the given extent. As btrfs only does qgroup accounting at run_delayed_refs() time, a newly allocated extent should keep its reserved space until then. So add the needed function, with related members, to do it.
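A sketch of the expected call order (caller context assumed; the btrfs_add_delayed_data_ref() parameters follow this kernel version's prototype in delayed-ref.h): the new helper looks up the delayed ref head for the given bytenr, so that head must already exist or -ENOENT is returned:

	/* queue the new extent first; this creates the ref head */
	ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, num_bytes,
					 parent, ref_root, owner, offset,
					 BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
	if (ret)
		return ret;
	/* then attach the qgroup-reserved bytes to that head */
	ret = btrfs_add_delayed_qgroup_reserve(fs_info, trans, ref_root,
					       bytenr, num_bytes);

A later patch in this series wires this up from insert_reserved_file_extent(), right after the file extent's delayed ref is queued.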
Signed-off-by: Qu Wenruo Signed-off-by: Chris Mason --- fs/btrfs/delayed-ref.c | 29 +++++++++++++++++++++++++++++ fs/btrfs/delayed-ref.h | 14 ++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index ac3e81d..bd9b63b 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -476,6 +476,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, INIT_LIST_HEAD(&head_ref->ref_list); head_ref->processing = 0; head_ref->total_ref_mod = count_mod; + head_ref->qgroup_reserved = 0; + head_ref->qgroup_ref_root = 0; /* Record qgroup extent info if provided */ if (qrecord) { @@ -746,6 +748,33 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, return 0; } +int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, + u64 ref_root, u64 bytenr, u64 num_bytes) +{ + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_head *ref_head; + int ret = 0; + + if (!fs_info->quota_enabled || !is_fstree(ref_root)) + return 0; + + delayed_refs = &trans->transaction->delayed_refs; + + spin_lock(&delayed_refs->lock); + ref_head = find_ref_head(&delayed_refs->href_root, bytenr, 0); + if (!ref_head) { + ret = -ENOENT; + goto out; + } + WARN_ON(ref_head->qgroup_reserved || ref_head->qgroup_ref_root); + ref_head->qgroup_ref_root = ref_root; + ref_head->qgroup_reserved = num_bytes; +out: + spin_unlock(&delayed_refs->lock); + return ret; +} + int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 13fb5e6..d4c41e2 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -113,6 +113,17 @@ struct btrfs_delayed_ref_head { int total_ref_mod; /* + * For qgroup reserved space freeing. + * + * ref_root and reserved will be recorded after + * BTRFS_ADD_DELAYED_EXTENT is called. + * And will be used to free reserved qgroup space at + * run_delayed_refs() time. + */ + u64 qgroup_ref_root; + u64 qgroup_reserved; + + /* * when a new extent is allocated, it is just reserved in memory * The actual extent isn't inserted into the extent allocation tree * until the delayed ref is processed. must_insert_reserved is @@ -242,6 +253,9 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, u64 owner, u64 offset, int action, struct btrfs_delayed_extent_op *extent_op, int no_quota); +int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, + u64 ref_root, u64 bytenr, u64 num_bytes); int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, -- cgit v1.1 From 297d750b9f8d7e6f2dbdf8abc5aa3b5c656affdc Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 8 Sep 2015 17:08:37 +0800 Subject: btrfs: delayed_ref: release and free qgroup reserved at proper timing Qgroup reserved space needs to be released from the inode dirty map and freed at different times: 1) Release when the metadata is written into the tree. After the corresponding metadata is written into the tree, any newer write will be COWed (the NOCOW case is not covered yet). So we must release its range from the inode dirty range map, or we will forget to reserve the needed range, causing the accounting to exceed the limit. 2) Free reserved bytes when the delayed ref is run. When delayed refs are run, qgroup accounting will follow soon and turn the reserved bytes into rfer/excl numbers.
As run_delayed_refs and qgroup accounting are all done at commit_transaction() time, we are safe to free the reserved space at run_delayed_refs() time. With these timings to release/free reserved space, we should be able to resolve the long-standing qgroup reserved space leak problem. Signed-off-by: Qu Wenruo Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 5 +++++ fs/btrfs/inode.c | 10 ++++++++++ fs/btrfs/qgroup.c | 5 ++--- fs/btrfs/qgroup.h | 18 +++++++++++++++++- 4 files changed, 34 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2df4bc7..6c7927c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2345,6 +2345,11 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, node->num_bytes); } } + + /* Also free its reserved qgroup space */ + btrfs_qgroup_free_delayed_ref(root->fs_info, + head->qgroup_ref_root, + head->qgroup_reserved); return ret; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5ce55f6..c21ed7b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2119,6 +2119,16 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, ret = btrfs_alloc_reserved_file_extent(trans, root, root->root_key.objectid, btrfs_ino(inode), file_pos, &ins); + if (ret < 0) + goto out; + /* + * Release the reserved range from inode dirty range map, and + * move it to delayed ref codes, as now accounting only happens at + * commit_transaction() time. + */ + btrfs_qgroup_release_data(inode, file_pos, ram_bytes); + ret = btrfs_add_delayed_qgroup_reserve(root->fs_info, trans, + root->objectid, disk_bytenr, ram_bytes); out: btrfs_free_path(path); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index e05d1f6..75d8e58 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2116,14 +2116,13 @@ out: return ret; } -void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes) +void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, + u64 ref_root, u64 num_bytes) { struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; - struct btrfs_fs_info *fs_info = root->fs_info; struct ulist_node *unode; struct ulist_iterator uiter; - u64 ref_root = root->root_key.objectid; int ret = 0; if (!is_fstree(ref_root)) diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 564eb21..80924ae 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -72,7 +72,23 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid, struct btrfs_qgroup_inherit *inherit); int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes); -void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes); +void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, + u64 ref_root, u64 num_bytes); +static inline void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes) +{ + return btrfs_qgroup_free_refroot(root->fs_info, root->objectid, + num_bytes); +} + +/* + * TODO: Add proper trace point for it, as btrfs_qgroup_free() is + * called by everywhere, can't provide good trace for delayed ref case.
+ */ +static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info, + u64 ref_root, u64 num_bytes) +{ + btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes); +} void assert_qgroups_uptodate(struct btrfs_trans_handle *trans); -- cgit v1.1 From 55eeaf0578038c40baaf3cf9408c23e42cd2a2b8 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 8 Sep 2015 17:08:38 +0800 Subject: btrfs: qgroup: Introduce new functions to reserve/free metadata Introduce new functions btrfs_qgroup_reserve/free_meta() to reserve/free metadata reserved space. Signed-off-by: Qu Wenruo Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 3 +++ fs/btrfs/disk-io.c | 1 + fs/btrfs/qgroup.c | 40 ++++++++++++++++++++++++++++++++++++++++ fs/btrfs/qgroup.h | 4 ++++ 4 files changed, 48 insertions(+) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 49bc792..9df8f01 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1943,6 +1943,9 @@ struct btrfs_root { int send_in_progress; struct btrfs_subvolume_writers *subv_writers; atomic_t will_be_snapshoted; + + /* For qgroup metadata space reserve */ + atomic_t qgroup_meta_rsv; }; struct btrfs_ioctl_defrag_range_args { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index bcbb596..0e0544e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1267,6 +1267,7 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize, atomic_set(&root->orphan_inodes, 0); atomic_set(&root->refs, 1); atomic_set(&root->will_be_snapshoted, 0); + atomic_set(&root->qgroup_meta_rsv, 0); root->log_transid = 0; root->log_transid_committed = -1; root->last_log_commit = 0; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 75d8e58..897a49d 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2594,3 +2594,43 @@ int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len) { return __btrfs_qgroup_release_data(inode, start, len, 0); } + +int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes) +{ + int ret; + + if (!root->fs_info->quota_enabled || !is_fstree(root->objectid) || + num_bytes == 0) + return 0; + + BUG_ON(num_bytes != round_down(num_bytes, root->nodesize)); + ret = btrfs_qgroup_reserve(root, num_bytes); + if (ret < 0) + return ret; + atomic_add(num_bytes, &root->qgroup_meta_rsv); + return ret; +} + +void btrfs_qgroup_free_meta_all(struct btrfs_root *root) +{ + int reserved; + + if (!root->fs_info->quota_enabled || !is_fstree(root->objectid)) + return; + + reserved = atomic_xchg(&root->qgroup_meta_rsv, 0); + if (reserved == 0) + return; + btrfs_qgroup_free(root, reserved); +} + +void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes) +{ + if (!root->fs_info->quota_enabled || !is_fstree(root->objectid)) + return; + + BUG_ON(num_bytes != round_down(num_bytes, root->nodesize)); + WARN_ON(atomic_read(&root->qgroup_meta_rsv) < num_bytes); + atomic_sub(num_bytes, &root->qgroup_meta_rsv); + btrfs_qgroup_free(root, num_bytes); +} diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 80924ae..7d1c87c 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -101,4 +101,8 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len); int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len); int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len); + +int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes); +void btrfs_qgroup_free_meta_all(struct btrfs_root *root); +void 
btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes); #endif /* __BTRFS_QGROUP__ */ -- cgit v1.1 From 7174109c6548c4db85a383b8ae9d01469cddd110 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 8 Sep 2015 17:22:41 +0800 Subject: btrfs: qgroup: Use new metadata reservation. As we have the new metadata reservation functions, use them to replace the old btrfs_qgroup_reserve() call for metadata. Signed-off-by: Qu Wenruo Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 14 ++++++-------- fs/btrfs/transaction.c | 34 +++++++--------------------------- fs/btrfs/transaction.h | 1 - 3 files changed, 13 insertions(+), 36 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6c7927c..130df70 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5337,7 +5337,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, if (root->fs_info->quota_enabled) { /* One for parent inode, two for dir entries */ num_bytes = 3 * root->nodesize; - ret = btrfs_qgroup_reserve(root, num_bytes); + ret = btrfs_qgroup_reserve_meta(root, num_bytes); if (ret) return ret; } else { @@ -5355,10 +5355,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, if (ret == -ENOSPC && use_global_rsv) ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes); - if (ret) { - if (*qgroup_reserved) - btrfs_qgroup_free(root, *qgroup_reserved); - } + if (ret && *qgroup_reserved) + btrfs_qgroup_free_meta(root, *qgroup_reserved); return ret; } @@ -5519,15 +5517,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) spin_unlock(&BTRFS_I(inode)->lock); if (root->fs_info->quota_enabled) { - ret = btrfs_qgroup_reserve(root, nr_extents * root->nodesize); + ret = btrfs_qgroup_reserve_meta(root, + nr_extents * root->nodesize); if (ret) goto out_fail; } ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); if (unlikely(ret)) { - if (root->fs_info->quota_enabled) - btrfs_qgroup_free(root, nr_extents * root->nodesize); + btrfs_qgroup_free_meta(root, nr_extents * root->nodesize); goto out_fail; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 9354e7a..e377d7b 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -480,13 +480,10 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, * the appropriate flushing if need be. 
*/ if (num_items > 0 && root != root->fs_info->chunk_root) { - if (root->fs_info->quota_enabled && - is_fstree(root->root_key.objectid)) { - qgroup_reserved = num_items * root->nodesize; - ret = btrfs_qgroup_reserve(root, qgroup_reserved); - if (ret) - return ERR_PTR(ret); - } + qgroup_reserved = num_items * root->nodesize; + ret = btrfs_qgroup_reserve_meta(root, qgroup_reserved); + if (ret) + return ERR_PTR(ret); num_bytes = btrfs_calc_trans_metadata_size(root, num_items); /* @@ -547,6 +544,7 @@ again: h->transaction = cur_trans; h->root = root; h->use_count = 1; + h->type = type; h->can_flush_pending_bgs = true; INIT_LIST_HEAD(&h->qgroup_ref_list); @@ -568,7 +566,6 @@ again: h->bytes_reserved = num_bytes; h->reloc_reserved = reloc_reserved; } - h->qgroup_reserved = qgroup_reserved; got_it: btrfs_record_root_in_trans(h, root); @@ -586,8 +583,7 @@ alloc_fail: btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv, num_bytes); reserve_fail: - if (qgroup_reserved) - btrfs_qgroup_free(root, qgroup_reserved); + btrfs_qgroup_free_meta(root, qgroup_reserved); return ERR_PTR(ret); } @@ -805,15 +801,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, must_run_delayed_refs = 2; } - if (trans->qgroup_reserved) { - /* - * the same root has to be passed here between start_transaction - * and end_transaction. Subvolume quota depends on this. - */ - btrfs_qgroup_free(trans->root, trans->qgroup_reserved); - trans->qgroup_reserved = 0; - } - btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; @@ -1231,6 +1218,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, spin_lock(&fs_info->fs_roots_radix_lock); if (err) break; + btrfs_qgroup_free_meta_all(root); } } spin_unlock(&fs_info->fs_roots_radix_lock); @@ -1835,10 +1823,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; - if (trans->qgroup_reserved) { - btrfs_qgroup_free(root, trans->qgroup_reserved); - trans->qgroup_reserved = 0; - } cur_trans = trans->transaction; @@ -2191,10 +2175,6 @@ cleanup_transaction: btrfs_trans_release_metadata(trans, root); btrfs_trans_release_chunk_metadata(trans); trans->block_rsv = NULL; - if (trans->qgroup_reserved) { - btrfs_qgroup_free(root, trans->qgroup_reserved); - trans->qgroup_reserved = 0; - } btrfs_warn(root->fs_info, "Skipping commit of aborted transaction."); if (current->journal_info == trans) current->journal_info = NULL; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index db6bfd9..54b7dea 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -107,7 +107,6 @@ struct btrfs_trans_handle { u64 transid; u64 bytes_reserved; u64 chunk_bytes_reserved; - u64 qgroup_reserved; unsigned long use_count; unsigned long blocks_reserved; unsigned long blocks_used; -- cgit v1.1 From 4ceff0792d36256a5f879cec51c56e44db90b8ec Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 8 Sep 2015 17:22:42 +0800 Subject: btrfs: extent-tree: Add new version of btrfs_check_data_free_space and btrfs_free_reserved_data_space. Add new functions __btrfs_check_data_free_space() and __btrfs_free_reserved_data_space() to work with new accurate qgroup reserved space framework. The new function will replace old btrfs_check_data_free_space() and btrfs_free_reserved_data_space() respectively, but until all the change is done, let's just use the new name. 
Also, export internal use function btrfs_alloc_data_chunk_ondemand(), as now qgroup reserve requires precious bytes, some operation can't get the accurate number in advance(like fallocate). But data space info check and data chunk allocate doesn't need to be that accurate, and can be called at the beginning. So export it for later operations. Signed-off-by: Qu Wenruo Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 3 ++ fs/btrfs/extent-tree.c | 85 ++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 79 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9df8f01..a55d199 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3453,7 +3453,10 @@ enum btrfs_reserve_flush_enum { }; int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes); +int __btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len); +int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes); void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); +void __btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len); void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root); void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 130df70..71c172e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3904,11 +3904,7 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) return ret; } -/* - * This will check the space that the inode allocates from to make sure we have - * enough space for bytes. - */ -int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes) +int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes) { struct btrfs_space_info *data_sinfo; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -4029,19 +4025,55 @@ commit_trans: data_sinfo->flags, bytes, 1); return -ENOSPC; } - ret = btrfs_qgroup_reserve(root, write_bytes); - if (ret) - goto out; data_sinfo->bytes_may_use += bytes; trace_btrfs_space_reservation(root->fs_info, "space_info", data_sinfo->flags, bytes, 1); -out: spin_unlock(&data_sinfo->lock); return ret; } /* + * This will check the space that the inode allocates from to make sure we have + * enough space for bytes. + */ +int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + ret = btrfs_alloc_data_chunk_ondemand(inode, bytes); + if (ret < 0) + return ret; + ret = btrfs_qgroup_reserve(root, write_bytes); + return ret; +} + +/* + * New check_data_free_space() with ability for precious data reservation + * Will replace old btrfs_check_data_free_space(), but for patch split, + * add a new function first and then replace it. + */ +int __btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + /* align the range */ + len = round_up(start + len, root->sectorsize) - + round_down(start, root->sectorsize); + start = round_down(start, root->sectorsize); + + ret = btrfs_alloc_data_chunk_ondemand(inode, len); + if (ret < 0) + return ret; + + /* Use new btrfs_qgroup_reserve_data to reserve precious data space */ + ret = btrfs_qgroup_reserve_data(inode, start, len); + return ret; +} + +/* * Called if we need to clear a data reservation for this inode. 
*/ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) @@ -4061,6 +4093,41 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) spin_unlock(&data_sinfo->lock); } +/* + * Called if we need to clear a data reservation for this inode + * Normally in a error case. + * + * This one will handle the per-indoe data rsv map for accurate reserved + * space framework. + */ +void __btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_space_info *data_sinfo; + + /* Make sure the range is aligned to sectorsize */ + len = round_up(start + len, root->sectorsize) - + round_down(start, root->sectorsize); + start = round_down(start, root->sectorsize); + + /* + * Free any reserved qgroup data space first + * As it will alloc memory, we can't do it with data sinfo + * spinlock hold. + */ + btrfs_qgroup_free_data(inode, start, len); + + data_sinfo = root->fs_info->data_sinfo; + spin_lock(&data_sinfo->lock); + if (WARN_ON(data_sinfo->bytes_may_use < len)) + data_sinfo->bytes_may_use = 0; + else + data_sinfo->bytes_may_use -= len; + trace_btrfs_space_reservation(root->fs_info, "space_info", + data_sinfo->flags, len, 0); + spin_unlock(&data_sinfo->lock); +} + static void force_metadata_allocation(struct btrfs_fs_info *info) { struct list_head *head = &info->space_info; -- cgit v1.1 From d9d8b2a51a404c2d45b9dc4c755f62cb3ddb7c79 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 8 Sep 2015 17:22:43 +0800 Subject: btrfs: extent-tree: Switch to new check_data_free_space and free_reserved_data_space Use new reserve/free for buffered write and inode cache. For buffered write case, as nodatacow write won't increase quota account, so unlike old behavior which does reserve before check nocow, now we check nocow first and then only reserve data if we can't do nocow write. Signed-off-by: Qu Wenruo Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 4 ++-- fs/btrfs/file.c | 34 +++++++++++++++++++++------------- fs/btrfs/relocation.c | 8 ++++---- 3 files changed, 27 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 71c172e..3c58fc3 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3356,7 +3356,7 @@ again: num_pages *= 16; num_pages *= PAGE_CACHE_SIZE; - ret = btrfs_check_data_free_space(inode, num_pages, num_pages); + ret = __btrfs_check_data_free_space(inode, 0, num_pages); if (ret) goto out_put; @@ -3365,7 +3365,7 @@ again: &alloc_hint); if (!ret) dcs = BTRFS_DC_SETUP; - btrfs_free_reserved_data_space(inode, num_pages); + __btrfs_free_reserved_data_space(inode, 0, num_pages); out_put: iput(inode); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 7031e96..61fb78f 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1507,12 +1507,17 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, } reserve_bytes = num_pages << PAGE_CACHE_SHIFT; - ret = btrfs_check_data_free_space(inode, reserve_bytes, write_bytes); - if (ret == -ENOSPC && - (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | - BTRFS_INODE_PREALLOC))) { + + if (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | + BTRFS_INODE_PREALLOC)) { ret = check_can_nocow(inode, pos, &write_bytes); + if (ret < 0) + break; if (ret > 0) { + /* + * For nodata cow case, no need to reserve + * data space. 
+ */ only_release_metadata = true; /* * our prealloc extent may be smaller than @@ -1521,20 +1526,19 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_CACHE_SIZE); reserve_bytes = num_pages << PAGE_CACHE_SHIFT; - ret = 0; - } else { - ret = -ENOSPC; + goto reserve_metadata; } } - - if (ret) + ret = __btrfs_check_data_free_space(inode, pos, write_bytes); + if (ret < 0) break; +reserve_metadata: ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes); if (ret) { if (!only_release_metadata) - btrfs_free_reserved_data_space(inode, - reserve_bytes); + __btrfs_free_reserved_data_space(inode, pos, + write_bytes); else btrfs_end_write_no_snapshoting(root); break; @@ -2566,8 +2570,11 @@ static long btrfs_fallocate(struct file *file, int mode, /* * Make sure we have enough space before we do the * allocation. + * XXX: The behavior must be changed to do accurate check first + * and then check data reserved space. */ - ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start, alloc_end - alloc_start); + ret = btrfs_check_data_free_space(inode, alloc_start, + alloc_end - alloc_start); if (ret) return ret; @@ -2700,7 +2707,8 @@ static long btrfs_fallocate(struct file *file, int mode, out: mutex_unlock(&inode->i_mutex); /* Let go of our reservation. */ - btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); + __btrfs_free_reserved_data_space(inode, alloc_start, + alloc_end - alloc_start); return ret; } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 58ede0a..490c9e2 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3034,8 +3034,8 @@ int prealloc_file_extent_cluster(struct inode *inode, BUG_ON(cluster->start != cluster->boundary[0]); mutex_lock(&inode->i_mutex); - ret = btrfs_check_data_free_space(inode, cluster->end + - 1 - cluster->start, 0); + ret = __btrfs_check_data_free_space(inode, cluster->start, + cluster->end + 1 - cluster->start); if (ret) goto out; @@ -3056,8 +3056,8 @@ int prealloc_file_extent_cluster(struct inode *inode, break; nr++; } - btrfs_free_reserved_data_space(inode, cluster->end + - 1 - cluster->start); + __btrfs_free_reserved_data_space(inode, cluster->start, + cluster->end + 1 - cluster->start); out: mutex_unlock(&inode->i_mutex); return ret; -- cgit v1.1 From 1ada3a62b56605befdfc34d6d1796601c0869103 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 8 Sep 2015 17:25:53 +0800 Subject: btrfs: extent-tree: Add new version of btrfs_delalloc_reserve/release_space Add new version of btrfs_delalloc_reserve_space() and btrfs_delalloc_release_space() functions, which supports accurate qgroup reserve. 
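A minimal sketch of the intended pairing around a single page (do_the_write() is a hypothetical stand-in; follow-up patches in this series convert btrfs_page_mkwrite() and the direct-IO path to this shape):

	u64 page_start = page_offset(page);
	int ret;

	ret = __btrfs_delalloc_reserve_space(inode, page_start,
					     PAGE_CACHE_SIZE);
	if (ret)
		return ret;

	ret = do_the_write(page);	/* hypothetical stand-in */
	if (ret)
		/* back out the very same [start, len) that was reserved */
		__btrfs_delalloc_release_space(inode, page_start,
					       PAGE_CACHE_SIZE);

Because reserve and release take the same (start, len) pair, the qgroup ranges recorded in the io_tree stay balanced even when only part of a larger reservation must be backed out.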
Signed-off-by: Qu Wenruo Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 2 ++ fs/btrfs/extent-tree.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a55d199..0f1ade1 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3473,7 +3473,9 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root, int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes); void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes); int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes); +int __btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len); void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes); +void __btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len); void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root, unsigned short type); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3c58fc3..8c8d8b3 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5715,6 +5715,44 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) } /** + * __btrfs_delalloc_reserve_space - reserve data and metadata space for + * delalloc + * @inode: inode we're writing to + * @start: start range we are writing to + * @len: how long the range we are writing to + * + * TODO: This function will finally replace old btrfs_delalloc_reserve_space() + * + * This will do the following things + * + * o reserve space in data space info for num bytes + * and reserve precious corresponding qgroup space + * (Done in check_data_free_space) + * + * o reserve space for metadata space, based on the number of outstanding + * extents and how much csums will be needed + * also reserve metadata space in a per root over-reserve method. + * o add to the inodes->delalloc_bytes + * o add it to the fs_info's delalloc inodes list. + * (Above 3 all done in delalloc_reserve_metadata) + * + * Return 0 for success + * Return <0 for error(-ENOSPC or -EQUOT) + */ +int __btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len) +{ + int ret; + + ret = __btrfs_check_data_free_space(inode, start, len); + if (ret < 0) + return ret; + ret = btrfs_delalloc_reserve_metadata(inode, len); + if (ret < 0) + __btrfs_free_reserved_data_space(inode, start, len); + return ret; +} + +/** * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc * @inode: inode we're writing to * @num_bytes: the number of bytes we want to allocate @@ -5747,6 +5785,27 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) } /** + * __btrfs_delalloc_release_space - release data and metadata space for delalloc + * @inode: inode we're releasing space for + * @start: start position of the space already reserved + * @len: the len of the space already reserved + * + * This must be matched with a call to btrfs_delalloc_reserve_space. This is + * called in the case that we don't need the metadata AND data reservations + * anymore. So if there is an error or we insert an inline extent. + * + * This function will release the metadata space that was not used and will + * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes + * list if there are no delalloc bytes left. + * Also it will handle the qgroup reserved space. 
+ */ +void __btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len) +{ + btrfs_delalloc_release_metadata(inode, len); + __btrfs_free_reserved_data_space(inode, start, len); +} + +/** * btrfs_delalloc_release_space - release data and metadata space for delalloc * @inode: inode we're releasing space for * @num_bytes: the number of bytes we want to free up -- cgit v1.1 From df480633b891cf03301d87e56024a8ec3251da5b Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 8 Sep 2015 17:25:54 +0800 Subject: btrfs: extent-tree: Switch to new delalloc space reserve and release Use new __btrfs_delalloc_reserve_space() and __btrfs_delalloc_release_space() to reserve and release space for delalloc. Signed-off-by: Qu Wenruo Signed-off-by: Chris Mason --- fs/btrfs/file.c | 5 +++-- fs/btrfs/inode-map.c | 6 +++--- fs/btrfs/inode.c | 38 +++++++++++++++++++++++--------------- fs/btrfs/ioctl.c | 14 +++++++++----- 4 files changed, 38 insertions(+), 25 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 61fb78f..29b3702 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1608,7 +1608,7 @@ again: btrfs_delalloc_release_metadata(inode, release_bytes); else - btrfs_delalloc_release_space(inode, + __btrfs_delalloc_release_space(inode, pos, release_bytes); } @@ -1661,7 +1661,8 @@ again: btrfs_end_write_no_snapshoting(root); btrfs_delalloc_release_metadata(inode, release_bytes); } else { - btrfs_delalloc_release_space(inode, release_bytes); + __btrfs_delalloc_release_space(inode, pos, + release_bytes); } } diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index d4a582a..78bc09c 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -488,17 +488,17 @@ again: /* Just to make sure we have enough space */ prealloc += 8 * PAGE_CACHE_SIZE; - ret = btrfs_delalloc_reserve_space(inode, prealloc); + ret = __btrfs_delalloc_reserve_space(inode, 0, prealloc); if (ret) goto out_put; ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, prealloc, prealloc, &alloc_hint); if (ret) { - btrfs_delalloc_release_space(inode, prealloc); + __btrfs_delalloc_release_space(inode, 0, prealloc); goto out_put; } - btrfs_free_reserved_data_space(inode, prealloc); + __btrfs_free_reserved_data_space(inode, 0, prealloc); ret = btrfs_write_out_ino_cache(root, trans, path, inode); out_put: diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c21ed7b..a3942a7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1769,7 +1769,8 @@ static void btrfs_clear_bit_hook(struct inode *inode, if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID && do_list && !(state->state & EXTENT_NORESERVE)) - btrfs_free_reserved_data_space(inode, len); + __btrfs_free_reserved_data_space(inode, state->start, + len); __percpu_counter_add(&root->fs_info->delalloc_bytes, -len, root->fs_info->delalloc_batch); @@ -1992,7 +1993,8 @@ again: goto again; } - ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); + ret = __btrfs_delalloc_reserve_space(inode, page_start, + PAGE_CACHE_SIZE); if (ret) { mapping_set_error(page->mapping, ret); end_extent_writepage(page, ret, page_start, page_end); @@ -4638,14 +4640,17 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len, if ((offset & (blocksize - 1)) == 0 && (!len || ((len & (blocksize - 1)) == 0))) goto out; - ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); + ret = __btrfs_delalloc_reserve_space(inode, + round_down(from, PAGE_CACHE_SIZE), PAGE_CACHE_SIZE); if (ret) goto out; again: page = find_or_create_page(mapping, index, mask); if 
(!page) { - btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); + __btrfs_delalloc_release_space(inode, + round_down(from, PAGE_CACHE_SIZE), + PAGE_CACHE_SIZE); ret = -ENOMEM; goto out; } @@ -4713,7 +4718,8 @@ again: out_unlock: if (ret) - btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); + __btrfs_delalloc_release_space(inode, page_start, + PAGE_CACHE_SIZE); unlock_page(page); page_cache_release(page); out: @@ -7644,7 +7650,7 @@ unlock: spin_unlock(&BTRFS_I(inode)->lock); } - btrfs_free_reserved_data_space(inode, len); + __btrfs_free_reserved_data_space(inode, start, len); WARN_ON(dio_data->reserve < len); dio_data->reserve -= len; current->journal_info = dio_data; @@ -8434,7 +8440,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, mutex_unlock(&inode->i_mutex); relock = true; } - ret = btrfs_delalloc_reserve_space(inode, count); + ret = __btrfs_delalloc_reserve_space(inode, offset, count); if (ret) goto out; dio_data.outstanding_extents = div64_u64(count + @@ -8463,11 +8469,11 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, current->journal_info = NULL; if (ret < 0 && ret != -EIOCBQUEUED) { if (dio_data.reserve) - btrfs_delalloc_release_space(inode, - dio_data.reserve); + __btrfs_delalloc_release_space(inode, offset, + dio_data.reserve); } else if (ret >= 0 && (size_t)ret < count) - btrfs_delalloc_release_space(inode, - count - (size_t)ret); + __btrfs_delalloc_release_space(inode, offset, + count - (size_t)ret); } out: if (wakeup) @@ -8675,7 +8681,11 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) u64 page_end; sb_start_pagefault(inode->i_sb); - ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); + page_start = page_offset(page); + page_end = page_start + PAGE_CACHE_SIZE - 1; + + ret = __btrfs_delalloc_reserve_space(inode, page_start, + PAGE_CACHE_SIZE); if (!ret) { ret = file_update_time(vma->vm_file); reserved = 1; @@ -8694,8 +8704,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) again: lock_page(page); size = i_size_read(inode); - page_start = page_offset(page); - page_end = page_start + PAGE_CACHE_SIZE - 1; if ((page->mapping != inode->i_mapping) || (page_start >= size)) { @@ -8772,7 +8780,7 @@ out_unlock: } unlock_page(page); out: - btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); + __btrfs_delalloc_release_space(inode, page_start, PAGE_CACHE_SIZE); out_noreserve: sb_end_pagefault(inode->i_sb); return ret; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 685df7e..70732e6 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1119,8 +1119,9 @@ static int cluster_pages_for_defrag(struct inode *inode, page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1); - ret = btrfs_delalloc_reserve_space(inode, - page_cnt << PAGE_CACHE_SHIFT); + ret = __btrfs_delalloc_reserve_space(inode, + start_index << PAGE_CACHE_SHIFT, + page_cnt << PAGE_CACHE_SHIFT); if (ret) return ret; i_done = 0; @@ -1209,8 +1210,9 @@ again: spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); - btrfs_delalloc_release_space(inode, - (page_cnt - i_done) << PAGE_CACHE_SHIFT); + __btrfs_delalloc_release_space(inode, + start_index << PAGE_CACHE_SHIFT, + (page_cnt - i_done) << PAGE_CACHE_SHIFT); } @@ -1235,7 +1237,9 @@ out: unlock_page(pages[i]); page_cache_release(pages[i]); } - btrfs_delalloc_release_space(inode, page_cnt << PAGE_CACHE_SHIFT); + __btrfs_delalloc_release_space(inode, + start_index << PAGE_CACHE_SHIFT, 
+ page_cnt << PAGE_CACHE_SHIFT); return ret; } -- cgit v1.1 From 7cf5b97650f2ecefbd5afa2d58b61b289b6e3750 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 8 Sep 2015 17:25:55 +0800 Subject: btrfs: qgroup: Cleanup old inaccurate facilities Clean up the old facilities that use the old btrfs_qgroup_reserve() call, replace them with the newer versions, and remove the "__" prefix from them. Also, make the btrfs_qgroup_reserve/free() functions private, as they are now only used inside the qgroup code. Now the whole btrfs qgroup is switched to use the new reserve facilities. Signed-off-by: Qu Wenruo Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 12 ++---- fs/btrfs/extent-tree.c | 109 +++++-------------------------------------------- fs/btrfs/file.c | 15 ++++--- fs/btrfs/inode-map.c | 6 +-- fs/btrfs/inode.c | 34 +++++++-------- fs/btrfs/ioctl.c | 6 +-- fs/btrfs/qgroup.c | 18 ++++---- fs/btrfs/qgroup.h | 8 ---- fs/btrfs/relocation.c | 8 ++-- 9 files changed, 60 insertions(+), 156 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0f1ade1..0d0f5d2 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3452,11 +3452,9 @@ enum btrfs_reserve_flush_enum { BTRFS_RESERVE_FLUSH_ALL, }; -int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes); -int __btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len); +int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len); int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes); -void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); -void __btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len); +void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len); void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root); void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); @@ -3472,10 +3470,8 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root, u64 qgroup_reserved); int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes); void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes); -int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes); -int __btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len); -void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes); -void __btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len); +int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len); +void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len); void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root, unsigned short type); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 8c8d8b3..3a23225 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3356,7 +3356,7 @@ again: num_pages *= 16; num_pages *= PAGE_CACHE_SIZE; - ret = __btrfs_check_data_free_space(inode, 0, num_pages); + ret = btrfs_check_data_free_space(inode, 0, num_pages); if (ret) goto out_put; @@ -3365,7 +3365,7 @@ again: &alloc_hint); if (!ret) dcs = BTRFS_DC_SETUP; - __btrfs_free_reserved_data_space(inode, 0, num_pages); + btrfs_free_reserved_data_space(inode, 0, num_pages); out_put: iput(inode); @@ -4034,27 +4034,11 @@ commit_trans: } /* - * This will check the space that the inode allocates from to make sure we have - * enough space for bytes.
- */ -int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - int ret; - - ret = btrfs_alloc_data_chunk_ondemand(inode, bytes); - if (ret < 0) - return ret; - ret = btrfs_qgroup_reserve(root, write_bytes); - return ret; -} - -/* * New check_data_free_space() with ability for precious data reservation * Will replace old btrfs_check_data_free_space(), but for patch split, * add a new function first and then replace it. */ -int __btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len) +int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len) { struct btrfs_root *root = BTRFS_I(inode)->root; int ret; @@ -4074,33 +4058,13 @@ int __btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len) } /* - * Called if we need to clear a data reservation for this inode. - */ -void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_space_info *data_sinfo; - - /* make sure bytes are sectorsize aligned */ - bytes = ALIGN(bytes, root->sectorsize); - - data_sinfo = root->fs_info->data_sinfo; - spin_lock(&data_sinfo->lock); - WARN_ON(data_sinfo->bytes_may_use < bytes); - data_sinfo->bytes_may_use -= bytes; - trace_btrfs_space_reservation(root->fs_info, "space_info", - data_sinfo->flags, bytes, 0); - spin_unlock(&data_sinfo->lock); -} - -/* * Called if we need to clear a data reservation for this inode * Normally in a error case. * * This one will handle the per-indoe data rsv map for accurate reserved * space framework. */ -void __btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len) +void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_space_info *data_sinfo; @@ -5715,7 +5679,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) } /** - * __btrfs_delalloc_reserve_space - reserve data and metadata space for + * btrfs_delalloc_reserve_space - reserve data and metadata space for * delalloc * @inode: inode we're writing to * @start: start range we are writing to @@ -5739,53 +5703,21 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) * Return 0 for success * Return <0 for error(-ENOSPC or -EQUOT) */ -int __btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len) +int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len) { int ret; - ret = __btrfs_check_data_free_space(inode, start, len); + ret = btrfs_check_data_free_space(inode, start, len); if (ret < 0) return ret; ret = btrfs_delalloc_reserve_metadata(inode, len); if (ret < 0) - __btrfs_free_reserved_data_space(inode, start, len); + btrfs_free_reserved_data_space(inode, start, len); return ret; } /** - * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc - * @inode: inode we're writing to - * @num_bytes: the number of bytes we want to allocate - * - * This will do the following things - * - * o reserve space in the data space info for num_bytes - * o reserve space in the metadata space info based on number of outstanding - * extents and how much csums will be needed - * o add to the inodes ->delalloc_bytes - * o add it to the fs_info's delalloc inodes list. - * - * This will return 0 for success and -ENOSPC if there is no space left. 
- */ -int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) -{ - int ret; - - ret = btrfs_check_data_free_space(inode, num_bytes, num_bytes); - if (ret) - return ret; - - ret = btrfs_delalloc_reserve_metadata(inode, num_bytes); - if (ret) { - btrfs_free_reserved_data_space(inode, num_bytes); - return ret; - } - - return 0; -} - -/** - * __btrfs_delalloc_release_space - release data and metadata space for delalloc + * btrfs_delalloc_release_space - release data and metadata space for delalloc * @inode: inode we're releasing space for * @start: start position of the space already reserved * @len: the len of the space already reserved @@ -5799,29 +5731,10 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) * list if there are no delalloc bytes left. * Also it will handle the qgroup reserved space. */ -void __btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len) +void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len) { btrfs_delalloc_release_metadata(inode, len); - __btrfs_free_reserved_data_space(inode, start, len); -} - -/** - * btrfs_delalloc_release_space - release data and metadata space for delalloc - * @inode: inode we're releasing space for - * @num_bytes: the number of bytes we want to free up - * - * This must be matched with a call to btrfs_delalloc_reserve_space. This is - * called in the case that we don't need the metadata AND data reservations - * anymore. So if there is an error or we insert an inline extent. - * - * This function will release the metadata space that was not used and will - * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes - * list if there are no delalloc bytes left. - */ -void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) -{ - btrfs_delalloc_release_metadata(inode, num_bytes); - btrfs_free_reserved_data_space(inode, num_bytes); + btrfs_free_reserved_data_space(inode, start, len); } static int update_block_group(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 29b3702..47785c8 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1529,7 +1529,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, goto reserve_metadata; } } - ret = __btrfs_check_data_free_space(inode, pos, write_bytes); + ret = btrfs_check_data_free_space(inode, pos, write_bytes); if (ret < 0) break; @@ -1537,8 +1537,8 @@ reserve_metadata: ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes); if (ret) { if (!only_release_metadata) - __btrfs_free_reserved_data_space(inode, pos, - write_bytes); + btrfs_free_reserved_data_space(inode, pos, + write_bytes); else btrfs_end_write_no_snapshoting(root); break; @@ -1608,7 +1608,7 @@ again: btrfs_delalloc_release_metadata(inode, release_bytes); else - __btrfs_delalloc_release_space(inode, pos, + btrfs_delalloc_release_space(inode, pos, release_bytes); } @@ -1661,8 +1661,7 @@ again: btrfs_end_write_no_snapshoting(root); btrfs_delalloc_release_metadata(inode, release_bytes); } else { - __btrfs_delalloc_release_space(inode, pos, - release_bytes); + btrfs_delalloc_release_space(inode, pos, release_bytes); } } @@ -2708,8 +2707,8 @@ static long btrfs_fallocate(struct file *file, int mode, out: mutex_unlock(&inode->i_mutex); /* Let go of our reservation. 
*/ - __btrfs_free_reserved_data_space(inode, alloc_start, - alloc_end - alloc_start); + btrfs_free_reserved_data_space(inode, alloc_start, + alloc_end - alloc_start); return ret; } diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 78bc09c..767a605 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -488,17 +488,17 @@ again: /* Just to make sure we have enough space */ prealloc += 8 * PAGE_CACHE_SIZE; - ret = __btrfs_delalloc_reserve_space(inode, 0, prealloc); + ret = btrfs_delalloc_reserve_space(inode, 0, prealloc); if (ret) goto out_put; ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, prealloc, prealloc, &alloc_hint); if (ret) { - __btrfs_delalloc_release_space(inode, 0, prealloc); + btrfs_delalloc_release_space(inode, 0, prealloc); goto out_put; } - __btrfs_free_reserved_data_space(inode, 0, prealloc); + btrfs_free_reserved_data_space(inode, 0, prealloc); ret = btrfs_write_out_ino_cache(root, trans, path, inode); out_put: diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a3942a7..a34e5a9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1769,8 +1769,8 @@ static void btrfs_clear_bit_hook(struct inode *inode, if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID && do_list && !(state->state & EXTENT_NORESERVE)) - __btrfs_free_reserved_data_space(inode, state->start, - len); + btrfs_free_reserved_data_space(inode, state->start, + len); __percpu_counter_add(&root->fs_info->delalloc_bytes, -len, root->fs_info->delalloc_batch); @@ -1993,8 +1993,8 @@ again: goto again; } - ret = __btrfs_delalloc_reserve_space(inode, page_start, - PAGE_CACHE_SIZE); + ret = btrfs_delalloc_reserve_space(inode, page_start, + PAGE_CACHE_SIZE); if (ret) { mapping_set_error(page->mapping, ret); end_extent_writepage(page, ret, page_start, page_end); @@ -4640,7 +4640,7 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len, if ((offset & (blocksize - 1)) == 0 && (!len || ((len & (blocksize - 1)) == 0))) goto out; - ret = __btrfs_delalloc_reserve_space(inode, + ret = btrfs_delalloc_reserve_space(inode, round_down(from, PAGE_CACHE_SIZE), PAGE_CACHE_SIZE); if (ret) goto out; @@ -4648,7 +4648,7 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len, again: page = find_or_create_page(mapping, index, mask); if (!page) { - __btrfs_delalloc_release_space(inode, + btrfs_delalloc_release_space(inode, round_down(from, PAGE_CACHE_SIZE), PAGE_CACHE_SIZE); ret = -ENOMEM; @@ -4718,8 +4718,8 @@ again: out_unlock: if (ret) - __btrfs_delalloc_release_space(inode, page_start, - PAGE_CACHE_SIZE); + btrfs_delalloc_release_space(inode, page_start, + PAGE_CACHE_SIZE); unlock_page(page); page_cache_release(page); out: @@ -7650,7 +7650,7 @@ unlock: spin_unlock(&BTRFS_I(inode)->lock); } - __btrfs_free_reserved_data_space(inode, start, len); + btrfs_free_reserved_data_space(inode, start, len); WARN_ON(dio_data->reserve < len); dio_data->reserve -= len; current->journal_info = dio_data; @@ -8440,7 +8440,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, mutex_unlock(&inode->i_mutex); relock = true; } - ret = __btrfs_delalloc_reserve_space(inode, offset, count); + ret = btrfs_delalloc_reserve_space(inode, offset, count); if (ret) goto out; dio_data.outstanding_extents = div64_u64(count + @@ -8469,11 +8469,11 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, current->journal_info = NULL; if (ret < 0 && ret != -EIOCBQUEUED) { if (dio_data.reserve) - __btrfs_delalloc_release_space(inode, offset, - 
dio_data.reserve); + btrfs_delalloc_release_space(inode, offset, + dio_data.reserve); } else if (ret >= 0 && (size_t)ret < count) - __btrfs_delalloc_release_space(inode, offset, - count - (size_t)ret); + btrfs_delalloc_release_space(inode, offset, + count - (size_t)ret); } out: if (wakeup) @@ -8684,8 +8684,8 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) page_start = page_offset(page); page_end = page_start + PAGE_CACHE_SIZE - 1; - ret = __btrfs_delalloc_reserve_space(inode, page_start, - PAGE_CACHE_SIZE); + ret = btrfs_delalloc_reserve_space(inode, page_start, + PAGE_CACHE_SIZE); if (!ret) { ret = file_update_time(vma->vm_file); reserved = 1; @@ -8780,7 +8780,7 @@ out_unlock: } unlock_page(page); out: - __btrfs_delalloc_release_space(inode, page_start, PAGE_CACHE_SIZE); + btrfs_delalloc_release_space(inode, page_start, PAGE_CACHE_SIZE); out_noreserve: sb_end_pagefault(inode->i_sb); return ret; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 70732e6..7ed033a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1119,7 +1119,7 @@ static int cluster_pages_for_defrag(struct inode *inode, page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1); - ret = __btrfs_delalloc_reserve_space(inode, + ret = btrfs_delalloc_reserve_space(inode, start_index << PAGE_CACHE_SHIFT, page_cnt << PAGE_CACHE_SHIFT); if (ret) @@ -1210,7 +1210,7 @@ again: spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); - __btrfs_delalloc_release_space(inode, + btrfs_delalloc_release_space(inode, start_index << PAGE_CACHE_SHIFT, (page_cnt - i_done) << PAGE_CACHE_SHIFT); } @@ -1237,7 +1237,7 @@ out: unlock_page(pages[i]); page_cache_release(pages[i]); } - __btrfs_delalloc_release_space(inode, + btrfs_delalloc_release_space(inode, start_index << PAGE_CACHE_SHIFT, page_cnt << PAGE_CACHE_SHIFT); return ret; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 897a49d..7d5339d 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2035,7 +2035,7 @@ out: return ret; } -int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) +static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes) { struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; @@ -2168,6 +2168,11 @@ out: spin_unlock(&fs_info->qgroup_lock); } +static inline void qgroup_free(struct btrfs_root *root, u64 num_bytes) +{ + return btrfs_qgroup_free_refroot(root->fs_info, root->objectid, + num_bytes); +} void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) { if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq) @@ -2517,7 +2522,7 @@ int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len) &changeset); if (ret < 0) goto cleanup; - ret = btrfs_qgroup_reserve(root, changeset.bytes_changed); + ret = qgroup_reserve(root, changeset.bytes_changed); if (ret < 0) goto cleanup; @@ -2553,8 +2558,7 @@ static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len, goto out; if (free) - btrfs_qgroup_free(BTRFS_I(inode)->root, - changeset.bytes_changed); + qgroup_free(BTRFS_I(inode)->root, changeset.bytes_changed); out: ulist_free(changeset.range_changed); return ret; @@ -2604,7 +2608,7 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes) return 0; BUG_ON(num_bytes != round_down(num_bytes, root->nodesize)); - ret = btrfs_qgroup_reserve(root, num_bytes); + ret = qgroup_reserve(root, num_bytes); if (ret < 0) return ret; atomic_add(num_bytes, &root->qgroup_meta_rsv); @@ -2621,7 
+2625,7 @@ void btrfs_qgroup_free_meta_all(struct btrfs_root *root) reserved = atomic_xchg(&root->qgroup_meta_rsv, 0); if (reserved == 0) return; - btrfs_qgroup_free(root, reserved); + qgroup_free(root, reserved); } void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes) @@ -2632,5 +2636,5 @@ void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes) BUG_ON(num_bytes != round_down(num_bytes, root->nodesize)); WARN_ON(atomic_read(&root->qgroup_meta_rsv) < num_bytes); atomic_sub(num_bytes, &root->qgroup_meta_rsv); - btrfs_qgroup_free(root, num_bytes); + qgroup_free(root, num_bytes); } diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 7d1c87c..adb03da 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -71,15 +71,8 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans, int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid, struct btrfs_qgroup_inherit *inherit); -int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes); void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, u64 ref_root, u64 num_bytes); -static inline void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes) -{ - return btrfs_qgroup_free_refroot(root->fs_info, root->objectid, - num_bytes); -} - /* * TODO: Add proper trace point for it, as btrfs_qgroup_free() is * called by everywhere, can't provide good trace for delayed ref case. @@ -89,7 +82,6 @@ static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info, { btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes); } - void assert_qgroups_uptodate(struct btrfs_trans_handle *trans); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 490c9e2..a7dc456 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3034,8 +3034,8 @@ int prealloc_file_extent_cluster(struct inode *inode, BUG_ON(cluster->start != cluster->boundary[0]); mutex_lock(&inode->i_mutex); - ret = __btrfs_check_data_free_space(inode, cluster->start, - cluster->end + 1 - cluster->start); + ret = btrfs_check_data_free_space(inode, cluster->start, + cluster->end + 1 - cluster->start); if (ret) goto out; @@ -3056,8 +3056,8 @@ int prealloc_file_extent_cluster(struct inode *inode, break; nr++; } - __btrfs_free_reserved_data_space(inode, cluster->start, - cluster->end + 1 - cluster->start); + btrfs_free_reserved_data_space(inode, cluster->start, + cluster->end + 1 - cluster->start); out: mutex_unlock(&inode->i_mutex); return ret; -- cgit v1.1 From 94ed938aba557aa798acf496f09afb289b619fcd Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 8 Sep 2015 17:25:56 +0800 Subject: btrfs: qgroup: Add handler for NOCOW and inline For the NOCOW and inline cases, there will be no delayed_ref created for them, so we should free their reserved data space at the proper time (finish_ordered_io() for NOCOW and cow_file_range_inline() for inline).
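These explicit frees are safe because the new reservation is tracked per byte range and can only be handed back once. As a minimal userspace model of that once-only behaviour (plain C, names invented for this sketch; the kernel tracks the real thing with the EXTENT_QGROUP_RESERVED bit in the io_tree):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SZ 4096
#define PAGES   16

static bool rsv[PAGES];		/* one "reserved" flag per page */

/* Mark [first, last] reserved; only pages not yet flagged are charged. */
static uint64_t reserve_data(int first, int last)
{
	uint64_t changed = 0;
	int i;

	for (i = first; i <= last; i++)
		if (!rsv[i]) {
			rsv[i] = true;
			changed += PAGE_SZ;
		}
	return changed;	/* the caller charges this against the qgroup */
}

/*
 * Clear [first, last]; only pages still flagged contribute, so calling
 * this twice over the same range frees nothing the second time, and a
 * "double free" is harmless.
 */
static uint64_t free_data(int first, int last)
{
	uint64_t changed = 0;
	int i;

	for (i = first; i <= last; i++)
		if (rsv[i]) {
			rsv[i] = false;
			changed += PAGE_SZ;
		}
	return changed;
}

int main(void)
{
	printf("reserved %llu\n", (unsigned long long)reserve_data(0, 3));
	/* inline/NOCOW: no delayed ref will account these pages, free now */
	printf("freed    %llu\n", (unsigned long long)free_data(0, 3));
	printf("freed    %llu\n", (unsigned long long)free_data(0, 3)); /* 0 */
	return 0;
}

Running it prints 16384, 16384, 0: the second free is a no-op, which is why freeing from both the normal and the error paths cannot underflow the accounting.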
Signed-off-by: Qu Wenruo Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 7 ++++++- fs/btrfs/inode.c | 15 +++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3a23225..aaaa774 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4052,7 +4052,12 @@ int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len) if (ret < 0) return ret; - /* Use new btrfs_qgroup_reserve_data to reserve precious data space */ + /* + * Use new btrfs_qgroup_reserve_data to reserve precious data space + * + * TODO: Find a good method to avoid reserve data space for NOCOW + * range, but don't impact performance on quota disable case. + */ ret = btrfs_qgroup_reserve_data(inode, start, len); return ret; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a34e5a9..ef5d369 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -310,6 +310,13 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, btrfs_delalloc_release_metadata(inode, end + 1 - start); btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); out: + /* + * Don't forget to free the reserved space, as for inlined extent + * it won't count as data extent, free them directly here. + * And at reserve time, it's always aligned to page size, so + * just free one page here. + */ + btrfs_qgroup_free_data(inode, 0, PAGE_CACHE_SIZE); btrfs_free_path(path); btrfs_end_transaction(trans, root); return ret; @@ -2838,6 +2845,14 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ + + /* + * For mwrite(mmap + memset to write) case, we still reserve + * space for NOCOW range. + * As NOCOW won't cause a new delayed ref, just free the space + */ + btrfs_qgroup_free_data(inode, ordered_extent->file_offset, + ordered_extent->len); btrfs_ordered_update_i_size(inode, 0, ordered_extent); if (nolock) trans = btrfs_join_transaction_nolock(root); -- cgit v1.1 From b9d0b38928e21560550bd3c1a278d6e004d3bde6 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 29 Sep 2015 10:35:16 +0800 Subject: btrfs: Add handler for invalidate page For btrfs_invalidatepage() and its variant evict_inode_truncate_pages(), there will be pages that never reach disk. In that case, their reserved space won't be released nor freed by finish_ordered_io() nor the delayed_ref handler. So we must free their qgroup reserved space, or we will leak reserved space again. So this patch will call btrfs_qgroup_free_data() for invalidatepage() and its variant evict_inode_truncate_pages(). And due to the nature of the new btrfs_qgroup_reserve/free_data(), reserved space will only be reserved or freed once, so for pages which are already flushed to disk, their reserved space will be released and freed by the delayed_ref handler. Double free won't be a problem. Signed-off-by: Qu Wenruo Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ef5d369..6670792 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5132,6 +5132,18 @@ static void evict_inode_truncate_pages(struct inode *inode) spin_unlock(&io_tree->lock); lock_extent_bits(io_tree, start, end, 0, &cached_state); + + /* + * If still has DELALLOC flag, the extent didn't reach disk, + * and its reserved space won't be freed by delayed_ref.
+ * So we need to free its reserved space here. + * (Refer to comment in btrfs_invalidatepage, case 2) + * + * Note, end is the bytenr of last byte, so we need + 1 here. + */ + if (state->state & EXTENT_DELALLOC) + btrfs_qgroup_free_data(inode, start, end - start + 1); + clear_extent_bit(io_tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | @@ -8646,6 +8658,18 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, } } + /* + * Qgroup reserved space handler + * Page here will be either + * 1) Already written to disk + * In this case, its reserved space is released from data rsv map + * and will be freed by delayed_ref handler finally. + * So even we call qgroup_free_data(), it won't decrease reserved + * space. + * 2) Not written to disk + * This means the reserved space should be freed here. + */ + btrfs_qgroup_free_data(inode, page_start, PAGE_CACHE_SIZE); if (!inode_evicting) { clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | EXTENT_DIRTY | -- cgit v1.1 From 81fb6f77a02678ddb0755c24f83c4ed5207da046 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 28 Sep 2015 16:57:53 +0800 Subject: btrfs: qgroup: Add new trace point for qgroup data reserve Now each qgroup reserve for data will has its ftrace event for better debugging. Signed-off-by: Qu Wenruo Signed-off-by: Chris Mason --- fs/btrfs/qgroup.c | 11 ++++- fs/btrfs/qgroup.h | 8 +++ include/trace/events/btrfs.h | 113 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 130 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 7d5339d..879343c 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2516,10 +2516,12 @@ int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len) changeset.bytes_changed = 0; changeset.range_changed = ulist_alloc(GFP_NOFS); - ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len -1, EXTENT_QGROUP_RESERVED, GFP_NOFS, &changeset); + trace_btrfs_qgroup_reserve_data(inode, start, len, + changeset.bytes_changed, + QGROUP_RESERVE); if (ret < 0) goto cleanup; ret = qgroup_reserve(root, changeset.bytes_changed); @@ -2544,6 +2546,7 @@ static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len, int free) { struct extent_changeset changeset; + int trace_op = QGROUP_RELEASE; int ret; changeset.bytes_changed = 0; @@ -2557,8 +2560,12 @@ static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len, if (ret < 0) goto out; - if (free) + if (free) { qgroup_free(BTRFS_I(inode)->root, changeset.bytes_changed); + trace_op = QGROUP_FREE; + } + trace_btrfs_qgroup_release_data(inode, start, len, + changeset.bytes_changed, trace_op); out: ulist_free(changeset.range_changed); return ret; diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index adb03da..686b60f 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -33,6 +33,13 @@ struct btrfs_qgroup_extent_record { struct ulist *old_roots; }; +/* + * For qgroup event trace points only + */ +#define QGROUP_RESERVE (1<<0) +#define QGROUP_RELEASE (1<<1) +#define QGROUP_FREE (1<<2) + int btrfs_quota_enable(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_quota_disable(struct btrfs_trans_handle *trans, @@ -81,6 +88,7 @@ static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info, u64 ref_root, u64 num_bytes) { btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes); + trace_btrfs_qgroup_free_delayed_ref(ref_root, num_bytes); } void assert_qgroups_uptodate(struct 
btrfs_trans_handle *trans); diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 0b73af9..b4473da 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -1117,6 +1117,119 @@ DEFINE_EVENT(btrfs__workqueue_done, btrfs_workqueue_destroy, TP_ARGS(wq) ); +DECLARE_EVENT_CLASS(btrfs__qgroup_data_map, + + TP_PROTO(struct inode *inode, u64 free_reserved), + + TP_ARGS(inode, free_reserved), + + TP_STRUCT__entry( + __field( u64, rootid ) + __field( unsigned long, ino ) + __field( u64, free_reserved ) + ), + + TP_fast_assign( + __entry->rootid = BTRFS_I(inode)->root->objectid; + __entry->ino = inode->i_ino; + __entry->free_reserved = free_reserved; + ), + + TP_printk("rootid=%llu, ino=%lu, free_reserved=%llu", + __entry->rootid, __entry->ino, __entry->free_reserved) +); + +DEFINE_EVENT(btrfs__qgroup_data_map, btrfs_qgroup_init_data_rsv_map, + + TP_PROTO(struct inode *inode, u64 free_reserved), + + TP_ARGS(inode, free_reserved) +); + +DEFINE_EVENT(btrfs__qgroup_data_map, btrfs_qgroup_free_data_rsv_map, + + TP_PROTO(struct inode *inode, u64 free_reserved), + + TP_ARGS(inode, free_reserved) +); + +#define BTRFS_QGROUP_OPERATIONS \ + { QGROUP_RESERVE, "reserve" }, \ + { QGROUP_RELEASE, "release" }, \ + { QGROUP_FREE, "free" } + +DECLARE_EVENT_CLASS(btrfs__qgroup_rsv_data, + + TP_PROTO(struct inode *inode, u64 start, u64 len, u64 reserved, int op), + + TP_ARGS(inode, start, len, reserved, op), + + TP_STRUCT__entry( + __field( u64, rootid ) + __field( unsigned long, ino ) + __field( u64, start ) + __field( u64, len ) + __field( u64, reserved ) + __field( int, op ) + ), + + TP_fast_assign( + __entry->rootid = BTRFS_I(inode)->root->objectid; + __entry->ino = inode->i_ino; + __entry->start = start; + __entry->len = len; + __entry->reserved = reserved; + __entry->op = op; + ), + + TP_printk("root=%llu, ino=%lu, start=%llu, len=%llu, reserved=%llu, op=%s", + __entry->rootid, __entry->ino, __entry->start, __entry->len, + __entry->reserved, + __print_flags((unsigned long)__entry->op, "", + BTRFS_QGROUP_OPERATIONS) + ) +); + +DEFINE_EVENT(btrfs__qgroup_rsv_data, btrfs_qgroup_reserve_data, + + TP_PROTO(struct inode *inode, u64 start, u64 len, u64 reserved, int op), + + TP_ARGS(inode, start, len, reserved, op) +); + +DEFINE_EVENT(btrfs__qgroup_rsv_data, btrfs_qgroup_release_data, + + TP_PROTO(struct inode *inode, u64 start, u64 len, u64 reserved, int op), + + TP_ARGS(inode, start, len, reserved, op) +); + +DECLARE_EVENT_CLASS(btrfs__qgroup_delayed_ref, + + TP_PROTO(u64 ref_root, u64 reserved), + + TP_ARGS(ref_root, reserved), + + TP_STRUCT__entry( + __field( u64, ref_root ) + __field( u64, reserved ) + ), + + TP_fast_assign( + __entry->ref_root = ref_root; + __entry->reserved = reserved; + ), + + TP_printk("root=%llu, reserved=%llu, op=free", + __entry->ref_root, __entry->reserved) +); + +DEFINE_EVENT(btrfs__qgroup_delayed_ref, btrfs_qgroup_free_delayed_ref, + + TP_PROTO(u64 ref_root, u64 reserved), + + TP_ARGS(ref_root, reserved) +); #endif /* _TRACE_BTRFS_H */ /* This part must be outside protection */ -- cgit v1.1 From 14524a846eb52c18438e9bd5eb8cf1431fd57b44 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 8 Sep 2015 17:22:44 +0800 Subject: btrfs: fallocate: Add support to accurate qgroup reserve Now fallocate will do accurate qgroup reserve space check, unlike old method, which will always reserve the whole length of the range. 
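To see what the accurate check buys, consider a toy userspace model (invented names; sorted, non-overlapping extents assumed; the kernel walks extent maps via btrfs_get_extent() instead): only the holes inside the requested range need a reservation.

#include <stdint.h>
#include <stdio.h>

struct extent { uint64_t start, len; };	/* already allocated, sorted */

/* Sum the bytes of [off, off+len) that fall into holes, i.e. what an
 * accurate fallocate actually has to reserve. */
static uint64_t bytes_to_reserve(const struct extent *e, int n,
				 uint64_t off, uint64_t len)
{
	uint64_t cur = off, end = off + len, need = 0;
	int i;

	for (i = 0; i < n && cur < end; i++) {
		if (e[i].start + e[i].len <= cur)
			continue;		/* extent ends before cursor */
		if (e[i].start > cur)		/* hole up to this extent */
			need += (e[i].start < end ? e[i].start : end) - cur;
		cur = e[i].start + e[i].len;	/* skip the allocated part */
	}
	if (cur < end)
		need += end - cur;		/* trailing hole */
	return need;
}

int main(void)
{
	struct extent e[] = { { 4096, 4096 }, { 16384, 8192 } };

	/* request [0, 32768): only the three holes count, 20480 bytes,
	 * where the old method would have reserved all 32768 */
	printf("%llu\n", (unsigned long long)bytes_to_reserve(e, 2, 0, 32768));
	return 0;
}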
With this patch, fallocate will: 1) Iterate the desired range and mark in data rsv map Only range which is going to be allocated will be recorded in data rsv map and reserve the space. For already allocated range (normal/prealloc extent) they will be skipped. Also, record the marked range into a new list for later use. 2) If 1) succeeded, do real file extent allocate. And at file extent allocation time, corresponding range will be removed from the range in data rsv map. Signed-off-by: Qu Wenruo Signed-off-by: Chris Mason --- fs/btrfs/file.c | 161 ++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 117 insertions(+), 44 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 47785c8..1243205 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2542,17 +2542,61 @@ out_only_mutex: return err; } +/* Helper structure to record which range is already reserved */ +struct falloc_range { + struct list_head list; + u64 start; + u64 len; +}; + +/* + * Helper function to add falloc range + * + * Caller should have locked the larger range of extent containing + * [start, len) + */ +static int add_falloc_range(struct list_head *head, u64 start, u64 len) +{ + struct falloc_range *prev = NULL; + struct falloc_range *range = NULL; + + if (list_empty(head)) + goto insert; + + /* + * As fallocate iterate by bytenr order, we only need to check + * the last range. + */ + prev = list_entry(head->prev, struct falloc_range, list); + if (prev->start + prev->len == start) { + prev->len += len; + return 0; + } +insert: + range = kmalloc(sizeof(*range), GFP_NOFS); + if (!range) + return -ENOMEM; + range->start = start; + range->len = len; + list_add_tail(&range->list, head); + return 0; +} + static long btrfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); struct extent_state *cached_state = NULL; + struct falloc_range *range; + struct falloc_range *tmp; + struct list_head reserve_list; u64 cur_offset; u64 last_byte; u64 alloc_start; u64 alloc_end; u64 alloc_hint = 0; u64 locked_end; + u64 actual_end = 0; struct extent_map *em; int blocksize = BTRFS_I(inode)->root->sectorsize; int ret; @@ -2568,14 +2612,12 @@ static long btrfs_fallocate(struct file *file, int mode, return btrfs_punch_hole(inode, offset, len); /* - * Make sure we have enough space before we do the - * allocation. - * XXX: The behavior must be changed to do accurate check first - * and then check data reserved space. + * Only trigger disk allocation, don't trigger qgroup reserve + * + * For qgroup space, it will be checked later. */ - ret = btrfs_check_data_free_space(inode, alloc_start, - alloc_end - alloc_start); - if (ret) + ret = btrfs_alloc_data_chunk_ondemand(inode, alloc_end - alloc_start); + if (ret < 0) return ret; mutex_lock(&inode->i_mutex); @@ -2583,6 +2625,13 @@ static long btrfs_fallocate(struct file *file, int mode, if (ret) goto out; + /* + * TODO: Move these two operations after we have checked + * accurate reserved space, or fallocate can still fail but + * with page truncated or size expanded. + * + * But that's a minor problem and won't do much harm BTW. 
+ */ if (alloc_start > inode->i_size) { ret = btrfs_cont_expand(inode, i_size_read(inode), alloc_start); @@ -2641,10 +2690,10 @@ static long btrfs_fallocate(struct file *file, int mode, } } + /* First, check if we exceed the qgroup limit */ + INIT_LIST_HEAD(&reserve_list); cur_offset = alloc_start; while (1) { - u64 actual_end; - em = btrfs_get_extent(inode, NULL, 0, cur_offset, alloc_end - cur_offset, 0); if (IS_ERR_OR_NULL(em)) { @@ -2657,54 +2706,78 @@ static long btrfs_fallocate(struct file *file, int mode, last_byte = min(extent_map_end(em), alloc_end); actual_end = min_t(u64, extent_map_end(em), offset + len); last_byte = ALIGN(last_byte, blocksize); - if (em->block_start == EXTENT_MAP_HOLE || (cur_offset >= inode->i_size && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { - ret = btrfs_prealloc_file_range(inode, mode, cur_offset, - last_byte - cur_offset, - 1 << inode->i_blkbits, - offset + len, - &alloc_hint); - } else if (actual_end > inode->i_size && - !(mode & FALLOC_FL_KEEP_SIZE)) { - struct btrfs_trans_handle *trans; - struct btrfs_root *root = BTRFS_I(inode)->root; - - /* - * We didn't need to allocate any more space, but we - * still extended the size of the file so we need to - * update i_size and the inode item. - */ - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - } else { - inode->i_ctime = CURRENT_TIME; - i_size_write(inode, actual_end); - btrfs_ordered_update_i_size(inode, actual_end, - NULL); - ret = btrfs_update_inode(trans, root, inode); - if (ret) - btrfs_end_transaction(trans, root); - else - ret = btrfs_end_transaction(trans, - root); + ret = add_falloc_range(&reserve_list, cur_offset, + last_byte - cur_offset); + if (ret < 0) { + free_extent_map(em); + break; } + ret = btrfs_qgroup_reserve_data(inode, cur_offset, + last_byte - cur_offset); + if (ret < 0) + break; } free_extent_map(em); - if (ret < 0) - break; - cur_offset = last_byte; - if (cur_offset >= alloc_end) { - ret = 0; + if (cur_offset >= alloc_end) break; + } + + /* + * If ret is still 0, means we're OK to fallocate. + * Or just cleanup the list and exit. + */ + list_for_each_entry_safe(range, tmp, &reserve_list, list) { + if (!ret) + ret = btrfs_prealloc_file_range(inode, mode, + range->start, + range->len, 1 << inode->i_blkbits, + offset + len, &alloc_hint); + list_del(&range->list); + kfree(range); + } + if (ret < 0) + goto out_unlock; + + if (actual_end > inode->i_size && + !(mode & FALLOC_FL_KEEP_SIZE)) { + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + + /* + * We didn't need to allocate any more space, but we + * still extended the size of the file so we need to + * update i_size and the inode item. + */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + } else { + inode->i_ctime = CURRENT_TIME; + i_size_write(inode, actual_end); + btrfs_ordered_update_i_size(inode, actual_end, NULL); + ret = btrfs_update_inode(trans, root, inode); + if (ret) + btrfs_end_transaction(trans, root); + else + ret = btrfs_end_transaction(trans, root); } } +out_unlock: unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, &cached_state, GFP_NOFS); out: + /* + * As we waited the extent range, the data_rsv_map must be empty + * in the range, as written data range will be released from it. + * And for prealloacted extent, it will also be released when + * its metadata is written. + * So this is completely used as cleanup. 
+ */ + btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start); mutex_unlock(&inode->i_mutex); /* Let go of our reservation. */ btrfs_free_reserved_data_space(inode, alloc_start, -- cgit v1.1 From 51773bec7ea352f3b9afa11ecfc72324c7977335 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 8 Oct 2015 18:19:37 +0800 Subject: btrfs: qgroup: Avoid calling btrfs_free_reserved_data_space in clear_bit_hook In clear_bit_hook, qgroup reserved data is already handled quite well, either released by finish_ordered_io or invalidatepage. So calling btrfs_qgroup_free_data() here is completely meaningless, and since btrfs_qgroup_free_data() will lock the io_tree, it can't be called with the io_tree lock held. This patch will add a new function btrfs_free_reserved_data_space_noquota() for clear_bit_hook() to silence the lockdep warning. Signed-off-by: Qu Wenruo Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 2 ++ fs/btrfs/extent-tree.c | 28 ++++++++++++++++------------ fs/btrfs/inode.c | 4 ++-- 3 files changed, 22 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0d0f5d2..2135b82 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3455,6 +3455,8 @@ enum btrfs_reserve_flush_enum { int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len); int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes); void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len); +void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, + u64 len); void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root); void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index aaaa774..4660960 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4066,10 +4066,12 @@ int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len) * Called if we need to clear a data reservation for this inode * Normally in a error case. * - * This one will handle the per-indoe data rsv map for accurate reserved - * space framework. + * This one will *NOT* use accurate qgroup reserved space API, just for case + * which we can't sleep and is sure it won't affect qgroup reserved space. + * Like clear_bit_hook(). */ -void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len) +void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, + u64 len) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_space_info *data_sinfo; @@ -4079,13 +4081,6 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len) round_down(start, root->sectorsize); start = round_down(start, root->sectorsize); - /* - * Free any reserved qgroup data space first - * As it will alloc memory, we can't do it with data sinfo - * spinlock hold. - */ - btrfs_qgroup_free_data(inode, start, len); - data_sinfo = root->fs_info->data_sinfo; spin_lock(&data_sinfo->lock); if (WARN_ON(data_sinfo->bytes_may_use < len)) @@ -4097,6 +4092,19 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len) spin_unlock(&data_sinfo->lock); } +/* + * Called if we need to clear a data reservation for this inode + * Normally in a error case. + * + * This one will handle the per-indoe data rsv map for accurate reserved + * space framework.
+ */ +void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len) +{ + btrfs_free_reserved_data_space_noquota(inode, start, len); + btrfs_qgroup_free_data(inode, start, len); +} + static void force_metadata_allocation(struct btrfs_fs_info *info) { struct list_head *head = &info->space_info; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6670792..3229c13 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1776,8 +1776,8 @@ static void btrfs_clear_bit_hook(struct inode *inode, if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID && do_list && !(state->state & EXTENT_NORESERVE)) - btrfs_free_reserved_data_space(inode, state->start, - len); + btrfs_free_reserved_data_space_noquota(inode, + state->start, len); __percpu_counter_add(&root->fs_info->delalloc_bytes, -len, root->fs_info->delalloc_batch); -- cgit v1.1 From 56fa9d0762ed17153c1bdff3c0aeeecbe522b504 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 13 Oct 2015 09:53:10 +0800 Subject: btrfs: qgroup: Check if qgroup reserved space leaked Add check at btrfs_destroy_inode() time to detect qgroup reserved space leak. Signed-off-by: Qu Wenruo Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 1 + fs/btrfs/qgroup.c | 32 ++++++++++++++++++++++++++++++++ fs/btrfs/qgroup.h | 1 + 3 files changed, 34 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3229c13..df6b93f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9108,6 +9108,7 @@ void btrfs_destroy_inode(struct inode *inode) btrfs_put_ordered_extent(ordered); } } + btrfs_qgroup_check_reserved_leak(inode); inode_tree_del(inode); btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); free: diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 879343c..158633c 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2645,3 +2645,35 @@ void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes) atomic_sub(num_bytes, &root->qgroup_meta_rsv); qgroup_free(root, num_bytes); } + +/* + * Check qgroup reserved space leaking, normally at destory inode + * time + */ +void btrfs_qgroup_check_reserved_leak(struct inode *inode) +{ + struct extent_changeset changeset; + struct ulist_node *unode; + struct ulist_iterator iter; + int ret; + + changeset.bytes_changed = 0; + changeset.range_changed = ulist_alloc(GFP_NOFS); + if (WARN_ON(!changeset.range_changed)) + return; + + ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1, + EXTENT_QGROUP_RESERVED, GFP_NOFS, &changeset); + + WARN_ON(ret < 0); + if (WARN_ON(changeset.bytes_changed)) { + ULIST_ITER_INIT(&iter); + while ((unode = ulist_next(changeset.range_changed, &iter))) { + btrfs_warn(BTRFS_I(inode)->root->fs_info, + "leaking qgroup reserved space, ino: %lu, start: %llu, end: %llu", + inode->i_ino, unode->val, unode->aux); + } + qgroup_free(BTRFS_I(inode)->root, changeset.bytes_changed); + } + ulist_free(changeset.range_changed); +} diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 686b60f..ecb2c14 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -105,4 +105,5 @@ int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len); int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes); void btrfs_qgroup_free_meta_all(struct btrfs_root *root); void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes); +void btrfs_qgroup_check_reserved_leak(struct inode *inode); #endif /* __BTRFS_QGROUP__ */ -- cgit v1.1 From 161c3549b45aeef05451b6822d8aaaf39c7bedce Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 24 Sep 2015 16:17:39 -0400 
Subject: Btrfs: change how we wait for pending ordered extents We have a mechanism to make sure we don't lose updates for ordered extents that were logged in the transaction that is currently running. We add the ordered extent to a transaction list and then the transaction waits on all the ordered extents in that list. However, on substantially large file systems this list can become extremely large, and can give us soft lockups, since the ordered extents don't remove themselves from the list when they do complete. To fix this we simply add a counter to the transaction that is incremented any time we have a logged extent that needs to be completed in the current transaction. Then when the ordered extent finally completes it decrements the per-transaction counter and wakes up the transaction if we are the last ones. This will eliminate the softlockup. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 20 ---------------- fs/btrfs/ordered-data.c | 64 ++++++++++++++++++++++++++++++++++++++----------- fs/btrfs/ordered-data.h | 2 ++ fs/btrfs/transaction.c | 34 +++++--------------------- fs/btrfs/transaction.h | 4 ++-- 5 files changed, 60 insertions(+), 64 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index bcbb596..dceabb1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4326,25 +4326,6 @@ again: return 0; } -static void btrfs_free_pending_ordered(struct btrfs_transaction *cur_trans, - struct btrfs_fs_info *fs_info) -{ - struct btrfs_ordered_extent *ordered; - - spin_lock(&fs_info->trans_lock); - while (!list_empty(&cur_trans->pending_ordered)) { - ordered = list_first_entry(&cur_trans->pending_ordered, - struct btrfs_ordered_extent, - trans_list); - list_del_init(&ordered->trans_list); - spin_unlock(&fs_info->trans_lock); - - btrfs_put_ordered_extent(ordered); - spin_lock(&fs_info->trans_lock); - } - spin_unlock(&fs_info->trans_lock); -} - void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, struct btrfs_root *root) { @@ -4356,7 +4337,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, cur_trans->state = TRANS_STATE_UNBLOCKED; wake_up(&root->fs_info->transaction_wait); - btrfs_free_pending_ordered(cur_trans, root->fs_info); btrfs_destroy_delayed_inodes(root); btrfs_assert_delayed_root_empty(root); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 071005f..8c27292 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -490,15 +490,16 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans, spin_lock_irq(&log->log_extents_lock[index]); while (!list_empty(&log->logged_list[index])) { + struct inode *inode; ordered = list_first_entry(&log->logged_list[index], struct btrfs_ordered_extent, log_list); list_del_init(&ordered->log_list); + inode = ordered->inode; spin_unlock_irq(&log->log_extents_lock[index]); if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) && !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { - struct inode *inode = ordered->inode; u64 start = ordered->file_offset; u64 end = ordered->file_offset + ordered->len - 1; @@ -509,20 +510,25 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans, &ordered->flags)); /* - * If our ordered extent completed it means it updated the - * fs/subvol and csum trees already, so no need to make the - * current transaction's commit wait for it, as we end up - * holding memory unnecessarily and delaying the inode's iput - * until the transaction commit (we schedule an iput for the - *
inode when the ordered extent's refcount drops to 0), which - * prevents it from being evictable until the transaction - * commits. + * In order to keep us from losing our ordered extent + * information when committing the transaction we have to make + * sure that any logged extents are completed when we go to + * commit the transaction. To do this we simply increase the + * current transactions pending_ordered counter and decrement it + * when the ordered extent completes. */ - if (test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) - btrfs_put_ordered_extent(ordered); - else - list_add_tail(&ordered->trans_list, &trans->ordered); - + if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { + struct btrfs_ordered_inode_tree *tree; + + tree = &BTRFS_I(inode)->ordered_tree; + spin_lock_irq(&tree->lock); + if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { + set_bit(BTRFS_ORDERED_PENDING, &ordered->flags); + atomic_inc(&trans->transaction->pending_ordered); + } + spin_unlock_irq(&tree->lock); + } + btrfs_put_ordered_extent(ordered); spin_lock_irq(&log->log_extents_lock[index]); } spin_unlock_irq(&log->log_extents_lock[index]); @@ -584,6 +590,7 @@ void btrfs_remove_ordered_extent(struct inode *inode, struct btrfs_ordered_inode_tree *tree; struct btrfs_root *root = BTRFS_I(inode)->root; struct rb_node *node; + bool dec_pending_ordered = false; tree = &BTRFS_I(inode)->ordered_tree; spin_lock_irq(&tree->lock); @@ -593,8 +600,37 @@ void btrfs_remove_ordered_extent(struct inode *inode, if (tree->last == node) tree->last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); + if (test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags)) + dec_pending_ordered = true; spin_unlock_irq(&tree->lock); + /* + * The current running transaction is waiting on us, we need to let it + * know that we're complete and wake it up. + */ + if (dec_pending_ordered) { + struct btrfs_transaction *trans; + + /* + * The checks for trans are just a formality, it should be set, + * but if it isn't we don't want to deref/assert under the spin + * lock, so be nice and check if trans is set, but ASSERT() so + * if it isn't set a developer will notice. + */ + spin_lock(&root->fs_info->trans_lock); + trans = root->fs_info->running_transaction; + if (trans) + atomic_inc(&trans->use_count); + spin_unlock(&root->fs_info->trans_lock); + + ASSERT(trans); + if (trans) { + if (atomic_dec_and_test(&trans->pending_ordered)) + wake_up(&trans->pending_wait); + btrfs_put_transaction(trans); + } + } + spin_lock(&root->ordered_extent_lock); list_del_init(&entry->root_extent_list); root->nr_ordered_extents--; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 7176cc0..23c9605 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -73,6 +73,8 @@ struct btrfs_ordered_sum { #define BTRFS_ORDERED_LOGGED 10 /* Set when we've waited on this ordered extent * in the logging code. */ +#define BTRFS_ORDERED_PENDING 11 /* We are waiting for this ordered extent to + * complete in the current transaction. 
*/ struct btrfs_ordered_extent { /* logical offset in the file */ u64 file_offset; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 9354e7a..68a56c3 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -232,6 +232,7 @@ loop: extwriter_counter_init(cur_trans, type); init_waitqueue_head(&cur_trans->writer_wait); init_waitqueue_head(&cur_trans->commit_wait); + init_waitqueue_head(&cur_trans->pending_wait); cur_trans->state = TRANS_STATE_RUNNING; /* * One for this trans handle, one so it will live on until we @@ -239,6 +240,7 @@ loop: */ atomic_set(&cur_trans->use_count, 2); cur_trans->have_free_bgs = 0; + atomic_set(&cur_trans->pending_ordered, 0); cur_trans->start_time = get_seconds(); cur_trans->dirty_bg_run = 0; @@ -266,7 +268,6 @@ loop: INIT_LIST_HEAD(&cur_trans->pending_snapshots); INIT_LIST_HEAD(&cur_trans->pending_chunks); INIT_LIST_HEAD(&cur_trans->switch_commits); - INIT_LIST_HEAD(&cur_trans->pending_ordered); INIT_LIST_HEAD(&cur_trans->dirty_bgs); INIT_LIST_HEAD(&cur_trans->io_bgs); INIT_LIST_HEAD(&cur_trans->dropped_roots); @@ -551,7 +552,6 @@ again: h->can_flush_pending_bgs = true; INIT_LIST_HEAD(&h->qgroup_ref_list); INIT_LIST_HEAD(&h->new_bgs); - INIT_LIST_HEAD(&h->ordered); smp_mb(); if (cur_trans->state >= TRANS_STATE_BLOCKED && @@ -784,12 +784,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (!list_empty(&trans->new_bgs)) btrfs_create_pending_block_groups(trans, root); - if (!list_empty(&trans->ordered)) { - spin_lock(&info->trans_lock); - list_splice_init(&trans->ordered, &cur_trans->pending_ordered); - spin_unlock(&info->trans_lock); - } - trans->delayed_ref_updates = 0; if (!trans->sync) { must_run_delayed_refs = @@ -1788,25 +1782,10 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) } static inline void -btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans, - struct btrfs_fs_info *fs_info) +btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans) { - struct btrfs_ordered_extent *ordered; - - spin_lock(&fs_info->trans_lock); - while (!list_empty(&cur_trans->pending_ordered)) { - ordered = list_first_entry(&cur_trans->pending_ordered, - struct btrfs_ordered_extent, - trans_list); - list_del_init(&ordered->trans_list); - spin_unlock(&fs_info->trans_lock); - - wait_event(ordered->wait, test_bit(BTRFS_ORDERED_COMPLETE, - &ordered->flags)); - btrfs_put_ordered_extent(ordered); - spin_lock(&fs_info->trans_lock); - } - spin_unlock(&fs_info->trans_lock); + wait_event(cur_trans->pending_wait, + atomic_read(&cur_trans->pending_ordered) == 0); } int btrfs_commit_transaction(struct btrfs_trans_handle *trans, @@ -1890,7 +1869,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, } spin_lock(&root->fs_info->trans_lock); - list_splice_init(&trans->ordered, &cur_trans->pending_ordered); if (cur_trans->state >= TRANS_STATE_COMMIT_START) { spin_unlock(&root->fs_info->trans_lock); atomic_inc(&cur_trans->use_count); @@ -1949,7 +1927,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_wait_delalloc_flush(root->fs_info); - btrfs_wait_pending_ordered(cur_trans, root->fs_info); + btrfs_wait_pending_ordered(cur_trans); btrfs_scrub_pause(root); /* diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index db6bfd9..bf7b1dd 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -46,6 +46,7 @@ struct btrfs_transaction { */ atomic_t num_writers; atomic_t use_count; + atomic_t pending_ordered; /* * true if there is free bgs operations in this 
transaction @@ -59,9 +60,9 @@ struct btrfs_transaction { unsigned long start_time; wait_queue_head_t writer_wait; wait_queue_head_t commit_wait; + wait_queue_head_t pending_wait; struct list_head pending_snapshots; struct list_head pending_chunks; - struct list_head pending_ordered; struct list_head switch_commits; struct list_head dirty_bgs; struct list_head io_bgs; @@ -129,7 +130,6 @@ struct btrfs_trans_handle { */ struct btrfs_root *root; struct seq_list delayed_ref_elem; - struct list_head ordered; struct list_head qgroup_ref_list; struct list_head new_bgs; }; -- cgit v1.1 From d9ee522ba3ab51b7e3c6dfcf3743216371bc810f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 5 Oct 2015 10:35:29 -0400 Subject: Btrfs: fix qgroup sanity tests With my changes to allow us to find old roots when resolving indirect refs I introduced a regression to the sanity tests. Since we don't really care to go down into the fs roots we just need to have the old behavior of returning ENOENT for dummy roots for the sanity tests. In the future if we want to get fancy we can populate the test fs trees with the references as well. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/backref.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index ecbc63d..2adc152a 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -362,6 +362,12 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, goto out; } + if (btrfs_test_is_dummy_root(root)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); + ret = -ENOENT; + goto out; + } + if (path->search_commit_root) root_level = btrfs_header_level(root->commit_root); else if (time_seq == (u64)-1) -- cgit v1.1 From d0bd456074dca089579818312da7cbe726ad2ff9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 23 Sep 2015 14:54:14 -0400 Subject: Btrfs: add fragment=* debug mount option In tracking down these weird bitmap problems it was helpful to artificially create an extremely fragmented file system. These mount options let us either fragment data or metadata or both. With these options I could reproduce all sorts of weird latencies and hangs that occur under extreme fragmentation and get them fixed. 
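The fragmentation itself is just a stride over the block group, dropping every other chunk from the free-space cache. A standalone sketch of the loop that fragment_free_space() in the diff below implements (printf stands in for btrfs_remove_free_space(); sizes are invented):

#include <stdint.h>
#include <stdio.h>

static uint64_t fragment(uint64_t start, uint64_t len, uint64_t chunk)
{
	uint64_t step = chunk << 1;	/* remove one chunk, keep the next */
	uint64_t removed = 0;

	while (len > chunk) {
		printf("remove [%llu, %llu)\n",
		       (unsigned long long)start,
		       (unsigned long long)(start + chunk));
		removed += chunk;
		start += step;
		len = (len < step) ? 0 : len - step;
	}
	return removed;		/* about half the range disappears */
}

int main(void)
{
	/* a 64K "block group" fragmented in 16K chunks loses 32K */
	printf("removed %llu\n",
	       (unsigned long long)fragment(0, 64 * 1024, 16 * 1024));
	return 0;
}

Since roughly half of the free space is discarded, the callers in the diff below charge space_info->bytes_used with half of the group's free bytes up front, keeping the allocator's accounting honest.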
Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 14 +++++++ fs/btrfs/extent-tree.c | 82 ++++++++++++++++++++++++++++++++++++--- fs/btrfs/free-space-cache.c | 9 ++++- fs/btrfs/super.c | 30 ++++++++++++++ fs/btrfs/tests/free-space-tests.c | 22 ++++++++++- 5 files changed, 150 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 49bc792..1638423 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2145,6 +2145,8 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21) #define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22) #define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23) +#define BTRFS_MOUNT_FRAGMENT_DATA (1 << 24) +#define BTRFS_MOUNT_FRAGMENT_METADATA (1 << 25) #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) #define BTRFS_DEFAULT_MAX_INLINE (8192) @@ -2169,6 +2171,18 @@ struct btrfs_ioctl_defrag_range_args { btrfs_clear_opt(root->fs_info->mount_opt, opt); \ } +#ifdef CONFIG_BTRFS_DEBUG +static inline int +btrfs_should_fragment_free_space(struct btrfs_root *root, + struct btrfs_block_group_cache *block_group) +{ + return (btrfs_test_opt(root, FRAGMENT_METADATA) && + block_group->flags & BTRFS_BLOCK_GROUP_METADATA) || + (btrfs_test_opt(root, FRAGMENT_DATA) && + block_group->flags & BTRFS_BLOCK_GROUP_DATA); +} +#endif + /* * Requests for changes that need to be done during transaction commit. * diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2df4bc7..0e32abf 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -332,6 +332,27 @@ static void put_caching_control(struct btrfs_caching_control *ctl) kfree(ctl); } +#ifdef CONFIG_BTRFS_DEBUG +static void fragment_free_space(struct btrfs_root *root, + struct btrfs_block_group_cache *block_group) +{ + u64 start = block_group->key.objectid; + u64 len = block_group->key.offset; + u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ? + root->nodesize : root->sectorsize; + u64 step = chunk << 1; + + while (len > chunk) { + btrfs_remove_free_space(block_group, start, chunk); + start += step; + if (len < step) + len = 0; + else + len -= step; + } +} +#endif + /* * this is only called by cache_block_group, since we could have freed extents * we need to check the pinned_extents for any extents that can't be used yet @@ -388,6 +409,7 @@ static noinline void caching_thread(struct btrfs_work *work) u64 last = 0; u32 nritems; int ret = -ENOMEM; + bool wakeup = true; caching_ctl = container_of(work, struct btrfs_caching_control, work); block_group = caching_ctl->block_group; @@ -400,6 +422,15 @@ static noinline void caching_thread(struct btrfs_work *work) last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); +#ifdef CONFIG_BTRFS_DEBUG + /* + * If we're fragmenting we don't want to make anybody think we can + * allocate from this block group until we've had a chance to fragment + * the free space. 
+ */ + if (btrfs_should_fragment_free_space(extent_root, block_group)) + wakeup = false; +#endif /* * We don't want to deadlock with somebody trying to allocate a new * extent for the extent root while also trying to search the extent @@ -441,7 +472,8 @@ next: if (need_resched() || rwsem_is_contended(&fs_info->commit_root_sem)) { - caching_ctl->progress = last; + if (wakeup) + caching_ctl->progress = last; btrfs_release_path(path); up_read(&fs_info->commit_root_sem); mutex_unlock(&caching_ctl->mutex); @@ -464,7 +496,8 @@ next: key.offset = 0; key.type = BTRFS_EXTENT_ITEM_KEY; - caching_ctl->progress = last; + if (wakeup) + caching_ctl->progress = last; btrfs_release_path(path); goto next; } @@ -491,7 +524,8 @@ next: if (total_found > (1024 * 1024 * 2)) { total_found = 0; - wake_up(&caching_ctl->wait); + if (wakeup) + wake_up(&caching_ctl->wait); } } path->slots[0]++; @@ -501,13 +535,27 @@ next: total_found += add_new_free_space(block_group, fs_info, last, block_group->key.objectid + block_group->key.offset); - caching_ctl->progress = (u64)-1; - spin_lock(&block_group->lock); block_group->caching_ctl = NULL; block_group->cached = BTRFS_CACHE_FINISHED; spin_unlock(&block_group->lock); +#ifdef CONFIG_BTRFS_DEBUG + if (btrfs_should_fragment_free_space(extent_root, block_group)) { + u64 bytes_used; + + spin_lock(&block_group->space_info->lock); + spin_lock(&block_group->lock); + bytes_used = block_group->key.offset - + btrfs_block_group_used(&block_group->item); + block_group->space_info->bytes_used += bytes_used >> 1; + spin_unlock(&block_group->lock); + spin_unlock(&block_group->space_info->lock); + fragment_free_space(extent_root, block_group); + } +#endif + + caching_ctl->progress = (u64)-1; err: btrfs_free_path(path); up_read(&fs_info->commit_root_sem); @@ -607,6 +655,22 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, } } spin_unlock(&cache->lock); +#ifdef CONFIG_BTRFS_DEBUG + if (ret == 1 && + btrfs_should_fragment_free_space(fs_info->extent_root, + cache)) { + u64 bytes_used; + + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + bytes_used = cache->key.offset - + btrfs_block_group_used(&cache->item); + cache->space_info->bytes_used += bytes_used >> 1; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + fragment_free_space(fs_info->extent_root, cache); + } +#endif mutex_unlock(&caching_ctl->mutex); wake_up(&caching_ctl->wait); @@ -9624,6 +9688,14 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, free_excluded_extents(root, cache); +#ifdef CONFIG_BTRFS_DEBUG + if (btrfs_should_fragment_free_space(root, cache)) { + u64 new_bytes_used = size - bytes_used; + + bytes_used += new_bytes_used >> 1; + fragment_free_space(root, cache); + } +#endif /* * Call to ensure the corresponding space_info object is created and * assigned to our block group, but don't update its counters just yet. 
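A quick worked example of that bytes_used adjustment (invented sizes; the pattern is the same in caching_thread(), cache_block_group() and btrfs_make_block_group()):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t size = 1024ULL << 20;		/* 1GiB block group */
	uint64_t bytes_used = 100ULL << 20;	/* 100MiB really allocated */
	uint64_t free_space = size - bytes_used;

	/* stripping every other chunk eats about half the free space,
	 * so charge that half to the space_info before anyone allocates */
	bytes_used += free_space >> 1;
	printf("accounted used: %llu MiB\n",
	       (unsigned long long)(bytes_used >> 20));	/* 562 MiB */
	return 0;
}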
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 5a9431d..c0eb84e 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1951,12 +1951,19 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info) { struct btrfs_block_group_cache *block_group = ctl->private; + bool forced = false; + +#ifdef CONFIG_BTRFS_DEBUG + if (btrfs_should_fragment_free_space(block_group->fs_info->extent_root, + block_group)) + forced = true; +#endif /* * If we are below the extents threshold then we can add this as an * extent, and don't have to deal with the bitmap */ - if (ctl->free_extents < ctl->extents_thresh) { + if (!forced && ctl->free_extents < ctl->extents_thresh) { /* * If this block group has some small extents we don't want to * use up all of our free slots in the cache with them, we want diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index b23d49d..24154e4 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -303,6 +303,9 @@ enum { Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard, Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow, Opt_datasum, Opt_treelog, Opt_noinode_cache, +#ifdef CONFIG_BTRFS_DEBUG + Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all, +#endif Opt_err, }; @@ -355,6 +358,11 @@ static match_table_t tokens = { {Opt_rescan_uuid_tree, "rescan_uuid_tree"}, {Opt_fatal_errors, "fatal_errors=%s"}, {Opt_commit_interval, "commit=%d"}, +#ifdef CONFIG_BTRFS_DEBUG + {Opt_fragment_data, "fragment=data"}, + {Opt_fragment_metadata, "fragment=metadata"}, + {Opt_fragment_all, "fragment=all"}, +#endif {Opt_err, NULL}, }; @@ -721,6 +729,22 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; } break; +#ifdef CONFIG_BTRFS_DEBUG + case Opt_fragment_all: + btrfs_info(root->fs_info, "fragmenting all space"); + btrfs_set_opt(info->mount_opt, FRAGMENT_DATA); + btrfs_set_opt(info->mount_opt, FRAGMENT_METADATA); + break; + case Opt_fragment_metadata: + btrfs_info(root->fs_info, "fragmenting metadata"); + btrfs_set_opt(info->mount_opt, + FRAGMENT_METADATA); + break; + case Opt_fragment_data: + btrfs_info(root->fs_info, "fragmenting data"); + btrfs_set_opt(info->mount_opt, FRAGMENT_DATA); + break; +#endif case Opt_err: btrfs_info(root->fs_info, "unrecognized mount option '%s'", p); ret = -EINVAL; @@ -1172,6 +1196,12 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",fatal_errors=panic"); if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL) seq_printf(seq, ",commit=%d", info->commit_interval); +#ifdef CONFIG_BTRFS_DEBUG + if (btrfs_test_opt(root, FRAGMENT_DATA)) + seq_puts(seq, ",fragment=data"); + if (btrfs_test_opt(root, FRAGMENT_METADATA)) + seq_puts(seq, ",fragment=metadata"); +#endif seq_printf(seq, ",subvolid=%llu", BTRFS_I(d_inode(dentry))->root->root_key.objectid); seq_puts(seq, ",subvol="); diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index 2299bfd..c8c3d70 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -19,6 +19,7 @@ #include #include "btrfs-tests.h" #include "../ctree.h" +#include "../disk-io.h" #include "../free-space-cache.h" #define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) @@ -35,6 +36,12 @@ static struct btrfs_block_group_cache *init_test_block_group(void) kfree(cache); return NULL; } + cache->fs_info = btrfs_alloc_dummy_fs_info(); + if (!cache->fs_info) { + kfree(cache->free_space_ctl); 
+ kfree(cache); + return NULL; + } cache->key.objectid = 0; cache->key.offset = 1024 * 1024 * 1024; @@ -879,7 +886,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) int btrfs_test_free_space_cache(void) { struct btrfs_block_group_cache *cache; - int ret; + struct btrfs_root *root = NULL; + int ret = -ENOMEM; test_msg("Running btrfs free space cache tests\n"); @@ -889,6 +897,17 @@ int btrfs_test_free_space_cache(void) return 0; } + root = btrfs_alloc_dummy_root(); + if (!root) + goto out; + + root->fs_info = btrfs_alloc_dummy_fs_info(); + if (!root->fs_info) + goto out; + + root->fs_info->extent_root = root; + cache->fs_info = root->fs_info; + ret = test_extents(cache); if (ret) goto out; @@ -904,6 +923,7 @@ out: __btrfs_remove_free_space_cache(cache->free_space_ctl); kfree(cache->free_space_ctl); kfree(cache); + btrfs_free_dummy_root(root); test_msg("Free space cache tests finished\n"); return ret; } -- cgit v1.1 From 0b670dc44c91bd1e5fac15b5ac4c98c8bd255ca2 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 23 Sep 2015 17:11:16 -0400 Subject: Btrfs: fix prealloc under heavy fragmentation conditions If we are heavily fragmented we will continually try to prealloc the largest extent size we can every time we call btrfs_reserve_extent. This can be very expensive when we are heavily fragmented, burning lots of CPU cycles and loops through the allocator. So instead notice when we get a smaller chunk from the allocator than what we specified and use this as the new maximum size we try to allocate. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5ce55f6..a3e0783 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9687,6 +9687,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, u64 cur_offset = start; u64 i_size; u64 cur_bytes; + u64 last_alloc = (u64)-1; int ret = 0; bool own_trans = true; @@ -9703,6 +9704,13 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, cur_bytes = min(num_bytes, 256ULL * 1024 * 1024); cur_bytes = max(cur_bytes, min_size); + /* + * If we are severely fragmented we could end up with really + * small allocations, so if the allocator is returning small + * chunks lets make its job easier by only searching for those + * sized chunks. + */ + cur_bytes = min(cur_bytes, last_alloc); ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0, *alloc_hint, &ins, 1, 0); if (ret) { @@ -9711,6 +9719,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, break; } + last_alloc = ins.offset; ret = insert_reserved_file_extent(trans, inode, cur_offset, ins.objectid, ins.offset, ins.offset, -- cgit v1.1 From 3204d33cda40d9bc97f257c441225d3713916661 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 24 Sep 2015 10:46:10 -0400 Subject: Btrfs: add a flags field to btrfs_transaction I want to set some per transaction flags, so instead of adding yet another int lets just convert the current two int indicators to flags and add a flags field for future use. 
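As an illustration of the conversion described above, here is a self-contained userspace sketch of collapsing two int indicators into one flags word addressed by bit number (toy helpers standing in for the kernel's test_bit()/set_bit(); names are hypothetical):

/*
 * Sketch of the conversion this patch makes: two int indicators
 * become one unsigned long flags word.  The BTRFS_TRANS_* values are
 * bit numbers, not masks, which is why the kernel uses
 * test_bit()/set_bit() rather than & and |.
 */
#include <stdio.h>

#define TRANS_HAVE_FREE_BGS 0	/* bit numbers, as in the patch */
#define TRANS_DIRTY_BG_RUN  1

struct toy_transaction {
	unsigned long flags;	/* replaces have_free_bgs + dirty_bg_run */
};

static int test_bit_(int nr, const unsigned long *w) { return (*w >> nr) & 1; }
static void set_bit_(int nr, unsigned long *w)       { *w |= 1UL << nr; }

int main(void)
{
	struct toy_transaction t = { .flags = 0 };

	set_bit_(TRANS_HAVE_FREE_BGS, &t.flags);
	printf("have_free_bgs=%d dirty_bg_run=%d\n",
	       test_bit_(TRANS_HAVE_FREE_BGS, &t.flags),
	       test_bit_(TRANS_DIRTY_BG_RUN, &t.flags));
	return 0;
}

In the patch itself, test_and_set_bit(BTRFS_TRANS_DIRTY_BG_RUN, ...) additionally folds the old check-then-set pair under ro_block_group_mutex into a single step.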
Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 5 +++-- fs/btrfs/transaction.c | 18 ++++++++---------- fs/btrfs/transaction.h | 9 ++++----- fs/btrfs/volumes.c | 2 +- 4 files changed, 16 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0e32abf..e0dfe35 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4066,7 +4066,8 @@ commit_trans: if (IS_ERR(trans)) return PTR_ERR(trans); if (have_pinned_space >= 0 || - trans->transaction->have_free_bgs || + test_bit(BTRFS_TRANS_HAVE_FREE_BGS, + &trans->transaction->flags) || need_commit > 0) { ret = btrfs_commit_transaction(trans, root); if (ret) @@ -8938,7 +8939,7 @@ again: * back off and let this transaction commit */ mutex_lock(&root->fs_info->ro_block_group_mutex); - if (trans->transaction->dirty_bg_run) { + if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) { u64 transid = trans->transid; mutex_unlock(&root->fs_info->ro_block_group_mutex); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 68a56c3..222f9a9 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -239,10 +239,9 @@ loop: * commit the transaction. */ atomic_set(&cur_trans->use_count, 2); - cur_trans->have_free_bgs = 0; atomic_set(&cur_trans->pending_ordered, 0); + cur_trans->flags = 0; cur_trans->start_time = get_seconds(); - cur_trans->dirty_bg_run = 0; memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs)); @@ -1837,7 +1836,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, return ret; } - if (!cur_trans->dirty_bg_run) { + if (!test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) { int run_it = 0; /* this mutex is also taken before trying to set @@ -1846,18 +1845,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, * after a extents from that block group have been * allocated for cache files. btrfs_set_block_group_ro * will wait for the transaction to commit if it - * finds dirty_bg_run = 1 + * finds BTRFS_TRANS_DIRTY_BG_RUN set. * - * The dirty_bg_run flag is also used to make sure only - * one process starts all the block group IO. It wouldn't + * The BTRFS_TRANS_DIRTY_BG_RUN flag is also used to make sure + * only one process starts all the block group IO. It wouldn't * hurt to have more than one go through, but there's no * real advantage to it either. 
*/ mutex_lock(&root->fs_info->ro_block_group_mutex); - if (!cur_trans->dirty_bg_run) { + if (!test_and_set_bit(BTRFS_TRANS_DIRTY_BG_RUN, + &cur_trans->flags)) run_it = 1; - cur_trans->dirty_bg_run = 1; - } mutex_unlock(&root->fs_info->ro_block_group_mutex); if (run_it) @@ -2127,7 +2125,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_finish_extent_commit(trans, root); - if (cur_trans->have_free_bgs) + if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags)) btrfs_clear_space_info_full(root->fs_info); root->fs_info->last_trans_committed = cur_trans->transid; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index bf7b1dd..109b8b7 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -32,6 +32,9 @@ enum btrfs_trans_state { TRANS_STATE_MAX = 6, }; +#define BTRFS_TRANS_HAVE_FREE_BGS 0 +#define BTRFS_TRANS_DIRTY_BG_RUN 1 + struct btrfs_transaction { u64 transid; /* @@ -48,10 +51,7 @@ struct btrfs_transaction { atomic_t use_count; atomic_t pending_ordered; - /* - * true if there is free bgs operations in this transaction - */ - int have_free_bgs; + unsigned long flags; /* Be protected by fs_info->trans_lock when we want to change it. */ enum btrfs_trans_state state; @@ -81,7 +81,6 @@ struct btrfs_transaction { spinlock_t dropped_roots_lock; struct btrfs_delayed_ref_root delayed_refs; int aborted; - int dirty_bg_run; }; #define __TRANS_FREEZABLE (1U << 0) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e875b6c..f86d838 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1462,7 +1462,7 @@ again: btrfs_std_error(root->fs_info, ret, "Failed to remove dev extent item"); } else { - trans->transaction->have_free_bgs = 1; + set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags); } out: btrfs_free_path(path); -- cgit v1.1 From 36af4e0737f6aa494e43497a5a34588a1d5cb12f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 25 Sep 2015 16:13:11 -0400 Subject: Btrfs: don't loop in allocator for space cache The space cache needs to have contiguous allocations, and the allocator tries to make allocations by reducing the amount of bytes requested and re-searching. But this just makes us waste time when we are very fragmented, so if we can't find our space just exit, don't bother trying to search again. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e0dfe35..d6b5ef4 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7337,7 +7337,7 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 empty_size, u64 hint_byte, struct btrfs_key *ins, int is_data, int delalloc) { - bool final_tried = false; + bool final_tried = num_bytes == min_alloc_size; u64 flags; int ret; -- cgit v1.1 From 4f4db2174d8d6cdc093cbb79d17fbfe0f4d9fbde Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 29 Sep 2015 11:40:47 -0400 Subject: Btrfs: keep track of max_extent_size per space_info When we are heavily fragmented we can induce a lot of latency trying to make an allocation happen that is simply not going to happen. Thankfully we keep track of our max_extent_size when going through the allocator, so if we get to the point where we are exiting find_free_extent with ENOSPC then set our space_info->max_extent_size so we can keep future allocations from having to pay this cost. 
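A sketch of the shortcut this enables (simplified: no locking, the expensive search stubbed out, the types and names are illustrative rather than the kernel's):

#include <stdint.h>
#include <stdio.h>
#include <errno.h>

struct toy_space_info {
	uint64_t max_extent_size;	/* 0 means "no recorded limit" */
};

static int reserve(struct toy_space_info *si, uint64_t num_bytes,
		   uint64_t *found)
{
	if (si->max_extent_size && num_bytes > si->max_extent_size) {
		*found = si->max_extent_size;	/* tell the caller what might fit */
		return -ENOSPC;			/* fail fast, skip the search */
	}
	/* ... the expensive search would run here; assume it fails ... */
	si->max_extent_size = 1 << 20;		/* remember the best we saw */
	*found = si->max_extent_size;
	return -ENOSPC;
}

int main(void)
{
	struct toy_space_info si = { 0 };
	uint64_t found;

	reserve(&si, 128 << 20, &found);	/* slow path, records 1 MiB */
	if (reserve(&si, 128 << 20, &found) == -ENOSPC)	/* fast path */
		printf("fast-failed, retry with <= %llu bytes\n",
		       (unsigned long long)found);
	return 0;
}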
We reset the max_extent_size whenever we release pinned bytes back into this space info so we can redo all the work. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 4 ++++ fs/btrfs/extent-tree.c | 31 ++++++++++++++++++++++++++++++- 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1638423..78a8f56 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1154,6 +1154,10 @@ struct btrfs_space_info { delalloc/allocations */ u64 bytes_readonly; /* total bytes that are read only */ + u64 max_extent_size; /* This will hold the maximum extent size of + the space info if we had an ENOSPC in the + allocator. */ + unsigned int full:1; /* indicates that we cannot allocate any more chunks for this space */ unsigned int chunk_alloc:1; /* set if we are allocating a chunk */ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d6b5ef4..febb5bc 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3810,6 +3810,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->bytes_readonly = 0; found->bytes_may_use = 0; found->full = 0; + found->max_extent_size = 0; found->force_alloc = CHUNK_ALLOC_NO_FORCE; found->chunk_alloc = 0; found->flush = 0; @@ -6158,6 +6159,7 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, spin_lock(&cache->lock); cache->pinned -= len; space_info->bytes_pinned -= len; + space_info->max_extent_size = 0; percpu_counter_add(&space_info->total_bytes_pinned, -len); if (cache->ro) { space_info->bytes_readonly += len; @@ -6915,6 +6917,29 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, } /* + * If our free space is heavily fragmented we may not be able to make + * big contiguous allocations, so instead of doing the expensive search + * for free space, simply return ENOSPC with our max_extent_size so we + * can go ahead and search for a more manageable chunk. + * + * If our max_extent_size is large enough for our allocation simply + * disable clustering since we will likely not be able to find enough + * space to create a cluster and induce latency trying. + */ + if (unlikely(space_info->max_extent_size)) { + spin_lock(&space_info->lock); + if (space_info->max_extent_size && + num_bytes > space_info->max_extent_size) { + ins->offset = space_info->max_extent_size; + spin_unlock(&space_info->lock); + return -ENOSPC; + } else if (space_info->max_extent_size) { + use_cluster = false; + } + spin_unlock(&space_info->lock); + } + + /* * If the space info is for both data and metadata it means we have a * small filesystem and we can't use the clustering stuff. */ @@ -7287,8 +7312,12 @@ loop: ret = 0; } out: - if (ret == -ENOSPC) + if (ret == -ENOSPC) { + spin_lock(&space_info->lock); + space_info->max_extent_size = max_extent_size; + spin_unlock(&space_info->lock); ins->offset = max_extent_size; + } return ret; } -- cgit v1.1 From 2968b1f48bd7366dd7310acde1ee6d1bf7791142 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 1 Oct 2015 12:55:18 -0400 Subject: Btrfs: don't continue setting up space cache when enospc If we hit ENOSPC when setting up a space cache don't bother setting up any of the other space caches in this transaction, it'll just induce unnecessary latency.
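The mechanism is essentially a per-transaction poison flag; a minimal userspace sketch (illustrative types and names, not the kernel code):

#include <stdio.h>
#include <errno.h>

struct toy_trans { int cache_enospc; };

/* Stand-in for the preallocation: pretend it fails from bg 2 onward. */
static int prealloc_cache(int bg) { return bg >= 2 ? -ENOSPC : 0; }

static int cache_setup(struct toy_trans *t, int bg)
{
	if (t->cache_enospc)
		return -ENOSPC;		/* skip the work entirely */

	int ret = prealloc_cache(bg);
	if (ret == -ENOSPC)
		t->cache_enospc = 1;	/* poison the rest of the commit */
	return ret;
}

int main(void)
{
	struct toy_trans t = { 0 };

	for (int bg = 0; bg < 5; bg++)
		printf("bg %d -> %d\n", bg, cache_setup(&t, bg));
	return 0;
}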
Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 19 +++++++++++++++++++ fs/btrfs/transaction.h | 1 + 2 files changed, 20 insertions(+) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index febb5bc..3185c45 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3403,6 +3403,15 @@ again: spin_unlock(&block_group->lock); /* + * We hit an ENOSPC when setting up the cache in this transaction, just + * skip doing the setup, we've already cleared the cache so we're safe. + */ + if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) { + ret = -ENOSPC; + goto out_put; + } + + /* * Try to preallocate enough space based on how big the block group is. * Keep in mind this has to include any pinned space which could end up * taking up quite a bit since it's not folded into the other space @@ -3422,8 +3431,18 @@ again: ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, num_pages, num_pages, &alloc_hint); + /* + * Our cache requires contiguous chunks so that we don't modify a bunch + * of metadata or split extents when writing the cache out, which means + * we can enospc if we are heavily fragmented in addition to just normal + * out of space conditions. So if we hit this just skip setting up any + * other block groups for this transaction, maybe we'll unpin enough + * space the next time around. + */ if (!ret) dcs = BTRFS_DC_SETUP; + else if (ret == -ENOSPC) + set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags); btrfs_free_reserved_data_space(inode, num_pages); out_put: diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 109b8b7..30ae750 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -34,6 +34,7 @@ enum btrfs_trans_state { #define BTRFS_TRANS_HAVE_FREE_BGS 0 #define BTRFS_TRANS_DIRTY_BG_RUN 1 +#define BTRFS_TRANS_CACHE_ENOSPC 2 struct btrfs_transaction { u64 transid; -- cgit v1.1 From a5e681d9bd641c4f0677e87d3a0c92a8f4f16293 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 1 Oct 2015 14:54:10 -0400 Subject: Btrfs: cut down on loops through the allocator We try really really hard to make allocations, but sometimes it is just not going to happen, especially when free space is extremely fragmented. So add a few short cuts through the looping states. For example if we couldn't allocate a chunk, just go straight to the NO_EMPTY_SIZE loop. If there are no uncached block groups and we've done a full search, go straight to the ALLOC_CHUNK stage. And finally if we already have empty_size and empty_cluster set to 0 go ahead and return -ENOSPC. 
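A condensed sketch of the resulting state machine (conditions stubbed into flags; the names mirror the patch but everything else is simplified):

enum loop_state {
	LOOP_CACHING_NOWAIT,
	LOOP_CACHING_WAIT,
	LOOP_ALLOC_CHUNK,
	LOOP_NO_EMPTY_SIZE,
};

/* One step of the allocator's loop with the patch's shortcuts applied. */
static enum loop_state advance(enum loop_state loop, int have_caching_bg,
			       int full_search, int chunk_alloc_enospc)
{
	if (loop == LOOP_CACHING_NOWAIT) {
		/* waiting is pointless with no caching bgs after a full search */
		if (have_caching_bg || !full_search)
			loop = LOOP_CACHING_WAIT;
		else
			loop = LOOP_ALLOC_CHUNK;
	} else {
		loop++;
	}
	/* failing to add a chunk means only NO_EMPTY_SIZE is left to try */
	if (loop == LOOP_ALLOC_CHUNK && chunk_alloc_enospc)
		loop = LOOP_NO_EMPTY_SIZE;
	return loop;
}

#include <stdio.h>
int main(void)
{
	/* fully searched, nothing caching, chunk allocation fails:
	 * CACHING_NOWAIT jumps straight to NO_EMPTY_SIZE in one step */
	enum loop_state s = advance(LOOP_CACHING_NOWAIT, 0, 1, 1);

	printf("next state: %d (LOOP_NO_EMPTY_SIZE=%d)\n", s, LOOP_NO_EMPTY_SIZE);
	return 0;
}

The third shortcut, returning -ENOSPC when empty_size and empty_cluster are already 0, is a plain early exit inside LOOP_NO_EMPTY_SIZE and is omitted from the sketch.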
Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 39 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3185c45..9f18eb0 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6921,6 +6921,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, bool failed_alloc = false; bool use_cluster = true; bool have_caching_bg = false; + bool full_search = false; WARN_ON(num_bytes < root->sectorsize); ins->type = BTRFS_EXTENT_ITEM_KEY; @@ -7023,6 +7024,8 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, } search: have_caching_bg = false; + if (index == 0 || index == __get_raid_index(flags)) + full_search = true; down_read(&space_info->groups_sem); list_for_each_entry(block_group, &space_info->block_groups[index], list) { @@ -7056,6 +7059,7 @@ search: have_block_group: cached = block_group_cache_done(block_group); if (unlikely(!cached)) { + have_caching_bg = true; ret = cache_block_group(block_group, 0); BUG_ON(ret < 0); ret = 0; @@ -7228,8 +7232,6 @@ unclustered_alloc: failed_alloc = true; goto have_block_group; } else if (!offset) { - if (!cached) - have_caching_bg = true; goto loop; } checks: @@ -7286,7 +7288,20 @@ loop: */ if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) { index = 0; - loop++; + if (loop == LOOP_CACHING_NOWAIT) { + /* + * We want to skip the LOOP_CACHING_WAIT step if we + * don't have any uncached bgs and we've already done a + * full search through. + */ + if (have_caching_bg || !full_search) + loop = LOOP_CACHING_WAIT; + else + loop = LOOP_ALLOC_CHUNK; + } else { + loop++; + } + if (loop == LOOP_ALLOC_CHUNK) { struct btrfs_trans_handle *trans; int exist = 0; @@ -7304,6 +7319,15 @@ loop: ret = do_chunk_alloc(trans, root, flags, CHUNK_ALLOC_FORCE); + + /* + * If we can't allocate a new chunk we've already looped + * through at least once, move on to the NO_EMPTY_SIZE + * case. + */ + if (ret == -ENOSPC) + loop = LOOP_NO_EMPTY_SIZE; + /* * Do not bail out on ENOSPC since we * can do more things. @@ -7320,6 +7344,15 @@ loop: } if (loop == LOOP_NO_EMPTY_SIZE) { + /* + * Don't loop again if we already have no empty_size and + * no empty_cluster. + */ + if (empty_size == 0 && + empty_cluster == 0) { + ret = -ENOSPC; + goto out; + } empty_size = 0; empty_cluster = 0; } -- cgit v1.1 From c759c4e16179e47e099f491011e6acd7858f8625 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 2 Oct 2015 15:25:10 -0400 Subject: Btrfs: don't keep trying to build clusters if we are fragmented If we are extremely fragmented then we won't be able to create a free_cluster. So if this happens set last_ptr->fragmented so that all future allocations will give up trying to create a cluster. When we unpin extents we will unset ->fragmented if we free up a sufficient amount of space in a block group.
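The set/clear policy can be sketched in a few lines of standalone C (locking omitted, names illustrative):

#include <stdint.h>
#include <stdio.h>

struct toy_cluster { int fragmented; };

static void unclustered_fallback(struct toy_cluster *c)
{
	c->fragmented = 1;		/* stop trying to build clusters */
}

static void unpin_range(struct toy_cluster *c, uint64_t *total_unpinned,
			uint64_t len, uint64_t empty_cluster)
{
	*total_unpinned += len;
	/* the patch doubles empty_cluster before this comparison */
	if (c->fragmented && *total_unpinned > (empty_cluster << 1))
		c->fragmented = 0;	/* worth trying clusters again */
}

int main(void)
{
	struct toy_cluster c = { 0 };
	uint64_t unpinned = 0;

	unclustered_fallback(&c);
	unpin_range(&c, &unpinned, 3 * 1024 * 1024, 64 * 1024);
	printf("fragmented=%d after unpinning 3 MiB\n", c.fragmented);
	return 0;
}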
Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 3 ++ fs/btrfs/extent-tree.c | 102 +++++++++++++++++++++++++++++++++----------- fs/btrfs/free-space-cache.c | 1 + 3 files changed, 82 insertions(+), 24 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 78a8f56..ced08c9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1232,6 +1232,9 @@ struct btrfs_free_cluster { /* first extent starting offset */ u64 window_start; + /* We did a full search and couldn't create a cluster */ + bool fragmented; + struct btrfs_block_group_cache *block_group; /* * when a cluster is allocated from a block group, we put the diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9f18eb0..4757645 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6142,6 +6142,34 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, update_global_block_rsv(fs_info); } +/* + * Returns the free cluster for the given space info and sets empty_cluster to + * what it should be based on the mount options. + */ +static struct btrfs_free_cluster * +fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info, + u64 *empty_cluster) +{ + struct btrfs_free_cluster *ret = NULL; + bool ssd = btrfs_test_opt(root, SSD); + + *empty_cluster = 0; + if (btrfs_mixed_space_info(space_info)) + return ret; + + if (ssd) + *empty_cluster = 2 * 1024 * 1024; + if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { + ret = &root->fs_info->meta_alloc_cluster; + if (!ssd) + *empty_cluster = 64 * 1024; + } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) { + ret = &root->fs_info->data_alloc_cluster; + } + + return ret; +} + static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, const bool return_free_space) { @@ -6149,7 +6177,10 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, struct btrfs_block_group_cache *cache = NULL; struct btrfs_space_info *space_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + struct btrfs_free_cluster *cluster = NULL; u64 len; + u64 total_unpinned = 0; + u64 empty_cluster = 0; bool readonly; while (start <= end) { @@ -6158,8 +6189,14 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, start >= cache->key.objectid + cache->key.offset) { if (cache) btrfs_put_block_group(cache); + total_unpinned = 0; cache = btrfs_lookup_block_group(fs_info, start); BUG_ON(!cache); /* Logic error */ + + cluster = fetch_cluster_info(root, + cache->space_info, + &empty_cluster); + empty_cluster <<= 1; } len = cache->key.objectid + cache->key.offset - start; @@ -6172,8 +6209,22 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, } start += len; + total_unpinned += len; space_info = cache->space_info; + /* + * If this space cluster has been marked as fragmented and we've + * unpinned enough in this block group to potentially allow a + * cluster to be created inside of it go ahead and clear the + * fragmented check. 
+ */ + if (cluster && cluster->fragmented && + total_unpinned > empty_cluster) { + spin_lock(&cluster->lock); + cluster->fragmented = 0; + spin_unlock(&cluster->lock); + } + spin_lock(&space_info->lock); spin_lock(&cache->lock); cache->pinned -= len; @@ -6911,7 +6962,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, struct btrfs_block_group_cache *block_group = NULL; u64 search_start = 0; u64 max_extent_size = 0; - int empty_cluster = 2 * 1024 * 1024; + u64 empty_cluster = 0; struct btrfs_space_info *space_info; int loop = 0; int index = __get_raid_index(flags); @@ -6959,37 +7010,25 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, spin_unlock(&space_info->lock); } - /* - * If the space info is for both data and metadata it means we have a - * small filesystem and we can't use the clustering stuff. - */ - if (btrfs_mixed_space_info(space_info)) - use_cluster = false; - - if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) { - last_ptr = &root->fs_info->meta_alloc_cluster; - if (!btrfs_test_opt(root, SSD)) - empty_cluster = 64 * 1024; - } - - if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster && - btrfs_test_opt(root, SSD)) { - last_ptr = &root->fs_info->data_alloc_cluster; - } - + last_ptr = fetch_cluster_info(orig_root, space_info, &empty_cluster); if (last_ptr) { spin_lock(&last_ptr->lock); if (last_ptr->block_group) hint_byte = last_ptr->window_start; + if (last_ptr->fragmented) { + /* + * We still set window_start so we can keep track of the + * last place we found an allocation to try and save + * some time. + */ + hint_byte = last_ptr->window_start; + use_cluster = false; + } spin_unlock(&last_ptr->lock); } search_start = max(search_start, first_logical_byte(root, 0)); search_start = max(search_start, hint_byte); - - if (!last_ptr) - empty_cluster = 0; - if (search_start == hint_byte) { block_group = btrfs_lookup_block_group(root->fs_info, search_start); @@ -7074,7 +7113,7 @@ have_block_group: * Ok we want to try and use the cluster allocator, so * lets look there */ - if (last_ptr) { + if (last_ptr && use_cluster) { struct btrfs_block_group_cache *used_block_group; unsigned long aligned_cluster; /* @@ -7200,6 +7239,16 @@ refill_cluster: } unclustered_alloc: + /* + * We are doing an unclustered alloc, set the fragmented flag so + * we don't bother trying to setup a cluster again until we get + * more space. 
+ */ + if (unlikely(last_ptr)) { + spin_lock(&last_ptr->lock); + last_ptr->fragmented = 1; + spin_unlock(&last_ptr->lock); + } spin_lock(&block_group->free_space_ctl->tree_lock); if (cached && block_group->free_space_ctl->free_space < @@ -7361,6 +7410,11 @@ loop: } else if (!ins->objectid) { ret = -ENOSPC; } else if (ins->objectid) { + if (!use_cluster && last_ptr) { + spin_lock(&last_ptr->lock); + last_ptr->window_start = ins->objectid; + spin_unlock(&last_ptr->lock); + } ret = 0; } out: diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index c0eb84e..a5922e8 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -3063,6 +3063,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) spin_lock_init(&cluster->refill_lock); cluster->root = RB_ROOT; cluster->max_size = 0; + cluster->fragmented = false; INIT_LIST_HEAD(&cluster->block_group_list); cluster->block_group = NULL; } -- cgit v1.1 From cef404837002103584c7c82f1e3fc3ec5961f47b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 2 Oct 2015 16:09:42 -0400 Subject: Btrfs: keep track of largest extent in bitmaps We can waste a lot of time searching through bitmaps when we are heavily fragmented trying to find large contiguous areas that don't exist in the bitmap. So keep track of the max extent size when we do a full search of a bitmap so that next time around we can just skip the expensive searching if our max size is less than what we are looking for. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 39 ++++++++++++++++++++++++++++++++++++++- fs/btrfs/free-space-cache.h | 1 + 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index a5922e8..e10c966 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1738,6 +1738,16 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl, unsigned long next_zero; unsigned long extent_bits; + /* + * Skip searching the bitmap if we don't have a contiguous section that + * is large enough for this allocation. + */ + if (bitmap_info->max_extent_size && + bitmap_info->max_extent_size < *bytes) { + *bytes = bitmap_info->max_extent_size; + return -1; + } + i = offset_to_bit(bitmap_info->offset, ctl->unit, max_t(u64, *offset, bitmap_info->offset)); bits = bytes_to_bits(*bytes, ctl->unit); @@ -1762,6 +1772,7 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl, } *bytes = (u64)(max_bits) * ctl->unit; + bitmap_info->max_extent_size = *bytes; return -1; } @@ -1943,6 +1954,12 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, bitmap_set_bits(ctl, info, offset, bytes_to_set); + /* + * We set some bytes, we have no idea what the max extent size is + * anymore. + */ + info->max_extent_size = 0; + return bytes_to_set; } @@ -2782,6 +2799,7 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, unsigned long want_bits; unsigned long min_bits; unsigned long found_bits; + unsigned long max_bits = 0; unsigned long start = 0; unsigned long total_found = 0; int ret; @@ -2791,6 +2809,13 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, want_bits = bytes_to_bits(bytes, ctl->unit); min_bits = bytes_to_bits(min_bytes, ctl->unit); + /* + * Don't bother looking for a cluster in this bitmap if it's heavily + * fragmented. 
+ */ + if (entry->max_extent_size && + entry->max_extent_size < cont1_bytes) + return -ENOSPC; again: found_bits = 0; for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) { @@ -2798,13 +2823,19 @@ again: BITS_PER_BITMAP, i); if (next_zero - i >= min_bits) { found_bits = next_zero - i; + if (found_bits > max_bits) + max_bits = found_bits; break; } + if (next_zero - i > max_bits) + max_bits = next_zero - i; i = next_zero; } - if (!found_bits) + if (!found_bits) { + entry->max_extent_size = (u64)max_bits * ctl->unit; return -ENOSPC; + } if (!total_found) { start = i; @@ -3540,6 +3571,7 @@ again: spin_lock(&ctl->tree_lock); info->offset = offset; info->bytes = bytes; + info->max_extent_size = 0; ret = link_free_space(ctl, info); spin_unlock(&ctl->tree_lock); if (ret) @@ -3567,6 +3599,11 @@ again: } bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes); + + /* We used the newly allocated info, set the max_extent_size to bytes */ + if (!info) + bitmap_info->max_extent_size = bytes_added; + bytes -= bytes_added; offset += bytes_added; spin_unlock(&ctl->tree_lock); diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index a16a029..f251865 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -23,6 +23,7 @@ struct btrfs_free_space { struct rb_node offset_index; u64 offset; u64 bytes; + u64 max_extent_size; unsigned long *bitmap; struct list_head list; }; -- cgit v1.1 From 0584f718ed1f351fca5047a4b1ebba9b5ea41215 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 2 Oct 2015 16:12:23 -0400 Subject: Btrfs: don't do extra bitmap search in one bit case When we make ctl->unit allocations from a bitmap there is no point in searching for the next 0 in the bitmap. If we've found a bit we're done and can just exit the loop. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index e10c966..0948d34 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1730,7 +1730,7 @@ static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl, */ static int search_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *bitmap_info, u64 *offset, - u64 *bytes) + u64 *bytes, bool for_alloc) { unsigned long found_bits = 0; unsigned long max_bits = 0; @@ -1742,7 +1742,8 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl, * Skip searching the bitmap if we don't have a contiguous section that * is large enough for this allocation. 
*/ - if (bitmap_info->max_extent_size && + if (for_alloc && + bitmap_info->max_extent_size && bitmap_info->max_extent_size < *bytes) { *bytes = bitmap_info->max_extent_size; return -1; @@ -1753,6 +1754,10 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl, bits = bytes_to_bits(*bytes, ctl->unit); for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) { + if (for_alloc && bits == 1) { + found_bits = 1; + break; + } next_zero = find_next_zero_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i); extent_bits = next_zero - i; @@ -1824,7 +1829,7 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, if (entry->bitmap) { u64 size = *bytes; - ret = search_bitmap(ctl, entry, &tmp, &size); + ret = search_bitmap(ctl, entry, &tmp, &size, true); if (!ret) { *offset = tmp; *bytes = size; @@ -1885,7 +1890,8 @@ again: search_start = *offset; search_bytes = ctl->unit; search_bytes = min(search_bytes, end - search_start + 1); - ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes); + ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes, + false); if (ret < 0 || search_start != *offset) return -EINVAL; @@ -1930,7 +1936,7 @@ again: search_start = *offset; search_bytes = ctl->unit; ret = search_bitmap(ctl, bitmap_info, &search_start, - &search_bytes); + &search_bytes, false); if (ret < 0 || search_start != *offset) return -EAGAIN; @@ -2685,7 +2691,7 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, search_start = min_start; search_bytes = bytes; - err = search_bitmap(ctl, entry, &search_start, &search_bytes); + err = search_bitmap(ctl, entry, &search_start, &search_bytes, true); if (err) { if (search_bytes > *max_extent_size) *max_extent_size = search_bytes; @@ -3262,7 +3268,7 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group, } bytes = minlen; - ret2 = search_bitmap(ctl, entry, &start, &bytes); + ret2 = search_bitmap(ctl, entry, &start, &bytes, false); if (ret2 || start >= end) { spin_unlock(&ctl->tree_lock); mutex_unlock(&ctl->cache_writeout_mutex); @@ -3415,7 +3421,7 @@ u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root) u64 count = 1; int ret; - ret = search_bitmap(ctl, entry, &offset, &count); + ret = search_bitmap(ctl, entry, &offset, &count, true); /* Logic error; Should be empty if it can't find anything */ ASSERT(!ret); @@ -3600,10 +3606,6 @@ again: bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes); - /* We used the newly allocated info, set the max_extent_size to bytes */ - if (!info) - bitmap_info->max_extent_size = bytes_added; - bytes -= bytes_added; offset += bytes_added; spin_unlock(&ctl->tree_lock); @@ -3647,7 +3649,7 @@ have_info: bit_off = offset; bit_bytes = ctl->unit; - ret = search_bitmap(ctl, info, &bit_off, &bit_bytes); + ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false); if (!ret) { if (bit_off == offset) { ret = 1; -- cgit v1.1 From 2c3cf7d5f6105bb957df125dfce61d4483b8742d Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 22 Oct 2015 09:47:34 +0100 Subject: Btrfs: fix regression when running delayed references MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the kernel 4.2 merge window we had a refactoring/rework of the delayed references implementation in order to fix certain problems with qgroups. However that rework introduced one more regression that leads to the following trace when running delayed references for metadata: [35908.064664] kernel BUG at fs/btrfs/extent-tree.c:1832! 
[35908.065201] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC [35908.065201] Modules linked in: dm_flakey dm_mod btrfs crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse parport_pc psmouse i2 [35908.065201] CPU: 14 PID: 15014 Comm: kworker/u32:9 Tainted: G W 4.3.0-rc5-btrfs-next-17+ #1 [35908.065201] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014 [35908.065201] Workqueue: btrfs-extent-refs btrfs_extent_refs_helper [btrfs] [35908.065201] task: ffff880114b7d780 ti: ffff88010c4c8000 task.ti: ffff88010c4c8000 [35908.065201] RIP: 0010:[] [] insert_inline_extent_backref+0x52/0xb1 [btrfs] [35908.065201] RSP: 0018:ffff88010c4cbb08 EFLAGS: 00010293 [35908.065201] RAX: 0000000000000000 RBX: ffff88008a661000 RCX: 0000000000000000 [35908.065201] RDX: ffffffffa04dd58f RSI: 0000000000000001 RDI: 0000000000000000 [35908.065201] RBP: ffff88010c4cbb40 R08: 0000000000001000 R09: ffff88010c4cb9f8 [35908.065201] R10: 0000000000000000 R11: 000000000000002c R12: 0000000000000000 [35908.065201] R13: ffff88020a74c578 R14: 0000000000000000 R15: 0000000000000000 [35908.065201] FS: 0000000000000000(0000) GS:ffff88023edc0000(0000) knlGS:0000000000000000 [35908.065201] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [35908.065201] CR2: 00000000015e8708 CR3: 0000000102185000 CR4: 00000000000006e0 [35908.065201] Stack: [35908.065201] ffff88010c4cbb18 0000000000000f37 ffff88020a74c578 ffff88015a408000 [35908.065201] ffff880154a44000 0000000000000000 0000000000000005 ffff88010c4cbbd8 [35908.065201] ffffffffa0492b9a 0000000000000005 0000000000000000 0000000000000000 [35908.065201] Call Trace: [35908.065201] [] __btrfs_inc_extent_ref+0x8b/0x208 [btrfs] [35908.065201] [] ? __btrfs_run_delayed_refs+0x4d4/0xd33 [btrfs] [35908.065201] [] __btrfs_run_delayed_refs+0xafa/0xd33 [btrfs] [35908.065201] [] ? join_transaction.isra.10+0x25/0x41f [btrfs] [35908.065201] [] ? join_transaction.isra.10+0xa8/0x41f [btrfs] [35908.065201] [] btrfs_run_delayed_refs+0x75/0x1dd [btrfs] [35908.065201] [] delayed_ref_async_start+0x3c/0x7b [btrfs] [35908.065201] [] normal_work_helper+0x14c/0x32a [btrfs] [35908.065201] [] btrfs_extent_refs_helper+0x12/0x14 [btrfs] [35908.065201] [] process_one_work+0x24a/0x4ac [35908.065201] [] worker_thread+0x206/0x2c2 [35908.065201] [] ? rescuer_thread+0x2cb/0x2cb [35908.065201] [] ? rescuer_thread+0x2cb/0x2cb [35908.065201] [] kthread+0xef/0xf7 [35908.065201] [] ? kthread_parkme+0x24/0x24 [35908.065201] [] ret_from_fork+0x3f/0x70 [35908.065201] [] ? kthread_parkme+0x24/0x24 [35908.065201] Code: 6a 01 41 56 41 54 ff 75 10 41 51 4d 89 c1 49 89 c8 48 8d 4d d0 e8 f6 f1 ff ff 48 83 c4 28 85 c0 75 2c 49 81 fc ff 00 00 00 77 02 <0f> 0b 4c 8b 45 30 8b 4d 28 45 31 [35908.065201] RIP [] insert_inline_extent_backref+0x52/0xb1 [btrfs] [35908.065201] RSP [35908.310885] ---[ end trace fe4299baf0666457 ]--- This happens because the new delayed references code no longer merges delayed references that have different sequence values. The following steps are an example sequence leading to this issue: 1) Transaction N starts, fs_info->tree_mod_seq has value 0; 2) Extent buffer (btree node) A is allocated, delayed reference Ref1 for bytenr A is created, with a value of 1 and a seq value of 0; 3) fs_info->tree_mod_seq is incremented to 1; 4) Extent buffer A is deleted through btrfs_del_items(), which calls btrfs_del_leaf(), which in turn calls btrfs_free_tree_block(). 
The latter returns the metadata extent associated with extent buffer A to the free space cache (the range is not pinned), because the extent buffer was created in the current transaction (N) and writeback never happened for the extent buffer (flag BTRFS_HEADER_FLAG_WRITTEN not set in the extent buffer). This creates the delayed reference Ref2 for bytenr A, with a value of -1 and a seq value of 1; 5) Delayed reference Ref2 is not merged with Ref1 when we create it, because they have different sequence numbers (decided at add_delayed_ref_tail_merge()); 6) fs_info->tree_mod_seq is incremented to 2; 7) Some task attempts to allocate a new extent buffer (done at extent-tree.c:find_free_extent()), but due to heavy fragmentation and running low on metadata space the clustered allocation fails and we fall back to unclustered allocation, which finds the extent at offset A, so a new extent buffer at offset A is allocated. This creates delayed reference Ref3 for bytenr A, with a value of 1 and a seq value of 2; 8) Ref3 is merged with neither Ref2 nor Ref1, again because they all have different seq values; 9) We start running the delayed references (__btrfs_run_delayed_refs()); 10) The delayed Ref1 is the first one being applied, which ends up creating an inline extent backref in the extent tree; 11) Next the delayed reference Ref3 is selected for execution, and not Ref2, because select_delayed_ref() always gives preference to positive references (that have an action of BTRFS_ADD_DELAYED_REF); 12) When running Ref3 we encounter the inline extent backref already present in the extent tree at insert_inline_extent_backref(), which makes us hit the following BUG_ON: BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); This is always true because owner corresponds to the level of the extent buffer/btree node in the btree. For the scenario described above we hit the BUG_ON because we never merge references that have different seq values. We used to do the merging before the 4.2 kernel, more specifically, before the commits: c6fc24549960 ("btrfs: delayed-ref: Use list to replace the ref_root in ref_head.") c43d160fcd5e ("btrfs: delayed-ref: Cleanup the unneeded functions.") This issue became more exposed after the following change that was added to 4.2 as well: cffc3374e567 ("Btrfs: fix order by which delayed references are run"), which in turn fixed another regression introduced by the two commits previously mentioned. So fix this by bringing back the delayed reference merge code, with the proper adaptations so that it operates against the new data structure (linked list vs the old red-black tree implementation). This issue was hit running fstest btrfs/063 in a loop. Several people have reported this issue in the mailing list when running on kernels 4.2+. Very special thanks to Stéphane Lesimple for helping debug this issue and testing this fix on his multi-terabyte filesystem (which took more than one day to balance alone, plus fsck, etc).
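To see why merging fixes the sequence above, here is a deliberately simplified toy model of the cancellation (plain signed ref_mod arithmetic; the real btrfs_merge_delayed_refs() additionally compares the ref type and payload and refuses to merge across live tree_mod_seq barriers):

#include <stdio.h>

struct toy_ref { int ref_mod; int alive; };	/* +1 add, -1 drop */

static void merge_refs(struct toy_ref *r, int n)
{
	for (int i = 0; i < n; i++) {
		if (!r[i].alive)
			continue;
		for (int j = i + 1; j < n; j++) {
			if (!r[j].alive)
				continue;
			r[i].ref_mod += r[j].ref_mod;	/* fold j into i */
			r[j].alive = 0;
			if (r[i].ref_mod == 0) {	/* fully cancelled */
				r[i].alive = 0;
				break;
			}
		}
	}
}

int main(void)
{
	/* Ref1 (+1), Ref2 (-1), Ref3 (+1) from the sequence above */
	struct toy_ref refs[] = { {1, 1}, {-1, 1}, {1, 1} };

	merge_refs(refs, 3);
	for (int i = 0; i < 3; i++)
		if (refs[i].alive)
			printf("surviving ref%d: ref_mod=%+d\n",
			       i + 1, refs[i].ref_mod);
	return 0;
}

With Ref1 (+1) and Ref2 (-1) cancelled, only Ref3 survives and the inline extent backref is inserted exactly once.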
Fixes: c6fc24549960 ("btrfs: delayed-ref: Use list to replace the ref_root in ref_head.") Reported-by: Peter Becker Reported-by: Stéphane Lesimple Tested-by: Stéphane Lesimple Reported-by: Malte Schröder Reported-by: Derek Dongray Reported-by: Erkki Seppala Cc: stable@vger.kernel.org # 4.2+ Signed-off-by: Filipe Manana Reviewed-by: Liu Bo --- fs/btrfs/delayed-ref.c | 113 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/extent-tree.c | 14 ++++++ 2 files changed, 127 insertions(+) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index bd9b63b..2f41580 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -197,6 +197,119 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, trans->delayed_ref_updates--; } +static bool merge_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head, + struct btrfs_delayed_ref_node *ref, + u64 seq) +{ + struct btrfs_delayed_ref_node *next; + bool done = false; + + next = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node, + list); + while (!done && &next->list != &head->ref_list) { + int mod; + struct btrfs_delayed_ref_node *next2; + + next2 = list_next_entry(next, list); + + if (next == ref) + goto next; + + if (seq && next->seq >= seq) + goto next; + + if (next->type != ref->type || next->no_quota != ref->no_quota) + goto next; + + if ((ref->type == BTRFS_TREE_BLOCK_REF_KEY || + ref->type == BTRFS_SHARED_BLOCK_REF_KEY) && + comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref), + btrfs_delayed_node_to_tree_ref(next), + ref->type)) + goto next; + if ((ref->type == BTRFS_EXTENT_DATA_REF_KEY || + ref->type == BTRFS_SHARED_DATA_REF_KEY) && + comp_data_refs(btrfs_delayed_node_to_data_ref(ref), + btrfs_delayed_node_to_data_ref(next))) + goto next; + + if (ref->action == next->action) { + mod = next->ref_mod; + } else { + if (ref->ref_mod < next->ref_mod) { + swap(ref, next); + done = true; + } + mod = -next->ref_mod; + } + + drop_delayed_ref(trans, delayed_refs, head, next); + ref->ref_mod += mod; + if (ref->ref_mod == 0) { + drop_delayed_ref(trans, delayed_refs, head, ref); + done = true; + } else { + /* + * Can't have multiples of the same ref on a tree block. + */ + WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY || + ref->type == BTRFS_SHARED_BLOCK_REF_KEY); + } +next: + next = next2; + } + + return done; +} + +void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head) +{ + struct btrfs_delayed_ref_node *ref; + u64 seq = 0; + + assert_spin_locked(&head->lock); + + if (list_empty(&head->ref_list)) + return; + + /* We don't have too many refs to merge for data. 
*/ + if (head->is_data) + return; + + spin_lock(&fs_info->tree_mod_seq_lock); + if (!list_empty(&fs_info->tree_mod_seq_list)) { + struct seq_list *elem; + + elem = list_first_entry(&fs_info->tree_mod_seq_list, + struct seq_list, list); + seq = elem->seq; + } + spin_unlock(&fs_info->tree_mod_seq_lock); + + ref = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node, + list); + while (&ref->list != &head->ref_list) { + if (seq && ref->seq >= seq) + goto next; + + if (merge_ref(trans, delayed_refs, head, ref, seq)) { + if (list_empty(&head->ref_list)) + break; + ref = list_first_entry(&head->ref_list, + struct btrfs_delayed_ref_node, + list); + continue; + } +next: + ref = list_next_entry(ref, list); + } +} + int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, u64 seq) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 92fdbc6..c0f30f5 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2502,7 +2502,21 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, } } + /* + * We need to try and merge add/drops of the same ref since we + * can run into issues with relocate dropping the implicit ref + * and then it being added back again before the drop can + * finish. If we merged anything we need to re-loop so we can + * get a good ref. + * Or we can get node references of the same type that weren't + * merged when created due to bumps in the tree mod seq, and + * we need to merge them to prevent adding an inline extent + * backref before dropping it (triggering a BUG_ON at + * insert_inline_extent_backref()). + */ spin_lock(&locked_ref->lock); + btrfs_merge_delayed_refs(trans, fs_info, delayed_refs, + locked_ref); /* * locked_ref is the head node, so we have to go one -- cgit v1.1 From b06c4bf5c874a57254b197f53ddf588e7a24a2bf Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 23 Oct 2015 07:52:54 +0100 Subject: Btrfs: fix regression running delayed references when using qgroups MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the kernel 4.2 merge window we had a big changes to the implementation of delayed references and qgroups which made the no_quota field of delayed references not used anymore. More specifically the no_quota field is not used anymore as of: commit 0ed4792af0e8 ("btrfs: qgroup: Switch to new extent-oriented qgroup mechanism.") Leaving the no_quota field actually prevents delayed references from getting merged, which in turn cause the following BUG_ON(), at fs/btrfs/extent-tree.c, to be hit when qgroups are enabled: static int run_delayed_tree_ref(...) { (...) BUG_ON(node->ref_mod != 1); (...) } This happens on a scenario like the following: 1) Ref1 bytenr X, action = BTRFS_ADD_DELAYED_REF, no_quota = 1, added. 2) Ref2 bytenr X, action = BTRFS_DROP_DELAYED_REF, no_quota = 0, added. It's not merged with Ref1 because Ref1->no_quota != Ref2->no_quota. 3) Ref3 bytenr X, action = BTRFS_ADD_DELAYED_REF, no_quota = 1, added. It's not merged with the reference at the tail of the list of refs for bytenr X because the reference at the tail, Ref2 is incompatible due to Ref2->no_quota != Ref3->no_quota. 4) Ref4 bytenr X, action = BTRFS_DROP_DELAYED_REF, no_quota = 0, added. It's not merged with the reference at the tail of the list of refs for bytenr X because the reference at the tail, Ref3 is incompatible due to Ref3->no_quota != Ref4->no_quota. 
5) We run delayed references, triggering the merging of delayed references, through __btrfs_run_delayed_refs() -> btrfs_merge_delayed_refs(). 6) Ref1 and Ref3 are merged as Ref1->no_quota == Ref3->no_quota and all other conditions are satisfied too. So Ref1 gets a ref_mod value of 2. 7) Ref2 and Ref4 are merged as Ref2->no_quota == Ref4->no_quota and all other conditions are satisfied too. So Ref2 gets a ref_mod value of 2. 8) Ref1 and Ref2 aren't merged, because they have different values for their no_quota field. 9) Delayed reference Ref1 is picked for running (select_delayed_ref() always prefers references with an action == BTRFS_ADD_DELAYED_REF). So run_delayed_tree_ref() is called for Ref1, which triggers the BUG_ON because Ref1->ref_mod != 1 (it equals 2). So fix this by removing the no_quota field, as it's not used anymore as of commit 0ed4792af0e8 ("btrfs: qgroup: Switch to new extent-oriented qgroup mechanism."). The use of no_quota was also buggy in at least two places: 1) At delayed-refs.c:btrfs_add_delayed_tree_ref() - we were setting no_quota to 0 instead of 1 when the following condition was true: is_fstree(ref_root) || !fs_info->quota_enabled 2) At extent-tree.c:__btrfs_inc_extent_ref() - we were attempting to reset a node's no_quota when the condition "!is_fstree(root_objectid) || !root->fs_info->quota_enabled" was true, but we did it only in an unused local stack variable, that is, we never reset the no_quota value in the node itself. This fixes the remainder of problems several people have been having when running delayed references, mostly while a balance is running in parallel, on a 4.2+ kernel. Very special thanks to Stéphane Lesimple for helping debug this issue and testing this fix on his multi-terabyte filesystem (which took more than one day to balance alone, plus fsck, etc). Also, this fixes a deadlock issue when using the clone ioctl with qgroups enabled, as reported by Elias Probst in the mailing list. The deadlock happens because after calling btrfs_insert_empty_item we have our path holding a write lock on a leaf of the fs/subvol tree, and then before releasing the path we called check_ref(), which did backref walking when qgroups are enabled and tried to read lock the same leaf. The trace for this case is the following: INFO: task systemd-nspawn:6095 blocked for more than 120 seconds. (...) Call Trace: [] schedule+0x74/0x83 [] btrfs_tree_read_lock+0xc0/0xea [] ? wait_woken+0x74/0x74 [] btrfs_search_old_slot+0x51a/0x810 [] btrfs_next_old_leaf+0xdf/0x3ce [] ? ulist_add_merge+0x1b/0x127 [] __resolve_indirect_refs+0x62a/0x667 [] ? btrfs_clear_lock_blocking_rw+0x78/0xbe [] find_parent_nodes+0xaf3/0xfc6 [] __btrfs_find_all_roots+0x92/0xf0 [] btrfs_find_all_roots+0x45/0x65 [] ? btrfs_get_tree_mod_seq+0x2b/0x88 [] check_ref+0x64/0xc4 [] btrfs_clone+0x66e/0xb5d [] btrfs_ioctl_clone+0x48f/0x5bb [] ? native_sched_clock+0x28/0x77 [] btrfs_ioctl+0xabc/0x25cb (...) The problem goes away by eliminating check_ref(), which is no longer needed as its purpose was to get a value for the no_quota field of a delayed reference (this patch removes the no_quota field as mentioned earlier).
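A toy continuation of the model shown for the previous fix illustrates how a stale discriminator breaks the cancellation (only no_quota is compared here for brevity; the real merge also matches the ref type and payload):

#include <stdio.h>

struct toy_ref { int ref_mod; int no_quota; int alive; };

static int can_merge(const struct toy_ref *a, const struct toy_ref *b)
{
	return a->no_quota == b->no_quota;	/* the spurious extra test */
}

int main(void)
{
	/* Ref1..Ref4 from the sequence above */
	struct toy_ref r[] = { {1, 1, 1}, {-1, 0, 1}, {1, 1, 1}, {-1, 0, 1} };

	for (int i = 0; i < 4; i++)
		for (int j = i + 1; j < 4; j++)
			if (r[i].alive && r[j].alive && can_merge(&r[i], &r[j])) {
				r[i].ref_mod += r[j].ref_mod;	/* fold j into i */
				r[j].alive = 0;
			}

	/* the add/drop pairs never cancel: ref_mod 2 and -2 survive,
	 * and the +2 entry trips BUG_ON(node->ref_mod != 1) */
	for (int i = 0; i < 4; i++)
		if (r[i].alive)
			printf("surviving ref: ref_mod=%+d no_quota=%d\n",
			       r[i].ref_mod, r[i].no_quota);
	return 0;
}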
Reported-by: Stéphane Lesimple Tested-by: Stéphane Lesimple Reported-by: Elias Probst Reported-by: Peter Becker Reported-by: Malte Schröder Reported-by: Derek Dongray Reported-by: Erkki Seppala Cc: stable@vger.kernel.org # 4.2+ Signed-off-by: Filipe Manana Reviewed-by: Qu Wenruo --- fs/btrfs/ctree.h | 4 ++-- fs/btrfs/delayed-ref.c | 28 +++++++---------------- fs/btrfs/delayed-ref.h | 7 ++---- fs/btrfs/extent-tree.c | 45 +++++++++++++----------------------- fs/btrfs/file.c | 10 ++++---- fs/btrfs/inode.c | 4 ++-- fs/btrfs/ioctl.c | 62 +------------------------------------------------- fs/btrfs/relocation.c | 16 ++++++------- fs/btrfs/tree-log.c | 2 +- 9 files changed, 44 insertions(+), 134 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index bc3c711..3fa3c3b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3422,7 +3422,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, - u64 owner, u64 offset, int no_quota); + u64 owner, u64 offset); int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len, int delalloc); @@ -3435,7 +3435,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset, int no_quota); + u64 root_objectid, u64 owner, u64 offset); int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 2f41580..1c3588a 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -220,7 +220,7 @@ static bool merge_ref(struct btrfs_trans_handle *trans, if (seq && next->seq >= seq) goto next; - if (next->type != ref->type || next->no_quota != ref->no_quota) + if (next->type != ref->type) goto next; if ((ref->type == BTRFS_TREE_BLOCK_REF_KEY || @@ -405,8 +405,7 @@ add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans, exist = list_entry(href->ref_list.prev, struct btrfs_delayed_ref_node, list); /* No need to compare bytenr nor is_head */ - if (exist->type != ref->type || exist->no_quota != ref->no_quota || - exist->seq != ref->seq) + if (exist->type != ref->type || exist->seq != ref->seq) goto add_tail; if ((exist->type == BTRFS_TREE_BLOCK_REF_KEY || @@ -639,7 +638,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head *head_ref, struct btrfs_delayed_ref_node *ref, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, - int action, int no_quota) + int action) { struct btrfs_delayed_tree_ref *full_ref; struct btrfs_delayed_ref_root *delayed_refs; @@ -661,7 +660,6 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, ref->action = action; ref->is_head = 0; ref->in_tree = 1; - ref->no_quota = no_quota; ref->seq = seq; full_ref = btrfs_delayed_node_to_tree_ref(ref); @@ -694,7 +692,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head *head_ref, struct btrfs_delayed_ref_node *ref, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, u64 owner, - u64 offset, int action, int no_quota) + u64 offset, int action) { struct btrfs_delayed_data_ref *full_ref; struct btrfs_delayed_ref_root *delayed_refs; @@ -717,7 +715,6 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, ref->action = action; ref->is_head = 0; ref->in_tree = 1; - ref->no_quota = 
no_quota; ref->seq = seq; full_ref = btrfs_delayed_node_to_data_ref(ref); @@ -748,17 +745,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, - struct btrfs_delayed_extent_op *extent_op, - int no_quota) + struct btrfs_delayed_extent_op *extent_op) { struct btrfs_delayed_tree_ref *ref; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_qgroup_extent_record *record = NULL; - if (!is_fstree(ref_root) || !fs_info->quota_enabled) - no_quota = 0; - BUG_ON(extent_op && extent_op->is_data); ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS); if (!ref) @@ -787,8 +780,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, bytenr, num_bytes, action, 0); add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr, - num_bytes, parent, ref_root, level, action, - no_quota); + num_bytes, parent, ref_root, level, action); spin_unlock(&delayed_refs->lock); return 0; @@ -809,17 +801,13 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, u64 owner, u64 offset, int action, - struct btrfs_delayed_extent_op *extent_op, - int no_quota) + struct btrfs_delayed_extent_op *extent_op) { struct btrfs_delayed_data_ref *ref; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_qgroup_extent_record *record = NULL; - if (!is_fstree(ref_root) || !fs_info->quota_enabled) - no_quota = 0; - BUG_ON(extent_op && !extent_op->is_data); ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS); if (!ref) @@ -855,7 +843,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr, num_bytes, parent, ref_root, owner, offset, - action, no_quota); + action); spin_unlock(&delayed_refs->lock); return 0; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index d4c41e2..f9cf234 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -68,7 +68,6 @@ struct btrfs_delayed_ref_node { unsigned int action:8; unsigned int type:8; - unsigned int no_quota:1; /* is this node still in the rbtree? 
*/ unsigned int is_head:1; unsigned int in_tree:1; @@ -244,15 +243,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, - struct btrfs_delayed_extent_op *extent_op, - int no_quota); + struct btrfs_delayed_extent_op *extent_op); int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, u64 owner, u64 offset, int action, - struct btrfs_delayed_extent_op *extent_op, - int no_quota); + struct btrfs_delayed_extent_op *extent_op); int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 ref_root, u64 bytenr, u64 num_bytes); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c0f30f5..c1f8c7e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -95,8 +95,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, u64 flags, struct btrfs_disk_key *key, - int level, struct btrfs_key *ins, - int no_quota); + int level, struct btrfs_key *ins); static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 flags, int force); @@ -2073,8 +2072,7 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset, - int no_quota) + u64 root_objectid, u64 owner, u64 offset) { int ret; struct btrfs_fs_info *fs_info = root->fs_info; @@ -2086,12 +2084,12 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, (int)owner, - BTRFS_ADD_DELAYED_REF, NULL, no_quota); + BTRFS_ADD_DELAYED_REF, NULL); } else { ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, owner, offset, - BTRFS_ADD_DELAYED_REF, NULL, no_quota); + BTRFS_ADD_DELAYED_REF, NULL); } return ret; } @@ -2112,15 +2110,11 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, u64 num_bytes = node->num_bytes; u64 refs; int ret; - int no_quota = node->no_quota; path = btrfs_alloc_path(); if (!path) return -ENOMEM; - if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled) - no_quota = 1; - path->reada = 1; path->leave_spinning = 1; /* this will setup the path even if it fails to insert the back ref */ @@ -2355,8 +2349,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, parent, ref_root, extent_op->flags_to_set, &extent_op->key, - ref->level, &ins, - node->no_quota); + ref->level, &ins); } else if (node->action == BTRFS_ADD_DELAYED_REF) { ret = __btrfs_inc_extent_ref(trans, root, node, parent, ref_root, @@ -3192,7 +3185,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, int level; int ret = 0; int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, - u64, u64, u64, u64, u64, u64, int); + u64, u64, u64, u64, u64, u64); if (btrfs_test_is_dummy_root(root)) @@ -3233,15 +3226,14 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, key.offset -= btrfs_file_extent_offset(buf, fi); ret = process_func(trans, root, bytenr, num_bytes, parent, ref_root, key.objectid, - key.offset, 1); + key.offset); if (ret) goto fail; } else { bytenr = btrfs_node_blockptr(buf, i); num_bytes = root->nodesize; ret = 
process_func(trans, root, bytenr, num_bytes, - parent, ref_root, level - 1, 0, - 1); + parent, ref_root, level - 1, 0); if (ret) goto fail; } @@ -6431,7 +6423,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, int extent_slot = 0; int found_extent = 0; int num_to_del = 1; - int no_quota = node->no_quota; u32 item_size; u64 refs; u64 bytenr = node->bytenr; @@ -6440,9 +6431,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, bool skinny_metadata = btrfs_fs_incompat(root->fs_info, SKINNY_METADATA); - if (!info->quota_enabled || !is_fstree(root_objectid)) - no_quota = 1; - path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -6768,7 +6756,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, buf->start, buf->len, parent, root->root_key.objectid, btrfs_header_level(buf), - BTRFS_DROP_DELAYED_REF, NULL, 0); + BTRFS_DROP_DELAYED_REF, NULL); BUG_ON(ret); /* -ENOMEM */ } @@ -6816,7 +6804,7 @@ out: /* Can return -ENOMEM */ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, - u64 owner, u64 offset, int no_quota) + u64 owner, u64 offset) { int ret; struct btrfs_fs_info *fs_info = root->fs_info; @@ -6839,13 +6827,13 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, (int)owner, - BTRFS_DROP_DELAYED_REF, NULL, no_quota); + BTRFS_DROP_DELAYED_REF, NULL); } else { ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, owner, offset, BTRFS_DROP_DELAYED_REF, - NULL, no_quota); + NULL); } return ret; } @@ -7690,8 +7678,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, u64 flags, struct btrfs_disk_key *key, - int level, struct btrfs_key *ins, - int no_quota) + int level, struct btrfs_key *ins) { int ret; struct btrfs_fs_info *fs_info = root->fs_info; @@ -7781,7 +7768,7 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid, ins->offset, 0, root_objectid, owner, offset, - BTRFS_ADD_DELAYED_EXTENT, NULL, 0); + BTRFS_ADD_DELAYED_EXTENT, NULL); return ret; } @@ -7995,7 +7982,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, ins.objectid, ins.offset, parent, root_objectid, level, BTRFS_ADD_DELAYED_EXTENT, - extent_op, 0); + extent_op); if (ret) goto out_free_delayed; } @@ -8544,7 +8531,7 @@ skip: } } ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, - root->root_key.objectid, level - 1, 0, 0); + root->root_key.objectid, level - 1, 0); BUG_ON(ret); /* -ENOMEM */ } btrfs_tree_unlock(next); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 1243205..381be79 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -847,7 +847,7 @@ next_slot: disk_bytenr, num_bytes, 0, root->root_key.objectid, new_key.objectid, - start - extent_offset, 1); + start - extent_offset); BUG_ON(ret); /* -ENOMEM */ } key.offset = start; @@ -925,7 +925,7 @@ delete_extent_item: disk_bytenr, num_bytes, 0, root->root_key.objectid, key.objectid, key.offset - - extent_offset, 0); + extent_offset); BUG_ON(ret); /* -ENOMEM */ inode_sub_bytes(inode, extent_end - key.offset); @@ -1204,7 +1204,7 @@ again: ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - ino, orig_offset, 1); + ino, orig_offset); 
BUG_ON(ret); /* -ENOMEM */ if (split == start) { @@ -1231,7 +1231,7 @@ again: del_nr++; ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - ino, orig_offset, 0); + ino, orig_offset); BUG_ON(ret); /* -ENOMEM */ } other_start = 0; @@ -1248,7 +1248,7 @@ again: del_nr++; ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - ino, orig_offset, 0); + ino, orig_offset); BUG_ON(ret); /* -ENOMEM */ } if (del_nr == 0) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a018e47..6f030c2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2595,7 +2595,7 @@ again: ret = btrfs_inc_extent_ref(trans, root, new->bytenr, new->disk_len, 0, backref->root_id, backref->inum, - new->file_pos, 0); /* start - extent_offset */ + new->file_pos); /* start - extent_offset */ if (ret) { btrfs_abort_transaction(trans, root, ret); goto out_free_path; @@ -4541,7 +4541,7 @@ delete: ret = btrfs_free_extent(trans, root, extent_start, extent_num_bytes, 0, btrfs_header_owner(leaf), - ino, extent_offset, 0); + ino, extent_offset); BUG_ON(ret); if (btrfs_should_throttle_delayed_refs(trans, root)) btrfs_async_run_delayed_refs(root, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7ed033a..4df0f2b 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3206,41 +3206,6 @@ out: return ret; } -/* Helper to check and see if this root currently has a ref on the given disk - * bytenr. If it does then we need to update the quota for this root. This - * doesn't do anything if quotas aren't enabled. - */ -static int check_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 disko) -{ - struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem); - struct ulist *roots; - struct ulist_iterator uiter; - struct ulist_node *root_node = NULL; - int ret; - - if (!root->fs_info->quota_enabled) - return 1; - - btrfs_get_tree_mod_seq(root->fs_info, &tree_mod_seq_elem); - ret = btrfs_find_all_roots(trans, root->fs_info, disko, - tree_mod_seq_elem.seq, &roots); - if (ret < 0) - goto out; - ret = 0; - ULIST_ITER_INIT(&uiter); - while ((root_node = ulist_next(roots, &uiter))) { - if (root_node->val == root->objectid) { - ret = 1; - break; - } - } - ulist_free(roots); -out: - btrfs_put_tree_mod_seq(root->fs_info, &tree_mod_seq_elem); - return ret; -} - static int clone_finish_inode_update(struct btrfs_trans_handle *trans, struct inode *inode, u64 endoff, @@ -3499,9 +3464,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, u32 nritems; int slot; int ret; - int no_quota; const u64 len = olen_aligned; - u64 last_disko = 0; u64 last_dest_end = destoff; ret = -ENOMEM; @@ -3547,7 +3510,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode, nritems = btrfs_header_nritems(path->nodes[0]); process_slot: - no_quota = 1; if (path->slots[0] >= nritems) { ret = btrfs_next_leaf(BTRFS_I(src)->root, path); if (ret < 0) @@ -3699,35 +3661,13 @@ process_slot: btrfs_set_file_extent_num_bytes(leaf, extent, datal); - /* - * We need to look up the roots that point at - * this bytenr and see if the new root does. If - * it does not we need to make sure we update - * quotas appropriately. 
- */ - if (disko && root != BTRFS_I(src)->root && - disko != last_disko) { - no_quota = check_ref(trans, root, - disko); - if (no_quota < 0) { - btrfs_abort_transaction(trans, - root, - ret); - btrfs_end_transaction(trans, - root); - ret = no_quota; - goto out; - } - } - if (disko) { inode_add_bytes(inode, datal); ret = btrfs_inc_extent_ref(trans, root, disko, diskl, 0, root->root_key.objectid, btrfs_ino(inode), - new_key.offset - datao, - no_quota); + new_key.offset - datao); if (ret) { btrfs_abort_transaction(trans, root, diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index a7dc456..b4ca545 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1716,7 +1716,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, ret = btrfs_inc_extent_ref(trans, root, new_bytenr, num_bytes, parent, btrfs_header_owner(leaf), - key.objectid, key.offset, 1); + key.objectid, key.offset); if (ret) { btrfs_abort_transaction(trans, root, ret); break; @@ -1724,7 +1724,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, ret = btrfs_free_extent(trans, root, bytenr, num_bytes, parent, btrfs_header_owner(leaf), - key.objectid, key.offset, 1); + key.objectid, key.offset); if (ret) { btrfs_abort_transaction(trans, root, ret); break; @@ -1900,23 +1900,21 @@ again: ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize, path->nodes[level]->start, - src->root_key.objectid, level - 1, 0, - 1); + src->root_key.objectid, level - 1, 0); BUG_ON(ret); ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize, 0, dest->root_key.objectid, level - 1, - 0, 1); + 0); BUG_ON(ret); ret = btrfs_free_extent(trans, src, new_bytenr, blocksize, path->nodes[level]->start, - src->root_key.objectid, level - 1, 0, - 1); + src->root_key.objectid, level - 1, 0); BUG_ON(ret); ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize, 0, dest->root_key.objectid, level - 1, - 0, 1); + 0); BUG_ON(ret); btrfs_unlock_up_safe(path, 0); @@ -2745,7 +2743,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, node->eb->start, blocksize, upper->eb->start, btrfs_header_owner(upper->eb), - node->level, 0, 1); + node->level, 0); BUG_ON(ret); ret = btrfs_drop_subtree(trans, root, eb, upper->eb); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 1fffe88..323e12c 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -693,7 +693,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ret = btrfs_inc_extent_ref(trans, root, ins.objectid, ins.offset, 0, root->root_key.objectid, - key->objectid, offset, 0); + key->objectid, offset); if (ret) goto out; } else { -- cgit v1.1 From 849ef9286f30c88113906dc35f44a499c0cb385d Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 12 Oct 2015 16:55:54 +0200 Subject: btrfs: check unsupported filters in balance arguments We don't verify that all the balance filter arguments supplemented by the flags are actually known to the kernel. Thus we let it silently pass and do nothing. At the moment this means only the 'limit' filter, but we're going to add a few more soon so it's better to have that fixed. Also in older stable kernels so that it works with newer userspace tools. 
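For illustration only, a hypothetical userspace sketch (not part of this patch; the mount point path and the bogus flag bit are made up) of what the new check rejects: an unknown flag bit in the balance ioctl arguments now fails with EINVAL instead of silently passing.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

int main(void)
{
	struct btrfs_ioctl_balance_args args;
	int fd = open("/mnt", O_RDONLY);	/* filesystem root, made-up path */

	if (fd < 0)
		return 1;
	memset(&args, 0, sizeof(args));
	args.flags = (1ULL << 0) |		/* BTRFS_BALANCE_DATA */
		     (1ULL << 62);		/* a filter bit unknown to the kernel */
	if (ioctl(fd, BTRFS_IOC_BALANCE_V2, &args) < 0)
		perror("balance");		/* expect EINVAL on a patched kernel */
	close(fd);
	return 0;
}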
Cc: stable@vger.kernel.org # 3.16+ Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 5 +++++ fs/btrfs/volumes.h | 8 ++++++++ 2 files changed, 13 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 4df0f2b..7375cf2 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4691,6 +4691,11 @@ locked: bctl->flags |= BTRFS_BALANCE_TYPE_MASK; } + if (bctl->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) { + ret = -EINVAL; + goto out_bargs; + } + do_balance: /* * Ownership of bctl and mutually_exclusive_operation_running diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 99c208f..b8e64ea 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -381,6 +381,14 @@ struct map_lookup { #define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4) #define BTRFS_BALANCE_ARGS_LIMIT (1ULL << 5) +#define BTRFS_BALANCE_ARGS_MASK \ + (BTRFS_BALANCE_ARGS_PROFILES | \ + BTRFS_BALANCE_ARGS_USAGE | \ + BTRFS_BALANCE_ARGS_DEVID | \ + BTRFS_BALANCE_ARGS_DRANGE | \ + BTRFS_BALANCE_ARGS_VRANGE | \ + BTRFS_BALANCE_ARGS_LIMIT) + /* * Profile changing flags. When SOFT is set we won't relocate chunk if * it already has the target profile (even though it may be -- cgit v1.1 From 2849a854224487bc578b73b64422c3cb3ef93ff5 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 13 Oct 2015 14:06:48 -0400 Subject: btrfs: fix use after free iterating extrefs The code for btrfs inode-resolve has never worked properly for files with enough hard links to trigger extrefs. It was trying to get the leaf out of a path after freeing the path: btrfs_release_path(path); leaf = path->nodes[0]; item_size = btrfs_item_size_nr(leaf, slot); The fix here is to use the extent buffer we cloned just a little higher up to avoid deadlocks caused by using the leaf in the path. Signed-off-by: Chris Mason cc: stable@vger.kernel.org # v3.7+ cc: Mark Fasheh Reviewed-by: Filipe Manana Reviewed-by: Mark Fasheh Signed-off-by: Chris Mason --- fs/btrfs/backref.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 2adc152a..6dcdb2e 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1834,7 +1834,6 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, int found = 0; struct extent_buffer *eb; struct btrfs_inode_extref *extref; - struct extent_buffer *leaf; u32 item_size; u32 cur_offset; unsigned long ptr; @@ -1862,9 +1861,8 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); btrfs_release_path(path); - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, slot); - ptr = btrfs_item_ptr_offset(leaf, slot); + item_size = btrfs_item_size_nr(eb, slot); + ptr = btrfs_item_ptr_offset(eb, slot); cur_offset = 0; while (cur_offset < item_size) { @@ -1878,7 +1876,7 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, if (ret) break; - cur_offset += btrfs_inode_extref_name_len(leaf, extref); + cur_offset += btrfs_inode_extref_name_len(eb, extref); cur_offset += sizeof(*extref); } btrfs_tree_read_unlock_blocking(eb); -- cgit v1.1 From 12907fc79818a62a2478f84f7795afa774bf7f9c Mon Sep 17 00:00:00 2001 From: David Sterba Date: Sat, 10 Oct 2015 17:16:50 +0200 Subject: btrfs: extend balance filter limit to take minimum and maximum The 'limit' filter is underdesigned, it should have been a range for [min,max], with some relaxed semantics when one of the bounds is missing. 
Besides that, using a full u64 for a single value is a waste of bytes. Let's fix both by extending the use of the u64 bytes for the [min,max] range. This can be done in a backward compatible way, the range will be interpreted only if the appropriate flag is set (BTRFS_BALANCE_ARGS_LIMIT_RANGE). Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 14 ++++++++++++-- fs/btrfs/volumes.c | 42 ++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/volumes.h | 1 + include/uapi/linux/btrfs.h | 13 ++++++++++++- 4 files changed, 67 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3fa3c3b..460cd69 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -846,8 +846,18 @@ struct btrfs_disk_balance_args { /* BTRFS_BALANCE_ARGS_* */ __le64 flags; - /* BTRFS_BALANCE_ARGS_LIMIT value */ - __le64 limit; + /* + * BTRFS_BALANCE_ARGS_LIMIT with value 'limit' + * BTRFS_BALANCE_ARGS_LIMIT_RANGE - the extended version can use minimum + * and maximum + */ + union { + __le64 limit; + struct { + __le32 limit_min; + __le32 limit_max; + }; + }; __le64 unused[7]; } __attribute__ ((__packed__)); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f86d838..e2fdd2e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3300,6 +3300,16 @@ static int should_balance_chunk(struct btrfs_root *root, return 0; else bargs->limit--; + } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) { + /* + * Same logic as the 'limit' filter; the minimum cannot be + * determined here because we do not have the global information + * about the count of all chunks that satisfy the filters. + */ + if (bargs->limit_max == 0) + return 0; + else + bargs->limit_max--; } return 1; @@ -3314,6 +3324,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) struct btrfs_device *device; u64 old_size; u64 size_to_free; + u64 chunk_type; struct btrfs_chunk *chunk; struct btrfs_path *path; struct btrfs_key key; @@ -3324,9 +3335,13 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) int ret; int enospc_errors = 0; bool counting = true; + /* The single value limit and min/max limits use the same bytes in the union */ u64 limit_data = bctl->data.limit; u64 limit_meta = bctl->meta.limit; u64 limit_sys = bctl->sys.limit; + u32 count_data = 0; + u32 count_meta = 0; + u32 count_sys = 0; /* step one make some room on all the devices */ devices = &fs_info->fs_devices->devices; @@ -3367,6 +3382,10 @@ spin_unlock(&fs_info->balance_lock); again: if (!counting) { + /* + * The single value limit and min/max limits use the same bytes + * in the union, so restoring one restores the other + */ bctl->data.limit = limit_data; bctl->meta.limit = limit_meta; bctl->sys.limit = limit_sys; @@ -3414,6 +3433,7 @@ again: } chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); + chunk_type = btrfs_chunk_type(leaf, chunk); if (!counting) { spin_lock(&fs_info->balance_lock); @@ -3434,6 +3454,28 @@ again: spin_lock(&fs_info->balance_lock); bctl->stat.expected++; spin_unlock(&fs_info->balance_lock); + + if (chunk_type & BTRFS_BLOCK_GROUP_DATA) + count_data++; + else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) + count_sys++; + else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) + count_meta++; + + goto loop; + } + + /* + * Apply limit_min filter, no need to check if the LIMITS + * filter is used, limit_min is 0 by default + */ + if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) && + count_data < bctl->data.limit_min) + || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) && + count_meta <
bctl->meta.limit_min) || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) && count_sys < bctl->sys.limit_min)) { mutex_unlock(&fs_info->delete_unused_bgs_mutex); goto loop; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index b8e64ea..628d4e1 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -380,6 +380,7 @@ struct map_lookup { #define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3) #define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4) #define BTRFS_BALANCE_ARGS_LIMIT (1ULL << 5) +#define BTRFS_BALANCE_ARGS_LIMIT_RANGE (1ULL << 6) #define BTRFS_BALANCE_ARGS_MASK \ (BTRFS_BALANCE_ARGS_PROFILES | \ diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index b6dec05..11f1310 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -217,7 +217,18 @@ struct btrfs_balance_args { __u64 flags; - __u64 limit; /* limit number of processed chunks */ + /* + * BTRFS_BALANCE_ARGS_LIMIT with value 'limit' + * BTRFS_BALANCE_ARGS_LIMIT_RANGE - the extended version can use minimum + * and maximum + */ + union { + __u64 limit; /* limit number of processed chunks */ + struct { + __u32 limit_min; + __u32 limit_max; + }; + }; __u64 unused[7]; } __attribute__ ((__packed__)); -- cgit v1.1 From dee32d0ac3719ef8d640efaf0884111df444730f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gabr=C3=ADel=20Arth=C3=BAr=20P=C3=A9tursson?= Date: Mon, 28 Sep 2015 22:32:41 +0000 Subject: btrfs: add balance filter for stripes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Balance block groups which have the given number of stripes, defined by a range min..max. This is useful to selectively rebalance only chunks that do not span enough devices, applies to RAID0/10/5/6. Signed-off-by: Gabríel Arthúr Pétursson [ renamed bargs members, added to the UAPI, wrote the changelog ] Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 9 ++++++++- fs/btrfs/volumes.c | 19 +++++++++++++++++++ fs/btrfs/volumes.h | 1 + include/uapi/linux/btrfs.h | 10 +++++++++- 4 files changed, 37 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 460cd69..8ccbf43 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -859,7 +859,14 @@ struct btrfs_disk_balance_args { }; }; - __le64 unused[7]; + /* + * Process chunks that cross stripes_min..stripes_max devices, + * BTRFS_BALANCE_ARGS_STRIPES_RANGE + */ + __le32 stripes_min; + __le32 stripes_max; + + __le64 unused[6]; } __attribute__ ((__packed__)); /* diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e2fdd2e..9f5d167 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3220,6 +3220,19 @@ static int chunk_vrange_filter(struct extent_buffer *leaf, return 1; } +static int chunk_stripes_range_filter(struct extent_buffer *leaf, + struct btrfs_chunk *chunk, + struct btrfs_balance_args *bargs) +{ + int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + + if (bargs->stripes_min <= num_stripes + && num_stripes <= bargs->stripes_max) + return 0; + + return 1; +} + static int chunk_soft_convert_filter(u64 chunk_type, struct btrfs_balance_args *bargs) { @@ -3286,6 +3299,12 @@ static int should_balance_chunk(struct btrfs_root *root, return 0; } + /* stripes filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) && + chunk_stripes_range_filter(leaf, chunk, bargs)) { + return 0; + } + /* soft profile changing mode */ if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && chunk_soft_convert_filter(chunk_type, bargs)) { diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index
628d4e1..a5bf224 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -381,6 +381,7 @@ struct map_lookup { #define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4) #define BTRFS_BALANCE_ARGS_LIMIT (1ULL << 5) #define BTRFS_BALANCE_ARGS_LIMIT_RANGE (1ULL << 6) +#define BTRFS_BALANCE_ARGS_STRIPES_RANGE (1ULL << 7) #define BTRFS_BALANCE_ARGS_MASK \ (BTRFS_BALANCE_ARGS_PROFILES | \ diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 11f1310..17a9470 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -229,7 +229,15 @@ struct btrfs_balance_args { __u32 limit_max; }; }; - __u64 unused[7]; + + /* + * Process chunks that cross stripes_min..stripes_max devices, + * BTRFS_BALANCE_ARGS_STRIPES_RANGE + */ + __u32 stripes_min; + __u32 stripes_max; + + __u64 unused[6]; } __attribute__ ((__packed__)); /* report balance progress to userspace */ -- cgit v1.1 From bc3094673f22d00794a8606200788e411b4ec80d Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 20 Oct 2015 18:22:13 +0200 Subject: btrfs: extend balance filter usage to take minimum and maximum Similar to the 'limit' filter, we can enhance the 'usage' filter to accept a range. The change is backward compatible, the range is applied only in connection with the BTRFS_BALANCE_ARGS_USAGE_RANGE flag. We don't have a use case yet, the current syntax has been sufficient. The enhancement should provide parity with other range-like filters. Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 14 ++++++++++++-- fs/btrfs/volumes.c | 41 ++++++++++++++++++++++++++++++++++++++++- fs/btrfs/volumes.h | 1 + include/uapi/linux/btrfs.h | 8 +++++++- 4 files changed, 60 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8ccbf43..4001585 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -823,8 +823,18 @@ struct btrfs_disk_balance_args { */ __le64 profiles; - /* usage filter */ - __le64 usage; + /* + * usage filter + * BTRFS_BALANCE_ARGS_USAGE with a single value means '0..N' + * BTRFS_BALANCE_ARGS_USAGE_RANGE - range syntax, min..max + */ + union { + __le64 usage; + struct { + __le32 usage_min; + __le32 usage_max; + }; + }; /* devid filter */ __le64 devid; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9f5d167..17ed76d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3059,16 +3059,19 @@ static void update_balance_args(struct btrfs_balance_control *bctl) * (albeit full) chunks.
*/ if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) && + !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) { bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE; bctl->data.usage = 90; } if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) && + !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) { bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE; bctl->sys.usage = 90; } if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) && + !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) { bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE; bctl->meta.usage = 90; @@ -3124,13 +3127,46 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, struct btrfs_balance_args *bargs) { struct btrfs_block_group_cache *cache; + u64 chunk_used; + u64 user_thresh_min; + u64 user_thresh_max; + int ret = 1; + + cache = btrfs_lookup_block_group(fs_info, chunk_offset); + chunk_used = btrfs_block_group_used(&cache->item); + + if (bargs->usage_min == 0) + user_thresh_min = 0; + else + user_thresh_min = div_factor_fine(cache->key.offset, + bargs->usage_min); + + if (bargs->usage_max == 0) + user_thresh_max = 1; + else if (bargs->usage_max > 100) + user_thresh_max = cache->key.offset; + else + user_thresh_max = div_factor_fine(cache->key.offset, + bargs->usage_max); + + if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max) + ret = 0; + + btrfs_put_block_group(cache); + return ret; +} + +static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, + u64 chunk_offset, struct btrfs_balance_args *bargs) +{ + struct btrfs_block_group_cache *cache; u64 chunk_used, user_thresh; int ret = 1; cache = btrfs_lookup_block_group(fs_info, chunk_offset); chunk_used = btrfs_block_group_used(&cache->item); - if (bargs->usage == 0) + if (bargs->usage_min == 0) user_thresh = 1; else if (bargs->usage > 100) user_thresh = cache->key.offset; @@ -3279,6 +3315,9 @@ static int should_balance_chunk(struct btrfs_root *root, if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) { return 0; + } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && + chunk_usage_range_filter(bctl->fs_info, chunk_offset, bargs)) { + return 0; } /* devid filter */ diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index a5bf224..2a396cb 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -382,6 +382,7 @@ struct map_lookup { #define BTRFS_BALANCE_ARGS_LIMIT (1ULL << 5) #define BTRFS_BALANCE_ARGS_LIMIT_RANGE (1ULL << 6) #define BTRFS_BALANCE_ARGS_STRIPES_RANGE (1ULL << 7) +#define BTRFS_BALANCE_ARGS_USAGE_RANGE (1ULL << 8) #define BTRFS_BALANCE_ARGS_MASK \ (BTRFS_BALANCE_ARGS_PROFILES | \ diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 17a9470..dea8931 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -206,7 +206,13 @@ struct btrfs_ioctl_feature_flags { */ struct btrfs_balance_args { __u64 profiles; - __u64 usage; + union { + __u64 usage; + struct { + __u32 usage_min; + __u32 usage_max; + }; + }; __u64 devid; __u64 pstart; __u64 pend; -- cgit v1.1 From b66d62ba1e0d7ada5b89afffe19d12662b5c92c9 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 16 Oct 2015 19:15:50 +0200 Subject: btrfs: add balance filters limits, stripes and usage to supported mask Enable the extended 'limit' syntax (a range), the new 'stripes' and extended 'usage' syntax (a range) filters in the
filters mask. The patch comes separately and not within the series that introduced the new filters because the patch adding the mask was merged in a late rc. The integration branch was based on an older rc and could not merge the patch due to the missing changes. Prerequisites: * btrfs: check unsupported filters in balance arguments * btrfs: extend balance filter limit to take minimum and maximum * btrfs: add balance filter for stripes * btrfs: extend balance filter usage to take minimum and maximum Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/volumes.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 2a396cb..ec57123 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -390,7 +390,10 @@ struct map_lookup { BTRFS_BALANCE_ARGS_DEVID | \ BTRFS_BALANCE_ARGS_DRANGE | \ BTRFS_BALANCE_ARGS_VRANGE | \ - BTRFS_BALANCE_ARGS_LIMIT) + BTRFS_BALANCE_ARGS_LIMIT | \ + BTRFS_BALANCE_ARGS_LIMIT_RANGE | \ + BTRFS_BALANCE_ARGS_STRIPES_RANGE | \ + BTRFS_BALANCE_ARGS_USAGE_RANGE) /* * Profile changing flags. When SOFT is set we won't relocate chunk if -- cgit v1.1 From 0a0e8b89389266bed9cc074c57ea662e4b9b2621 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 26 Oct 2015 09:19:43 +0800 Subject: btrfs: qgroup: Don't copy extent buffer to do qgroup rescan The ancient qgroup code calls memcpy() on an extent buffer and uses the copy for leaf iteration. As an extent buffer contains locks and pointers to pages, it is never sane to copy it this way. The following bug may be caused by this insane operation: [92098.841309] general protection fault: 0000 [#1] SMP [92098.841338] Modules linked in: ... [92098.841814] CPU: 1 PID: 24655 Comm: kworker/u4:12 Not tainted 4.3.0-rc1 #1 [92098.841868] Workqueue: btrfs-qgroup-rescan btrfs_qgroup_rescan_helper [btrfs] [92098.842261] Call Trace: [92098.842277] [] ? read_extent_buffer+0xb8/0x110 [btrfs] [92098.842304] [] ? btrfs_find_all_roots+0x60/0x70 [btrfs] [92098.842329] [] btrfs_qgroup_rescan_worker+0x28d/0x5a0 [btrfs] Where btrfs_qgroup_rescan_worker+0x28d is btrfs_disk_key_to_cpu(), called in reading key from the copied extent_buffer. This patch uses btrfs_clone_extent_buffer() to make a proper copy of the extent buffer to handle such cases.
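Distilled from the hunk that follows (a condensed sketch, not a standalone fix), the safe pattern is: clone the leaf instead of memcpy()ing the extent_buffer, then pin and read-lock the private copy before iterating it.

	struct extent_buffer *scratch_leaf;

	/* a raw memcpy() would duplicate live locks and page pointers */
	scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]);
	if (!scratch_leaf)
		return -ENOMEM;
	extent_buffer_get(scratch_leaf);
	btrfs_tree_read_lock(scratch_leaf);
	btrfs_set_lock_blocking_rw(scratch_leaf, BTRFS_READ_LOCK);

	/* ... iterate the items on the private copy ... */

	btrfs_tree_read_unlock_blocking(scratch_leaf);
	free_extent_buffer(scratch_leaf);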
Reported-by: Stephane Lesimple Suggested-by: Filipe Manana Signed-off-by: Qu Wenruo Reviewed-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/qgroup.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 158633c..31d1934 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2192,10 +2192,10 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) */ static int qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, - struct btrfs_trans_handle *trans, - struct extent_buffer *scratch_leaf) + struct btrfs_trans_handle *trans) { struct btrfs_key found; + struct extent_buffer *scratch_leaf = NULL; struct ulist *roots = NULL; struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem); u64 num_bytes; @@ -2233,7 +2233,15 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, fs_info->qgroup_rescan_progress.objectid = found.objectid + 1; btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem); - memcpy(scratch_leaf, path->nodes[0], sizeof(*scratch_leaf)); + scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]); + if (!scratch_leaf) { + ret = -ENOMEM; + mutex_unlock(&fs_info->qgroup_rescan_lock); + goto out; + } + extent_buffer_get(scratch_leaf); + btrfs_tree_read_lock(scratch_leaf); + btrfs_set_lock_blocking_rw(scratch_leaf, BTRFS_READ_LOCK); slot = path->slots[0]; btrfs_release_path(path); mutex_unlock(&fs_info->qgroup_rescan_lock); @@ -2259,6 +2267,10 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, goto out; } out: + if (scratch_leaf) { + btrfs_tree_read_unlock_blocking(scratch_leaf); + free_extent_buffer(scratch_leaf); + } btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); return ret; @@ -2270,16 +2282,12 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) qgroup_rescan_work); struct btrfs_path *path; struct btrfs_trans_handle *trans = NULL; - struct extent_buffer *scratch_leaf = NULL; int err = -ENOMEM; int ret = 0; path = btrfs_alloc_path(); if (!path) goto out; - scratch_leaf = kmalloc(sizeof(*scratch_leaf), GFP_NOFS); - if (!scratch_leaf) - goto out; err = 0; while (!err) { @@ -2291,8 +2299,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) if (!fs_info->quota_enabled) { err = -EINTR; } else { - err = qgroup_rescan_leaf(fs_info, path, trans, - scratch_leaf); + err = qgroup_rescan_leaf(fs_info, path, trans); } if (err > 0) btrfs_commit_transaction(trans, fs_info->fs_root); @@ -2301,7 +2308,6 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) } out: - kfree(scratch_leaf); btrfs_free_path(path); mutex_lock(&fs_info->qgroup_rescan_lock); -- cgit v1.1 From 696249132158014d594896df3a81390616069c5c Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Mon, 26 Oct 2015 15:06:19 +0900 Subject: btrfs: clear PF_NOFREEZE in cleaner_kthread() cleaner_kthread() kthread calls try_to_freeze() at the beginning of every cleanup attempt. This operation can't ever succeed though, as the kthread hasn't marked itself as freezable. Before (hopefully eventually) kthread freezing gets converted to filesystem freezing, we'd rather mark cleaner_kthread() freezable (as my understanding is that it can generate filesystem I/O during suspend).
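The freezer contract this relies on, as a minimal sketch (example_cleaner is a made-up name; the actual change below is a single line in cleaner_kthread()):

#include <linux/freezer.h>
#include <linux/kthread.h>

static int example_cleaner(void *arg)
{
	set_freezable();		/* kthreads start with PF_NOFREEZE, clear it */
	while (!kthread_should_stop()) {
		try_to_freeze();	/* a no-op unless set_freezable() ran */
		/* ... periodic cleanup work ... */
	}
	return 0;
}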
Signed-off-by: Jiri Kosina Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 86a11a9..2d46675 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1762,6 +1762,7 @@ static int cleaner_kthread(void *arg) int again; struct btrfs_trans_handle *trans; + set_freezable(); do { again = 0; -- cgit v1.1 From 5846a3c26873e86b034c702a8bc202aa76082369 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 26 Oct 2015 14:11:18 +0800 Subject: btrfs: qgroup: Fix a race in delayed_ref which leads to abort trans Between btrfs_alloc_reserved_file_extent() and btrfs_add_delayed_qgroup_reserve(), there is a window in which delayed refs are run and the delayed ref head may be freed before btrfs_add_delayed_qgroup_reserve(). This will cause btrfs_add_delayed_qgroup_reserve() to return -ENOENT, and cause the transaction to be aborted. This patch will record qgroup reserve space info into delayed_ref_head at btrfs_add_delayed_ref(), to eliminate the race window. Reported-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 3 ++- fs/btrfs/delayed-ref.c | 22 +++++++++++++++++----- fs/btrfs/delayed-ref.h | 2 +- fs/btrfs/extent-tree.c | 14 ++++++++------ fs/btrfs/inode.c | 12 ++++-------- 5 files changed, 32 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4001585..a2e73f6 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3430,7 +3430,8 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 root_objectid, u64 owner, - u64 offset, struct btrfs_key *ins); + u64 offset, u64 ram_bytes, + struct btrfs_key *ins); int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 root_objectid, u64 owner, u64 offset, diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 1c3588a..e06dd75 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -535,7 +535,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref, struct btrfs_qgroup_extent_record *qrecord, - u64 bytenr, u64 num_bytes, int action, int is_data) + u64 bytenr, u64 num_bytes, u64 ref_root, u64 reserved, + int action, int is_data) { struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_head *head_ref = NULL; @@ -544,6 +545,9 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, int count_mod = 1; int must_insert_reserved = 0; + /* If reserved is provided, it must be a data extent. */ + BUG_ON(!is_data && reserved); + /* * the head node stores the sum of all the mods, so dropping a ref * should drop the sum in the head node by one.
@@ -593,6 +597,11 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, /* Record qgroup extent info if provided */ if (qrecord) { + if (ref_root && reserved) { + head_ref->qgroup_ref_root = ref_root; + head_ref->qgroup_reserved = reserved; + } + qrecord->bytenr = bytenr; qrecord->num_bytes = num_bytes; qrecord->old_roots = NULL; @@ -611,6 +620,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, existing = htree_insert(&delayed_refs->href_root, &head_ref->href_node); if (existing) { + WARN_ON(ref_root && reserved && existing->qgroup_ref_root + && existing->qgroup_reserved); update_existing_head_ref(delayed_refs, &existing->node, ref); /* * we've updated the existing ref, free the newly @@ -777,7 +788,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, * the spin lock */ head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, - bytenr, num_bytes, action, 0); + bytenr, num_bytes, 0, 0, action, 0); add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr, num_bytes, parent, ref_root, level, action); @@ -800,7 +811,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, - u64 owner, u64 offset, int action, + u64 owner, u64 offset, u64 reserved, int action, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_delayed_data_ref *ref; @@ -839,7 +850,8 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, * the spin lock */ head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, - bytenr, num_bytes, action, 1); + bytenr, num_bytes, ref_root, reserved, + action, 1); add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr, num_bytes, parent, ref_root, owner, offset, @@ -894,7 +906,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, spin_lock(&delayed_refs->lock); add_delayed_ref_head(fs_info, trans, &head_ref->node, NULL, bytenr, - num_bytes, BTRFS_UPDATE_DELAYED_HEAD, + num_bytes, 0, 0, BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data); spin_unlock(&delayed_refs->lock); diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index f9cf234..00ed02c 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -248,7 +248,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, - u64 owner, u64 offset, int action, + u64 owner, u64 offset, u64 reserved, int action, struct btrfs_delayed_extent_op *extent_op); int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c1f8c7e..f50c7c2 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2087,8 +2087,8 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, BTRFS_ADD_DELAYED_REF, NULL); } else { ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, - num_bytes, - parent, root_objectid, owner, offset, + num_bytes, parent, root_objectid, + owner, offset, 0, BTRFS_ADD_DELAYED_REF, NULL); } return ret; @@ -6832,8 +6832,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, owner, - offset, BTRFS_DROP_DELAYED_REF, - NULL); + offset, 0, + BTRFS_DROP_DELAYED_REF, NULL); } return ret; } @@ -7759,7 +7759,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, int 
btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 root_objectid, u64 owner, - u64 offset, struct btrfs_key *ins) + u64 offset, u64 ram_bytes, + struct btrfs_key *ins) { int ret; @@ -7768,7 +7769,8 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid, ins->offset, 0, root_objectid, owner, offset, - BTRFS_ADD_DELAYED_EXTENT, NULL); + ram_bytes, BTRFS_ADD_DELAYED_EXTENT, + NULL); return ret; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6f030c2..4439fbb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2127,17 +2127,13 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, ins.type = BTRFS_EXTENT_ITEM_KEY; ret = btrfs_alloc_reserved_file_extent(trans, root, root->root_key.objectid, - btrfs_ino(inode), file_pos, &ins); - if (ret < 0) - goto out; + btrfs_ino(inode), file_pos, + ram_bytes, &ins); /* - * Release the reserved range from inode dirty range map, and - * move it to delayed ref codes, as now accounting only happens at - * commit_transaction() time. + * Release the reserved range from inode dirty range map, as it is + * already moved into delayed_ref_head */ btrfs_qgroup_release_data(inode, file_pos, ram_bytes); - ret = btrfs_add_delayed_qgroup_reserve(root->fs_info, trans, - root->objectid, disk_bytenr, ram_bytes); out: btrfs_free_path(path); -- cgit v1.1 From 90ce321da8ce3d00e19586993887346286208386 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 26 Oct 2015 14:11:19 +0800 Subject: btrfs: qgroup: Fix a rebase bug which will cause qgroup double free When rebasing my patchset, I forgot to pick up a cleanup patch to remove an old hotfix from the 4.2 release. Without the cleanup, it will screw up the new qgroup reserve framework and always cause a negative reserved number. Signed-off-by: Qu Wenruo Signed-off-by: Chris Mason --- fs/btrfs/qgroup.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 31d1934..46476c2 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1652,10 +1652,6 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info, } } - /* For exclusive extent, free its reserved bytes too */ - if (nr_old_roots == 0 && nr_new_roots == 1 && - cur_new_count == nr_new_roots) - qg->reserved -= num_bytes; if (dirty) qgroup_dirty(fs_info, qg); } -- cgit v1.1 From 485290a734f14279fa9376b3d6021a2dc1f82356 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 29 Oct 2015 17:28:46 +0800 Subject: btrfs: Fix a data space underflow warning Even with quota disabled, generic/127 will trigger a kernel warning by underflowing the data space info. The bug is caused by buffered writes: in case of a short copy, the start parameter for btrfs_delalloc_release_space() is wrong, and round_up/down() in btrfs_delalloc_release() extends the range to page alignment, decreasing one more page than expected. This patch will fix it by passing the correct start.
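The fixed computation in isolation (a sketch using the names from the hunk below; 'release_start' is a made-up local): after a short copy, only 'dirty_pages' pages counted from the sector-aligned start of 'pos' stay dirty, so the range handed back must begin just past them rather than at 'pos' itself.

	u64 release_start = round_down(pos, root->sectorsize) +
			    (dirty_pages << PAGE_CACHE_SHIFT);

	btrfs_delalloc_release_space(inode, release_start, release_bytes);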
Signed-off-by: Qu Wenruo Signed-off-by: Chris Mason --- fs/btrfs/file.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 381be79..e6df6f1 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1604,12 +1604,17 @@ again: BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); } - if (only_release_metadata) + if (only_release_metadata) { btrfs_delalloc_release_metadata(inode, release_bytes); - else - btrfs_delalloc_release_space(inode, pos, + } else { + u64 __pos; + + __pos = round_down(pos, root->sectorsize) + + (dirty_pages << PAGE_CACHE_SHIFT); + btrfs_delalloc_release_space(inode, __pos, release_bytes); + } } release_bytes = dirty_pages << PAGE_CACHE_SHIFT; -- cgit v1.1 From 13a0db5a53f56d1f77e8902dd23258c99c2154b8 Mon Sep 17 00:00:00 2001 From: chandan Date: Mon, 2 Nov 2015 13:59:46 +0530 Subject: Btrfs: find_free_extent: Do not erroneously skip LOOP_CACHING_WAIT state When executing generic/001 in a loop on a ppc64 machine (with both sectorsize and nodesize set to 64k), the following call trace is observed, WARNING: at /root/repos/linux/fs/btrfs/locking.c:253 Modules linked in: CPU: 2 PID: 8353 Comm: umount Not tainted 4.3.0-rc5-13676-ga5e681d #54 task: c0000000f2b1f560 ti: c0000000f6008000 task.ti: c0000000f6008000 NIP: c000000000520c88 LR: c0000000004a3b34 CTR: 0000000000000000 REGS: c0000000f600a820 TRAP: 0700 Not tainted (4.3.0-rc5-13676-ga5e681d) MSR: 8000000102029032 CR: 24444884 XER: 00000000 CFAR: c0000000004a3b30 SOFTE: 1 GPR00: c0000000004a3b34 c0000000f600aaa0 c00000000108ac00 c0000000f5a808c0 GPR04: 0000000000000000 c0000000f600ae60 0000000000000000 0000000000000005 GPR08: 00000000000020a1 0000000000000001 c0000000f2b1f560 0000000000000030 GPR12: 0000000084842882 c00000000fdc0900 c0000000f600ae60 c0000000f070b800 GPR16: 0000000000000000 c0000000f3c8a000 0000000000000000 0000000000000049 GPR20: 0000000000000001 0000000000000001 c0000000f5aa01f8 0000000000000000 GPR24: 0f83e0f83e0f83e1 c0000000f5a808c0 c0000000f3c8d000 c000000000000000 GPR28: c0000000f600ae74 0000000000000001 c0000000f3c8d000 c0000000f5a808c0 NIP [c000000000520c88] .btrfs_tree_lock+0x48/0x2a0 LR [c0000000004a3b34] .btrfs_lock_root_node+0x44/0x80 Call Trace: [c0000000f600aaa0] [c0000000f600ab80] 0xc0000000f600ab80 (unreliable) [c0000000f600ab80] [c0000000004a3b34] .btrfs_lock_root_node+0x44/0x80 [c0000000f600ac00] [c0000000004a99dc] .btrfs_search_slot+0xa8c/0xc00 [c0000000f600ad40] [c0000000004ab878] .btrfs_insert_empty_items+0x98/0x120 [c0000000f600adf0] [c00000000050da44] .btrfs_finish_chunk_alloc+0x1d4/0x620 [c0000000f600af20] [c0000000004be854] .btrfs_create_pending_block_groups+0x1d4/0x2c0 [c0000000f600b020] [c0000000004bf188] .do_chunk_alloc+0x3c8/0x420 [c0000000f600b100] [c0000000004c27cc] .find_free_extent+0xbfc/0x1030 [c0000000f600b260] [c0000000004c2ce8] .btrfs_reserve_extent+0xe8/0x250 [c0000000f600b330] [c0000000004c2f90] .btrfs_alloc_tree_block+0x140/0x590 [c0000000f600b440] [c0000000004a47b4] .__btrfs_cow_block+0x124/0x780 [c0000000f600b530] [c0000000004a4fc0] .btrfs_cow_block+0xf0/0x250 [c0000000f600b5e0] [c0000000004a917c] .btrfs_search_slot+0x22c/0xc00 [c0000000f600b720] [c00000000050aa40] .btrfs_remove_chunk+0x1b0/0x9f0 [c0000000f600b850] [c0000000004c4e04] .btrfs_delete_unused_bgs+0x434/0x570 [c0000000f600b950] [c0000000004d3cb8] .close_ctree+0x2e8/0x3b0 [c0000000f600ba20] [c00000000049d178] .btrfs_put_super+0x18/0x30 [c0000000f600ba90] [c000000000243cd4] .generic_shutdown_super+0xa4/0x1a0 
[c0000000f600bb10] [c0000000002441d8] .kill_anon_super+0x18/0x30 [c0000000f600bb90] [c00000000049c898] .btrfs_kill_super+0x18/0xc0 [c0000000f600bc10] [c0000000002444f8] .deactivate_locked_super+0x98/0xe0 [c0000000f600bc90] [c000000000269f94] .cleanup_mnt+0x54/0xa0 [c0000000f600bd10] [c0000000000bd744] .task_work_run+0xc4/0x100 [c0000000f600bdb0] [c000000000016334] .do_notify_resume+0x74/0x80 [c0000000f600be30] [c0000000000098b8] .ret_from_except_lite+0x64/0x68 Instruction dump: fba1ffe8 fbc1fff0 fbe1fff8 7c791b78 f8010010 f821ff21 e94d0290 81030040 812a04e8 7d094a78 7d290034 5529d97e <0b090000> 3b400000 3be30050 3bc3004c The above call trace is seen even on x86_64; albeit very rarely and that too with nodesize set to 64k and with nospace_cache mount option being used. The reason for the above call trace is, btrfs_remove_chunk check_system_chunk Allocate chunk if required For each physical stripe on underlying device, btrfs_free_dev_extent ... Take lock on Device tree's root node btrfs_cow_block("dev tree's root node"); btrfs_reserve_extent find_free_extent index = BTRFS_RAID_DUP; have_caching_bg = false; When in LOOP_CACHING_NOWAIT state, Assume we find a block group which is being cached; Hence have_caching_bg is set to true When repeating the search for the next RAID index, we set have_caching_bg to false. Hence right after completing the LOOP_CACHING_NOWAIT state, we incorrectly skip LOOP_CACHING_WAIT state and move to LOOP_ALLOC_CHUNK state where we allocate a chunk and try to add entries corresponding to the chunk's physical stripe into the device tree. When doing so the task deadlocks itself waiting for the blocking lock on the root node of the device tree. This commit fixes the issue by introducing a new local variable to indicate whether a block group of any RAID type is being cached. Signed-off-by: Chandan Rajendra Reviewed-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f50c7c2..99a8e57 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7029,6 +7029,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, bool failed_alloc = false; bool use_cluster = true; bool have_caching_bg = false; + bool orig_have_caching_bg = false; bool full_search = false; WARN_ON(num_bytes < root->sectorsize); @@ -7378,6 +7379,10 @@ loop: } up_read(&space_info->groups_sem); + if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg + && !orig_have_caching_bg) + orig_have_caching_bg = true; + if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg) goto search; @@ -7400,7 +7405,7 @@ loop: * don't have any uncached bgs and we've already done a * full search through. */ - if (have_caching_bg || !full_search) + if (orig_have_caching_bg || !full_search) loop = LOOP_CACHING_WAIT; else loop = LOOP_ALLOC_CHUNK; -- cgit v1.1 From 2959a32a858a2c44bbbce83d19c158d54cc5998a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 2 Nov 2015 12:32:44 +0000 Subject: Btrfs: fix hole punching when using the no-holes feature When we are using the no-holes feature, if we punch a hole into a file range that already contains a hole which overlaps the range we are passing to fallocate(), we end up removing the extent map that represents the existing hole without adding a new one.
This happens because with the no-holes feature we do not have explicit extent items to represent holes and therefore the call to __btrfs_drop_extents(), made from btrfs_punch_hole(), returns an end offset to the variable drop_end that is smaller than the end of the range passed to fallocate(), while it drops all existing extent maps in that range. Normally having a missing extent map is not a problem, for example for a readpages() operation we just end up building the extent map by looking at the fs/subvol tree for a matching extent item (or a lack of one for implicit holes). However for an fsync that uses the fast path, which needs to look at the list of modified extent maps, this means the fsync will not record information about the complete hole we had before the fallocate() call into the log tree, resulting in a file with content/layout that does not match what we had either before or after the hole punch operation. The following test case for fstests reproduces the issue. It fails without this change because we get a file with a different digest after the fsync log replay and also with a different extent/hole layout. seq=`basename $0` seqres=$RESULT_DIR/$seq echo "QA output created by $seq" tmp=/tmp/$$ status=1 # failure is the default! trap "_cleanup; exit \$status" 0 1 2 3 15 _cleanup() { _cleanup_flakey rm -f $tmp.* } # get standard environment, filters and checks . ./common/rc . ./common/filter . ./common/punch . ./common/dmflakey # real QA test starts here _need_to_be_root _supported_fs generic _supported_os Linux _require_scratch _require_xfs_io_command "fpunch" _require_xfs_io_command "fiemap" _require_dm_target flakey _require_metadata_journaling $SCRATCH_DEV # This test was motivated by an issue found in btrfs when the btrfs # no-holes feature is enabled (introduced in kernel 3.14). So enable # the feature if the fs being tested is btrfs. if [ $FSTYP == "btrfs" ]; then _require_btrfs_fs_feature "no_holes" _require_btrfs_mkfs_feature "no-holes" MKFS_OPTIONS="$MKFS_OPTIONS -O no-holes" fi rm -f $seqres.full _scratch_mkfs >>$seqres.full 2>&1 _init_flakey _mount_flakey # Create our test file with some data and then fsync it. # We do the fsync only to make sure the last fsync we do in this test # triggers the fast code path of btrfs' fsync implementation, a # condition necessary to trigger the bug btrfs had. $XFS_IO_PROG -f -c "pwrite -S 0xaa 0K 128K" \ -c "fsync" \ $SCRATCH_MNT/foobar | _filter_xfs_io # Now punch a hole against the range [96K, 128K[. $XFS_IO_PROG -c "fpunch 96K 32K" $SCRATCH_MNT/foobar # Punch another hole against a range that overlaps the previous range # and ends beyond eof. $XFS_IO_PROG -c "fpunch 64K 128K" $SCRATCH_MNT/foobar # Punch another hole against a range that overlaps the first range # ([96K, 128K[) and ends at eof. $XFS_IO_PROG -c "fpunch 32K 96K" $SCRATCH_MNT/foobar # Fsync our file. We want to verify that, after a power failure and # mounting the filesystem again, the file content reflects all the hole # punch operations. $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foobar echo "File digest before power failure:" md5sum $SCRATCH_MNT/foobar | _filter_scratch echo "Fiemap before power failure:" $XFS_IO_PROG -c "fiemap -v" $SCRATCH_MNT/foobar | _filter_fiemap # Silently drop all writes and umount to simulate a crash/power failure. _load_flakey_table $FLAKEY_DROP_WRITES _unmount_flakey # Allow writes again, mount to trigger log replay and validate file # contents.
_load_flakey_table $FLAKEY_ALLOW_WRITES _mount_flakey echo "File digest after log replay:" # Must match the same digest we got before the power failure. md5sum $SCRATCH_MNT/foobar | _filter_scratch echo "Fiemap after log replay:" # Must match the same extent listing we got before the power failure. $XFS_IO_PROG -c "fiemap -v" $SCRATCH_MNT/foobar | _filter_fiemap _unmount_flakey status=0 exit Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/file.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e6df6f1..cfa243c 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2494,6 +2494,19 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) trans->block_rsv = &root->fs_info->trans_block_rsv; /* + * If we are using the NO_HOLES feature we might already have had a + * hole that overlaps a part of the region [lockstart, lockend] and + * ends at (or beyond) lockend. Since we have no file extent items to + * represent holes, drop_end can be less than lockend and so we must + * make sure we have an extent map representing the existing hole (the + * call to __btrfs_drop_extents() might have dropped the existing extent + * map representing the existing hole), otherwise the fast fsync path + * will not record the existence of the hole region + * [existing_hole_start, lockend]. + */ + if (drop_end <= lockend) + drop_end = lockend + 1; + /* * Don't insert file hole extent item if it's for a range beyond eof * (because it's useless) or if it represents a 0 bytes range (when * cur_offset == drop_end). -- cgit v1.1