From 128a37852234c1bd68eee4e7447f5362778009b8 Mon Sep 17 00:00:00 2001
From: Dmitry Vyukov <dvyukov@google.com>
Date: Mon, 21 Sep 2015 09:43:06 +0200
Subject: fs: fix data races on inode->i_flctx

locks_get_lock_context() uses cmpxchg() to install i_flctx.
cmpxchg() is a release operation which is correct. But it uses
a plain load to load i_flctx. This is incorrect. Subsequent loads
from i_flctx can hoist above the load of i_flctx pointer itself
and observe uninitialized garbage there. This in turn can lead
to corruption of ctx->flc_lock and other members.

Documentation/memory-barriers.txt explicitly requires to use
a barrier in such context:
"A load-load control dependency requires a full read memory barrier".

Use smp_load_acquire() in locks_get_lock_context() and in bunch
of other functions that can proceed concurrently with
locks_get_lock_context().

The data race was found with KernelThreadSanitizer (KTSAN).

Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
---
 fs/locks.c | 63 +++++++++++++++++++++++++++++++++++---------------------------
 1 file changed, 36 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/locks.c b/fs/locks.c
index 2a54c80..316e474 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -205,28 +205,32 @@ static struct kmem_cache *filelock_cache __read_mostly;
 static struct file_lock_context *
 locks_get_lock_context(struct inode *inode, int type)
 {
-	struct file_lock_context *new;
+	struct file_lock_context *ctx;
 
-	if (likely(inode->i_flctx) || type == F_UNLCK)
+	/* paired with cmpxchg() below */
+	ctx = smp_load_acquire(&inode->i_flctx);
+	if (likely(ctx) || type == F_UNLCK)
 		goto out;
 
-	new = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
-	if (!new)
+	ctx = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
+	if (!ctx)
 		goto out;
 
-	spin_lock_init(&new->flc_lock);
-	INIT_LIST_HEAD(&new->flc_flock);
-	INIT_LIST_HEAD(&new->flc_posix);
-	INIT_LIST_HEAD(&new->flc_lease);
+	spin_lock_init(&ctx->flc_lock);
+	INIT_LIST_HEAD(&ctx->flc_flock);
+	INIT_LIST_HEAD(&ctx->flc_posix);
+	INIT_LIST_HEAD(&ctx->flc_lease);
 
 	/*
 	 * Assign the pointer if it's not already assigned. If it is, then
 	 * free the context we just allocated.
 	 */
-	if (cmpxchg(&inode->i_flctx, NULL, new))
-		kmem_cache_free(flctx_cache, new);
+	if (cmpxchg(&inode->i_flctx, NULL, ctx)) {
+		kmem_cache_free(flctx_cache, ctx);
+		ctx = smp_load_acquire(&inode->i_flctx);
+	}
 out:
-	return inode->i_flctx;
+	return ctx;
 }
 
 void
@@ -762,7 +766,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
 	struct file_lock_context *ctx;
 	struct inode *inode = file_inode(filp);
 
-	ctx = inode->i_flctx;
+	ctx = smp_load_acquire(&inode->i_flctx);
 	if (!ctx || list_empty_careful(&ctx->flc_posix)) {
 		fl->fl_type = F_UNLCK;
 		return;
@@ -1203,7 +1207,7 @@ int locks_mandatory_locked(struct file *file)
 	struct file_lock_context *ctx;
 	struct file_lock *fl;
 
-	ctx = inode->i_flctx;
+	ctx = smp_load_acquire(&inode->i_flctx);
 	if (!ctx || list_empty_careful(&ctx->flc_posix))
 		return 0;
 
@@ -1388,7 +1392,7 @@ any_leases_conflict(struct inode *inode, struct file_lock *breaker)
 int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
 {
 	int error = 0;
-	struct file_lock_context *ctx = inode->i_flctx;
+	struct file_lock_context *ctx;
 	struct file_lock *new_fl, *fl, *tmp;
 	unsigned long break_time;
 	int want_write = (mode & O_ACCMODE) != O_RDONLY;
@@ -1400,6 +1404,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
 	new_fl->fl_flags = type;
 
 	/* typically we will check that ctx is non-NULL before calling */
+	ctx = smp_load_acquire(&inode->i_flctx);
 	if (!ctx) {
 		WARN_ON_ONCE(1);
 		return error;
@@ -1494,9 +1499,10 @@ EXPORT_SYMBOL(__break_lease);
 void lease_get_mtime(struct inode *inode, struct timespec *time)
 {
 	bool has_lease = false;
-	struct file_lock_context *ctx = inode->i_flctx;
+	struct file_lock_context *ctx;
 	struct file_lock *fl;
 
+	ctx = smp_load_acquire(&inode->i_flctx);
 	if (ctx && !list_empty_careful(&ctx->flc_lease)) {
 		spin_lock(&ctx->flc_lock);
 		if (!list_empty(&ctx->flc_lease)) {
@@ -1543,10 +1549,11 @@ int fcntl_getlease(struct file *filp)
 {
 	struct file_lock *fl;
 	struct inode *inode = file_inode(filp);
-	struct file_lock_context *ctx = inode->i_flctx;
+	struct file_lock_context *ctx;
 	int type = F_UNLCK;
 	LIST_HEAD(dispose);
 
+	ctx = smp_load_acquire(&inode->i_flctx);
 	if (ctx && !list_empty_careful(&ctx->flc_lease)) {
 		spin_lock(&ctx->flc_lock);
 		time_out_leases(file_inode(filp), &dispose);
@@ -1713,9 +1720,10 @@ static int generic_delete_lease(struct file *filp, void *owner)
 	struct file_lock *fl, *victim = NULL;
 	struct dentry *dentry = filp->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
-	struct file_lock_context *ctx = inode->i_flctx;
+	struct file_lock_context *ctx;
 	LIST_HEAD(dispose);
 
+	ctx = smp_load_acquire(&inode->i_flctx);
 	if (!ctx) {
 		trace_generic_delete_lease(inode, NULL);
 		return error;
@@ -2359,13 +2367,14 @@ out:
 void locks_remove_posix(struct file *filp, fl_owner_t owner)
 {
 	struct file_lock lock;
-	struct file_lock_context *ctx = file_inode(filp)->i_flctx;
+	struct file_lock_context *ctx;
 
 	/*
 	 * If there are no locks held on this file, we don't need to call
 	 * posix_lock_file().  Another process could be setting a lock on this
 	 * file at the same time, but we wouldn't remove that lock anyway.
 	 */
+	ctx =  smp_load_acquire(&file_inode(filp)->i_flctx);
 	if (!ctx || list_empty(&ctx->flc_posix))
 		return;
 
@@ -2389,7 +2398,7 @@ EXPORT_SYMBOL(locks_remove_posix);
 
 /* The i_flctx must be valid when calling into here */
 static void
-locks_remove_flock(struct file *filp)
+locks_remove_flock(struct file *filp, struct file_lock_context *flctx)
 {
 	struct file_lock fl = {
 		.fl_owner = filp,
@@ -2400,7 +2409,6 @@ locks_remove_flock(struct file *filp)
 		.fl_end = OFFSET_MAX,
 	};
 	struct inode *inode = file_inode(filp);
-	struct file_lock_context *flctx = inode->i_flctx;
 
 	if (list_empty(&flctx->flc_flock))
 		return;
@@ -2416,10 +2424,8 @@ locks_remove_flock(struct file *filp)
 
 /* The i_flctx must be valid when calling into here */
 static void
-locks_remove_lease(struct file *filp)
+locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
 {
-	struct inode *inode = file_inode(filp);
-	struct file_lock_context *ctx = inode->i_flctx;
 	struct file_lock *fl, *tmp;
 	LIST_HEAD(dispose);
 
@@ -2439,17 +2445,20 @@ locks_remove_lease(struct file *filp)
  */
 void locks_remove_file(struct file *filp)
 {
-	if (!file_inode(filp)->i_flctx)
+	struct file_lock_context *ctx;
+
+	ctx = smp_load_acquire(&file_inode(filp)->i_flctx);
+	if (!ctx)
 		return;
 
 	/* remove any OFD locks */
 	locks_remove_posix(filp, filp);
 
 	/* remove flock locks */
-	locks_remove_flock(filp);
+	locks_remove_flock(filp, ctx);
 
 	/* remove any leases */
-	locks_remove_lease(filp);
+	locks_remove_lease(filp, ctx);
 }
 
 /**
@@ -2616,7 +2625,7 @@ void show_fd_locks(struct seq_file *f,
 	struct file_lock_context *ctx;
 	int id = 0;
 
-	ctx = inode->i_flctx;
+	ctx = smp_load_acquire(&inode->i_flctx);
 	if (!ctx)
 		return;
 
-- 
cgit v1.1


From e1305df1283cbe1aa57093f8766b2dfe650ed5ff Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia.lawall@lip6.fr>
Date: Sun, 13 Sep 2015 14:15:29 +0200
Subject: jffs2: drop null test before destroy functions

Remove unneeded NULL test.

The semantic patch that makes this change is as follows:
(http://coccinelle.lip6.fr/)

// <smpl>
@@ expression x; @@
-if (x != NULL)
  \(kmem_cache_destroy\|mempool_destroy\|dma_pool_destroy\)(x);
// </smpl>

Signed-off-by: Julia Lawall <Julia.Lawall@lip6.fr>
Signed-off-by: Brian Norris <computersforpeace@gmail.com>
---
 fs/jffs2/malloc.c | 27 +++++++++------------------
 1 file changed, 9 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c
index b8fd651..ce11897 100644
--- a/fs/jffs2/malloc.c
+++ b/fs/jffs2/malloc.c
@@ -97,25 +97,16 @@ int __init jffs2_create_slab_caches(void)
 
 void jffs2_destroy_slab_caches(void)
 {
-	if(full_dnode_slab)
-		kmem_cache_destroy(full_dnode_slab);
-	if(raw_dirent_slab)
-		kmem_cache_destroy(raw_dirent_slab);
-	if(raw_inode_slab)
-		kmem_cache_destroy(raw_inode_slab);
-	if(tmp_dnode_info_slab)
-		kmem_cache_destroy(tmp_dnode_info_slab);
-	if(raw_node_ref_slab)
-		kmem_cache_destroy(raw_node_ref_slab);
-	if(node_frag_slab)
-		kmem_cache_destroy(node_frag_slab);
-	if(inode_cache_slab)
-		kmem_cache_destroy(inode_cache_slab);
+	kmem_cache_destroy(full_dnode_slab);
+	kmem_cache_destroy(raw_dirent_slab);
+	kmem_cache_destroy(raw_inode_slab);
+	kmem_cache_destroy(tmp_dnode_info_slab);
+	kmem_cache_destroy(raw_node_ref_slab);
+	kmem_cache_destroy(node_frag_slab);
+	kmem_cache_destroy(inode_cache_slab);
 #ifdef CONFIG_JFFS2_FS_XATTR
-	if (xattr_datum_cache)
-		kmem_cache_destroy(xattr_datum_cache);
-	if (xattr_ref_cache)
-		kmem_cache_destroy(xattr_ref_cache);
+	kmem_cache_destroy(xattr_datum_cache);
+	kmem_cache_destroy(xattr_ref_cache);
 #endif
 }
 
-- 
cgit v1.1


From 4b813f09405823dcbb27a434f6ddcd1741e55e9b Mon Sep 17 00:00:00 2001
From: Andrew Price <anprice@redhat.com>
Date: Tue, 22 Sep 2015 07:41:21 -0500
Subject: GFS2: fallocate: do not rely on file_update_time to mark the inode
 dirty

Previously __gfs2_fallocate() relied on file_update_time() marking the
inode dirty, but that's not a safe assumption as that function doesn't
dirty the inode in some cases. Mark the inode dirty explicitly.

Signed-off-by: Andrew Price <anprice@redhat.com>
Signed-off-by: Bob Peterson <rpeterso@redhat.com>
---
 fs/gfs2/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index cf4ab89..71cd138 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -897,8 +897,8 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t
 
 	if (!(mode & FALLOC_FL_KEEP_SIZE) && (pos + count) > inode->i_size) {
 		i_size_write(inode, pos + count);
-		/* Marks the inode as dirty */
 		file_update_time(file);
+		mark_inode_dirty(inode);
 	}
 
 	return generic_write_sync(file, pos, count);
-- 
cgit v1.1


From 6de20eb0dee4c802d9d228d6c08d00386e172e52 Mon Sep 17 00:00:00 2001
From: Andrew Price <anprice@redhat.com>
Date: Wed, 23 Sep 2015 08:45:43 -0500
Subject: GFS2: Set s_mode before parsing mount options

In the generic mount_bdev() function, deactivate_locked_super() is
called after the fill_super() call fails, at which point s_mode has been
set. kill_block_super() expects this and dumps a warning when
FMODE_EXCL is not set in s_mode.

In gfs2_mount() we call deactivate_locked_super() on failure of
gfs2_mount_args(), at which point s_mode has not yet been set. This
causes kill_block_super() to dump a stack trace when gfs2 fails to mount
with invalid options. Set s_mode earlier in gfs2_mount() to avoid that.

Signed-off-by: Andrew Price <anprice@redhat.com>
Signed-off-by: Bob Peterson <rpeterso@redhat.com>
---
 fs/gfs2/ops_fstype.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 02586e7..baab99b 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1291,6 +1291,9 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
 		up_write(&s->s_umount);
 		blkdev_put(bdev, mode);
 		down_write(&s->s_umount);
+	} else {
+		/* s_mode must be set before deactivate_locked_super calls */
+		s->s_mode = mode;
 	}
 
 	memset(&args, 0, sizeof(args));
@@ -1314,7 +1317,6 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
 	} else {
 		char b[BDEVNAME_SIZE];
 
-		s->s_mode = mode;
 		strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
 		sb_set_blocksize(s, block_size(bdev));
 		error = fill_super(s, &args, flags & MS_SILENT ? 1 : 0);
-- 
cgit v1.1


From b5799018827e67867fda77c3cde10fc080606fe2 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 23 Sep 2015 12:44:17 -0400
Subject: ext4: move sysfs code from super.c to fs/ext4/sysfs.c

Also statically allocate the ext4_kset and ext4_feat objects, since we
only need exactly one of each, and it's simpler and less code if we
drop the dynamic allocation and deallocation when it's not needed.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/Makefile |   2 +-
 fs/ext4/ext4.h   |   5 +
 fs/ext4/super.c  | 426 +++----------------------------------------------------
 fs/ext4/sysfs.c  | 387 ++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 410 insertions(+), 410 deletions(-)
 create mode 100644 fs/ext4/sysfs.c

(limited to 'fs')

diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 75285ea..f52cf54 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -8,7 +8,7 @@ ext4-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
 		ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
 		ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
 		mmp.o indirect.o extents_status.o xattr.o xattr_user.o \
-		xattr_trusted.o inline.o readpage.o
+		xattr_trusted.o inline.o readpage.o sysfs.o
 
 ext4-$(CONFIG_EXT4_FS_POSIX_ACL)	+= acl.o
 ext4-$(CONFIG_EXT4_FS_SECURITY)		+= xattr_security.o
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index fd1f28b..76daccf 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2903,6 +2903,11 @@ extern const struct inode_operations ext4_encrypted_symlink_inode_operations;
 extern const struct inode_operations ext4_symlink_inode_operations;
 extern const struct inode_operations ext4_fast_symlink_inode_operations;
 
+/* sysfs.c */
+extern int ext4_register_sysfs(struct super_block *sb);
+extern int __init ext4_init_sysfs(void);
+extern void ext4_exit_sysfs(void);
+
 /* block_validity */
 extern void ext4_release_system_zone(struct super_block *sb);
 extern int ext4_setup_system_zone(struct super_block *sb);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a63c7b0..4a574fb 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -55,10 +55,8 @@
 #include <trace/events/ext4.h>
 
 static struct proc_dir_entry *ext4_proc_root;
-static struct kset *ext4_kset;
 static struct ext4_lazy_init *ext4_li_info;
 static struct mutex ext4_li_mtx;
-static struct ext4_features *ext4_feat;
 static int ext4_mballoc_ready;
 static struct ratelimit_state ext4_mount_msg_ratelimit;
 
@@ -83,7 +81,6 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly);
 static void ext4_destroy_lazyinit_thread(void);
 static void ext4_unregister_li_request(struct super_block *sb);
 static void ext4_clear_request_list(void);
-static int ext4_reserve_clusters(struct ext4_sb_info *, ext4_fsblk_t);
 
 #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2)
 static struct file_system_type ext2_fs_type = {
@@ -2470,335 +2467,6 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
 	return ret;
 }
 
-/* sysfs supprt */
-
-struct ext4_attr {
-	struct attribute attr;
-	ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
-	ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
-			 const char *, size_t);
-	union {
-		int offset;
-		int deprecated_val;
-	} u;
-};
-
-static int parse_strtoull(const char *buf,
-		unsigned long long max, unsigned long long *value)
-{
-	int ret;
-
-	ret = kstrtoull(skip_spaces(buf), 0, value);
-	if (!ret && *value > max)
-		ret = -EINVAL;
-	return ret;
-}
-
-static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a,
-					      struct ext4_sb_info *sbi,
-					      char *buf)
-{
-	return snprintf(buf, PAGE_SIZE, "%llu\n",
-		(s64) EXT4_C2B(sbi,
-			percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
-}
-
-static ssize_t session_write_kbytes_show(struct ext4_attr *a,
-					 struct ext4_sb_info *sbi, char *buf)
-{
-	struct super_block *sb = sbi->s_buddy_cache->i_sb;
-
-	if (!sb->s_bdev->bd_part)
-		return snprintf(buf, PAGE_SIZE, "0\n");
-	return snprintf(buf, PAGE_SIZE, "%lu\n",
-			(part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
-			 sbi->s_sectors_written_start) >> 1);
-}
-
-static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
-					  struct ext4_sb_info *sbi, char *buf)
-{
-	struct super_block *sb = sbi->s_buddy_cache->i_sb;
-
-	if (!sb->s_bdev->bd_part)
-		return snprintf(buf, PAGE_SIZE, "0\n");
-	return snprintf(buf, PAGE_SIZE, "%llu\n",
-			(unsigned long long)(sbi->s_kbytes_written +
-			((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
-			  EXT4_SB(sb)->s_sectors_written_start) >> 1)));
-}
-
-static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
-					  struct ext4_sb_info *sbi,
-					  const char *buf, size_t count)
-{
-	unsigned long t;
-	int ret;
-
-	ret = kstrtoul(skip_spaces(buf), 0, &t);
-	if (ret)
-		return ret;
-
-	if (t && (!is_power_of_2(t) || t > 0x40000000))
-		return -EINVAL;
-
-	sbi->s_inode_readahead_blks = t;
-	return count;
-}
-
-static ssize_t sbi_ui_show(struct ext4_attr *a,
-			   struct ext4_sb_info *sbi, char *buf)
-{
-	unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
-
-	return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
-}
-
-static ssize_t sbi_ui_store(struct ext4_attr *a,
-			    struct ext4_sb_info *sbi,
-			    const char *buf, size_t count)
-{
-	unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
-	unsigned long t;
-	int ret;
-
-	ret = kstrtoul(skip_spaces(buf), 0, &t);
-	if (ret)
-		return ret;
-	*ui = t;
-	return count;
-}
-
-static ssize_t es_ui_show(struct ext4_attr *a,
-			   struct ext4_sb_info *sbi, char *buf)
-{
-
-	unsigned int *ui = (unsigned int *) (((char *) sbi->s_es) +
-			   a->u.offset);
-
-	return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
-}
-
-static ssize_t reserved_clusters_show(struct ext4_attr *a,
-				  struct ext4_sb_info *sbi, char *buf)
-{
-	return snprintf(buf, PAGE_SIZE, "%llu\n",
-		(unsigned long long) atomic64_read(&sbi->s_resv_clusters));
-}
-
-static ssize_t reserved_clusters_store(struct ext4_attr *a,
-				   struct ext4_sb_info *sbi,
-				   const char *buf, size_t count)
-{
-	unsigned long long val;
-	int ret;
-
-	if (parse_strtoull(buf, -1ULL, &val))
-		return -EINVAL;
-	ret = ext4_reserve_clusters(sbi, val);
-
-	return ret ? ret : count;
-}
-
-static ssize_t trigger_test_error(struct ext4_attr *a,
-				  struct ext4_sb_info *sbi,
-				  const char *buf, size_t count)
-{
-	int len = count;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	if (len && buf[len-1] == '\n')
-		len--;
-
-	if (len)
-		ext4_error(sbi->s_sb, "%.*s", len, buf);
-	return count;
-}
-
-static ssize_t sbi_deprecated_show(struct ext4_attr *a,
-				   struct ext4_sb_info *sbi, char *buf)
-{
-	return snprintf(buf, PAGE_SIZE, "%d\n", a->u.deprecated_val);
-}
-
-#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
-static struct ext4_attr ext4_attr_##_name = {			\
-	.attr = {.name = __stringify(_name), .mode = _mode },	\
-	.show	= _show,					\
-	.store	= _store,					\
-	.u = {							\
-		.offset = offsetof(struct ext4_sb_info, _elname),\
-	},							\
-}
-
-#define EXT4_ATTR_OFFSET_ES(_name,_mode,_show,_store,_elname)		\
-static struct ext4_attr ext4_attr_##_name = {				\
-	.attr = {.name = __stringify(_name), .mode = _mode },		\
-	.show	= _show,						\
-	.store	= _store,						\
-	.u = {								\
-		.offset = offsetof(struct ext4_super_block, _elname),	\
-	},								\
-}
-
-#define EXT4_ATTR(name, mode, show, store) \
-static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
-
-#define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL)
-#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL)
-#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store)
-
-#define EXT4_RO_ATTR_ES_UI(name, elname)	\
-	EXT4_ATTR_OFFSET_ES(name, 0444, es_ui_show, NULL, elname)
-#define EXT4_RW_ATTR_SBI_UI(name, elname)	\
-	EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname)
-
-#define ATTR_LIST(name) &ext4_attr_##name.attr
-#define EXT4_DEPRECATED_ATTR(_name, _val)	\
-static struct ext4_attr ext4_attr_##_name = {			\
-	.attr = {.name = __stringify(_name), .mode = 0444 },	\
-	.show	= sbi_deprecated_show,				\
-	.u = {							\
-		.deprecated_val = _val,				\
-	},							\
-}
-
-EXT4_RO_ATTR(delayed_allocation_blocks);
-EXT4_RO_ATTR(session_write_kbytes);
-EXT4_RO_ATTR(lifetime_write_kbytes);
-EXT4_RW_ATTR(reserved_clusters);
-EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
-		 inode_readahead_blks_store, s_inode_readahead_blks);
-EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
-EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
-EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
-EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
-EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
-EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
-EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
-EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128);
-EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
-EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
-EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
-EXT4_RW_ATTR_SBI_UI(err_ratelimit_burst, s_err_ratelimit_state.burst);
-EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval);
-EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
-EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
-EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
-EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
-EXT4_RO_ATTR_ES_UI(first_error_time, s_first_error_time);
-EXT4_RO_ATTR_ES_UI(last_error_time, s_last_error_time);
-
-static struct attribute *ext4_attrs[] = {
-	ATTR_LIST(delayed_allocation_blocks),
-	ATTR_LIST(session_write_kbytes),
-	ATTR_LIST(lifetime_write_kbytes),
-	ATTR_LIST(reserved_clusters),
-	ATTR_LIST(inode_readahead_blks),
-	ATTR_LIST(inode_goal),
-	ATTR_LIST(mb_stats),
-	ATTR_LIST(mb_max_to_scan),
-	ATTR_LIST(mb_min_to_scan),
-	ATTR_LIST(mb_order2_req),
-	ATTR_LIST(mb_stream_req),
-	ATTR_LIST(mb_group_prealloc),
-	ATTR_LIST(max_writeback_mb_bump),
-	ATTR_LIST(extent_max_zeroout_kb),
-	ATTR_LIST(trigger_fs_error),
-	ATTR_LIST(err_ratelimit_interval_ms),
-	ATTR_LIST(err_ratelimit_burst),
-	ATTR_LIST(warning_ratelimit_interval_ms),
-	ATTR_LIST(warning_ratelimit_burst),
-	ATTR_LIST(msg_ratelimit_interval_ms),
-	ATTR_LIST(msg_ratelimit_burst),
-	ATTR_LIST(errors_count),
-	ATTR_LIST(first_error_time),
-	ATTR_LIST(last_error_time),
-	NULL,
-};
-
-/* Features this copy of ext4 supports */
-EXT4_INFO_ATTR(lazy_itable_init);
-EXT4_INFO_ATTR(batched_discard);
-EXT4_INFO_ATTR(meta_bg_resize);
-EXT4_INFO_ATTR(encryption);
-
-static struct attribute *ext4_feat_attrs[] = {
-	ATTR_LIST(lazy_itable_init),
-	ATTR_LIST(batched_discard),
-	ATTR_LIST(meta_bg_resize),
-	ATTR_LIST(encryption),
-	NULL,
-};
-
-static ssize_t ext4_attr_show(struct kobject *kobj,
-			      struct attribute *attr, char *buf)
-{
-	struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
-						s_kobj);
-	struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
-
-	return a->show ? a->show(a, sbi, buf) : 0;
-}
-
-static ssize_t ext4_attr_store(struct kobject *kobj,
-			       struct attribute *attr,
-			       const char *buf, size_t len)
-{
-	struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
-						s_kobj);
-	struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
-
-	return a->store ? a->store(a, sbi, buf, len) : 0;
-}
-
-static void ext4_sb_release(struct kobject *kobj)
-{
-	struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
-						s_kobj);
-	complete(&sbi->s_kobj_unregister);
-}
-
-static const struct sysfs_ops ext4_attr_ops = {
-	.show	= ext4_attr_show,
-	.store	= ext4_attr_store,
-};
-
-static struct kobj_type ext4_ktype = {
-	.default_attrs	= ext4_attrs,
-	.sysfs_ops	= &ext4_attr_ops,
-	.release	= ext4_sb_release,
-};
-
-static void ext4_feat_release(struct kobject *kobj)
-{
-	complete(&ext4_feat->f_kobj_unregister);
-}
-
-static ssize_t ext4_feat_show(struct kobject *kobj,
-			      struct attribute *attr, char *buf)
-{
-	return snprintf(buf, PAGE_SIZE, "supported\n");
-}
-
-/*
- * We can not use ext4_attr_show/store because it relies on the kobject
- * being embedded in the ext4_sb_info structure which is definitely not
- * true in this case.
- */
-static const struct sysfs_ops ext4_feat_ops = {
-	.show	= ext4_feat_show,
-	.store	= NULL,
-};
-
-static struct kobj_type ext4_feat_ktype = {
-	.default_attrs	= ext4_feat_attrs,
-	.sysfs_ops	= &ext4_feat_ops,
-	.release	= ext4_feat_release,
-};
-
 /*
  * Check whether this filesystem can be mounted based on
  * the features present and the RDONLY/RDWR mount requested.
@@ -3403,10 +3071,10 @@ int ext4_calculate_overhead(struct super_block *sb)
 	return 0;
 }
 
-
-static ext4_fsblk_t ext4_calculate_resv_clusters(struct super_block *sb)
+static void ext4_set_resv_clusters(struct super_block *sb)
 {
 	ext4_fsblk_t resv_clusters;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
 	/*
 	 * There's no need to reserve anything when we aren't using extents.
@@ -3415,7 +3083,7 @@ static ext4_fsblk_t ext4_calculate_resv_clusters(struct super_block *sb)
 	 * to keep ext2/3 backward compatibility.
 	 */
 	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
-		return 0;
+		return;
 	/*
 	 * By default we reserve 2% or 4096 clusters, whichever is smaller.
 	 * This should cover the situations where we can not afford to run
@@ -3424,26 +3092,13 @@ static ext4_fsblk_t ext4_calculate_resv_clusters(struct super_block *sb)
 	 * allocation would require 1, or 2 blocks, higher numbers are
 	 * very rare.
 	 */
-	resv_clusters = ext4_blocks_count(EXT4_SB(sb)->s_es) >>
-			EXT4_SB(sb)->s_cluster_bits;
+	resv_clusters = (ext4_blocks_count(sbi->s_es) >>
+			 sbi->s_cluster_bits);
 
 	do_div(resv_clusters, 50);
 	resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096);
 
-	return resv_clusters;
-}
-
-
-static int ext4_reserve_clusters(struct ext4_sb_info *sbi, ext4_fsblk_t count)
-{
-	ext4_fsblk_t clusters = ext4_blocks_count(sbi->s_es) >>
-				sbi->s_cluster_bits;
-
-	if (count >= clusters)
-		return -EINVAL;
-
-	atomic64_set(&sbi->s_resv_clusters, count);
-	return 0;
+	atomic64_set(&sbi->s_resv_clusters, resv_clusters);
 }
 
 static int ext4_fill_super(struct super_block *sb, void *data, int silent)
@@ -4192,12 +3847,7 @@ no_journal:
 			 "available");
 	}
 
-	err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sb));
-	if (err) {
-		ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for "
-			 "reserved pool", ext4_calculate_resv_clusters(sb));
-		goto failed_mount4a;
-	}
+	ext4_set_resv_clusters(sb);
 
 	err = ext4_setup_system_zone(sb);
 	if (err) {
@@ -4248,10 +3898,7 @@ no_journal:
 	if (err)
 		goto failed_mount6;
 
-	sbi->s_kobj.kset = ext4_kset;
-	init_completion(&sbi->s_kobj_unregister);
-	err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL,
-				   "%s", sb->s_id);
+	err = ext4_register_sysfs(sb);
 	if (err)
 		goto failed_mount7;
 
@@ -5586,37 +5233,6 @@ static struct file_system_type ext4_fs_type = {
 };
 MODULE_ALIAS_FS("ext4");
 
-static int __init ext4_init_feat_adverts(void)
-{
-	struct ext4_features *ef;
-	int ret = -ENOMEM;
-
-	ef = kzalloc(sizeof(struct ext4_features), GFP_KERNEL);
-	if (!ef)
-		goto out;
-
-	ef->f_kobj.kset = ext4_kset;
-	init_completion(&ef->f_kobj_unregister);
-	ret = kobject_init_and_add(&ef->f_kobj, &ext4_feat_ktype, NULL,
-				   "features");
-	if (ret) {
-		kfree(ef);
-		goto out;
-	}
-
-	ext4_feat = ef;
-	ret = 0;
-out:
-	return ret;
-}
-
-static void ext4_exit_feat_adverts(void)
-{
-	kobject_put(&ext4_feat->f_kobj);
-	wait_for_completion(&ext4_feat->f_kobj_unregister);
-	kfree(ext4_feat);
-}
-
 /* Shared across all ext4 file systems */
 wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
 struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
@@ -5643,21 +5259,16 @@ static int __init ext4_init_fs(void)
 
 	err = ext4_init_pageio();
 	if (err)
-		goto out7;
+		goto out5;
 
 	err = ext4_init_system_zone();
 	if (err)
-		goto out6;
-	ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
-	if (!ext4_kset) {
-		err = -ENOMEM;
-		goto out5;
-	}
+		goto out4;
 	ext4_proc_root = proc_mkdir("fs/ext4", NULL);
 
-	err = ext4_init_feat_adverts();
+	err = ext4_init_sysfs();
 	if (err)
-		goto out4;
+		goto out3;
 
 	err = ext4_init_mballoc();
 	if (err)
@@ -5682,16 +5293,14 @@ out1:
 	ext4_mballoc_ready = 0;
 	ext4_exit_mballoc();
 out2:
-	ext4_exit_feat_adverts();
-out4:
+	ext4_exit_sysfs();
+out3:
 	if (ext4_proc_root)
 		remove_proc_entry("fs/ext4", NULL);
-	kset_unregister(ext4_kset);
-out5:
 	ext4_exit_system_zone();
-out6:
+out4:
 	ext4_exit_pageio();
-out7:
+out5:
 	ext4_exit_es();
 
 	return err;
@@ -5706,9 +5315,8 @@ static void __exit ext4_exit_fs(void)
 	unregister_filesystem(&ext4_fs_type);
 	destroy_inodecache();
 	ext4_exit_mballoc();
-	ext4_exit_feat_adverts();
+	ext4_exit_sysfs();
 	remove_proc_entry("fs/ext4", NULL);
-	kset_unregister(ext4_kset);
 	ext4_exit_system_zone();
 	ext4_exit_pageio();
 	ext4_exit_es();
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
new file mode 100644
index 0000000..9fdc21b
--- /dev/null
+++ b/fs/ext4/sysfs.c
@@ -0,0 +1,387 @@
+/*
+ *  linux/fs/ext4/sysfs.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Theodore Ts'o (tytso@mit.edu)
+ *
+ */
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+
+#include "ext4.h"
+#include "ext4_jbd2.h"
+
+struct ext4_attr {
+	struct attribute attr;
+	ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
+	ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
+			 const char *, size_t);
+	union {
+		int offset;
+		int deprecated_val;
+	} u;
+};
+
+static int parse_strtoull(const char *buf,
+		unsigned long long max, unsigned long long *value)
+{
+	int ret;
+
+	ret = kstrtoull(skip_spaces(buf), 0, value);
+	if (!ret && *value > max)
+		ret = -EINVAL;
+	return ret;
+}
+
+static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a,
+					      struct ext4_sb_info *sbi,
+					      char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%llu\n",
+		(s64) EXT4_C2B(sbi,
+			percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
+}
+
+static ssize_t session_write_kbytes_show(struct ext4_attr *a,
+					 struct ext4_sb_info *sbi, char *buf)
+{
+	struct super_block *sb = sbi->s_buddy_cache->i_sb;
+
+	if (!sb->s_bdev->bd_part)
+		return snprintf(buf, PAGE_SIZE, "0\n");
+	return snprintf(buf, PAGE_SIZE, "%lu\n",
+			(part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+			 sbi->s_sectors_written_start) >> 1);
+}
+
+static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
+					  struct ext4_sb_info *sbi, char *buf)
+{
+	struct super_block *sb = sbi->s_buddy_cache->i_sb;
+
+	if (!sb->s_bdev->bd_part)
+		return snprintf(buf, PAGE_SIZE, "0\n");
+	return snprintf(buf, PAGE_SIZE, "%llu\n",
+			(unsigned long long)(sbi->s_kbytes_written +
+			((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+			  EXT4_SB(sb)->s_sectors_written_start) >> 1)));
+}
+
+static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
+					  struct ext4_sb_info *sbi,
+					  const char *buf, size_t count)
+{
+	unsigned long t;
+	int ret;
+
+	ret = kstrtoul(skip_spaces(buf), 0, &t);
+	if (ret)
+		return ret;
+
+	if (t && (!is_power_of_2(t) || t > 0x40000000))
+		return -EINVAL;
+
+	sbi->s_inode_readahead_blks = t;
+	return count;
+}
+
+static ssize_t sbi_ui_show(struct ext4_attr *a,
+			   struct ext4_sb_info *sbi, char *buf)
+{
+	unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
+
+	return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
+}
+
+static ssize_t sbi_ui_store(struct ext4_attr *a,
+			    struct ext4_sb_info *sbi,
+			    const char *buf, size_t count)
+{
+	unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
+	unsigned long t;
+	int ret;
+
+	ret = kstrtoul(skip_spaces(buf), 0, &t);
+	if (ret)
+		return ret;
+	*ui = t;
+	return count;
+}
+
+static ssize_t es_ui_show(struct ext4_attr *a,
+			   struct ext4_sb_info *sbi, char *buf)
+{
+
+	unsigned int *ui = (unsigned int *) (((char *) sbi->s_es) +
+			   a->u.offset);
+
+	return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
+}
+
+static ssize_t reserved_clusters_show(struct ext4_attr *a,
+				  struct ext4_sb_info *sbi, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%llu\n",
+		(unsigned long long) atomic64_read(&sbi->s_resv_clusters));
+}
+
+static ssize_t reserved_clusters_store(struct ext4_attr *a,
+				   struct ext4_sb_info *sbi,
+				   const char *buf, size_t count)
+{
+	unsigned long long val;
+	ext4_fsblk_t clusters = (ext4_blocks_count(sbi->s_es) >>
+				 sbi->s_cluster_bits);
+
+	if (parse_strtoull(buf, -1ULL, &val))
+		return -EINVAL;
+
+	if (val >= clusters)
+		return -EINVAL;
+
+	atomic64_set(&sbi->s_resv_clusters, val);
+	return count;
+}
+
+static ssize_t trigger_test_error(struct ext4_attr *a,
+				  struct ext4_sb_info *sbi,
+				  const char *buf, size_t count)
+{
+	int len = count;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (len && buf[len-1] == '\n')
+		len--;
+
+	if (len)
+		ext4_error(sbi->s_sb, "%.*s", len, buf);
+	return count;
+}
+
+static ssize_t sbi_deprecated_show(struct ext4_attr *a,
+				   struct ext4_sb_info *sbi, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%d\n", a->u.deprecated_val);
+}
+
+#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
+static struct ext4_attr ext4_attr_##_name = {			\
+	.attr = {.name = __stringify(_name), .mode = _mode },	\
+	.show	= _show,					\
+	.store	= _store,					\
+	.u = {							\
+		.offset = offsetof(struct ext4_sb_info, _elname),\
+	},							\
+}
+
+#define EXT4_ATTR_OFFSET_ES(_name,_mode,_show,_store,_elname)		\
+static struct ext4_attr ext4_attr_##_name = {				\
+	.attr = {.name = __stringify(_name), .mode = _mode },		\
+	.show	= _show,						\
+	.store	= _store,						\
+	.u = {								\
+		.offset = offsetof(struct ext4_super_block, _elname),	\
+	},								\
+}
+
+#define EXT4_ATTR(name, mode, show, store) \
+static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
+
+#define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL)
+#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL)
+#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store)
+
+#define EXT4_RO_ATTR_ES_UI(name, elname)	\
+	EXT4_ATTR_OFFSET_ES(name, 0444, es_ui_show, NULL, elname)
+#define EXT4_RW_ATTR_SBI_UI(name, elname)	\
+	EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname)
+
+#define ATTR_LIST(name) &ext4_attr_##name.attr
+#define EXT4_DEPRECATED_ATTR(_name, _val)	\
+static struct ext4_attr ext4_attr_##_name = {			\
+	.attr = {.name = __stringify(_name), .mode = 0444 },	\
+	.show	= sbi_deprecated_show,				\
+	.u = {							\
+		.deprecated_val = _val,				\
+	},							\
+}
+
+EXT4_RO_ATTR(delayed_allocation_blocks);
+EXT4_RO_ATTR(session_write_kbytes);
+EXT4_RO_ATTR(lifetime_write_kbytes);
+EXT4_RW_ATTR(reserved_clusters);
+EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
+		 inode_readahead_blks_store, s_inode_readahead_blks);
+EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
+EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
+EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
+EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
+EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
+EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
+EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
+EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128);
+EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
+EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
+EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
+EXT4_RW_ATTR_SBI_UI(err_ratelimit_burst, s_err_ratelimit_state.burst);
+EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval);
+EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
+EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
+EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
+EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
+EXT4_RO_ATTR_ES_UI(first_error_time, s_first_error_time);
+EXT4_RO_ATTR_ES_UI(last_error_time, s_last_error_time);
+
+static struct attribute *ext4_attrs[] = {
+	ATTR_LIST(delayed_allocation_blocks),
+	ATTR_LIST(session_write_kbytes),
+	ATTR_LIST(lifetime_write_kbytes),
+	ATTR_LIST(reserved_clusters),
+	ATTR_LIST(inode_readahead_blks),
+	ATTR_LIST(inode_goal),
+	ATTR_LIST(mb_stats),
+	ATTR_LIST(mb_max_to_scan),
+	ATTR_LIST(mb_min_to_scan),
+	ATTR_LIST(mb_order2_req),
+	ATTR_LIST(mb_stream_req),
+	ATTR_LIST(mb_group_prealloc),
+	ATTR_LIST(max_writeback_mb_bump),
+	ATTR_LIST(extent_max_zeroout_kb),
+	ATTR_LIST(trigger_fs_error),
+	ATTR_LIST(err_ratelimit_interval_ms),
+	ATTR_LIST(err_ratelimit_burst),
+	ATTR_LIST(warning_ratelimit_interval_ms),
+	ATTR_LIST(warning_ratelimit_burst),
+	ATTR_LIST(msg_ratelimit_interval_ms),
+	ATTR_LIST(msg_ratelimit_burst),
+	ATTR_LIST(errors_count),
+	ATTR_LIST(first_error_time),
+	ATTR_LIST(last_error_time),
+	NULL,
+};
+
+/* Features this copy of ext4 supports */
+EXT4_INFO_ATTR(lazy_itable_init);
+EXT4_INFO_ATTR(batched_discard);
+EXT4_INFO_ATTR(meta_bg_resize);
+EXT4_INFO_ATTR(encryption);
+
+static struct attribute *ext4_feat_attrs[] = {
+	ATTR_LIST(lazy_itable_init),
+	ATTR_LIST(batched_discard),
+	ATTR_LIST(meta_bg_resize),
+	ATTR_LIST(encryption),
+	NULL,
+};
+
+static ssize_t ext4_attr_show(struct kobject *kobj,
+			      struct attribute *attr, char *buf)
+{
+	struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
+						s_kobj);
+	struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
+
+	return a->show ? a->show(a, sbi, buf) : 0;
+}
+
+static ssize_t ext4_attr_store(struct kobject *kobj,
+			       struct attribute *attr,
+			       const char *buf, size_t len)
+{
+	struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
+						s_kobj);
+	struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
+
+	return a->store ? a->store(a, sbi, buf, len) : 0;
+}
+
+static void ext4_sb_release(struct kobject *kobj)
+{
+	struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
+						s_kobj);
+	complete(&sbi->s_kobj_unregister);
+}
+
+static const struct sysfs_ops ext4_attr_ops = {
+	.show	= ext4_attr_show,
+	.store	= ext4_attr_store,
+};
+
+static struct kobj_type ext4_sb_ktype = {
+	.default_attrs	= ext4_attrs,
+	.sysfs_ops	= &ext4_attr_ops,
+	.release	= ext4_sb_release,
+};
+
+static struct kobj_type ext4_ktype = {
+	.sysfs_ops	= &ext4_attr_ops,
+};
+
+static struct kset ext4_kset = {
+	.kobj   = {.ktype = &ext4_ktype},
+};
+
+static ssize_t ext4_feat_show(struct kobject *kobj,
+			      struct attribute *attr, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "supported\n");
+}
+
+/*
+ * We can not use ext4_attr_show/store because it relies on the kobject
+ * being embedded in the ext4_sb_info structure which is definitely not
+ * true in this case.
+ */
+static const struct sysfs_ops ext4_feat_ops = {
+	.show	= ext4_feat_show,
+	.store	= NULL,
+};
+
+static struct kobj_type ext4_feat_ktype = {
+	.default_attrs	= ext4_feat_attrs,
+	.sysfs_ops	= &ext4_feat_ops,
+};
+
+static struct kobject ext4_feat = {
+	.kset	= &ext4_kset,
+};
+
+int ext4_register_sysfs(struct super_block *sb)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	sbi->s_kobj.kset = &ext4_kset;
+	init_completion(&sbi->s_kobj_unregister);
+	return kobject_init_and_add(&sbi->s_kobj, &ext4_sb_ktype, NULL,
+				    "%s", sb->s_id);
+}
+
+int __init ext4_init_sysfs(void)
+{
+	int ret;
+
+	kobject_set_name(&ext4_kset.kobj, "ext4");
+	ext4_kset.kobj.parent = fs_kobj;
+	ret = kset_register(&ext4_kset);
+	if (ret)
+		return ret;
+
+	ret = kobject_init_and_add(&ext4_feat, &ext4_feat_ktype,
+				   NULL, "features");
+	if (ret)
+		kset_unregister(&ext4_kset);
+	return ret;
+}
+
+void ext4_exit_sysfs(void)
+{
+	kobject_put(&ext4_feat);
+	kset_unregister(&ext4_kset);
+}
+
-- 
cgit v1.1


From 76d33bca5581b1dd5c3157fa168db849a784ada4 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 23 Sep 2015 12:45:17 -0400
Subject: ext4: refactor sysfs support code

Make the code more easily extensible as well as taking up less
compiled space.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/sysfs.c | 261 +++++++++++++++++++++++++++-----------------------------
 1 file changed, 128 insertions(+), 133 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 9fdc21b..87ac089 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -15,37 +15,35 @@
 #include "ext4.h"
 #include "ext4_jbd2.h"
 
+typedef enum {
+	attr_noop,
+	attr_delayed_allocation_blocks,
+	attr_session_write_kbytes,
+	attr_lifetime_write_kbytes,
+	attr_reserved_clusters,
+	attr_inode_readahead,
+	attr_trigger_test_error,
+	attr_feature,
+	attr_pointer_ui,
+	attr_pointer_atomic,
+} attr_id_t;
+
+typedef enum {
+	ptr_explicit,
+	ptr_ext4_sb_info_offset,
+	ptr_ext4_super_block_offset,
+} attr_ptr_t;
+
 struct ext4_attr {
 	struct attribute attr;
-	ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
-	ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
-			 const char *, size_t);
+	short attr_id;
+	short attr_ptr;
 	union {
 		int offset;
-		int deprecated_val;
+		void *explicit_ptr;
 	} u;
 };
 
-static int parse_strtoull(const char *buf,
-		unsigned long long max, unsigned long long *value)
-{
-	int ret;
-
-	ret = kstrtoull(skip_spaces(buf), 0, value);
-	if (!ret && *value > max)
-		ret = -EINVAL;
-	return ret;
-}
-
-static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a,
-					      struct ext4_sb_info *sbi,
-					      char *buf)
-{
-	return snprintf(buf, PAGE_SIZE, "%llu\n",
-		(s64) EXT4_C2B(sbi,
-			percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
-}
-
 static ssize_t session_write_kbytes_show(struct ext4_attr *a,
 					 struct ext4_sb_info *sbi, char *buf)
 {
@@ -89,46 +87,6 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
 	return count;
 }
 
-static ssize_t sbi_ui_show(struct ext4_attr *a,
-			   struct ext4_sb_info *sbi, char *buf)
-{
-	unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
-
-	return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
-}
-
-static ssize_t sbi_ui_store(struct ext4_attr *a,
-			    struct ext4_sb_info *sbi,
-			    const char *buf, size_t count)
-{
-	unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
-	unsigned long t;
-	int ret;
-
-	ret = kstrtoul(skip_spaces(buf), 0, &t);
-	if (ret)
-		return ret;
-	*ui = t;
-	return count;
-}
-
-static ssize_t es_ui_show(struct ext4_attr *a,
-			   struct ext4_sb_info *sbi, char *buf)
-{
-
-	unsigned int *ui = (unsigned int *) (((char *) sbi->s_es) +
-			   a->u.offset);
-
-	return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
-}
-
-static ssize_t reserved_clusters_show(struct ext4_attr *a,
-				  struct ext4_sb_info *sbi, char *buf)
-{
-	return snprintf(buf, PAGE_SIZE, "%llu\n",
-		(unsigned long long) atomic64_read(&sbi->s_resv_clusters));
-}
-
 static ssize_t reserved_clusters_store(struct ext4_attr *a,
 				   struct ext4_sb_info *sbi,
 				   const char *buf, size_t count)
@@ -136,11 +94,10 @@ static ssize_t reserved_clusters_store(struct ext4_attr *a,
 	unsigned long long val;
 	ext4_fsblk_t clusters = (ext4_blocks_count(sbi->s_es) >>
 				 sbi->s_cluster_bits);
+	int ret;
 
-	if (parse_strtoull(buf, -1ULL, &val))
-		return -EINVAL;
-
-	if (val >= clusters)
+	ret = kstrtoull(skip_spaces(buf), 0, &val);
+	if (!ret || val >= clusters)
 		return -EINVAL;
 
 	atomic64_set(&sbi->s_resv_clusters, val);
@@ -164,60 +121,51 @@ static ssize_t trigger_test_error(struct ext4_attr *a,
 	return count;
 }
 
-static ssize_t sbi_deprecated_show(struct ext4_attr *a,
-				   struct ext4_sb_info *sbi, char *buf)
-{
-	return snprintf(buf, PAGE_SIZE, "%d\n", a->u.deprecated_val);
+#define EXT4_ATTR(_name,_mode,_id)					\
+static struct ext4_attr ext4_attr_##_name = {				\
+	.attr = {.name = __stringify(_name), .mode = _mode },		\
+	.attr_id = attr_##_id,						\
 }
 
-#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
+#define EXT4_ATTR_FUNC(_name,_mode)  EXT4_ATTR(_name,_mode,_name)
+
+#define EXT4_ATTR_FEATURE(_name)   EXT4_ATTR(_name, 0444, feature)
+
+#define EXT4_ATTR_OFFSET(_name,_mode,_id,_struct,_elname)	\
 static struct ext4_attr ext4_attr_##_name = {			\
 	.attr = {.name = __stringify(_name), .mode = _mode },	\
-	.show	= _show,					\
-	.store	= _store,					\
+	.attr_id = attr_##_id,					\
+	.attr_ptr = ptr_##_struct##_offset,			\
 	.u = {							\
-		.offset = offsetof(struct ext4_sb_info, _elname),\
+		.offset = offsetof(struct _struct, _elname),\
 	},							\
 }
 
-#define EXT4_ATTR_OFFSET_ES(_name,_mode,_show,_store,_elname)		\
-static struct ext4_attr ext4_attr_##_name = {				\
-	.attr = {.name = __stringify(_name), .mode = _mode },		\
-	.show	= _show,						\
-	.store	= _store,						\
-	.u = {								\
-		.offset = offsetof(struct ext4_super_block, _elname),	\
-	},								\
-}
-
-#define EXT4_ATTR(name, mode, show, store) \
-static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
-
-#define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL)
-#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL)
-#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store)
+#define EXT4_RO_ATTR_ES_UI(_name,_elname)				\
+	EXT4_ATTR_OFFSET(_name, 0444, pointer_ui, ext4_super_block, _elname)
 
-#define EXT4_RO_ATTR_ES_UI(name, elname)	\
-	EXT4_ATTR_OFFSET_ES(name, 0444, es_ui_show, NULL, elname)
-#define EXT4_RW_ATTR_SBI_UI(name, elname)	\
-	EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname)
+#define EXT4_RW_ATTR_SBI_UI(_name,_elname)	\
+	EXT4_ATTR_OFFSET(_name, 0644, pointer_ui, ext4_sb_info, _elname)
 
-#define ATTR_LIST(name) &ext4_attr_##name.attr
-#define EXT4_DEPRECATED_ATTR(_name, _val)	\
+#define EXT4_ATTR_PTR(_name,_mode,_id,_ptr) \
 static struct ext4_attr ext4_attr_##_name = {			\
-	.attr = {.name = __stringify(_name), .mode = 0444 },	\
-	.show	= sbi_deprecated_show,				\
+	.attr = {.name = __stringify(_name), .mode = _mode },	\
+	.attr_id = attr_##_id,					\
+	.attr_ptr = ptr_explicit,				\
 	.u = {							\
-		.deprecated_val = _val,				\
+		.explicit_ptr = _ptr,				\
 	},							\
 }
 
-EXT4_RO_ATTR(delayed_allocation_blocks);
-EXT4_RO_ATTR(session_write_kbytes);
-EXT4_RO_ATTR(lifetime_write_kbytes);
-EXT4_RW_ATTR(reserved_clusters);
-EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
-		 inode_readahead_blks_store, s_inode_readahead_blks);
+#define ATTR_LIST(name) &ext4_attr_##name.attr
+
+EXT4_ATTR_FUNC(delayed_allocation_blocks, 0444);
+EXT4_ATTR_FUNC(session_write_kbytes, 0444);
+EXT4_ATTR_FUNC(lifetime_write_kbytes, 0444);
+EXT4_ATTR_FUNC(reserved_clusters, 0644);
+
+EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead,
+		 ext4_sb_info, s_inode_readahead_blks);
 EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
 EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
 EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
@@ -225,9 +173,8 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
 EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
 EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
 EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
-EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128);
 EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
-EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
+EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error);
 EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
 EXT4_RW_ATTR_SBI_UI(err_ratelimit_burst, s_err_ratelimit_state.burst);
 EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval);
@@ -238,6 +185,9 @@ EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
 EXT4_RO_ATTR_ES_UI(first_error_time, s_first_error_time);
 EXT4_RO_ATTR_ES_UI(last_error_time, s_last_error_time);
 
+static unsigned int old_bump_val = 128;
+EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
+
 static struct attribute *ext4_attrs[] = {
 	ATTR_LIST(delayed_allocation_blocks),
 	ATTR_LIST(session_write_kbytes),
@@ -267,10 +217,10 @@ static struct attribute *ext4_attrs[] = {
 };
 
 /* Features this copy of ext4 supports */
-EXT4_INFO_ATTR(lazy_itable_init);
-EXT4_INFO_ATTR(batched_discard);
-EXT4_INFO_ATTR(meta_bg_resize);
-EXT4_INFO_ATTR(encryption);
+EXT4_ATTR_FEATURE(lazy_itable_init);
+EXT4_ATTR_FEATURE(batched_discard);
+EXT4_ATTR_FEATURE(meta_bg_resize);
+EXT4_ATTR_FEATURE(encryption);
 
 static struct attribute *ext4_feat_attrs[] = {
 	ATTR_LIST(lazy_itable_init),
@@ -280,14 +230,56 @@ static struct attribute *ext4_feat_attrs[] = {
 	NULL,
 };
 
+static void *calc_ptr(struct ext4_attr *a, struct ext4_sb_info *sbi)
+{
+	switch (a->attr_ptr) {
+	case ptr_explicit:
+		return a->u.explicit_ptr;
+	case ptr_ext4_sb_info_offset:
+		return (void *) (((char *) sbi) + a->u.offset);
+	case ptr_ext4_super_block_offset:
+		return (void *) (((char *) sbi->s_es) + a->u.offset);
+	}
+	return NULL;
+}
+
 static ssize_t ext4_attr_show(struct kobject *kobj,
 			      struct attribute *attr, char *buf)
 {
 	struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
 						s_kobj);
 	struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
-
-	return a->show ? a->show(a, sbi, buf) : 0;
+	void *ptr = calc_ptr(a, sbi);
+
+	switch (a->attr_id) {
+	case attr_delayed_allocation_blocks:
+		return snprintf(buf, PAGE_SIZE, "%llu\n",
+				(s64) EXT4_C2B(sbi,
+		       percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
+	case attr_session_write_kbytes:
+		return session_write_kbytes_show(a, sbi, buf);
+	case attr_lifetime_write_kbytes:
+		return lifetime_write_kbytes_show(a, sbi, buf);
+	case attr_reserved_clusters:
+		return snprintf(buf, PAGE_SIZE, "%llu\n",
+				(unsigned long long)
+				atomic64_read(&sbi->s_resv_clusters));
+	case attr_inode_readahead:
+	case attr_pointer_ui:
+		if (!ptr)
+			return 0;
+		return snprintf(buf, PAGE_SIZE, "%u\n",
+				*((unsigned int *) ptr));
+	case attr_pointer_atomic:
+		if (!ptr)
+			return 0;
+		return snprintf(buf, PAGE_SIZE, "%d\n",
+				atomic_read((atomic_t *) ptr));
+	case attr_feature:
+		return snprintf(buf, PAGE_SIZE, "supported\n");
+	}
+
+	return 0;
 }
 
 static ssize_t ext4_attr_store(struct kobject *kobj,
@@ -297,8 +289,27 @@ static ssize_t ext4_attr_store(struct kobject *kobj,
 	struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
 						s_kobj);
 	struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
+	void *ptr = calc_ptr(a, sbi);
+	unsigned long t;
+	int ret;
 
-	return a->store ? a->store(a, sbi, buf, len) : 0;
+	switch (a->attr_id) {
+	case attr_reserved_clusters:
+		return reserved_clusters_store(a, sbi, buf, len);
+	case attr_pointer_ui:
+		if (!ptr)
+			return 0;
+		ret = kstrtoul(skip_spaces(buf), 0, &t);
+		if (ret)
+			return ret;
+		*((unsigned int *) ptr) = t;
+		return len;
+	case attr_inode_readahead:
+		return inode_readahead_blks_store(a, sbi, buf, len);
+	case attr_trigger_test_error:
+		return trigger_test_error(a, sbi, buf, len);
+	}
+	return 0;
 }
 
 static void ext4_sb_release(struct kobject *kobj)
@@ -327,25 +338,9 @@ static struct kset ext4_kset = {
 	.kobj   = {.ktype = &ext4_ktype},
 };
 
-static ssize_t ext4_feat_show(struct kobject *kobj,
-			      struct attribute *attr, char *buf)
-{
-	return snprintf(buf, PAGE_SIZE, "supported\n");
-}
-
-/*
- * We can not use ext4_attr_show/store because it relies on the kobject
- * being embedded in the ext4_sb_info structure which is definitely not
- * true in this case.
- */
-static const struct sysfs_ops ext4_feat_ops = {
-	.show	= ext4_feat_show,
-	.store	= NULL,
-};
-
 static struct kobj_type ext4_feat_ktype = {
 	.default_attrs	= ext4_feat_attrs,
-	.sysfs_ops	= &ext4_feat_ops,
+	.sysfs_ops	= &ext4_attr_ops,
 };
 
 static struct kobject ext4_feat = {
-- 
cgit v1.1


From ebd173beb8db5b8b315fa1c5bbac86c54059397a Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 23 Sep 2015 12:46:17 -0400
Subject: ext4: move procfs registration code to fs/ext4/sysfs.c

This allows us to refactor the procfs code, which saves a bit of
compiled space.  More importantly it isolates most of the procfs
support code into a single file, so it's easier to #ifdef it out if
the proc file system has been disabled.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4.h           |  3 +++
 fs/ext4/extents_status.c | 60 ++----------------------------------------
 fs/ext4/extents_status.h |  2 ++
 fs/ext4/mballoc.c        |  9 +------
 fs/ext4/super.c          | 41 +++--------------------------
 fs/ext4/sysfs.c          | 68 ++++++++++++++++++++++++++++++++++++++++++++++--
 6 files changed, 77 insertions(+), 106 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 76daccf..766b7f7 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2245,6 +2245,7 @@ extern int ext4_init_inode_table(struct super_block *sb,
 extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate);
 
 /* mballoc.c */
+extern const struct file_operations ext4_seq_mb_groups_fops;
 extern long ext4_mb_stats;
 extern long ext4_mb_max_to_scan;
 extern int ext4_mb_init(struct super_block *);
@@ -2372,6 +2373,7 @@ extern int ext4_group_extend(struct super_block *sb,
 extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count);
 
 /* super.c */
+extern int ext4_seq_options_show(struct seq_file *seq, void *offset);
 extern int ext4_calculate_overhead(struct super_block *sb);
 extern void ext4_superblock_csum_set(struct super_block *sb);
 extern void *ext4_kvmalloc(size_t size, gfp_t flags);
@@ -2905,6 +2907,7 @@ extern const struct inode_operations ext4_fast_symlink_inode_operations;
 
 /* sysfs.c */
 extern int ext4_register_sysfs(struct super_block *sb);
+extern void ext4_unregister_sysfs(struct super_block *sb);
 extern int __init ext4_init_sysfs(void);
 extern void ext4_exit_sysfs(void);
 
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 26724ae..ac748b3 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -1089,20 +1089,9 @@ static unsigned long ext4_es_scan(struct shrinker *shrink,
 	return nr_shrunk;
 }
 
-static void *ext4_es_seq_shrinker_info_start(struct seq_file *seq, loff_t *pos)
+int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v)
 {
-	return *pos ? NULL : SEQ_START_TOKEN;
-}
-
-static void *
-ext4_es_seq_shrinker_info_next(struct seq_file *seq, void *v, loff_t *pos)
-{
-	return NULL;
-}
-
-static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v)
-{
-	struct ext4_sb_info *sbi = seq->private;
+	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *) seq->private);
 	struct ext4_es_stats *es_stats = &sbi->s_es_stats;
 	struct ext4_inode_info *ei, *max = NULL;
 	unsigned int inode_cnt = 0;
@@ -1143,45 +1132,6 @@ static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static void ext4_es_seq_shrinker_info_stop(struct seq_file *seq, void *v)
-{
-}
-
-static const struct seq_operations ext4_es_seq_shrinker_info_ops = {
-	.start = ext4_es_seq_shrinker_info_start,
-	.next  = ext4_es_seq_shrinker_info_next,
-	.stop  = ext4_es_seq_shrinker_info_stop,
-	.show  = ext4_es_seq_shrinker_info_show,
-};
-
-static int
-ext4_es_seq_shrinker_info_open(struct inode *inode, struct file *file)
-{
-	int ret;
-
-	ret = seq_open(file, &ext4_es_seq_shrinker_info_ops);
-	if (!ret) {
-		struct seq_file *m = file->private_data;
-		m->private = PDE_DATA(inode);
-	}
-
-	return ret;
-}
-
-static int
-ext4_es_seq_shrinker_info_release(struct inode *inode, struct file *file)
-{
-	return seq_release(inode, file);
-}
-
-static const struct file_operations ext4_es_seq_shrinker_info_fops = {
-	.owner		= THIS_MODULE,
-	.open		= ext4_es_seq_shrinker_info_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= ext4_es_seq_shrinker_info_release,
-};
-
 int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
 {
 	int err;
@@ -1210,10 +1160,6 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
 	if (err)
 		goto err2;
 
-	if (sbi->s_proc)
-		proc_create_data("es_shrinker_info", S_IRUGO, sbi->s_proc,
-				 &ext4_es_seq_shrinker_info_fops, sbi);
-
 	return 0;
 
 err2:
@@ -1225,8 +1171,6 @@ err1:
 
 void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
 {
-	if (sbi->s_proc)
-		remove_proc_entry("es_shrinker_info", sbi->s_proc);
 	percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
 	percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
 	unregister_shrinker(&sbi->s_es_shrinker);
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 691b526..f7aa24f 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -172,4 +172,6 @@ static inline void ext4_es_store_pblock_status(struct extent_status *es,
 extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi);
 extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
 
+extern int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v);
+
 #endif /* _EXT4_EXTENTS_STATUS_H */
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 34b610e..b0f7ee5 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2333,7 +2333,7 @@ static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
 
 }
 
-static const struct file_operations ext4_mb_seq_groups_fops = {
+const struct file_operations ext4_seq_mb_groups_fops = {
 	.owner		= THIS_MODULE,
 	.open		= ext4_mb_seq_groups_open,
 	.read		= seq_read,
@@ -2661,10 +2661,6 @@ int ext4_mb_init(struct super_block *sb)
 	if (ret != 0)
 		goto out_free_locality_groups;
 
-	if (sbi->s_proc)
-		proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
-				 &ext4_mb_seq_groups_fops, sb);
-
 	return 0;
 
 out_free_locality_groups:
@@ -2705,9 +2701,6 @@ int ext4_mb_release(struct super_block *sb)
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
 
-	if (sbi->s_proc)
-		remove_proc_entry("mb_groups", sbi->s_proc);
-
 	if (sbi->s_group_info) {
 		for (i = 0; i < ngroups; i++) {
 			grinfo = ext4_get_group_info(sb, i);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 4a574fb..7ef3fa5 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -34,7 +34,6 @@
 #include <linux/namei.h>
 #include <linux/quotaops.h>
 #include <linux/seq_file.h>
-#include <linux/proc_fs.h>
 #include <linux/ctype.h>
 #include <linux/log2.h>
 #include <linux/crc16.h>
@@ -54,7 +53,6 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/ext4.h>
 
-static struct proc_dir_entry *ext4_proc_root;
 static struct ext4_lazy_init *ext4_li_info;
 static struct mutex ext4_li_mtx;
 static int ext4_mballoc_ready;
@@ -797,6 +795,7 @@ static void ext4_put_super(struct super_block *sb)
 			ext4_abort(sb, "Couldn't clean up the journal");
 	}
 
+	ext4_unregister_sysfs(sb);
 	ext4_es_unregister_shrinker(sbi);
 	del_timer_sync(&sbi->s_err_report);
 	ext4_release_system_zone(sb);
@@ -811,12 +810,6 @@ static void ext4_put_super(struct super_block *sb)
 	if (!(sb->s_flags & MS_RDONLY))
 		ext4_commit_super(sb, 1);
 
-	if (sbi->s_proc) {
-		remove_proc_entry("options", sbi->s_proc);
-		remove_proc_entry(sb->s_id, ext4_proc_root);
-	}
-	kobject_del(&sbi->s_kobj);
-
 	for (i = 0; i < sbi->s_gdb_count; i++)
 		brelse(sbi->s_group_desc[i]);
 	kvfree(sbi->s_group_desc);
@@ -1877,7 +1870,7 @@ static int ext4_show_options(struct seq_file *seq, struct dentry *root)
 	return _ext4_show_options(seq, root->d_sb, 0);
 }
 
-static int options_seq_show(struct seq_file *seq, void *offset)
+int ext4_seq_options_show(struct seq_file *seq, void *offset)
 {
 	struct super_block *sb = seq->private;
 	int rc;
@@ -1888,19 +1881,6 @@ static int options_seq_show(struct seq_file *seq, void *offset)
 	return rc;
 }
 
-static int options_open_fs(struct inode *inode, struct file *file)
-{
-	return single_open(file, options_seq_show, PDE_DATA(inode));
-}
-
-static const struct file_operations ext4_seq_options_fops = {
-	.owner = THIS_MODULE,
-	.open = options_open_fs,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
-
 static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
 			    int read_only)
 {
@@ -3616,13 +3596,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		goto failed_mount;
 	}
 
-	if (ext4_proc_root)
-		sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
-
-	if (sbi->s_proc)
-		proc_create_data("options", S_IRUGO, sbi->s_proc,
-				 &ext4_seq_options_fops, sb);
-
 	bgl_lock_init(sbi->s_blockgroup_lock);
 
 	for (i = 0; i < db_count; i++) {
@@ -3960,7 +3933,7 @@ cantfind_ext4:
 
 #ifdef CONFIG_QUOTA
 failed_mount8:
-	kobject_del(&sbi->s_kobj);
+	ext4_unregister_sysfs(sb);
 #endif
 failed_mount7:
 	ext4_unregister_li_request(sb);
@@ -4000,10 +3973,6 @@ failed_mount2:
 failed_mount:
 	if (sbi->s_chksum_driver)
 		crypto_free_shash(sbi->s_chksum_driver);
-	if (sbi->s_proc) {
-		remove_proc_entry("options", sbi->s_proc);
-		remove_proc_entry(sb->s_id, ext4_proc_root);
-	}
 #ifdef CONFIG_QUOTA
 	for (i = 0; i < EXT4_MAXQUOTAS; i++)
 		kfree(sbi->s_qf_names[i]);
@@ -5264,7 +5233,6 @@ static int __init ext4_init_fs(void)
 	err = ext4_init_system_zone();
 	if (err)
 		goto out4;
-	ext4_proc_root = proc_mkdir("fs/ext4", NULL);
 
 	err = ext4_init_sysfs();
 	if (err)
@@ -5295,8 +5263,6 @@ out1:
 out2:
 	ext4_exit_sysfs();
 out3:
-	if (ext4_proc_root)
-		remove_proc_entry("fs/ext4", NULL);
 	ext4_exit_system_zone();
 out4:
 	ext4_exit_pageio();
@@ -5316,7 +5282,6 @@ static void __exit ext4_exit_fs(void)
 	destroy_inodecache();
 	ext4_exit_mballoc();
 	ext4_exit_sysfs();
-	remove_proc_entry("fs/ext4", NULL);
 	ext4_exit_system_zone();
 	ext4_exit_pageio();
 	ext4_exit_es();
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 87ac089..62bef0f 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -34,6 +34,9 @@ typedef enum {
 	ptr_ext4_super_block_offset,
 } attr_ptr_t;
 
+static const char *proc_dirname = "fs/ext4";
+static struct proc_dir_entry *ext4_proc_root;
+
 struct ext4_attr {
 	struct attribute attr;
 	short attr_id;
@@ -347,14 +350,71 @@ static struct kobject ext4_feat = {
 	.kset	= &ext4_kset,
 };
 
+#define PROC_FILE_SHOW_DEFN(name) \
+static int name##_open(struct inode *inode, struct file *file) \
+{ \
+	return single_open(file, ext4_seq_##name##_show, PDE_DATA(inode)); \
+} \
+\
+const struct file_operations ext4_seq_##name##_fops = { \
+	.owner		= THIS_MODULE, \
+	.open		= name##_open, \
+	.read		= seq_read, \
+	.llseek		= seq_lseek, \
+	.release	= single_release, \
+}
+
+#define PROC_FILE_LIST(name) \
+	{ __stringify(name), &ext4_seq_##name##_fops }
+
+PROC_FILE_SHOW_DEFN(es_shrinker_info);
+PROC_FILE_SHOW_DEFN(options);
+
+static struct ext4_proc_files {
+	const char *name;
+	const struct file_operations *fops;
+} proc_files[] = {
+	PROC_FILE_LIST(options),
+	PROC_FILE_LIST(es_shrinker_info),
+	PROC_FILE_LIST(mb_groups),
+	{ NULL, NULL },
+};
+
 int ext4_register_sysfs(struct super_block *sb)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_proc_files *p;
+	int err;
 
 	sbi->s_kobj.kset = &ext4_kset;
 	init_completion(&sbi->s_kobj_unregister);
-	return kobject_init_and_add(&sbi->s_kobj, &ext4_sb_ktype, NULL,
-				    "%s", sb->s_id);
+	err = kobject_init_and_add(&sbi->s_kobj, &ext4_sb_ktype, NULL,
+				   "%s", sb->s_id);
+	if (err)
+		return err;
+
+	if (ext4_proc_root)
+		sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
+
+	if (sbi->s_proc) {
+		for (p = proc_files; p->name; p++)
+			proc_create_data(p->name, S_IRUGO, sbi->s_proc,
+					 p->fops, sb);
+	}
+	return 0;
+}
+
+void ext4_unregister_sysfs(struct super_block *sb)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_proc_files *p;
+
+	if (sbi->s_proc) {
+		for (p = proc_files; p->name; p++)
+			remove_proc_entry(p->name, sbi->s_proc);
+		remove_proc_entry(sb->s_id, ext4_proc_root);
+	}
+	kobject_del(&sbi->s_kobj);
 }
 
 int __init ext4_init_sysfs(void)
@@ -371,6 +431,8 @@ int __init ext4_init_sysfs(void)
 				   NULL, "features");
 	if (ret)
 		kset_unregister(&ext4_kset);
+	else
+		ext4_proc_root = proc_mkdir(proc_dirname, NULL);
 	return ret;
 }
 
@@ -378,5 +440,7 @@ void ext4_exit_sysfs(void)
 {
 	kobject_put(&ext4_feat);
 	kset_unregister(&ext4_kset);
+	remove_proc_entry(proc_dirname, NULL);
+	ext4_proc_root = NULL;
 }
 
-- 
cgit v1.1


From 5948edbcbf43759586cdf2656d293ea7de310280 Mon Sep 17 00:00:00 2001
From: Roman Pen <r.peniaev@gmail.com>
Date: Tue, 15 Sep 2015 08:27:25 -0600
Subject: fs/mpage.c: forgotten WRITE_SYNC in case of data integrity write

In case of wbc->sync_mode == WB_SYNC_ALL we need to do data integrity
write, thus mark request as WRITE_SYNC.

akpm: afaict this change will cause the data integrity write bios to be
placed onto the second queue in cfq_io_cq.cfqq[], which presumably results
in special treatment.  The documentation for REQ_SYNC is horrid.

Signed-off-by: Roman Pen <r.peniaev@gmail.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/mpage.c | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/mpage.c b/fs/mpage.c
index 778a4dd..2ebf916 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -482,6 +482,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
 	struct buffer_head map_bh;
 	loff_t i_size = i_size_read(inode);
 	int ret = 0;
+	int wr = (wbc->sync_mode == WB_SYNC_ALL ?  WRITE_SYNC : WRITE);
 
 	if (page_has_buffers(page)) {
 		struct buffer_head *head = page_buffers(page);
@@ -590,7 +591,7 @@ page_is_mapped:
 	 * This page will go to BIO.  Do we need to send this BIO off first?
 	 */
 	if (bio && mpd->last_block_in_bio != blocks[0] - 1)
-		bio = mpage_bio_submit(WRITE, bio);
+		bio = mpage_bio_submit(wr, bio);
 
 alloc_new:
 	if (bio == NULL) {
@@ -617,7 +618,7 @@ alloc_new:
 	wbc_account_io(wbc, page, PAGE_SIZE);
 	length = first_unmapped << blkbits;
 	if (bio_add_page(bio, page, length, 0) < length) {
-		bio = mpage_bio_submit(WRITE, bio);
+		bio = mpage_bio_submit(wr, bio);
 		goto alloc_new;
 	}
 
@@ -627,7 +628,7 @@ alloc_new:
 	set_page_writeback(page);
 	unlock_page(page);
 	if (boundary || (first_unmapped != blocks_per_page)) {
-		bio = mpage_bio_submit(WRITE, bio);
+		bio = mpage_bio_submit(wr, bio);
 		if (boundary_block) {
 			write_boundary_block(boundary_bdev,
 					boundary_block, 1 << blkbits);
@@ -639,7 +640,7 @@ alloc_new:
 
 confused:
 	if (bio)
-		bio = mpage_bio_submit(WRITE, bio);
+		bio = mpage_bio_submit(wr, bio);
 
 	if (mpd->use_writepage) {
 		ret = mapping->a_ops->writepage(page, wbc);
@@ -695,8 +696,11 @@ mpage_writepages(struct address_space *mapping,
 		};
 
 		ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
-		if (mpd.bio)
-			mpage_bio_submit(WRITE, mpd.bio);
+		if (mpd.bio) {
+			int wr = (wbc->sync_mode == WB_SYNC_ALL ?
+				  WRITE_SYNC : WRITE);
+			mpage_bio_submit(wr, mpd.bio);
+		}
 	}
 	blk_finish_plug(&plug);
 	return ret;
@@ -713,8 +717,11 @@ int mpage_writepage(struct page *page, get_block_t get_block,
 		.use_writepage = 0,
 	};
 	int ret = __mpage_writepage(page, wbc, &mpd);
-	if (mpd.bio)
-		mpage_bio_submit(WRITE, mpd.bio);
+	if (mpd.bio) {
+		int wr = (wbc->sync_mode == WB_SYNC_ALL ?
+			  WRITE_SYNC : WRITE);
+		mpage_bio_submit(wr, mpd.bio);
+	}
 	return ret;
 }
 EXPORT_SYMBOL(mpage_writepage);
-- 
cgit v1.1


From 53cbf3b157a0428d40989ab1c7df9228a1976fc2 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@canonical.com>
Date: Mon, 17 Aug 2015 10:31:46 +0800
Subject: fs: direct-io: don't dirtying pages for ITER_BVEC/ITER_KVEC direct
 read

When direct read IO is submitted from kernel, it is often
unnecessary to dirty pages, for example of loop, dirtying pages
have been considered in the upper filesystem(over loop) side
already, and they don't need to be dirtied again.

So this patch doesn't dirtying pages for ITER_BVEC/ITER_KVEC
direct read, and loop should be the 1st case to use ITER_BVEC/ITER_KVEC
for direct read I/O.

The patch is based on previous Dave's patch.

Reviewed-by: Dave Kleikamp <dave.kleikamp@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/direct-io.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 1125629..3ae0e04 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -120,6 +120,7 @@ struct dio {
 	int page_errors;		/* errno from get_user_pages() */
 	int is_async;			/* is IO async ? */
 	bool defer_completion;		/* defer AIO completion to workqueue? */
+	bool should_dirty;		/* if pages should be dirtied */
 	int io_error;			/* IO error in completion path */
 	unsigned long refcount;		/* direct_io_worker() and bios */
 	struct bio *bio_list;		/* singly linked via bi_private */
@@ -393,7 +394,7 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
 	dio->refcount++;
 	spin_unlock_irqrestore(&dio->bio_lock, flags);
 
-	if (dio->is_async && dio->rw == READ)
+	if (dio->is_async && dio->rw == READ && dio->should_dirty)
 		bio_set_pages_dirty(bio);
 
 	if (sdio->submit_io)
@@ -464,14 +465,15 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio)
 	if (bio->bi_error)
 		dio->io_error = -EIO;
 
-	if (dio->is_async && dio->rw == READ) {
+	if (dio->is_async && dio->rw == READ && dio->should_dirty) {
 		bio_check_pages_dirty(bio);	/* transfers ownership */
 		err = bio->bi_error;
 	} else {
 		bio_for_each_segment_all(bvec, bio, i) {
 			struct page *page = bvec->bv_page;
 
-			if (dio->rw == READ && !PageCompound(page))
+			if (dio->rw == READ && !PageCompound(page) &&
+					dio->should_dirty)
 				set_page_dirty_lock(page);
 			page_cache_release(page);
 		}
@@ -1219,6 +1221,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 	spin_lock_init(&dio->bio_lock);
 	dio->refcount = 1;
 
+	dio->should_dirty = (iter->type == ITER_IOVEC);
 	sdio.iter = iter;
 	sdio.final_block_in_request =
 		(offset + iov_iter_count(iter)) >> blkbits;
-- 
cgit v1.1


From d4eb6dee471250661a5183a7336b18c85990e26d Mon Sep 17 00:00:00 2001
From: Jean Delvare <jdelvare@suse.de>
Date: Thu, 24 Sep 2015 12:38:22 +0200
Subject: ext4: Update EXT4_USE_FOR_EXT2 description

Configuration option EXT4_USE_FOR_EXT2 has no effect on ext3 support.
Support for ext3 is always included now.

Signed-off-by: Jean Delvare <jdelvare@suse.de>
Fixes: c290ea01ab ("fs: Remove ext3 filesystem driver")
Cc: Jan Kara <jack@suse.cz>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Signed-off-by: Jan Kara <jack@suse.com>
---
 fs/ext4/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 47728da..b46e9fc 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -63,7 +63,7 @@ config EXT4_FS
 	  If unsure, say N.
 
 config EXT4_USE_FOR_EXT2
-	bool "Use ext4 for ext2/ext3 file systems"
+	bool "Use ext4 for ext2 file systems"
 	depends on EXT4_FS
 	depends on EXT2_FS=n
 	default y
-- 
cgit v1.1


From ff9f84b7d79ddccab4c293c9d3e289f95ae594f7 Mon Sep 17 00:00:00 2001
From: Steve French <smfrench@gmail.com>
Date: Sat, 26 Sep 2015 09:48:58 -0500
Subject: [SMB3] Missing null tcon check

Pointed out by Dan Carpenter via smatch code analysis tool

CC: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Steve French <steve.french@primarydata.com>
---
 fs/cifs/smb2pdu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index ce83e2e..597a417 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -922,7 +922,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
 	if (tcon && tcon->bad_network_name)
 		return -ENOENT;
 
-	if ((tcon->seal) &&
+	if ((tcon && tcon->seal) &&
 	    ((ses->server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION) == 0)) {
 		cifs_dbg(VFS, "encryption requested but no server support");
 		return -EOPNOTSUPP;
-- 
cgit v1.1


From 1bc81eeeba99ed20b570423c311826811e162e6d Mon Sep 17 00:00:00 2001
From: Yaowei Bai <bywxiaobai@163.com>
Date: Mon, 28 Sep 2015 22:15:38 +0800
Subject: jffs2: remove unnecessary new_valid_dev check

As new_valid_dev always returns 1, so !new_valid_dev check is not
needed, remove it.

Signed-off-by: Yaowei Bai <bywxiaobai@163.com>
Signed-off-by: Brian Norris <computersforpeace@gmail.com>
---
 fs/jffs2/dir.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 8118002..d211b8e 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -621,9 +621,6 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, umode_t mode
 	uint32_t alloclen;
 	int ret;
 
-	if (!new_valid_dev(rdev))
-		return -EINVAL;
-
 	ri = jffs2_alloc_raw_inode();
 	if (!ri)
 		return -ENOMEM;
-- 
cgit v1.1


From 16c863bbf4dabc1ff67d5c67d2d6e614211528d4 Mon Sep 17 00:00:00 2001
From: fangwei <fangwei1@huawei.com>
Date: Tue, 7 Jul 2015 10:54:26 +0800
Subject: jffs2: remove unneeded kfree

c->oobbuf hasn't been kmalloced in jffs2_dataflash_setup, so
there is no need to free it.

Signed-off-by: Wei Fang <fangwei1@huawei.com>
Signed-off-by: Brian Norris <computersforpeace@gmail.com>
---
 fs/jffs2/wbuf.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 09ed551..63f31c0 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -1274,7 +1274,6 @@ int jffs2_dataflash_setup(struct jffs2_sb_info *c) {
 #ifdef CONFIG_JFFS2_FS_WBUF_VERIFY
 	c->wbuf_verify = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
 	if (!c->wbuf_verify) {
-		kfree(c->oobbuf);
 		kfree(c->wbuf);
 		return -ENOMEM;
 	}
-- 
cgit v1.1


From cf6f54e3f133229f02a90c04fe0ff9dd9d3264b4 Mon Sep 17 00:00:00 2001
From: Richard Weinberger <richard@nod.at>
Date: Wed, 8 Jul 2015 11:46:36 +0200
Subject: UBIFS: Kill unneeded locking in ubifs_init_security

Fixes the following lockdep splat:
[    1.244527] =============================================
[    1.245193] [ INFO: possible recursive locking detected ]
[    1.245193] 4.2.0-rc1+ #37 Not tainted
[    1.245193] ---------------------------------------------
[    1.245193] cp/742 is trying to acquire lock:
[    1.245193]  (&sb->s_type->i_mutex_key#9){+.+.+.}, at: [<ffffffff812b3f69>] ubifs_init_security+0x29/0xb0
[    1.245193]
[    1.245193] but task is already holding lock:
[    1.245193]  (&sb->s_type->i_mutex_key#9){+.+.+.}, at: [<ffffffff81198e7f>] path_openat+0x3af/0x1280
[    1.245193]
[    1.245193] other info that might help us debug this:
[    1.245193]  Possible unsafe locking scenario:
[    1.245193]
[    1.245193]        CPU0
[    1.245193]        ----
[    1.245193]   lock(&sb->s_type->i_mutex_key#9);
[    1.245193]   lock(&sb->s_type->i_mutex_key#9);
[    1.245193]
[    1.245193]  *** DEADLOCK ***
[    1.245193]
[    1.245193]  May be due to missing lock nesting notation
[    1.245193]
[    1.245193] 2 locks held by cp/742:
[    1.245193]  #0:  (sb_writers#5){.+.+.+}, at: [<ffffffff811ad37f>] mnt_want_write+0x1f/0x50
[    1.245193]  #1:  (&sb->s_type->i_mutex_key#9){+.+.+.}, at: [<ffffffff81198e7f>] path_openat+0x3af/0x1280
[    1.245193]
[    1.245193] stack backtrace:
[    1.245193] CPU: 2 PID: 742 Comm: cp Not tainted 4.2.0-rc1+ #37
[    1.245193] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5-0-ge51488c-20140816_022509-build35 04/01/2014
[    1.245193]  ffffffff8252d530 ffff88007b023a38 ffffffff814f6f49 ffffffff810b56c5
[    1.245193]  ffff88007c30cc80 ffff88007b023af8 ffffffff810a150d ffff88007b023a68
[    1.245193]  000000008101302a ffff880000000000 00000008f447e23f ffffffff8252d500
[    1.245193] Call Trace:
[    1.245193]  [<ffffffff814f6f49>] dump_stack+0x4c/0x65
[    1.245193]  [<ffffffff810b56c5>] ? console_unlock+0x1c5/0x510
[    1.245193]  [<ffffffff810a150d>] __lock_acquire+0x1a6d/0x1ea0
[    1.245193]  [<ffffffff8109fa78>] ? __lock_is_held+0x58/0x80
[    1.245193]  [<ffffffff810a1a93>] lock_acquire+0xd3/0x270
[    1.245193]  [<ffffffff812b3f69>] ? ubifs_init_security+0x29/0xb0
[    1.245193]  [<ffffffff814fc83b>] mutex_lock_nested+0x6b/0x3a0
[    1.245193]  [<ffffffff812b3f69>] ? ubifs_init_security+0x29/0xb0
[    1.245193]  [<ffffffff812b3f69>] ? ubifs_init_security+0x29/0xb0
[    1.245193]  [<ffffffff812b3f69>] ubifs_init_security+0x29/0xb0
[    1.245193]  [<ffffffff8128e286>] ubifs_create+0xa6/0x1f0
[    1.245193]  [<ffffffff81198e7f>] ? path_openat+0x3af/0x1280
[    1.245193]  [<ffffffff81195d15>] vfs_create+0x95/0xc0
[    1.245193]  [<ffffffff8119929c>] path_openat+0x7cc/0x1280
[    1.245193]  [<ffffffff8109ffe3>] ? __lock_acquire+0x543/0x1ea0
[    1.245193]  [<ffffffff81088f20>] ? sched_clock_cpu+0x90/0xc0
[    1.245193]  [<ffffffff81088c00>] ? calc_global_load_tick+0x60/0x90
[    1.245193]  [<ffffffff81088f20>] ? sched_clock_cpu+0x90/0xc0
[    1.245193]  [<ffffffff811a9cef>] ? __alloc_fd+0xaf/0x180
[    1.245193]  [<ffffffff8119ac55>] do_filp_open+0x75/0xd0
[    1.245193]  [<ffffffff814ffd86>] ? _raw_spin_unlock+0x26/0x40
[    1.245193]  [<ffffffff811a9cef>] ? __alloc_fd+0xaf/0x180
[    1.245193]  [<ffffffff81189bd9>] do_sys_open+0x129/0x200
[    1.245193]  [<ffffffff81189cc9>] SyS_open+0x19/0x20
[    1.245193]  [<ffffffff81500717>] entry_SYSCALL_64_fastpath+0x12/0x6f

While the lockdep splat is a false positive, becuase path_openat holds i_mutex
of the parent directory and ubifs_init_security() tries to acquire i_mutex
of a new inode, it reveals that taking i_mutex in ubifs_init_security() is
in vain because it is only being called in the inode allocation path
and therefore nobody else can see the inode yet.

Cc: stable@vger.kernel.org # 3.20-
Reported-and-tested-by: Boris Brezillon <boris.brezillon@free-electrons.com>
Reviewed-and-tested-by: Dongsheng Yang <yangds.fnst@cn.fujitsu.com>
Signed-off-by: Richard Weinberger <richard@nod.at>
Signed-off-by: dedekind1@gmail.com
---
 fs/ubifs/xattr.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 96f3448..fd65b3f 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -652,11 +652,8 @@ int ubifs_init_security(struct inode *dentry, struct inode *inode,
 {
 	int err;
 
-	mutex_lock(&inode->i_mutex);
 	err = security_inode_init_security(inode, dentry, qstr,
 					   &init_xattrs, 0);
-	mutex_unlock(&inode->i_mutex);
-
 	if (err) {
 		struct ubifs_info *c = dentry->i_sb->s_fs_info;
 		ubifs_err(c, "cannot initialize security for inode %lu, error %d",
-- 
cgit v1.1


From a1c83681d52704c766d576bea2d7f2f99e8dde23 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Wed, 12 Aug 2015 15:59:44 +0530
Subject: fs: Drop unlikely before IS_ERR(_OR_NULL)

IS_ERR(_OR_NULL) already contain an 'unlikely' compiler flag and there
is no need to do that again from its callers. Drop it.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Jeff Layton <jlayton@poochiereds.net>
Reviewed-by: David Howells <dhowells@redhat.com>
Reviewed-by: Steve French <smfrench@gmail.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 fs/cifs/readdir.c            | 2 +-
 fs/ecryptfs/inode.c          | 2 +-
 fs/ext4/extents.c            | 6 +++---
 fs/ext4/namei.c              | 2 +-
 fs/namei.c                   | 4 ++--
 fs/ncpfs/dir.c               | 2 +-
 fs/nfs/objlayout/objio_osd.c | 2 +-
 fs/proc/proc_sysctl.c        | 2 +-
 8 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index b1eede3..0557c45 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -84,7 +84,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,
 	cifs_dbg(FYI, "%s: for %s\n", __func__, name->name);
 
 	dentry = d_hash_and_lookup(parent, name);
-	if (unlikely(IS_ERR(dentry)))
+	if (IS_ERR(dentry))
 		return;
 
 	if (dentry) {
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 3c4db11..e2e47ba 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -270,7 +270,7 @@ ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry,
 
 	ecryptfs_inode = ecryptfs_do_create(directory_inode, ecryptfs_dentry,
 					    mode);
-	if (unlikely(IS_ERR(ecryptfs_inode))) {
+	if (IS_ERR(ecryptfs_inode)) {
 		ecryptfs_printk(KERN_WARNING, "Failed to create file in"
 				"lower filesystem\n");
 		rc = PTR_ERR(ecryptfs_inode);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 2553aa8..799f017 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -899,7 +899,7 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block,
 
 		bh = read_extent_tree_block(inode, path[ppos].p_block, --i,
 					    flags);
-		if (unlikely(IS_ERR(bh))) {
+		if (IS_ERR(bh)) {
 			ret = PTR_ERR(bh);
 			goto err;
 		}
@@ -5792,7 +5792,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
 		int split = 0;
 
 		path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE);
-		if (unlikely(IS_ERR(path1))) {
+		if (IS_ERR(path1)) {
 			*erp = PTR_ERR(path1);
 			path1 = NULL;
 		finish:
@@ -5800,7 +5800,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
 			goto repeat;
 		}
 		path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE);
-		if (unlikely(IS_ERR(path2))) {
+		if (IS_ERR(path2)) {
 			*erp = PTR_ERR(path2);
 			path2 = NULL;
 			goto finish;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 011dcfb..0554e4b 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1429,7 +1429,7 @@ restart:
 				}
 				num++;
 				bh = ext4_getblk(NULL, dir, b++, 0);
-				if (unlikely(IS_ERR(bh))) {
+				if (IS_ERR(bh)) {
 					if (ra_max == 0) {
 						ret = bh;
 						goto cleanup_and_exit;
diff --git a/fs/namei.c b/fs/namei.c
index fbbcf09..d3f8406 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1942,7 +1942,7 @@ OK:
 		if (err) {
 			const char *s = get_link(nd);
 
-			if (unlikely(IS_ERR(s)))
+			if (IS_ERR(s))
 				return PTR_ERR(s);
 			err = 0;
 			if (unlikely(!s)) {
@@ -3356,7 +3356,7 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
 		return ERR_PTR(-ELOOP);
 
 	filename = getname_kernel(name);
-	if (unlikely(IS_ERR(filename)))
+	if (IS_ERR(filename))
 		return ERR_CAST(filename);
 
 	set_nameidata(&nd, -1, filename);
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 93575e9..356816e 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -597,7 +597,7 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx,
 	qname.name = __name;
 
 	newdent = d_hash_and_lookup(dentry, &qname);
-	if (unlikely(IS_ERR(newdent)))
+	if (IS_ERR(newdent))
 		goto end_advance;
 	if (!newdent) {
 		newdent = d_alloc(dentry, &qname);
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 5aaed36..5c0c6b5 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -124,7 +124,7 @@ objio_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
 
 retry_lookup:
 	od = osduld_info_lookup(&odi);
-	if (unlikely(IS_ERR(od))) {
+	if (IS_ERR(od)) {
 		err = PTR_ERR(od);
 		dprintk("%s: osduld_info_lookup => %d\n", __func__, err);
 		if (err == -ENODEV && retry_flag) {
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index fdda62e..fe5b6e6 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -948,7 +948,7 @@ static struct ctl_dir *get_subdir(struct ctl_dir *dir,
 found:
 	subdir->header.nreg++;
 failed:
-	if (unlikely(IS_ERR(subdir))) {
+	if (IS_ERR(subdir)) {
 		pr_err("sysctl could not get directory: ");
 		sysctl_print_dir(dir);
 		pr_cont("/%*.*s %ld\n",
-- 
cgit v1.1


From 636db7a96cbd95f7842246a3f5f842ba791cd393 Mon Sep 17 00:00:00 2001
From: Ulf Magnusson <ulfalizer@gmail.com>
Date: Fri, 4 Sep 2015 01:54:22 +0200
Subject: debugfs: document that debugfs_remove*() accepts NULL and error
 values

According to commit a59d6293e537 ("debugfs: change parameter check in
debugfs_remove() functions"), this is meant to make cleanup easier for
callers. In that case it ought to be documented.

Signed-off-by: Ulf Magnusson <ulfalizer@gmail.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 fs/debugfs/inode.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index c711be8..5d8f35f 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -533,7 +533,8 @@ static int __debugfs_remove(struct dentry *dentry, struct dentry *parent)
 /**
  * debugfs_remove - removes a file or directory from the debugfs filesystem
  * @dentry: a pointer to a the dentry of the file or directory to be
- *          removed.
+ *          removed.  If this parameter is NULL or an error value, nothing
+ *          will be done.
  *
  * This function removes a file or directory in debugfs that was previously
  * created with a call to another debugfs function (like
@@ -565,7 +566,8 @@ EXPORT_SYMBOL_GPL(debugfs_remove);
 
 /**
  * debugfs_remove_recursive - recursively removes a directory
- * @dentry: a pointer to a the dentry of the directory to be removed.
+ * @dentry: a pointer to a the dentry of the directory to be removed.  If this
+ *          parameter is NULL or an error value, nothing will be done.
  *
  * This function recursively removes a directory tree in debugfs that
  * was previously created with a call to another debugfs function
-- 
cgit v1.1


From 96f3136e51580ed68a2952341c8b9e2d7f853472 Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Fri, 14 Aug 2015 18:32:46 +0800
Subject: Btrfs: rename btrfs_sysfs_add_one to btrfs_sysfs_add_mounted

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h   | 2 +-
 fs/btrfs/disk-io.c | 2 +-
 fs/btrfs/sysfs.c   | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 938efe3..afce306 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -4004,7 +4004,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 /* sysfs.c */
 int btrfs_init_sysfs(void);
 void btrfs_exit_sysfs(void);
-int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info);
+int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info);
 void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info);
 
 /* xattr.c */
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 295795a..4622d38 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2933,7 +2933,7 @@ retry_root_backup:
 		goto fail_fsdev_sysfs;
 	}
 
-	ret = btrfs_sysfs_add_one(fs_info);
+	ret = btrfs_sysfs_add_mounted(fs_info);
 	if (ret) {
 		pr_err("BTRFS: failed to init sysfs interface: %d\n", ret);
 		goto fail_fsdev_sysfs;
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 603b0cc..cabf840 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -736,7 +736,7 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
 	return error;
 }
 
-int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
+int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)
 {
 	int error;
 	struct btrfs_fs_devices *fs_devs = fs_info->fs_devices;
-- 
cgit v1.1


From 6618a59bfc0a0490f2a51df37c67878e23285670 Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Fri, 14 Aug 2015 18:32:47 +0800
Subject: Btrfs: rename btrfs_sysfs_remove_one to btrfs_sysfs_remove_mounted

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h   | 2 +-
 fs/btrfs/disk-io.c | 4 ++--
 fs/btrfs/sysfs.c   | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index afce306..4484063 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -4005,7 +4005,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 int btrfs_init_sysfs(void);
 void btrfs_exit_sysfs(void);
 int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info);
-void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info);
+void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info);
 
 /* xattr.c */
 ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4622d38..7673473 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3113,7 +3113,7 @@ fail_cleaner:
 	filemap_write_and_wait(fs_info->btree_inode->i_mapping);
 
 fail_sysfs:
-	btrfs_sysfs_remove_one(fs_info);
+	btrfs_sysfs_remove_mounted(fs_info);
 
 fail_fsdev_sysfs:
 	btrfs_sysfs_remove_fsid(fs_info->fs_devices);
@@ -3788,7 +3788,7 @@ void close_ctree(struct btrfs_root *root)
 		       percpu_counter_sum(&fs_info->delalloc_bytes));
 	}
 
-	btrfs_sysfs_remove_one(fs_info);
+	btrfs_sysfs_remove_mounted(fs_info);
 	btrfs_sysfs_remove_fsid(fs_info->fs_devices);
 
 	btrfs_free_fs_roots(fs_info);
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index cabf840..095a302 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -545,7 +545,7 @@ void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
 	}
 }
 
-void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
+void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info)
 {
 	btrfs_reset_fs_info_ptr(fs_info);
 
@@ -776,7 +776,7 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)
 
 	return 0;
 failure:
-	btrfs_sysfs_remove_one(fs_info);
+	btrfs_sysfs_remove_mounted(fs_info);
 	return error;
 }
 
-- 
cgit v1.1


From e3bd6973bcf134a56786a8bd248d1740249352ec Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Fri, 14 Aug 2015 18:32:48 +0800
Subject: Btrfs: rename btrfs_kobj_add_device to btrfs_sysfs_add_device_link

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/dev-replace.c | 2 +-
 fs/btrfs/sysfs.c       | 4 ++--
 fs/btrfs/sysfs.h       | 2 +-
 fs/btrfs/volumes.c     | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index e54dd59..200c972 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -375,7 +375,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
 	WARN_ON(!tgt_device);
 	dev_replace->tgtdev = tgt_device;
 
-	ret = btrfs_kobj_add_device(tgt_device->fs_devices, tgt_device);
+	ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device);
 	if (ret)
 		btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret);
 
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 095a302..df67f6b 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -683,7 +683,7 @@ int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs)
 	return 0;
 }
 
-int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices,
+int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices,
 				struct btrfs_device *one_device)
 {
 	int error = 0;
@@ -744,7 +744,7 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)
 
 	btrfs_set_fs_info_ptr(fs_info);
 
-	error = btrfs_kobj_add_device(fs_devs, NULL);
+	error = btrfs_sysfs_add_device_link(fs_devs, NULL);
 	if (error)
 		return error;
 
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index 6392527..6529680 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -82,7 +82,7 @@ char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags);
 extern const char * const btrfs_feature_set_names[3];
 extern struct kobj_type space_info_ktype;
 extern struct kobj_type btrfs_raid_ktype;
-int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices,
+int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices,
 		struct btrfs_device *one_device);
 int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices,
                 struct btrfs_device *one_device);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 6fc73586..d22029a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2309,7 +2309,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 				    tmp + 1);
 
 	/* add sysfs device entry */
-	btrfs_kobj_add_device(root->fs_info->fs_devices, device);
+	btrfs_sysfs_add_device_link(root->fs_info->fs_devices, device);
 
 	/*
 	 * we've got more storage, clear any full flags on the space
-- 
cgit v1.1


From 325760404820e070bb20be0ce57e8d684d69a2ac Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Fri, 14 Aug 2015 18:32:49 +0800
Subject: Btrfs: rename btrfs_kobj_rm_device to btrfs_sysfs_rm_device_link

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/dev-replace.c | 2 +-
 fs/btrfs/sysfs.c       | 6 +++---
 fs/btrfs/sysfs.h       | 2 +-
 fs/btrfs/volumes.c     | 6 +++---
 4 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 200c972..7aed8a8 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -586,7 +586,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 	mutex_unlock(&uuid_mutex);
 
 	/* replace the sysfs entry */
-	btrfs_kobj_rm_device(fs_info->fs_devices, src_device);
+	btrfs_sysfs_rm_device_link(fs_info->fs_devices, src_device);
 	btrfs_rm_dev_replace_free_srcdev(fs_info, src_device);
 
 	/* write back the superblocks */
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index df67f6b..52319d1 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -557,7 +557,7 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info)
 	addrm_unknown_feature_attrs(fs_info, false);
 	sysfs_remove_group(&fs_info->fs_devices->super_kobj, &btrfs_feature_attr_group);
 	sysfs_remove_files(&fs_info->fs_devices->super_kobj, btrfs_attrs);
-	btrfs_kobj_rm_device(fs_info->fs_devices, NULL);
+	btrfs_sysfs_rm_device_link(fs_info->fs_devices, NULL);
 }
 
 const char * const btrfs_feature_set_names[3] = {
@@ -637,7 +637,7 @@ static void init_feature_attrs(void)
 
 /* when one_device is NULL, it removes all device links */
 
-int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices,
+int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices,
 		struct btrfs_device *one_device)
 {
 	struct hd_struct *disk;
@@ -750,7 +750,7 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)
 
 	error = sysfs_create_files(super_kobj, btrfs_attrs);
 	if (error) {
-		btrfs_kobj_rm_device(fs_devs, NULL);
+		btrfs_sysfs_rm_device_link(fs_devs, NULL);
 		return error;
 	}
 
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index 6529680..9c09522 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -84,7 +84,7 @@ extern struct kobj_type space_info_ktype;
 extern struct kobj_type btrfs_raid_ktype;
 int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices,
 		struct btrfs_device *one_device);
-int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices,
+int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices,
                 struct btrfs_device *one_device);
 int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
 				struct kobject *parent);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d22029a..003d3dd 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1801,7 +1801,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	if (device->bdev) {
 		device->fs_devices->open_devices--;
 		/* remove sysfs entry */
-		btrfs_kobj_rm_device(root->fs_info->fs_devices, device);
+		btrfs_sysfs_rm_device_link(root->fs_info->fs_devices, device);
 	}
 
 	call_rcu(&device->rcu, free_device);
@@ -1971,7 +1971,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
 	WARN_ON(!tgtdev);
 	mutex_lock(&fs_info->fs_devices->device_list_mutex);
 
-	btrfs_kobj_rm_device(fs_info->fs_devices, tgtdev);
+	btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev);
 
 	if (tgtdev->bdev) {
 		btrfs_scratch_superblock(tgtdev);
@@ -2388,7 +2388,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 error_trans:
 	btrfs_end_transaction(trans, root);
 	rcu_string_free(device->name);
-	btrfs_kobj_rm_device(root->fs_info->fs_devices, device);
+	btrfs_sysfs_rm_device_link(root->fs_info->fs_devices, device);
 	kfree(device);
 error:
 	blkdev_put(bdev, FMODE_EXCL);
-- 
cgit v1.1


From c1b7e474595b1b3a4463450cc22aa18b926093a3 Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Fri, 14 Aug 2015 18:32:50 +0800
Subject: Btrfs: rename super_kobj to fsid_kobj

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/sysfs.c   | 36 ++++++++++++++++++------------------
 fs/btrfs/volumes.c |  2 +-
 fs/btrfs/volumes.h |  2 +-
 3 files changed, 20 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 52319d1..e0ac859 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -437,24 +437,24 @@ static const struct attribute *btrfs_attrs[] = {
 	NULL,
 };
 
-static void btrfs_release_super_kobj(struct kobject *kobj)
+static void btrfs_release_fsid_kobj(struct kobject *kobj)
 {
 	struct btrfs_fs_devices *fs_devs = to_fs_devs(kobj);
 
-	memset(&fs_devs->super_kobj, 0, sizeof(struct kobject));
+	memset(&fs_devs->fsid_kobj, 0, sizeof(struct kobject));
 	complete(&fs_devs->kobj_unregister);
 }
 
 static struct kobj_type btrfs_ktype = {
 	.sysfs_ops	= &kobj_sysfs_ops,
-	.release	= btrfs_release_super_kobj,
+	.release	= btrfs_release_fsid_kobj,
 };
 
 static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj)
 {
 	if (kobj->ktype != &btrfs_ktype)
 		return NULL;
-	return container_of(kobj, struct btrfs_fs_devices, super_kobj);
+	return container_of(kobj, struct btrfs_fs_devices, fsid_kobj);
 }
 
 static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj)
@@ -502,12 +502,12 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add)
 			attrs[0] = &fa->kobj_attr.attr;
 			if (add) {
 				int ret;
-				ret = sysfs_merge_group(&fs_info->fs_devices->super_kobj,
+				ret = sysfs_merge_group(&fs_info->fs_devices->fsid_kobj,
 							&agroup);
 				if (ret)
 					return ret;
 			} else
-				sysfs_unmerge_group(&fs_info->fs_devices->super_kobj,
+				sysfs_unmerge_group(&fs_info->fs_devices->fsid_kobj,
 						    &agroup);
 		}
 
@@ -523,9 +523,9 @@ static void __btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
 		fs_devs->device_dir_kobj = NULL;
 	}
 
-	if (fs_devs->super_kobj.state_initialized) {
-		kobject_del(&fs_devs->super_kobj);
-		kobject_put(&fs_devs->super_kobj);
+	if (fs_devs->fsid_kobj.state_initialized) {
+		kobject_del(&fs_devs->fsid_kobj);
+		kobject_put(&fs_devs->fsid_kobj);
 		wait_for_completion(&fs_devs->kobj_unregister);
 	}
 }
@@ -555,8 +555,8 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info)
 		kobject_put(fs_info->space_info_kobj);
 	}
 	addrm_unknown_feature_attrs(fs_info, false);
-	sysfs_remove_group(&fs_info->fs_devices->super_kobj, &btrfs_feature_attr_group);
-	sysfs_remove_files(&fs_info->fs_devices->super_kobj, btrfs_attrs);
+	sysfs_remove_group(&fs_info->fs_devices->fsid_kobj, &btrfs_feature_attr_group);
+	sysfs_remove_files(&fs_info->fs_devices->fsid_kobj, btrfs_attrs);
 	btrfs_sysfs_rm_device_link(fs_info->fs_devices, NULL);
 }
 
@@ -675,7 +675,7 @@ int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs)
 {
 	if (!fs_devs->device_dir_kobj)
 		fs_devs->device_dir_kobj = kobject_create_and_add("devices",
-						&fs_devs->super_kobj);
+						&fs_devs->fsid_kobj);
 
 	if (!fs_devs->device_dir_kobj)
 		return -ENOMEM;
@@ -730,8 +730,8 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
 	int error;
 
 	init_completion(&fs_devs->kobj_unregister);
-	fs_devs->super_kobj.kset = btrfs_kset;
-	error = kobject_init_and_add(&fs_devs->super_kobj,
+	fs_devs->fsid_kobj.kset = btrfs_kset;
+	error = kobject_init_and_add(&fs_devs->fsid_kobj,
 				&btrfs_ktype, parent, "%pU", fs_devs->fsid);
 	return error;
 }
@@ -740,7 +740,7 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)
 {
 	int error;
 	struct btrfs_fs_devices *fs_devs = fs_info->fs_devices;
-	struct kobject *super_kobj = &fs_devs->super_kobj;
+	struct kobject *fsid_kobj = &fs_devs->fsid_kobj;
 
 	btrfs_set_fs_info_ptr(fs_info);
 
@@ -748,13 +748,13 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)
 	if (error)
 		return error;
 
-	error = sysfs_create_files(super_kobj, btrfs_attrs);
+	error = sysfs_create_files(fsid_kobj, btrfs_attrs);
 	if (error) {
 		btrfs_sysfs_rm_device_link(fs_devs, NULL);
 		return error;
 	}
 
-	error = sysfs_create_group(super_kobj,
+	error = sysfs_create_group(fsid_kobj,
 				   &btrfs_feature_attr_group);
 	if (error)
 		goto failure;
@@ -764,7 +764,7 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)
 		goto failure;
 
 	fs_info->space_info_kobj = kobject_create_and_add("allocation",
-						  super_kobj);
+						  fsid_kobj);
 	if (!fs_info->space_info_kobj) {
 		error = -ENOMEM;
 		goto failure;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 003d3dd..6438792 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2350,7 +2350,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 		 */
 		snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
 						root->fs_info->fsid);
-		if (kobject_rename(&root->fs_info->fs_devices->super_kobj,
+		if (kobject_rename(&root->fs_info->fs_devices->fsid_kobj,
 								fsid_buf))
 			pr_warn("BTRFS: sysfs: failed to create fsid for sprout\n");
 	}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 2ca784a..413a70c 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -256,7 +256,7 @@ struct btrfs_fs_devices {
 
 	struct btrfs_fs_info *fs_info;
 	/* sysfs kobjects */
-	struct kobject super_kobj;
+	struct kobject fsid_kobj;
 	struct kobject *device_dir_kobj;
 	struct completion kobj_unregister;
 };
-- 
cgit v1.1


From 92fc03fbdcbe2523be3f7e6b8e95fee9563a10d2 Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Fri, 14 Aug 2015 18:32:51 +0800
Subject: Btrfs: SB read failure should return EIO for __bread failure

This will return EIO when __bread() fails to read SB,
instead of EINVAL.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c | 18 +++++++++++++++---
 fs/btrfs/volumes.c |  8 ++++----
 2 files changed, 19 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7673473..357d9f9 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2653,8 +2653,8 @@ int open_ctree(struct super_block *sb,
 	 * Read super block and check the signature bytes only
 	 */
 	bh = btrfs_read_dev_super(fs_devices->latest_bdev);
-	if (!bh) {
-		err = -EINVAL;
+	if (IS_ERR(bh)) {
+		err = PTR_ERR(bh);
 		goto fail_alloc;
 	}
 
@@ -3196,6 +3196,7 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
 	int i;
 	u64 transid = 0;
 	u64 bytenr;
+	int ret = -EINVAL;
 
 	/* we would like to check all the supers, but that would make
 	 * a btrfs mount succeed after a mkfs from a different FS.
@@ -3209,13 +3210,20 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
 			break;
 		bh = __bread(bdev, bytenr / 4096,
 					BTRFS_SUPER_INFO_SIZE);
-		if (!bh)
+		/*
+		 * If we fail to read from the underlying devices, as of now
+		 * the best option we have is to mark it EIO.
+		 */
+		if (!bh) {
+			ret = -EIO;
 			continue;
+		}
 
 		super = (struct btrfs_super_block *)bh->b_data;
 		if (btrfs_super_bytenr(super) != bytenr ||
 		    btrfs_super_magic(super) != BTRFS_MAGIC) {
 			brelse(bh);
+			ret = -EINVAL;
 			continue;
 		}
 
@@ -3227,6 +3235,10 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
 			brelse(bh);
 		}
 	}
+
+	if (!latest)
+		return ERR_PTR(ret);
+
 	return latest;
 }
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 6438792..e8a6182 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -211,8 +211,8 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
 	}
 	invalidate_bdev(*bdev);
 	*bh = btrfs_read_dev_super(*bdev);
-	if (!*bh) {
-		ret = -EINVAL;
+	if (IS_ERR(*bh)) {
+		ret = PTR_ERR(*bh);
 		blkdev_put(*bdev, flags);
 		goto error;
 	}
@@ -6746,8 +6746,8 @@ int btrfs_scratch_superblock(struct btrfs_device *device)
 	struct btrfs_super_block *disk_super;
 
 	bh = btrfs_read_dev_super(device->bdev);
-	if (!bh)
-		return -EINVAL;
+	if (IS_ERR(bh))
+		return PTR_ERR(bh);
 	disk_super = (struct btrfs_super_block *)bh->b_data;
 
 	memset(&disk_super->magic, 0, sizeof(disk_super->magic));
-- 
cgit v1.1


From 57d816a15ba2c2690c57635134bc01cf4da4623c Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Fri, 14 Aug 2015 18:32:52 +0800
Subject: Btrfs: __btrfs_std_error() logic should be consistent w/out
 CONFIG_PRINTK defined

error handling logic behaves differently with or without
CONFIG_PRINTK defined, since there are two copies of the same
function which a bit of different logic

One, when CONFIG_PRINTK is defined, code is

__btrfs_std_error(..)
{
::
       save_error_info(fs_info);
       if (sb->s_flags & MS_BORN)
               btrfs_handle_error(fs_info);
}

and two when CONFIG_PRINTK is not defined, the code is

__btrfs_std_error(..)
{
::
       if (sb->s_flags & MS_BORN) {
               save_error_info(fs_info);
               btrfs_handle_error(fs_info);
        }
}

I doubt if this was intentional ? and appear to have caused since
we maintain two copies of the same function and they got diverged
with commits.

Now to decide which logic is correct reviewed changes as below,

 533574c6bc30cf526cc1c41bde050c854a945efb
Commit added two copies of this function

 cf79ffb5b79e8a2b587fbf218809e691bb396c98
Commit made change to only one copy of the function and to the
copy when CONFIG_PRINTK is defined.

To fix this, instead of maintaining two copies of same function
approach, maintain single function, and just put the extra
portion of the code under CONFIG_PRINTK define.

This patch just does that. And keeps code of with CONFIG_PRINTK
defined.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/super.c | 27 +++++----------------------
 1 file changed, 5 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 11d1eab..b23d49d 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -130,7 +130,6 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
 	}
 }
 
-#ifdef CONFIG_PRINTK
 /*
  * __btrfs_std_error decodes expected errors from the caller and
  * invokes the approciate error response.
@@ -140,7 +139,9 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
 		       unsigned int line, int errno, const char *fmt, ...)
 {
 	struct super_block *sb = fs_info->sb;
+#ifdef CONFIG_PRINTK
 	const char *errstr;
+#endif
 
 	/*
 	 * Special case: if the error is EROFS, and we're already
@@ -149,6 +150,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
 	if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
   		return;
 
+#ifdef CONFIG_PRINTK
 	errstr = btrfs_decode_error(errno);
 	if (fmt) {
 		struct va_format vaf;
@@ -166,6 +168,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
 		printk(KERN_CRIT "BTRFS: error (device %s) in %s:%d: errno=%d %s\n",
 			sb->s_id, function, line, errno, errstr);
 	}
+#endif
 
 	/* Don't go through full error handling during mount */
 	save_error_info(fs_info);
@@ -173,6 +176,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
 		btrfs_handle_error(fs_info);
 }
 
+#ifdef CONFIG_PRINTK
 static const char * const logtypes[] = {
 	"emergency",
 	"alert",
@@ -212,27 +216,6 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
 
 	va_end(args);
 }
-
-#else
-
-void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
-		       unsigned int line, int errno, const char *fmt, ...)
-{
-	struct super_block *sb = fs_info->sb;
-
-	/*
-	 * Special case: if the error is EROFS, and we're already
-	 * under MS_RDONLY, then it is safe here.
-	 */
-	if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
-		return;
-
-	/* Don't go through full error handling during mount */
-	if (sb->s_flags & MS_BORN) {
-		save_error_info(fs_info);
-		btrfs_handle_error(fs_info);
-	}
-}
 #endif
 
 /*
-- 
cgit v1.1


From a4553fefb59cb0336f543fa567170b47e90142a9 Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Fri, 25 Sep 2015 14:43:01 +0800
Subject: Btrfs: consolidate btrfs_error() to btrfs_std_error()

btrfs_error() and btrfs_std_error() does the same thing
and calls _btrfs_std_error(), so consolidate them together.
And the main motivation is that btrfs_error() is closely
named with btrfs_err(), one handles error action the other
is to log the error, so don't closely name them.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Suggested-by: David Sterba <dsterba@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.c       |  6 +++---
 fs/btrfs/ctree.h       |  9 +--------
 fs/btrfs/disk-io.c     |  8 ++++----
 fs/btrfs/extent-tree.c |  2 +-
 fs/btrfs/inode-item.c  |  2 +-
 fs/btrfs/ioctl.c       |  2 +-
 fs/btrfs/relocation.c  |  2 +-
 fs/btrfs/root-tree.c   |  4 ++--
 fs/btrfs/transaction.c |  2 +-
 fs/btrfs/tree-log.c    |  8 ++++----
 fs/btrfs/volumes.c     | 14 +++++++-------
 11 files changed, 26 insertions(+), 33 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 5f745ea..1063315 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1011,7 +1011,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
 			return ret;
 		if (refs == 0) {
 			ret = -EROFS;
-			btrfs_std_error(root->fs_info, ret);
+			btrfs_std_error(root->fs_info, ret, NULL);
 			return ret;
 		}
 	} else {
@@ -1927,7 +1927,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		child = read_node_slot(root, mid, 0);
 		if (!child) {
 			ret = -EROFS;
-			btrfs_std_error(root->fs_info, ret);
+			btrfs_std_error(root->fs_info, ret, NULL);
 			goto enospc;
 		}
 
@@ -2030,7 +2030,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		 */
 		if (!left) {
 			ret = -EROFS;
-			btrfs_std_error(root->fs_info, ret);
+			btrfs_std_error(root->fs_info, ret, NULL);
 			goto enospc;
 		}
 		wret = balance_node_right(trans, root, mid, left);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4484063..a86051e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -4127,14 +4127,7 @@ do {								\
 				  __LINE__, (errno));		\
 } while (0)
 
-#define btrfs_std_error(fs_info, errno)				\
-do {								\
-	if ((errno))						\
-		__btrfs_std_error((fs_info), __func__,		\
-				   __LINE__, (errno), NULL);	\
-} while (0)
-
-#define btrfs_error(fs_info, errno, fmt, args...)		\
+#define btrfs_std_error(fs_info, errno, fmt, args...)		\
 do {								\
 	__btrfs_std_error((fs_info), __func__, __LINE__,	\
 			  (errno), fmt, ##args);		\
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 357d9f9..0f8e33f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2377,7 +2377,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
 	/* returns with log_tree_root freed on success */
 	ret = btrfs_recover_log_trees(log_tree_root);
 	if (ret) {
-		btrfs_error(tree_root->fs_info, ret,
+		btrfs_std_error(tree_root->fs_info, ret,
 			    "Failed to recover log tree");
 		free_extent_buffer(log_tree_root->node);
 		kfree(log_tree_root);
@@ -3556,7 +3556,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
 		if (ret) {
 			mutex_unlock(
 				&root->fs_info->fs_devices->device_list_mutex);
-			btrfs_error(root->fs_info, ret,
+			btrfs_std_error(root->fs_info, ret,
 				    "errors while submitting device barriers.");
 			return ret;
 		}
@@ -3596,7 +3596,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
 		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
 
 		/* FUA is masked off if unsupported and can't be the reason */
-		btrfs_error(root->fs_info, -EIO,
+		btrfs_std_error(root->fs_info, -EIO,
 			    "%d errors while writing supers", total_errors);
 		return -EIO;
 	}
@@ -3614,7 +3614,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
 	}
 	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
 	if (total_errors > max_errors) {
-		btrfs_error(root->fs_info, -EIO,
+		btrfs_std_error(root->fs_info, -EIO,
 			    "%d errors while writing supers", total_errors);
 		return -EIO;
 	}
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9f96042..5069f1b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -8688,7 +8688,7 @@ out:
 	if (!for_reloc && root_dropped == false)
 		btrfs_add_dead_root(root);
 	if (err && err != -EAGAIN)
-		btrfs_std_error(root->fs_info, err);
+		btrfs_std_error(root->fs_info, err, NULL);
 	return err;
 }
 
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 265e03c..be4d22a 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -157,7 +157,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
 	 */
 	if (!btrfs_find_name_in_ext_backref(path, ref_objectid,
 					    name, name_len, &extref)) {
-		btrfs_std_error(root->fs_info, -ENOENT);
+		btrfs_std_error(root->fs_info, -ENOENT, NULL);
 		ret = -EROFS;
 		goto out;
 	}
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 0adf542..f704d1c 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4806,7 +4806,7 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
 	/* update qgroup status and info */
 	err = btrfs_run_qgroups(trans, root->fs_info);
 	if (err < 0)
-		btrfs_error(root->fs_info, ret,
+		btrfs_std_error(root->fs_info, ret,
 			    "failed to update qgroup status and info\n");
 	err = btrfs_end_transaction(trans, root);
 	if (err && !ret)
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 303babe..58ede0a 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2418,7 +2418,7 @@ again:
 	}
 out:
 	if (ret) {
-		btrfs_std_error(root->fs_info, ret);
+		btrfs_std_error(root->fs_info, ret, NULL);
 		if (!list_empty(&reloc_roots))
 			free_reloc_roots(&reloc_roots);
 
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 360a728..6d90851 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -283,7 +283,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
 			trans = btrfs_join_transaction(tree_root);
 			if (IS_ERR(trans)) {
 				err = PTR_ERR(trans);
-				btrfs_error(tree_root->fs_info, err,
+				btrfs_std_error(tree_root->fs_info, err,
 					    "Failed to start trans to delete "
 					    "orphan item");
 				break;
@@ -292,7 +292,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
 						    root_key.objectid);
 			btrfs_end_transaction(trans, tree_root);
 			if (err) {
-				btrfs_error(tree_root->fs_info, err,
+				btrfs_std_error(tree_root->fs_info, err,
 					    "Failed to delete root orphan "
 					    "item");
 				break;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 74bc333..3482d9e 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -2135,7 +2135,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 	ret = btrfs_write_and_wait_transaction(trans, root);
 	if (ret) {
-		btrfs_error(root->fs_info, ret,
+		btrfs_std_error(root->fs_info, ret,
 			    "Error while writing out transaction");
 		mutex_unlock(&root->fs_info->tree_log_mutex);
 		goto scrub_continue;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 1bbaace..c3f9a9c 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -5314,7 +5314,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
 
 	ret = walk_log_tree(trans, log_root_tree, &wc);
 	if (ret) {
-		btrfs_error(fs_info, ret, "Failed to pin buffers while "
+		btrfs_std_error(fs_info, ret, "Failed to pin buffers while "
 			    "recovering log root tree.");
 		goto error;
 	}
@@ -5328,7 +5328,7 @@ again:
 		ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
 
 		if (ret < 0) {
-			btrfs_error(fs_info, ret,
+			btrfs_std_error(fs_info, ret,
 				    "Couldn't find tree log root.");
 			goto error;
 		}
@@ -5346,7 +5346,7 @@ again:
 		log = btrfs_read_fs_root(log_root_tree, &found_key);
 		if (IS_ERR(log)) {
 			ret = PTR_ERR(log);
-			btrfs_error(fs_info, ret,
+			btrfs_std_error(fs_info, ret,
 				    "Couldn't read tree log root.");
 			goto error;
 		}
@@ -5361,7 +5361,7 @@ again:
 			free_extent_buffer(log->node);
 			free_extent_buffer(log->commit_root);
 			kfree(log);
-			btrfs_error(fs_info, ret, "Couldn't read target root "
+			btrfs_std_error(fs_info, ret, "Couldn't read target root "
 				    "for tree log recovery.");
 			goto error;
 		}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e8a6182..99f294c 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1402,7 +1402,7 @@ again:
 		extent = btrfs_item_ptr(leaf, path->slots[0],
 					struct btrfs_dev_extent);
 	} else {
-		btrfs_error(root->fs_info, ret, "Slot search failed");
+		btrfs_std_error(root->fs_info, ret, "Slot search failed");
 		goto out;
 	}
 
@@ -1410,7 +1410,7 @@ again:
 
 	ret = btrfs_del_item(trans, root, path);
 	if (ret) {
-		btrfs_error(root->fs_info, ret,
+		btrfs_std_error(root->fs_info, ret,
 			    "Failed to remove dev extent item");
 	} else {
 		trans->transaction->have_free_bgs = 1;
@@ -2368,7 +2368,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 
 		ret = btrfs_relocate_sys_chunks(root);
 		if (ret < 0)
-			btrfs_error(root->fs_info, ret,
+			btrfs_std_error(root->fs_info, ret,
 				    "Failed to relocate sys chunks after "
 				    "device initialization. This can be fixed "
 				    "using the \"btrfs balance\" command.");
@@ -2613,7 +2613,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
 	if (ret < 0)
 		goto out;
 	else if (ret > 0) { /* Logic error or corruption */
-		btrfs_error(root->fs_info, -ENOENT,
+		btrfs_std_error(root->fs_info, -ENOENT,
 			    "Failed lookup while freeing chunk.");
 		ret = -ENOENT;
 		goto out;
@@ -2621,7 +2621,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
 
 	ret = btrfs_del_item(trans, root, path);
 	if (ret < 0)
-		btrfs_error(root->fs_info, ret,
+		btrfs_std_error(root->fs_info, ret,
 			    "Failed to delete chunk item.");
 out:
 	btrfs_free_path(path);
@@ -2806,7 +2806,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset)
 	trans = btrfs_start_transaction(root, 0);
 	if (IS_ERR(trans)) {
 		ret = PTR_ERR(trans);
-		btrfs_std_error(root->fs_info, ret);
+		btrfs_std_error(root->fs_info, ret, NULL);
 		return ret;
 	}
 
@@ -3461,7 +3461,7 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info)
 	unset_balance_control(fs_info);
 	ret = del_balance_item(fs_info->tree_root);
 	if (ret)
-		btrfs_std_error(fs_info, ret);
+		btrfs_std_error(fs_info, ret, NULL);
 
 	atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
 }
-- 
cgit v1.1


From b2f73922d119686323f14fbbe46587f863852328 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 30 Sep 2015 15:59:17 +0200
Subject: fs/proc, core/debug: Don't expose absolute kernel addresses via wchan

So the /proc/PID/stat 'wchan' field (the 30th field, which contains
the absolute kernel address of the kernel function a task is blocked in)
leaks absolute kernel addresses to unprivileged user-space:

        seq_put_decimal_ull(m, ' ', wchan);

The absolute address might also leak via /proc/PID/wchan as well, if
KALLSYMS is turned off or if the symbol lookup fails for some reason:

static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
                          struct pid *pid, struct task_struct *task)
{
        unsigned long wchan;
        char symname[KSYM_NAME_LEN];

        wchan = get_wchan(task);

        if (lookup_symbol_name(wchan, symname) < 0) {
                if (!ptrace_may_access(task, PTRACE_MODE_READ))
                        return 0;
                seq_printf(m, "%lu", wchan);
        } else {
                seq_printf(m, "%s", symname);
        }

        return 0;
}

This isn't ideal, because for example it trivially leaks the KASLR offset
to any local attacker:

  fomalhaut:~> printf "%016lx\n" $(cat /proc/$$/stat | cut -d' ' -f35)
  ffffffff8123b380

Most real-life uses of wchan are symbolic:

  ps -eo pid:10,tid:10,wchan:30,comm

and procps uses /proc/PID/wchan, not the absolute address in /proc/PID/stat:

  triton:~/tip> strace -f ps -eo pid:10,tid:10,wchan:30,comm 2>&1 | grep wchan | tail -1
  open("/proc/30833/wchan", O_RDONLY)     = 6

There's one compatibility quirk here: procps relies on whether the
absolute value is non-zero - and we can provide that functionality
by outputing "0" or "1" depending on whether the task is blocked
(whether there's a wchan address).

These days there appears to be very little legitimate reason
user-space would be interested in  the absolute address. The
absolute address is mostly historic: from the days when we
didn't have kallsyms and user-space procps had to do the
decoding itself via the System.map.

So this patch sets all numeric output to "0" or "1" and keeps only
symbolic output, in /proc/PID/wchan.

( The absolute sleep address can generally still be profiled via
  perf, by tasks with sufficient privileges. )

Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Kees Cook <keescook@chromium.org>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: <stable@vger.kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Konovalov <andreyknvl@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Kostya Serebryany <kcc@google.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: kasan-dev <kasan-dev@googlegroups.com>
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/20150930135917.GA3285@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 fs/proc/array.c | 16 ++++++++++++++--
 fs/proc/base.c  |  9 +++------
 2 files changed, 17 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index f60f012..eed2050 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -375,7 +375,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
 static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 			struct pid *pid, struct task_struct *task, int whole)
 {
-	unsigned long vsize, eip, esp, wchan = ~0UL;
+	unsigned long vsize, eip, esp, wchan = 0;
 	int priority, nice;
 	int tty_pgrp = -1, tty_nr = 0;
 	sigset_t sigign, sigcatch;
@@ -507,7 +507,19 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 	seq_put_decimal_ull(m, ' ', task->blocked.sig[0] & 0x7fffffffUL);
 	seq_put_decimal_ull(m, ' ', sigign.sig[0] & 0x7fffffffUL);
 	seq_put_decimal_ull(m, ' ', sigcatch.sig[0] & 0x7fffffffUL);
-	seq_put_decimal_ull(m, ' ', wchan);
+
+	/*
+	 * We used to output the absolute kernel address, but that's an
+	 * information leak - so instead we show a 0/1 flag here, to signal
+	 * to user-space whether there's a wchan field in /proc/PID/wchan.
+	 *
+	 * This works with older implementations of procps as well.
+	 */
+	if (wchan)
+		seq_puts(m, " 1");
+	else
+		seq_puts(m, " 0");
+
 	seq_put_decimal_ull(m, ' ', 0);
 	seq_put_decimal_ull(m, ' ', 0);
 	seq_put_decimal_ll(m, ' ', task->exit_signal);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index b25eee4..29595af 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -430,13 +430,10 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
 
 	wchan = get_wchan(task);
 
-	if (lookup_symbol_name(wchan, symname) < 0) {
-		if (!ptrace_may_access(task, PTRACE_MODE_READ))
-			return 0;
-		seq_printf(m, "%lu", wchan);
-	} else {
+	if (wchan && ptrace_may_access(task, PTRACE_MODE_READ) && !lookup_symbol_name(wchan, symname))
 		seq_printf(m, "%s", symname);
-	}
+	else
+		seq_putc(m, '0');
 
 	return 0;
 }
-- 
cgit v1.1


From d74a625987a134b00749e3912bdb6d3ac83fd71d Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Fri, 14 Aug 2015 18:32:56 +0800
Subject: Btrfs: use BTRFS_ERROR_DEV_MISSING_NOT_FOUND when missing device is
 not found

Use btrfs specific error code BTRFS_ERROR_DEV_MISSING_NOT_FOUND instead
of -ENOENT.  Next this removes the logging when user specifies "missing"
and we don't find it in the kernel device list. Logging are for system
events not for user input errors.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 99f294c..c3e2591 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2041,10 +2041,8 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
 			}
 		}
 
-		if (!*device) {
-			btrfs_err(root->fs_info, "no missing device found");
-			return -ENOENT;
-		}
+		if (!*device)
+			return BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
 
 		return 0;
 	} else {
-- 
cgit v1.1


From 29c36d72535deb3d8961b3fb4b6a565190a6c63b Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Fri, 14 Aug 2015 18:32:58 +0800
Subject: Btrfs: add btrfs_read_dev_one_super() to read one specific SB

This uses a chunk of code from btrfs_read_dev_super() and creates
a function called btrfs_read_dev_one_super() so that next patch
can use it for scratch superblock.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
[renamed bufhead to bh]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c | 53 +++++++++++++++++++++++++++++++++--------------------
 fs/btrfs/disk-io.h |  2 ++
 2 files changed, 35 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0f8e33f..c7c739f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3188,6 +3188,37 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 	put_bh(bh);
 }
 
+int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
+			struct buffer_head **bh_ret)
+{
+	struct buffer_head *bh;
+	struct btrfs_super_block *super;
+	u64 bytenr;
+
+	bytenr = btrfs_sb_offset(copy_num);
+	if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode))
+		return -EINVAL;
+
+	bh = __bread(bdev, bytenr / 4096, BTRFS_SUPER_INFO_SIZE);
+	/*
+	 * If we fail to read from the underlying devices, as of now
+	 * the best option we have is to mark it EIO.
+	 */
+	if (!bh)
+		return -EIO;
+
+	super = (struct btrfs_super_block *)bh->b_data;
+	if (btrfs_super_bytenr(super) != bytenr ||
+		    btrfs_super_magic(super) != BTRFS_MAGIC) {
+		brelse(bh);
+		return -EINVAL;
+	}
+
+	*bh_ret = bh;
+	return 0;
+}
+
+
 struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
 {
 	struct buffer_head *bh;
@@ -3195,7 +3226,6 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
 	struct btrfs_super_block *super;
 	int i;
 	u64 transid = 0;
-	u64 bytenr;
 	int ret = -EINVAL;
 
 	/* we would like to check all the supers, but that would make
@@ -3204,28 +3234,11 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
 	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
 	 */
 	for (i = 0; i < 1; i++) {
-		bytenr = btrfs_sb_offset(i);
-		if (bytenr + BTRFS_SUPER_INFO_SIZE >=
-					i_size_read(bdev->bd_inode))
-			break;
-		bh = __bread(bdev, bytenr / 4096,
-					BTRFS_SUPER_INFO_SIZE);
-		/*
-		 * If we fail to read from the underlying devices, as of now
-		 * the best option we have is to mark it EIO.
-		 */
-		if (!bh) {
-			ret = -EIO;
+		ret = btrfs_read_dev_one_super(bdev, i, &bh);
+		if (ret)
 			continue;
-		}
 
 		super = (struct btrfs_super_block *)bh->b_data;
-		if (btrfs_super_bytenr(super) != bytenr ||
-		    btrfs_super_magic(super) != BTRFS_MAGIC) {
-			brelse(bh);
-			ret = -EINVAL;
-			continue;
-		}
 
 		if (!latest || btrfs_super_generation(super) > transid) {
 			brelse(latest);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index bdfb479..adeb318 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -60,6 +60,8 @@ void close_ctree(struct btrfs_root *root);
 int write_ctree_super(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root, int max_mirrors);
 struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
+int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
+			struct buffer_head **bh_ret);
 int btrfs_commit_super(struct btrfs_root *root);
 struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
 					    u64 bytenr);
-- 
cgit v1.1


From 12b1c2637b6e0763121648d637ffe603b422e986 Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Fri, 14 Aug 2015 18:32:59 +0800
Subject: Btrfs: enhance btrfs_scratch_superblock to scratch all superblocks

This patch updates and renames btrfs_scratch_superblocks, (which is used
by the replace device thread), with those fixes from the scratch
superblock code section of btrfs_rm_device(). The fixes are:
  Scratch all copies of superblock
  Notify kobject that superblock has been changed
  Update time on the device

So that btrfs_rm_device() can use the function
btrfs_scratch_superblocks() instead of its own scratch code. And further
replace deivce code which similarly releases device back to the system,
will have the fixes from the btrfs device delete.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
[renamed to btrfs_scratch_superblock]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 38 ++++++++++++++++++++++++++------------
 fs/btrfs/volumes.h |  2 +-
 2 files changed, 27 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c3e2591..1cde784 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1924,7 +1924,8 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
 	if (srcdev->writeable) {
 		fs_devices->rw_devices--;
 		/* zero out the old super if it is writable */
-		btrfs_scratch_superblock(srcdev);
+		btrfs_scratch_superblocks(srcdev->bdev,
+					rcu_str_deref(srcdev->name));
 	}
 
 	if (srcdev->bdev)
@@ -1974,7 +1975,8 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
 	btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev);
 
 	if (tgtdev->bdev) {
-		btrfs_scratch_superblock(tgtdev);
+		btrfs_scratch_superblocks(tgtdev->bdev,
+					rcu_str_deref(tgtdev->name));
 		fs_info->fs_devices->open_devices--;
 	}
 	fs_info->fs_devices->num_devices--;
@@ -6738,22 +6740,34 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
 	return 0;
 }
 
-int btrfs_scratch_superblock(struct btrfs_device *device)
+void btrfs_scratch_superblocks(struct block_device *bdev, char *device_path)
 {
 	struct buffer_head *bh;
 	struct btrfs_super_block *disk_super;
+	int copy_num;
 
-	bh = btrfs_read_dev_super(device->bdev);
-	if (IS_ERR(bh))
-		return PTR_ERR(bh);
-	disk_super = (struct btrfs_super_block *)bh->b_data;
+	if (!bdev)
+		return;
 
-	memset(&disk_super->magic, 0, sizeof(disk_super->magic));
-	set_buffer_dirty(bh);
-	sync_dirty_buffer(bh);
-	brelse(bh);
+	for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX;
+		copy_num++) {
 
-	return 0;
+		if (btrfs_read_dev_one_super(bdev, copy_num, &bh))
+			continue;
+
+		disk_super = (struct btrfs_super_block *)bh->b_data;
+
+		memset(&disk_super->magic, 0, sizeof(disk_super->magic));
+		set_buffer_dirty(bh);
+		sync_dirty_buffer(bh);
+		brelse(bh);
+	}
+
+	/* Notify udev that device has changed */
+	btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
+
+	/* Update ctime/mtime for device path for libblkid */
+	update_dev_time(device_path);
 }
 
 /*
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 413a70c..8663bf7 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -474,7 +474,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
 				      struct btrfs_device *tgtdev);
 void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
 					      struct btrfs_device *tgtdev);
-int btrfs_scratch_superblock(struct btrfs_device *device);
+void btrfs_scratch_superblocks(struct block_device *bdev, char *device_path);
 int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
 			   u64 logical, u64 len, int mirror_num);
 unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
-- 
cgit v1.1


From 9e271ae27e4407388c5b25c747abc25c07838b0b Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Fri, 14 Aug 2015 18:33:02 +0800
Subject: Btrfs: kernel operation should come after user input has been
 verified

By general rule of thumb there shouldn't be any way that user land
could trigger a kernel operation just by sending wrong arguments.

Here do commit cleanups after user input has been verified.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/dev-replace.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 7aed8a8..02031fc 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -327,19 +327,6 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
 	    args->start.tgtdev_name[0] == '\0')
 		return -EINVAL;
 
-	/*
-	 * Here we commit the transaction to make sure commit_total_bytes
-	 * of all the devices are updated.
-	 */
-	trans = btrfs_attach_transaction(root);
-	if (!IS_ERR(trans)) {
-		ret = btrfs_commit_transaction(trans, root);
-		if (ret)
-			return ret;
-	} else if (PTR_ERR(trans) != -ENOENT) {
-		return PTR_ERR(trans);
-	}
-
 	/* the disk copy procedure reuses the scrub code */
 	mutex_lock(&fs_info->volume_mutex);
 	ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid,
@@ -356,6 +343,19 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
 	if (ret)
 		return ret;
 
+	/*
+	 * Here we commit the transaction to make sure commit_total_bytes
+	 * of all the devices are updated.
+	 */
+	trans = btrfs_attach_transaction(root);
+	if (!IS_ERR(trans)) {
+		ret = btrfs_commit_transaction(trans, root);
+		if (ret)
+			return ret;
+	} else if (PTR_ERR(trans) != -ENOENT) {
+		return PTR_ERR(trans);
+	}
+
 	btrfs_dev_replace_lock(dev_replace);
 	switch (dev_replace->replace_state) {
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
-- 
cgit v1.1


From 097efc966ab3e2c3677698f4ab5b229bfff101dd Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Fri, 14 Aug 2015 18:33:04 +0800
Subject: Btrfs: don't log error from btrfs_get_bdev_and_sb

Originally the message was not in a helper but ended up there. We should
print error messages from callers instead.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
[reworded subject and changelog]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 1cde784..694d82b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -198,7 +198,6 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
 
 	if (IS_ERR(*bdev)) {
 		ret = PTR_ERR(*bdev);
-		printk(KERN_INFO "BTRFS: open %s failed\n", device_path);
 		goto error;
 	}
 
-- 
cgit v1.1


From f190aa471a7a703444f6587ed79c24a04a22d848 Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Fri, 14 Aug 2015 18:33:05 +0800
Subject: Btrfs: add helper for closing one device

Signed-off-by: Anand Jain <anand.jain@oracle.com>
[reworded subject and changelog]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 66 +++++++++++++++++++++++++++++-------------------------
 fs/btrfs/volumes.h |  1 +
 2 files changed, 37 insertions(+), 30 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 694d82b..5688b78 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -764,36 +764,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 
 	mutex_lock(&fs_devices->device_list_mutex);
 	list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
-		struct btrfs_device *new_device;
-		struct rcu_string *name;
-
-		if (device->bdev)
-			fs_devices->open_devices--;
-
-		if (device->writeable &&
-		    device->devid != BTRFS_DEV_REPLACE_DEVID) {
-			list_del_init(&device->dev_alloc_list);
-			fs_devices->rw_devices--;
-		}
-
-		if (device->missing)
-			fs_devices->missing_devices--;
-
-		new_device = btrfs_alloc_device(NULL, &device->devid,
-						device->uuid);
-		BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
-
-		/* Safe because we are under uuid_mutex */
-		if (device->name) {
-			name = rcu_string_strdup(device->name->str, GFP_NOFS);
-			BUG_ON(!name); /* -ENOMEM */
-			rcu_assign_pointer(new_device->name, name);
-		}
-
-		list_replace_rcu(&device->dev_list, &new_device->dev_list);
-		new_device->fs_devices = device->fs_devices;
-
-		call_rcu(&device->rcu, free_device);
+		btrfs_close_one_device(device);
 	}
 	mutex_unlock(&fs_devices->device_list_mutex);
 
@@ -6834,3 +6805,38 @@ void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
 		fs_devices = fs_devices->seed;
 	}
 }
+
+void btrfs_close_one_device(struct btrfs_device *device)
+{
+	struct btrfs_fs_devices *fs_devices = device->fs_devices;
+	struct btrfs_device *new_device;
+	struct rcu_string *name;
+
+	if (device->bdev)
+		fs_devices->open_devices--;
+
+	if (device->writeable &&
+	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
+		list_del_init(&device->dev_alloc_list);
+		fs_devices->rw_devices--;
+	}
+
+	if (device->missing)
+		fs_devices->missing_devices--;
+
+	new_device = btrfs_alloc_device(NULL, &device->devid,
+					device->uuid);
+	BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
+
+	/* Safe because we are under uuid_mutex */
+	if (device->name) {
+		name = rcu_string_strdup(device->name->str, GFP_NOFS);
+		BUG_ON(!name); /* -ENOMEM */
+		rcu_assign_pointer(new_device->name, name);
+	}
+
+	list_replace_rcu(&device->dev_list, &new_device->dev_list);
+	new_device->fs_devices = device->fs_devices;
+
+	call_rcu(&device->rcu, free_device);
+}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 8663bf7..38450dc 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -547,5 +547,6 @@ static inline void unlock_chunks(struct btrfs_root *root)
 struct list_head *btrfs_get_fs_uuids(void);
 void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info);
 void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info);
+void btrfs_close_one_device(struct btrfs_device *device);
 
 #endif
-- 
cgit v1.1


From 73416dab235e5ff030e3b3c61da0cd6ced324fc9 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Fri, 14 Aug 2015 18:33:07 +0800
Subject: Btrfs: move kobj stuff out of dev_replace lock range

To avoid deadlock described in commit 084b6e7c7607 ("btrfs: Fix a
lockdep warning when running xfstest."), we should move kobj stuff out
of dev_replace lock range.

  "It is because the btrfs_kobj_{add/rm}_device() will call memory
  allocation with GFP_KERNEL,
  which may flush fs page cache to free space, waiting for it self to do
  the commit, causing the deadlock.

  To solve the problem, move btrfs_kobj_{add/rm}_device() out of the
  dev_replace lock range, also involing split the
  btrfs_rm_dev_replace_srcdev() function into remove and free parts.

  Now only btrfs_rm_dev_replace_remove_srcdev() is called in dev_replace
  lock range, and kobj_{add/rm} and btrfs_rm_dev_replace_free_srcdev() are
  called out of the lock range."

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Anand Jain <anand.jain@oracle.com>
[added lockup description]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/dev-replace.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 02031fc..0978e52 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -375,10 +375,6 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
 	WARN_ON(!tgt_device);
 	dev_replace->tgtdev = tgt_device;
 
-	ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device);
-	if (ret)
-		btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret);
-
 	printk_in_rcu(KERN_INFO
 		      "BTRFS: dev_replace from %s (devid %llu) to %s started\n",
 		      src_device->missing ? "<missing disk>" :
@@ -401,6 +397,10 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
 	args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
 	btrfs_dev_replace_unlock(dev_replace);
 
+	ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device);
+	if (ret)
+		btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret);
+
 	btrfs_wait_ordered_roots(root->fs_info, -1);
 
 	/* force writing the updated state information to disk */
-- 
cgit v1.1


From 491e94f790982bec5a1334dd3db2708de3724bdf Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Thu, 1 Oct 2015 11:47:31 -0500
Subject: gfs2: Add missing else in trans_add_meta/data

This patch fixes a timing window that causes a segfault.
The problem is that bd can remain NULL throughout the function
and then reference that NULL pointer if the bh->b_private starts
out NULL, then someone sets it to non-NULL inside the locking.
In that case, bd still needs to be set.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
---
 fs/gfs2/trans.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index b95d0d6..0c1bde3 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -176,6 +176,8 @@ void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh)
 		unlock_buffer(bh);
 		if (bh->b_private == NULL)
 			bd = gfs2_alloc_bufdata(gl, bh, &gfs2_databuf_lops);
+		else
+			bd = bh->b_private;
 		lock_buffer(bh);
 		gfs2_log_lock(sdp);
 	}
@@ -236,6 +238,8 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh)
 		lock_page(bh->b_page);
 		if (bh->b_private == NULL)
 			bd = gfs2_alloc_bufdata(gl, bh, &gfs2_buf_lops);
+		else
+			bd = bh->b_private;
 		unlock_page(bh->b_page);
 		lock_buffer(bh);
 		gfs2_log_lock(sdp);
-- 
cgit v1.1


From 8346c416d17bf5b4ea1508662959bb62e73fd6a5 Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Thu, 1 Oct 2015 15:36:59 -0700
Subject: dax: fix NULL pointer in __dax_pmd_fault()

Commit 46c043ede471 ("mm: take i_mmap_lock in unmap_mapping_range() for
DAX") moved some code in __dax_pmd_fault() that was responsible for
zeroing newly allocated PMD pages.  The new location didn't properly set
up 'kaddr', so when run this code resulted in a NULL pointer BUG.

Fix this by getting the correct 'kaddr' via bdev_direct_access().

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reported-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Matthew Wilcox <willy@linux.intel.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/dax.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/dax.c b/fs/dax.c
index 7ae6df7..bcfb14b 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -569,8 +569,20 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE)
 		goto fallback;
 
+	sector = bh.b_blocknr << (blkbits - 9);
+
 	if (buffer_unwritten(&bh) || buffer_new(&bh)) {
 		int i;
+
+		length = bdev_direct_access(bh.b_bdev, sector, &kaddr, &pfn,
+						bh.b_size);
+		if (length < 0) {
+			result = VM_FAULT_SIGBUS;
+			goto out;
+		}
+		if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR))
+			goto fallback;
+
 		for (i = 0; i < PTRS_PER_PMD; i++)
 			clear_pmem(kaddr + i * PAGE_SIZE, PAGE_SIZE);
 		wmb_pmem();
@@ -623,7 +635,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 		result = VM_FAULT_NOPAGE;
 		spin_unlock(ptl);
 	} else {
-		sector = bh.b_blocknr << (blkbits - 9);
 		length = bdev_direct_access(bh.b_bdev, sector, &kaddr, &pfn,
 						bh.b_size);
 		if (length < 0) {
-- 
cgit v1.1


From 646200a041203f440fb6fcf9cacd9efeda9de74c Mon Sep 17 00:00:00 2001
From: Steve French <smfrench@gmail.com>
Date: Mon, 28 Sep 2015 17:21:07 -0500
Subject: [SMB3] Do not fall back to SMBWriteX in set_file_size error cases

The error paths in set_file_size for cifs and smb3 are incorrect.

In the unlikely event that a server did not support set file info
of the file size, the code incorrectly falls back to trying SMBWriteX
(note that only the original core SMB Write, used for example by DOS,
can set the file size this way - this actually  does not work for the more
recent SMBWriteX).  The idea was since the old DOS SMB Write could set
the file size if you write zero bytes at that offset then use that if
server rejects the normal set file info call.

Fortunately the SMBWriteX will never be sent on the wire (except when
file size is zero) since the length and offset fields were reversed
in the two places in this function that call SMBWriteX causing
the fall back path to return an error. It is also important to never call
an SMB request from an SMB2/sMB3 session (which theoretically would
be possible, and can cause a brief session drop, although the client
recovers) so this should be fixed.  In practice this path does not happen
with modern servers but the error fall back to SMBWriteX is clearly wrong.

Removing the calls to SMBWriteX in the error paths in cifs_set_file_size

Pointed out by PaX/grsecurity team

Signed-off-by: Steve French <steve.french@primarydata.com>
Reported-by: PaX Team <pageexec@freemail.hu>
CC: Emese Revfy <re.emese@gmail.com>
CC: Brad Spengler <spender@grsecurity.net>
CC: Stable <stable@vger.kernel.org>
---
 fs/cifs/inode.c | 34 ----------------------------------
 1 file changed, 34 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index f621b44..6b66dd5 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -2034,7 +2034,6 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
 	struct tcon_link *tlink = NULL;
 	struct cifs_tcon *tcon = NULL;
 	struct TCP_Server_Info *server;
-	struct cifs_io_parms io_parms;
 
 	/*
 	 * To avoid spurious oplock breaks from server, in the case of
@@ -2056,18 +2055,6 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
 			rc = -ENOSYS;
 		cifsFileInfo_put(open_file);
 		cifs_dbg(FYI, "SetFSize for attrs rc = %d\n", rc);
-		if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
-			unsigned int bytes_written;
-
-			io_parms.netfid = open_file->fid.netfid;
-			io_parms.pid = open_file->pid;
-			io_parms.tcon = tcon;
-			io_parms.offset = 0;
-			io_parms.length = attrs->ia_size;
-			rc = CIFSSMBWrite(xid, &io_parms, &bytes_written,
-					  NULL, NULL, 1);
-			cifs_dbg(FYI, "Wrt seteof rc %d\n", rc);
-		}
 	} else
 		rc = -EINVAL;
 
@@ -2093,28 +2080,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
 	else
 		rc = -ENOSYS;
 	cifs_dbg(FYI, "SetEOF by path (setattrs) rc = %d\n", rc);
-	if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
-		__u16 netfid;
-		int oplock = 0;
 
-		rc = SMBLegacyOpen(xid, tcon, full_path, FILE_OPEN,
-				   GENERIC_WRITE, CREATE_NOT_DIR, &netfid,
-				   &oplock, NULL, cifs_sb->local_nls,
-				   cifs_remap(cifs_sb));
-		if (rc == 0) {
-			unsigned int bytes_written;
-
-			io_parms.netfid = netfid;
-			io_parms.pid = current->tgid;
-			io_parms.tcon = tcon;
-			io_parms.offset = 0;
-			io_parms.length = attrs->ia_size;
-			rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, NULL,
-					  NULL,  1);
-			cifs_dbg(FYI, "wrt seteof rc %d\n", rc);
-			CIFSSMBClose(xid, tcon, netfid);
-		}
-	}
 	if (tlink)
 		cifs_put_tlink(tlink);
 
-- 
cgit v1.1


From 40f90271a835478e5910dc84f32b8e25885419a4 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Thu, 1 Oct 2015 11:36:38 -0400
Subject: NFS: Fix up page writeback accounting

Currently, we are crediting all the calls to nfs_writepages_callback()
(i.e. the nfs_writepages() callback) to nfs_writepage(). Aside from
being inconsistent with the behaviour of the equivalent readpage/readpages
accounting, this also means that we cannot distinguish between bulk writes
and single page writebacks (which confuses the 'nfsiostat -p' tool).

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/write.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 72624dc..cfa4776 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -569,19 +569,17 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
 	if (!nfs_pageio_add_request(pgio, req)) {
 		nfs_redirty_request(req);
 		ret = pgio->pg_error;
-	}
+	} else
+		nfs_add_stats(page_file_mapping(page)->host,
+				NFSIOS_WRITEPAGES, 1);
 out:
 	return ret;
 }
 
 static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
 {
-	struct inode *inode = page_file_mapping(page)->host;
 	int ret;
 
-	nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
-	nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
-
 	nfs_pageio_cond_complete(pgio, page_file_index(page));
 	ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE);
 	if (ret == -EAGAIN) {
@@ -597,9 +595,11 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, st
 static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc)
 {
 	struct nfs_pageio_descriptor pgio;
+	struct inode *inode = page_file_mapping(page)->host;
 	int err;
 
-	nfs_pageio_init_write(&pgio, page->mapping->host, wb_priority(wbc),
+	nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
+	nfs_pageio_init_write(&pgio, inode, wb_priority(wbc),
 				false, &nfs_async_write_completion_ops);
 	err = nfs_do_writepage(page, wbc, &pgio);
 	nfs_pageio_complete(&pgio);
-- 
cgit v1.1


From 8fa4592a14ebb3c22a21d846d1e4f65dab7d1a7c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Thu, 1 Oct 2015 18:38:27 -0400
Subject: NFS: Fix a write performance regression

If all other conditions in nfs_can_extend_write() are met, and there
are no locks, then we should be able to assume close-to-open semantics
and the ability to extend our write to cover the whole page.

With this patch, the xfstests generic/074 test completes in 242s instead
of >1400s on my test rig.

Fixes: bd61e0a9c852 ("locks: convert posix locks to file_lock_context")
Cc: Jeff Layton <jlayton@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/write.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index cfa4776..75ab762 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1223,7 +1223,7 @@ static int nfs_can_extend_write(struct file *file, struct page *page, struct ino
 		return 1;
 	if (!flctx || (list_empty_careful(&flctx->flc_flock) &&
 		       list_empty_careful(&flctx->flc_posix)))
-		return 0;
+		return 1;
 
 	/* Check to see if there are whole file write locks */
 	ret = 0;
-- 
cgit v1.1


From 4a0954ef347de7409ddf8f8153d893827d3feba8 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Fri, 2 Oct 2015 11:11:16 -0400
Subject: NFSv4: Don't try to reclaim unused state owners

Currently, we don't test if the state owner is in use before we try to
recover it. The problem is that if the refcount is zero, then the
state owner will be waiting on the lru list for garbage collection.
The expectation in that case is that if you bump the refcount, then
you must also remove the state owner from the lru list. Otherwise
the call to nfs4_put_state_owner will corrupt that list by trying
to add our state owner a second time.

Avoid the whole problem by just skipping state owners that hold no
state.

Reported-by: Andrew W Elble <aweits@rit.edu>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4state.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 5db3246..d854693 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1725,7 +1725,8 @@ restart:
 			if (!test_and_clear_bit(ops->owner_flag_bit,
 							&sp->so_flags))
 				continue;
-			atomic_inc(&sp->so_count);
+			if (!atomic_inc_not_zero(&sp->so_count))
+				continue;
 			spin_unlock(&clp->cl_lock);
 			rcu_read_unlock();
 
-- 
cgit v1.1


From 72d79ff83c34d430a9f7f72c45717905762ef4d9 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Fri, 2 Oct 2015 11:44:54 -0400
Subject: NFSv4.1: nfs4_opendata_check_deleg needs to handle
 NFS4_OPEN_CLAIM_DELEG_CUR_FH

We need to warn against broken NFSv4.1 servers that try to hand out
delegations in response to NFS4_OPEN_CLAIM_DELEG_CUR_FH.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index f93b9cd..12e9808 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1458,12 +1458,18 @@ nfs4_opendata_check_deleg(struct nfs4_opendata *data, struct nfs4_state *state)
 	if (delegation)
 		delegation_flags = delegation->flags;
 	rcu_read_unlock();
-	if (data->o_arg.claim == NFS4_OPEN_CLAIM_DELEGATE_CUR) {
+	switch (data->o_arg.claim) {
+	default:
+		break;
+	case NFS4_OPEN_CLAIM_DELEGATE_CUR:
+	case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
 		pr_err_ratelimited("NFS: Broken NFSv4 server %s is "
 				   "returning a delegation for "
 				   "OPEN(CLAIM_DELEGATE_CUR)\n",
 				   clp->cl_hostname);
-	} else if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0)
+		return;
+	}
+	if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0)
 		nfs_inode_set_delegation(state->inode,
 					 data->owner->so_cred,
 					 &data->o_res);
-- 
cgit v1.1


From e92c1e0d40c50472f80820bd829645ce9fefd6c1 Mon Sep 17 00:00:00 2001
From: Benjamin Coddington <bcodding@redhat.com>
Date: Thu, 1 Oct 2015 09:17:33 -0400
Subject: NFSv4: Fix a nograce recovery hang

Since commit 5cae02f42793130e1387f4ec09c4d07056ce9fa5 an OPEN_CONFIRM should
have a privileged sequence in the recovery case to allow nograce recovery to
proceed for NFSv4.0.

Signed-off-by: Benjamin Coddington <bcodding@redhat.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 12e9808..83d5770 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1869,6 +1869,8 @@ static int _nfs4_proc_open_confirm(struct nfs4_opendata *data)
 	data->rpc_done = 0;
 	data->rpc_status = 0;
 	data->timestamp = jiffies;
+	if (data->is_recover)
+		nfs4_set_sequence_privileged(&data->c_arg.seq_args);
 	task = rpc_run_task(&task_setup_data);
 	if (IS_ERR(task))
 		return PTR_ERR(task);
-- 
cgit v1.1


From 5e99b532bb95f8e6bf39f4500f0caef070bac16d Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@poochiereds.net>
Date: Fri, 2 Oct 2015 13:14:37 -0400
Subject: nfs4: reset states to use open_stateid when returning delegation
 voluntarily

When the client goes to return a delegation, it should always update any
nfs4_state currently set up to use that delegation stateid to instead
use the open stateid. It already does do this in some cases,
particularly in the state recovery code, but not currently when the
delegation is voluntarily returned (e.g. in advance of a RENAME).  This
causes the client to try to continue using the delegation stateid after
the DELEGRETURN, e.g. in LAYOUTGET.

Set the nfs4_state back to using the open stateid in
nfs4_open_delegation_recall, just before clearing the
NFS_DELEGATED_STATE bit.

Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 83d5770..5133bb1 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1777,6 +1777,9 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx,
 	if (IS_ERR(opendata))
 		return PTR_ERR(opendata);
 	nfs4_stateid_copy(&opendata->o_arg.u.delegation, stateid);
+	write_seqlock(&state->seqlock);
+	nfs4_stateid_copy(&state->stateid, &state->open_stateid);
+	write_sequnlock(&state->seqlock);
 	clear_bit(NFS_DELEGATED_STATE, &state->flags);
 	switch (type & (FMODE_READ|FMODE_WRITE)) {
 	case FMODE_READ|FMODE_WRITE:
-- 
cgit v1.1


From 937d7b84dca58f2565715f2c8e52f14c3d65fb22 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 2 Oct 2015 23:54:58 -0400
Subject: ext4 crypto: fix memory leak in ext4_bio_write_page()

There are times when ext4_bio_write_page() is called even though we
don't actually need to do any I/O.  This happens when ext4_writepage()
gets called by the jbd2 commit path when an inode needs to force its
pages written out in order to provide data=ordered guarantees --- and
a page is backed by an unwritten (e.g., uninitialized) block on disk,
or if delayed allocation means the page's backing store hasn't been
allocated yet.  In that case, we need to skip the call to
ext4_encrypt_page(), since in addition to wasting CPU, it leads to a
bounce page and an ext4 crypto context getting leaked.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@vger.kernel.org
---
 fs/ext4/page-io.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 84ba4d2..17fbe38 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -425,6 +425,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
 	struct buffer_head *bh, *head;
 	int ret = 0;
 	int nr_submitted = 0;
+	int nr_to_submit = 0;
 
 	blocksize = 1 << inode->i_blkbits;
 
@@ -477,11 +478,13 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
 			unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
 		}
 		set_buffer_async_write(bh);
+		nr_to_submit++;
 	} while ((bh = bh->b_this_page) != head);
 
 	bh = head = page_buffers(page);
 
-	if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
+	if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode) &&
+	    nr_to_submit) {
 		data_page = ext4_encrypt(inode, page);
 		if (IS_ERR(data_page)) {
 			ret = PTR_ERR(data_page);
-- 
cgit v1.1


From cccd147a57e47527ce9c27fcd5f0a1c5669d6870 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 3 Oct 2015 10:49:23 -0400
Subject: ext4: optimize ext4_writepage() for attempted 4k delalloc writes

In cases where the file system block size is the same as the page
size, and ext4_writepage() is asked to write out a page which is
either has the unwritten bit set in the extent tree, or which does not
yet have a block assigned due to delayed allocation, we can bail out
early and, unlocking the page earlier and avoiding a round trip
through ext4_bio_write_page() with the attendant calls to
set_page_writeback() and redirty_page_for_writeback().

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/inode.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 612fbcf..c95d406 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1815,11 +1815,22 @@ static int ext4_writepage(struct page *page,
 	 * the page. But we may reach here when we do a journal commit via
 	 * journal_submit_inode_data_buffers() and in that case we must write
 	 * allocated buffers to achieve data=ordered mode guarantees.
+	 *
+	 * Also, if there is only one buffer per page (the fs block
+	 * size == the page size), if one buffer needs block
+	 * allocation or needs to modify the extent tree to clear the
+	 * unwritten flag, we know that the page can't be written at
+	 * all, so we might as well refuse the write immediately.
+	 * Unfortunately if the block size != page size, we can't as
+	 * easily detect this case using ext4_walk_page_buffers(), but
+	 * for the extremely common case, this is an optimization that
+	 * skips a useless round trip through ext4_bio_write_page().
 	 */
 	if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL,
 				   ext4_bh_delay_or_unwritten)) {
 		redirty_page_for_writepage(wbc, page);
-		if (current->flags & PF_MEMALLOC) {
+		if ((current->flags & PF_MEMALLOC) ||
+		    (inode->i_sb->s_blocksize == PAGE_CACHE_SIZE)) {
 			/*
 			 * For memory cleaning there's no point in writing only
 			 * some buffers. So just bail out. Warn if we came here
-- 
cgit v1.1


From 3684de8ca2b82180f2cdcf5fbcfd3d29ef0afaf8 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 3 Oct 2015 10:49:26 -0400
Subject: ext4 crypto: ext4_page_crypto() doesn't need a encryption context

Since ext4_page_crypto() doesn't need an encryption context (at least
not any more), this allows us to simplify a number function signature
and also allows us to avoid needing to allocate a context in
ext4_block_write_begin().  It also means we no longer need a separate
ext4_decrypt_one() function.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/crypto.c   | 28 +++++-----------------------
 fs/ext4/ext4.h     |  3 +--
 fs/ext4/inode.c    |  4 ++--
 fs/ext4/readpage.c |  2 +-
 4 files changed, 9 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
index 4573155..3a5a7a2 100644
--- a/fs/ext4/crypto.c
+++ b/fs/ext4/crypto.c
@@ -253,8 +253,7 @@ typedef enum {
 	EXT4_ENCRYPT,
 } ext4_direction_t;
 
-static int ext4_page_crypto(struct ext4_crypto_ctx *ctx,
-			    struct inode *inode,
+static int ext4_page_crypto(struct inode *inode,
 			    ext4_direction_t rw,
 			    pgoff_t index,
 			    struct page *src_page,
@@ -353,7 +352,7 @@ struct page *ext4_encrypt(struct inode *inode,
 	if (IS_ERR(ciphertext_page))
 		goto errout;
 	ctx->w.control_page = plaintext_page;
-	err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, plaintext_page->index,
+	err = ext4_page_crypto(inode, EXT4_ENCRYPT, plaintext_page->index,
 			       plaintext_page, ciphertext_page);
 	if (err) {
 		ciphertext_page = ERR_PTR(err);
@@ -378,31 +377,14 @@ struct page *ext4_encrypt(struct inode *inode,
  *
  * Return: Zero on success, non-zero otherwise.
  */
-int ext4_decrypt(struct ext4_crypto_ctx *ctx, struct page *page)
+int ext4_decrypt(struct page *page)
 {
 	BUG_ON(!PageLocked(page));
 
-	return ext4_page_crypto(ctx, page->mapping->host,
+	return ext4_page_crypto(page->mapping->host,
 				EXT4_DECRYPT, page->index, page, page);
 }
 
-/*
- * Convenience function which takes care of allocating and
- * deallocating the encryption context
- */
-int ext4_decrypt_one(struct inode *inode, struct page *page)
-{
-	int ret;
-
-	struct ext4_crypto_ctx *ctx = ext4_get_crypto_ctx(inode);
-
-	if (IS_ERR(ctx))
-		return PTR_ERR(ctx);
-	ret = ext4_decrypt(ctx, page);
-	ext4_release_crypto_ctx(ctx);
-	return ret;
-}
-
 int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex)
 {
 	struct ext4_crypto_ctx	*ctx;
@@ -426,7 +408,7 @@ int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex)
 	}
 
 	while (len--) {
-		err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, lblk,
+		err = ext4_page_crypto(inode, EXT4_ENCRYPT, lblk,
 				       ZERO_PAGE(0), ciphertext_page);
 		if (err)
 			goto errout;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 766b7f7..3f248c9 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2063,8 +2063,7 @@ void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx);
 void ext4_restore_control_page(struct page *data_page);
 struct page *ext4_encrypt(struct inode *inode,
 			  struct page *plaintext_page);
-int ext4_decrypt(struct ext4_crypto_ctx *ctx, struct page *page);
-int ext4_decrypt_one(struct inode *inode, struct page *page);
+int ext4_decrypt(struct page *page);
 int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex);
 
 #ifdef CONFIG_EXT4_FS_ENCRYPTION
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c95d406..ae52e32 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -965,7 +965,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
 	if (unlikely(err))
 		page_zero_new_buffers(page, from, to);
 	else if (decrypt)
-		err = ext4_decrypt_one(inode, page);
+		err = ext4_decrypt(page);
 	return err;
 }
 #endif
@@ -3404,7 +3404,7 @@ static int __ext4_block_zero_page_range(handle_t *handle,
 			/* We expect the key to be set. */
 			BUG_ON(!ext4_has_encryption_key(inode));
 			BUG_ON(blocksize != PAGE_CACHE_SIZE);
-			WARN_ON_ONCE(ext4_decrypt_one(inode, page));
+			WARN_ON_ONCE(ext4_decrypt(page));
 		}
 	}
 	if (ext4_should_journal_data(inode)) {
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index e26803f..a91e4bf 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -62,7 +62,7 @@ static void completion_pages(struct work_struct *work)
 	bio_for_each_segment_all(bv, bio, i) {
 		struct page *page = bv->bv_page;
 
-		int ret = ext4_decrypt(ctx, page);
+		int ret = ext4_decrypt(page);
 		if (ret) {
 			WARN_ON_ONCE(1);
 			SetPageError(page);
-- 
cgit v1.1


From 687c3c36e754a999a8263745b27965128db4fee5 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 3 Oct 2015 10:49:27 -0400
Subject: ext4 crypto: replace some BUG_ON()'s with error checks

Buggy (or hostile) userspace should not be able to cause the kernel to
crash.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@vger.kernel.org
---
 fs/ext4/crypto.c        |  1 -
 fs/ext4/crypto_fname.c  |  2 --
 fs/ext4/crypto_key.c    | 16 +++++++++++++---
 fs/ext4/crypto_policy.c |  3 ++-
 4 files changed, 15 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
index 3a5a7a2..879cb15 100644
--- a/fs/ext4/crypto.c
+++ b/fs/ext4/crypto.c
@@ -295,7 +295,6 @@ static int ext4_page_crypto(struct inode *inode,
 	else
 		res = crypto_ablkcipher_encrypt(req);
 	if (res == -EINPROGRESS || res == -EBUSY) {
-		BUG_ON(req->base.data != &ecr);
 		wait_for_completion(&ecr.completion);
 		res = ecr.res;
 	}
diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c
index 847f919..2fbef8a 100644
--- a/fs/ext4/crypto_fname.c
+++ b/fs/ext4/crypto_fname.c
@@ -120,7 +120,6 @@ static int ext4_fname_encrypt(struct inode *inode,
 	ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv);
 	res = crypto_ablkcipher_encrypt(req);
 	if (res == -EINPROGRESS || res == -EBUSY) {
-		BUG_ON(req->base.data != &ecr);
 		wait_for_completion(&ecr.completion);
 		res = ecr.res;
 	}
@@ -182,7 +181,6 @@ static int ext4_fname_decrypt(struct inode *inode,
 	ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv);
 	res = crypto_ablkcipher_decrypt(req);
 	if (res == -EINPROGRESS || res == -EBUSY) {
-		BUG_ON(req->base.data != &ecr);
 		wait_for_completion(&ecr.completion);
 		res = ecr.res;
 	}
diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c
index 1d510c1..f9270ec 100644
--- a/fs/ext4/crypto_key.c
+++ b/fs/ext4/crypto_key.c
@@ -71,7 +71,6 @@ static int ext4_derive_key_aes(char deriving_key[EXT4_AES_128_ECB_KEY_SIZE],
 				     EXT4_AES_256_XTS_KEY_SIZE, NULL);
 	res = crypto_ablkcipher_encrypt(req);
 	if (res == -EINPROGRESS || res == -EBUSY) {
-		BUG_ON(req->base.data != &ecr);
 		wait_for_completion(&ecr.completion);
 		res = ecr.res;
 	}
@@ -208,7 +207,12 @@ retry:
 		goto out;
 	}
 	crypt_info->ci_keyring_key = keyring_key;
-	BUG_ON(keyring_key->type != &key_type_logon);
+	if (keyring_key->type != &key_type_logon) {
+		printk_once(KERN_WARNING
+			    "ext4: key type must be logon\n");
+		res = -ENOKEY;
+		goto out;
+	}
 	ukp = ((struct user_key_payload *)keyring_key->payload.data);
 	if (ukp->datalen != sizeof(struct ext4_encryption_key)) {
 		res = -EINVAL;
@@ -217,7 +221,13 @@ retry:
 	master_key = (struct ext4_encryption_key *)ukp->data;
 	BUILD_BUG_ON(EXT4_AES_128_ECB_KEY_SIZE !=
 		     EXT4_KEY_DERIVATION_NONCE_SIZE);
-	BUG_ON(master_key->size != EXT4_AES_256_XTS_KEY_SIZE);
+	if (master_key->size != EXT4_AES_256_XTS_KEY_SIZE) {
+		printk_once(KERN_WARNING
+			    "ext4: key size incorrect: %d\n",
+			    master_key->size);
+		res = -ENOKEY;
+		goto out;
+	}
 	res = ext4_derive_key_aes(ctx.nonce, master_key->raw,
 				  raw_key);
 	if (res)
diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c
index a640ec2..ad05069 100644
--- a/fs/ext4/crypto_policy.c
+++ b/fs/ext4/crypto_policy.c
@@ -150,7 +150,8 @@ int ext4_is_child_context_consistent_with_parent(struct inode *parent,
 
 	if ((parent == NULL) || (child == NULL)) {
 		pr_err("parent %p child %p\n", parent, child);
-		BUG_ON(1);
+		WARN_ON(1);	/* Should never happen */
+		return 0;
 	}
 	/* no restrictions if the parent directory is not encrypted */
 	if (!ext4_encrypted_inode(parent))
-- 
cgit v1.1


From 36086d43f6575c081067de9855786a2fc91df77b Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 3 Oct 2015 10:49:29 -0400
Subject: ext4 crypto: fix bugs in ext4_encrypted_zeroout()

Fix multiple bugs in ext4_encrypted_zeroout(), including one that
could cause us to write an encrypted zero page to the wrong location
on disk, potentially causing data and file system corruption.
Fortunately, this tends to only show up in stress tests, but even with
these fixes, we are seeing some test failures with generic/127 --- but
these are now caused by data failures instead of metadata corruption.

Since ext4_encrypted_zeroout() is only used for some optimizations to
keep the extent tree from being too fragmented, and
ext4_encrypted_zeroout() itself isn't all that optimized from a time
or IOPS perspective, disable the extent tree optimization for
encrypted inodes for now.  This prevents the data corruption issues
reported by generic/127 until we can figure out what's going wrong.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@vger.kernel.org
---
 fs/ext4/crypto.c  | 23 +++++++++++++++++++----
 fs/ext4/extents.c |  3 +++
 2 files changed, 22 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
index 879cb15..af06830 100644
--- a/fs/ext4/crypto.c
+++ b/fs/ext4/crypto.c
@@ -392,7 +392,13 @@ int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex)
 	ext4_lblk_t		lblk = ex->ee_block;
 	ext4_fsblk_t		pblk = ext4_ext_pblock(ex);
 	unsigned int		len = ext4_ext_get_actual_len(ex);
-	int			err = 0;
+	int			ret, err = 0;
+
+#if 0
+	ext4_msg(inode->i_sb, KERN_CRIT,
+		 "ext4_encrypted_zeroout ino %lu lblk %u len %u",
+		 (unsigned long) inode->i_ino, lblk, len);
+#endif
 
 	BUG_ON(inode->i_sb->s_blocksize != PAGE_CACHE_SIZE);
 
@@ -418,17 +424,26 @@ int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex)
 			goto errout;
 		}
 		bio->bi_bdev = inode->i_sb->s_bdev;
-		bio->bi_iter.bi_sector = pblk;
-		err = bio_add_page(bio, ciphertext_page,
+		bio->bi_iter.bi_sector =
+			pblk << (inode->i_sb->s_blocksize_bits - 9);
+		ret = bio_add_page(bio, ciphertext_page,
 				   inode->i_sb->s_blocksize, 0);
-		if (err) {
+		if (ret != inode->i_sb->s_blocksize) {
+			/* should never happen! */
+			ext4_msg(inode->i_sb, KERN_ERR,
+				 "bio_add_page failed: %d", ret);
+			WARN_ON(1);
 			bio_put(bio);
+			err = -EIO;
 			goto errout;
 		}
 		err = submit_bio_wait(WRITE, bio);
+		if ((err == 0) && bio->bi_error)
+			err = -EIO;
 		bio_put(bio);
 		if (err)
 			goto errout;
+		lblk++; pblk++;
 	}
 	err = 0;
 errout:
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 2553aa8..7f486e3 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3558,6 +3558,9 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 		max_zeroout = sbi->s_extent_max_zeroout_kb >>
 			(inode->i_sb->s_blocksize_bits - 10);
 
+	if (ext4_encrypted_inode(inode))
+		max_zeroout = 0;
+
 	/* If extent is less than s_max_zeroout_kb, zeroout directly */
 	if (max_zeroout && (ee_len <= max_zeroout)) {
 		err = ext4_ext_zeroout(inode, ex);
-- 
cgit v1.1


From bbc8a0044fb27da5fadb7efbfb472aed00c58e72 Mon Sep 17 00:00:00 2001
From: Andrzej Hajda <a.hajda@samsung.com>
Date: Fri, 7 Aug 2015 09:59:31 +0200
Subject: UBIFS: use kmemdup rather than duplicating its implementation

The patch was generated using fixed coccinelle semantic patch
scripts/coccinelle/api/memdup.cocci [1].

[1]: http://permalink.gmane.org/gmane.linux.kernel/2014320

Signed-off-by: Andrzej Hajda <a.hajda@samsung.com>
Reviewed-by: Richard Weinberger <richard@nod.at>
Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
---
 fs/ubifs/lpt.c | 6 ++----
 fs/ubifs/tnc.c | 3 +--
 2 files changed, 3 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index dc9f27e..9a51710 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -1498,11 +1498,10 @@ static struct ubifs_nnode *dirty_cow_nnode(struct ubifs_info *c,
 	}
 
 	/* nnode is being committed, so copy it */
-	n = kmalloc(sizeof(struct ubifs_nnode), GFP_NOFS);
+	n = kmemdup(nnode, sizeof(struct ubifs_nnode), GFP_NOFS);
 	if (unlikely(!n))
 		return ERR_PTR(-ENOMEM);
 
-	memcpy(n, nnode, sizeof(struct ubifs_nnode));
 	n->cnext = NULL;
 	__set_bit(DIRTY_CNODE, &n->flags);
 	__clear_bit(COW_CNODE, &n->flags);
@@ -1549,11 +1548,10 @@ static struct ubifs_pnode *dirty_cow_pnode(struct ubifs_info *c,
 	}
 
 	/* pnode is being committed, so copy it */
-	p = kmalloc(sizeof(struct ubifs_pnode), GFP_NOFS);
+	p = kmemdup(pnode, sizeof(struct ubifs_pnode), GFP_NOFS);
 	if (unlikely(!p))
 		return ERR_PTR(-ENOMEM);
 
-	memcpy(p, pnode, sizeof(struct ubifs_pnode));
 	p->cnext = NULL;
 	__set_bit(DIRTY_CNODE, &p->flags);
 	__clear_bit(COW_CNODE, &p->flags);
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 957f575..fa9a20c 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -198,11 +198,10 @@ static struct ubifs_znode *copy_znode(struct ubifs_info *c,
 {
 	struct ubifs_znode *zn;
 
-	zn = kmalloc(c->max_znode_sz, GFP_NOFS);
+	zn = kmemdup(znode, c->max_znode_sz, GFP_NOFS);
 	if (unlikely(!zn))
 		return ERR_PTR(-ENOMEM);
 
-	memcpy(zn, znode, c->max_znode_sz);
 	zn->cnext = NULL;
 	__set_bit(DIRTY_ZNODE, &zn->flags);
 	__clear_bit(COW_ZNODE, &zn->flags);
-- 
cgit v1.1


From 7d25b361a2d3e8a9dcaf463d92285d58dedc74ab Mon Sep 17 00:00:00 2001
From: Dongsheng Yang <yangds.fnst@cn.fujitsu.com>
Date: Thu, 30 Jul 2015 13:48:09 +0800
Subject: UBIFS: fix a typo in comment of ubifs_budget_req

s/now/how

Signed-off-by: Dongsheng Yang <yangds.fnst@cn.fujitsu.com>
Reviewed-by: Richard Weinberger <richard@nod.at>
Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
---
 fs/ubifs/ubifs.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index de75902..6ef2415 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -858,9 +858,9 @@ struct ubifs_compressor {
  * @mod_dent: non-zero if the operation removes or modifies an existing
  *            directory entry
  * @new_ino: non-zero if the operation adds a new inode
- * @new_ino_d: now much data newly created inode contains
+ * @new_ino_d: how much data newly created inode contains
  * @dirtied_ino: how many inodes the operation makes dirty
- * @dirtied_ino_d: now much data dirtied inode contains
+ * @dirtied_ino_d: how much data dirtied inode contains
  * @idx_growth: how much the index will supposedly grow
  * @data_growth: how much new data the operation will supposedly add
  * @dd_growth: how much data that makes other data dirty the operation will
-- 
cgit v1.1


From 8f6983abe31f37d953197c3ede14b1d1321affee Mon Sep 17 00:00:00 2001
From: shengyong <shengyong1@huawei.com>
Date: Wed, 23 Sep 2015 09:11:39 +0000
Subject: UBIFS: call dbg_is_power_cut() instead of reading c->dbg->pc_happened

Call dbg_is_power_cut() to emulate power cut instead of reading
c->dbg->pc_happened. Otherwise, the function becomes dead code.

Signed-off-by: Sheng Yong <shengyong1@huawei.com>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 fs/ubifs/debug.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 4c46a98..595ca0d 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2573,7 +2573,7 @@ int dbg_leb_write(struct ubifs_info *c, int lnum, const void *buf,
 {
 	int err, failing;
 
-	if (c->dbg->pc_happened)
+	if (dbg_is_power_cut(c))
 		return -EROFS;
 
 	failing = power_cut_emulated(c, lnum, 1);
@@ -2595,7 +2595,7 @@ int dbg_leb_change(struct ubifs_info *c, int lnum, const void *buf,
 {
 	int err;
 
-	if (c->dbg->pc_happened)
+	if (dbg_is_power_cut(c))
 		return -EROFS;
 	if (power_cut_emulated(c, lnum, 1))
 		return -EROFS;
@@ -2611,7 +2611,7 @@ int dbg_leb_unmap(struct ubifs_info *c, int lnum)
 {
 	int err;
 
-	if (c->dbg->pc_happened)
+	if (dbg_is_power_cut(c))
 		return -EROFS;
 	if (power_cut_emulated(c, lnum, 0))
 		return -EROFS;
@@ -2627,7 +2627,7 @@ int dbg_leb_map(struct ubifs_info *c, int lnum)
 {
 	int err;
 
-	if (c->dbg->pc_happened)
+	if (dbg_is_power_cut(c))
 		return -EROFS;
 	if (power_cut_emulated(c, lnum, 0))
 		return -EROFS;
-- 
cgit v1.1


From be186cc4de35fcfe766d2938c13ddf3935d6c159 Mon Sep 17 00:00:00 2001
From: shengyong <shengyong1@huawei.com>
Date: Wed, 23 Sep 2015 09:11:40 +0000
Subject: UBIFS: print verbose message when rescanning a corrupted node

This is a trivial fix of showing verbose message when leb-recovery detects
a corrupted node, which is not the last one in the LEB. Rescan expects to
show more detail of the corrupted node.

Reviewed-by: Dongsheng Yang <yangds.fnst@cn.fujitsu.com>
Signed-off-by: Sheng Yong <shengyong1@huawei.com>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 fs/ubifs/recovery.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 695fc71..2a1f0bc 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -789,7 +789,7 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
 corrupted_rescan:
 	/* Re-scan the corrupted data with verbose messages */
 	ubifs_err(c, "corruption %d", ret);
-	ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
+	ubifs_scan_a_node(c, buf, len, lnum, offs, 0);
 corrupted:
 	ubifs_scanned_corruption(c, lnum, offs, buf);
 	err = -EUCLEAN;
-- 
cgit v1.1


From 616a5399b8f11a526cedcbaeb68bc0a1b08a3f43 Mon Sep 17 00:00:00 2001
From: Steve French <smfrench@gmail.com>
Date: Sat, 3 Oct 2015 16:54:17 -0500
Subject: [CIFS] Update cifs version number

Update modinfo cifs.ko version number to 2.08

Signed-off-by: Steve French <steve.french@primarydata.com>
---
 fs/cifs/cifsfs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 27aea11..c3cc160 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -136,5 +136,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 extern const struct export_operations cifs_export_ops;
 #endif /* CONFIG_CIFS_NFSD_EXPORT */
 
-#define CIFS_VERSION   "2.07"
+#define CIFS_VERSION   "2.08"
 #endif				/* _CIFSFS_H */
-- 
cgit v1.1


From 621a5f7ad9cd1ce7933f1d302067cbd58354173c Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Sat, 26 Sep 2015 15:04:07 -0700
Subject: debugfs: Pass bool pointer to debugfs_create_bool()

Its a bit odd that debugfs_create_bool() takes 'u32 *' as an argument,
when all it needs is a boolean pointer.

It would be better to update this API to make it accept 'bool *'
instead, as that will make it more consistent and often more convenient.
Over that bool takes just a byte.

That required updates to all user sites as well, in the same commit
updating the API. regmap core was also using
debugfs_{read|write}_file_bool(), directly and variable types were
updated for that to be bool as well.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Acked-by: Mark Brown <broonie@kernel.org>
Acked-by: Charles Keepax <ckeepax@opensource.wolfsonmicro.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/debugfs/file.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 6c55ade..b70c20f 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -439,7 +439,7 @@ ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf,
 			       size_t count, loff_t *ppos)
 {
 	char buf[3];
-	u32 *val = file->private_data;
+	bool *val = file->private_data;
 
 	if (*val)
 		buf[0] = 'Y';
@@ -457,7 +457,7 @@ ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf,
 	char buf[32];
 	size_t buf_size;
 	bool bv;
-	u32 *val = file->private_data;
+	bool *val = file->private_data;
 
 	buf_size = min(count, (sizeof(buf)-1));
 	if (copy_from_user(buf, user_buf, buf_size))
@@ -503,7 +503,7 @@ static const struct file_operations fops_bool = {
  * code.
  */
 struct dentry *debugfs_create_bool(const char *name, umode_t mode,
-				   struct dentry *parent, u32 *value)
+				   struct dentry *parent, bool *value)
 {
 	return debugfs_create_file(name, mode, parent, value, &fops_bool);
 }
-- 
cgit v1.1


From 398dc4ad52022c3d585908544b936bdf73640727 Mon Sep 17 00:00:00 2001
From: Ulf Magnusson <ulfalizer@gmail.com>
Date: Mon, 7 Sep 2015 19:03:15 +0200
Subject: debugfs: document that debugfs_remove*() accepts NULL and error
 values

According to commit a59d6293e537 ("debugfs: change parameter check in
debugfs_remove() functions"), this is meant to make cleanup easier for
callers. In that case it ought to be documented.

Signed-off-by: Ulf Magnusson <ulfalizer@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/debugfs/inode.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index c711be8..5d8f35f 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -533,7 +533,8 @@ static int __debugfs_remove(struct dentry *dentry, struct dentry *parent)
 /**
  * debugfs_remove - removes a file or directory from the debugfs filesystem
  * @dentry: a pointer to a the dentry of the file or directory to be
- *          removed.
+ *          removed.  If this parameter is NULL or an error value, nothing
+ *          will be done.
  *
  * This function removes a file or directory in debugfs that was previously
  * created with a call to another debugfs function (like
@@ -565,7 +566,8 @@ EXPORT_SYMBOL_GPL(debugfs_remove);
 
 /**
  * debugfs_remove_recursive - recursively removes a directory
- * @dentry: a pointer to a the dentry of the directory to be removed.
+ * @dentry: a pointer to a the dentry of the directory to be removed.  If this
+ *          parameter is NULL or an error value, nothing will be done.
  *
  * This function recursively removes a directory tree in debugfs that
  * was previously created with a call to another debugfs function
-- 
cgit v1.1


From 65da3484d9be5664f5f7d2378e438bb2794f40b8 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Thu, 6 Aug 2015 08:27:55 +1000
Subject: sysfs: correctly handle short reads on PREALLOC attrs.

attributes declared with __ATTR_PREALLOC use sysfs_kf_read()
which ignores the 'count' arg.
So a 1-byte read request can return more bytes than that.

This is seen with the 'dash' shell when 'read' is used on
some 'md' sysfs attributes.

So only return the 'min' of count and the attribute length.

Signed-off-by: NeilBrown <neilb@suse.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/sysfs/file.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 6c95628..f35523d 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -108,6 +108,7 @@ static ssize_t sysfs_kf_read(struct kernfs_open_file *of, char *buf,
 {
 	const struct sysfs_ops *ops = sysfs_file_ops(of->kn);
 	struct kobject *kobj = of->kn->parent->priv;
+	size_t len;
 
 	/*
 	 * If buf != of->prealloc_buf, we don't know how
@@ -115,7 +116,8 @@ static ssize_t sysfs_kf_read(struct kernfs_open_file *of, char *buf,
 	 */
 	if (pos || WARN_ON_ONCE(buf != of->prealloc_buf))
 		return 0;
-	return ops->show(kobj, of->kn->priv, buf);
+	len = ops->show(kobj, of->kn->priv, buf);
+	return min(count, len);
 }
 
 /* kernfs write callback for regular sysfs files */
-- 
cgit v1.1


From eb042ec35956de2684de2a05a814cd15efe570ca Mon Sep 17 00:00:00 2001
From: Wei Fang <fangwei1@huawei.com>
Date: Tue, 29 Sep 2015 11:23:55 +0800
Subject: jffs2: fix a memleak in read_direntry()

Need to free the memory allocated for 'fd' if failed to read all
of the remainder name.

Signed-off-by: Wei Fang <fangwei1@huawei.com>
Signed-off-by: Brian Norris <computersforpeace@gmail.com>
---
 fs/jffs2/readinode.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index 28e0aab..bfebbf1 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -660,8 +660,12 @@ static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_r
 
 		err = jffs2_flash_read(c, (ref_offset(ref)) + read,
 				rd->nsize - already, &read, &fd->name[already]);
-		if (unlikely(read != rd->nsize - already) && likely(!err))
+		if (unlikely(read != rd->nsize - already) && likely(!err)) {
+			jffs2_free_full_dirent(fd);
+			JFFS2_ERROR("short read: wanted %d bytes, got %zd\n",
+				    rd->nsize - already, read);
 			return -EIO;
+		}
 
 		if (unlikely(err)) {
 			JFFS2_ERROR("read remainder of name: error %d\n", err);
-- 
cgit v1.1


From b786f16ac3c5d4f7a5fd136656b6a1301b29b73b Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Sat, 26 Sep 2015 15:30:23 +0100
Subject: Btrfs: send, fix corner case for reference overwrite detection

When the inode given to did_overwrite_ref() matches the current progress
and has a reference that collides with the reference of other inode that
has the same number as the current progress, we were always telling our
caller that the inode's reference was overwritten, which is incorrect
because the other inode might be a new inode (different generation number)
in which case we must return false from did_overwrite_ref() so that its
callers don't use an orphanized path for the inode (as it will never be
orphanized, instead it will be unlinked and the new inode created later).

The following test case for fstests reproduces the issue:

  seq=`basename $0`
  seqres=$RESULT_DIR/$seq
  echo "QA output created by $seq"

  tmp=/tmp/$$
  status=1	# failure is the default!
  trap "_cleanup; exit \$status" 0 1 2 3 15

  _cleanup()
  {
      rm -fr $send_files_dir
      rm -f $tmp.*
  }

  # get standard environment, filters and checks
  . ./common/rc
  . ./common/filter

  # real QA test starts here
  _supported_fs btrfs
  _supported_os Linux
  _require_scratch
  _need_to_be_root

  send_files_dir=$TEST_DIR/btrfs-test-$seq

  rm -f $seqres.full
  rm -fr $send_files_dir
  mkdir $send_files_dir

  _scratch_mkfs >>$seqres.full 2>&1
  _scratch_mount

  # Create our test file with a single extent of 64K.
  mkdir -p $SCRATCH_MNT/foo
  $XFS_IO_PROG -f -c "pwrite -S 0xaa 0 64K" $SCRATCH_MNT/foo/bar \
      | _filter_xfs_io

  _run_btrfs_util_prog subvolume snapshot -r $SCRATCH_MNT \
      $SCRATCH_MNT/mysnap1
  _run_btrfs_util_prog subvolume snapshot $SCRATCH_MNT \
      $SCRATCH_MNT/mysnap2

  echo "File digest before being replaced:"
  md5sum $SCRATCH_MNT/mysnap1/foo/bar | _filter_scratch

  # Remove the file and then create a new one in the same location with
  # the same name but with different content. This new file ends up
  # getting the same inode number as the previous one, because that inode
  # number was the highest inode number used by the snapshot's root and
  # therefore when attempting to find the a new inode number for the new
  # file, we end up reusing the same inode number. This happens because
  # currently btrfs uses the highest inode number summed by 1 for the
  # first inode created once a snapshot's root is loaded (done at
  # fs/btrfs/inode-map.c:btrfs_find_free_objectid in the linux kernel
  # tree).
  # Having these two different files in the snapshots with the same inode
  # number (but different generation numbers) caused the btrfs send code
  # to emit an incorrect path for the file when issuing an unlink
  # operation because it failed to realize they were different files.
  rm -f $SCRATCH_MNT/mysnap2/foo/bar
  $XFS_IO_PROG -f -c "pwrite -S 0xbb 0 96K" \
      $SCRATCH_MNT/mysnap2/foo/bar | _filter_xfs_io

  _run_btrfs_util_prog subvolume snapshot -r $SCRATCH_MNT/mysnap2 \
      $SCRATCH_MNT/mysnap2_ro

  _run_btrfs_util_prog send $SCRATCH_MNT/mysnap1 -f $send_files_dir/1.snap
  _run_btrfs_util_prog send -p $SCRATCH_MNT/mysnap1 \
      $SCRATCH_MNT/mysnap2_ro -f $send_files_dir/2.snap

  echo "File digest in the original filesystem after being replaced:"
  md5sum $SCRATCH_MNT/mysnap2_ro/foo/bar | _filter_scratch

  # Now recreate the filesystem by receiving both send streams and verify
  # we get the same file contents that the original filesystem had.
  _scratch_unmount
  _scratch_mkfs >>$seqres.full 2>&1
  _scratch_mount

  _run_btrfs_util_prog receive -vv $SCRATCH_MNT -f $send_files_dir/1.snap
  _run_btrfs_util_prog receive -vv $SCRATCH_MNT -f $send_files_dir/2.snap

  echo "File digest in the new filesystem:"
  # Must match the digest from the new file.
  md5sum $SCRATCH_MNT/mysnap2_ro/foo/bar | _filter_scratch

  status=0
  exit

Reported-by: Martin Raiber <martin@urbackup.org>
Fixes: 8b191a684968 ("Btrfs: incremental send, check if orphanized dir inode needs delayed rename")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
---
 fs/btrfs/send.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index aa72bfd..a739b82 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1920,10 +1920,12 @@ static int did_overwrite_ref(struct send_ctx *sctx,
 	/*
 	 * We know that it is or will be overwritten. Check this now.
 	 * The current inode being processed might have been the one that caused
-	 * inode 'ino' to be orphanized, therefore ow_inode can actually be the
-	 * same as sctx->send_progress.
+	 * inode 'ino' to be orphanized, therefore check if ow_inode matches
+	 * the current inode being processed.
 	 */
-	if (ow_inode <= sctx->send_progress)
+	if ((ow_inode < sctx->send_progress) ||
+	    (ino != sctx->cur_ino && ow_inode == sctx->cur_ino &&
+	     gen == sctx->cur_inode_gen))
 		ret = 1;
 	else
 		ret = 0;
-- 
cgit v1.1


From 808f80b46790f27e145c72112189d6a3be2bc884 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 28 Sep 2015 09:56:26 +0100
Subject: Btrfs: update fix for read corruption of compressed and shared
 extents

My previous fix in commit 005efedf2c7d ("Btrfs: fix read corruption of
compressed and shared extents") was effective only if the compressed
extents cover a file range with a length that is not a multiple of 16
pages. That's because the detection of when we reached a different range
of the file that shares the same compressed extent as the previously
processed range was done at extent_io.c:__do_contiguous_readpages(),
which covers subranges with a length up to 16 pages, because
extent_readpages() groups the pages in clusters no larger than 16 pages.
So fix this by tracking the start of the previously processed file
range's extent map at extent_readpages().

The following test case for fstests reproduces the issue:

  seq=`basename $0`
  seqres=$RESULT_DIR/$seq
  echo "QA output created by $seq"
  tmp=/tmp/$$
  status=1	# failure is the default!
  trap "_cleanup; exit \$status" 0 1 2 3 15

  _cleanup()
  {
      rm -f $tmp.*
  }

  # get standard environment, filters and checks
  . ./common/rc
  . ./common/filter

  # real QA test starts here
  _need_to_be_root
  _supported_fs btrfs
  _supported_os Linux
  _require_scratch
  _require_cloner

  rm -f $seqres.full

  test_clone_and_read_compressed_extent()
  {
      local mount_opts=$1

      _scratch_mkfs >>$seqres.full 2>&1
      _scratch_mount $mount_opts

      # Create our test file with a single extent of 64Kb that is going to
      # be compressed no matter which compression algo is used (zlib/lzo).
      $XFS_IO_PROG -f -c "pwrite -S 0xaa 0K 64K" \
          $SCRATCH_MNT/foo | _filter_xfs_io

      # Now clone the compressed extent into an adjacent file offset.
      $CLONER_PROG -s 0 -d $((64 * 1024)) -l $((64 * 1024)) \
          $SCRATCH_MNT/foo $SCRATCH_MNT/foo

      echo "File digest before unmount:"
      md5sum $SCRATCH_MNT/foo | _filter_scratch

      # Remount the fs or clear the page cache to trigger the bug in
      # btrfs. Because the extent has an uncompressed length that is a
      # multiple of 16 pages, all the pages belonging to the second range
      # of the file (64K to 128K), which points to the same extent as the
      # first range (0K to 64K), had their contents full of zeroes instead
      # of the byte 0xaa. This was a bug exclusively in the read path of
      # compressed extents, the correct data was stored on disk, btrfs
      # just failed to fill in the pages correctly.
      _scratch_remount

      echo "File digest after remount:"
      # Must match the digest we got before.
      md5sum $SCRATCH_MNT/foo | _filter_scratch
  }

  echo -e "\nTesting with zlib compression..."
  test_clone_and_read_compressed_extent "-o compress=zlib"

  _scratch_unmount

  echo -e "\nTesting with lzo compression..."
  test_clone_and_read_compressed_extent "-o compress=lzo"

  status=0
  exit

Cc: stable@vger.kernel.org
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Tested-by: Timofey Titovets <nefelim4ag@gmail.com>
---
 fs/btrfs/extent_io.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 11aa8f7..363726b 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3144,12 +3144,12 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
 					     get_extent_t *get_extent,
 					     struct extent_map **em_cached,
 					     struct bio **bio, int mirror_num,
-					     unsigned long *bio_flags, int rw)
+					     unsigned long *bio_flags, int rw,
+					     u64 *prev_em_start)
 {
 	struct inode *inode;
 	struct btrfs_ordered_extent *ordered;
 	int index;
-	u64 prev_em_start = (u64)-1;
 
 	inode = pages[0]->mapping->host;
 	while (1) {
@@ -3165,7 +3165,7 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
 
 	for (index = 0; index < nr_pages; index++) {
 		__do_readpage(tree, pages[index], get_extent, em_cached, bio,
-			      mirror_num, bio_flags, rw, &prev_em_start);
+			      mirror_num, bio_flags, rw, prev_em_start);
 		page_cache_release(pages[index]);
 	}
 }
@@ -3175,7 +3175,8 @@ static void __extent_readpages(struct extent_io_tree *tree,
 			       int nr_pages, get_extent_t *get_extent,
 			       struct extent_map **em_cached,
 			       struct bio **bio, int mirror_num,
-			       unsigned long *bio_flags, int rw)
+			       unsigned long *bio_flags, int rw,
+			       u64 *prev_em_start)
 {
 	u64 start = 0;
 	u64 end = 0;
@@ -3196,7 +3197,7 @@ static void __extent_readpages(struct extent_io_tree *tree,
 						  index - first_index, start,
 						  end, get_extent, em_cached,
 						  bio, mirror_num, bio_flags,
-						  rw);
+						  rw, prev_em_start);
 			start = page_start;
 			end = start + PAGE_CACHE_SIZE - 1;
 			first_index = index;
@@ -3207,7 +3208,8 @@ static void __extent_readpages(struct extent_io_tree *tree,
 		__do_contiguous_readpages(tree, &pages[first_index],
 					  index - first_index, start,
 					  end, get_extent, em_cached, bio,
-					  mirror_num, bio_flags, rw);
+					  mirror_num, bio_flags, rw,
+					  prev_em_start);
 }
 
 static int __extent_read_full_page(struct extent_io_tree *tree,
@@ -4218,6 +4220,7 @@ int extent_readpages(struct extent_io_tree *tree,
 	struct page *page;
 	struct extent_map *em_cached = NULL;
 	int nr = 0;
+	u64 prev_em_start = (u64)-1;
 
 	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
 		page = list_entry(pages->prev, struct page, lru);
@@ -4234,12 +4237,12 @@ int extent_readpages(struct extent_io_tree *tree,
 		if (nr < ARRAY_SIZE(pagepool))
 			continue;
 		__extent_readpages(tree, pagepool, nr, get_extent, &em_cached,
-				   &bio, 0, &bio_flags, READ);
+				   &bio, 0, &bio_flags, READ, &prev_em_start);
 		nr = 0;
 	}
 	if (nr)
 		__extent_readpages(tree, pagepool, nr, get_extent, &em_cached,
-				   &bio, 0, &bio_flags, READ);
+				   &bio, 0, &bio_flags, READ, &prev_em_start);
 
 	if (em_cached)
 		free_extent_map(em_cached);
-- 
cgit v1.1


From d9a0540a79f87456907f2ce031f058cf745c5bff Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Sat, 3 Oct 2015 13:13:13 +0100
Subject: Btrfs: fix deadlock when finalizing block group creation

Josef ran into a deadlock while a transaction handle was finalizing the
creation of its block groups, which produced the following trace:

  [260445.593112] fio             D ffff88022a9df468     0  8924   4518 0x00000084
  [260445.593119]  ffff88022a9df468 ffffffff81c134c0 ffff880429693c00 ffff88022a9df488
  [260445.593126]  ffff88022a9e0000 ffff8803490d7b00 ffff8803490d7b18 ffff88022a9df4b0
  [260445.593132]  ffff8803490d7af8 ffff88022a9df488 ffffffff8175a437 ffff8803490d7b00
  [260445.593137] Call Trace:
  [260445.593145]  [<ffffffff8175a437>] schedule+0x37/0x80
  [260445.593189]  [<ffffffffa0850f37>] btrfs_tree_lock+0xa7/0x1f0 [btrfs]
  [260445.593197]  [<ffffffff810db7c0>] ? prepare_to_wait_event+0xf0/0xf0
  [260445.593225]  [<ffffffffa07eac44>] btrfs_lock_root_node+0x34/0x50 [btrfs]
  [260445.593253]  [<ffffffffa07eff6b>] btrfs_search_slot+0x88b/0xa00 [btrfs]
  [260445.593295]  [<ffffffffa08389df>] ? free_extent_buffer+0x4f/0x90 [btrfs]
  [260445.593324]  [<ffffffffa07f1a06>] btrfs_insert_empty_items+0x66/0xc0 [btrfs]
  [260445.593351]  [<ffffffffa07ea94a>] ? btrfs_alloc_path+0x1a/0x20 [btrfs]
  [260445.593394]  [<ffffffffa08403b9>] btrfs_finish_chunk_alloc+0x1c9/0x570 [btrfs]
  [260445.593427]  [<ffffffffa08002ab>] btrfs_create_pending_block_groups+0x11b/0x200 [btrfs]
  [260445.593459]  [<ffffffffa0800964>] do_chunk_alloc+0x2a4/0x2e0 [btrfs]
  [260445.593491]  [<ffffffffa0803815>] find_free_extent+0xa55/0xd90 [btrfs]
  [260445.593524]  [<ffffffffa0803c22>] btrfs_reserve_extent+0xd2/0x220 [btrfs]
  [260445.593532]  [<ffffffff8119fe5d>] ? account_page_dirtied+0xdd/0x170
  [260445.593564]  [<ffffffffa0803e78>] btrfs_alloc_tree_block+0x108/0x4a0 [btrfs]
  [260445.593597]  [<ffffffffa080c9de>] ? btree_set_page_dirty+0xe/0x10 [btrfs]
  [260445.593626]  [<ffffffffa07eb5cd>] __btrfs_cow_block+0x12d/0x5b0 [btrfs]
  [260445.593654]  [<ffffffffa07ebbff>] btrfs_cow_block+0x11f/0x1c0 [btrfs]
  [260445.593682]  [<ffffffffa07ef8c7>] btrfs_search_slot+0x1e7/0xa00 [btrfs]
  [260445.593724]  [<ffffffffa08389df>] ? free_extent_buffer+0x4f/0x90 [btrfs]
  [260445.593752]  [<ffffffffa07f1a06>] btrfs_insert_empty_items+0x66/0xc0 [btrfs]
  [260445.593830]  [<ffffffffa07ea94a>] ? btrfs_alloc_path+0x1a/0x20 [btrfs]
  [260445.593905]  [<ffffffffa08403b9>] btrfs_finish_chunk_alloc+0x1c9/0x570 [btrfs]
  [260445.593946]  [<ffffffffa08002ab>] btrfs_create_pending_block_groups+0x11b/0x200 [btrfs]
  [260445.593990]  [<ffffffffa0815798>] btrfs_commit_transaction+0xa8/0xb40 [btrfs]
  [260445.594042]  [<ffffffffa085abcd>] ? btrfs_log_dentry_safe+0x6d/0x80 [btrfs]
  [260445.594089]  [<ffffffffa082bc84>] btrfs_sync_file+0x294/0x350 [btrfs]
  [260445.594115]  [<ffffffff8123e29b>] vfs_fsync_range+0x3b/0xa0
  [260445.594133]  [<ffffffff81023891>] ? syscall_trace_enter_phase1+0x131/0x180
  [260445.594149]  [<ffffffff8123e35d>] do_fsync+0x3d/0x70
  [260445.594169]  [<ffffffff81023bb8>] ? syscall_trace_leave+0xb8/0x110
  [260445.594187]  [<ffffffff8123e600>] SyS_fsync+0x10/0x20
  [260445.594204]  [<ffffffff8175de6e>] entry_SYSCALL_64_fastpath+0x12/0x71

This happened because the same transaction handle created a large number
of block groups and while finalizing their creation (inserting new items
and updating existing items in the chunk and device trees) a new metadata
extent had to be allocated and no free space was found in the current
metadata block groups, which made find_free_extent() attempt to allocate
a new block group via do_chunk_alloc(). However at do_chunk_alloc() we
ended up allocating a new system chunk too and exceeded the threshold
of 2Mb of reserved chunk bytes, which makes do_chunk_alloc() enter the
final part of block group creation again (at
btrfs_create_pending_block_groups()) and attempt to lock again the root
of the chunk tree when it's already write locked by the same task.

Similarly we can deadlock on extent tree nodes/leafs if while we are
running delayed references we end up creating a new metadata block group
in order to allocate a new node/leaf for the extent tree (as part of
a CoW operation or growing the tree), as btrfs_create_pending_block_groups
inserts items into the extent tree as well. In this case we get the
following trace:

  [14242.773581] fio             D ffff880428ca3418     0  3615   3100 0x00000084
  [14242.773588]  ffff880428ca3418 ffff88042d66b000 ffff88042a03c800 ffff880428ca3438
  [14242.773594]  ffff880428ca4000 ffff8803e4b20190 ffff8803e4b201a8 ffff880428ca3460
  [14242.773600]  ffff8803e4b20188 ffff880428ca3438 ffffffff8175a437 ffff8803e4b20190
  [14242.773606] Call Trace:
  [14242.773613]  [<ffffffff8175a437>] schedule+0x37/0x80
  [14242.773656]  [<ffffffffa057ff07>] btrfs_tree_lock+0xa7/0x1f0 [btrfs]
  [14242.773664]  [<ffffffff810db7c0>] ? prepare_to_wait_event+0xf0/0xf0
  [14242.773692]  [<ffffffffa0519c44>] btrfs_lock_root_node+0x34/0x50 [btrfs]
  [14242.773720]  [<ffffffffa051ef6b>] btrfs_search_slot+0x88b/0xa00 [btrfs]
  [14242.773750]  [<ffffffffa0520a06>] btrfs_insert_empty_items+0x66/0xc0 [btrfs]
  [14242.773758]  [<ffffffff811ef4a2>] ? kmem_cache_alloc+0x1d2/0x200
  [14242.773786]  [<ffffffffa0520ad1>] btrfs_insert_item+0x71/0xf0 [btrfs]
  [14242.773818]  [<ffffffffa052f292>] btrfs_create_pending_block_groups+0x102/0x200 [btrfs]
  [14242.773850]  [<ffffffffa052f96e>] do_chunk_alloc+0x2ae/0x2f0 [btrfs]
  [14242.773934]  [<ffffffffa0532825>] find_free_extent+0xa55/0xd90 [btrfs]
  [14242.773998]  [<ffffffffa0532c22>] btrfs_reserve_extent+0xc2/0x1d0 [btrfs]
  [14242.774041]  [<ffffffffa0532e38>] btrfs_alloc_tree_block+0x108/0x4a0 [btrfs]
  [14242.774078]  [<ffffffffa051a5cd>] __btrfs_cow_block+0x12d/0x5b0 [btrfs]
  [14242.774118]  [<ffffffffa051abff>] btrfs_cow_block+0x11f/0x1c0 [btrfs]
  [14242.774155]  [<ffffffffa051e8c7>] btrfs_search_slot+0x1e7/0xa00 [btrfs]
  [14242.774194]  [<ffffffffa0528021>] ? __btrfs_free_extent.isra.70+0x2e1/0xcb0 [btrfs]
  [14242.774235]  [<ffffffffa0520a06>] btrfs_insert_empty_items+0x66/0xc0 [btrfs]
  [14242.774274]  [<ffffffffa051994a>] ? btrfs_alloc_path+0x1a/0x20 [btrfs]
  [14242.774318]  [<ffffffffa052c433>] __btrfs_run_delayed_refs+0xbb3/0x1020 [btrfs]
  [14242.774358]  [<ffffffffa052f404>] btrfs_run_delayed_refs.part.78+0x74/0x280 [btrfs]
  [14242.774391]  [<ffffffffa052f627>] btrfs_run_delayed_refs+0x17/0x20 [btrfs]
  [14242.774432]  [<ffffffffa05be236>] commit_cowonly_roots+0x8d/0x2bd [btrfs]
  [14242.774474]  [<ffffffffa059d07f>] ? __btrfs_run_delayed_items+0x1cf/0x210 [btrfs]
  [14242.774516]  [<ffffffffa05adac3>] ? btrfs_qgroup_account_extents+0x83/0x130 [btrfs]
  [14242.774558]  [<ffffffffa0544c40>] btrfs_commit_transaction+0x590/0xb40 [btrfs]
  [14242.774599]  [<ffffffffa0589b9d>] ? btrfs_log_dentry_safe+0x6d/0x80 [btrfs]
  [14242.774642]  [<ffffffffa055ac54>] btrfs_sync_file+0x294/0x350 [btrfs]
  [14242.774650]  [<ffffffff8123e29b>] vfs_fsync_range+0x3b/0xa0
  [14242.774657]  [<ffffffff81023891>] ? syscall_trace_enter_phase1+0x131/0x180
  [14242.774663]  [<ffffffff8123e35d>] do_fsync+0x3d/0x70
  [14242.774669]  [<ffffffff81023bb8>] ? syscall_trace_leave+0xb8/0x110
  [14242.774675]  [<ffffffff8123e600>] SyS_fsync+0x10/0x20
  [14242.774681]  [<ffffffff8175de6e>] entry_SYSCALL_64_fastpath+0x12/0x71

Fix this by never recursing into the finalization phase of block group
creation and making sure we never trigger the finalization of block group
creation while running delayed references.

Reported-by: Josef Bacik <jbacik@fb.com>
Fixes: 00d80e342c0f ("Btrfs: fix quick exhaustion of the system array in the superblock")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
---
 fs/btrfs/extent-tree.c | 9 ++++++++-
 fs/btrfs/transaction.c | 1 +
 fs/btrfs/transaction.h | 1 +
 3 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9f96042..601d7d4 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2828,6 +2828,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 	struct btrfs_delayed_ref_head *head;
 	int ret;
 	int run_all = count == (unsigned long)-1;
+	bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
 
 	/* We'll clean this up in btrfs_cleanup_transaction */
 	if (trans->aborted)
@@ -2844,6 +2845,7 @@ again:
 #ifdef SCRAMBLE_DELAYED_REFS
 	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
 #endif
+	trans->can_flush_pending_bgs = false;
 	ret = __btrfs_run_delayed_refs(trans, root, count);
 	if (ret < 0) {
 		btrfs_abort_transaction(trans, root, ret);
@@ -2893,6 +2895,7 @@ again:
 	}
 out:
 	assert_qgroups_uptodate(trans);
+	trans->can_flush_pending_bgs = can_flush_pending_bgs;
 	return 0;
 }
 
@@ -4306,7 +4309,8 @@ out:
 	 * the block groups that were made dirty during the lifetime of the
 	 * transaction.
 	 */
-	if (trans->chunk_bytes_reserved >= (2 * 1024 * 1024ull)) {
+	if (trans->can_flush_pending_bgs &&
+	    trans->chunk_bytes_reserved >= (2 * 1024 * 1024ull)) {
 		btrfs_create_pending_block_groups(trans, trans->root);
 		btrfs_trans_release_chunk_metadata(trans);
 	}
@@ -9560,7 +9564,9 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
 	struct btrfs_block_group_item item;
 	struct btrfs_key key;
 	int ret = 0;
+	bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
 
+	trans->can_flush_pending_bgs = false;
 	list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
 		if (ret)
 			goto next;
@@ -9581,6 +9587,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
 next:
 		list_del_init(&block_group->bg_list);
 	}
+	trans->can_flush_pending_bgs = can_flush_pending_bgs;
 }
 
 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index a2d6f7b..376191c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -557,6 +557,7 @@ again:
 	h->delayed_ref_elem.seq = 0;
 	h->type = type;
 	h->allocating_chunk = false;
+	h->can_flush_pending_bgs = true;
 	h->reloc_reserved = false;
 	h->sync = false;
 	INIT_LIST_HEAD(&h->qgroup_ref_list);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 87964bf..a994bb0 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -118,6 +118,7 @@ struct btrfs_trans_handle {
 	short aborted;
 	short adding_csums;
 	bool allocating_chunk;
+	bool can_flush_pending_bgs;
 	bool reloc_reserved;
 	bool sync;
 	unsigned int type;
-- 
cgit v1.1


From e5fffbac4a49270e4976d71a0e054c0cf3ef4f8e Mon Sep 17 00:00:00 2001
From: chandan <chandan@linux.vnet.ibm.com>
Date: Mon, 5 Oct 2015 22:14:25 +0530
Subject: Btrfs: open_ctree: Fix possible memory leak

After reading one of chunk or tree root tree's root node from disk, if the
root node does not have EXTENT_BUFFER_UPTODATE flag set, we fail to release
the memory used by the root node. Fix this.

Signed-off-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
---
 fs/btrfs/disk-io.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index aa59871..807f685 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2842,6 +2842,8 @@ int open_ctree(struct super_block *sb,
 	    !extent_buffer_uptodate(chunk_root->node)) {
 		printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n",
 		       sb->s_id);
+		if (!IS_ERR(chunk_root->node))
+			free_extent_buffer(chunk_root->node);
 		chunk_root->node = NULL;
 		goto fail_tree_roots;
 	}
@@ -2880,6 +2882,8 @@ retry_root_backup:
 	    !extent_buffer_uptodate(tree_root->node)) {
 		printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n",
 		       sb->s_id);
+		if (!IS_ERR(tree_root->node))
+			free_extent_buffer(tree_root->node);
 		tree_root->node = NULL;
 		goto recovery_tree_root;
 	}
-- 
cgit v1.1


From 7d35199e15b82a4d1a20049153b03e6258ce79f7 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 8 May 2015 10:16:23 +1000
Subject: BTRFS: support NFSv2 export

The "fh_len" passed to ->fh_to_* is not guaranteed to be that same as
that returned by encode_fh - it may be larger.

With NFSv2, the filehandle is fixed length, so it may appear longer
than expected and be zero-padded.

So we must test that fh_len is at least some value, not exactly equal
to it.

Signed-off-by: NeilBrown <neilb@suse.de>
Acked-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/export.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 8d05220..2513a7f 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -112,11 +112,11 @@ static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
 	u32 generation;
 
 	if (fh_type == FILEID_BTRFS_WITH_PARENT) {
-		if (fh_len !=  BTRFS_FID_SIZE_CONNECTABLE)
+		if (fh_len <  BTRFS_FID_SIZE_CONNECTABLE)
 			return NULL;
 		root_objectid = fid->root_objectid;
 	} else if (fh_type == FILEID_BTRFS_WITH_PARENT_ROOT) {
-		if (fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT)
+		if (fh_len < BTRFS_FID_SIZE_CONNECTABLE_ROOT)
 			return NULL;
 		root_objectid = fid->parent_root_objectid;
 	} else
@@ -136,11 +136,11 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
 	u32 generation;
 
 	if ((fh_type != FILEID_BTRFS_WITH_PARENT ||
-	     fh_len != BTRFS_FID_SIZE_CONNECTABLE) &&
+	     fh_len < BTRFS_FID_SIZE_CONNECTABLE) &&
 	    (fh_type != FILEID_BTRFS_WITH_PARENT_ROOT ||
-	     fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) &&
+	     fh_len < BTRFS_FID_SIZE_CONNECTABLE_ROOT) &&
 	    (fh_type != FILEID_BTRFS_WITHOUT_PARENT ||
-	     fh_len != BTRFS_FID_SIZE_NON_CONNECTABLE))
+	     fh_len < BTRFS_FID_SIZE_NON_CONNECTABLE))
 		return NULL;
 
 	objectid = fid->objectid;
-- 
cgit v1.1


From 39d0d3bdf7bab3021a31e501172ac0f18947f9b3 Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@netapp.com>
Date: Mon, 5 Oct 2015 16:43:26 -0400
Subject: NFS: Fix a tracepoint NULL-pointer dereference

Running xfstest generic/013 with the tracepoint nfs:nfs4_open_file
enabled produces a NULL-pointer dereference when calculating fileid and
filehandle of the opened file.  Fix this by checking if state is NULL
before trying to use the inode pointer.

Reported-by: Olga Kornievskaia <aglo@umich.edu>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4trace.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 28df12e..671cf68 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -409,7 +409,7 @@ DECLARE_EVENT_CLASS(nfs4_open_event,
 			__entry->flags = flags;
 			__entry->fmode = (__force unsigned int)ctx->mode;
 			__entry->dev = ctx->dentry->d_sb->s_dev;
-			if (!IS_ERR(state))
+			if (!IS_ERR_OR_NULL(state))
 				inode = state->inode;
 			if (inode != NULL) {
 				__entry->fileid = NFS_FILEID(inode);
-- 
cgit v1.1


From 7f5028cf6190407b7a632b0f30b83187577824cc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Emilio=20L=C3=B3pez?= <emilio.lopez@collabora.co.uk>
Date: Mon, 21 Sep 2015 10:38:20 -0300
Subject: sysfs: Support is_visible() on binary attributes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

According to the sysfs header file:

    "The returned value will replace static permissions defined in
     struct attribute or struct bin_attribute."

but this isn't the case, as is_visible is only called on struct attribute
only. This patch introduces a new is_bin_visible() function to implement
the same functionality for binary attributes, and updates documentation
accordingly.

Note that to keep functionality and code similar to that of normal
attributes, the mode is now checked as well to ensure it contains only
read/write permissions or SYSFS_PREALLOC.

Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Emilio López <emilio.lopez@collabora.co.uk>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Olof Johansson <olof@lixom.net>
---
 fs/sysfs/group.c | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 39a0199..51b56e6 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -73,13 +73,26 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj,
 	}
 
 	if (grp->bin_attrs) {
-		for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) {
+		for (i = 0, bin_attr = grp->bin_attrs; *bin_attr; i++, bin_attr++) {
+			umode_t mode = (*bin_attr)->attr.mode;
+
 			if (update)
 				kernfs_remove_by_name(parent,
 						(*bin_attr)->attr.name);
+			if (grp->is_bin_visible) {
+				mode = grp->is_bin_visible(kobj, *bin_attr, i);
+				if (!mode)
+					continue;
+			}
+
+			WARN(mode & ~(SYSFS_PREALLOC | 0664),
+			     "Attribute %s: Invalid permissions 0%o\n",
+			     (*bin_attr)->attr.name, mode);
+
+			mode &= SYSFS_PREALLOC | 0664;
 			error = sysfs_add_file_mode_ns(parent,
 					&(*bin_attr)->attr, true,
-					(*bin_attr)->attr.mode, NULL);
+					mode, NULL);
 			if (error)
 				break;
 		}
-- 
cgit v1.1


From 08a84e25a80566cf3cb1903c2448123efedff132 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Thu, 8 Oct 2015 08:48:52 +0200
Subject: btrfs: introduce _in_rcu variants of message printing functions

Due to the missing variants there are messages that lack the information
printed by btrfs_info etc helpers.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 938efe3..88acdff 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -4039,14 +4039,43 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
 #define btrfs_info(fs_info, fmt, args...) \
 	btrfs_printk(fs_info, KERN_INFO fmt, ##args)
 
+/*
+ * Wrappers that use printk_in_rcu
+ */
+#define btrfs_emerg_in_rcu(fs_info, fmt, args...) \
+	btrfs_printk_in_rcu(fs_info, KERN_EMERG fmt, ##args)
+#define btrfs_alert_in_rcu(fs_info, fmt, args...) \
+	btrfs_printk_in_rcu(fs_info, KERN_ALERT fmt, ##args)
+#define btrfs_crit_in_rcu(fs_info, fmt, args...) \
+	btrfs_printk_in_rcu(fs_info, KERN_CRIT fmt, ##args)
+#define btrfs_err_in_rcu(fs_info, fmt, args...) \
+	btrfs_printk_in_rcu(fs_info, KERN_ERR fmt, ##args)
+#define btrfs_warn_in_rcu(fs_info, fmt, args...) \
+	btrfs_printk_in_rcu(fs_info, KERN_WARNING fmt, ##args)
+#define btrfs_notice_in_rcu(fs_info, fmt, args...) \
+	btrfs_printk_in_rcu(fs_info, KERN_NOTICE fmt, ##args)
+#define btrfs_info_in_rcu(fs_info, fmt, args...) \
+	btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args)
+
 #ifdef DEBUG
 #define btrfs_debug(fs_info, fmt, args...) \
 	btrfs_printk(fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
+	btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
 #else
 #define btrfs_debug(fs_info, fmt, args...) \
     no_printk(KERN_DEBUG fmt, ##args)
+#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
+	no_printk(KERN_DEBUG fmt, ##args)
 #endif
 
+#define btrfs_printk_in_rcu(fs_info, fmt, args...)	\
+do {							\
+	rcu_read_lock();				\
+	btrfs_printk(fs_info, fmt, ##args);		\
+	rcu_read_unlock();				\
+} while (0)
+
 #ifdef CONFIG_BTRFS_ASSERT
 
 __cold
-- 
cgit v1.1


From ecaeb14b912a3be55530aeec4e81c7243f5ceb5d Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Thu, 8 Oct 2015 09:01:03 +0200
Subject: btrfs: switch message printers to _in_rcu variants

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/check-integrity.c |  4 ++--
 fs/btrfs/dev-replace.c     | 16 ++++++++--------
 fs/btrfs/ioctl.c           |  2 +-
 fs/btrfs/scrub.c           | 14 +++++++-------
 fs/btrfs/volumes.c         | 18 +++++++++---------
 5 files changed, 27 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 541fbfa..3e20630 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -845,8 +845,8 @@ static int btrfsic_process_superblock_dev_mirror(
 		superblock_tmp->never_written = 0;
 		superblock_tmp->mirror_num = 1 + superblock_mirror_num;
 		if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
-			printk_in_rcu(KERN_INFO "New initial S-block (bdev %p, %s)"
-				     " @%llu (%s/%llu/%d)\n",
+			btrfs_info_in_rcu(device->dev_root->fs_info,
+				"new initial S-block (bdev %p, %s) @%llu (%s/%llu/%d)",
 				     superblock_bdev,
 				     rcu_str_deref(device->name), dev_bytenr,
 				     dev_state->name, dev_bytenr,
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index e54dd59..ede1c1a 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -379,8 +379,8 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
 	if (ret)
 		btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret);
 
-	printk_in_rcu(KERN_INFO
-		      "BTRFS: dev_replace from %s (devid %llu) to %s started\n",
+	btrfs_info_in_rcu(root->fs_info,
+		      "dev_replace from %s (devid %llu) to %s started",
 		      src_device->missing ? "<missing disk>" :
 		        rcu_str_deref(src_device->name),
 		      src_device->devid,
@@ -523,8 +523,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 								src_device,
 								tgt_device);
 	} else {
-		printk_in_rcu(KERN_ERR
-			      "BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
+		btrfs_err_in_rcu(root->fs_info,
+			      "btrfs_scrub_dev(%s, %llu, %s) failed %d",
 			      src_device->missing ? "<missing disk>" :
 			        rcu_str_deref(src_device->name),
 			      src_device->devid,
@@ -540,8 +540,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 		return scrub_ret;
 	}
 
-	printk_in_rcu(KERN_INFO
-		      "BTRFS: dev_replace from %s (devid %llu) to %s finished\n",
+	btrfs_info_in_rcu(root->fs_info,
+		      "dev_replace from %s (devid %llu) to %s finished",
 		      src_device->missing ? "<missing disk>" :
 		        rcu_str_deref(src_device->name),
 		      src_device->devid,
@@ -809,8 +809,8 @@ static int btrfs_dev_replace_kthread(void *data)
 		progress = status_args->status.progress_1000;
 		kfree(status_args);
 		progress = div_u64(progress, 10);
-		printk_in_rcu(KERN_INFO
-			"BTRFS: continuing dev_replace from %s (devid %llu) to %s @%u%%\n",
+		btrfs_info_in_rcu(fs_info,
+			"continuing dev_replace from %s (devid %llu) to %s @%u%%",
 			dev_replace->srcdev->missing ? "<missing disk>" :
 			rcu_str_deref(dev_replace->srcdev->name),
 			dev_replace->srcdev->devid,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 0adf542..8e9105a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1579,7 +1579,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
 	new_size = div_u64(new_size, root->sectorsize);
 	new_size *= root->sectorsize;
 
-	printk_in_rcu(KERN_INFO "BTRFS: new size for %s is %llu\n",
+	btrfs_info_in_rcu(root->fs_info, "new size for %s is %llu",
 		      rcu_str_deref(device->name), new_size);
 
 	if (new_size > old_size) {
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index a39f5d1..26cfbb0 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -580,9 +580,9 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
 	 * hold all of the paths here
 	 */
 	for (i = 0; i < ipath->fspath->elem_cnt; ++i)
-		printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
+		btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev "
 			"%s, sector %llu, root %llu, inode %llu, offset %llu, "
-			"length %llu, links %u (path: %s)\n", swarn->errstr,
+			"length %llu, links %u (path: %s)", swarn->errstr,
 			swarn->logical, rcu_str_deref(swarn->dev->name),
 			(unsigned long long)swarn->sector, root, inum, offset,
 			min(isize - offset, (u64)PAGE_SIZE), nlink,
@@ -592,9 +592,9 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
 	return 0;
 
 err:
-	printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
+	btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev "
 		"%s, sector %llu, root %llu, inode %llu, offset %llu: path "
-		"resolving failed with ret=%d\n", swarn->errstr,
+		"resolving failed with ret=%d", swarn->errstr,
 		swarn->logical, rcu_str_deref(swarn->dev->name),
 		(unsigned long long)swarn->sector, root, inum, offset, ret);
 
@@ -649,10 +649,10 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
 			ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
 						      item_size, &ref_root,
 						      &ref_level);
-			printk_in_rcu(KERN_WARNING
-				"BTRFS: %s at logical %llu on dev %s, "
+			btrfs_warn_in_rcu(fs_info,
+				"%s at logical %llu on dev %s, "
 				"sector %llu: metadata %s (level %d) in tree "
-				"%llu\n", errstr, swarn.logical,
+				"%llu", errstr, swarn.logical,
 				rcu_str_deref(dev->name),
 				(unsigned long long)swarn.sector,
 				ref_level ? "node" : "leaf",
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 6fc73586..b0dae14 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6594,8 +6594,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
 	BUG_ON(!path);
 	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
 	if (ret < 0) {
-		printk_in_rcu(KERN_WARNING "BTRFS: "
-			"error %d while searching for dev_stats item for device %s!\n",
+		btrfs_warn_in_rcu(dev_root->fs_info,
+			"error %d while searching for dev_stats item for device %s",
 			      ret, rcu_str_deref(device->name));
 		goto out;
 	}
@@ -6605,8 +6605,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
 		/* need to delete old one and insert a new one */
 		ret = btrfs_del_item(trans, dev_root, path);
 		if (ret != 0) {
-			printk_in_rcu(KERN_WARNING "BTRFS: "
-				"delete too small dev_stats item for device %s failed %d!\n",
+			btrfs_warn_in_rcu(dev_root->fs_info,
+				"delete too small dev_stats item for device %s failed %d",
 				      rcu_str_deref(device->name), ret);
 			goto out;
 		}
@@ -6619,9 +6619,9 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
 		ret = btrfs_insert_empty_item(trans, dev_root, path,
 					      &key, sizeof(*ptr));
 		if (ret < 0) {
-			printk_in_rcu(KERN_WARNING "BTRFS: "
-					  "insert dev_stats item for device %s failed %d!\n",
-				      rcu_str_deref(device->name), ret);
+			btrfs_warn_in_rcu(dev_root->fs_info,
+				"insert dev_stats item for device %s failed %d",
+				rcu_str_deref(device->name), ret);
 			goto out;
 		}
 	}
@@ -6695,8 +6695,8 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
 	if (i == BTRFS_DEV_STAT_VALUES_MAX)
 		return; /* all values == 0, suppress message */
 
-	printk_in_rcu(KERN_INFO "BTRFS: "
-		   "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
+	btrfs_info_in_rcu(dev->dev_root->fs_info,
+		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
 	       rcu_str_deref(dev->name),
 	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
 	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
-- 
cgit v1.1


From 24aa6b41d4611b14ad0fb89fa0b899ae9ab7005c Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Thu, 8 Oct 2015 10:27:02 +0200
Subject: btrfs: introduce ratelimited _in_rcu variants of message printing
 functions

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 88acdff..9c0fd90 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -4057,16 +4057,38 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
 #define btrfs_info_in_rcu(fs_info, fmt, args...) \
 	btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args)
 
+/*
+ * Wrappers that use a ratelimited printk_in_rcu
+ */
+#define btrfs_emerg_rl_in_rcu(fs_info, fmt, args...) \
+	btrfs_printk_rl_in_rcu(fs_info, KERN_EMERG fmt, ##args)
+#define btrfs_alert_rl_in_rcu(fs_info, fmt, args...) \
+	btrfs_printk_rl_in_rcu(fs_info, KERN_ALERT fmt, ##args)
+#define btrfs_crit_rl_in_rcu(fs_info, fmt, args...) \
+	btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args)
+#define btrfs_err_rl_in_rcu(fs_info, fmt, args...) \
+	btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args)
+#define btrfs_warn_rl_in_rcu(fs_info, fmt, args...) \
+	btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args)
+#define btrfs_notice_rl_in_rcu(fs_info, fmt, args...) \
+	btrfs_printk_rl_in_rcu(fs_info, KERN_NOTICE fmt, ##args)
+#define btrfs_info_rl_in_rcu(fs_info, fmt, args...) \
+	btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args)
+
 #ifdef DEBUG
 #define btrfs_debug(fs_info, fmt, args...) \
 	btrfs_printk(fs_info, KERN_DEBUG fmt, ##args)
 #define btrfs_debug_in_rcu(fs_info, fmt, args...) \
 	btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
+	btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
 #else
 #define btrfs_debug(fs_info, fmt, args...) \
     no_printk(KERN_DEBUG fmt, ##args)
 #define btrfs_debug_in_rcu(fs_info, fmt, args...) \
 	no_printk(KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
+	no_printk(KERN_DEBUG fmt, ##args)
 #endif
 
 #define btrfs_printk_in_rcu(fs_info, fmt, args...)	\
@@ -4076,6 +4098,22 @@ do {							\
 	rcu_read_unlock();				\
 } while (0)
 
+#define btrfs_printk_ratelimited(fs_info, fmt, args...)		\
+do {								\
+	static DEFINE_RATELIMIT_STATE(_rs,			\
+		DEFAULT_RATELIMIT_INTERVAL,			\
+		DEFAULT_RATELIMIT_BURST);       		\
+	if (__ratelimit(&_rs))					\
+		btrfs_printk(fs_info, fmt, ##args);		\
+} while (0)
+
+#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...)		\
+do {								\
+	rcu_read_lock();					\
+	btrfs_printk_ratelimited(fs_info, fmt, ##args);		\
+	rcu_read_unlock();					\
+} while (0)
+
 #ifdef CONFIG_BTRFS_ASSERT
 
 __cold
-- 
cgit v1.1


From b14af3b46f214c5e927dacf58a994fa3c4cb20dc Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Thu, 8 Oct 2015 10:43:10 +0200
Subject: btrfs: switch message printers to ratelimited _in_rcu variants

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c   |  4 ++--
 fs/btrfs/extent_io.c |  4 ++--
 fs/btrfs/scrub.c     | 20 ++++++++++----------
 fs/btrfs/volumes.c   |  4 ++--
 4 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 295795a..ba41faf 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3175,8 +3175,8 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 		struct btrfs_device *device = (struct btrfs_device *)
 			bh->b_private;
 
-		printk_ratelimited_in_rcu(KERN_WARNING "BTRFS: lost page write due to "
-					  "I/O error on %s\n",
+		btrfs_warn_rl_in_rcu(device->dev_root->fs_info,
+				"lost page write due to IO error on %s",
 					  rcu_str_deref(device->name));
 		/* note, we dont' set_buffer_write_io_error because we have
 		 * our own ways of dealing with the IO errors
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index e2357e3..58b9ccd 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2078,8 +2078,8 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
 		return -EIO;
 	}
 
-	printk_ratelimited_in_rcu(KERN_INFO
-				  "BTRFS: read error corrected: ino %llu off %llu (dev %s sector %llu)\n",
+	btrfs_info_rl_in_rcu(fs_info,
+		"read error corrected: ino %llu off %llu (dev %s sector %llu)",
 				  btrfs_ino(inode), start,
 				  rcu_str_deref(dev->name), sector);
 	bio_put(bio);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 26cfbb0..bce86f8 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -850,8 +850,8 @@ out:
 		btrfs_dev_replace_stats_inc(
 			&sctx->dev_root->fs_info->dev_replace.
 			num_uncorrectable_read_errors);
-		printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
-		    "unable to fixup (nodatasum) error at logical %llu on dev %s\n",
+		btrfs_err_rl_in_rcu(sctx->dev_root->fs_info,
+		    "unable to fixup (nodatasum) error at logical %llu on dev %s",
 			fixup->logical, rcu_str_deref(fixup->dev->name));
 	}
 
@@ -1230,8 +1230,8 @@ corrected_error:
 			sctx->stat.corrected_errors++;
 			sblock_to_check->data_corrected = 1;
 			spin_unlock(&sctx->stat_lock);
-			printk_ratelimited_in_rcu(KERN_ERR
-				"BTRFS: fixed up error at logical %llu on dev %s\n",
+			btrfs_err_rl_in_rcu(fs_info,
+				"fixed up error at logical %llu on dev %s",
 				logical, rcu_str_deref(dev->name));
 		}
 	} else {
@@ -1239,8 +1239,8 @@ did_not_correct_error:
 		spin_lock(&sctx->stat_lock);
 		sctx->stat.uncorrectable_errors++;
 		spin_unlock(&sctx->stat_lock);
-		printk_ratelimited_in_rcu(KERN_ERR
-			"BTRFS: unable to fixup (regular) error at logical %llu on dev %s\n",
+		btrfs_err_rl_in_rcu(fs_info,
+			"unable to fixup (regular) error at logical %llu on dev %s",
 			logical, rcu_str_deref(dev->name));
 	}
 
@@ -2201,15 +2201,15 @@ static void scrub_missing_raid56_worker(struct btrfs_work *work)
 		spin_lock(&sctx->stat_lock);
 		sctx->stat.read_errors++;
 		spin_unlock(&sctx->stat_lock);
-		printk_ratelimited_in_rcu(KERN_ERR
-			"BTRFS: I/O error rebulding logical %llu for dev %s\n",
+		btrfs_err_rl_in_rcu(fs_info,
+			"IO error rebuilding logical %llu for dev %s",
 			logical, rcu_str_deref(dev->name));
 	} else if (sblock->header_error || sblock->checksum_error) {
 		spin_lock(&sctx->stat_lock);
 		sctx->stat.uncorrectable_errors++;
 		spin_unlock(&sctx->stat_lock);
-		printk_ratelimited_in_rcu(KERN_ERR
-			"BTRFS: failed to rebuild valid logical %llu for dev %s\n",
+		btrfs_err_rl_in_rcu(fs_info,
+			"failed to rebuild valid logical %llu for dev %s",
 			logical, rcu_str_deref(dev->name));
 	} else {
 		scrub_write_block_to_dev_replace(sblock);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b0dae14..4f80986 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6675,8 +6675,8 @@ static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
 {
 	if (!dev->dev_stats_valid)
 		return;
-	printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
-			   "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
+	btrfs_err_rl_in_rcu(dev->dev_root->fs_info,
+		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
 			   rcu_str_deref(dev->name),
 			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
 			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
-- 
cgit v1.1


From 1dd6d7ca9d062721ae37388900a322a4b959e939 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Thu, 8 Oct 2015 10:51:11 +0200
Subject: btrfs: introduce ratelimited variants of message printing functions

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9c0fd90..bca42c5 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -4075,6 +4075,23 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
 #define btrfs_info_rl_in_rcu(fs_info, fmt, args...) \
 	btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args)
 
+/*
+ * Wrappers that use a ratelimited printk
+ */
+#define btrfs_emerg_rl(fs_info, fmt, args...) \
+	btrfs_printk_ratelimited(fs_info, KERN_EMERG fmt, ##args)
+#define btrfs_alert_rl(fs_info, fmt, args...) \
+	btrfs_printk_ratelimited(fs_info, KERN_ALERT fmt, ##args)
+#define btrfs_crit_rl(fs_info, fmt, args...) \
+	btrfs_printk_ratelimited(fs_info, KERN_CRIT fmt, ##args)
+#define btrfs_err_rl(fs_info, fmt, args...) \
+	btrfs_printk_ratelimited(fs_info, KERN_ERR fmt, ##args)
+#define btrfs_warn_rl(fs_info, fmt, args...) \
+	btrfs_printk_ratelimited(fs_info, KERN_WARNING fmt, ##args)
+#define btrfs_notice_rl(fs_info, fmt, args...) \
+	btrfs_printk_ratelimited(fs_info, KERN_NOTICE fmt, ##args)
+#define btrfs_info_rl(fs_info, fmt, args...) \
+	btrfs_printk_ratelimited(fs_info, KERN_INFO fmt, ##args)
 #ifdef DEBUG
 #define btrfs_debug(fs_info, fmt, args...) \
 	btrfs_printk(fs_info, KERN_DEBUG fmt, ##args)
@@ -4082,6 +4099,8 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
 	btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
 #define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
 	btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl(fs_info, fmt, args...) \
+	btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, ##args)
 #else
 #define btrfs_debug(fs_info, fmt, args...) \
     no_printk(KERN_DEBUG fmt, ##args)
@@ -4089,6 +4108,8 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
 	no_printk(KERN_DEBUG fmt, ##args)
 #define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
 	no_printk(KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl(fs_info, fmt, args...) \
+	no_printk(KERN_DEBUG fmt, ##args)
 #endif
 
 #define btrfs_printk_in_rcu(fs_info, fmt, args...)	\
-- 
cgit v1.1


From 9464732266862f6044e4708dca6c4b2a83dd937b Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Thu, 8 Oct 2015 11:01:36 +0200
Subject: btrfs: switch message printers to ratelimited variants

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c          | 21 ++++++++++-----------
 fs/btrfs/extent-tree.c      | 14 ++++++++------
 fs/btrfs/extent_io.c        |  4 ++--
 fs/btrfs/free-space-cache.c | 10 +++++-----
 fs/btrfs/inode.c            |  7 ++-----
 fs/btrfs/scrub.c            |  8 ++++----
 6 files changed, 31 insertions(+), 33 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index ba41faf..72553cd 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -319,9 +319,9 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info,
 			memcpy(&found, result, csum_size);
 
 			read_extent_buffer(buf, &val, 0, csum_size);
-			printk_ratelimited(KERN_WARNING
-				"BTRFS: %s checksum verify failed on %llu wanted %X found %X "
-				"level %d\n",
+			btrfs_warn_rl(fs_info,
+				"%s checksum verify failed on %llu wanted %X found %X "
+				"level %d",
 				fs_info->sb->s_id, buf->start,
 				val, found, btrfs_header_level(buf));
 			if (result != (char *)&inline_result)
@@ -368,9 +368,9 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
 		ret = 0;
 		goto out;
 	}
-	printk_ratelimited(KERN_ERR
-	    "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n",
-			eb->fs_info->sb->s_id, eb->start,
+	btrfs_err_rl(eb->fs_info,
+		"parent transid verify failed on %llu wanted %llu found %llu",
+			eb->start,
 			parent_transid, btrfs_header_generation(eb));
 	ret = 1;
 
@@ -629,15 +629,14 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
 
 	found_start = btrfs_header_bytenr(eb);
 	if (found_start != eb->start) {
-		printk_ratelimited(KERN_ERR "BTRFS (device %s): bad tree block start "
-			       "%llu %llu\n",
-			       eb->fs_info->sb->s_id, found_start, eb->start);
+		btrfs_err_rl(eb->fs_info, "bad tree block start %llu %llu",
+			       found_start, eb->start);
 		ret = -EIO;
 		goto err;
 	}
 	if (check_tree_block_fsid(root->fs_info, eb)) {
-		printk_ratelimited(KERN_ERR "BTRFS (device %s): bad fsid on block %llu\n",
-			       eb->fs_info->sb->s_id, eb->start);
+		btrfs_err_rl(eb->fs_info, "bad fsid on block %llu",
+			       eb->start);
 		ret = -EIO;
 		goto err;
 	}
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9f96042..913a583 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -8271,10 +8271,11 @@ skip:
 			ret = account_shared_subtree(trans, root, next,
 						     generation, level - 1);
 			if (ret) {
-				printk_ratelimited(KERN_ERR "BTRFS: %s Error "
+				btrfs_err_rl(root->fs_info,
+					"Error "
 					"%d accounting shared subtree. Quota "
-					"is out of sync, rescan required.\n",
-					root->fs_info->sb->s_id, ret);
+					"is out of sync, rescan required.",
+					ret);
 			}
 		}
 		ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
@@ -8363,10 +8364,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
 			BUG_ON(ret); /* -ENOMEM */
 			ret = account_leaf_items(trans, root, eb);
 			if (ret) {
-				printk_ratelimited(KERN_ERR "BTRFS: %s Error "
+				btrfs_err_rl(root->fs_info,
+					"error "
 					"%d accounting leaf items. Quota "
-					"is out of sync, rescan required.\n",
-					root->fs_info->sb->s_id, ret);
+					"is out of sync, rescan required.",
+					ret);
 			}
 		}
 		/* make block locked assertion in clean_tree_block happy */
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 58b9ccd..bb318b4 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -96,8 +96,8 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller,
 	inode = tree->mapping->host;
 	isize = i_size_read(inode);
 	if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
-		printk_ratelimited(KERN_DEBUG
-		    "BTRFS: %s: ino %llu isize %llu odd range [%llu,%llu]\n",
+		btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
+		    "%s: ino %llu isize %llu odd range [%llu,%llu]",
 				caller, btrfs_ino(inode), isize, start, end);
 	}
 }
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index abe3a66..afc1072 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -450,9 +450,9 @@ static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
 
 	gen = io_ctl->cur;
 	if (le64_to_cpu(*gen) != generation) {
-		printk_ratelimited(KERN_ERR "BTRFS: space cache generation "
-				   "(%Lu) does not match inode (%Lu)\n", *gen,
-				   generation);
+		btrfs_err_rl(io_ctl->root->fs_info,
+			"space cache generation (%llu) does not match inode (%llu)",
+				*gen, generation);
 		io_ctl_unmap_page(io_ctl);
 		return -EIO;
 	}
@@ -506,8 +506,8 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
 			      PAGE_CACHE_SIZE - offset);
 	btrfs_csum_final(crc, (char *)&crc);
 	if (val != crc) {
-		printk_ratelimited(KERN_ERR "BTRFS: csum mismatch on free "
-				   "space cache\n");
+		btrfs_err_rl(io_ctl->root->fs_info,
+			"csum mismatch on free space cache");
 		io_ctl_unmap_page(io_ctl);
 		return -EIO;
 	}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 611b66d..25eb814 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3018,8 +3018,6 @@ static int __readpage_endio_check(struct inode *inode,
 	char *kaddr;
 	u32 csum_expected;
 	u32 csum = ~(u32)0;
-	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
-				      DEFAULT_RATELIMIT_BURST);
 
 	csum_expected = *(((u32 *)io_bio->csum) + icsum);
 
@@ -3032,9 +3030,8 @@ static int __readpage_endio_check(struct inode *inode,
 	kunmap_atomic(kaddr);
 	return 0;
 zeroit:
-	if (__ratelimit(&_rs))
-		btrfs_warn(BTRFS_I(inode)->root->fs_info,
-			   "csum failed ino %llu off %llu csum %u expected csum %u",
+	btrfs_warn_rl(BTRFS_I(inode)->root->fs_info,
+		"csum failed ino %llu off %llu csum %u expected csum %u",
 			   btrfs_ino(inode), start, csum, csum_expected);
 	memset(kaddr + pgoff, 1, len);
 	flush_dcache_page(page);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index bce86f8..550de89 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1626,9 +1626,9 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 		int ret;
 
 		if (!page_bad->dev->bdev) {
-			printk_ratelimited(KERN_WARNING "BTRFS: "
+			btrfs_warn_rl(sblock_bad->sctx->dev_root->fs_info,
 				"scrub_repair_page_from_good_copy(bdev == NULL) "
-				"is unexpected!\n");
+				"is unexpected");
 			return -EIO;
 		}
 
@@ -4375,8 +4375,8 @@ static int write_page_nocow(struct scrub_ctx *sctx,
 	if (!dev)
 		return -EIO;
 	if (!dev->bdev) {
-		printk_ratelimited(KERN_WARNING
-			"BTRFS: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
+		btrfs_warn_rl(dev->dev_root->fs_info,
+			"scrub write_page_nocow(bdev == NULL) is unexpected");
 		return -EIO;
 	}
 	bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
-- 
cgit v1.1


From f14d104dbdb5044dac9acd0e983ffb60f706c746 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Thu, 8 Oct 2015 11:37:06 +0200
Subject: btrfs: switch more printks to our helpers

Convert the simple cases, not all functions provide a way to reach the
fs_info. Also skipped debugging messages (print-tree, integrity
checker and pr_debug) and messages that are printed from possibly
unfinished mount.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c   | 12 ++++++------
 fs/btrfs/extent_io.c | 18 ++++++++++--------
 fs/btrfs/ioctl.c     |  6 +++---
 fs/btrfs/root-tree.c |  5 +++--
 fs/btrfs/send.c      |  2 +-
 fs/btrfs/volumes.c   |  3 ++-
 6 files changed, 25 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 72553cd..5bc94d2 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2347,8 +2347,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
 	u64 bytenr = btrfs_super_log_root(disk_super);
 
 	if (fs_devices->rw_devices == 0) {
-		printk(KERN_WARNING "BTRFS: log replay required "
-		       "on RO media\n");
+		btrfs_warn(fs_info, "log replay required on RO media");
 		return -EIO;
 	}
 
@@ -2363,12 +2362,12 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
 	log_tree_root->node = read_tree_block(tree_root, bytenr,
 			fs_info->generation + 1);
 	if (IS_ERR(log_tree_root->node)) {
-		printk(KERN_ERR "BTRFS: failed to read log tree\n");
+		btrfs_warn(fs_info, "failed to read log tree");
 		ret = PTR_ERR(log_tree_root->node);
 		kfree(log_tree_root);
 		return ret;
 	} else if (!extent_buffer_uptodate(log_tree_root->node)) {
-		printk(KERN_ERR "BTRFS: failed to read log tree\n");
+		btrfs_err(fs_info, "failed to read log tree");
 		free_extent_buffer(log_tree_root->node);
 		kfree(log_tree_root);
 		return -EIO;
@@ -3294,8 +3293,9 @@ static int write_dev_supers(struct btrfs_device *device,
 			bh = __getblk(device->bdev, bytenr / 4096,
 				      BTRFS_SUPER_INFO_SIZE);
 			if (!bh) {
-				printk(KERN_ERR "BTRFS: couldn't get super "
-				       "buffer head for bytenr %Lu\n", bytenr);
+				btrfs_err(device->dev_root->fs_info,
+				    "couldn't get super buffer head for bytenr %llu",
+				    bytenr);
 				errors++;
 				continue;
 			}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index bb318b4..c7a3391 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -5563,13 +5563,15 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
 	unsigned long src_i;
 
 	if (src_offset + len > dst->len) {
-		printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move "
-		       "len %lu dst len %lu\n", src_offset, len, dst->len);
+		btrfs_err(dst->fs_info,
+			"memmove bogus src_offset %lu move "
+		       "len %lu dst len %lu", src_offset, len, dst->len);
 		BUG_ON(1);
 	}
 	if (dst_offset + len > dst->len) {
-		printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move "
-		       "len %lu dst len %lu\n", dst_offset, len, dst->len);
+		btrfs_err(dst->fs_info,
+			"memmove bogus dst_offset %lu move "
+		       "len %lu dst len %lu", dst_offset, len, dst->len);
 		BUG_ON(1);
 	}
 
@@ -5609,13 +5611,13 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
 	unsigned long src_i;
 
 	if (src_offset + len > dst->len) {
-		printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move "
-		       "len %lu len %lu\n", src_offset, len, dst->len);
+		btrfs_err(dst->fs_info, "memmove bogus src_offset %lu move "
+		       "len %lu len %lu", src_offset, len, dst->len);
 		BUG_ON(1);
 	}
 	if (dst_offset + len > dst->len) {
-		printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move "
-		       "len %lu len %lu\n", dst_offset, len, dst->len);
+		btrfs_err(dst->fs_info, "memmove bogus dst_offset %lu move "
+		       "len %lu len %lu", dst_offset, len, dst->len);
 		BUG_ON(1);
 	}
 	if (dst_offset < src_offset) {
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8e9105a..2e520c6 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1342,7 +1342,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 			break;
 
 		if (btrfs_defrag_cancelled(root->fs_info)) {
-			printk(KERN_DEBUG "BTRFS: defrag_file cancelled\n");
+			btrfs_debug(root->fs_info, "defrag_file cancelled");
 			ret = -EAGAIN;
 			break;
 		}
@@ -2081,7 +2081,7 @@ static noinline int search_ioctl(struct inode *inode,
 		key.offset = (u64)-1;
 		root = btrfs_read_fs_root_no_name(info, &key);
 		if (IS_ERR(root)) {
-			printk(KERN_ERR "BTRFS: could not find root %llu\n",
+			btrfs_err(info, "could not find root %llu",
 			       sk->tree_id);
 			btrfs_free_path(path);
 			return -ENOENT;
@@ -2221,7 +2221,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
 	key.offset = (u64)-1;
 	root = btrfs_read_fs_root_no_name(info, &key);
 	if (IS_ERR(root)) {
-		printk(KERN_ERR "BTRFS: could not find root %llu\n", tree_id);
+		btrfs_err(info, "could not find root %llu", tree_id);
 		ret = -ENOENT;
 		goto out;
 	}
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 360a728..42267d3 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -45,12 +45,13 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
 	if (!need_reset && btrfs_root_generation(item)
 		!= btrfs_root_generation_v2(item)) {
 		if (btrfs_root_generation_v2(item) != 0) {
-			printk(KERN_WARNING "BTRFS: mismatching "
+			btrfs_warn(eb->fs_info,
+					"mismatching "
 					"generation and generation_v2 "
 					"found in root item. This root "
 					"was probably mounted with an "
 					"older kernel. Resetting all "
-					"new fields.\n");
+					"new fields.");
 		}
 		need_reset = 1;
 	}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index aa72bfd..4ea07c8 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -2562,7 +2562,7 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
 	} else if (S_ISSOCK(mode)) {
 		cmd = BTRFS_SEND_C_MKSOCK;
 	} else {
-		printk(KERN_WARNING "btrfs: unexpected inode type %o",
+		btrfs_warn(sctx->send_root->fs_info, "unexpected inode type %o",
 				(int)(mode & S_IFMT));
 		ret = -ENOTSUPP;
 		goto out;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 4f80986..7ad310c 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2352,7 +2352,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 						root->fs_info->fsid);
 		if (kobject_rename(&root->fs_info->fs_devices->super_kobj,
 								fsid_buf))
-			pr_warn("BTRFS: sysfs: failed to create fsid for sprout\n");
+			btrfs_warn(root->fs_info,
+				"sysfs: failed to create fsid for sprout");
 	}
 
 	root->fs_info->num_tolerated_disk_barrier_failures =
-- 
cgit v1.1


From b3c2aa07454cf7ab4ec3ee882c586abbea033132 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Sun, 20 Sep 2015 14:32:45 -0400
Subject: NFSv4: Refactor NFSv4 error handling

Prepare for unification of the synchronous and asynchronous error
handling.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4_fs.h  |  6 ++++--
 fs/nfs/nfs4proc.c | 39 ++++++++++++++++++++++++++++++++-------
 2 files changed, 36 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 50cfc4c..4afdee4 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -183,10 +183,12 @@ struct nfs4_state {
 
 
 struct nfs4_exception {
-	long timeout;
-	int retry;
 	struct nfs4_state *state;
 	struct inode *inode;
+	long timeout;
+	unsigned char delay : 1,
+		      recovering : 1,
+		      retry : 1;
 };
 
 struct nfs4_state_recovery_ops {
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 5133bb1..d5c2a46 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -344,13 +344,16 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
 /* This is the error handling routine for processes that are allowed
  * to sleep.
  */
-int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
+static int nfs4_do_handle_exception(struct nfs_server *server,
+		int errorcode, struct nfs4_exception *exception)
 {
 	struct nfs_client *clp = server->nfs_client;
 	struct nfs4_state *state = exception->state;
 	struct inode *inode = exception->inode;
 	int ret = errorcode;
 
+	exception->delay = 0;
+	exception->recovering = 0;
 	exception->retry = 0;
 	switch(errorcode) {
 		case 0:
@@ -411,9 +414,9 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_
 			}
 		case -NFS4ERR_GRACE:
 		case -NFS4ERR_DELAY:
-			ret = nfs4_delay(server->client, &exception->timeout);
-			if (ret != 0)
-				break;
+			exception->delay = 1;
+			return 0;
+
 		case -NFS4ERR_RETRY_UNCACHED_REP:
 		case -NFS4ERR_OLD_STATEID:
 			exception->retry = 1;
@@ -434,9 +437,31 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_
 	/* We failed to handle the error */
 	return nfs4_map_errors(ret);
 wait_on_recovery:
-	ret = nfs4_wait_clnt_recover(clp);
-	if (test_bit(NFS_MIG_FAILED, &server->mig_status))
-		return -EIO;
+	exception->recovering = 1;
+	return 0;
+}
+
+/* This is the error handling routine for processes that are allowed
+ * to sleep.
+ */
+int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
+{
+	struct nfs_client *clp = server->nfs_client;
+	int ret;
+
+	ret = nfs4_do_handle_exception(server, errorcode, exception);
+	if (exception->delay) {
+		ret = nfs4_delay(server->client, &exception->timeout);
+		goto out_retry;
+	}
+	if (exception->recovering) {
+		ret = nfs4_wait_clnt_recover(clp);
+		if (test_bit(NFS_MIG_FAILED, &server->mig_status))
+			return -EIO;
+		goto out_retry;
+	}
+	return ret;
+out_retry:
 	if (ret == 0)
 		exception->retry = 1;
 	return ret;
-- 
cgit v1.1


From 2598ed344586cf3612bfbd33041527c55543acfe Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Sun, 20 Sep 2015 16:10:18 -0400
Subject: NFSv4: Update the delay statistics counter for synchronous delays

Currently, we only do so for asynchronous delays.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d5c2a46..d044c7b 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -412,8 +412,9 @@ static int nfs4_do_handle_exception(struct nfs_server *server,
 				ret = -EBUSY;
 				break;
 			}
-		case -NFS4ERR_GRACE:
 		case -NFS4ERR_DELAY:
+			nfs_inc_server_stats(server, NFSIOS_DELAY);
+		case -NFS4ERR_GRACE:
 			exception->delay = 1;
 			return 0;
 
-- 
cgit v1.1


From 516285ebe0efadc40b914a0e61a913a390604810 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Sun, 20 Sep 2015 16:15:24 -0400
Subject: NFSv4: nfs4_async_handle_error should take a non-const nfs_server

For symmetry with the synchronous handler, and so that we can potentially
handle errors such as NFS4ERR_BADNAME.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d044c7b..ae5cde6 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -78,7 +78,7 @@ struct nfs4_opendata;
 static int _nfs4_proc_open(struct nfs4_opendata *data);
 static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
 static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
-static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *, long *);
+static int nfs4_async_handle_error(struct rpc_task *, struct nfs_server *, struct nfs4_state *, long *);
 static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr);
 static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label);
 static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label);
@@ -4982,7 +4982,7 @@ out:
 
 
 static int
-nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
+nfs4_async_handle_error(struct rpc_task *task, struct nfs_server *server,
 			struct nfs4_state *state, long *timeout)
 {
 	struct nfs_client *clp = server->nfs_client;
@@ -5559,7 +5559,7 @@ struct nfs4_unlockdata {
 	struct nfs4_lock_state *lsp;
 	struct nfs_open_context *ctx;
 	struct file_lock fl;
-	const struct nfs_server *server;
+	struct nfs_server *server;
 	unsigned long timestamp;
 };
 
-- 
cgit v1.1


From 4816fdadab9ff874ec1a4138dad43b636c7d220b Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Sun, 20 Sep 2015 14:58:42 -0400
Subject: NFSv4: Don't use synchronous delegation recall in exception handling

The code needs to be able to work from inside an asynchronous context.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/delegation.c | 6 ++----
 fs/nfs/nfs4proc.c   | 8 +++-----
 2 files changed, 5 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index be806ea..5166adc 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -721,14 +721,12 @@ int nfs_async_inode_return_delegation(struct inode *inode,
 	struct nfs_client *clp = server->nfs_client;
 	struct nfs_delegation *delegation;
 
-	filemap_flush(inode->i_mapping);
-
 	rcu_read_lock();
 	delegation = rcu_dereference(NFS_I(inode)->delegation);
 	if (delegation == NULL)
 		goto out_enoent;
-
-	if (!clp->cl_mvops->match_stateid(&delegation->stateid, stateid))
+	if (stateid != NULL &&
+	    !clp->cl_mvops->match_stateid(&delegation->stateid, stateid))
 		goto out_enoent;
 	nfs_mark_return_delegation(server, delegation);
 	rcu_read_unlock();
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index ae5cde6..2a155ec 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -362,11 +362,9 @@ static int nfs4_do_handle_exception(struct nfs_server *server,
 		case -NFS4ERR_DELEG_REVOKED:
 		case -NFS4ERR_ADMIN_REVOKED:
 		case -NFS4ERR_BAD_STATEID:
-			if (inode && nfs4_have_delegation(inode, FMODE_READ)) {
-				nfs4_inode_return_delegation(inode);
-				exception->retry = 1;
-				return 0;
-			}
+			if (inode && nfs_async_inode_return_delegation(inode,
+						NULL) == 0)
+				goto wait_on_recovery;
 			if (state == NULL)
 				break;
 			ret = nfs4_schedule_stateid_recovery(server, state);
-- 
cgit v1.1


From 037fc9808a777f6fb6a54c6510b9656716e0c8c8 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Sun, 20 Sep 2015 15:51:00 -0400
Subject: NFSv4: Unify synchronous and asynchronous error handling

They now only differ in the way we handle waiting, so let's unify.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c | 123 ++++++++++++++++++++++--------------------------------
 1 file changed, 49 insertions(+), 74 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 2a155ec..e32b7e9 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -78,7 +78,6 @@ struct nfs4_opendata;
 static int _nfs4_proc_open(struct nfs4_opendata *data);
 static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
 static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
-static int nfs4_async_handle_error(struct rpc_task *, struct nfs_server *, struct nfs4_state *, long *);
 static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr);
 static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label);
 static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label);
@@ -466,6 +465,55 @@ out_retry:
 	return ret;
 }
 
+static int
+nfs4_async_handle_exception(struct rpc_task *task, struct nfs_server *server,
+		int errorcode, struct nfs4_exception *exception)
+{
+	struct nfs_client *clp = server->nfs_client;
+	int ret;
+
+	ret = nfs4_do_handle_exception(server, errorcode, exception);
+	if (exception->delay) {
+		rpc_delay(task, nfs4_update_delay(&exception->timeout));
+		goto out_retry;
+	}
+	if (exception->recovering) {
+		rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
+		if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
+			rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
+		goto out_retry;
+	}
+	if (test_bit(NFS_MIG_FAILED, &server->mig_status))
+		ret = -EIO;
+	return ret;
+out_retry:
+	if (ret == 0)
+		exception->retry = 1;
+	return ret;
+}
+
+static int
+nfs4_async_handle_error(struct rpc_task *task, struct nfs_server *server,
+			struct nfs4_state *state, long *timeout)
+{
+	struct nfs4_exception exception = {
+		.state = state,
+	};
+
+	if (task->tk_status >= 0)
+		return 0;
+	if (timeout)
+		exception.timeout = *timeout;
+	task->tk_status = nfs4_async_handle_exception(task, server,
+			task->tk_status,
+			&exception);
+	if (exception.delay && timeout)
+		*timeout = exception.timeout;
+	if (exception.retry)
+		return -EAGAIN;
+	return 0;
+}
+
 /*
  * Return 'true' if 'clp' is using an rpc_client that is integrity protected
  * or 'false' otherwise.
@@ -4979,79 +5027,6 @@ out:
 #endif	/* CONFIG_NFS_V4_SECURITY_LABEL */
 
 
-static int
-nfs4_async_handle_error(struct rpc_task *task, struct nfs_server *server,
-			struct nfs4_state *state, long *timeout)
-{
-	struct nfs_client *clp = server->nfs_client;
-
-	if (task->tk_status >= 0)
-		return 0;
-	switch(task->tk_status) {
-		case -NFS4ERR_DELEG_REVOKED:
-		case -NFS4ERR_ADMIN_REVOKED:
-		case -NFS4ERR_BAD_STATEID:
-		case -NFS4ERR_OPENMODE:
-			if (state == NULL)
-				break;
-			if (nfs4_schedule_stateid_recovery(server, state) < 0)
-				goto recovery_failed;
-			goto wait_on_recovery;
-		case -NFS4ERR_EXPIRED:
-			if (state != NULL) {
-				if (nfs4_schedule_stateid_recovery(server, state) < 0)
-					goto recovery_failed;
-			}
-		case -NFS4ERR_STALE_STATEID:
-		case -NFS4ERR_STALE_CLIENTID:
-			nfs4_schedule_lease_recovery(clp);
-			goto wait_on_recovery;
-		case -NFS4ERR_MOVED:
-			if (nfs4_schedule_migration_recovery(server) < 0)
-				goto recovery_failed;
-			goto wait_on_recovery;
-		case -NFS4ERR_LEASE_MOVED:
-			nfs4_schedule_lease_moved_recovery(clp);
-			goto wait_on_recovery;
-#if defined(CONFIG_NFS_V4_1)
-		case -NFS4ERR_BADSESSION:
-		case -NFS4ERR_BADSLOT:
-		case -NFS4ERR_BAD_HIGH_SLOT:
-		case -NFS4ERR_DEADSESSION:
-		case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
-		case -NFS4ERR_SEQ_FALSE_RETRY:
-		case -NFS4ERR_SEQ_MISORDERED:
-			dprintk("%s ERROR %d, Reset session\n", __func__,
-				task->tk_status);
-			nfs4_schedule_session_recovery(clp->cl_session, task->tk_status);
-			goto wait_on_recovery;
-#endif /* CONFIG_NFS_V4_1 */
-		case -NFS4ERR_DELAY:
-			nfs_inc_server_stats(server, NFSIOS_DELAY);
-			rpc_delay(task, nfs4_update_delay(timeout));
-			goto restart_call;
-		case -NFS4ERR_GRACE:
-			rpc_delay(task, NFS4_POLL_RETRY_MAX);
-		case -NFS4ERR_RETRY_UNCACHED_REP:
-		case -NFS4ERR_OLD_STATEID:
-			goto restart_call;
-	}
-	task->tk_status = nfs4_map_errors(task->tk_status);
-	return 0;
-recovery_failed:
-	task->tk_status = -EIO;
-	return 0;
-wait_on_recovery:
-	rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
-	if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
-		rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
-	if (test_bit(NFS_MIG_FAILED, &server->mig_status))
-		goto recovery_failed;
-restart_call:
-	task->tk_status = 0;
-	return -EAGAIN;
-}
-
 static void nfs4_init_boot_verifier(const struct nfs_client *clp,
 				    nfs4_verifier *bootverf)
 {
-- 
cgit v1.1


From 8c3ad9cb7343dc5f61b8cf3cdbe1016c5e7c2c8b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 9 Oct 2015 15:03:26 +0200
Subject: nfsd/blocklayout: accept any minlength

Recent Linux clients have started to send GETLAYOUT requests with
minlength less than blocksize.

Servers aren't really allowed to impose this kind of restriction on
layouts; see RFC 5661 section 18.43.3 for details.

This has been observed to cause indefinite hangs on fsx runs on some
clients.

Cc: stable@vger.kernel.org
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/blocklayout.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index cdefaa3..c29d942 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -56,14 +56,6 @@ nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
 	u32 device_generation = 0;
 	int error;
 
-	/*
-	 * We do not attempt to support I/O smaller than the fs block size,
-	 * or not aligned to it.
-	 */
-	if (args->lg_minlength < block_size) {
-		dprintk("pnfsd: I/O too small\n");
-		goto out_layoutunavailable;
-	}
 	if (seg->offset & (block_size - 1)) {
 		dprintk("pnfsd: I/O misaligned\n");
 		goto out_layoutunavailable;
-- 
cgit v1.1


From 744288c7216bf5876f271c32516dfb66dedb0448 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Sun, 6 Sep 2015 17:50:13 +0800
Subject: f2fs: trace in batches extent info update

Rename trace_f2fs_update_extent_tree to trace_f2fs_update_extent_tree_range,
then expand and enable it to trace in batches extent info updates.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/extent_cache.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index 997ac86..1cd6c6c 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -409,6 +409,8 @@ unsigned int f2fs_update_extent_tree_range(struct inode *inode,
 	if (!et)
 		return false;
 
+	trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, len);
+
 	write_lock(&et->lock);
 
 	if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) {
-- 
cgit v1.1


From 538e17e7e96e14f76bb82dd83290a5315da70c3b Mon Sep 17 00:00:00 2001
From: Nicholas Krause <xerofoify@gmail.com>
Date: Sun, 6 Sep 2015 08:28:46 -0400
Subject: f2fs: fix incorrect return statement in the function
 f2fs_ioc_release_volatile_write

This fixes the incorrect return statement at the end of the function
f2fs_ioc_release_volatile_write's body for returning zero as this is
incorrect due to the function call before this return statement to
the function punch_hole being able to fail and we should return this
function's return fail directly in order to signal to callers of the
function f2fs_ioc_release_volatile if a failure arises with this call
to punch_hole fails.

Signed-off-by: Nicholas Krause <xerofoify@gmail.com>
Reviewed-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/file.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 8120f86..b2fab9e 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -1437,8 +1437,7 @@ static int f2fs_ioc_release_volatile_write(struct file *filp)
 	if (!f2fs_is_first_block_written(inode))
 		return truncate_partial_data_page(inode, 0, true);
 
-	punch_hole(inode, 0, F2FS_BLKSIZE);
-	return 0;
+	return punch_hole(inode, 0, F2FS_BLKSIZE);
 }
 
 static int f2fs_ioc_abort_volatile_write(struct file *filp)
-- 
cgit v1.1


From 25b93346a6b5542fd7de50555f8f0ddfc56d7443 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Sat, 12 Sep 2015 11:13:04 -0700
Subject: f2fs: cover number of dirty node pages under node_write lock

This number is referenced by checkpoint under node_write lock.

Reviewed-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/node.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 27d1a74..4d9bedf 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1323,23 +1323,24 @@ static int f2fs_write_node_page(struct page *page,
 	nid = nid_of_node(page);
 	f2fs_bug_on(sbi, page->index != nid);
 
+	if (wbc->for_reclaim) {
+		if (!down_read_trylock(&sbi->node_write))
+			goto redirty_out;
+	} else {
+		down_read(&sbi->node_write);
+	}
+
 	get_node_info(sbi, nid, &ni);
 
 	/* This page is already truncated */
 	if (unlikely(ni.blk_addr == NULL_ADDR)) {
 		ClearPageUptodate(page);
 		dec_page_count(sbi, F2FS_DIRTY_NODES);
+		up_read(&sbi->node_write);
 		unlock_page(page);
 		return 0;
 	}
 
-	if (wbc->for_reclaim) {
-		if (!down_read_trylock(&sbi->node_write))
-			goto redirty_out;
-	} else {
-		down_read(&sbi->node_write);
-	}
-
 	set_page_writeback(page);
 	fio.blk_addr = ni.blk_addr;
 	write_node_page(nid, &fio);
-- 
cgit v1.1


From c5cd29d21cebc2e1802a9eea4cd940ddf9825ce1 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Sat, 12 Sep 2015 11:25:30 -0700
Subject: f2fs: no need to lock for update_inode_page all the time

As comment says, we don't need to call f2fs_lock_op in write_inode to prevent
from producing dirty node pages all the time.
That happens only when there is not enough free sections and we can avoid that
by calling balance_fs in prior to that.

Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/inode.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 35aae65..97e20de 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -296,16 +296,12 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
 		return 0;
 
 	/*
-	 * We need to lock here to prevent from producing dirty node pages
+	 * We need to balance fs here to prevent from producing dirty node pages
 	 * during the urgent cleaning time when runing out of free sections.
 	 */
-	f2fs_lock_op(sbi);
 	update_inode_page(inode);
-	f2fs_unlock_op(sbi);
-
-	if (wbc)
-		f2fs_balance_fs(sbi);
 
+	f2fs_balance_fs(sbi);
 	return 0;
 }
 
-- 
cgit v1.1


From c998012b0bb93009d66349ea7e9747c968375e69 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Fri, 11 Sep 2015 14:39:02 +0800
Subject: f2fs: verify file type early in f2fs_fallocate

This patch changes to verify file type early in f2fs_fallocate for
cleanup, meanwhile this also fixes to add missing verification for
expand_inode_data.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/file.c | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index b2fab9e..a85bb14 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -765,9 +765,6 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
 	loff_t off_start, off_end;
 	int ret = 0;
 
-	if (!S_ISREG(inode->i_mode))
-		return -EOPNOTSUPP;
-
 	if (f2fs_has_inline_data(inode)) {
 		ret = f2fs_convert_inline_inode(inode);
 		if (ret)
@@ -908,9 +905,6 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
 	loff_t new_size;
 	int ret;
 
-	if (!S_ISREG(inode->i_mode))
-		return -EINVAL;
-
 	if (offset + len >= i_size_read(inode))
 		return -EINVAL;
 
@@ -959,9 +953,6 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
 	loff_t off_start, off_end;
 	int ret = 0;
 
-	if (!S_ISREG(inode->i_mode))
-		return -EINVAL;
-
 	ret = inode_newsize_ok(inode, (len + offset));
 	if (ret)
 		return ret;
@@ -1068,9 +1059,6 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
 	loff_t new_size;
 	int ret;
 
-	if (!S_ISREG(inode->i_mode))
-		return -EINVAL;
-
 	new_size = i_size_read(inode) + len;
 	if (new_size > inode->i_sb->s_maxbytes)
 		return -EFBIG;
@@ -1228,6 +1216,10 @@ static long f2fs_fallocate(struct file *file, int mode,
 	struct inode *inode = file_inode(file);
 	long ret = 0;
 
+	/* f2fs only support ->fallocate for regular file */
+	if (!S_ISREG(inode->i_mode))
+		return -EINVAL;
+
 	if (f2fs_encrypted_inode(inode) &&
 		(mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE)))
 		return -EOPNOTSUPP;
-- 
cgit v1.1


From 100136acfb4023ab7dc899192e95aca9aedfe98a Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Fri, 11 Sep 2015 14:43:02 +0800
Subject: f2fs: fix incorrect searching position when shrinking extent cache

When shrinking extent cache, we have two steps in the flow:
1) shrink objects which are unreferenced by inodes;
2) shrink objects from LRU list of extent cache.

In step 1, if we haven't shrunk enough number of objects, we will try
step 2, but before that we didn't update the searching position which
may point to last inode index in global extent tree, result in failing
to shrink objects by traversing the all inodes' extent tree.

In this patch, we reset searching position to beginning of global extent
tree for fixing.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/extent_cache.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index 1cd6c6c..a8b9aa2 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -652,6 +652,11 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
 	}
 	spin_unlock(&sbi->extent_lock);
 
+	/*
+	 * reset ino for searching victims from beginning of global extent tree.
+	 */
+	ino = F2FS_ROOT_INO(sbi);
+
 	while ((found = radix_tree_gang_lookup(root,
 				(void **)treevec, ino, EXT_TREE_VEC_SIZE))) {
 		unsigned i;
-- 
cgit v1.1


From 9edcdabf36422d15d01db73b0fa5487e418beff6 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Fri, 11 Sep 2015 14:43:52 +0800
Subject: f2fs: fix overflow of size calculation

We have potential overflow issue when calculating size of object, when
we left shift index with PAGE_CACHE_SHIFT bits, if type of index has only
32-bits space in 32-bit architecture, left shifting will incur overflow,
i.e:

pgoff_t index =  0xFFFFFFFF;
loff_t size = index << PAGE_CACHE_SHIFT;
size: 0xFFFFF000

So we should cast index with 64-bits type to avoid this issue.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/data.c     | 11 ++++++-----
 fs/f2fs/debug.c    | 12 ++++++------
 fs/f2fs/f2fs.h     |  2 +-
 fs/f2fs/file.c     | 18 ++++++++++--------
 fs/f2fs/recovery.c |  2 +-
 5 files changed, 24 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index a82abe9..681373e 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -447,9 +447,9 @@ repeat:
 		lock_page(page);
 	}
 got_it:
-	if (new_i_size &&
-		i_size_read(inode) < ((index + 1) << PAGE_CACHE_SHIFT)) {
-		i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT));
+	if (new_i_size && i_size_read(inode) <
+				((loff_t)(index + 1) << PAGE_CACHE_SHIFT)) {
+		i_size_write(inode, ((loff_t)(index + 1) << PAGE_CACHE_SHIFT));
 		/* Only the directory inode sets new_i_size */
 		set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR);
 	}
@@ -489,8 +489,9 @@ alloc:
 	/* update i_size */
 	fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
 							dn->ofs_in_node;
-	if (i_size_read(dn->inode) < ((fofs + 1) << PAGE_CACHE_SHIFT))
-		i_size_write(dn->inode, ((fofs + 1) << PAGE_CACHE_SHIFT));
+	if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT))
+		i_size_write(dn->inode,
+				((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT));
 
 	/* direct IO doesn't use extent cache to maximize the performance */
 	f2fs_drop_largest_extent(dn->inode, fofs);
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index d013d84..ebfcc40 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -198,9 +198,9 @@ get_cache:
 
 	si->page_mem = 0;
 	npages = NODE_MAPPING(sbi)->nrpages;
-	si->page_mem += npages << PAGE_CACHE_SHIFT;
+	si->page_mem += (unsigned long long)npages << PAGE_CACHE_SHIFT;
 	npages = META_MAPPING(sbi)->nrpages;
-	si->page_mem += npages << PAGE_CACHE_SHIFT;
+	si->page_mem += (unsigned long long)npages << PAGE_CACHE_SHIFT;
 }
 
 static int stat_show(struct seq_file *s, void *v)
@@ -333,13 +333,13 @@ static int stat_show(struct seq_file *s, void *v)
 
 		/* memory footprint */
 		update_mem_info(si->sbi);
-		seq_printf(s, "\nMemory: %u KB\n",
+		seq_printf(s, "\nMemory: %llu KB\n",
 			(si->base_mem + si->cache_mem + si->page_mem) >> 10);
-		seq_printf(s, "  - static: %u KB\n",
+		seq_printf(s, "  - static: %llu KB\n",
 				si->base_mem >> 10);
-		seq_printf(s, "  - cached: %u KB\n",
+		seq_printf(s, "  - cached: %llu KB\n",
 				si->cache_mem >> 10);
-		seq_printf(s, "  - paged : %u KB\n",
+		seq_printf(s, "  - paged : %llu KB\n",
 				si->page_mem >> 10);
 	}
 	mutex_unlock(&f2fs_stat_mutex);
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index f1a90ff..79c38ad 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1844,7 +1844,7 @@ struct f2fs_stat_info {
 	unsigned int segment_count[2];
 	unsigned int block_count[2];
 	unsigned int inplace_count;
-	unsigned base_mem, cache_mem, page_mem;
+	unsigned long long base_mem, cache_mem, page_mem;
 };
 
 static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index a85bb14..27a789c 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -74,7 +74,8 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
 		goto mapped;
 
 	/* page is wholly or partially inside EOF */
-	if (((page->index + 1) << PAGE_CACHE_SHIFT) > i_size_read(inode)) {
+	if (((loff_t)(page->index + 1) << PAGE_CACHE_SHIFT) >
+						i_size_read(inode)) {
 		unsigned offset;
 		offset = i_size_read(inode) & ~PAGE_CACHE_MASK;
 		zero_user_segment(page, offset, PAGE_CACHE_SIZE);
@@ -343,7 +344,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
 
 	dirty = __get_first_dirty_index(inode->i_mapping, pgofs, whence);
 
-	for (; data_ofs < isize; data_ofs = pgofs << PAGE_CACHE_SHIFT) {
+	for (; data_ofs < isize; data_ofs = (loff_t)pgofs << PAGE_CACHE_SHIFT) {
 		set_new_dnode(&dn, inode, NULL, NULL, 0);
 		err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA);
 		if (err && err != -ENOENT) {
@@ -802,8 +803,8 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
 
 			f2fs_balance_fs(sbi);
 
-			blk_start = pg_start << PAGE_CACHE_SHIFT;
-			blk_end = pg_end << PAGE_CACHE_SHIFT;
+			blk_start = (loff_t)pg_start << PAGE_CACHE_SHIFT;
+			blk_end = (loff_t)pg_end << PAGE_CACHE_SHIFT;
 			truncate_inode_pages_range(mapping, blk_start,
 					blk_end - 1);
 
@@ -994,7 +995,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
 				return ret;
 
 			new_size = max_t(loff_t, new_size,
-						pg_start << PAGE_CACHE_SHIFT);
+					(loff_t)pg_start << PAGE_CACHE_SHIFT);
 		}
 
 		for (index = pg_start; index < pg_end; index++) {
@@ -1030,7 +1031,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
 			f2fs_unlock_op(sbi);
 
 			new_size = max_t(loff_t, new_size,
-					(index + 1) << PAGE_CACHE_SHIFT);
+				(loff_t)(index + 1) << PAGE_CACHE_SHIFT);
 		}
 
 		if (off_end) {
@@ -1192,9 +1193,10 @@ noalloc:
 		if (pg_start == pg_end)
 			new_size = offset + len;
 		else if (index == pg_start && off_start)
-			new_size = (index + 1) << PAGE_CACHE_SHIFT;
+			new_size = (loff_t)(index + 1) << PAGE_CACHE_SHIFT;
 		else if (index == pg_end)
-			new_size = (index << PAGE_CACHE_SHIFT) + off_end;
+			new_size = ((loff_t)index << PAGE_CACHE_SHIFT) +
+								off_end;
 		else
 			new_size += PAGE_CACHE_SIZE;
 	}
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index faec2ca..acc21f2 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -570,7 +570,7 @@ out:
 
 	/* truncate meta pages to be used by the recovery */
 	truncate_inode_pages_range(META_MAPPING(sbi),
-			MAIN_BLKADDR(sbi) << PAGE_CACHE_SHIFT, -1);
+			(loff_t)MAIN_BLKADDR(sbi) << PAGE_CACHE_SHIFT, -1);
 
 	if (err) {
 		truncate_inode_pages_final(NODE_MAPPING(sbi));
-- 
cgit v1.1


From 514053e4543b33bbd74d8900f368df663cb03325 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Tue, 15 Sep 2015 14:46:43 -0700
Subject: f2fs: declare f2fs_update_extent_tree_range as static

This function should be static.

Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/extent_cache.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index a8b9aa2..63068b7 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -394,7 +394,7 @@ do_insert:
 	return en;
 }
 
-unsigned int f2fs_update_extent_tree_range(struct inode *inode,
+static unsigned int f2fs_update_extent_tree_range(struct inode *inode,
 				pgoff_t fofs, block_t blkaddr, unsigned int len)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-- 
cgit v1.1


From 973163fc0ce07761eae671e895dc6747a3410fe7 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Fri, 18 Sep 2015 16:51:51 +0800
Subject: f2fs: reorganize f2fs_map_blocks

In this patch, we try to reorganize f2fs_map_blocks to make block mapping
flow more clear by using following structure:

/* check status of mapping */

if (unmapped) {
	/* blkaddr == NULL_ADDR || blkaddr == NEW_ADDR */

	if (create) {
		/* write path, handle dio write case here */
		alloc_and_map;
	} else {
		/*
		 * handle read cases from all call paths:
		 *     1. generic read;
		 *     2. dio read;
		 *     3. fiemap;
		 *     4. bmap
		 */
	}
}

/* map buffer_header */

Besides, this patch handles the missing case correctly for dio write:
When we fail in __allocate_data_blocks, then in f2fs_map_blocks, we will
not allocate blocks correctly for preallocated blocks, but returning with
an unmapped buffer head, which will result in failure of dio write.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/data.c | 84 ++++++++++++++++++++++++++++++----------------------------
 1 file changed, 44 insertions(+), 40 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 681373e..883b649 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -596,40 +596,36 @@ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
 			err = 0;
 		goto unlock_out;
 	}
-	if (dn.data_blkaddr == NEW_ADDR) {
-		if (flag == F2FS_GET_BLOCK_BMAP) {
-			err = -ENOENT;
-			goto put_out;
-		} else if (flag == F2FS_GET_BLOCK_READ ||
-				flag == F2FS_GET_BLOCK_DIO) {
-			goto put_out;
+
+	if (dn.data_blkaddr == NEW_ADDR || dn.data_blkaddr == NULL_ADDR) {
+		if (create) {
+			err = __allocate_data_block(&dn);
+			if (err)
+				goto put_out;
+			allocated = true;
+			map->m_flags = F2FS_MAP_NEW;
+		} else {
+			if (flag != F2FS_GET_BLOCK_FIEMAP ||
+						dn.data_blkaddr != NEW_ADDR) {
+				if (flag == F2FS_GET_BLOCK_BMAP)
+					err = -ENOENT;
+				goto put_out;
+			}
+
+			/*
+			 * preallocated unwritten block should be mapped
+			 * for fiemap.
+			 */
+			if (dn.data_blkaddr == NEW_ADDR)
+				map->m_flags = F2FS_MAP_UNWRITTEN;
 		}
-		/*
-		 * if it is in fiemap call path (flag = F2FS_GET_BLOCK_FIEMAP),
-		 * mark it as mapped and unwritten block.
-		 */
 	}
 
-	if (dn.data_blkaddr != NULL_ADDR) {
-		map->m_flags = F2FS_MAP_MAPPED;
-		map->m_pblk = dn.data_blkaddr;
-		if (dn.data_blkaddr == NEW_ADDR)
-			map->m_flags |= F2FS_MAP_UNWRITTEN;
-	} else if (create) {
-		err = __allocate_data_block(&dn);
-		if (err)
-			goto put_out;
-		allocated = true;
-		map->m_flags = F2FS_MAP_NEW | F2FS_MAP_MAPPED;
-		map->m_pblk = dn.data_blkaddr;
-	} else {
-		if (flag == F2FS_GET_BLOCK_BMAP)
-			err = -ENOENT;
-		goto put_out;
-	}
+	map->m_flags |= F2FS_MAP_MAPPED;
+	map->m_pblk = dn.data_blkaddr;
+	map->m_len = 1;
 
 	end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
-	map->m_len = 1;
 	dn.ofs_in_node++;
 	pgofs++;
 
@@ -648,23 +644,31 @@ get_next:
 			goto unlock_out;
 		}
 
-		if (dn.data_blkaddr == NEW_ADDR &&
-				flag != F2FS_GET_BLOCK_FIEMAP)
-			goto put_out;
-
 		end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
 	}
 
 	if (maxblocks > map->m_len) {
 		block_t blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
-		if (blkaddr == NULL_ADDR && create) {
-			err = __allocate_data_block(&dn);
-			if (err)
-				goto sync_out;
-			allocated = true;
-			map->m_flags |= F2FS_MAP_NEW;
-			blkaddr = dn.data_blkaddr;
+
+		if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) {
+			if (create) {
+				err = __allocate_data_block(&dn);
+				if (err)
+					goto sync_out;
+				allocated = true;
+				map->m_flags |= F2FS_MAP_NEW;
+				blkaddr = dn.data_blkaddr;
+			} else {
+				/*
+				 * we only merge preallocated unwritten blocks
+				 * for fiemap.
+				 */
+				if (flag != F2FS_GET_BLOCK_FIEMAP ||
+						blkaddr != NEW_ADDR)
+					goto sync_out;
+			}
 		}
+
 		/* Give more consecutive addresses for the readahead */
 		if ((map->m_pblk != NEW_ADDR &&
 				blkaddr == (map->m_pblk + ofs)) ||
-- 
cgit v1.1


From 569cf1876a32e574ba8a7fb825cd91bafd003882 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Thu, 3 Sep 2015 13:38:23 -0700
Subject: f2fs crypto: allocate buffer for decrypting filename

We got dentry pages from high_mem, and its address space directly goes into the
decryption path via f2fs_fname_disk_to_usr.
But, sg_init_one assumes the address is not from high_mem, so we can get this
panic since it doesn't call kmap_high but kunmap_high is triggered at the end.

kernel BUG at ../../../../../../kernel/mm/highmem.c:290!
Internal error: Oops - BUG: 0 [#1] PREEMPT SMP ARM
...
 (kunmap_high+0xb0/0xb8) from [<c0114534>] (__kunmap_atomic+0xa0/0xa4)
 (__kunmap_atomic+0xa0/0xa4) from [<c035f028>] (blkcipher_walk_done+0x128/0x1ec)
 (blkcipher_walk_done+0x128/0x1ec) from [<c0366c24>] (crypto_cbc_decrypt+0xc0/0x170)
 (crypto_cbc_decrypt+0xc0/0x170) from [<c0367148>] (crypto_cts_decrypt+0xc0/0x114)
 (crypto_cts_decrypt+0xc0/0x114) from [<c035ea98>] (async_decrypt+0x40/0x48)
 (async_decrypt+0x40/0x48) from [<c032ca34>] (f2fs_fname_disk_to_usr+0x124/0x304)
 (f2fs_fname_disk_to_usr+0x124/0x304) from [<c03056fc>] (f2fs_fill_dentries+0xac/0x188)
 (f2fs_fill_dentries+0xac/0x188) from [<c03059c8>] (f2fs_readdir+0x1f0/0x300)
 (f2fs_readdir+0x1f0/0x300) from [<c0218054>] (vfs_readdir+0x90/0xb4)
 (vfs_readdir+0x90/0xb4) from [<c0218418>] (SyS_getdents64+0x64/0xcc)
 (SyS_getdents64+0x64/0xcc) from [<c0105ba0>] (ret_fast_syscall+0x0/0x30)

Cc: <stable@vger.kernel.org>
Reviewed-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/dir.c   | 13 ++++++++++---
 fs/f2fs/namei.c | 10 +++++++++-
 2 files changed, 19 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 8f15fc1..6726c4a 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -787,7 +787,6 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
 		else
 			d_type = DT_UNKNOWN;
 
-		/* encrypted case */
 		de_name.name = d->filename[bit_pos];
 		de_name.len = le16_to_cpu(de->name_len);
 
@@ -795,12 +794,20 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
 			int save_len = fstr->len;
 			int ret;
 
+			de_name.name = kmalloc(de_name.len, GFP_NOFS);
+			if (!de_name.name)
+				return false;
+
+			memcpy(de_name.name, d->filename[bit_pos], de_name.len);
+
 			ret = f2fs_fname_disk_to_usr(d->inode, &de->hash_code,
 							&de_name, fstr);
-			de_name = *fstr;
-			fstr->len = save_len;
+			kfree(de_name.name);
 			if (ret < 0)
 				return true;
+
+			de_name = *fstr;
+			fstr->len = save_len;
 		}
 
 		if (!dir_emit(ctx, de_name.name, de_name.len,
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index a680bf3..dfa01c8 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -947,8 +947,13 @@ static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cook
 
 	/* Symlink is encrypted */
 	sd = (struct f2fs_encrypted_symlink_data *)caddr;
-	cstr.name = sd->encrypted_path;
 	cstr.len = le16_to_cpu(sd->len);
+	cstr.name = kmalloc(cstr.len, GFP_NOFS);
+	if (!cstr.name) {
+		res = -ENOMEM;
+		goto errout;
+	}
+	memcpy(cstr.name, sd->encrypted_path, cstr.len);
 
 	/* this is broken symlink case */
 	if (cstr.name[0] == 0 && cstr.len == 0) {
@@ -970,6 +975,8 @@ static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cook
 	if (res < 0)
 		goto errout;
 
+	kfree(cstr.name);
+
 	paddr = pstr.name;
 
 	/* Null-terminate the name */
@@ -979,6 +986,7 @@ static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cook
 	page_cache_release(cpage);
 	return *cookie = paddr;
 errout:
+	kfree(cstr.name);
 	f2fs_fname_crypto_free_buffer(&pstr);
 	kunmap(cpage);
 	page_cache_release(cpage);
-- 
cgit v1.1


From a7230d16d5f8635dbc029af057969e46e3eb2246 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Wed, 16 Sep 2015 14:06:54 -0700
Subject: f2fs: check end_io for metapages before making next checkpoint blocks

This patch avoids to produce new checkpoint blocks before the previous meta
pages were written completely.

Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/checkpoint.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index c5a38e3..ff53405 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -1000,6 +1000,11 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 
 	start_blk = __start_cp_addr(sbi);
 
+	/* need to wait for end_io results */
+	wait_on_all_pages_writeback(sbi);
+	if (unlikely(f2fs_cp_error(sbi)))
+		return;
+
 	/* write out checkpoint buffer at block 0 */
 	update_meta_page(sbi, ckpt, start_blk++);
 
-- 
cgit v1.1


From 41a099de3a8d2b570147445c5cddc59c0daf5e96 Mon Sep 17 00:00:00 2001
From: Fan Li <fanofcode.li@samsung.com>
Date: Thu, 17 Sep 2015 18:24:17 +0800
Subject: f2fs: drop largest extent by range

now we update extent by range, fofs may not be on the largest
extent if the new extent overlaps with it. so add a new function
to drop largest extent properly.

Signed-off-by: Fan li <fanofcode.li@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/extent_cache.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index 63068b7..2f013e2 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -155,11 +155,12 @@ static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi,
 	return count - et->count;
 }
 
-static void __drop_largest_extent(struct inode *inode, pgoff_t fofs)
+static void __drop_largest_extent(struct inode *inode,
+					pgoff_t fofs, unsigned int len)
 {
 	struct extent_info *largest = &F2FS_I(inode)->extent_tree->largest;
 
-	if (largest->fofs <= fofs && largest->fofs + largest->len > fofs)
+	if (fofs < largest->fofs + largest->len && fofs + len > largest->fofs)
 		largest->len = 0;
 }
 
@@ -168,7 +169,7 @@ void f2fs_drop_largest_extent(struct inode *inode, pgoff_t fofs)
 	if (!f2fs_may_extent_tree(inode))
 		return;
 
-	__drop_largest_extent(inode, fofs);
+	__drop_largest_extent(inode, fofs, 1);
 }
 
 void f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext)
@@ -422,7 +423,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode,
 	dei.len = 0;
 
 	/* we do not guarantee that the largest extent is cached all the time */
-	__drop_largest_extent(inode, fofs);
+	__drop_largest_extent(inode, fofs, len);
 
 	/* 1. lookup first extent node in range [fofs, fofs + len - 1] */
 	en = __lookup_extent_tree_ret(et, fofs, &prev_en, &next_en,
-- 
cgit v1.1


From 4d1fa815f26ce92dc4710d28f50071174a2340b7 Mon Sep 17 00:00:00 2001
From: Fan Li <fanofcode.li@samsung.com>
Date: Thu, 17 Sep 2015 18:42:06 +0800
Subject: f2fs: optimize code of f2fs_update_extent_tree_range

Fix 2 potential problems:
1. when largest extent needs to be invalidated, it will be reset in
   __drop_largest_extent, which makes __is_extent_same after always
   return false, and largest extent unchanged. Now we update it properly.

2. when extent is split and the latter part remains in tree, next_en
   should be the latter part instead of next extent of original extent.
   It will cause merge failure if there is in-place update, although
   there is not, I think this fix will still makes codes less ambiguous.

This patch also simplifies codes of invalidating extents, and optimizes the
procedues that split extent into two.
There are a few modifications after last patch:
1. prev_en now is updated properly.
2. more codes and branches are simplified.

Signed-off-by: Fan li <fanofcode.li@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/extent_cache.c | 171 ++++++++++++++++++-------------------------------
 1 file changed, 62 insertions(+), 109 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index 2f013e2..c9d1cfd 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -400,7 +400,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode,
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct extent_tree *et = F2FS_I(inode)->extent_tree;
-	struct extent_node *en = NULL, *en1 = NULL, *en2 = NULL, *en3 = NULL;
+	struct extent_node *en = NULL, *en1 = NULL;
 	struct extent_node *prev_en = NULL, *next_en = NULL;
 	struct extent_info ei, dei, prev;
 	struct rb_node **insert_p = NULL, *insert_parent = NULL;
@@ -422,148 +422,101 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode,
 	prev = et->largest;
 	dei.len = 0;
 
-	/* we do not guarantee that the largest extent is cached all the time */
+	/*
+	 * drop largest extent before lookup, in case it's already
+	 * been shrunk from extent tree
+	 */
 	__drop_largest_extent(inode, fofs, len);
 
 	/* 1. lookup first extent node in range [fofs, fofs + len - 1] */
 	en = __lookup_extent_tree_ret(et, fofs, &prev_en, &next_en,
 					&insert_p, &insert_parent);
-	if (!en) {
-		if (next_en) {
-			en = next_en;
-			f2fs_bug_on(sbi, en->ei.fofs <= pos);
-			pos = en->ei.fofs;
-		} else {
-			/*
-			 * skip searching in the tree since there is no
-			 * larger extent node in the cache.
-			 */
-			goto update_extent;
-		}
-	}
+	if (!en)
+		en = next_en;
 
 	/* 2. invlidate all extent nodes in range [fofs, fofs + len - 1] */
-	while (en) {
-		struct rb_node *node;
+	while (en && en->ei.fofs < end) {
+		unsigned int org_end;
+		int parts = 0;	/* # of parts current extent split into */
 
-		if (pos >= end)
-			break;
+		next_en = en1 = NULL;
 
 		dei = en->ei;
-		en1 = en2 = NULL;
+		org_end = dei.fofs + dei.len;
+		f2fs_bug_on(sbi, pos >= org_end);
 
-		node = rb_next(&en->rb_node);
+		if (pos > dei.fofs &&	pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) {
+			en->ei.len = pos - en->ei.fofs;
+			prev_en = en;
+			parts = 1;
+		}
 
-		/*
-		 * 2.1 there are four cases when we invalidate blkaddr in extent
-		 * node, |V: valid address, X: will be invalidated|
-		 */
-		/* case#1, invalidate right part of extent node |VVVVVXXXXX| */
-		if (pos > dei.fofs && end >= dei.fofs + dei.len) {
-			en->ei.len = pos - dei.fofs;
-
-			if (en->ei.len < F2FS_MIN_EXTENT_LEN) {
-				__detach_extent_node(sbi, et, en);
-				insert_p = NULL;
-				insert_parent = NULL;
-				goto update;
+		if (end < org_end && org_end - end >= F2FS_MIN_EXTENT_LEN) {
+			if (parts) {
+				set_extent_info(&ei, end,
+						end - dei.fofs + dei.blk,
+						org_end - end);
+				en1 = __insert_extent_tree(sbi, et, &ei,
+							NULL, NULL);
+				next_en = en1;
+			} else {
+				en->ei.fofs = end;
+				en->ei.blk += end - dei.fofs;
+				en->ei.len -= end - dei.fofs;
+				next_en = en;
 			}
-
-			if (__is_extent_same(&dei, &et->largest))
-				et->largest = en->ei;
-			goto next;
+			parts++;
 		}
 
-		/* case#2, invalidate left part of extent node |XXXXXVVVVV| */
-		if (pos <= dei.fofs && end < dei.fofs + dei.len) {
-			en->ei.fofs = end;
-			en->ei.blk += end - dei.fofs;
-			en->ei.len -= end - dei.fofs;
-
-			if (en->ei.len < F2FS_MIN_EXTENT_LEN) {
-				__detach_extent_node(sbi, et, en);
-				insert_p = NULL;
-				insert_parent = NULL;
-				goto update;
-			}
+		if (!next_en) {
+			struct rb_node *node = rb_next(&en->rb_node);
 
-			if (__is_extent_same(&dei, &et->largest))
-				et->largest = en->ei;
-			goto next;
+			next_en = node ?
+				rb_entry(node, struct extent_node, rb_node)
+				: NULL;
 		}
 
-		__detach_extent_node(sbi, et, en);
-
-		/*
-		 * if we remove node in rb-tree, our parent node pointer may
-		 * point the wrong place, discard them.
-		 */
-		insert_p = NULL;
-		insert_parent = NULL;
-
-		/* case#3, invalidate entire extent node |XXXXXXXXXX| */
-		if (pos <= dei.fofs && end >= dei.fofs + dei.len) {
-			if (__is_extent_same(&dei, &et->largest))
-				et->largest.len = 0;
-			goto update;
+		if (parts) {
+			if (en->ei.len > et->largest.len)
+				et->largest = en->ei;
+		} else {
+			__detach_extent_node(sbi, et, en);
 		}
 
 		/*
-		 * case#4, invalidate data in the middle of extent node
-		 * |VVVXXXXVVV|
+		 * if original extent is split into zero or two parts, extent
+		 * tree has been altered by deletion or insertion, therefore
+		 * invalidate pointers regard to tree.
 		 */
-		if (dei.len > F2FS_MIN_EXTENT_LEN) {
-			unsigned int endofs;
-
-			/*  insert left part of split extent into cache */
-			if (pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) {
-				set_extent_info(&ei, dei.fofs, dei.blk,
-							pos - dei.fofs);
-				en1 = __insert_extent_tree(sbi, et, &ei,
-								NULL, NULL);
-			}
-
-			/* insert right part of split extent into cache */
-			endofs = dei.fofs + dei.len;
-			if (endofs - end >= F2FS_MIN_EXTENT_LEN) {
-				set_extent_info(&ei, end,
-						end - dei.fofs + dei.blk,
-						endofs - end);
-				en2 = __insert_extent_tree(sbi, et, &ei,
-								NULL, NULL);
-			}
+		if (parts != 1) {
+			insert_p = NULL;
+			insert_parent = NULL;
 		}
-update:
-		/* 2.2 update in global extent list */
+
+		/* update in global extent list */
 		spin_lock(&sbi->extent_lock);
-		if (en && !list_empty(&en->list))
+		if (!parts && !list_empty(&en->list))
 			list_del(&en->list);
 		if (en1)
 			list_add_tail(&en1->list, &sbi->extent_list);
-		if (en2)
-			list_add_tail(&en2->list, &sbi->extent_list);
 		spin_unlock(&sbi->extent_lock);
 
-		/* 2.3 release extent node */
-		if (en)
+		/* release extent node */
+		if (!parts)
 			kmem_cache_free(extent_node_slab, en);
-next:
-		en = node ? rb_entry(node, struct extent_node, rb_node) : NULL;
-		next_en = en;
-		if (en)
-			pos = en->ei.fofs;
+
+		en = next_en;
 	}
 
-update_extent:
 	/* 3. update extent in extent cache */
 	if (blkaddr) {
 		struct extent_node *den = NULL;
 
 		set_extent_info(&ei, fofs, blkaddr, len);
-		en3 = __try_merge_extent_node(sbi, et, &ei, &den,
+		en1 = __try_merge_extent_node(sbi, et, &ei, &den,
 							prev_en, next_en);
-		if (!en3)
-			en3 = __insert_extent_tree(sbi, et, &ei,
+		if (!en1)
+			en1 = __insert_extent_tree(sbi, et, &ei,
 						insert_p, insert_parent);
 
 		/* give up extent_cache, if split and small updates happen */
@@ -575,11 +528,11 @@ update_extent:
 		}
 
 		spin_lock(&sbi->extent_lock);
-		if (en3) {
-			if (list_empty(&en3->list))
-				list_add_tail(&en3->list, &sbi->extent_list);
+		if (en1) {
+			if (list_empty(&en1->list))
+				list_add_tail(&en1->list, &sbi->extent_list);
 			else
-				list_move_tail(&en3->list, &sbi->extent_list);
+				list_move_tail(&en1->list, &sbi->extent_list);
 		}
 		if (den && !list_empty(&den->list))
 			list_del(&den->list);
-- 
cgit v1.1


From ea58711e884c3006e6f61e44d47f0c2d9f0979e1 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Thu, 17 Sep 2015 20:22:44 +0800
Subject: f2fs: do in batches truncation in truncate_hole

truncate_data_blocks_range can do in batches truncation which makes all
changes in dnode page content, dnode page status, extent cache, block
count updating together.

But previously, truncate_hole() always truncates one block in dnode page
at a time by invoking truncate_data_blocks_range(,1), which make thing
slow.

This patch changes truncate_hole() to do in batches truncation for all
target blocks in one direct node inside truncate_data_blocks_range, which
can make our punch hole operation in ->fallocate more efficent.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/file.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 27a789c..6702157 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -739,23 +739,31 @@ static int fill_zero(struct inode *inode, pgoff_t index,
 
 int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end)
 {
-	pgoff_t index;
 	int err;
 
-	for (index = pg_start; index < pg_end; index++) {
+	while (pg_start < pg_end) {
 		struct dnode_of_data dn;
+		pgoff_t end_offset, count;
 
 		set_new_dnode(&dn, inode, NULL, NULL, 0);
-		err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
+		err = get_dnode_of_data(&dn, pg_start, LOOKUP_NODE);
 		if (err) {
-			if (err == -ENOENT)
+			if (err == -ENOENT) {
+				pg_start++;
 				continue;
+			}
 			return err;
 		}
 
-		if (dn.data_blkaddr != NULL_ADDR)
-			truncate_data_blocks_range(&dn, 1);
+		end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
+		count = min(end_offset - dn.ofs_in_node, pg_end - pg_start);
+
+		f2fs_bug_on(F2FS_I_SB(inode), count == 0 || count > end_offset);
+
+		truncate_data_blocks_range(&dn, count);
 		f2fs_put_dnode(&dn);
+
+		pg_start += count;
 	}
 	return 0;
 }
-- 
cgit v1.1


From f9811703fefbdb22abf723ec61368777cdad8508 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Mon, 21 Sep 2015 20:17:52 +0800
Subject: f2fs: fix to handle io error in ->direct_IO

Here is a oops reported as following message when testing generic/019 of
xfstest:

 ------------[ cut here ]------------
 kernel BUG at /home/yuchao/git/f2fs-dev/segment.c:882!
 invalid opcode: 0000 [#1] SMP
 Modules linked in: zram lz4_compress lz4_decompress f2fs(O) ip6table_filter ip6_tables ebtable_nat ebtables nf_conntrack_ipv4
nf_def
 CPU: 2 PID: 25441 Comm: fio Tainted: G           O    4.3.0-rc1+ #6
 Hardware name: Hewlett-Packard HP Z220 CMT Workstation/1790, BIOS K51 v01.61 05/16/2013
 task: ffff8803f4e85580 ti: ffff8803fd61c000 task.ti: ffff8803fd61c000
 RIP: 0010:[<ffffffffa0784981>]  [<ffffffffa0784981>] new_curseg+0x321/0x330 [f2fs]
 RSP: 0018:ffff8803fd61f918  EFLAGS: 00010246
 RAX: 00000000000007ed RBX: 0000000000000224 RCX: 000000000000001f
 RDX: 0000000000000800 RSI: ffffffffffffffff RDI: ffff8803f56f4300
 RBP: ffff8803fd61f978 R08: 0000000000000000 R09: 0000000000000000
 R10: 0000000000000024 R11: ffff8800d23bbd78 R12: ffff8800d0ef0000
 R13: 0000000000000224 R14: 0000000000000000 R15: 0000000000000001
 FS:  00007f827ff85700(0000) GS:ffff88041ea80000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: ffffffffff600000 CR3: 00000003fef17000 CR4: 00000000001406e0
 Stack:
  000007ea00000002 0000000100000001 ffff8803f6456248 000007ed0000002b
  0000000000000224 ffff880404d1aa20 ffff8803fd61f9c8 ffff8800d0ef0000
  ffff8803f6456248 0000000000000001 00000000ffffffff ffffffffa078f358
 Call Trace:
  [<ffffffffa0785b87>] allocate_segment_by_default+0x1a7/0x1f0 [f2fs]
  [<ffffffffa078322c>] allocate_data_block+0x17c/0x360 [f2fs]
  [<ffffffffa0779521>] __allocate_data_block+0x131/0x1d0 [f2fs]
  [<ffffffffa077a995>] f2fs_direct_IO+0x4b5/0x580 [f2fs]
  [<ffffffff811510ae>] generic_file_direct_write+0xae/0x160
  [<ffffffff811518f5>] __generic_file_write_iter+0xd5/0x1f0
  [<ffffffff81151e07>] generic_file_write_iter+0xf7/0x200
  [<ffffffff81319e38>] ? apparmor_file_permission+0x18/0x20
  [<ffffffffa0768480>] ? f2fs_fallocate+0x1190/0x1190 [f2fs]
  [<ffffffffa07684c6>] f2fs_file_write_iter+0x46/0x90 [f2fs]
  [<ffffffff8120b4fe>] aio_run_iocb+0x1ee/0x290
  [<ffffffff81700f7e>] ? mutex_lock+0x1e/0x50
  [<ffffffff8120a1d7>] ? aio_read_events+0x207/0x2b0
  [<ffffffff8120b913>] do_io_submit+0x373/0x630
  [<ffffffff8120a4f6>] ? SyS_io_getevents+0x56/0xb0
  [<ffffffff8120bbe0>] SyS_io_submit+0x10/0x20
  [<ffffffff81703857>] entry_SYSCALL_64_fastpath+0x12/0x6a
 Code: 45 c8 48 8b 78 10 e8 9f 23 bf e0 41 8b 8c 24 cc 03 00 00 89 c7 31 d2 89 c6 89 d8 29 df f7 f1 29 d1 39 cf 0f 83 be fd ff ff eb
 RIP  [<ffffffffa0784981>] new_curseg+0x321/0x330 [f2fs]
  RSP <ffff8803fd61f918>
 ---[ end trace 2e577d7f711ddb86 ]---

The reason is that: in the test of generic/019, we will trigger a manmade
IO error in block layer through debugfs, after that, prefree segment will
no longer be freed, because we always skip doing gc or checkpoint when
there occurs an IO error.

Meanwhile fio with aio engine generated a large number of direct IOs,
which continue allocating spaces in free segment until we run out of them,
eventually, results in panic in new_curseg as no more free segment was
found.

So, this patch changes to return EIO in direct_IO for this condition.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/data.c | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 883b649..d265401 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -524,6 +524,9 @@ static void __allocate_data_blocks(struct inode *inode, loff_t offset,
 		while (dn.ofs_in_node < end_offset && len) {
 			block_t blkaddr;
 
+			if (unlikely(f2fs_cp_error(sbi)))
+				goto sync_out;
+
 			blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
 			if (blkaddr == NULL_ADDR || blkaddr == NEW_ADDR) {
 				if (__allocate_data_block(&dn))
@@ -566,6 +569,7 @@ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
 {
 	unsigned int maxblocks = map->m_len;
 	struct dnode_of_data dn;
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	int mode = create ? ALLOC_NODE : LOOKUP_NODE_RA;
 	pgoff_t pgofs, end_offset;
 	int err = 0, ofs = 1;
@@ -599,6 +603,10 @@ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
 
 	if (dn.data_blkaddr == NEW_ADDR || dn.data_blkaddr == NULL_ADDR) {
 		if (create) {
+			if (unlikely(f2fs_cp_error(sbi))) {
+				err = -EIO;
+				goto put_out;
+			}
 			err = __allocate_data_block(&dn);
 			if (err)
 				goto put_out;
@@ -652,6 +660,10 @@ get_next:
 
 		if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) {
 			if (create) {
+				if (unlikely(f2fs_cp_error(sbi))) {
+					err = -EIO;
+					goto sync_out;
+				}
 				err = __allocate_data_block(&dn);
 				if (err)
 					goto sync_out;
@@ -1556,10 +1568,16 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 
 	trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
 
-	if (iov_iter_rw(iter) == WRITE)
+	if (iov_iter_rw(iter) == WRITE) {
 		__allocate_data_blocks(inode, offset, count);
+		if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) {
+			err = -EIO;
+			goto out;
+		}
+	}
 
 	err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block_dio);
+out:
 	if (err < 0 && iov_iter_rw(iter) == WRITE)
 		f2fs_write_failed(mapping, offset + count);
 
-- 
cgit v1.1


From 46c9e1413f3c4ecbbe775e545063e88feb1f9871 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Fri, 18 Sep 2015 16:54:16 +0800
Subject: f2fs: use correct flag in f2fs_map_blocks()

We introduce F2FS_GET_BLOCK_READ in commit e2b4e2bc8865 ("f2fs: fix
incorrect mapping for bmap"), but forget to use this flag in the right
place, fix it.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/data.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index d265401..bc04e92 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -920,7 +920,8 @@ static int f2fs_mpage_readpages(struct address_space *mapping,
 			map.m_lblk = block_in_file;
 			map.m_len = last_block - block_in_file;
 
-			if (f2fs_map_blocks(inode, &map, 0, false))
+			if (f2fs_map_blocks(inode, &map, 0,
+							F2FS_GET_BLOCK_READ))
 				goto set_error_page;
 		}
 got_it:
-- 
cgit v1.1


From 9cd81ce3c2f01fc599b9156b94b868b4f578698c Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Fri, 18 Sep 2015 16:55:26 +0800
Subject: f2fs: disallow switch extent_cache option dynamically

Swith extent_cache option dynamically when remount may casue consistency
issue between extent cache and dnode page. Fix in this patch to avoid
that condition.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/super.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'fs')

diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index f794781..16442ec 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -742,6 +742,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
 	int err, active_logs;
 	bool need_restart_gc = false;
 	bool need_stop_gc = false;
+	bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE);
 
 	sync_filesystem(sb);
 
@@ -767,6 +768,14 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
 	if (f2fs_readonly(sb) && (*flags & MS_RDONLY))
 		goto skip;
 
+	/* disallow enable/disable extent_cache dynamically */
+	if (no_extent_cache == !!test_opt(sbi, EXTENT_CACHE)) {
+		err = -EINVAL;
+		f2fs_msg(sbi->sb, KERN_WARNING,
+				"switch extent_cache option is not allowed");
+		goto restore_opts;
+	}
+
 	/*
 	 * We stop the GC thread if FS is mounted as RO
 	 * or if background_gc = off is passed in mount
-- 
cgit v1.1


From 545fe4210df5eb4097aa17c68f0f153db27bcf44 Mon Sep 17 00:00:00 2001
From: Nicholas Krause <xerofoify@gmail.com>
Date: Mon, 21 Sep 2015 18:55:49 -0400
Subject: f2fs: fix error handling for calls to various functions in the
 function recover_inline_data

This fixes error handling for calls to various functions in the
function  recover_inline_data to check if these particular functions
either return a error code or the boolean value false to signal their
caller they have failed internally and if this arises return false
to signal failure immediately to the caller of recover_inline_data
as we cannot continue after failures to calling either the function
truncate_inline_inode or truncate_blocks.

Signed-off-by: Nicholas Krause <xerofoify@gmail.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/inline.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 3d143be..3b18e23 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -274,12 +274,14 @@ process_inline:
 	if (f2fs_has_inline_data(inode)) {
 		ipage = get_node_page(sbi, inode->i_ino);
 		f2fs_bug_on(sbi, IS_ERR(ipage));
-		truncate_inline_inode(ipage, 0);
+		if (!truncate_inline_inode(ipage, 0))
+			return false;
 		f2fs_clear_inline_inode(inode);
 		update_inode(inode, ipage);
 		f2fs_put_page(ipage, 1);
 	} else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) {
-		truncate_blocks(inode, 0, false);
+		if (truncate_blocks(inode, 0, false))
+			return false;
 		goto process_inline;
 	}
 	return false;
-- 
cgit v1.1


From 4abd3f5ac49314fe345566ec859b8e60993207bf Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Tue, 22 Sep 2015 21:07:47 +0800
Subject: f2fs: introduce __try_update_largest_extent

This patch adds a new helper __try_update_largest_extent for cleanup.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/extent_cache.c | 14 +++++---------
 fs/f2fs/f2fs.h         |  7 +++++++
 2 files changed, 12 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index c9d1cfd..a38ee9b 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -351,8 +351,7 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi,
 	}
 
 	if (en) {
-		if (en->ei.len > et->largest.len)
-			et->largest = en->ei;
+		__try_update_largest_extent(et, en);
 		et->cached_en = en;
 	}
 	return en;
@@ -389,8 +388,7 @@ do_insert:
 	if (!en)
 		return NULL;
 
-	if (en->ei.len > et->largest.len)
-		et->largest = en->ei;
+	__try_update_largest_extent(et, en);
 	et->cached_en = en;
 	return en;
 }
@@ -476,12 +474,10 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode,
 				: NULL;
 		}
 
-		if (parts) {
-			if (en->ei.len > et->largest.len)
-				et->largest = en->ei;
-		} else {
+		if (parts)
+			__try_update_largest_extent(et, en);
+		else
 			__detach_extent_node(sbi, et, en);
-		}
 
 		/*
 		 * if original extent is split into zero or two parts, extent
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 79c38ad..8d6681a 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -492,6 +492,13 @@ static inline bool __is_front_mergeable(struct extent_info *cur,
 	return __is_extent_mergeable(cur, front);
 }
 
+static inline void __try_update_largest_extent(struct extent_tree *et,
+						struct extent_node *en)
+{
+	if (en->ei.len > et->largest.len)
+		et->largest = en->ei;
+}
+
 struct f2fs_nm_info {
 	block_t nat_blkaddr;		/* base disk address of NAT */
 	nid_t max_nid;			/* maximum possible node ids */
-- 
cgit v1.1


From 1d7e10d58a1f62c8c44668cca1cff6a2dd57d7d9 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Wed, 23 Sep 2015 09:25:43 +0800
Subject: f2fs: fix incorrect bimodal calculation

In update_sit_info, we use div_u64 to handle 'u64 divide u64' case, but
div_u64 can only handle 32-bits divisor, so our divisor with u64 type
passed to div_u64 will overflow, result in the wrong calculation when
show debug info of f2fs as below:

BDF: 464, avg. vblocks: 23509
(BDF should never exceed 100)

So change to use div64_u64 to handle this case correctly.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/debug.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index ebfcc40..615a307 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -118,7 +118,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
 		}
 	}
 	dist = div_u64(MAIN_SECS(sbi) * hblks_per_sec * hblks_per_sec, 100);
-	si->bimodal = div_u64(bimodal, dist);
+	si->bimodal = div64_u64(bimodal, dist);
 	if (si->dirty_count)
 		si->avg_vblocks = div_u64(total_vblocks, ndirty);
 	else
-- 
cgit v1.1


From 7223554133c3f72809ea6ddbf0d8464e7c70c1b1 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Fri, 25 Sep 2015 17:54:56 +0800
Subject: f2fs: remove unneeded f2fs_{,un}lock_op in do_recover_data()

Protecting recovery flow by using cp_rwsem is not needed, since we have
prevent triggering any checkpoint by locking cp_mutex previously.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/recovery.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index acc21f2..c5daec5 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -383,15 +383,11 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
 	start = start_bidx_of_node(ofs_of_node(page), fi);
 	end = start + ADDRS_PER_PAGE(page, fi);
 
-	f2fs_lock_op(sbi);
-
 	set_new_dnode(&dn, inode, NULL, NULL, 0);
 
 	err = get_dnode_of_data(&dn, start, ALLOC_NODE);
-	if (err) {
-		f2fs_unlock_op(sbi);
+	if (err)
 		goto out;
-	}
 
 	f2fs_wait_on_page_writeback(dn.node_page, NODE);
 
@@ -456,7 +452,6 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
 	set_page_dirty(dn.node_page);
 err:
 	f2fs_put_dnode(&dn);
-	f2fs_unlock_op(sbi);
 out:
 	f2fs_msg(sbi->sb, KERN_NOTICE,
 		"recover_data: ino = %lx, recovered = %d blocks, err = %d",
-- 
cgit v1.1


From 90b803e6fb6243922bff9ddd8a6205c17cb93b31 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Fri, 25 Sep 2015 19:34:50 -0700
Subject: f2fs: do not skip dentry block writes

Previously, we skip dentry block writes when wbc is SYNC_NONE with no memory
pressure and the number of dirty pages is pretty small.

But, we didn't skip for normal data writes, which gives us not much big impact
on overall performance.
Moreover, by skipping some data writes, kworker falls into infinite loop to try
to write blocks, when many dir inodes have only one dentry block.

So, this patch removes skipping data writes.

Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/data.c    | 5 -----
 fs/f2fs/node.c    | 5 -----
 fs/f2fs/node.h    | 1 -
 fs/f2fs/segment.h | 4 +---
 4 files changed, 1 insertion(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index bc04e92..a903423 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1340,11 +1340,6 @@ static int f2fs_write_data_pages(struct address_space *mapping,
 	if (!get_dirty_pages(inode) && wbc->sync_mode == WB_SYNC_NONE)
 		return 0;
 
-	if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE &&
-			get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) &&
-			available_free_memory(sbi, DIRTY_DENTS))
-		goto skip_write;
-
 	/* during POR, we don't need to trigger writepage at all. */
 	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
 		goto skip_write;
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 4d9bedf..1fe49ca 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -52,11 +52,6 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
 		mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >>
 							PAGE_CACHE_SHIFT;
 		res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
-	} else if (type == DIRTY_DENTS) {
-		if (sbi->sb->s_bdi->wb.dirty_exceeded)
-			return false;
-		mem_size = get_pages(sbi, F2FS_DIRTY_DENTS);
-		res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
 	} else if (type == INO_ENTRIES) {
 		int i;
 
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 7427e95..51c62ed 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -118,7 +118,6 @@ static inline void raw_nat_from_node_info(struct f2fs_nat_entry *raw_ne,
 enum mem_type {
 	FREE_NIDS,	/* indicates the free nid list */
 	NAT_ENTRIES,	/* indicates the cached nat entry */
-	DIRTY_DENTS,	/* indicates dirty dentry pages */
 	INO_ENTRIES,	/* indicates inode entries */
 	EXTENT_CACHE,	/* indicates extent cache */
 	BASE_CHECK,	/* check kernel status */
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index b6e4ed1..a294da7 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -697,9 +697,7 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type)
 	if (sbi->sb->s_bdi->wb.dirty_exceeded)
 		return 0;
 
-	if (type == DATA)
-		return sbi->blocks_per_seg;
-	else if (type == NODE)
+	if (type == NODE)
 		return 3 * sbi->blocks_per_seg;
 	else if (type == META)
 		return MAX_BIO_BLOCKS(sbi);
-- 
cgit v1.1


From 345a6b2ee2987a11bc8e9c08ff2b68a973fd912c Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Mon, 28 Sep 2015 17:46:01 +0800
Subject: f2fs: fix to update {m,c}time correctly when truncating larger

This patch fixes to update ctime and atime correctly when truncating
larger in ->setattr.

The bug is reported by xfstest generic/313 as below:

generic/313 2s ... - output mismatch (see ./results/generic/313.out.bad)
    --- tests/generic/313.out   2015-08-04 15:28:53.430798882 +0800
    +++ results/generic/313.out.bad   2015-09-28 17:04:27.294278016 +0800
    @@ -1,2 +1,4 @@
     QA output created by 313
     Silence is golden
    +ctime not updated after truncate up
    +mtime not updated after truncate up
    ...
    (Run 'diff -u tests/generic/313.out tests/generic/313.out.bad'  to see the entire diff)
Ran: generic/313
Failures: generic/313
Failed 1 of 1 tests

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/file.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 6702157..a350c2a 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -681,6 +681,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
 			 * larger than i_size.
 			 */
 			truncate_setsize(inode, attr->ia_size);
+			inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		}
 	}
 
-- 
cgit v1.1


From 45fe8492ccbe561c4b8918c2d4c83a0501e50646 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Mon, 28 Sep 2015 17:42:24 +0800
Subject: f2fs: fix to correct freed section number during gc

This patch fixes to maintain the right section count freed in garbage
collecting when triggering a foreground gc.

Besides, when a foreground gc is running on current selected section, once
we fail to gc one segment, it's better to abandon gcing the left segments
in current section, because anyway we will select next victim for
foreground gc, so gc on the left segments in previous section will become
overhead and also cause the long latency for caller.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/gc.c | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 782b8e7..b6e03eb 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -802,7 +802,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi)
 	unsigned int segno = NULL_SEGNO;
 	unsigned int i;
 	int gc_type = BG_GC;
-	int nfree = 0;
+	int sec_freed = 0;
 	int ret = -1;
 	struct cp_control cpc;
 	struct gc_inode_list gc_list = {
@@ -817,7 +817,7 @@ gc_more:
 	if (unlikely(f2fs_cp_error(sbi)))
 		goto stop;
 
-	if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) {
+	if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed)) {
 		gc_type = FG_GC;
 		if (__get_victim(sbi, &segno, gc_type) || prefree_segments(sbi))
 			write_checkpoint(sbi, &cpc);
@@ -832,13 +832,23 @@ gc_more:
 		ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), sbi->segs_per_sec,
 								META_SSA);
 
-	for (i = 0; i < sbi->segs_per_sec; i++)
-		nfree += do_garbage_collect(sbi, segno + i, &gc_list, gc_type);
+	for (i = 0; i < sbi->segs_per_sec; i++) {
+		/*
+		 * for FG_GC case, halt gcing left segments once failed one
+		 * of segments in selected section to avoid long latency.
+		 */
+		if (!do_garbage_collect(sbi, segno + i, &gc_list, gc_type) &&
+				gc_type == FG_GC)
+			break;
+	}
+
+	if (i == sbi->segs_per_sec && gc_type == FG_GC)
+		sec_freed++;
 
 	if (gc_type == FG_GC)
 		sbi->cur_victim_sec = NULL_SEGNO;
 
-	if (has_not_enough_free_secs(sbi, nfree))
+	if (has_not_enough_free_secs(sbi, sec_freed))
 		goto gc_more;
 
 	if (gc_type == FG_GC)
-- 
cgit v1.1


From ab126cfc3090fa5af2a87cc0698f793aebbec7d0 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Fri, 18 Sep 2015 17:33:00 -0700
Subject: f2fs: should get a victim from retrials

If we do not call get_victim first, we cannot get a new victim for retrial
path.

Reviewed-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/gc.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index b6e03eb..343b096 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -799,8 +799,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
 
 int f2fs_gc(struct f2fs_sb_info *sbi)
 {
-	unsigned int segno = NULL_SEGNO;
-	unsigned int i;
+	unsigned int segno, i;
 	int gc_type = BG_GC;
 	int sec_freed = 0;
 	int ret = -1;
@@ -812,6 +811,8 @@ int f2fs_gc(struct f2fs_sb_info *sbi)
 
 	cpc.reason = __get_cp_reason(sbi);
 gc_more:
+	segno = NULL_SEGNO;
+
 	if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE)))
 		goto stop;
 	if (unlikely(f2fs_cp_error(sbi)))
-- 
cgit v1.1


From 39307a8e2459ecdee9f1bc0b8a5d7af4a6d8f754 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Tue, 22 Sep 2015 13:50:47 -0700
Subject: f2fs: use vmalloc to handle -ENOMEM error

This patch introduces f2fs_kvmalloc to avoid -ENOMEM during mount.

Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/f2fs.h    | 21 +++++++++++++++++++++
 fs/f2fs/segment.c | 32 ++++++++++++++++----------------
 2 files changed, 37 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 8d6681a..6f27b92 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -19,6 +19,7 @@
 #include <linux/magic.h>
 #include <linux/kobject.h>
 #include <linux/sched.h>
+#include <linux/vmalloc.h>
 #include <linux/bio.h>
 
 #ifdef CONFIG_F2FS_CHECK_FS
@@ -1586,6 +1587,26 @@ static inline bool f2fs_may_extent_tree(struct inode *inode)
 	return S_ISREG(mode);
 }
 
+static inline void *f2fs_kvmalloc(size_t size, gfp_t flags)
+{
+	void *ret;
+
+	ret = kmalloc(size, flags | __GFP_NOWARN);
+	if (!ret)
+		ret = __vmalloc(size, flags, PAGE_KERNEL);
+	return ret;
+}
+
+static inline void *f2fs_kvzalloc(size_t size, gfp_t flags)
+{
+	void *ret;
+
+	ret = kzalloc(size, flags | __GFP_NOWARN);
+	if (!ret)
+		ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL);
+	return ret;
+}
+
 #define get_inode_mode(i) \
 	((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \
 	 (F2FS_I(i)->i_acl_mode) : ((i)->i_mode))
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 78e6d06..0ceff28 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -14,7 +14,6 @@
 #include <linux/blkdev.h>
 #include <linux/prefetch.h>
 #include <linux/kthread.h>
-#include <linux/vmalloc.h>
 #include <linux/swap.h>
 
 #include "f2fs.h"
@@ -1955,12 +1954,13 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
 
 	SM_I(sbi)->sit_info = sit_i;
 
-	sit_i->sentries = vzalloc(MAIN_SEGS(sbi) * sizeof(struct seg_entry));
+	sit_i->sentries = f2fs_kvzalloc(MAIN_SEGS(sbi) *
+					sizeof(struct seg_entry), GFP_KERNEL);
 	if (!sit_i->sentries)
 		return -ENOMEM;
 
 	bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
-	sit_i->dirty_sentries_bitmap = kzalloc(bitmap_size, GFP_KERNEL);
+	sit_i->dirty_sentries_bitmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL);
 	if (!sit_i->dirty_sentries_bitmap)
 		return -ENOMEM;
 
@@ -1982,8 +1982,8 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
 		return -ENOMEM;
 
 	if (sbi->segs_per_sec > 1) {
-		sit_i->sec_entries = vzalloc(MAIN_SECS(sbi) *
-					sizeof(struct sec_entry));
+		sit_i->sec_entries = f2fs_kvzalloc(MAIN_SECS(sbi) *
+					sizeof(struct sec_entry), GFP_KERNEL);
 		if (!sit_i->sec_entries)
 			return -ENOMEM;
 	}
@@ -2028,12 +2028,12 @@ static int build_free_segmap(struct f2fs_sb_info *sbi)
 	SM_I(sbi)->free_info = free_i;
 
 	bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
-	free_i->free_segmap = kmalloc(bitmap_size, GFP_KERNEL);
+	free_i->free_segmap = f2fs_kvmalloc(bitmap_size, GFP_KERNEL);
 	if (!free_i->free_segmap)
 		return -ENOMEM;
 
 	sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi));
-	free_i->free_secmap = kmalloc(sec_bitmap_size, GFP_KERNEL);
+	free_i->free_secmap = f2fs_kvmalloc(sec_bitmap_size, GFP_KERNEL);
 	if (!free_i->free_secmap)
 		return -ENOMEM;
 
@@ -2174,7 +2174,7 @@ static int init_victim_secmap(struct f2fs_sb_info *sbi)
 	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
 	unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi));
 
-	dirty_i->victim_secmap = kzalloc(bitmap_size, GFP_KERNEL);
+	dirty_i->victim_secmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL);
 	if (!dirty_i->victim_secmap)
 		return -ENOMEM;
 	return 0;
@@ -2196,7 +2196,7 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi)
 	bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
 
 	for (i = 0; i < NR_DIRTY_TYPE; i++) {
-		dirty_i->dirty_segmap[i] = kzalloc(bitmap_size, GFP_KERNEL);
+		dirty_i->dirty_segmap[i] = f2fs_kvzalloc(bitmap_size, GFP_KERNEL);
 		if (!dirty_i->dirty_segmap[i])
 			return -ENOMEM;
 	}
@@ -2301,7 +2301,7 @@ static void discard_dirty_segmap(struct f2fs_sb_info *sbi,
 	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
 
 	mutex_lock(&dirty_i->seglist_lock);
-	kfree(dirty_i->dirty_segmap[dirty_type]);
+	kvfree(dirty_i->dirty_segmap[dirty_type]);
 	dirty_i->nr_dirty[dirty_type] = 0;
 	mutex_unlock(&dirty_i->seglist_lock);
 }
@@ -2309,7 +2309,7 @@ static void discard_dirty_segmap(struct f2fs_sb_info *sbi,
 static void destroy_victim_secmap(struct f2fs_sb_info *sbi)
 {
 	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
-	kfree(dirty_i->victim_secmap);
+	kvfree(dirty_i->victim_secmap);
 }
 
 static void destroy_dirty_segmap(struct f2fs_sb_info *sbi)
@@ -2348,8 +2348,8 @@ static void destroy_free_segmap(struct f2fs_sb_info *sbi)
 	if (!free_i)
 		return;
 	SM_I(sbi)->free_info = NULL;
-	kfree(free_i->free_segmap);
-	kfree(free_i->free_secmap);
+	kvfree(free_i->free_segmap);
+	kvfree(free_i->free_secmap);
 	kfree(free_i);
 }
 
@@ -2370,9 +2370,9 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi)
 	}
 	kfree(sit_i->tmp_map);
 
-	vfree(sit_i->sentries);
-	vfree(sit_i->sec_entries);
-	kfree(sit_i->dirty_sentries_bitmap);
+	kvfree(sit_i->sentries);
+	kvfree(sit_i->sec_entries);
+	kvfree(sit_i->dirty_sentries_bitmap);
 
 	SM_I(sbi)->sit_info = NULL;
 	kfree(sit_i->sit_bitmap);
-- 
cgit v1.1


From 5b7ee374144f8ef2db3e25d0d59a8ad83bb3cf33 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Wed, 30 Sep 2015 17:38:48 +0800
Subject: f2fs: use atomic64_t for extent cache hit stat

Our hit stat of extent cache will increase all the time until remount,
and we use atomic_t type for the stat variable, so it may easily incur
overflow when we query extent cache frequently in a long time running
fs.

So to avoid that, this patch uses atomic64_t for hit stat variables.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/debug.c | 22 +++++++++++-----------
 fs/f2fs/f2fs.h  | 19 ++++++++++---------
 2 files changed, 21 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 615a307..478e5d5 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -33,11 +33,11 @@ static void update_general_status(struct f2fs_sb_info *sbi)
 	int i;
 
 	/* validation check of the segment numbers */
-	si->hit_largest = atomic_read(&sbi->read_hit_largest);
-	si->hit_cached = atomic_read(&sbi->read_hit_cached);
-	si->hit_rbtree = atomic_read(&sbi->read_hit_rbtree);
+	si->hit_largest = atomic64_read(&sbi->read_hit_largest);
+	si->hit_cached = atomic64_read(&sbi->read_hit_cached);
+	si->hit_rbtree = atomic64_read(&sbi->read_hit_rbtree);
 	si->hit_total = si->hit_largest + si->hit_cached + si->hit_rbtree;
-	si->total_ext = atomic_read(&sbi->total_hit_ext);
+	si->total_ext = atomic64_read(&sbi->total_hit_ext);
 	si->ext_tree = sbi->total_ext_tree;
 	si->ext_node = atomic_read(&sbi->total_ext_node);
 	si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES);
@@ -283,12 +283,12 @@ static int stat_show(struct seq_file *s, void *v)
 		seq_printf(s, "  - node blocks : %d (%d)\n", si->node_blks,
 				si->bg_node_blks);
 		seq_puts(s, "\nExtent Cache:\n");
-		seq_printf(s, "  - Hit Count: L1-1:%d L1-2:%d L2:%d\n",
+		seq_printf(s, "  - Hit Count: L1-1:%llu L1-2:%llu L2:%llu\n",
 				si->hit_largest, si->hit_cached,
 				si->hit_rbtree);
-		seq_printf(s, "  - Hit Ratio: %d%% (%d / %d)\n",
+		seq_printf(s, "  - Hit Ratio: %llu%% (%llu / %llu)\n",
 				!si->total_ext ? 0 :
-				(si->hit_total * 100) / si->total_ext,
+				div64_u64(si->hit_total * 100, si->total_ext),
 				si->hit_total, si->total_ext);
 		seq_printf(s, "  - Inner Struct Count: tree: %d, node: %d\n",
 				si->ext_tree, si->ext_node);
@@ -378,10 +378,10 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
 	si->sbi = sbi;
 	sbi->stat_info = si;
 
-	atomic_set(&sbi->total_hit_ext, 0);
-	atomic_set(&sbi->read_hit_rbtree, 0);
-	atomic_set(&sbi->read_hit_largest, 0);
-	atomic_set(&sbi->read_hit_cached, 0);
+	atomic64_set(&sbi->total_hit_ext, 0);
+	atomic64_set(&sbi->read_hit_rbtree, 0);
+	atomic64_set(&sbi->read_hit_largest, 0);
+	atomic64_set(&sbi->read_hit_cached, 0);
 
 	atomic_set(&sbi->inline_xattr, 0);
 	atomic_set(&sbi->inline_inode, 0);
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 6f27b92..1fa0555 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -795,10 +795,10 @@ struct f2fs_sb_info {
 	unsigned int segment_count[2];		/* # of allocated segments */
 	unsigned int block_count[2];		/* # of allocated blocks */
 	atomic_t inplace_count;		/* # of inplace update */
-	atomic_t total_hit_ext;			/* # of lookup extent cache */
-	atomic_t read_hit_rbtree;		/* # of hit rbtree extent node */
-	atomic_t read_hit_largest;		/* # of hit largest extent node */
-	atomic_t read_hit_cached;		/* # of hit cached extent node */
+	atomic64_t total_hit_ext;		/* # of lookup extent cache */
+	atomic64_t read_hit_rbtree;		/* # of hit rbtree extent node */
+	atomic64_t read_hit_largest;		/* # of hit largest extent node */
+	atomic64_t read_hit_cached;		/* # of hit cached extent node */
 	atomic_t inline_xattr;			/* # of inline_xattr inodes */
 	atomic_t inline_inode;			/* # of inline_data inodes */
 	atomic_t inline_dir;			/* # of inline_dentry inodes */
@@ -1848,7 +1848,8 @@ struct f2fs_stat_info {
 	struct f2fs_sb_info *sbi;
 	int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs;
 	int main_area_segs, main_area_sections, main_area_zones;
-	int hit_largest, hit_cached, hit_rbtree, hit_total, total_ext;
+	unsigned long long hit_largest, hit_cached, hit_rbtree;
+	unsigned long long hit_total, total_ext;
 	int ext_tree, ext_node;
 	int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta;
 	int nats, dirty_nats, sits, dirty_sits, fnids;
@@ -1885,10 +1886,10 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
 #define stat_inc_bggc_count(sbi)	((sbi)->bg_gc++)
 #define stat_inc_dirty_dir(sbi)		((sbi)->n_dirty_dirs++)
 #define stat_dec_dirty_dir(sbi)		((sbi)->n_dirty_dirs--)
-#define stat_inc_total_hit(sbi)		(atomic_inc(&(sbi)->total_hit_ext))
-#define stat_inc_rbtree_node_hit(sbi)	(atomic_inc(&(sbi)->read_hit_rbtree))
-#define stat_inc_largest_node_hit(sbi)	(atomic_inc(&(sbi)->read_hit_largest))
-#define stat_inc_cached_node_hit(sbi)	(atomic_inc(&(sbi)->read_hit_cached))
+#define stat_inc_total_hit(sbi)		(atomic64_inc(&(sbi)->total_hit_ext))
+#define stat_inc_rbtree_node_hit(sbi)	(atomic64_inc(&(sbi)->read_hit_rbtree))
+#define stat_inc_largest_node_hit(sbi)	(atomic64_inc(&(sbi)->read_hit_largest))
+#define stat_inc_cached_node_hit(sbi)	(atomic64_inc(&(sbi)->read_hit_cached))
 #define stat_inc_inline_xattr(inode)					\
 	do {								\
 		if (f2fs_has_inline_xattr(inode))			\
-- 
cgit v1.1


From a43f7ec327b0d86cbb80d0841673038c0706e714 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Mon, 5 Oct 2015 22:19:24 +0800
Subject: f2fs: fix to avoid redundant searching in dirty map during gc

When doing gc, we search a victim in dirty map, starting from position of
last victim, we will reset the current searching position until we touch
the end of dirty map, and then search the whole diryt map. So sometimes we
will search the range [victim, last] twice, it's redundant, this patch
avoids this issue.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/gc.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 343b096..e5c255b 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -257,6 +257,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
 	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
 	struct victim_sel_policy p;
 	unsigned int secno, max_cost;
+	unsigned int last_segment = MAIN_SEGS(sbi);
 	int nsearched = 0;
 
 	mutex_lock(&dirty_i->seglist_lock);
@@ -277,9 +278,10 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
 		unsigned long cost;
 		unsigned int segno;
 
-		segno = find_next_bit(p.dirty_segmap, MAIN_SEGS(sbi), p.offset);
-		if (segno >= MAIN_SEGS(sbi)) {
+		segno = find_next_bit(p.dirty_segmap, last_segment, p.offset);
+		if (segno >= last_segment) {
 			if (sbi->last_victim[p.gc_mode]) {
+				last_segment = sbi->last_victim[p.gc_mode];
 				sbi->last_victim[p.gc_mode] = 0;
 				p.offset = 0;
 				continue;
-- 
cgit v1.1


From 3342bb303bf48dd8bb5ac94c3356489ff53e4d04 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Mon, 5 Oct 2015 22:20:40 +0800
Subject: f2fs: skip searching dirty map if dirty segment is not exist

When searching victim during gc, if there are no dirty segments in
filesystem, we will still take the time to search the whole dirty segment
map, it's not needed, it's better to skip in this condition.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/gc.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index e5c255b..d844a80 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -268,6 +268,9 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
 	p.min_segno = NULL_SEGNO;
 	p.min_cost = max_cost = get_max_cost(sbi, &p);
 
+	if (p.max_search == 0)
+		goto out;
+
 	if (p.alloc_mode == LFS && gc_type == FG_GC) {
 		p.min_segno = check_bg_victims(sbi);
 		if (p.min_segno != NULL_SEGNO)
@@ -329,6 +332,7 @@ got_it:
 				sbi->cur_victim_sec,
 				prefree_segments(sbi), free_segments(sbi));
 	}
+out:
 	mutex_unlock(&dirty_i->seglist_lock);
 
 	return (p.min_segno == NULL_SEGNO) ? 0 : 1;
-- 
cgit v1.1


From d530d4d8e237f4d12c93bb76df40b69b8b8a1dcd Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Mon, 5 Oct 2015 22:22:44 +0800
Subject: f2fs: support synchronous gc in ioctl

This patch drops in batches gc triggered through ioctl, since user
can easily control the gc by designing the loop around the ->ioctl.

We support synchronous gc by forcing using FG_GC in f2fs_gc, so with
it, user can make sure that in this round all blocks gced were
persistent in the device until ioctl returned.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/f2fs.h    |  2 +-
 fs/f2fs/file.c    | 22 +++++++++-------------
 fs/f2fs/gc.c      | 21 +++++++++++++--------
 fs/f2fs/gc.h      |  6 ------
 fs/f2fs/segment.c |  2 +-
 5 files changed, 24 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 1fa0555..1b2fc23 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1830,7 +1830,7 @@ int f2fs_release_page(struct page *, gfp_t);
 int start_gc_thread(struct f2fs_sb_info *);
 void stop_gc_thread(struct f2fs_sb_info *);
 block_t start_bidx_of_node(unsigned int, struct f2fs_inode_info *);
-int f2fs_gc(struct f2fs_sb_info *);
+int f2fs_gc(struct f2fs_sb_info *, bool);
 void build_gc_manager(struct f2fs_sb_info *);
 
 /*
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index a350c2a..c269966 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -1618,29 +1618,25 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg)
 {
 	struct inode *inode = file_inode(filp);
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-	__u32 i, count;
+	__u32 sync;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	if (get_user(count, (__u32 __user *)arg))
+	if (get_user(sync, (__u32 __user *)arg))
 		return -EFAULT;
 
-	if (!count || count > F2FS_BATCH_GC_MAX_NUM)
-		return -EINVAL;
+	if (f2fs_readonly(sbi->sb))
+		return -EROFS;
 
-	for (i = 0; i < count; i++) {
+	if (!sync) {
 		if (!mutex_trylock(&sbi->gc_mutex))
-			break;
-
-		if (f2fs_gc(sbi))
-			break;
+			return -EBUSY;
+	} else {
+		mutex_lock(&sbi->gc_mutex);
 	}
 
-	if (put_user(i, (__u32 __user *)arg))
-		return -EFAULT;
-
-	return 0;
+	return f2fs_gc(sbi, sync);
 }
 
 long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index d844a80..830d277 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -78,7 +78,7 @@ static int gc_thread_func(void *data)
 		stat_inc_bggc_count(sbi);
 
 		/* if return value is not zero, no victim was selected */
-		if (f2fs_gc(sbi))
+		if (f2fs_gc(sbi, false))
 			wait_ms = gc_th->no_gc_sleep_time;
 
 		/* balancing f2fs's metadata periodically */
@@ -803,12 +803,12 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
 	return nfree;
 }
 
-int f2fs_gc(struct f2fs_sb_info *sbi)
+int f2fs_gc(struct f2fs_sb_info *sbi, bool sync)
 {
 	unsigned int segno, i;
-	int gc_type = BG_GC;
+	int gc_type = sync ? FG_GC : BG_GC;
 	int sec_freed = 0;
-	int ret = -1;
+	int ret = -EINVAL;
 	struct cp_control cpc;
 	struct gc_inode_list gc_list = {
 		.ilist = LIST_HEAD_INIT(gc_list.ilist),
@@ -855,15 +855,20 @@ gc_more:
 	if (gc_type == FG_GC)
 		sbi->cur_victim_sec = NULL_SEGNO;
 
-	if (has_not_enough_free_secs(sbi, sec_freed))
-		goto gc_more;
+	if (!sync) {
+		if (has_not_enough_free_secs(sbi, sec_freed))
+			goto gc_more;
 
-	if (gc_type == FG_GC)
-		write_checkpoint(sbi, &cpc);
+		if (gc_type == FG_GC)
+			write_checkpoint(sbi, &cpc);
+	}
 stop:
 	mutex_unlock(&sbi->gc_mutex);
 
 	put_gc_inode(&gc_list);
+
+	if (sync)
+		ret = sec_freed ? 0 : -EAGAIN;
 	return ret;
 }
 
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index c5a055b..b4a65be 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -19,12 +19,6 @@
 #define LIMIT_INVALID_BLOCK	40 /* percentage over total user space */
 #define LIMIT_FREE_BLOCK	40 /* percentage over invalid + free space */
 
-/*
- * with this macro, we can control the max time we do garbage collection,
- * when user triggers batch mode gc by ioctl.
- */
-#define F2FS_BATCH_GC_MAX_NUM		16
-
 /* Search max. number of dirty segments to select a victim segment */
 #define DEF_MAX_VICTIM_SEARCH 4096 /* covers 8GB */
 
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 0ceff28..6b8edf2 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -295,7 +295,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi)
 	 */
 	if (has_not_enough_free_secs(sbi, 0)) {
 		mutex_lock(&sbi->gc_mutex);
-		f2fs_gc(sbi);
+		f2fs_gc(sbi, false);
 	}
 }
 
-- 
cgit v1.1


From 456b88e4d15de833165cce67c2cdf998139e2fc1 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Mon, 5 Oct 2015 22:24:19 +0800
Subject: f2fs: introduce a new ioctl F2FS_IOC_WRITE_CHECKPOINT

This patch introduce a new ioctl for those users who want to trigger
checkpoint from userspace through ioctl.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/f2fs.h |  1 +
 fs/f2fs/file.c | 23 +++++++++++++++++++++++
 2 files changed, 24 insertions(+)

(limited to 'fs')

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 1b2fc23..18e5902 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -231,6 +231,7 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
 #define F2FS_IOC_RELEASE_VOLATILE_WRITE	_IO(F2FS_IOCTL_MAGIC, 4)
 #define F2FS_IOC_ABORT_VOLATILE_WRITE	_IO(F2FS_IOCTL_MAGIC, 5)
 #define F2FS_IOC_GARBAGE_COLLECT	_IO(F2FS_IOCTL_MAGIC, 6)
+#define F2FS_IOC_WRITE_CHECKPOINT	_IO(F2FS_IOCTL_MAGIC, 7)
 
 #define F2FS_IOC_SET_ENCRYPTION_POLICY					\
 		_IOR('f', 19, struct f2fs_encryption_policy)
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index c269966..b3985a6 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -1639,6 +1639,27 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg)
 	return f2fs_gc(sbi, sync);
 }
 
+static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg)
+{
+	struct inode *inode = file_inode(filp);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct cp_control cpc;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (f2fs_readonly(sbi->sb))
+		return -EROFS;
+
+	cpc.reason = __get_cp_reason(sbi);
+
+	mutex_lock(&sbi->gc_mutex);
+	write_checkpoint(sbi, &cpc);
+	mutex_unlock(&sbi->gc_mutex);
+
+	return 0;
+}
+
 long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	switch (cmd) {
@@ -1670,6 +1691,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		return f2fs_ioc_get_encryption_pwsalt(filp, arg);
 	case F2FS_IOC_GARBAGE_COLLECT:
 		return f2fs_ioc_gc(filp, arg);
+	case F2FS_IOC_WRITE_CHECKPOINT:
+		return f2fs_ioc_write_checkpoint(filp, arg);
 	default:
 		return -ENOTTY;
 	}
-- 
cgit v1.1


From 6aefd93b01379bf0b62f8b38dcf7a21397893833 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Mon, 5 Oct 2015 11:02:54 -0700
Subject: f2fs: introduce background_gc=sync mount option

This patch introduce background_gc=sync enabling synchronous cleaning in
background.

Reviewed-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/f2fs.h    |  1 +
 fs/f2fs/gc.c      |  2 +-
 fs/f2fs/segment.h |  4 +++-
 fs/f2fs/super.c   | 21 +++++++++++++++------
 4 files changed, 20 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 18e5902..00bd470 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -53,6 +53,7 @@
 #define F2FS_MOUNT_NOBARRIER		0x00000800
 #define F2FS_MOUNT_FASTBOOT		0x00001000
 #define F2FS_MOUNT_EXTENT_CACHE		0x00002000
+#define F2FS_MOUNT_FORCE_FG_GC		0x00004000
 
 #define clear_opt(sbi, option)	(sbi->mount_opt.opt &= ~F2FS_MOUNT_##option)
 #define set_opt(sbi, option)	(sbi->mount_opt.opt |= F2FS_MOUNT_##option)
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 830d277..e627c19 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -78,7 +78,7 @@ static int gc_thread_func(void *data)
 		stat_inc_bggc_count(sbi);
 
 		/* if return value is not zero, no victim was selected */
-		if (f2fs_gc(sbi, false))
+		if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC)))
 			wait_ms = gc_th->no_gc_sleep_time;
 
 		/* balancing f2fs's metadata periodically */
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index a294da7..e9afb58 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -137,10 +137,12 @@ enum {
 /*
  * BG_GC means the background cleaning job.
  * FG_GC means the on-demand cleaning job.
+ * FORCE_FG_GC means on-demand cleaning job in background.
  */
 enum {
 	BG_GC = 0,
-	FG_GC
+	FG_GC,
+	FORCE_FG_GC,
 };
 
 /* for a function parameter to select a victim segment */
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 16442ec..ba058d0 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -292,11 +292,16 @@ static int parse_options(struct super_block *sb, char *options)
 
 			if (!name)
 				return -ENOMEM;
-			if (strlen(name) == 2 && !strncmp(name, "on", 2))
+			if (strlen(name) == 2 && !strncmp(name, "on", 2)) {
 				set_opt(sbi, BG_GC);
-			else if (strlen(name) == 3 && !strncmp(name, "off", 3))
+				clear_opt(sbi, FORCE_FG_GC);
+			} else if (strlen(name) == 3 && !strncmp(name, "off", 3)) {
 				clear_opt(sbi, BG_GC);
-			else {
+				clear_opt(sbi, FORCE_FG_GC);
+			} else if (strlen(name) == 4 && !strncmp(name, "sync", 4)) {
+				set_opt(sbi, BG_GC);
+				set_opt(sbi, FORCE_FG_GC);
+			} else {
 				kfree(name);
 				return -EINVAL;
 			}
@@ -631,10 +636,14 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb);
 
-	if (!f2fs_readonly(sbi->sb) && test_opt(sbi, BG_GC))
-		seq_printf(seq, ",background_gc=%s", "on");
-	else
+	if (!f2fs_readonly(sbi->sb) && test_opt(sbi, BG_GC)) {
+		if (test_opt(sbi, FORCE_FG_GC))
+			seq_printf(seq, ",background_gc=%s", "sync");
+		else
+			seq_printf(seq, ",background_gc=%s", "on");
+	} else {
 		seq_printf(seq, ",background_gc=%s", "off");
+	}
 	if (test_opt(sbi, DISABLE_ROLL_FORWARD))
 		seq_puts(seq, ",disable_roll_forward");
 	if (test_opt(sbi, DISCARD))
-- 
cgit v1.1


From 5c2674347466d5c2d5169214e95f4ad6dc09e9b6 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Mon, 5 Oct 2015 11:32:34 -0700
Subject: f2fs: add a tracepoint for background gc

This patch introduces a tracepoint to monitor background gc behaviors.

Reviewed-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/gc.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index e627c19..e7cec86 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -77,6 +77,9 @@ static int gc_thread_func(void *data)
 
 		stat_inc_bggc_count(sbi);
 
+		trace_f2fs_background_gc(sbi->sb, wait_ms,
+				prefree_segments(sbi), free_segments(sbi));
+
 		/* if return value is not zero, no victim was selected */
 		if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC)))
 			wait_ms = gc_th->no_gc_sleep_time;
-- 
cgit v1.1


From 60b99b486b568c13cbb7caa83cf8a12af7665f1e Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Mon, 5 Oct 2015 14:49:57 -0700
Subject: f2fs: introduce a periodic checkpoint flow

This patch introduces a periodic checkpoint feature.
Note that, this is not enforcing to conduct checkpoints very strictly in terms
of trigger timing, instead just hope to help user experiences.
The default value is 60 seconds.

Reviewed-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/checkpoint.c | 3 +++
 fs/f2fs/f2fs.h       | 2 ++
 fs/f2fs/segment.c    | 4 +++-
 fs/f2fs/super.c      | 5 +++++
 4 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index ff53405..0569097 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -1114,6 +1114,9 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	if (cpc->reason == CP_RECOVERY)
 		f2fs_msg(sbi->sb, KERN_NOTICE,
 			"checkpoint: version = %llx", ckpt_ver);
+
+	/* do checkpoint periodically */
+	sbi->cp_expires = round_jiffies_up(jiffies + HZ * sbi->cp_interval);
 out:
 	mutex_unlock(&sbi->cp_mutex);
 	trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint");
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 00bd470..aad4720 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -124,6 +124,7 @@ enum {
 		(SM_I(sbi)->trim_sections * (sbi)->segs_per_sec)
 #define BATCHED_TRIM_BLOCKS(sbi)	\
 		(BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg)
+#define DEF_CP_INTERVAL			60	/* 60 secs */
 
 struct cp_control {
 	int reason;
@@ -734,6 +735,7 @@ struct f2fs_sb_info {
 	struct rw_semaphore node_write;		/* locking node writes */
 	struct mutex writepages;		/* mutex for writepages() */
 	wait_queue_head_t cp_wait;
+	long cp_expires, cp_interval;		/* next expected periodic cp */
 
 	struct inode_management im[MAX_INO_ENTRY];      /* manage inode cache */
 
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 6b8edf2..1d86a35 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -15,6 +15,7 @@
 #include <linux/prefetch.h>
 #include <linux/kthread.h>
 #include <linux/swap.h>
+#include <linux/timer.h>
 
 #include "f2fs.h"
 #include "segment.h"
@@ -315,7 +316,8 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
 	/* checkpoint is the only way to shrink partial cached entries */
 	if (!available_free_memory(sbi, NAT_ENTRIES) ||
 			excess_prefree_segs(sbi) ||
-			!available_free_memory(sbi, INO_ENTRIES))
+			!available_free_memory(sbi, INO_ENTRIES) ||
+			jiffies > sbi->cp_expires)
 		f2fs_sync_fs(sbi->sb, true);
 }
 
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index ba058d0..cb23d85 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -215,6 +215,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks);
 F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh);
 F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search);
 F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, cp_interval);
 
 #define ATTR_LIST(name) (&f2fs_attr_##name.attr)
 static struct attribute *f2fs_attrs[] = {
@@ -231,6 +232,7 @@ static struct attribute *f2fs_attrs[] = {
 	ATTR_LIST(max_victim_search),
 	ATTR_LIST(dir_level),
 	ATTR_LIST(ram_thresh),
+	ATTR_LIST(cp_interval),
 	NULL,
 };
 
@@ -1014,6 +1016,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
 		atomic_set(&sbi->nr_pages[i], 0);
 
 	sbi->dir_level = DEF_DIR_LEVEL;
+	sbi->cp_interval = DEF_CP_INTERVAL;
 	clear_sbi_flag(sbi, SBI_NEED_FSCK);
 
 	INIT_LIST_HEAD(&sbi->s_list);
@@ -1350,6 +1353,8 @@ try_onemore:
 		f2fs_commit_super(sbi, true);
 	}
 
+	sbi->cp_expires = round_jiffies_up(jiffies);
+
 	return 0;
 
 free_kobj:
-- 
cgit v1.1


From 6066d8cdb6dd0fd51ccd96027f6ae8e6b3b5adff Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Thu, 1 Oct 2015 16:42:55 -0700
Subject: f2fs: merge meta writes as many possible

This patch tries to merge IOs as many as possible when background flusher
conducts flushing the dirty meta pages.

[Before]

...
2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 124320, size = 4096
f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 124560, size = 32768
f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 95720, size = 987136
f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 123928, size = 4096
f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 123944, size = 8192
f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 123968, size = 45056
f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 124064, size = 4096
f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 97648, size = 1007616
f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 123776, size = 8192
f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 123800, size = 32768
f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 124624, size = 4096
f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 99616, size = 921600
f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 123608, size = 4096
f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 123624, size = 77824
f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 123792, size = 4096
f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 123864, size = 32768
...

[After]

...
f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 92168, size = 892928
f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 93912, size = 753664
f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 95384, size = 716800
f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 96784, size = 712704
f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 104160, size = 364544
f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 104872, size = 356352
f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 105568, size = 278528
f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 106112, size = 319488
f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 106736, size = 258048
f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 107240, size = 270336
f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 107768, size = 180224
...

Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/checkpoint.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 0569097..0a9ec42 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -257,7 +257,7 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
 						long nr_to_write)
 {
 	struct address_space *mapping = META_MAPPING(sbi);
-	pgoff_t index = 0, end = LONG_MAX;
+	pgoff_t index = 0, end = LONG_MAX, prev = LONG_MAX;
 	struct pagevec pvec;
 	long nwritten = 0;
 	struct writeback_control wbc = {
@@ -277,6 +277,13 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
 
+			if (prev == LONG_MAX)
+				prev = page->index - 1;
+			if (nr_to_write != LONG_MAX && page->index != prev + 1) {
+				pagevec_release(&pvec);
+				goto stop;
+			}
+
 			lock_page(page);
 
 			if (unlikely(page->mapping != mapping)) {
@@ -297,13 +304,14 @@ continue_unlock:
 				break;
 			}
 			nwritten++;
+			prev = page->index;
 			if (unlikely(nwritten >= nr_to_write))
 				break;
 		}
 		pagevec_release(&pvec);
 		cond_resched();
 	}
-
+stop:
 	if (nwritten)
 		f2fs_submit_merged_bio(sbi, type, WRITE);
 
-- 
cgit v1.1


From 730d9ec36bf27e87805aee6ebf462071735e31f9 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Sat, 10 Oct 2015 18:16:38 +0200
Subject: btrfs: remove waitqueue_active check from
 btrfs_rm_dev_replace_unblocked

Normally the waitqueue_active would need a barrier, but this is not
necessary here because it's not a performance sensitive context and we
can call wake_up directly.

Suggested-by: Chris Mason <clm@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/dev-replace.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index e54dd59..733ff75 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -454,8 +454,7 @@ static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
 static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
 {
 	clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
-	if (waitqueue_active(&fs_info->replace_wait))
-		wake_up(&fs_info->replace_wait);
+	wake_up(&fs_info->replace_wait);
 }
 
 static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
-- 
cgit v1.1


From b666a9cd993c6a49c4bf23a1ed476359c978f60b Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Sat, 10 Oct 2015 18:24:48 +0200
Subject: btrfs: add barrier for waitqueue_active in clear_btree_io_tree

waitqueue_active should be preceded by a barrier, in this function we
don't need to call it all the time.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/transaction.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 74bc333..e8e5b5a 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -82,6 +82,12 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
 static void clear_btree_io_tree(struct extent_io_tree *tree)
 {
 	spin_lock(&tree->lock);
+	/*
+	 * Do a single barrier for the waitqueue_active check here, the state
+	 * of the waitqueue should not change once clear_btree_io_tree is
+	 * called.
+	 */
+	smp_mb();
 	while (!RB_EMPTY_ROOT(&tree->state)) {
 		struct rb_node *node;
 		struct extent_state *state;
-- 
cgit v1.1


From 33a9eca7e4a4c2c17aebbb11f0e506a48ebc30c9 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Sat, 10 Oct 2015 18:35:10 +0200
Subject: btrfs: comment waitqueue_active implied by locks

Suggested-by: Chris Mason <clm@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/raid56.c   | 6 +++++-
 fs/btrfs/tree-log.c | 6 ++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index fcf7265..1a33d3e 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -810,7 +810,11 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
 			}
 
 			goto done_nolock;
-		} else  if (waitqueue_active(&h->wait)) {
+			/*
+			 * The barrier for this waitqueue_active is not needed,
+			 * we're protected by h->lock and can't miss a wakeup.
+			 */
+		} else if (waitqueue_active(&h->wait)) {
 			spin_unlock(&rbio->bio_list_lock);
 			spin_unlock_irqrestore(&h->lock, flags);
 			wake_up(&h->wait);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 1bbaace..d0deb46 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2950,6 +2950,9 @@ out_wake_log_root:
 	atomic_set(&log_root_tree->log_commit[index2], 0);
 	mutex_unlock(&log_root_tree->log_mutex);
 
+	/*
+	 * The barrier before waitqueue_active is implied by mutex_unlock
+	 */
 	if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
 		wake_up(&log_root_tree->log_commit_wait[index2]);
 out:
@@ -2961,6 +2964,9 @@ out:
 	atomic_set(&root->log_commit[index1], 0);
 	mutex_unlock(&root->log_mutex);
 
+	/*
+	 * The barrier before waitqueue_active is implied by mutex_unlock
+	 */
 	if (waitqueue_active(&root->log_commit_wait[index1]))
 		wake_up(&root->log_commit_wait[index1]);
 	return ret;
-- 
cgit v1.1


From a83342aa0c8f0ca90057d3837ae8d198186e5153 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Mon, 16 Feb 2015 19:36:47 +0100
Subject: btrfs: add comments to barriers before waitqueue_active

Reduce number of undocumented barriers out there.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/compression.c  | 3 +++
 fs/btrfs/extent-tree.c  | 3 +--
 fs/btrfs/locking.c      | 3 +++
 fs/btrfs/ordered-data.c | 6 ++++++
 fs/btrfs/transaction.c  | 3 +++
 5 files changed, 16 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 57ee8ca..3a9317c 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -839,6 +839,9 @@ static void free_workspace(int type, struct list_head *workspace)
 	btrfs_compress_op[idx]->free_workspace(workspace);
 	atomic_dec(alloc_workspace);
 wake:
+	/*
+	 * Make sure counter is updated before we wake up waiters.
+	 */
 	smp_mb();
 	if (waitqueue_active(workspace_wait))
 		wake_up(workspace_wait);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9f96042..59eb92f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -10363,8 +10363,7 @@ void btrfs_end_write_no_snapshoting(struct btrfs_root *root)
 {
 	percpu_counter_dec(&root->subv_writers->counter);
 	/*
-	 * Make sure counter is updated before we wake up
-	 * waiters.
+	 * Make sure counter is updated before we wake up waiters.
 	 */
 	smp_mb();
 	if (waitqueue_active(&root->subv_writers->wait))
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index d7e6baf..03f8630 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -280,6 +280,9 @@ void btrfs_tree_unlock(struct extent_buffer *eb)
 	if (blockers) {
 		WARN_ON(atomic_read(&eb->spinning_writers));
 		atomic_dec(&eb->blocking_writers);
+		/*
+		 * Make sure counter is updated before we wake up waiters.
+		 */
 		smp_mb();
 		if (waitqueue_active(&eb->write_lock_wq))
 			wake_up(&eb->write_lock_wq);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 52170cf..071005f 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -345,6 +345,9 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
 
 	if (entry->bytes_left == 0) {
 		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
+		/*
+		 * Implicit memory barrier after test_and_set_bit
+		 */
 		if (waitqueue_active(&entry->wait))
 			wake_up(&entry->wait);
 	} else {
@@ -409,6 +412,9 @@ have_entry:
 
 	if (entry->bytes_left == 0) {
 		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
+		/*
+		 * Implicit memory barrier after test_and_set_bit
+		 */
 		if (waitqueue_active(&entry->wait))
 			wake_up(&entry->wait);
 	} else {
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index e8e5b5a..3fd70f7 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -861,6 +861,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 	atomic_dec(&cur_trans->num_writers);
 	extwriter_counter_dec(cur_trans, trans->type);
 
+	/*
+	 * Make sure counter is updated before we wake up waiters.
+	 */
 	smp_mb();
 	if (waitqueue_active(&cur_trans->writer_wait))
 		wake_up(&cur_trans->writer_wait);
-- 
cgit v1.1


From 779adf0f647651f5a45eeee3442c881300ce989e Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Mon, 16 Feb 2015 19:39:00 +0100
Subject: btrfs: remove extra barrier before waitqueue_active

Removing barriers is scary, but a call to atomic_dec_and_test implies
a barrier, so we don't need to issue another one.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/tree-log.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d0deb46..6327559 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -229,7 +229,9 @@ int btrfs_pin_log_trans(struct btrfs_root *root)
 void btrfs_end_log_trans(struct btrfs_root *root)
 {
 	if (atomic_dec_and_test(&root->log_writers)) {
-		smp_mb();
+		/*
+		 * Implicit memory barrier after atomic_dec_and_test
+		 */
 		if (waitqueue_active(&root->log_writer_wait))
 			wake_up(&root->log_writer_wait);
 	}
@@ -2820,7 +2822,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 
 	mutex_lock(&log_root_tree->log_mutex);
 	if (atomic_dec_and_test(&log_root_tree->log_writers)) {
-		smp_mb();
+		/*
+		 * Implicit memory barrier after atomic_dec_and_test
+		 */
 		if (waitqueue_active(&log_root_tree->log_writer_wait))
 			wake_up(&log_root_tree->log_writer_wait);
 	}
-- 
cgit v1.1


From ee86395458072760d62e66aad10a5e9e8902b8cf Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Mon, 16 Feb 2015 19:41:40 +0100
Subject: btrfs: comment the rest of implicit barriers before waitqueue_active

There are atomic operations that imply the barrier for waitqueue_active
mixed in an if-condition.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/delayed-inode.c | 4 ++++
 fs/btrfs/disk-io.c       | 3 +++
 fs/btrfs/inode.c         | 3 +++
 fs/btrfs/locking.c       | 9 +++++++++
 fs/btrfs/volumes.c       | 3 +++
 5 files changed, 22 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index a2ae427..e0941fb 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -463,6 +463,10 @@ static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node,
 static void finish_one_item(struct btrfs_delayed_root *delayed_root)
 {
 	int seq = atomic_inc_return(&delayed_root->items_seq);
+
+	/*
+	 * atomic_dec_return implies a barrier for waitqueue_active
+	 */
 	if ((atomic_dec_return(&delayed_root->items) <
 	    BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0) &&
 	    waitqueue_active(&delayed_root->wait))
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 295795a..379526f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -802,6 +802,9 @@ static void run_one_async_done(struct btrfs_work *work)
 	limit = btrfs_async_submit_limit(fs_info);
 	limit = limit * 2 / 3;
 
+	/*
+	 * atomic_dec_return implies a barrier for waitqueue_active
+	 */
 	if (atomic_dec_return(&fs_info->nr_async_submits) < limit &&
 	    waitqueue_active(&fs_info->async_submit_wait))
 		wake_up(&fs_info->async_submit_wait);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 611b66d..7be4abe 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1096,6 +1096,9 @@ static noinline void async_cow_submit(struct btrfs_work *work)
 	nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
 		PAGE_CACHE_SHIFT;
 
+	/*
+	 * atomic_sub_return implies a barrier for waitqueue_active
+	 */
 	if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) <
 	    5 * 1024 * 1024 &&
 	    waitqueue_active(&root->fs_info->async_submit_wait))
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 03f8630..8077461 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -79,6 +79,9 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
 		write_lock(&eb->lock);
 		WARN_ON(atomic_read(&eb->spinning_writers));
 		atomic_inc(&eb->spinning_writers);
+		/*
+		 * atomic_dec_and_test implies a barrier for waitqueue_active
+		 */
 		if (atomic_dec_and_test(&eb->blocking_writers) &&
 		    waitqueue_active(&eb->write_lock_wq))
 			wake_up(&eb->write_lock_wq);
@@ -86,6 +89,9 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
 		BUG_ON(atomic_read(&eb->blocking_readers) == 0);
 		read_lock(&eb->lock);
 		atomic_inc(&eb->spinning_readers);
+		/*
+		 * atomic_dec_and_test implies a barrier for waitqueue_active
+		 */
 		if (atomic_dec_and_test(&eb->blocking_readers) &&
 		    waitqueue_active(&eb->read_lock_wq))
 			wake_up(&eb->read_lock_wq);
@@ -229,6 +235,9 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
 	}
 	btrfs_assert_tree_read_locked(eb);
 	WARN_ON(atomic_read(&eb->blocking_readers) == 0);
+	/*
+	 * atomic_dec_and_test implies a barrier for waitqueue_active
+	 */
 	if (atomic_dec_and_test(&eb->blocking_readers) &&
 	    waitqueue_active(&eb->read_lock_wq))
 		wake_up(&eb->read_lock_wq);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 6fc73586..ff35271 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -345,6 +345,9 @@ loop_lock:
 		pending = pending->bi_next;
 		cur->bi_next = NULL;
 
+		/*
+		 * atomic_dec_return implies a barrier for waitqueue_active
+		 */
 		if (atomic_dec_return(&fs_info->nr_async_bios) < limit &&
 		    waitqueue_active(&fs_info->async_submit_wait))
 			wake_up(&fs_info->async_submit_wait);
-- 
cgit v1.1


From daf3761c9fcde0f4ca64321cbed6c1c86d304193 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Fri, 9 Oct 2015 13:44:34 -0400
Subject: namei: results of d_is_negative() should be checked after dentry
 revalidation

Leandro Awa writes:
 "After switching to version 4.1.6, our parallelized and distributed
  workflows now fail consistently with errors of the form:

  T34: ./regex.c:39:22: error: config.h: No such file or directory

  From our 'git bisect' testing, the following commit appears to be the
  possible cause of the behavior we've been seeing: commit 766c4cbfacd8"

Al Viro says:
 "What happens is that 766c4cbfacd8 got the things subtly wrong.

  We used to treat d_is_negative() after lookup_fast() as "fall with
  ENOENT".  That was wrong - checking ->d_flags outside of ->d_seq
  protection is unreliable and failing with hard error on what should've
  fallen back to non-RCU pathname resolution is a bug.

  Unfortunately, we'd pulled the test too far up and ran afoul of
  another kind of staleness.  The dentry might have been absolutely
  stable from the RCU point of view (and we might be on UP, etc), but
  stale from the remote fs point of view.  If ->d_revalidate() returns
  "it's actually stale", dentry gets thrown away and the original code
  wouldn't even have looked at its ->d_flags.

  What we need is to check ->d_flags where 766c4cbfacd8 does (prior to
  ->d_seq validation) but only use the result in cases where we do not
  discard this dentry outright"

Reported-by: Leandro Awa <lawa@nvidia.com>
Link: https://bugzilla.kernel.org/show_bug.cgi?id=104911
Fixes: 766c4cbfacd8 ("namei: d_is_negative() should be checked...")
Tested-by: Leandro Awa <lawa@nvidia.com>
Cc: stable@vger.kernel.org # v4.1+
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/namei.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 726d211..33e9495 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1558,8 +1558,6 @@ static int lookup_fast(struct nameidata *nd,
 		negative = d_is_negative(dentry);
 		if (read_seqcount_retry(&dentry->d_seq, seq))
 			return -ECHILD;
-		if (negative)
-			return -ENOENT;
 
 		/*
 		 * This sequence count validates that the parent had no
@@ -1580,6 +1578,12 @@ static int lookup_fast(struct nameidata *nd,
 				goto unlazy;
 			}
 		}
+		/*
+		 * Note: do negative dentry check after revalidation in
+		 * case that drops it.
+		 */
+		if (negative)
+			return -ENOENT;
 		path->mnt = mnt;
 		path->dentry = dentry;
 		if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
-- 
cgit v1.1


From bb230c124730f21eea13deab433f9f8fc96bd5f3 Mon Sep 17 00:00:00 2001
From: Bill O'Donnell <billodo@redhat.com>
Date: Mon, 12 Oct 2015 05:15:45 +1100
Subject: xfs: create global stats and stats_clear in sysfs

Currently, xfs global stats are in procfs. This patch introduces
(replicates) the global stats in sysfs. Additionally a stats_clear file
is introduced in sysfs.

Signed-off-by: Bill O'Donnell <billodo@redhat.com>
Reviewed-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_stats.c  | 82 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_stats.h  |  2 ++
 fs/xfs/xfs_super.c  | 20 +++++++++----
 fs/xfs/xfs_sysctl.c | 15 ++--------
 fs/xfs/xfs_sysfs.c  | 75 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_sysfs.h  |  1 +
 6 files changed, 178 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index f224038..e19b84a 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -29,6 +29,88 @@ static int counter_val(int idx)
 	return val;
 }
 
+int xfs_stats_format(char *buf)
+{
+	int		i, j;
+	int		len = 0;
+	__uint64_t	xs_xstrat_bytes = 0;
+	__uint64_t	xs_write_bytes = 0;
+	__uint64_t	xs_read_bytes = 0;
+
+	static const struct xstats_entry {
+		char	*desc;
+		int	endpoint;
+	} xstats[] = {
+		{ "extent_alloc",	XFSSTAT_END_EXTENT_ALLOC	},
+		{ "abt",		XFSSTAT_END_ALLOC_BTREE		},
+		{ "blk_map",		XFSSTAT_END_BLOCK_MAPPING	},
+		{ "bmbt",		XFSSTAT_END_BLOCK_MAP_BTREE	},
+		{ "dir",		XFSSTAT_END_DIRECTORY_OPS	},
+		{ "trans",		XFSSTAT_END_TRANSACTIONS	},
+		{ "ig",			XFSSTAT_END_INODE_OPS		},
+		{ "log",		XFSSTAT_END_LOG_OPS		},
+		{ "push_ail",		XFSSTAT_END_TAIL_PUSHING	},
+		{ "xstrat",		XFSSTAT_END_WRITE_CONVERT	},
+		{ "rw",			XFSSTAT_END_READ_WRITE_OPS	},
+		{ "attr",		XFSSTAT_END_ATTRIBUTE_OPS	},
+		{ "icluster",		XFSSTAT_END_INODE_CLUSTER	},
+		{ "vnodes",		XFSSTAT_END_VNODE_OPS		},
+		{ "buf",		XFSSTAT_END_BUF			},
+		{ "abtb2",		XFSSTAT_END_ABTB_V2		},
+		{ "abtc2",		XFSSTAT_END_ABTC_V2		},
+		{ "bmbt2",		XFSSTAT_END_BMBT_V2		},
+		{ "ibt2",		XFSSTAT_END_IBT_V2		},
+		{ "fibt2",		XFSSTAT_END_FIBT_V2		},
+		/* we print both series of quota information together */
+		{ "qm",			XFSSTAT_END_QM			},
+	};
+
+	/* Loop over all stats groups */
+
+	for (i = j = 0; i < ARRAY_SIZE(xstats); i++) {
+		len += snprintf(buf + len, PATH_MAX - len, "%s",
+				xstats[i].desc);
+		/* inner loop does each group */
+		for (; j < xstats[i].endpoint; j++)
+			len += snprintf(buf + len, PATH_MAX - len, " %u",
+					counter_val(j));
+		len += snprintf(buf + len, PATH_MAX - len, "\n");
+	}
+	/* extra precision counters */
+	for_each_possible_cpu(i) {
+		xs_xstrat_bytes += per_cpu(xfsstats, i).xs_xstrat_bytes;
+		xs_write_bytes += per_cpu(xfsstats, i).xs_write_bytes;
+		xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes;
+	}
+
+	len += snprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n",
+			xs_xstrat_bytes, xs_write_bytes, xs_read_bytes);
+	len += snprintf(buf + len, PATH_MAX-len, "debug %u\n",
+#if defined(DEBUG)
+		1);
+#else
+		0);
+#endif
+
+	return len;
+}
+
+void xfs_stats_clearall(void)
+{
+	int		c;
+	__uint32_t	vn_active;
+
+	xfs_notice(NULL, "Clearing xfsstats");
+	for_each_possible_cpu(c) {
+		preempt_disable();
+		/* save vn_active, it's a universal truth! */
+		vn_active = per_cpu(xfsstats, c).vn_active;
+		memset(&per_cpu(xfsstats, c), 0, sizeof(struct xfsstats));
+		per_cpu(xfsstats, c).vn_active = vn_active;
+		preempt_enable();
+	}
+}
+
 static int xfs_stat_proc_show(struct seq_file *m, void *v)
 {
 	int		i, j;
diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h
index c8f238b..18807b5 100644
--- a/fs/xfs/xfs_stats.h
+++ b/fs/xfs/xfs_stats.h
@@ -18,6 +18,8 @@
 #ifndef __XFS_STATS_H__
 #define __XFS_STATS_H__
 
+int xfs_stats_format(char *buf);
+void xfs_stats_clearall(void);
 
 #if defined(CONFIG_PROC_FS) && !defined(XFS_STATS_OFF)
 
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 904f637c..0dfc53b 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -61,6 +61,7 @@ static kmem_zone_t *xfs_ioend_zone;
 mempool_t *xfs_ioend_pool;
 
 static struct kset *xfs_kset;		/* top-level xfs sysfs dir */
+static struct xfs_kobj xfs_stats_kobj;	/* global stats sysfs attrs */
 #ifdef DEBUG
 static struct xfs_kobj xfs_dbg_kobj;	/* global debug sysfs attrs */
 #endif
@@ -1838,19 +1839,25 @@ init_xfs_fs(void)
 	xfs_kset = kset_create_and_add("xfs", NULL, fs_kobj);
 	if (!xfs_kset) {
 		error = -ENOMEM;
-		goto out_sysctl_unregister;;
+		goto out_sysctl_unregister;
 	}
 
+	xfs_stats_kobj.kobject.kset = xfs_kset;
+	error = xfs_sysfs_init(&xfs_stats_kobj, &xfs_stats_ktype, NULL,
+			       "stats");
+	if (error)
+		goto out_kset_unregister;
+
 #ifdef DEBUG
 	xfs_dbg_kobj.kobject.kset = xfs_kset;
 	error = xfs_sysfs_init(&xfs_dbg_kobj, &xfs_dbg_ktype, NULL, "debug");
 	if (error)
-		goto out_kset_unregister;
+		goto out_remove_stats_kobj;
 #endif
 
 	error = xfs_qm_init();
 	if (error)
-		goto out_remove_kobj;
+		goto out_remove_dbg_kobj;
 
 	error = register_filesystem(&xfs_fs_type);
 	if (error)
@@ -1859,11 +1866,13 @@ init_xfs_fs(void)
 
  out_qm_exit:
 	xfs_qm_exit();
- out_remove_kobj:
+ out_remove_dbg_kobj:
 #ifdef DEBUG
 	xfs_sysfs_del(&xfs_dbg_kobj);
- out_kset_unregister:
+ out_remove_stats_kobj:
 #endif
+	xfs_sysfs_del(&xfs_stats_kobj);
+ out_kset_unregister:
 	kset_unregister(xfs_kset);
  out_sysctl_unregister:
 	xfs_sysctl_unregister();
@@ -1889,6 +1898,7 @@ exit_xfs_fs(void)
 #ifdef DEBUG
 	xfs_sysfs_del(&xfs_dbg_kobj);
 #endif
+	xfs_sysfs_del(&xfs_stats_kobj);
 	kset_unregister(xfs_kset);
 	xfs_sysctl_unregister();
 	xfs_cleanup_procfs();
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index a0c8067..5defabb 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -19,6 +19,7 @@
 #include <linux/sysctl.h>
 #include <linux/proc_fs.h>
 #include "xfs_error.h"
+#include "xfs_stats.h"
 
 static struct ctl_table_header *xfs_table_header;
 
@@ -31,22 +32,12 @@ xfs_stats_clear_proc_handler(
 	size_t			*lenp,
 	loff_t			*ppos)
 {
-	int		c, ret, *valp = ctl->data;
-	__uint32_t	vn_active;
+	int		ret, *valp = ctl->data;
 
 	ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
 
 	if (!ret && write && *valp) {
-		xfs_notice(NULL, "Clearing xfsstats");
-		for_each_possible_cpu(c) {
-			preempt_disable();
-			/* save vn_active, it's a universal truth! */
-			vn_active = per_cpu(xfsstats, c).vn_active;
-			memset(&per_cpu(xfsstats, c), 0,
-			       sizeof(struct xfsstats));
-			per_cpu(xfsstats, c).vn_active = vn_active;
-			preempt_enable();
-		}
+		xfs_stats_clearall();
 		xfs_stats_clear = 0;
 	}
 
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index aa03670..a094e20 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -21,6 +21,7 @@
 #include "xfs_log_format.h"
 #include "xfs_log.h"
 #include "xfs_log_priv.h"
+#include "xfs_stats.h"
 
 struct xfs_sysfs_attr {
 	struct attribute attr;
@@ -38,6 +39,8 @@ to_attr(struct attribute *attr)
 	static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_RW(name)
 #define XFS_SYSFS_ATTR_RO(name) \
 	static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_RO(name)
+#define XFS_SYSFS_ATTR_WO(name) \
+	static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_WO(name)
 
 #define ATTR_LIST(name) &xfs_sysfs_attr_##name.attr
 
@@ -125,6 +128,78 @@ struct kobj_type xfs_dbg_ktype = {
 
 #endif /* DEBUG */
 
+
+/* stats */
+
+STATIC ssize_t
+stats_show(
+	char	*buf,
+	void	*data)
+{
+	return xfs_stats_format(buf);
+}
+XFS_SYSFS_ATTR_RO(stats);
+
+STATIC ssize_t
+stats_clear_store(
+	const char	*buf,
+	size_t		count,
+	void		*data)
+{
+	int		ret;
+	int		val;
+
+	ret = kstrtoint(buf, 0, &val);
+	if (ret)
+		return ret;
+
+	if (val != 1)
+		return -EINVAL;
+	xfs_stats_clearall();
+	return count;
+}
+XFS_SYSFS_ATTR_WO(stats_clear);
+
+static struct attribute *xfs_stats_attrs[] = {
+	ATTR_LIST(stats),
+	ATTR_LIST(stats_clear),
+	NULL,
+};
+
+STATIC ssize_t
+xfs_stats_show(
+	struct kobject		*kobject,
+	struct attribute	*attr,
+	char			*buf)
+{
+	struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+
+	return xfs_attr->show ? xfs_attr->show(buf, NULL) : 0;
+}
+
+STATIC ssize_t
+xfs_stats_store(
+	struct kobject		*kobject,
+	struct attribute	*attr,
+	const char		*buf,
+	size_t			count)
+{
+	struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+
+	return xfs_attr->store ? xfs_attr->store(buf, count, NULL) : 0;
+}
+
+static struct sysfs_ops xfs_stats_ops = {
+	.show = xfs_stats_show,
+	.store = xfs_stats_store,
+};
+
+struct kobj_type xfs_stats_ktype = {
+	.release = xfs_sysfs_release,
+	.sysfs_ops = &xfs_stats_ops,
+	.default_attrs = xfs_stats_attrs,
+};
+
 /* xlog */
 
 STATIC ssize_t
diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h
index 240eee3..be692e5 100644
--- a/fs/xfs/xfs_sysfs.h
+++ b/fs/xfs/xfs_sysfs.h
@@ -22,6 +22,7 @@
 extern struct kobj_type xfs_mp_ktype;	/* xfs_mount */
 extern struct kobj_type xfs_dbg_ktype;	/* debug */
 extern struct kobj_type xfs_log_ktype;	/* xlog */
+extern struct kobj_type xfs_stats_ktype;	/* stats */
 
 static inline struct xfs_kobj *
 to_kobj(struct kobject *kobject)
-- 
cgit v1.1


From 32f0ea05219e5212cafcbeaa255e627314f87e7e Mon Sep 17 00:00:00 2001
From: Bill O'Donnell <billodo@redhat.com>
Date: Mon, 12 Oct 2015 05:16:45 +1100
Subject: xfs: create symlink proc/fs/xfs/stat to sys/fs/xfs/stats

As a part of the work to move xfs global stats from procfs to sysfs,
this patch creates the symlink from proc/fs/xfs/stat to sys/fs/xfs/stats.

Signed-off-by: Bill O'Donnell <billodo@redhat.com>
Reviewed-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_stats.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index e19b84a..e6efebb 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -243,9 +243,10 @@ xfs_init_procfs(void)
 	if (!proc_mkdir("fs/xfs", NULL))
 		goto out;
 
-	if (!proc_create("fs/xfs/stat", 0, NULL,
-			 &xfs_stat_proc_fops))
+	if (!proc_symlink("fs/xfs/stat", NULL,
+			  "/sys/fs/xfs/stats/stats"))
 		goto out_remove_xfs_dir;
+
 #ifdef CONFIG_XFS_QUOTA
 	if (!proc_create("fs/xfs/xqmstat", 0, NULL,
 			 &xqmstat_proc_fops))
-- 
cgit v1.1


From 50cf5b7402633265fe11430fd63af82c401fad46 Mon Sep 17 00:00:00 2001
From: Bill O'Donnell <billodo@redhat.com>
Date: Mon, 12 Oct 2015 05:17:45 +1100
Subject: xfs: remove unused procfs code

As a part of the work to move xfs global stats from procfs to sysfs,
this patch removes the now unused procfs code that was xfs stat specific.

Signed-off-by: Bill O'Donnell <billodo@redhat.com>
Reviewed-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_stats.c | 74 ------------------------------------------------------
 1 file changed, 74 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index e6efebb..f481da0 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -111,80 +111,6 @@ void xfs_stats_clearall(void)
 	}
 }
 
-static int xfs_stat_proc_show(struct seq_file *m, void *v)
-{
-	int		i, j;
-	__uint64_t	xs_xstrat_bytes = 0;
-	__uint64_t	xs_write_bytes = 0;
-	__uint64_t	xs_read_bytes = 0;
-
-	static const struct xstats_entry {
-		char	*desc;
-		int	endpoint;
-	} xstats[] = {
-		{ "extent_alloc",	XFSSTAT_END_EXTENT_ALLOC	},
-		{ "abt",		XFSSTAT_END_ALLOC_BTREE		},
-		{ "blk_map",		XFSSTAT_END_BLOCK_MAPPING	},
-		{ "bmbt",		XFSSTAT_END_BLOCK_MAP_BTREE	},
-		{ "dir",		XFSSTAT_END_DIRECTORY_OPS	},
-		{ "trans",		XFSSTAT_END_TRANSACTIONS	},
-		{ "ig",			XFSSTAT_END_INODE_OPS		},
-		{ "log",		XFSSTAT_END_LOG_OPS		},
-		{ "push_ail",		XFSSTAT_END_TAIL_PUSHING	},
-		{ "xstrat",		XFSSTAT_END_WRITE_CONVERT	},
-		{ "rw",			XFSSTAT_END_READ_WRITE_OPS	},
-		{ "attr",		XFSSTAT_END_ATTRIBUTE_OPS	},
-		{ "icluster",		XFSSTAT_END_INODE_CLUSTER	},
-		{ "vnodes",		XFSSTAT_END_VNODE_OPS		},
-		{ "buf",		XFSSTAT_END_BUF			},
-		{ "abtb2",		XFSSTAT_END_ABTB_V2		},
-		{ "abtc2",		XFSSTAT_END_ABTC_V2		},
-		{ "bmbt2",		XFSSTAT_END_BMBT_V2		},
-		{ "ibt2",		XFSSTAT_END_IBT_V2		},
-		{ "fibt2",		XFSSTAT_END_FIBT_V2		},
-		/* we print both series of quota information together */
-		{ "qm",			XFSSTAT_END_QM			},
-	};
-
-	/* Loop over all stats groups */
-	for (i = j = 0; i < ARRAY_SIZE(xstats); i++) {
-		seq_printf(m, "%s", xstats[i].desc);
-		/* inner loop does each group */
-		for (; j < xstats[i].endpoint; j++)
-			seq_printf(m, " %u", counter_val(j));
-		seq_putc(m, '\n');
-	}
-	/* extra precision counters */
-	for_each_possible_cpu(i) {
-		xs_xstrat_bytes += per_cpu(xfsstats, i).xs_xstrat_bytes;
-		xs_write_bytes += per_cpu(xfsstats, i).xs_write_bytes;
-		xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes;
-	}
-
-	seq_printf(m, "xpc %Lu %Lu %Lu\n",
-			xs_xstrat_bytes, xs_write_bytes, xs_read_bytes);
-	seq_printf(m, "debug %u\n",
-#if defined(DEBUG)
-		1);
-#else
-		0);
-#endif
-	return 0;
-}
-
-static int xfs_stat_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, xfs_stat_proc_show, NULL);
-}
-
-static const struct file_operations xfs_stat_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= xfs_stat_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 /* legacy quota interfaces */
 #ifdef CONFIG_XFS_QUOTA
 static int xqm_proc_show(struct seq_file *m, void *v)
-- 
cgit v1.1


From a27c264009cb236dc209875043a0c237af46a1af Mon Sep 17 00:00:00 2001
From: Bill O'Donnell <billodo@redhat.com>
Date: Mon, 12 Oct 2015 05:18:45 +1100
Subject: xfs: consolidate sysfs ops

As a part of the series to move xfs global stats from procfs to sysfs,
this patch consolidates the sysfs ops functions and removes redundancy.

Signed-off-by: Bill O'Donnell <billodo@redhat.com>
Reviewed-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_sysfs.c | 182 +++++++++++++++++++----------------------------------
 1 file changed, 63 insertions(+), 119 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index a094e20..fa3dc66 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -25,8 +25,9 @@
 
 struct xfs_sysfs_attr {
 	struct attribute attr;
-	ssize_t (*show)(char *buf, void *data);
-	ssize_t (*store)(const char *buf, size_t count, void *data);
+	ssize_t (*show)(struct kobject *kobject, char *buf);
+	ssize_t (*store)(struct kobject *kobject, const char *buf,
+			 size_t count);
 };
 
 static inline struct xfs_sysfs_attr *
@@ -54,14 +55,42 @@ struct kobj_type xfs_mp_ktype = {
 	.release = xfs_sysfs_release,
 };
 
+STATIC ssize_t
+xfs_sysfs_object_show(
+	struct kobject		*kobject,
+	struct attribute	*attr,
+	char			*buf)
+{
+	struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+
+	return xfs_attr->show ? xfs_attr->show(kobject, buf) : 0;
+}
+
+STATIC ssize_t
+xfs_sysfs_object_store(
+	struct kobject		*kobject,
+	struct attribute	*attr,
+	const char		*buf,
+	size_t			count)
+{
+	struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+
+	return xfs_attr->store ? xfs_attr->store(kobject, buf, count) : 0;
+}
+
+static const struct sysfs_ops xfs_sysfs_ops = {
+	.show = xfs_sysfs_object_show,
+	.store = xfs_sysfs_object_store,
+};
+
 #ifdef DEBUG
 /* debug */
 
 STATIC ssize_t
 log_recovery_delay_store(
+	struct kobject	*kobject,
 	const char	*buf,
-	size_t		count,
-	void		*data)
+	size_t		count)
 {
 	int		ret;
 	int		val;
@@ -80,8 +109,8 @@ log_recovery_delay_store(
 
 STATIC ssize_t
 log_recovery_delay_show(
-	char	*buf,
-	void	*data)
+	struct kobject	*kobject,
+	char		*buf)
 {
 	return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.log_recovery_delay);
 }
@@ -92,49 +121,20 @@ static struct attribute *xfs_dbg_attrs[] = {
 	NULL,
 };
 
-STATIC ssize_t
-xfs_dbg_show(
-	struct kobject		*kobject,
-	struct attribute	*attr,
-	char			*buf)
-{
-	struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
-
-	return xfs_attr->show ? xfs_attr->show(buf, NULL) : 0;
-}
-
-STATIC ssize_t
-xfs_dbg_store(
-	struct kobject		*kobject,
-	struct attribute	*attr,
-	const char		*buf,
-	size_t			count)
-{
-	struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
-
-	return xfs_attr->store ? xfs_attr->store(buf, count, NULL) : 0;
-}
-
-static struct sysfs_ops xfs_dbg_ops = {
-	.show = xfs_dbg_show,
-	.store = xfs_dbg_store,
-};
-
 struct kobj_type xfs_dbg_ktype = {
 	.release = xfs_sysfs_release,
-	.sysfs_ops = &xfs_dbg_ops,
+	.sysfs_ops = &xfs_sysfs_ops,
 	.default_attrs = xfs_dbg_attrs,
 };
 
 #endif /* DEBUG */
 
-
 /* stats */
 
 STATIC ssize_t
 stats_show(
-	char	*buf,
-	void	*data)
+	struct kobject	*kobject,
+	char		*buf)
 {
 	return xfs_stats_format(buf);
 }
@@ -142,9 +142,9 @@ XFS_SYSFS_ATTR_RO(stats);
 
 STATIC ssize_t
 stats_clear_store(
+	struct kobject	*kobject,
 	const char	*buf,
-	size_t		count,
-	void		*data)
+	size_t		count)
 {
 	int		ret;
 	int		val;
@@ -166,50 +166,30 @@ static struct attribute *xfs_stats_attrs[] = {
 	NULL,
 };
 
-STATIC ssize_t
-xfs_stats_show(
-	struct kobject		*kobject,
-	struct attribute	*attr,
-	char			*buf)
-{
-	struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
-
-	return xfs_attr->show ? xfs_attr->show(buf, NULL) : 0;
-}
-
-STATIC ssize_t
-xfs_stats_store(
-	struct kobject		*kobject,
-	struct attribute	*attr,
-	const char		*buf,
-	size_t			count)
-{
-	struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
-
-	return xfs_attr->store ? xfs_attr->store(buf, count, NULL) : 0;
-}
-
-static struct sysfs_ops xfs_stats_ops = {
-	.show = xfs_stats_show,
-	.store = xfs_stats_store,
-};
-
 struct kobj_type xfs_stats_ktype = {
 	.release = xfs_sysfs_release,
-	.sysfs_ops = &xfs_stats_ops,
+	.sysfs_ops = &xfs_sysfs_ops,
 	.default_attrs = xfs_stats_attrs,
 };
 
 /* xlog */
 
+static inline struct xlog *
+to_xlog(struct kobject *kobject)
+{
+	struct xfs_kobj *kobj = to_kobj(kobject);
+
+	return container_of(kobj, struct xlog, l_kobj);
+}
+
 STATIC ssize_t
 log_head_lsn_show(
-	char	*buf,
-	void	*data)
+	struct kobject	*kobject,
+	char		*buf)
 {
-	struct xlog *log = data;
 	int cycle;
 	int block;
+	struct xlog *log = to_xlog(kobject);
 
 	spin_lock(&log->l_icloglock);
 	cycle = log->l_curr_cycle;
@@ -222,12 +202,12 @@ XFS_SYSFS_ATTR_RO(log_head_lsn);
 
 STATIC ssize_t
 log_tail_lsn_show(
-	char	*buf,
-	void	*data)
+	struct kobject	*kobject,
+	char		*buf)
 {
-	struct xlog *log = data;
 	int cycle;
 	int block;
+	struct xlog *log = to_xlog(kobject);
 
 	xlog_crack_atomic_lsn(&log->l_tail_lsn, &cycle, &block);
 	return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, block);
@@ -236,12 +216,13 @@ XFS_SYSFS_ATTR_RO(log_tail_lsn);
 
 STATIC ssize_t
 reserve_grant_head_show(
-	char	*buf,
-	void	*data)
+	struct kobject	*kobject,
+	char		*buf)
+
 {
-	struct xlog *log = data;
 	int cycle;
 	int bytes;
+	struct xlog *log = to_xlog(kobject);
 
 	xlog_crack_grant_head(&log->l_reserve_head.grant, &cycle, &bytes);
 	return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes);
@@ -250,12 +231,12 @@ XFS_SYSFS_ATTR_RO(reserve_grant_head);
 
 STATIC ssize_t
 write_grant_head_show(
-	char	*buf,
-	void	*data)
+	struct kobject	*kobject,
+	char		*buf)
 {
-	struct xlog *log = data;
 	int cycle;
 	int bytes;
+	struct xlog *log = to_xlog(kobject);
 
 	xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &bytes);
 	return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes);
@@ -270,45 +251,8 @@ static struct attribute *xfs_log_attrs[] = {
 	NULL,
 };
 
-static inline struct xlog *
-to_xlog(struct kobject *kobject)
-{
-	struct xfs_kobj *kobj = to_kobj(kobject);
-	return container_of(kobj, struct xlog, l_kobj);
-}
-
-STATIC ssize_t
-xfs_log_show(
-	struct kobject		*kobject,
-	struct attribute	*attr,
-	char			*buf)
-{
-	struct xlog *log = to_xlog(kobject);
-	struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
-
-	return xfs_attr->show ? xfs_attr->show(buf, log) : 0;
-}
-
-STATIC ssize_t
-xfs_log_store(
-	struct kobject		*kobject,
-	struct attribute	*attr,
-	const char		*buf,
-	size_t			count)
-{
-	struct xlog *log = to_xlog(kobject);
-	struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
-
-	return xfs_attr->store ? xfs_attr->store(buf, count, log) : 0;
-}
-
-static struct sysfs_ops xfs_log_ops = {
-	.show = xfs_log_show,
-	.store = xfs_log_store,
-};
-
 struct kobj_type xfs_log_ktype = {
 	.release = xfs_sysfs_release,
-	.sysfs_ops = &xfs_log_ops,
+	.sysfs_ops = &xfs_sysfs_ops,
 	.default_attrs = xfs_log_attrs,
 };
-- 
cgit v1.1


From 80529c45ab66188a0b3814ba31409b31f5fcb53d Mon Sep 17 00:00:00 2001
From: Bill O'Donnell <billodo@redhat.com>
Date: Mon, 12 Oct 2015 05:19:45 +1100
Subject: xfs: pass xfsstats structures to handlers and macros

This patch is the next step toward per-fs xfs stats. The patch makes
the show and clear routines able to handle any stats structure
associated with a kobject.

Instead of a single global xfsstats structure, add kobject and a pointer
to a per-cpu struct xfsstats. Modify the macros that manipulate the stats
accordingly: XFS_STATS_INC, XFS_STATS_DEC, and XFS_STATS_ADD now access
xfsstats->xs_stats.

The sysfs functions need to get from the kobject back to the xfsstats
structure which contains it, and pass the pointer to the ->xs_stats
percpu structure into the show & clear routines.

Signed-off-by: Bill O'Donnell <billodo@redhat.com>
Reviewed-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_linux.h  |  7 +++++++
 fs/xfs/xfs_stats.c  | 32 +++++++++++++++-----------------
 fs/xfs/xfs_stats.h  | 22 +++++++++++-----------
 fs/xfs/xfs_super.c  | 21 +++++++++++++++------
 fs/xfs/xfs_sysctl.c |  2 +-
 fs/xfs/xfs_sysfs.c  | 16 ++++++++++++++--
 6 files changed, 63 insertions(+), 37 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 85f883d..ec0e239 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -171,6 +171,13 @@ struct xfs_kobj {
 	struct completion	complete;
 };
 
+struct xstats {
+	struct xfsstats __percpu	*xs_stats;
+	struct xfs_kobj			xs_kobj;
+};
+
+extern struct xstats xfsstats;
+
 /* Kernel uid/gid conversion. These are used to convert to/from the on disk
  * uid_t/gid_t types to the kuid_t/kgid_t types that the kernel uses internally.
  * The conversion here is type only, the value will remain the same since we
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index f481da0..3348b0b 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -18,18 +18,18 @@
 #include "xfs.h"
 #include <linux/proc_fs.h>
 
-DEFINE_PER_CPU(struct xfsstats, xfsstats);
+struct xstats xfsstats;
 
-static int counter_val(int idx)
+static int counter_val(struct xfsstats __percpu *stats, int idx)
 {
 	int val = 0, cpu;
 
 	for_each_possible_cpu(cpu)
-		val += *(((__u32 *)&per_cpu(xfsstats, cpu) + idx));
+		val += *(((__u32 *)per_cpu_ptr(stats, cpu) + idx));
 	return val;
 }
 
-int xfs_stats_format(char *buf)
+int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
 {
 	int		i, j;
 	int		len = 0;
@@ -73,14 +73,14 @@ int xfs_stats_format(char *buf)
 		/* inner loop does each group */
 		for (; j < xstats[i].endpoint; j++)
 			len += snprintf(buf + len, PATH_MAX - len, " %u",
-					counter_val(j));
+					counter_val(stats, j));
 		len += snprintf(buf + len, PATH_MAX - len, "\n");
 	}
 	/* extra precision counters */
 	for_each_possible_cpu(i) {
-		xs_xstrat_bytes += per_cpu(xfsstats, i).xs_xstrat_bytes;
-		xs_write_bytes += per_cpu(xfsstats, i).xs_write_bytes;
-		xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes;
+		xs_xstrat_bytes += per_cpu_ptr(stats, i)->xs_xstrat_bytes;
+		xs_write_bytes += per_cpu_ptr(stats, i)->xs_write_bytes;
+		xs_read_bytes += per_cpu_ptr(stats, i)->xs_read_bytes;
 	}
 
 	len += snprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n",
@@ -95,7 +95,7 @@ int xfs_stats_format(char *buf)
 	return len;
 }
 
-void xfs_stats_clearall(void)
+void xfs_stats_clearall(struct xfsstats __percpu *stats)
 {
 	int		c;
 	__uint32_t	vn_active;
@@ -104,9 +104,9 @@ void xfs_stats_clearall(void)
 	for_each_possible_cpu(c) {
 		preempt_disable();
 		/* save vn_active, it's a universal truth! */
-		vn_active = per_cpu(xfsstats, c).vn_active;
-		memset(&per_cpu(xfsstats, c), 0, sizeof(struct xfsstats));
-		per_cpu(xfsstats, c).vn_active = vn_active;
+		vn_active = per_cpu_ptr(stats, c)->vn_active;
+		memset(per_cpu_ptr(stats, c), 0, sizeof(*stats));
+		per_cpu_ptr(stats, c)->vn_active = vn_active;
 		preempt_enable();
 	}
 }
@@ -117,10 +117,8 @@ static int xqm_proc_show(struct seq_file *m, void *v)
 {
 	/* maximum; incore; ratio free to inuse; freelist */
 	seq_printf(m, "%d\t%d\t%d\t%u\n",
-			0,
-			counter_val(XFSSTAT_END_XQMSTAT),
-			0,
-			counter_val(XFSSTAT_END_XQMSTAT + 1));
+		   0, counter_val(xfsstats.xs_stats, XFSSTAT_END_XQMSTAT),
+		   0, counter_val(xfsstats.xs_stats, XFSSTAT_END_XQMSTAT + 1));
 	return 0;
 }
 
@@ -144,7 +142,7 @@ static int xqmstat_proc_show(struct seq_file *m, void *v)
 
 	seq_printf(m, "qm");
 	for (j = XFSSTAT_END_IBT_V2; j < XFSSTAT_END_XQMSTAT; j++)
-		seq_printf(m, " %u", counter_val(j));
+		seq_printf(m, " %u", counter_val(xfsstats.xs_stats, j));
 	seq_putc(m, '\n');
 	return 0;
 }
diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h
index 18807b5..54f2260 100644
--- a/fs/xfs/xfs_stats.h
+++ b/fs/xfs/xfs_stats.h
@@ -18,9 +18,6 @@
 #ifndef __XFS_STATS_H__
 #define __XFS_STATS_H__
 
-int xfs_stats_format(char *buf);
-void xfs_stats_clearall(void);
-
 #if defined(CONFIG_PROC_FS) && !defined(XFS_STATS_OFF)
 
 #include <linux/percpu.h>
@@ -217,15 +214,18 @@ struct xfsstats {
 	__uint64_t		xs_read_bytes;
 };
 
-DECLARE_PER_CPU(struct xfsstats, xfsstats);
+int xfs_stats_format(struct xfsstats __percpu *stats, char *buf);
+void xfs_stats_clearall(struct xfsstats __percpu *stats);
+extern struct xstats xfsstats;
 
-/*
- * We don't disable preempt, not too worried about poking the
- * wrong CPU's stat for now (also aggregated before reporting).
- */
-#define XFS_STATS_INC(v)	(per_cpu(xfsstats, current_cpu()).v++)
-#define XFS_STATS_DEC(v)	(per_cpu(xfsstats, current_cpu()).v--)
-#define XFS_STATS_ADD(v, inc)	(per_cpu(xfsstats, current_cpu()).v += (inc))
+#define XFS_STATS_INC(v)	\
+	(per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v++)
+
+#define XFS_STATS_DEC(v)	\
+	(per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v--)
+
+#define XFS_STATS_ADD(v, inc)	\
+	(per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v += (inc))
 
 extern int xfs_init_procfs(void);
 extern void xfs_cleanup_procfs(void);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 0dfc53b..e1a35a5 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -61,7 +61,6 @@ static kmem_zone_t *xfs_ioend_zone;
 mempool_t *xfs_ioend_pool;
 
 static struct kset *xfs_kset;		/* top-level xfs sysfs dir */
-static struct xfs_kobj xfs_stats_kobj;	/* global stats sysfs attrs */
 #ifdef DEBUG
 static struct xfs_kobj xfs_dbg_kobj;	/* global debug sysfs attrs */
 #endif
@@ -1842,11 +1841,18 @@ init_xfs_fs(void)
 		goto out_sysctl_unregister;
 	}
 
-	xfs_stats_kobj.kobject.kset = xfs_kset;
-	error = xfs_sysfs_init(&xfs_stats_kobj, &xfs_stats_ktype, NULL,
+	xfsstats.xs_kobj.kobject.kset = xfs_kset;
+
+	xfsstats.xs_stats = alloc_percpu(struct xfsstats);
+	if (!xfsstats.xs_stats) {
+		error = -ENOMEM;
+		goto out_kset_unregister;
+	}
+
+	error = xfs_sysfs_init(&xfsstats.xs_kobj, &xfs_stats_ktype, NULL,
 			       "stats");
 	if (error)
-		goto out_kset_unregister;
+		goto out_free_stats;
 
 #ifdef DEBUG
 	xfs_dbg_kobj.kobject.kset = xfs_kset;
@@ -1871,7 +1877,9 @@ init_xfs_fs(void)
 	xfs_sysfs_del(&xfs_dbg_kobj);
  out_remove_stats_kobj:
 #endif
-	xfs_sysfs_del(&xfs_stats_kobj);
+	xfs_sysfs_del(&xfsstats.xs_kobj);
+ out_free_stats:
+	free_percpu(xfsstats.xs_stats);
  out_kset_unregister:
 	kset_unregister(xfs_kset);
  out_sysctl_unregister:
@@ -1898,7 +1906,8 @@ exit_xfs_fs(void)
 #ifdef DEBUG
 	xfs_sysfs_del(&xfs_dbg_kobj);
 #endif
-	xfs_sysfs_del(&xfs_stats_kobj);
+	xfs_sysfs_del(&xfsstats.xs_kobj);
+	free_percpu(xfsstats.xs_stats);
 	kset_unregister(xfs_kset);
 	xfs_sysctl_unregister();
 	xfs_cleanup_procfs();
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index 5defabb..aed74d3 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -37,7 +37,7 @@ xfs_stats_clear_proc_handler(
 	ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
 
 	if (!ret && write && *valp) {
-		xfs_stats_clearall();
+		xfs_stats_clearall(xfsstats.xs_stats);
 		xfs_stats_clear = 0;
 	}
 
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index fa3dc66..ee70f5d 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -131,12 +131,22 @@ struct kobj_type xfs_dbg_ktype = {
 
 /* stats */
 
+static inline struct xstats *
+to_xstats(struct kobject *kobject)
+{
+	struct xfs_kobj *kobj = to_kobj(kobject);
+
+	return container_of(kobj, struct xstats, xs_kobj);
+}
+
 STATIC ssize_t
 stats_show(
 	struct kobject	*kobject,
 	char		*buf)
 {
-	return xfs_stats_format(buf);
+	struct xstats	*stats = to_xstats(kobject);
+
+	return xfs_stats_format(stats->xs_stats, buf);
 }
 XFS_SYSFS_ATTR_RO(stats);
 
@@ -148,6 +158,7 @@ stats_clear_store(
 {
 	int		ret;
 	int		val;
+	struct xstats	*stats = to_xstats(kobject);
 
 	ret = kstrtoint(buf, 0, &val);
 	if (ret)
@@ -155,7 +166,8 @@ stats_clear_store(
 
 	if (val != 1)
 		return -EINVAL;
-	xfs_stats_clearall();
+
+	xfs_stats_clearall(stats->xs_stats);
 	return count;
 }
 XFS_SYSFS_ATTR_WO(stats_clear);
-- 
cgit v1.1


From 5cb13dcd0fac071b45c4bebe1801a08ff0d89cad Mon Sep 17 00:00:00 2001
From: Zhaohongjiang <zhaohongjiang@huawei.com>
Date: Mon, 12 Oct 2015 15:28:39 +1100
Subject: cancel the setfilesize transation when io error happen

When I ran xfstest/073 case, the remount process was blocked to wait
transactions to be zero. I found there was a io error happened, and
the setfilesize transaction was not released properly. We should add
the changes to cancel the io error in this case.

Reproduction steps:
1. dd if=/dev/zero of=xfs1.img bs=1M count=2048
2. mkfs.xfs xfs1.img
3. losetup -f ./xfs1.img /dev/loop0
4. mount -t xfs /dev/loop0 /home/test_dir/
5. mkdir /home/test_dir/test
6. mkfs.xfs -dfile,name=image,size=2g
7. mount -t xfs -o loop image /home/test_dir/test
8. cp a file bigger than 2g to /home/test_dir/test
9. mount -t xfs -o remount,ro /home/test_dir/test

[ dchinner: moved io error detection to xfs_setfilesize_ioend() after
  transaction context restoration. ]

Signed-off-by: Zhao Hongjiang <zhaohongjiang@huawei.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_aops.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 50ab287..e485e31 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -172,6 +172,12 @@ xfs_setfilesize_ioend(
 	current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
 	__sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);
 
+	/* we abort the update if there was an IO error */
+	if (ioend->io_error) {
+		xfs_trans_cancel(tp);
+		return ioend->io_error;
+	}
+
 	return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
 }
 
@@ -212,14 +218,17 @@ xfs_end_io(
 		ioend->io_error = -EIO;
 		goto done;
 	}
-	if (ioend->io_error)
-		goto done;
 
 	/*
 	 * For unwritten extents we need to issue transactions to convert a
 	 * range to normal written extens after the data I/O has finished.
+	 * Detecting and handling completion IO errors is done individually
+	 * for each case as different cleanup operations need to be performed
+	 * on error.
 	 */
 	if (ioend->io_type == XFS_IO_UNWRITTEN) {
+		if (ioend->io_error)
+			goto done;
 		error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
 						  ioend->io_size);
 	} else if (ioend->io_append_trans) {
-- 
cgit v1.1


From 009c6e871e98aa23bc2e58474c3d9feb05dd1ae6 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Mon, 12 Oct 2015 15:34:20 +1100
Subject: xfs: add missing ilock around dio write last extent alignment

The iomap codepath (via get_blocks()) acquires and release the inode
lock in the case of a direct write that requires block allocation. This
is because xfs_iomap_write_direct() allocates a transaction, which means
the ilock must be dropped and reacquired after the transaction is
allocated and reserved.

xfs_iomap_write_direct() invokes xfs_iomap_eof_align_last_fsb() before
the transaction is created and thus before the ilock is reacquired. This
can lead to calls to xfs_iread_extents() and reads of the in-core extent
list without any synchronization (via xfs_bmap_eof() and
xfs_bmap_last_extent()). xfs_iread_extents() assert fails if the ilock
is not held, but this is not currently seen in practice as the current
callers had already invoked xfs_bmapi_read().

What has been seen in practice are reports of crashes down in the
xfs_bmap_eof() codepath on direct writes due to seemingly bogus pointer
references from xfs_iext_get_ext(). While an explicit reproducer is not
currently available to confirm the cause of the problem, crash analysis
and code inspection from David Jeffrey had identified the insufficient
locking.

xfs_iomap_eof_align_last_fsb() is called from other contexts with the
inode lock already held, so we cannot acquire it therein.
__xfs_get_blocks() acquires and drops the ilock with variable flags to
cover the event that the extent list must be read in. The common case is
that __xfs_get_blocks() acquires the shared ilock. To provide locking
around the last extent alignment call without adding more lock cycles to
the dio path, update xfs_iomap_write_direct() to expect the shared ilock
held on entry and do the extent alignment under its protection. Demote
the lock, if necessary, from __xfs_get_blocks() and push the
xfs_qm_dqattach() call outside of the shared lock critical section.
Also, add an assert to document that the extent list is always expected
to be present in this path. Otherwise, we risk a call to
xfs_iread_extents() while under the shared ilock. This is safe as all
current callers have executed an xfs_bmapi_read() call under the current
iolock context.

Reported-by: David Jeffery <djeffery@redhat.com>
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_aops.c  | 10 +++++-----
 fs/xfs/xfs_iomap.c | 33 ++++++++++++++++++++++++++-------
 fs/xfs/xfs_pnfs.c  |  5 +++++
 3 files changed, 36 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index e485e31..e4fff58 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1408,12 +1408,12 @@ __xfs_get_blocks(
 	      imap.br_startblock == DELAYSTARTBLOCK))) {
 		if (direct || xfs_get_extsz_hint(ip)) {
 			/*
-			 * Drop the ilock in preparation for starting the block
-			 * allocation transaction.  It will be retaken
-			 * exclusively inside xfs_iomap_write_direct for the
-			 * actual allocation.
+			 * xfs_iomap_write_direct() expects the shared lock. It
+			 * is unlocked on return.
 			 */
-			xfs_iunlock(ip, lockmode);
+			if (lockmode == XFS_ILOCK_EXCL)
+				xfs_ilock_demote(ip, lockmode);
+
 			error = xfs_iomap_write_direct(ip, offset, size,
 						       &imap, nimaps);
 			if (error)
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 1f86033..1beda33 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -131,20 +131,29 @@ xfs_iomap_write_direct(
 	uint		qblocks, resblks, resrtextents;
 	int		committed;
 	int		error;
-
-	error = xfs_qm_dqattach(ip, 0);
-	if (error)
-		return error;
+	int		lockmode;
 
 	rt = XFS_IS_REALTIME_INODE(ip);
 	extsz = xfs_get_extsz_hint(ip);
+	lockmode = XFS_ILOCK_SHARED;	/* locked by caller */
+
+	ASSERT(xfs_isilocked(ip, lockmode));
 
 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
 	last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
 	if ((offset + count) > XFS_ISIZE(ip)) {
+		/*
+		 * Assert that the in-core extent list is present since this can
+		 * call xfs_iread_extents() and we only have the ilock shared.
+		 * This should be safe because the lock was held around a bmapi
+		 * call in the caller and we only need it to access the in-core
+		 * list.
+		 */
+		ASSERT(XFS_IFORK_PTR(ip, XFS_DATA_FORK)->if_flags &
+								XFS_IFEXTENTS);
 		error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
 		if (error)
-			return error;
+			goto out_unlock;
 	} else {
 		if (nmaps && (imap->br_startblock == HOLESTARTBLOCK))
 			last_fsb = MIN(last_fsb, (xfs_fileoff_t)
@@ -174,6 +183,15 @@ xfs_iomap_write_direct(
 	}
 
 	/*
+	 * Drop the shared lock acquired by the caller, attach the dquot if
+	 * necessary and move on to transaction setup.
+	 */
+	xfs_iunlock(ip, lockmode);
+	error = xfs_qm_dqattach(ip, 0);
+	if (error)
+		return error;
+
+	/*
 	 * Allocate and setup the transaction
 	 */
 	tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
@@ -187,7 +205,8 @@ xfs_iomap_write_direct(
 		return error;
 	}
 
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	lockmode = XFS_ILOCK_EXCL;
+	xfs_ilock(ip, lockmode);
 
 	error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
 	if (error)
@@ -229,7 +248,7 @@ xfs_iomap_write_direct(
 		error = xfs_alert_fsblock_zero(ip, imap);
 
 out_unlock:
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	xfs_iunlock(ip, lockmode);
 	return error;
 
 out_bmap_cancel:
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index ab4a606..dc62219 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -181,6 +181,11 @@ xfs_fs_map_blocks(
 		ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
 
 		if (!nimaps || imap.br_startblock == HOLESTARTBLOCK) {
+			/*
+			 * xfs_iomap_write_direct() expects to take ownership of
+			 * the shared ilock.
+			 */
+			xfs_ilock(ip, XFS_ILOCK_SHARED);
 			error = xfs_iomap_write_direct(ip, offset, length,
 						       &imap, nimaps);
 			if (error)
-- 
cgit v1.1


From b7cdc66be54b64daef593894d12ecc405f117829 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Mon, 12 Oct 2015 15:40:24 +1100
Subject: xfs: log local to remote symlink conversions correctly on v5 supers

A local format symlink inode is converted to extent format when an
extended attribute is set on an inode as part of the attribute fork
creation. This means a block is allocated, the local symlink target name
is copied to the block and the block is logged. Currently,
xfs_bmap_local_to_extents() handles logging the remote block data based
on the size of the data fork prior to the conversion. This is not
correct on v5 superblock filesystems, which add an additional header to
remote symlink blocks that is nonexistent in local format inodes.

As a result, the full length of the remote symlink block content is not
logged. This can lead to corruption should a crash occur and log
recovery replay this transaction.

Since a callout is already used to initialize the new remote symlink
block, update the local-to-extents conversion mechanism to make the
callout also responsible for logging the block. It is already required
to set the log buffer type and format the block appropriately based on
the superblock version. This ensures the remote symlink is always logged
correctly. Note that xfs_bmap_local_to_extents() is only called for
symlinks so there are no other callouts that require modification.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/libxfs/xfs_bmap.c           | 10 ++++++----
 fs/xfs/libxfs/xfs_symlink_remote.c |  3 +++
 2 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 8e2010d..b5cb61e 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -948,14 +948,16 @@ xfs_bmap_local_to_extents(
 	bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
 
 	/*
-	 * Initialise the block and copy the data
+	 * Initialize the block, copy the data and log the remote buffer.
 	 *
-	 * Note: init_fn must set the buffer log item type correctly!
+	 * The callout is responsible for logging because the remote format
+	 * might differ from the local format and thus we don't know how much to
+	 * log here. Note that init_fn must also set the buffer log item type
+	 * correctly.
 	 */
 	init_fn(tp, bp, ip, ifp);
 
-	/* account for the change in fork size and log everything */
-	xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
+	/* account for the change in fork size */
 	xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
 	xfs_bmap_local_to_extents_empty(ip, whichfork);
 	flags |= XFS_ILOG_CORE;
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index 8f8af05..b9884aa 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -183,6 +183,7 @@ xfs_symlink_local_to_remote(
 	if (!xfs_sb_version_hascrc(&mp->m_sb)) {
 		bp->b_ops = NULL;
 		memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
+		xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
 		return;
 	}
 
@@ -198,4 +199,6 @@ xfs_symlink_local_to_remote(
 	buf = bp->b_addr;
 	buf += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp);
 	memcpy(buf, ifp->if_u1.if_data, ifp->if_bytes);
+	xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsymlink_hdr) +
+					ifp->if_bytes - 1);
 }
-- 
cgit v1.1


From 5bf97b1cb430a3a6da4341ae913299706ebc52f5 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Mon, 12 Oct 2015 15:41:29 +1100
Subject: xfs: Print name and pid when memory allocation loops

This patch adds comm name and pid to warning messages printed by
kmem_alloc(), kmem_zone_alloc() and xfs_buf_allocate_memory().
This will help telling which memory allocations (e.g. kernel worker
threads, OOM victim tasks, neither) are stalling because these functions
are passing __GFP_NOWARN which suppresses not only backtrace but comm name
and pid.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/kmem.c    | 10 ++++++----
 fs/xfs/xfs_buf.c |  3 ++-
 2 files changed, 8 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index a7a3a63..535c136 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -55,8 +55,9 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
 			return ptr;
 		if (!(++retries % 100))
 			xfs_err(NULL,
-		"possible memory allocation deadlock in %s (mode:0x%x)",
-					__func__, lflags);
+		"%s(%u) possible memory allocation deadlock in %s (mode:0x%x)",
+				current->comm, current->pid,
+				__func__, lflags);
 		congestion_wait(BLK_RW_ASYNC, HZ/50);
 	} while (1);
 }
@@ -120,8 +121,9 @@ kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags)
 			return ptr;
 		if (!(++retries % 100))
 			xfs_err(NULL,
-		"possible memory allocation deadlock in %s (mode:0x%x)",
-					__func__, lflags);
+		"%s(%u) possible memory allocation deadlock in %s (mode:0x%x)",
+				current->comm, current->pid,
+				__func__, lflags);
 		congestion_wait(BLK_RW_ASYNC, HZ/50);
 	} while (1);
 }
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 8ecffb3..cac62e1 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -354,7 +354,8 @@ retry:
 			 */
 			if (!(++retries % 100))
 				xfs_err(NULL,
-		"possible memory allocation deadlock in %s (mode:0x%x)",
+		"%s(%u) possible memory allocation deadlock in %s (mode:0x%x)",
+					current->comm, current->pid,
 					__func__, gfp_mask);
 
 			XFS_STATS_INC(xb_page_retries);
-- 
cgit v1.1


From a45086e27dfa21a4b39134f7505c8f60a3ecdec4 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Mon, 12 Oct 2015 15:59:25 +1100
Subject: xfs: validate metadata LSNs against log on v5 superblocks

Since the onset of v5 superblocks, the LSN of the last modification has
been included in a variety of on-disk data structures. This LSN is used
to provide log recovery ordering guarantees (e.g., to ensure an older
log recovery item is not replayed over a newer target data structure).

While this works correctly from the point a filesystem is formatted and
mounted, userspace tools have some problematic behaviors that defeat
this mechanism. For example, xfs_repair historically zeroes out the log
unconditionally (regardless of whether corruption is detected). If this
occurs, the LSN of the filesystem is reset and the log is now in a
problematic state with respect to on-disk metadata structures that might
have a larger LSN. Until either the log catches up to the highest
previously used metadata LSN or each affected data structure is modified
and written out without incident (which resets the metadata LSN), log
recovery is susceptible to filesystem corruption.

This problem is ultimately addressed and repaired in the associated
userspace tools. The kernel is still responsible to detect the problem
and notify the user that something is wrong. Check the superblock LSN at
mount time and fail the mount if it is invalid. From that point on,
trigger verifier failure on any metadata I/O where an invalid LSN is
detected. This results in a filesystem shutdown and guarantees that we
do not log metadata changes with invalid LSNs on disk. Since this is a
known issue with a known recovery path, present a warning to instruct
the user how to recover.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/libxfs/xfs_alloc.c          | 12 ++++++---
 fs/xfs/libxfs/xfs_attr_leaf.c      |  3 +++
 fs/xfs/libxfs/xfs_btree.c          | 17 ++++++++++--
 fs/xfs/libxfs/xfs_da_btree.c       |  4 +++
 fs/xfs/libxfs/xfs_dir2_block.c     |  3 +++
 fs/xfs/libxfs/xfs_dir2_data.c      |  3 +++
 fs/xfs/libxfs/xfs_dir2_leaf.c      |  3 +++
 fs/xfs/libxfs/xfs_dir2_node.c      |  3 +++
 fs/xfs/libxfs/xfs_ialloc.c         | 10 +++++--
 fs/xfs/libxfs/xfs_sb.c             | 10 +++++++
 fs/xfs/libxfs/xfs_symlink_remote.c |  4 +++
 fs/xfs/xfs_log.c                   | 54 ++++++++++++++++++++++++++++++++++++--
 fs/xfs/xfs_log.h                   |  1 +
 fs/xfs/xfs_log_priv.h              | 51 +++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_log_recover.c           | 12 ++++++++-
 15 files changed, 180 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index ffad7f2..d39e6d8 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -482,7 +482,9 @@ xfs_agfl_verify(
 		    be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks)
 			return false;
 	}
-	return true;
+
+	return xfs_log_check_lsn(mp,
+				 be64_to_cpu(XFS_BUF_TO_AGFL(bp)->agfl_lsn));
 }
 
 static void
@@ -2259,9 +2261,13 @@ xfs_agf_verify(
  {
 	struct xfs_agf	*agf = XFS_BUF_TO_AGF(bp);
 
-	if (xfs_sb_version_hascrc(&mp->m_sb) &&
-	    !uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid))
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid))
 			return false;
+		if (!xfs_log_check_lsn(mp,
+				be64_to_cpu(XFS_BUF_TO_AGF(bp)->agf_lsn)))
+			return false;
+	}
 
 	if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
 	      XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 33df52d..aa187f7 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -41,6 +41,7 @@
 #include "xfs_buf_item.h"
 #include "xfs_cksum.h"
 #include "xfs_dir2.h"
+#include "xfs_log.h"
 
 
 /*
@@ -266,6 +267,8 @@ xfs_attr3_leaf_verify(
 			return false;
 		if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
 			return false;
+		if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn)))
+			return false;
 	} else {
 		if (ichdr.magic != XFS_ATTR_LEAF_MAGIC)
 			return false;
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index f7d7ee7..235d026 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -32,6 +32,7 @@
 #include "xfs_trace.h"
 #include "xfs_cksum.h"
 #include "xfs_alloc.h"
+#include "xfs_log.h"
 
 /*
  * Cursor allocation zone.
@@ -243,8 +244,14 @@ bool
 xfs_btree_lblock_verify_crc(
 	struct xfs_buf		*bp)
 {
-	if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn)))
+			return false;
 		return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
+	}
 
 	return true;
 }
@@ -275,8 +282,14 @@ bool
 xfs_btree_sblock_verify_crc(
 	struct xfs_buf		*bp)
 {
-	if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+	struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn)))
+			return false;
 		return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
+	}
 
 	return true;
 }
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index be43248..e89a0f8f 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -39,6 +39,7 @@
 #include "xfs_trace.h"
 #include "xfs_cksum.h"
 #include "xfs_buf_item.h"
+#include "xfs_log.h"
 
 /*
  * xfs_da_btree.c
@@ -150,6 +151,8 @@ xfs_da3_node_verify(
 			return false;
 		if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
 			return false;
+		if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn)))
+			return false;
 	} else {
 		if (ichdr.magic != XFS_DA_NODE_MAGIC)
 			return false;
@@ -322,6 +325,7 @@ xfs_da3_node_create(
 	if (xfs_sb_version_hascrc(&mp->m_sb)) {
 		struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
 
+		memset(hdr3, 0, sizeof(struct xfs_da3_node_hdr));
 		ichdr.magic = XFS_DA3_NODE_MAGIC;
 		hdr3->info.blkno = cpu_to_be64(bp->b_bn);
 		hdr3->info.owner = cpu_to_be64(args->dp->i_ino);
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index 4778d1d..9c10e2b 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -34,6 +34,7 @@
 #include "xfs_error.h"
 #include "xfs_trace.h"
 #include "xfs_cksum.h"
+#include "xfs_log.h"
 
 /*
  * Local function prototypes.
@@ -71,6 +72,8 @@ xfs_dir3_block_verify(
 			return false;
 		if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
 			return false;
+		if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
+			return false;
 	} else {
 		if (hdr3->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))
 			return false;
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
index 824131e..af71a84f 100644
--- a/fs/xfs/libxfs/xfs_dir2_data.c
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -31,6 +31,7 @@
 #include "xfs_trans.h"
 #include "xfs_buf_item.h"
 #include "xfs_cksum.h"
+#include "xfs_log.h"
 
 /*
  * Check the consistency of the data block.
@@ -224,6 +225,8 @@ xfs_dir3_data_verify(
 			return false;
 		if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
 			return false;
+		if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
+			return false;
 	} else {
 		if (hdr3->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC))
 			return false;
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
index f300240..3923e1f 100644
--- a/fs/xfs/libxfs/xfs_dir2_leaf.c
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -33,6 +33,7 @@
 #include "xfs_trans.h"
 #include "xfs_buf_item.h"
 #include "xfs_cksum.h"
+#include "xfs_log.h"
 
 /*
  * Local function declarations.
@@ -164,6 +165,8 @@ xfs_dir3_leaf_verify(
 			return false;
 		if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
 			return false;
+		if (!xfs_log_check_lsn(mp, be64_to_cpu(leaf3->info.lsn)))
+			return false;
 	} else {
 		if (leaf->hdr.info.magic != cpu_to_be16(magic))
 			return false;
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index cc28e92..70b0cb2 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -33,6 +33,7 @@
 #include "xfs_trans.h"
 #include "xfs_buf_item.h"
 #include "xfs_cksum.h"
+#include "xfs_log.h"
 
 /*
  * Function declarations.
@@ -97,6 +98,8 @@ xfs_dir3_free_verify(
 			return false;
 		if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
 			return false;
+		if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
+			return false;
 	} else {
 		if (hdr->magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC))
 			return false;
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 54deb2d..70c1db9 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -38,6 +38,7 @@
 #include "xfs_icreate_item.h"
 #include "xfs_icache.h"
 #include "xfs_trace.h"
+#include "xfs_log.h"
 
 
 /*
@@ -2500,9 +2501,14 @@ xfs_agi_verify(
 	struct xfs_mount *mp = bp->b_target->bt_mount;
 	struct xfs_agi	*agi = XFS_BUF_TO_AGI(bp);
 
-	if (xfs_sb_version_hascrc(&mp->m_sb) &&
-	    !uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid))
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		if (!uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid))
+			return false;
+		if (!xfs_log_check_lsn(mp,
+				be64_to_cpu(XFS_BUF_TO_AGI(bp)->agi_lsn)))
 			return false;
+	}
+
 	/*
 	 * Validate the magic number of the agi block.
 	 */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 4742514..a0b071d 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -35,6 +35,7 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
+#include "xfs_log.h"
 
 /*
  * Physical superblock buffer manipulations. Shared with libxfs in userspace.
@@ -163,6 +164,15 @@ xfs_mount_validate_sb(
 "Filesystem can not be safely mounted by this kernel.");
 			return -EINVAL;
 		}
+	} else if (xfs_sb_version_hascrc(sbp)) {
+		/*
+		 * We can't read verify the sb LSN because the read verifier is
+		 * called before the log is allocated and processed. We know the
+		 * log is set up before write verifier (!check_version) calls,
+		 * so just check it here.
+		 */
+		if (!xfs_log_check_lsn(mp, sbp->sb_lsn))
+			return -EFSCORRUPTED;
 	}
 
 	if (xfs_sb_version_has_pquotino(sbp)) {
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index b9884aa..cb6fd20 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -31,6 +31,7 @@
 #include "xfs_cksum.h"
 #include "xfs_trans.h"
 #include "xfs_buf_item.h"
+#include "xfs_log.h"
 
 
 /*
@@ -60,6 +61,7 @@ xfs_symlink_hdr_set(
 	if (!xfs_sb_version_hascrc(&mp->m_sb))
 		return 0;
 
+	memset(dsl, 0, sizeof(struct xfs_dsymlink_hdr));
 	dsl->sl_magic = cpu_to_be32(XFS_SYMLINK_MAGIC);
 	dsl->sl_offset = cpu_to_be32(offset);
 	dsl->sl_bytes = cpu_to_be32(size);
@@ -116,6 +118,8 @@ xfs_symlink_verify(
 		return false;
 	if (dsl->sl_owner == 0)
 		return false;
+	if (!xfs_log_check_lsn(mp, be64_to_cpu(dsl->sl_lsn)))
+		return false;
 
 	return true;
 }
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index aaadee0..0c8ef76 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -3165,11 +3165,19 @@ xlog_state_switch_iclogs(
 	}
 
 	if (log->l_curr_block >= log->l_logBBsize) {
+		/*
+		 * Rewind the current block before the cycle is bumped to make
+		 * sure that the combined LSN never transiently moves forward
+		 * when the log wraps to the next cycle. This is to support the
+		 * unlocked sample of these fields from xlog_valid_lsn(). Most
+		 * other cases should acquire l_icloglock.
+		 */
+		log->l_curr_block -= log->l_logBBsize;
+		ASSERT(log->l_curr_block >= 0);
+		smp_wmb();
 		log->l_curr_cycle++;
 		if (log->l_curr_cycle == XLOG_HEADER_MAGIC_NUM)
 			log->l_curr_cycle++;
-		log->l_curr_block -= log->l_logBBsize;
-		ASSERT(log->l_curr_block >= 0);
 	}
 	ASSERT(iclog == log->l_iclog);
 	log->l_iclog = iclog->ic_next;
@@ -4023,3 +4031,45 @@ xlog_iclogs_empty(
 	return 1;
 }
 
+/*
+ * Verify that an LSN stamped into a piece of metadata is valid. This is
+ * intended for use in read verifiers on v5 superblocks.
+ */
+bool
+xfs_log_check_lsn(
+	struct xfs_mount	*mp,
+	xfs_lsn_t		lsn)
+{
+	struct xlog		*log = mp->m_log;
+	bool			valid;
+
+	/*
+	 * norecovery mode skips mount-time log processing and unconditionally
+	 * resets the in-core LSN. We can't validate in this mode, but
+	 * modifications are not allowed anyways so just return true.
+	 */
+	if (mp->m_flags & XFS_MOUNT_NORECOVERY)
+		return true;
+
+	/*
+	 * Some metadata LSNs are initialized to NULL (e.g., the agfl). This is
+	 * handled by recovery and thus safe to ignore here.
+	 */
+	if (lsn == NULLCOMMITLSN)
+		return true;
+
+	valid = xlog_valid_lsn(mp->m_log, lsn);
+
+	/* warn the user about what's gone wrong before verifier failure */
+	if (!valid) {
+		spin_lock(&log->l_icloglock);
+		xfs_warn(mp,
+"Corruption warning: Metadata has LSN (%d:%d) ahead of current LSN (%d:%d). "
+"Please unmount and run xfs_repair (>= v4.3) to resolve.",
+			 CYCLE_LSN(lsn), BLOCK_LSN(lsn),
+			 log->l_curr_cycle, log->l_curr_block);
+		spin_unlock(&log->l_icloglock);
+	}
+
+	return valid;
+}
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 09d91d3..aa533a7 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -181,5 +181,6 @@ bool	xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
 void	xfs_log_work_queue(struct xfs_mount *mp);
 void	xfs_log_worker(struct work_struct *work);
 void	xfs_log_quiesce(struct xfs_mount *mp);
+bool	xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t);
 
 #endif	/* __XFS_LOG_H__ */
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 950f3f9..8daba74 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -560,4 +560,55 @@ static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock)
 	remove_wait_queue(wq, &wait);
 }
 
+/*
+ * The LSN is valid so long as it is behind the current LSN. If it isn't, this
+ * means that the next log record that includes this metadata could have a
+ * smaller LSN. In turn, this means that the modification in the log would not
+ * replay.
+ */
+static inline bool
+xlog_valid_lsn(
+	struct xlog	*log,
+	xfs_lsn_t	lsn)
+{
+	int		cur_cycle;
+	int		cur_block;
+	bool		valid = true;
+
+	/*
+	 * First, sample the current lsn without locking to avoid added
+	 * contention from metadata I/O. The current cycle and block are updated
+	 * (in xlog_state_switch_iclogs()) and read here in a particular order
+	 * to avoid false negatives (e.g., thinking the metadata LSN is valid
+	 * when it is not).
+	 *
+	 * The current block is always rewound before the cycle is bumped in
+	 * xlog_state_switch_iclogs() to ensure the current LSN is never seen in
+	 * a transiently forward state. Instead, we can see the LSN in a
+	 * transiently behind state if we happen to race with a cycle wrap.
+	 */
+	cur_cycle = ACCESS_ONCE(log->l_curr_cycle);
+	smp_rmb();
+	cur_block = ACCESS_ONCE(log->l_curr_block);
+
+	if ((CYCLE_LSN(lsn) > cur_cycle) ||
+	    (CYCLE_LSN(lsn) == cur_cycle && BLOCK_LSN(lsn) > cur_block)) {
+		/*
+		 * If the metadata LSN appears invalid, it's possible the check
+		 * above raced with a wrap to the next log cycle. Grab the lock
+		 * to check for sure.
+		 */
+		spin_lock(&log->l_icloglock);
+		cur_cycle = log->l_curr_cycle;
+		cur_block = log->l_curr_block;
+		spin_unlock(&log->l_icloglock);
+
+		if ((CYCLE_LSN(lsn) > cur_cycle) ||
+		    (CYCLE_LSN(lsn) == cur_cycle && BLOCK_LSN(lsn) > cur_block))
+			valid = false;
+	}
+
+	return valid;
+}
+
 #endif	/* __XFS_LOG_PRIV_H__ */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 512a094..f8f1363 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -4609,9 +4609,19 @@ xlog_recover(
 	int		error;
 
 	/* find the tail of the log */
-	if ((error = xlog_find_tail(log, &head_blk, &tail_blk)))
+	error = xlog_find_tail(log, &head_blk, &tail_blk);
+	if (error)
 		return error;
 
+	/*
+	 * The superblock was read before the log was available and thus the LSN
+	 * could not be verified. Check the superblock LSN against the current
+	 * LSN now that it's known.
+	 */
+	if (xfs_sb_version_hascrc(&log->l_mp->m_sb) &&
+	    !xfs_log_check_lsn(log->l_mp, log->l_mp->m_sb.sb_lsn))
+		return -EINVAL;
+
 	if (tail_blk != head_blk) {
 		/* There used to be a comment here:
 		 *
-- 
cgit v1.1


From 3136e8bb3054d3bb68942f8f1ee6c26c05f798b0 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Mon, 12 Oct 2015 16:02:05 +1100
Subject: xfs: always drain dio before extending aio write submission

XFS supports and typically allows concurrent asynchronous direct I/O
submission to a single file. One exception to the rule is that file
extending dio writes that start beyond the current EOF (e.g.,
potentially create a hole at EOF) require exclusive I/O access to the
file. This is because such writes must zero any pre-existing blocks
beyond EOF that are exposed by virtue of now residing within EOF as a
result of the write about to be submitted.

Before EOF zeroing can occur, the current file i_size must be stabilized
to avoid data corruption. In this scenario, XFS upgrades the iolock to
exclude any further I/O submission, waits on in-flight I/O to complete
to ensure i_size is up to date (i_size is updated on dio write
completion) and restarts the various checks against the state of the
file. The problem is that this protection sequence is triggered only
when the iolock is currently held shared. While this is true for async
dio in most cases, the caller may upgrade the lock in advance based on
arbitrary circumstances with respect to EOF zeroing. For example, the
iolock is always acquired exclusively if the start offset is not block
aligned. This means that even though the iolock is already held
exclusive for such I/Os, pending I/O is not drained and thus EOF zeroing
can occur based on an unstable i_size.

This problem has been reproduced as guest data corruption in virtual
machines with file-backed qcow2 virtual disks hosted on an XFS
filesystem. The virtual disks must be configured with aio=native mode
and the must not be truncated out to the maximum file size (as some virt
managers will do).

Update xfs_file_aio_write_checks() to unconditionally drain in-flight
dio before EOF zeroing can occur. Rather than trigger the wait based on
iolock state, use a new flag and upgrade the iolock when necessary. Note
that this results in a full restart of the inode checks even when the
iolock was already held exclusive when technically it is only required
to recheck i_size. This should be a rare enough occurrence that it is
preferable to keep the code simple rather than create an alternate
restart jump target.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_file.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index e78feb4..347b3e0 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -574,6 +574,7 @@ xfs_file_aio_write_checks(
 	struct xfs_inode	*ip = XFS_I(inode);
 	ssize_t			error = 0;
 	size_t			count = iov_iter_count(from);
+	bool			drained_dio = false;
 
 restart:
 	error = generic_write_checks(iocb, from);
@@ -611,12 +612,13 @@ restart:
 		bool	zero = false;
 
 		spin_unlock(&ip->i_flags_lock);
-		if (*iolock == XFS_IOLOCK_SHARED) {
-			xfs_rw_iunlock(ip, *iolock);
-			*iolock = XFS_IOLOCK_EXCL;
-			xfs_rw_ilock(ip, *iolock);
-			iov_iter_reexpand(from, count);
-
+		if (!drained_dio) {
+			if (*iolock == XFS_IOLOCK_SHARED) {
+				xfs_rw_iunlock(ip, *iolock);
+				*iolock = XFS_IOLOCK_EXCL;
+				xfs_rw_ilock(ip, *iolock);
+				iov_iter_reexpand(from, count);
+			}
 			/*
 			 * We now have an IO submission barrier in place, but
 			 * AIO can do EOF updates during IO completion and hence
@@ -626,6 +628,7 @@ restart:
 			 * no-op.
 			 */
 			inode_dio_wait(inode);
+			drained_dio = true;
 			goto restart;
 		}
 		error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), &zero);
-- 
cgit v1.1


From 0a50f162af6ddc2db02c9edc5bbb823c336100a4 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Mon, 12 Oct 2015 16:02:08 +1100
Subject: xfs: add an xfs_zero_eof() tracepoint

Add a tracepoint in xfs_zero_eof() to facilitate tracking and debugging
EOF zeroing events. This has proven useful in the context of other
direct I/O tracepoints to ensure EOF zeroing occurs within appropriate
file ranges.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_file.c  | 2 ++
 fs/xfs/xfs_trace.h | 1 +
 2 files changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 347b3e0..541dcfb 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -482,6 +482,8 @@ xfs_zero_eof(
 	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
 	ASSERT(offset > isize);
 
+	trace_xfs_zero_eof(ip, isize, offset - isize);
+
 	/*
 	 * First handle zeroing the block on which isize resides.
 	 *
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 5ed36b1..957f5cc 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1312,6 +1312,7 @@ DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
 DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
 DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound);
 DEFINE_SIMPLE_IO_EVENT(xfs_setfilesize);
+DEFINE_SIMPLE_IO_EVENT(xfs_zero_eof);
 
 DECLARE_EVENT_CLASS(xfs_itrunc_class,
 	TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size),
-- 
cgit v1.1


From fef4ded8cb2e53a47093b334e7730d55a3badc59 Mon Sep 17 00:00:00 2001
From: Geliang Tang <geliangtang@163.com>
Date: Mon, 12 Oct 2015 16:02:32 +1100
Subject: libxfs: fix two comment typos

Just fix two typos in code comments.

Signed-off-by: Geliang Tang <geliangtang@163.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/libxfs/xfs_btree.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index f7d7ee7..1bd3532 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -222,7 +222,7 @@ xfs_btree_check_ptr(
  * long-form btree header.
  *
  * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
- * it into the buffer so recovery knows what the last modifcation was that made
+ * it into the buffer so recovery knows what the last modification was that made
  * it to disk.
  */
 void
@@ -254,7 +254,7 @@ xfs_btree_lblock_verify_crc(
  * short-form btree header.
  *
  * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
- * it into the buffer so recovery knows what the last modifcation was that made
+ * it into the buffer so recovery knows what the last modification was that made
  * it to disk.
  */
 void
-- 
cgit v1.1


From 4e247614a92852e8eb29e49be094b6b981a7895f Mon Sep 17 00:00:00 2001
From: Jan Tulak <jtulak@redhat.com>
Date: Mon, 12 Oct 2015 16:02:56 +1100
Subject: xfs: prefix XATTR_LIST_MAX with XFS_

Remove a hard dependency of Linux XATTR_LIST_MAX value by using
a prefixed version. This patch reflects the same change in xfsprogs.

Signed-off-by: Jan Tulak <jtulak@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/libxfs/xfs_fs.h | 10 ++++++++++
 fs/xfs/xfs_ioctl.c     |  2 +-
 fs/xfs/xfs_ioctl32.c   |  2 +-
 3 files changed, 12 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 89689c6..b2b73a9 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -490,6 +490,16 @@ typedef struct xfs_swapext
 #define XFS_FSOP_GOING_FLAGS_NOLOGFLUSH		0x2	/* don't flush log nor data */
 
 /*
+ * ioctl limits
+ */
+#ifdef XATTR_LIST_MAX
+#  define XFS_XATTR_LIST_MAX XATTR_LIST_MAX
+#else
+#  define XFS_XATTR_LIST_MAX 65536
+#endif
+
+
+/*
  * ioctl commands that are used by Linux filesystems
  */
 #define XFS_IOC_GETXFLAGS	FS_IOC_GETFLAGS
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index ea7d85a..0e692a6 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -411,7 +411,7 @@ xfs_attrlist_by_handle(
 	if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t)))
 		return -EFAULT;
 	if (al_hreq.buflen < sizeof(struct attrlist) ||
-	    al_hreq.buflen > XATTR_LIST_MAX)
+	    al_hreq.buflen > XFS_XATTR_LIST_MAX)
 		return -EINVAL;
 
 	/*
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index b88bdc8..1a05d8a 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -356,7 +356,7 @@ xfs_compat_attrlist_by_handle(
 			   sizeof(compat_xfs_fsop_attrlist_handlereq_t)))
 		return -EFAULT;
 	if (al_hreq.buflen < sizeof(struct attrlist) ||
-	    al_hreq.buflen > XATTR_LIST_MAX)
+	    al_hreq.buflen > XFS_XATTR_LIST_MAX)
 		return -EINVAL;
 
 	/*
-- 
cgit v1.1


From 51fcbfe7092a18a138e28110df3ab7e666bf69ee Mon Sep 17 00:00:00 2001
From: Jan Tulak <jtulak@redhat.com>
Date: Mon, 12 Oct 2015 16:03:59 +1100
Subject: xfs: avoid dependency on Linux XATTR_SIZE_MAX

Currently, we depends on Linux XATTR value for on disk
definition. Which causes trouble on other platforms and
maybe also if this value was to change.

Fix it by creating a custom definition independent from
those in Linux (although with the same values), so it is OK
with the be16 fields used for holding these attributes.

This patch reflects a change in xfsprogs.

Signed-off-by: Jan Tulak <jtulak@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/libxfs/xfs_attr_remote.c |  2 +-
 fs/xfs/libxfs/xfs_format.h      | 10 +++++++++-
 fs/xfs/xfs_ioctl.c              |  4 ++--
 3 files changed, 12 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index f38f9bd..5ab95ff 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -107,7 +107,7 @@ xfs_attr3_rmt_verify(
 	if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt))
 		return false;
 	if (be32_to_cpu(rmt->rm_offset) +
-				be32_to_cpu(rmt->rm_bytes) > XATTR_SIZE_MAX)
+				be32_to_cpu(rmt->rm_bytes) > XFS_XATTR_SIZE_MAX)
 		return false;
 	if (rmt->rm_owner == 0)
 		return false;
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 9590a06..8568de1 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -60,6 +60,14 @@ struct xfs_ifork;
 #define	XFS_SB_VERSION_MOREBITSBIT	0x8000
 
 /*
+ * The size of a single extended attribute on disk is limited by
+ * the size of index values within the attribute entries themselves.
+ * These are be16 fields, so we can only support attribute data
+ * sizes up to 2^16 bytes in length.
+ */
+#define XFS_XATTR_SIZE_MAX (1 << 16)
+
+/*
  * Supported feature bit list is just all bits in the versionnum field because
  * we've used them all up and understand them all. Except, of course, for the
  * shared superblock bit, which nobody knows what it does and so is unsupported.
@@ -1483,7 +1491,7 @@ struct xfs_acl {
  */
 #define XFS_ACL_MAX_ENTRIES(mp)	\
 	(xfs_sb_version_hascrc(&mp->m_sb) \
-		?  (XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \
+		?  (XFS_XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \
 						sizeof(struct xfs_acl_entry) \
 		: 25)
 
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 0e692a6..9963f7c 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -455,7 +455,7 @@ xfs_attrmulti_attr_get(
 	unsigned char		*kbuf;
 	int			error = -EFAULT;
 
-	if (*len > XATTR_SIZE_MAX)
+	if (*len > XFS_XATTR_SIZE_MAX)
 		return -EINVAL;
 	kbuf = kmem_zalloc_large(*len, KM_SLEEP);
 	if (!kbuf)
@@ -485,7 +485,7 @@ xfs_attrmulti_attr_set(
 
 	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
 		return -EPERM;
-	if (len > XATTR_SIZE_MAX)
+	if (len > XFS_XATTR_SIZE_MAX)
 		return -EINVAL;
 
 	kbuf = memdup_user(ubuf, len);
-- 
cgit v1.1


From dbd5c8c9a28899c6ca719eb21afc0afba9dd5574 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Mon, 12 Oct 2015 16:04:13 +1100
Subject: xfs: pass total block res. as total xfs_bmapi_write() parameter

The total field from struct xfs_alloc_arg is a bit of an unknown
commodity. It is documented as the total block requirement for the
transaction and is used in this manner from most call sites by virtue of
passing the total block reservation of the transaction associated with
an allocation. Several xfs_bmapi_write() callers pass hardcoded values
of 0 or 1 for the total block requirement, which is a historical oddity
without any clear reasoning.

The xfs_iomap_write_direct() caller, for example, passes 0 for the total
block requirement. This has been determined to cause problems in the
form of ABBA deadlocks of AGF buffers due to incorrect AG selection in
the block allocator. Specifically, the xfs_alloc_space_available()
function incorrectly selects an AG that doesn't actually have sufficient
space for the allocation. This occurs because the args.total field is 0
and thus the remaining free space check on the AG doesn't actually
consider the size of the allocation request. This locks the AGF buffer,
the allocation attempt proceeds and ultimately fails (in
xfs_alloc_fix_minleft()), and xfs_alloc_vexent() moves on to the next
AG. In turn, this can lead to incorrect AG locking order (if the
allocator wraps around, attempting to lock AG 0 after acquiring AG N)
and thus deadlock if racing with another operation. This problem has
been reproduced via generic/299 on smallish (1GB) ramdisk test devices.

To avoid this problem, replace the undocumented hardcoded total
parameters from the iomap and utility callers to pass the block
reservation used for the associated transaction. This is consistent with
other xfs_bmapi_write() callers throughout XFS. The assumption is that
the total field allows the selection of an AG that can handle the entire
operation rather than simply the allocation/range being requested (e.g.,
resulting btree splits, etc.). This addresses the aforementioned
generic/299 hang by ensuring AG selection only occurs when the
allocation can be satisfied by the AG.

Reported-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_bmap_util.c |  2 +-
 fs/xfs/xfs_iomap.c     | 14 +++++++-------
 2 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 3bf4ad0..eca325e 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1027,7 +1027,7 @@ xfs_alloc_file_space(
 		xfs_bmap_init(&free_list, &firstfsb);
 		error = xfs_bmapi_write(tp, ip, startoffset_fsb,
 					allocatesize_fsb, alloc_type, &firstfsb,
-					0, imapp, &nimaps, &free_list);
+					resblks, imapp, &nimaps, &free_list);
 		if (error) {
 			goto error0;
 		}
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 1f86033..3d9fa36 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -202,8 +202,8 @@ xfs_iomap_write_direct(
 	xfs_bmap_init(&free_list, &firstfsb);
 	nimaps = 1;
 	error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
-				XFS_BMAPI_PREALLOC, &firstfsb, 0,
-				imap, &nimaps, &free_list);
+				XFS_BMAPI_PREALLOC, &firstfsb, resblks, imap,
+				&nimaps, &free_list);
 	if (error)
 		goto out_bmap_cancel;
 
@@ -750,9 +750,9 @@ xfs_iomap_write_allocate(
 			 * pointer that the caller gave to us.
 			 */
 			error = xfs_bmapi_write(tp, ip, map_start_fsb,
-						count_fsb, 0,
-						&first_block, 1,
-						imap, &nimaps, &free_list);
+						count_fsb, 0, &first_block,
+						nres, imap, &nimaps,
+						&free_list);
 			if (error)
 				goto trans_cancel;
 
@@ -866,8 +866,8 @@ xfs_iomap_write_unwritten(
 		xfs_bmap_init(&free_list, &firstfsb);
 		nimaps = 1;
 		error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
-				  XFS_BMAPI_CONVERT, &firstfsb,
-				  1, &imap, &nimaps, &free_list);
+					XFS_BMAPI_CONVERT, &firstfsb, resblks,
+					&imap, &nimaps, &free_list);
 		if (error)
 			goto error_on_bmapi_transaction;
 
-- 
cgit v1.1


From 91f9f5fe1e7350e872b3fbc3194e8183bddce514 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Mon, 12 Oct 2015 16:04:15 +1100
Subject: xfs: avoid null *src in memcpy call in xlog_write

The gcc undefined behavior sanitizer caught this; surely
any sane memcpy implementation will no-op if size == 0,
but behavior with a *src of NULL is technically undefined
(declared nonnull), so avoid it here.

We are actually in this situation frequently via
xlog_commit_record(), because:

        struct xfs_log_iovec reg = {
                .i_addr = NULL,
                .i_len = 0,
                .i_type = XLOG_REG_TYPE_COMMIT,
        };

Reported-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_log.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 0c8ef76..af08326 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -2422,11 +2422,20 @@ xlog_write(
 						     &partial_copy_len);
 			xlog_verify_dest_ptr(log, ptr);
 
-			/* copy region */
+			/*
+			 * Copy region.
+			 *
+			 * Unmount records just log an opheader, so can have
+			 * empty payloads with no data region to copy. Hence we
+			 * only copy the payload if the vector says it has data
+			 * to copy.
+			 */
 			ASSERT(copy_len >= 0);
-			memcpy(ptr, reg->i_addr + copy_off, copy_len);
-			xlog_write_adv_cnt(&ptr, &len, &log_offset, copy_len);
-
+			if (copy_len > 0) {
+				memcpy(ptr, reg->i_addr + copy_off, copy_len);
+				xlog_write_adv_cnt(&ptr, &len, &log_offset,
+						   copy_len);
+			}
 			copy_len += start_rec_copy + sizeof(xlog_op_header_t);
 			record_cnt++;
 			data_cnt += contwr ? copy_len : 0;
-- 
cgit v1.1


From 847f9f6875fb02b576035e3dc31f5e647b7617a7 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Mon, 12 Oct 2015 16:04:45 +1100
Subject: xfs: more info from kmem deadlocks and high-level error msgs

In an effort to get more useful out of "possible memory
allocation deadlock" messages, print the size of the
requested allocation, and dump the stack if the xfs error
level is tuned high.

The stack dump is implemented in define_xfs_printk_level()
for error levels >= LOGLEVEL_ERR, partly because it
seems generically useful, and also because kmem.c has
no knowledge of xfs error level tunables or other such bits,
it's very kmem-specific.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/kmem.c        | 4 ++--
 fs/xfs/xfs_message.c | 7 +++++++
 2 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 535c136..686ba6f 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -55,9 +55,9 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
 			return ptr;
 		if (!(++retries % 100))
 			xfs_err(NULL,
-		"%s(%u) possible memory allocation deadlock in %s (mode:0x%x)",
+	"%s(%u) possible memory allocation deadlock size %u in %s (mode:0x%x)",
 				current->comm, current->pid,
-				__func__, lflags);
+				(unsigned int)size, __func__, lflags);
 		congestion_wait(BLK_RW_ASYNC, HZ/50);
 	} while (1);
 }
diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c
index d8b6754..11792d8 100644
--- a/fs/xfs/xfs_message.c
+++ b/fs/xfs/xfs_message.c
@@ -17,6 +17,7 @@
 
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_error.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
@@ -43,6 +44,7 @@ void func(const struct xfs_mount *mp, const char *fmt, ...)	\
 {								\
 	struct va_format	vaf;				\
 	va_list			args;				\
+	int			level;				\
 								\
 	va_start(args, fmt);					\
 								\
@@ -51,6 +53,11 @@ void func(const struct xfs_mount *mp, const char *fmt, ...)	\
 								\
 	__xfs_printk(kern_level, mp, &vaf);			\
 	va_end(args);						\
+								\
+	if (!kstrtoint(kern_level, 0, &level) &&		\
+	    level <= LOGLEVEL_ERR &&				\
+	    xfs_error_level >= XFS_ERRLEVEL_HIGH)		\
+		xfs_stack_trace();				\
 }								\
 
 define_xfs_printk_level(xfs_emerg, KERN_EMERG);
-- 
cgit v1.1


From 225e4635580ce9fb12f8a2dc88473161cd64dbf6 Mon Sep 17 00:00:00 2001
From: Bill O'Donnell <billodo@redhat.com>
Date: Mon, 12 Oct 2015 18:21:19 +1100
Subject: xfs: per-filesystem stats in sysfs

This patch implements per-filesystem stats objects in sysfs. It
depends on the application of the previous patch series that
develops the infrastructure to support both xfs global stats and
xfs per-fs stats in sysfs.

Stats objects are instantiated when an xfs filesystem is mounted
and deleted on unmount. With this patch, the stats directory is
created and populated with the familiar stats and stats_clear files.
Example:
        /sys/fs/xfs/sda9/stats/stats
        /sys/fs/xfs/sda9/stats/stats_clear

With this patch, the individual counts within the new per-fs
stats file(s) remain at zero. Functions that use the the macros
to increment, decrement, and add-to the per-fs stats counts will
be covered in a separate new patch to follow this one. Note that
the counts within the global stats file (/sys/fs/xfs/stats/stats)
advance normally and can be cleared as it was prior to this patch.

[dchinner: move setup/teardown to xfs_fs_{fill|put}_super() so
it is down before/after any path that uses the per-mount stats. ]

Signed-off-by: Bill O'Donnell <billodo@redhat.com>
Reviewed-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_mount.c | 11 ++++++++++-
 fs/xfs/xfs_mount.h |  1 +
 fs/xfs/xfs_super.c | 14 ++++++++++++--
 3 files changed, 23 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index bf92e0c..eb498ce 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -693,10 +693,15 @@ xfs_mountfs(
 	if (error)
 		goto out;
 
-	error = xfs_uuid_mount(mp);
+	error = xfs_sysfs_init(&mp->m_stats.xs_kobj, &xfs_stats_ktype,
+			       &mp->m_kobj, "stats");
 	if (error)
 		goto out_remove_sysfs;
 
+	error = xfs_uuid_mount(mp);
+	if (error)
+		goto out_del_stats;
+
 	/*
 	 * Set the minimum read and write sizes
 	 */
@@ -971,6 +976,8 @@ xfs_mountfs(
 	xfs_da_unmount(mp);
  out_remove_uuid:
 	xfs_uuid_unmount(mp);
+ out_del_stats:
+	xfs_sysfs_del(&mp->m_stats.xs_kobj);
  out_remove_sysfs:
 	xfs_sysfs_del(&mp->m_kobj);
  out:
@@ -1047,6 +1054,7 @@ xfs_unmountfs(
 		xfs_warn(mp, "Unable to update superblock counters. "
 				"Freespace may not be correct on next mount.");
 
+
 	xfs_log_unmount(mp);
 	xfs_da_unmount(mp);
 	xfs_uuid_unmount(mp);
@@ -1056,6 +1064,7 @@ xfs_unmountfs(
 #endif
 	xfs_free_perag(mp);
 
+	xfs_sysfs_del(&mp->m_stats.xs_kobj);
 	xfs_sysfs_del(&mp->m_kobj);
 }
 
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 7999e91..8795272 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -127,6 +127,7 @@ typedef struct xfs_mount {
 	int64_t			m_low_space[XFS_LOWSP_MAX];
 						/* low free space thresholds */
 	struct xfs_kobj		m_kobj;
+	struct xstats		m_stats;	/* per-fs stats */
 
 	struct workqueue_struct *m_buf_workqueue;
 	struct workqueue_struct	*m_data_workqueue;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index e1a35a5..9d11d3e 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1474,9 +1474,16 @@ xfs_fs_fill_super(
 	if (error)
 		goto out_destroy_workqueues;
 
+	/* Allocate stats memory before we do operations that might use it */
+	mp->m_stats.xs_stats = alloc_percpu(struct xfsstats);
+	if (!mp->m_stats.xs_stats) {
+		error = PTR_ERR(mp->m_stats.xs_stats);
+		goto out_destroy_counters;
+	}
+
 	error = xfs_readsb(mp, flags);
 	if (error)
-		goto out_destroy_counters;
+		goto out_free_stats;
 
 	error = xfs_finish_flags(mp);
 	if (error)
@@ -1545,9 +1552,11 @@ xfs_fs_fill_super(
 	xfs_filestream_unmount(mp);
  out_free_sb:
 	xfs_freesb(mp);
+ out_free_stats:
+	free_percpu(mp->m_stats.xs_stats);
  out_destroy_counters:
 	xfs_destroy_percpu_counters(mp);
-out_destroy_workqueues:
+ out_destroy_workqueues:
 	xfs_destroy_mount_workqueues(mp);
  out_close_devices:
 	xfs_close_devices(mp);
@@ -1574,6 +1583,7 @@ xfs_fs_put_super(
 	xfs_unmountfs(mp);
 
 	xfs_freesb(mp);
+	free_percpu(mp->m_stats.xs_stats);
 	xfs_destroy_percpu_counters(mp);
 	xfs_destroy_mount_workqueues(mp);
 	xfs_close_devices(mp);
-- 
cgit v1.1


From ff6d6af2351caea7db681f4539d0d893e400557a Mon Sep 17 00:00:00 2001
From: Bill O'Donnell <billodo@redhat.com>
Date: Mon, 12 Oct 2015 18:21:22 +1100
Subject: xfs: per-filesystem stats counter implementation

This patch modifies the stats counting macros and the callers
to those macros to properly increment, decrement, and add-to
the xfs stats counts. The counts for global and per-fs stats
are correctly advanced, and cleared by writing a "1" to the
corresponding clear file.

global counts: /sys/fs/xfs/stats/stats
per-fs counts: /sys/fs/xfs/sda*/stats/stats

global clear:  /sys/fs/xfs/stats/stats_clear
per-fs clear:  /sys/fs/xfs/sda*/stats/stats_clear

[dchinner: cleaned up macro variables, removed CONFIG_FS_PROC around
 stats structures and macros. ]

Signed-off-by: Bill O'Donnell <billodo@redhat.com>
Reviewed-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/libxfs/xfs_alloc.c |  8 ++++----
 fs/xfs/libxfs/xfs_attr.c  |  6 +++---
 fs/xfs/libxfs/xfs_bmap.c  | 20 ++++++++++----------
 fs/xfs/libxfs/xfs_btree.h | 39 +++++++++++++++++++++++----------------
 fs/xfs/libxfs/xfs_dir2.c  |  6 +++---
 fs/xfs/xfs_attr_list.c    |  2 +-
 fs/xfs/xfs_buf.c          | 18 +++++++++---------
 fs/xfs/xfs_dir2_readdir.c |  2 +-
 fs/xfs/xfs_dquot.c        | 14 +++++++-------
 fs/xfs/xfs_file.c         | 12 ++++++------
 fs/xfs/xfs_icache.c       | 18 +++++++++---------
 fs/xfs/xfs_inode.c        |  6 +++---
 fs/xfs/xfs_ioctl.c        |  2 +-
 fs/xfs/xfs_iomap.c        |  4 ++--
 fs/xfs/xfs_iops.c         |  4 ++--
 fs/xfs/xfs_log.c          | 22 +++++++++++-----------
 fs/xfs/xfs_qm.c           | 14 +++++++-------
 fs/xfs/xfs_stats.h        | 28 +++++++++++++++++-----------
 fs/xfs/xfs_super.c        |  6 +++---
 fs/xfs/xfs_trans.c        |  6 +++---
 fs/xfs/xfs_trans_ail.c    | 12 ++++++------
 21 files changed, 131 insertions(+), 118 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index ffad7f2..9b5da7e3 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -651,8 +651,8 @@ xfs_alloc_ag_vextent(
 				 -((long)(args->len)));
 	}
 
-	XFS_STATS_INC(xs_allocx);
-	XFS_STATS_ADD(xs_allocb, args->len);
+	XFS_STATS_INC(args->mp, xs_allocx);
+	XFS_STATS_ADD(args->mp, xs_allocb, args->len);
 	return error;
 }
 
@@ -1808,8 +1808,8 @@ xfs_free_ag_extent(
 
 	if (!isfl)
 		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
-	XFS_STATS_INC(xs_freex);
-	XFS_STATS_ADD(xs_freeb, len);
+	XFS_STATS_INC(mp, xs_freex);
+	XFS_STATS_ADD(mp, xs_freeb, len);
 
 	trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright);
 
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index ff06557..f949818 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -125,7 +125,7 @@ xfs_attr_get(
 	uint			lock_mode;
 	int			error;
 
-	XFS_STATS_INC(xs_attr_get);
+	XFS_STATS_INC(ip->i_mount, xs_attr_get);
 
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return -EIO;
@@ -209,7 +209,7 @@ xfs_attr_set(
 	int			rsvd = (flags & ATTR_ROOT) != 0;
 	int			error, err2, committed, local;
 
-	XFS_STATS_INC(xs_attr_set);
+	XFS_STATS_INC(mp, xs_attr_set);
 
 	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
 		return -EIO;
@@ -412,7 +412,7 @@ xfs_attr_remove(
 	xfs_fsblock_t		firstblock;
 	int			error;
 
-	XFS_STATS_INC(xs_attr_remove);
+	XFS_STATS_INC(mp, xs_attr_remove);
 
 	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
 		return -EIO;
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 8e2010d..5256fe5 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -1435,7 +1435,7 @@ xfs_bmap_search_extents(
 	xfs_ifork_t	*ifp;		/* inode fork pointer */
 	xfs_bmbt_rec_host_t  *ep;            /* extent record pointer */
 
-	XFS_STATS_INC(xs_look_exlist);
+	XFS_STATS_INC(ip->i_mount, xs_look_exlist);
 	ifp = XFS_IFORK_PTR(ip, fork);
 
 	ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp);
@@ -1732,7 +1732,7 @@ xfs_bmap_add_extent_delay_real(
 	ASSERT(!bma->cur ||
 	       (bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
 
-	XFS_STATS_INC(xs_add_exlist);
+	XFS_STATS_INC(mp, xs_add_exlist);
 
 #define	LEFT		r[0]
 #define	RIGHT		r[1]
@@ -2286,7 +2286,7 @@ xfs_bmap_add_extent_unwritten_real(
 	ASSERT(*idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
 	ASSERT(!isnullstartblock(new->br_startblock));
 
-	XFS_STATS_INC(xs_add_exlist);
+	XFS_STATS_INC(mp, xs_add_exlist);
 
 #define	LEFT		r[0]
 #define	RIGHT		r[1]
@@ -2946,7 +2946,7 @@ xfs_bmap_add_extent_hole_real(
 	ASSERT(!bma->cur ||
 	       !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
 
-	XFS_STATS_INC(xs_add_exlist);
+	XFS_STATS_INC(mp, xs_add_exlist);
 
 	state = 0;
 	if (whichfork == XFS_ATTR_FORK)
@@ -4036,7 +4036,7 @@ xfs_bmapi_read(
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
 
-	XFS_STATS_INC(xs_blk_mapr);
+	XFS_STATS_INC(mp, xs_blk_mapr);
 
 	ifp = XFS_IFORK_PTR(ip, whichfork);
 
@@ -4221,7 +4221,7 @@ xfs_bmapi_delay(
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
 
-	XFS_STATS_INC(xs_blk_mapw);
+	XFS_STATS_INC(mp, xs_blk_mapw);
 
 	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
 		error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
@@ -4525,7 +4525,7 @@ xfs_bmapi_write(
 
 	ifp = XFS_IFORK_PTR(ip, whichfork);
 
-	XFS_STATS_INC(xs_blk_mapw);
+	XFS_STATS_INC(mp, xs_blk_mapw);
 
 	if (*firstblock == NULLFSBLOCK) {
 		if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE)
@@ -4718,12 +4718,12 @@ xfs_bmap_del_extent(
 	xfs_filblks_t		temp2;	/* for indirect length calculations */
 	int			state = 0;
 
-	XFS_STATS_INC(xs_del_exlist);
+	mp = ip->i_mount;
+	XFS_STATS_INC(mp, xs_del_exlist);
 
 	if (whichfork == XFS_ATTR_FORK)
 		state |= BMAP_ATTRFORK;
 
-	mp = ip->i_mount;
 	ifp = XFS_IFORK_PTR(ip, whichfork);
 	ASSERT((*idx >= 0) && (*idx < ifp->if_bytes /
 		(uint)sizeof(xfs_bmbt_rec_t)));
@@ -5070,7 +5070,7 @@ xfs_bunmapi(
 		*done = 1;
 		return 0;
 	}
-	XFS_STATS_INC(xs_blk_unmap);
+	XFS_STATS_INC(mp, xs_blk_unmap);
 	isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);
 	start = bno;
 	bno = start + len - 1;
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 8f18bab..992dec0 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -84,31 +84,38 @@ union xfs_btree_rec {
 /*
  * Generic stats interface
  */
-#define __XFS_BTREE_STATS_INC(type, stat) \
-	XFS_STATS_INC(xs_ ## type ## _2_ ## stat)
-#define XFS_BTREE_STATS_INC(cur, stat)  \
+#define __XFS_BTREE_STATS_INC(mp, type, stat) \
+	XFS_STATS_INC(mp, xs_ ## type ## _2_ ## stat)
+#define XFS_BTREE_STATS_INC(cur, stat)	\
 do {    \
+	struct xfs_mount *__mp = cur->bc_mp; \
 	switch (cur->bc_btnum) {  \
-	case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(abtb, stat); break;	\
-	case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(abtc, stat); break;	\
-	case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(bmbt, stat); break;	\
-	case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(ibt, stat); break;	\
-	case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(fibt, stat); break;	\
+	case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(__mp, abtb, stat); break; \
+	case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(__mp, abtc, stat); break; \
+	case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(__mp, bmbt, stat); break; \
+	case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(__mp, ibt, stat); break; \
+	case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(__mp, fibt, stat); break; \
 	case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break;	\
 	}       \
 } while (0)
 
-#define __XFS_BTREE_STATS_ADD(type, stat, val) \
-	XFS_STATS_ADD(xs_ ## type ## _2_ ## stat, val)
+#define __XFS_BTREE_STATS_ADD(mp, type, stat, val) \
+	XFS_STATS_ADD(mp, xs_ ## type ## _2_ ## stat, val)
 #define XFS_BTREE_STATS_ADD(cur, stat, val)  \
 do {    \
+	struct xfs_mount *__mp = cur->bc_mp; \
 	switch (cur->bc_btnum) {  \
-	case XFS_BTNUM_BNO: __XFS_BTREE_STATS_ADD(abtb, stat, val); break; \
-	case XFS_BTNUM_CNT: __XFS_BTREE_STATS_ADD(abtc, stat, val); break; \
-	case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_ADD(bmbt, stat, val); break; \
-	case XFS_BTNUM_INO: __XFS_BTREE_STATS_ADD(ibt, stat, val); break; \
-	case XFS_BTNUM_FINO: __XFS_BTREE_STATS_ADD(fibt, stat, val); break; \
-	case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break;	\
+	case XFS_BTNUM_BNO:	\
+		__XFS_BTREE_STATS_ADD(__mp, abtb, stat, val); break; \
+	case XFS_BTNUM_CNT:	\
+		__XFS_BTREE_STATS_ADD(__mp, abtc, stat, val); break; \
+	case XFS_BTNUM_BMAP:	\
+		__XFS_BTREE_STATS_ADD(__mp, bmbt, stat, val); break; \
+	case XFS_BTNUM_INO:	\
+		__XFS_BTREE_STATS_ADD(__mp, ibt, stat, val); break; \
+	case XFS_BTNUM_FINO:	\
+		__XFS_BTREE_STATS_ADD(__mp, fibt, stat, val); break; \
+	case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \
 	}       \
 } while (0)
 
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 9de401d..2fb53a5 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -271,7 +271,7 @@ xfs_dir_createname(
 		rval = xfs_dir_ino_validate(tp->t_mountp, inum);
 		if (rval)
 			return rval;
-		XFS_STATS_INC(xs_dir_create);
+		XFS_STATS_INC(dp->i_mount, xs_dir_create);
 	}
 
 	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
@@ -365,7 +365,7 @@ xfs_dir_lookup(
 	int		lock_mode;
 
 	ASSERT(S_ISDIR(dp->i_d.di_mode));
-	XFS_STATS_INC(xs_dir_lookup);
+	XFS_STATS_INC(dp->i_mount, xs_dir_lookup);
 
 	/*
 	 * We need to use KM_NOFS here so that lockdep will not throw false
@@ -444,7 +444,7 @@ xfs_dir_removename(
 	int		v;		/* type-checking value */
 
 	ASSERT(S_ISDIR(dp->i_d.di_mode));
-	XFS_STATS_INC(xs_dir_remove);
+	XFS_STATS_INC(dp->i_mount, xs_dir_remove);
 
 	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
 	if (!args)
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 65fb37a..0ef7c2e 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -511,7 +511,7 @@ xfs_attr_list_int(
 	xfs_inode_t *dp = context->dp;
 	uint		lock_mode;
 
-	XFS_STATS_INC(xs_attr_list);
+	XFS_STATS_INC(dp->i_mount, xs_attr_list);
 
 	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
 		return -EIO;
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 8ecffb3..90815c2 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -201,7 +201,7 @@ _xfs_buf_alloc(
 	atomic_set(&bp->b_pin_count, 0);
 	init_waitqueue_head(&bp->b_waiters);
 
-	XFS_STATS_INC(xb_create);
+	XFS_STATS_INC(target->bt_mount, xb_create);
 	trace_xfs_buf_init(bp, _RET_IP_);
 
 	return bp;
@@ -357,12 +357,12 @@ retry:
 		"possible memory allocation deadlock in %s (mode:0x%x)",
 					__func__, gfp_mask);
 
-			XFS_STATS_INC(xb_page_retries);
+			XFS_STATS_INC(bp->b_target->bt_mount, xb_page_retries);
 			congestion_wait(BLK_RW_ASYNC, HZ/50);
 			goto retry;
 		}
 
-		XFS_STATS_INC(xb_page_found);
+		XFS_STATS_INC(bp->b_target->bt_mount, xb_page_found);
 
 		nbytes = min_t(size_t, size, PAGE_SIZE - offset);
 		size -= nbytes;
@@ -516,7 +516,7 @@ _xfs_buf_find(
 		new_bp->b_pag = pag;
 		spin_unlock(&pag->pag_buf_lock);
 	} else {
-		XFS_STATS_INC(xb_miss_locked);
+		XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
 		spin_unlock(&pag->pag_buf_lock);
 		xfs_perag_put(pag);
 	}
@@ -529,11 +529,11 @@ found:
 	if (!xfs_buf_trylock(bp)) {
 		if (flags & XBF_TRYLOCK) {
 			xfs_buf_rele(bp);
-			XFS_STATS_INC(xb_busy_locked);
+			XFS_STATS_INC(btp->bt_mount, xb_busy_locked);
 			return NULL;
 		}
 		xfs_buf_lock(bp);
-		XFS_STATS_INC(xb_get_locked_waited);
+		XFS_STATS_INC(btp->bt_mount, xb_get_locked_waited);
 	}
 
 	/*
@@ -549,7 +549,7 @@ found:
 	}
 
 	trace_xfs_buf_find(bp, flags, _RET_IP_);
-	XFS_STATS_INC(xb_get_locked);
+	XFS_STATS_INC(btp->bt_mount, xb_get_locked);
 	return bp;
 }
 
@@ -603,7 +603,7 @@ found:
 		}
 	}
 
-	XFS_STATS_INC(xb_get);
+	XFS_STATS_INC(target->bt_mount, xb_get);
 	trace_xfs_buf_get(bp, flags, _RET_IP_);
 	return bp;
 }
@@ -643,7 +643,7 @@ xfs_buf_read_map(
 		trace_xfs_buf_read(bp, flags, _RET_IP_);
 
 		if (!XFS_BUF_ISDONE(bp)) {
-			XFS_STATS_INC(xb_get_read);
+			XFS_STATS_INC(target->bt_mount, xb_get_read);
 			bp->b_ops = ops;
 			_xfs_buf_read(bp, flags);
 		} else if (flags & XBF_ASYNC) {
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index a989a9c..642d55d 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -666,7 +666,7 @@ xfs_readdir(
 		return -EIO;
 
 	ASSERT(S_ISDIR(dp->i_d.di_mode));
-	XFS_STATS_INC(xs_dir_getdents);
+	XFS_STATS_INC(dp->i_mount, xs_dir_getdents);
 
 	args.dp = dp;
 	args.geo = dp->i_mount->m_dir_geo;
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 30cb3af..7ac6c5c 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -75,9 +75,9 @@ xfs_qm_dqdestroy(
 	ASSERT(list_empty(&dqp->q_lru));
 
 	mutex_destroy(&dqp->q_qlock);
-	kmem_zone_free(xfs_qm_dqzone, dqp);
 
-	XFS_STATS_DEC(xs_qm_dquot);
+	XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot);
+	kmem_zone_free(xfs_qm_dqzone, dqp);
 }
 
 /*
@@ -605,7 +605,7 @@ xfs_qm_dqread(
 		break;
 	}
 
-	XFS_STATS_INC(xs_qm_dquot);
+	XFS_STATS_INC(mp, xs_qm_dquot);
 
 	trace_xfs_dqread(dqp);
 
@@ -747,12 +747,12 @@ restart:
 		mutex_unlock(&qi->qi_tree_lock);
 
 		trace_xfs_dqget_hit(dqp);
-		XFS_STATS_INC(xs_qm_dqcachehits);
+		XFS_STATS_INC(mp, xs_qm_dqcachehits);
 		*O_dqpp = dqp;
 		return 0;
 	}
 	mutex_unlock(&qi->qi_tree_lock);
-	XFS_STATS_INC(xs_qm_dqcachemisses);
+	XFS_STATS_INC(mp, xs_qm_dqcachemisses);
 
 	/*
 	 * Dquot cache miss. We don't want to keep the inode lock across
@@ -806,7 +806,7 @@ restart:
 		mutex_unlock(&qi->qi_tree_lock);
 		trace_xfs_dqget_dup(dqp);
 		xfs_qm_dqdestroy(dqp);
-		XFS_STATS_INC(xs_qm_dquot_dups);
+		XFS_STATS_INC(mp, xs_qm_dquot_dups);
 		goto restart;
 	}
 
@@ -846,7 +846,7 @@ xfs_qm_dqput(
 		trace_xfs_dqput_free(dqp);
 
 		if (list_lru_add(&qi->qi_lru, &dqp->q_lru))
-			XFS_STATS_INC(xs_qm_dquot_unused);
+			XFS_STATS_INC(dqp->q_mount, xs_qm_dquot_unused);
 	}
 	xfs_dqunlock(dqp);
 }
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index e78feb4..088e509 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -287,7 +287,7 @@ xfs_file_read_iter(
 	xfs_fsize_t		n;
 	loff_t			pos = iocb->ki_pos;
 
-	XFS_STATS_INC(xs_read_calls);
+	XFS_STATS_INC(mp, xs_read_calls);
 
 	if (unlikely(iocb->ki_flags & IOCB_DIRECT))
 		ioflags |= XFS_IO_ISDIRECT;
@@ -365,7 +365,7 @@ xfs_file_read_iter(
 
 	ret = generic_file_read_iter(iocb, to);
 	if (ret > 0)
-		XFS_STATS_ADD(xs_read_bytes, ret);
+		XFS_STATS_ADD(mp, xs_read_bytes, ret);
 
 	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 	return ret;
@@ -383,7 +383,7 @@ xfs_file_splice_read(
 	int			ioflags = 0;
 	ssize_t			ret;
 
-	XFS_STATS_INC(xs_read_calls);
+	XFS_STATS_INC(ip->i_mount, xs_read_calls);
 
 	if (infilp->f_mode & FMODE_NOCMTIME)
 		ioflags |= XFS_IO_INVIS;
@@ -401,7 +401,7 @@ xfs_file_splice_read(
 	else
 		ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
 	if (ret > 0)
-		XFS_STATS_ADD(xs_read_bytes, ret);
+		XFS_STATS_ADD(ip->i_mount, xs_read_bytes, ret);
 
 	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 	return ret;
@@ -867,7 +867,7 @@ xfs_file_write_iter(
 	ssize_t			ret;
 	size_t			ocount = iov_iter_count(from);
 
-	XFS_STATS_INC(xs_write_calls);
+	XFS_STATS_INC(ip->i_mount, xs_write_calls);
 
 	if (ocount == 0)
 		return 0;
@@ -883,7 +883,7 @@ xfs_file_write_iter(
 	if (ret > 0) {
 		ssize_t err;
 
-		XFS_STATS_ADD(xs_write_bytes, ret);
+		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
 
 		/* Handle various SYNC-type writes */
 		err = generic_write_sync(file, iocb->ki_pos - ret, ret);
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 0a326bd..d7a490f 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -63,7 +63,7 @@ xfs_inode_alloc(
 		return NULL;
 	}
 
-	XFS_STATS_INC(vn_active);
+	XFS_STATS_INC(mp, vn_active);
 	ASSERT(atomic_read(&ip->i_pincount) == 0);
 	ASSERT(!spin_is_locked(&ip->i_flags_lock));
 	ASSERT(!xfs_isiflocked(ip));
@@ -129,7 +129,7 @@ xfs_inode_free(
 	/* asserts to verify all state is correct here */
 	ASSERT(atomic_read(&ip->i_pincount) == 0);
 	ASSERT(!xfs_isiflocked(ip));
-	XFS_STATS_DEC(vn_active);
+	XFS_STATS_DEC(ip->i_mount, vn_active);
 
 	call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
 }
@@ -159,7 +159,7 @@ xfs_iget_cache_hit(
 	spin_lock(&ip->i_flags_lock);
 	if (ip->i_ino != ino) {
 		trace_xfs_iget_skip(ip);
-		XFS_STATS_INC(xs_ig_frecycle);
+		XFS_STATS_INC(mp, xs_ig_frecycle);
 		error = -EAGAIN;
 		goto out_error;
 	}
@@ -177,7 +177,7 @@ xfs_iget_cache_hit(
 	 */
 	if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
 		trace_xfs_iget_skip(ip);
-		XFS_STATS_INC(xs_ig_frecycle);
+		XFS_STATS_INC(mp, xs_ig_frecycle);
 		error = -EAGAIN;
 		goto out_error;
 	}
@@ -259,7 +259,7 @@ xfs_iget_cache_hit(
 		xfs_ilock(ip, lock_flags);
 
 	xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
-	XFS_STATS_INC(xs_ig_found);
+	XFS_STATS_INC(mp, xs_ig_found);
 
 	return 0;
 
@@ -342,7 +342,7 @@ xfs_iget_cache_miss(
 	error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
 	if (unlikely(error)) {
 		WARN_ON(error != -EEXIST);
-		XFS_STATS_INC(xs_ig_dup);
+		XFS_STATS_INC(mp, xs_ig_dup);
 		error = -EAGAIN;
 		goto out_preload_end;
 	}
@@ -412,7 +412,7 @@ xfs_iget(
 	if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
 		return -EINVAL;
 
-	XFS_STATS_INC(xs_ig_attempts);
+	XFS_STATS_INC(mp, xs_ig_attempts);
 
 	/* get the perag structure and ensure that it's inode capable */
 	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
@@ -429,7 +429,7 @@ again:
 			goto out_error_or_again;
 	} else {
 		rcu_read_unlock();
-		XFS_STATS_INC(xs_ig_missed);
+		XFS_STATS_INC(mp, xs_ig_missed);
 
 		error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
 							flags, lock_flags);
@@ -965,7 +965,7 @@ reclaim:
 	xfs_ifunlock(ip);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 
-	XFS_STATS_INC(xs_ig_reclaims);
+	XFS_STATS_INC(ip->i_mount, xs_ig_reclaims);
 	/*
 	 * Remove the inode from the per-AG radix tree.
 	 *
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index dc40a6d..a0f2bae 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3271,8 +3271,8 @@ xfs_iflush_cluster(
 	}
 
 	if (clcount) {
-		XFS_STATS_INC(xs_icluster_flushcnt);
-		XFS_STATS_ADD(xs_icluster_flushinode, clcount);
+		XFS_STATS_INC(mp, xs_icluster_flushcnt);
+		XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
 	}
 
 out_free:
@@ -3345,7 +3345,7 @@ xfs_iflush(
 	struct xfs_dinode	*dip;
 	int			error;
 
-	XFS_STATS_INC(xs_iflush_count);
+	XFS_STATS_INC(mp, xs_iflush_count);
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
 	ASSERT(xfs_isiflocked(ip));
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index ea7d85a..b67a130 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1028,7 +1028,7 @@ xfs_ioctl_setattr_xflags(
 	xfs_diflags_to_linux(ip);
 	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-	XFS_STATS_INC(xs_ig_attrchg);
+	XFS_STATS_INC(mp, xs_ig_attrchg);
 	return 0;
 }
 
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 1f86033..dca69c6 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -670,7 +670,7 @@ xfs_iomap_write_allocate(
 	count_fsb = imap->br_blockcount;
 	map_start_fsb = imap->br_startoff;
 
-	XFS_STATS_ADD(xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb));
+	XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb));
 
 	while (count_fsb != 0) {
 		/*
@@ -777,7 +777,7 @@ xfs_iomap_write_allocate(
 		if ((offset_fsb >= imap->br_startoff) &&
 		    (offset_fsb < (imap->br_startoff +
 				   imap->br_blockcount))) {
-			XFS_STATS_INC(xs_xstrat_quick);
+			XFS_STATS_INC(mp, xs_xstrat_quick);
 			return 0;
 		}
 
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 8294132..245268a 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -695,7 +695,7 @@ xfs_setattr_nonsize(
 
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 
-	XFS_STATS_INC(xs_ig_attrchg);
+	XFS_STATS_INC(mp, xs_ig_attrchg);
 
 	if (mp->m_flags & XFS_MOUNT_WSYNC)
 		xfs_trans_set_sync(tp);
@@ -922,7 +922,7 @@ xfs_setattr_size(
 
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 
-	XFS_STATS_INC(xs_ig_attrchg);
+	XFS_STATS_INC(mp, xs_ig_attrchg);
 
 	if (mp->m_flags & XFS_MOUNT_WSYNC)
 		xfs_trans_set_sync(tp);
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index aaadee0..4012523 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -268,7 +268,7 @@ xlog_grant_head_wait(
 		__set_current_state(TASK_UNINTERRUPTIBLE);
 		spin_unlock(&head->lock);
 
-		XFS_STATS_INC(xs_sleep_logspace);
+		XFS_STATS_INC(log->l_mp, xs_sleep_logspace);
 
 		trace_xfs_log_grant_sleep(log, tic);
 		schedule();
@@ -379,7 +379,7 @@ xfs_log_regrant(
 	if (XLOG_FORCED_SHUTDOWN(log))
 		return -EIO;
 
-	XFS_STATS_INC(xs_try_logspace);
+	XFS_STATS_INC(mp, xs_try_logspace);
 
 	/*
 	 * This is a new transaction on the ticket, so we need to change the
@@ -448,7 +448,7 @@ xfs_log_reserve(
 	if (XLOG_FORCED_SHUTDOWN(log))
 		return -EIO;
 
-	XFS_STATS_INC(xs_try_logspace);
+	XFS_STATS_INC(mp, xs_try_logspace);
 
 	ASSERT(*ticp == NULL);
 	tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent,
@@ -1768,7 +1768,7 @@ xlog_sync(
 	int		v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb);
 	int		size;
 
-	XFS_STATS_INC(xs_log_writes);
+	XFS_STATS_INC(log->l_mp, xs_log_writes);
 	ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
 
 	/* Add for LR header */
@@ -1805,7 +1805,7 @@ xlog_sync(
 	bp = iclog->ic_bp;
 	XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn)));
 
-	XFS_STATS_ADD(xs_log_blocks, BTOBB(count));
+	XFS_STATS_ADD(log->l_mp, xs_log_blocks, BTOBB(count));
 
 	/* Do we need to split this write into 2 parts? */
 	if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) {
@@ -2913,7 +2913,7 @@ restart:
 
 	iclog = log->l_iclog;
 	if (iclog->ic_state != XLOG_STATE_ACTIVE) {
-		XFS_STATS_INC(xs_log_noiclogs);
+		XFS_STATS_INC(log->l_mp, xs_log_noiclogs);
 
 		/* Wait for log writes to have flushed */
 		xlog_wait(&log->l_flush_wait, &log->l_icloglock);
@@ -3212,7 +3212,7 @@ _xfs_log_force(
 	struct xlog_in_core	*iclog;
 	xfs_lsn_t		lsn;
 
-	XFS_STATS_INC(xs_log_force);
+	XFS_STATS_INC(mp, xs_log_force);
 
 	xlog_cil_force(log);
 
@@ -3297,7 +3297,7 @@ maybe_sleep:
 			spin_unlock(&log->l_icloglock);
 			return -EIO;
 		}
-		XFS_STATS_INC(xs_log_force_sleep);
+		XFS_STATS_INC(mp, xs_log_force_sleep);
 		xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
 		/*
 		 * No need to grab the log lock here since we're
@@ -3362,7 +3362,7 @@ _xfs_log_force_lsn(
 
 	ASSERT(lsn != 0);
 
-	XFS_STATS_INC(xs_log_force);
+	XFS_STATS_INC(mp, xs_log_force);
 
 	lsn = xlog_cil_force_lsn(log, lsn);
 	if (lsn == NULLCOMMITLSN)
@@ -3411,7 +3411,7 @@ try_again:
 			     (XLOG_STATE_WANT_SYNC | XLOG_STATE_SYNCING))) {
 				ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR));
 
-				XFS_STATS_INC(xs_log_force_sleep);
+				XFS_STATS_INC(mp, xs_log_force_sleep);
 
 				xlog_wait(&iclog->ic_prev->ic_write_wait,
 							&log->l_icloglock);
@@ -3441,7 +3441,7 @@ try_again:
 				spin_unlock(&log->l_icloglock);
 				return -EIO;
 			}
-			XFS_STATS_INC(xs_log_force_sleep);
+			XFS_STATS_INC(mp, xs_log_force_sleep);
 			xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
 			/*
 			 * No need to grab the log lock here since we're
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index eac9549..7af7648 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -184,7 +184,7 @@ xfs_qm_dqpurge(
 	 */
 	ASSERT(!list_empty(&dqp->q_lru));
 	list_lru_del(&qi->qi_lru, &dqp->q_lru);
-	XFS_STATS_DEC(xs_qm_dquot_unused);
+	XFS_STATS_DEC(mp, xs_qm_dquot_unused);
 
 	xfs_qm_dqdestroy(dqp);
 	return 0;
@@ -448,11 +448,11 @@ xfs_qm_dquot_isolate(
 	 */
 	if (dqp->q_nrefs) {
 		xfs_dqunlock(dqp);
-		XFS_STATS_INC(xs_qm_dqwants);
+		XFS_STATS_INC(dqp->q_mount, xs_qm_dqwants);
 
 		trace_xfs_dqreclaim_want(dqp);
 		list_lru_isolate(lru, &dqp->q_lru);
-		XFS_STATS_DEC(xs_qm_dquot_unused);
+		XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused);
 		return LRU_REMOVED;
 	}
 
@@ -496,19 +496,19 @@ xfs_qm_dquot_isolate(
 
 	ASSERT(dqp->q_nrefs == 0);
 	list_lru_isolate_move(lru, &dqp->q_lru, &isol->dispose);
-	XFS_STATS_DEC(xs_qm_dquot_unused);
+	XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused);
 	trace_xfs_dqreclaim_done(dqp);
-	XFS_STATS_INC(xs_qm_dqreclaims);
+	XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaims);
 	return LRU_REMOVED;
 
 out_miss_busy:
 	trace_xfs_dqreclaim_busy(dqp);
-	XFS_STATS_INC(xs_qm_dqreclaim_misses);
+	XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses);
 	return LRU_SKIP;
 
 out_unlock_dirty:
 	trace_xfs_dqreclaim_busy(dqp);
-	XFS_STATS_INC(xs_qm_dqreclaim_misses);
+	XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses);
 	xfs_dqunlock(dqp);
 	spin_lock(lru_lock);
 	return LRU_RETRY;
diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h
index 54f2260..483b0ef 100644
--- a/fs/xfs/xfs_stats.h
+++ b/fs/xfs/xfs_stats.h
@@ -18,7 +18,6 @@
 #ifndef __XFS_STATS_H__
 #define __XFS_STATS_H__
 
-#if defined(CONFIG_PROC_FS) && !defined(XFS_STATS_OFF)
 
 #include <linux/percpu.h>
 
@@ -218,14 +217,25 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf);
 void xfs_stats_clearall(struct xfsstats __percpu *stats);
 extern struct xstats xfsstats;
 
-#define XFS_STATS_INC(v)	\
-	(per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v++)
+#define XFS_STATS_INC(mp, v)					\
+do {								\
+	per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v++;	\
+	per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v++;	\
+} while (0)
 
-#define XFS_STATS_DEC(v)	\
-	(per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v--)
+#define XFS_STATS_DEC(mp, v)					\
+do {								\
+	per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v--;	\
+	per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v--;	\
+} while (0)
 
-#define XFS_STATS_ADD(v, inc)	\
-	(per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v += (inc))
+#define XFS_STATS_ADD(mp, v, inc)					\
+do {									\
+	per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v += (inc);	\
+	per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v += (inc);	\
+} while (0)
+
+#if defined(CONFIG_PROC_FS)
 
 extern int xfs_init_procfs(void);
 extern void xfs_cleanup_procfs(void);
@@ -233,10 +243,6 @@ extern void xfs_cleanup_procfs(void);
 
 #else	/* !CONFIG_PROC_FS */
 
-# define XFS_STATS_INC(count)
-# define XFS_STATS_DEC(count)
-# define XFS_STATS_ADD(count, inc)
-
 static inline int xfs_init_procfs(void)
 {
 	return 0;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 9d11d3e..368c55a 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -922,7 +922,7 @@ xfs_fs_destroy_inode(
 
 	trace_xfs_destroy_inode(ip);
 
-	XFS_STATS_INC(vn_reclaim);
+	XFS_STATS_INC(ip->i_mount, vn_reclaim);
 
 	ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
 
@@ -983,8 +983,8 @@ xfs_fs_evict_inode(
 
 	truncate_inode_pages_final(&inode->i_data);
 	clear_inode(inode);
-	XFS_STATS_INC(vn_rele);
-	XFS_STATS_INC(vn_remove);
+	XFS_STATS_INC(ip->i_mount, vn_rele);
+	XFS_STATS_INC(ip->i_mount, vn_remove);
 
 	xfs_inactive(ip);
 }
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index a0ab1da..748b16a 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -930,9 +930,9 @@ __xfs_trans_commit(
 	 */
 	if (sync) {
 		error = _xfs_log_force_lsn(mp, commit_lsn, XFS_LOG_SYNC, NULL);
-		XFS_STATS_INC(xs_trans_sync);
+		XFS_STATS_INC(mp, xs_trans_sync);
 	} else {
-		XFS_STATS_INC(xs_trans_async);
+		XFS_STATS_INC(mp, xs_trans_async);
 	}
 
 	return error;
@@ -955,7 +955,7 @@ out_unreserve:
 	xfs_trans_free_items(tp, NULLCOMMITLSN, !!error);
 	xfs_trans_free(tp);
 
-	XFS_STATS_INC(xs_trans_empty);
+	XFS_STATS_INC(mp, xs_trans_empty);
 	return error;
 }
 
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 1098cf4..4f18fd9 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -349,7 +349,7 @@ xfsaild_push(
 	     xfs_ail_min_lsn(ailp))) {
 		ailp->xa_log_flush = 0;
 
-		XFS_STATS_INC(xs_push_ail_flush);
+		XFS_STATS_INC(mp, xs_push_ail_flush);
 		xfs_log_force(mp, XFS_LOG_SYNC);
 	}
 
@@ -371,7 +371,7 @@ xfsaild_push(
 		goto out_done;
 	}
 
-	XFS_STATS_INC(xs_push_ail);
+	XFS_STATS_INC(mp, xs_push_ail);
 
 	lsn = lip->li_lsn;
 	while ((XFS_LSN_CMP(lip->li_lsn, target) <= 0)) {
@@ -385,7 +385,7 @@ xfsaild_push(
 		lock_result = lip->li_ops->iop_push(lip, &ailp->xa_buf_list);
 		switch (lock_result) {
 		case XFS_ITEM_SUCCESS:
-			XFS_STATS_INC(xs_push_ail_success);
+			XFS_STATS_INC(mp, xs_push_ail_success);
 			trace_xfs_ail_push(lip);
 
 			ailp->xa_last_pushed_lsn = lsn;
@@ -403,7 +403,7 @@ xfsaild_push(
 			 * re-try the flushing relatively soon if most of the
 			 * AIL is beeing flushed.
 			 */
-			XFS_STATS_INC(xs_push_ail_flushing);
+			XFS_STATS_INC(mp, xs_push_ail_flushing);
 			trace_xfs_ail_flushing(lip);
 
 			flushing++;
@@ -411,14 +411,14 @@ xfsaild_push(
 			break;
 
 		case XFS_ITEM_PINNED:
-			XFS_STATS_INC(xs_push_ail_pinned);
+			XFS_STATS_INC(mp, xs_push_ail_pinned);
 			trace_xfs_ail_pinned(lip);
 
 			stuck++;
 			ailp->xa_log_flush++;
 			break;
 		case XFS_ITEM_LOCKED:
-			XFS_STATS_INC(xs_push_ail_locked);
+			XFS_STATS_INC(mp, xs_push_ail_locked);
 			trace_xfs_ail_locked(lip);
 
 			stuck++;
-- 
cgit v1.1


From 9e92054e8e049f8f4c64d2c6961b2a7e3e13977f Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Mon, 12 Oct 2015 18:21:22 +1100
Subject: xfs: simplify /proc teardown & error handling

remove_proc_subtree() was added in 3.9, and can be
used to simplify our procfile creation error handling
and cleanup, removing the nested gotos.  It simply
removes fs/xfs and everything created under it.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_stats.c | 26 +++++++-------------------
 1 file changed, 7 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index 3348b0b..bd50619 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -165,41 +165,29 @@ int
 xfs_init_procfs(void)
 {
 	if (!proc_mkdir("fs/xfs", NULL))
-		goto out;
+		return -ENOMEM;
 
 	if (!proc_symlink("fs/xfs/stat", NULL,
 			  "/sys/fs/xfs/stats/stats"))
-		goto out_remove_xfs_dir;
+		goto out;
 
 #ifdef CONFIG_XFS_QUOTA
 	if (!proc_create("fs/xfs/xqmstat", 0, NULL,
 			 &xqmstat_proc_fops))
-		goto out_remove_stat_file;
+		goto out;
 	if (!proc_create("fs/xfs/xqm", 0, NULL,
 			 &xqm_proc_fops))
-		goto out_remove_xqmstat_file;
+		goto out;
 #endif
 	return 0;
 
-#ifdef CONFIG_XFS_QUOTA
- out_remove_xqmstat_file:
-	remove_proc_entry("fs/xfs/xqmstat", NULL);
- out_remove_stat_file:
-	remove_proc_entry("fs/xfs/stat", NULL);
-#endif
- out_remove_xfs_dir:
-	remove_proc_entry("fs/xfs", NULL);
- out:
+out:
+	remove_proc_subtree("fs/xfs", NULL);
 	return -ENOMEM;
 }
 
 void
 xfs_cleanup_procfs(void)
 {
-#ifdef CONFIG_XFS_QUOTA
-	remove_proc_entry("fs/xfs/xqm", NULL);
-	remove_proc_entry("fs/xfs/xqmstat", NULL);
-#endif
-	remove_proc_entry("fs/xfs/stat", NULL);
-	remove_proc_entry("fs/xfs", NULL);
+	remove_proc_subtree("fs/xfs", NULL);
 }
-- 
cgit v1.1


From 0480334fa60488d12ae101a02d7d9e1a3d03d7dd Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 18 Sep 2015 11:45:12 +0100
Subject: ovl: use O_LARGEFILE in ovl_copy_up()

Open the lower file with O_LARGEFILE in ovl_copy_up().

Pass O_LARGEFILE unconditionally in ovl_copy_up_data() as it's purely for
catching 32-bit userspace dealing with a file large enough that it'll be
mishandled if the application isn't aware that there might be an integer
overflow.  Inside the kernel, there shouldn't be any problems.

Reported-by: Ulrich Obergfell <uobergfe@redhat.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Cc: <stable@vger.kernel.org> # v3.18+
---
 fs/overlayfs/copy_up.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 84d693d..b1990ac 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -81,11 +81,11 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
 	if (len == 0)
 		return 0;
 
-	old_file = ovl_path_open(old, O_RDONLY);
+	old_file = ovl_path_open(old, O_LARGEFILE | O_RDONLY);
 	if (IS_ERR(old_file))
 		return PTR_ERR(old_file);
 
-	new_file = ovl_path_open(new, O_WRONLY);
+	new_file = ovl_path_open(new, O_LARGEFILE | O_WRONLY);
 	if (IS_ERR(new_file)) {
 		error = PTR_ERR(new_file);
 		goto out_fput;
-- 
cgit v1.1


From ab79efab0a0ba01a74df782eb7fa44b044dae8b5 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 18 Sep 2015 11:45:22 +0100
Subject: ovl: fix dentry reference leak

In ovl_copy_up_locked(), newdentry is leaked if the function exits through
out_cleanup as this just to out after calling ovl_cleanup() - which doesn't
actually release the ref on newdentry.

The out_cleanup segment should instead exit through out2 as certainly
newdentry leaks - and possibly upper does also, though this isn't caught
given the catch of newdentry.

Without this fix, something like the following is seen:

	BUG: Dentry ffff880023e9eb20{i=f861,n=#ffff880023e82d90} still in use (1) [unmount of tmpfs tmpfs]
	BUG: Dentry ffff880023ece640{i=0,n=bigfile}  still in use (1) [unmount of tmpfs tmpfs]

when unmounting the upper layer after an error occurred in copyup.

An error can be induced by creating a big file in a lower layer with
something like:

	dd if=/dev/zero of=/lower/a/bigfile bs=65536 count=1 seek=$((0xf000))

to create a large file (4.1G).  Overlay an upper layer that is too small
(on tmpfs might do) and then induce a copy up by opening it writably.

Reported-by: Ulrich Obergfell <uobergfe@redhat.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Cc: <stable@vger.kernel.org> # v3.18+
---
 fs/overlayfs/copy_up.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index b1990ac..871fcb6 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -267,7 +267,7 @@ out:
 
 out_cleanup:
 	ovl_cleanup(wdir, newdentry);
-	goto out;
+	goto out2;
 }
 
 /*
-- 
cgit v1.1


From 1c8a47df36d72ace8cf78eb6c228aa0f8027d3c2 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Mon, 12 Oct 2015 15:56:20 +0200
Subject: ovl: fix open in stacked overlay

If two overlayfs filesystems are stacked on top of each other, then we need
recursion in ovl_d_select_inode().

I guess d_backing_inode() is supposed to do that.  But currently it doesn't
and that functionality is open coded in vfs_open().  This is now copied
into ovl_d_select_inode() to fix this regression.

Reported-by: Alban Crequy <alban.crequy@gmail.com>
Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Fixes: 4bacc9c9234c ("overlayfs: Make f_path always point to the overlay...")
Cc: David Howells <dhowells@redhat.com>
Cc: <stable@vger.kernel.org> # v4.2+
---
 fs/overlayfs/inode.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index d9da5a4..ec0c2a050 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -363,6 +363,9 @@ struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags)
 		ovl_path_upper(dentry, &realpath);
 	}
 
+	if (realpath.dentry->d_flags & DCACHE_OP_SELECT_INODE)
+		return realpath.dentry->d_op->d_select_inode(realpath.dentry, file_flags);
+
 	return d_backing_inode(realpath.dentry);
 }
 
-- 
cgit v1.1


From 0f95502ad84874b3c05fc7cdd9d4d9d5cddf7859 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Date: Mon, 24 Aug 2015 15:57:18 +0300
Subject: ovl: free stack of paths in ovl_fill_super

This fixes small memory leak after mount.

Kmemleak report:

unreferenced object 0xffff88003683fe00 (size 16):
  comm "mount", pid 2029, jiffies 4294909563 (age 33.380s)
  hex dump (first 16 bytes):
    20 27 1f bb 00 88 ff ff 40 4b 0f 36 02 88 ff ff   '......@K.6....
  backtrace:
    [<ffffffff811f8cd4>] create_object+0x124/0x2c0
    [<ffffffff817a059b>] kmemleak_alloc+0x7b/0xc0
    [<ffffffff811dffe6>] __kmalloc+0x106/0x340
    [<ffffffffa01b7a29>] ovl_fill_super+0x389/0x9a0 [overlay]
    [<ffffffff81200ac4>] mount_nodev+0x54/0xa0
    [<ffffffffa01b7118>] ovl_mount+0x18/0x20 [overlay]
    [<ffffffff81201ab3>] mount_fs+0x43/0x170
    [<ffffffff81220d34>] vfs_kern_mount+0x74/0x170
    [<ffffffff812233ad>] do_mount+0x22d/0xdf0
    [<ffffffff812242cb>] SyS_mount+0x7b/0xc0
    [<ffffffff817b6bee>] entry_SYSCALL_64_fastpath+0x12/0x76
    [<ffffffffffffffff>] 0xffffffffffffffff

Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Fixes: a78d9f0d5d5c ("ovl: support multiple lower layers")
Cc: <stable@vger.kernel.org> # v4.0+
---
 fs/overlayfs/super.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 7466ff3..3f90c43c 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -1048,6 +1048,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
 		oe->lowerstack[i].dentry = stack[i].dentry;
 		oe->lowerstack[i].mnt = ufs->lower_mnt[i];
 	}
+	kfree(stack);
 
 	root_dentry->d_fsdata = oe;
 
-- 
cgit v1.1


From 5ffdbe8bf1e485026e1c7e4714d2841553cf0b40 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Date: Mon, 24 Aug 2015 15:57:19 +0300
Subject: ovl: free lower_mnt array in ovl_put_super

This fixes memory leak after umount.

Kmemleak report:

unreferenced object 0xffff8800ba791010 (size 8):
  comm "mount", pid 2394, jiffies 4294996294 (age 53.920s)
  hex dump (first 8 bytes):
    20 1c 13 02 00 88 ff ff                           .......
  backtrace:
    [<ffffffff811f8cd4>] create_object+0x124/0x2c0
    [<ffffffff817a059b>] kmemleak_alloc+0x7b/0xc0
    [<ffffffff811dffe6>] __kmalloc+0x106/0x340
    [<ffffffffa0152bfc>] ovl_fill_super+0x55c/0x9b0 [overlay]
    [<ffffffff81200ac4>] mount_nodev+0x54/0xa0
    [<ffffffffa0152118>] ovl_mount+0x18/0x20 [overlay]
    [<ffffffff81201ab3>] mount_fs+0x43/0x170
    [<ffffffff81220d34>] vfs_kern_mount+0x74/0x170
    [<ffffffff812233ad>] do_mount+0x22d/0xdf0
    [<ffffffff812242cb>] SyS_mount+0x7b/0xc0
    [<ffffffff817b6bee>] entry_SYSCALL_64_fastpath+0x12/0x76
    [<ffffffffffffffff>] 0xffffffffffffffff

Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Fixes: dd662667e6d3 ("ovl: add mutli-layer infrastructure")
Cc: <stable@vger.kernel.org> # v4.0+
---
 fs/overlayfs/super.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 3f90c43c..8d04b86 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -544,6 +544,7 @@ static void ovl_put_super(struct super_block *sb)
 	mntput(ufs->upper_mnt);
 	for (i = 0; i < ufs->numlower; i++)
 		mntput(ufs->lower_mnt[i]);
+	kfree(ufs->lower_mnt);
 
 	kfree(ufs->config.lowerdir);
 	kfree(ufs->config.upperdir);
-- 
cgit v1.1


From 6fdf860f15d4a6be8f0947bad608d687fe0c7af7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 29 Sep 2015 12:47:51 -0400
Subject: writeback: fix bdi_writeback iteration in
 wakeup_dirtytime_writeback()

wakeup_dirtytime_writeback() walks and wakes up all wb's of all bdi's;
unfortunately, it was always waking up bdi->wb instead of the wb being
walked.  Fix it.

Signed-off-by: Tejun Heo <tj@kernel.org>
Fixes: 001fe6f617b1 ("writeback: make wakeup_dirtytime_writeback() handle multiple bdi_writeback's")
Reviewed-by: Jan Kara <jack@suse.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/fs-writeback.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 091a364..d0da306 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1897,8 +1897,8 @@ static void wakeup_dirtytime_writeback(struct work_struct *w)
 		struct wb_iter iter;
 
 		bdi_for_each_wb(wb, bdi, &iter, 0)
-			if (!list_empty(&bdi->wb.b_dirty_time))
-				wb_wakeup(&bdi->wb);
+			if (!list_empty(&wb->b_dirty_time))
+				wb_wakeup(wb);
 	}
 	rcu_read_unlock();
 	schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
-- 
cgit v1.1


From b817525a4a80c04e4ca44192d97a1ffa9f2be572 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 2 Oct 2015 14:47:05 -0400
Subject: writeback: bdi_writeback iteration must not skip dying ones

bdi_for_each_wb() is used in several places to wake up or issue
writeback work items to all wb's (bdi_writeback's) on a given bdi.
The iteration is performed by walking bdi->cgwb_tree; however, the
tree only indexes wb's which are currently active.

For example, when a memcg gets associated with a different blkcg, the
old wb is removed from the tree so that the new one can be indexed.
The old wb starts dying from then on but will linger till all its
inodes are drained.  As these dying wb's may still host dirty inodes,
writeback operations which affect all wb's must include them.
bdi_for_each_wb() skipping dying wb's led to sync(2) missing and
failing to sync the inodes belonging to those wb's.

This patch adds a RCU protected @bdi->wb_list which lists all wb's
beloinging to that bdi.  wb's are added on creation and removed on
release rather than on the start of destruction.  bdi_for_each_wb()
usages are replaced with list_for_each[_continue]_rcu() iterations
over @bdi->wb_list and bdi_for_each_wb() and its helpers are removed.

v2: Updated as per Jan.  last_wb ref leak in bdi_split_work_to_wbs()
    fixed and unnecessary list head severing in cgwb_bdi_destroy()
    removed.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Artem Bityutskiy <dedekind1@gmail.com>
Fixes: ebe41ab0c79d ("writeback: implement bdi_for_each_wb()")
Link: http://lkml.kernel.org/g/1443012552.19983.209.camel@gmail.com
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/fs-writeback.c | 31 ++++++++++++++++++++++---------
 1 file changed, 22 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index d0da306..29e4599 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -778,19 +778,24 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
 				  struct wb_writeback_work *base_work,
 				  bool skip_if_busy)
 {
-	int next_memcg_id = 0;
-	struct bdi_writeback *wb;
-	struct wb_iter iter;
+	struct bdi_writeback *last_wb = NULL;
+	struct bdi_writeback *wb = list_entry_rcu(&bdi->wb_list,
+						struct bdi_writeback, bdi_node);
 
 	might_sleep();
 restart:
 	rcu_read_lock();
-	bdi_for_each_wb(wb, bdi, &iter, next_memcg_id) {
+	list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
 		DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done);
 		struct wb_writeback_work fallback_work;
 		struct wb_writeback_work *work;
 		long nr_pages;
 
+		if (last_wb) {
+			wb_put(last_wb);
+			last_wb = NULL;
+		}
+
 		/* SYNC_ALL writes out I_DIRTY_TIME too */
 		if (!wb_has_dirty_io(wb) &&
 		    (base_work->sync_mode == WB_SYNC_NONE ||
@@ -819,12 +824,22 @@ restart:
 
 		wb_queue_work(wb, work);
 
-		next_memcg_id = wb->memcg_css->id + 1;
+		/*
+		 * Pin @wb so that it stays on @bdi->wb_list.  This allows
+		 * continuing iteration from @wb after dropping and
+		 * regrabbing rcu read lock.
+		 */
+		wb_get(wb);
+		last_wb = wb;
+
 		rcu_read_unlock();
 		wb_wait_for_completion(bdi, &fallback_work_done);
 		goto restart;
 	}
 	rcu_read_unlock();
+
+	if (last_wb)
+		wb_put(last_wb);
 }
 
 #else	/* CONFIG_CGROUP_WRITEBACK */
@@ -1857,12 +1872,11 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
 	rcu_read_lock();
 	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
 		struct bdi_writeback *wb;
-		struct wb_iter iter;
 
 		if (!bdi_has_dirty_io(bdi))
 			continue;
 
-		bdi_for_each_wb(wb, bdi, &iter, 0)
+		list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
 			wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages),
 					   false, reason);
 	}
@@ -1894,9 +1908,8 @@ static void wakeup_dirtytime_writeback(struct work_struct *w)
 	rcu_read_lock();
 	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
 		struct bdi_writeback *wb;
-		struct wb_iter iter;
 
-		bdi_for_each_wb(wb, bdi, &iter, 0)
+		list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
 			if (!list_empty(&wb->b_dirty_time))
 				wb_wakeup(wb);
 	}
-- 
cgit v1.1


From c912a8298c16ef15aa2b7203022c935f439f488b Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Wed, 7 Oct 2015 09:46:37 -0700
Subject: f2fs: add F2FS_GOING_DOWN_METAFLUSH to test power-failure

This patch introduces F2FS_GOING_DOWN_METAFLUSH which flushes meta pages like
SSA blocks and then blocks all the writes.
This can be used by power-failure tests.

Reviewed-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/f2fs.h | 1 +
 fs/f2fs/file.c | 4 ++++
 2 files changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index aad4720..f05ae22 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -250,6 +250,7 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
 #define F2FS_GOING_DOWN_FULLSYNC	0x0	/* going down with full sync */
 #define F2FS_GOING_DOWN_METASYNC	0x1	/* going down with metadata */
 #define F2FS_GOING_DOWN_NOSYNC		0x2	/* going down */
+#define F2FS_GOING_DOWN_METAFLUSH	0x3	/* going down with meta flush */
 
 #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /*
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index b3985a6..6d3cfd5 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -1498,6 +1498,10 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
 	case F2FS_GOING_DOWN_NOSYNC:
 		f2fs_stop_checkpoint(sbi);
 		break;
+	case F2FS_GOING_DOWN_METAFLUSH:
+		sync_meta_pages(sbi, META, LONG_MAX);
+		f2fs_stop_checkpoint(sbi);
+		break;
 	default:
 		return -EINVAL;
 	}
-- 
cgit v1.1


From a125702326d9c3b753fe9c9b9727d3b3dd1cba4a Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Thu, 8 Oct 2015 10:40:07 -0700
Subject: Revert "f2fs: do not skip dentry block writes"

The periodic checkpoint can resolve the previous issue.
So, now we can use this again to improve the reported performance regression:

https://lkml.org/lkml/2015/10/8/20

This reverts commit 15bec0ff5a9ba6d203178fa8772259df6207942a.
---
 fs/f2fs/data.c    | 5 +++++
 fs/f2fs/node.c    | 5 +++++
 fs/f2fs/node.h    | 1 +
 fs/f2fs/segment.h | 4 +++-
 4 files changed, 14 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index a903423..bc04e92 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1340,6 +1340,11 @@ static int f2fs_write_data_pages(struct address_space *mapping,
 	if (!get_dirty_pages(inode) && wbc->sync_mode == WB_SYNC_NONE)
 		return 0;
 
+	if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE &&
+			get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) &&
+			available_free_memory(sbi, DIRTY_DENTS))
+		goto skip_write;
+
 	/* during POR, we don't need to trigger writepage at all. */
 	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
 		goto skip_write;
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 1fe49ca..4d9bedf 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -52,6 +52,11 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
 		mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >>
 							PAGE_CACHE_SHIFT;
 		res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
+	} else if (type == DIRTY_DENTS) {
+		if (sbi->sb->s_bdi->wb.dirty_exceeded)
+			return false;
+		mem_size = get_pages(sbi, F2FS_DIRTY_DENTS);
+		res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
 	} else if (type == INO_ENTRIES) {
 		int i;
 
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 51c62ed..7427e95 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -118,6 +118,7 @@ static inline void raw_nat_from_node_info(struct f2fs_nat_entry *raw_ne,
 enum mem_type {
 	FREE_NIDS,	/* indicates the free nid list */
 	NAT_ENTRIES,	/* indicates the cached nat entry */
+	DIRTY_DENTS,	/* indicates dirty dentry pages */
 	INO_ENTRIES,	/* indicates inode entries */
 	EXTENT_CACHE,	/* indicates extent cache */
 	BASE_CHECK,	/* check kernel status */
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index e9afb58..ee44d34 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -699,7 +699,9 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type)
 	if (sbi->sb->s_bdi->wb.dirty_exceeded)
 		return 0;
 
-	if (type == NODE)
+	if (type == DATA)
+		return sbi->blocks_per_seg;
+	else if (type == NODE)
 		return 3 * sbi->blocks_per_seg;
 	else if (type == META)
 		return MAX_BIO_BLOCKS(sbi);
-- 
cgit v1.1


From 6e2c64ad7cebf8740c5e1241de374c6b6ea80f81 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Wed, 7 Oct 2015 12:28:41 -0700
Subject: f2fs: fix SSA updates resulting in corruption

The f2fs_collapse_range and f2fs_insert_range changes the block addresses
directly. But that can cause uncovered SSA updates.
In that case, we need to give up to change the block addresses and do buffered
writes to keep filesystem consistency.

Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/f2fs.h    |  11 +++
 fs/f2fs/file.c    | 205 +++++++++++++++++++++++++-----------------------------
 fs/f2fs/segment.c |  33 ++++++++-
 3 files changed, 136 insertions(+), 113 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index f05ae22..6f2310c 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1233,6 +1233,16 @@ static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi)
 	return sbi->total_valid_inode_count;
 }
 
+static inline void f2fs_copy_page(struct page *src, struct page *dst)
+{
+	char *src_kaddr = kmap(src);
+	char *dst_kaddr = kmap(dst);
+
+	memcpy(dst_kaddr, src_kaddr, PAGE_SIZE);
+	kunmap(dst);
+	kunmap(src);
+}
+
 static inline void f2fs_put_page(struct page *page, int unlock)
 {
 	if (!page)
@@ -1754,6 +1764,7 @@ int f2fs_issue_flush(struct f2fs_sb_info *);
 int create_flush_cmd_control(struct f2fs_sb_info *);
 void destroy_flush_cmd_control(struct f2fs_sb_info *);
 void invalidate_blocks(struct f2fs_sb_info *, block_t);
+bool is_checkpointed_data(struct f2fs_sb_info *, block_t);
 void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t);
 void clear_prefree_segments(struct f2fs_sb_info *, struct cp_control *);
 void release_discard_addrs(struct f2fs_sb_info *);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 6d3cfd5..8d5583b 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -826,86 +826,100 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
 	return ret;
 }
 
-static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end)
+static int __exchange_data_block(struct inode *inode, pgoff_t src,
+					pgoff_t dst, bool full)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct dnode_of_data dn;
-	pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE;
-	int ret = 0;
-
-	for (; end < nrpages; start++, end++) {
-		block_t new_addr, old_addr;
-
-		f2fs_lock_op(sbi);
+	block_t new_addr;
+	bool do_replace = false;
+	int ret;
 
-		set_new_dnode(&dn, inode, NULL, NULL, 0);
-		ret = get_dnode_of_data(&dn, end, LOOKUP_NODE_RA);
-		if (ret && ret != -ENOENT) {
-			goto out;
-		} else if (ret == -ENOENT) {
-			new_addr = NULL_ADDR;
-		} else {
-			new_addr = dn.data_blkaddr;
-			truncate_data_blocks_range(&dn, 1);
-			f2fs_put_dnode(&dn);
+	set_new_dnode(&dn, inode, NULL, NULL, 0);
+	ret = get_dnode_of_data(&dn, src, LOOKUP_NODE_RA);
+	if (ret && ret != -ENOENT) {
+		return ret;
+	} else if (ret == -ENOENT) {
+		new_addr = NULL_ADDR;
+	} else {
+		new_addr = dn.data_blkaddr;
+		if (!is_checkpointed_data(sbi, new_addr)) {
+			dn.data_blkaddr = NULL_ADDR;
+			/* do not invalidate this block address */
+			set_data_blkaddr(&dn);
+			f2fs_update_extent_cache(&dn);
+			do_replace = true;
 		}
+		f2fs_put_dnode(&dn);
+	}
 
-		if (new_addr == NULL_ADDR) {
-			set_new_dnode(&dn, inode, NULL, NULL, 0);
-			ret = get_dnode_of_data(&dn, start, LOOKUP_NODE_RA);
-			if (ret && ret != -ENOENT) {
-				goto out;
-			} else if (ret == -ENOENT) {
-				f2fs_unlock_op(sbi);
-				continue;
-			}
+	if (new_addr == NULL_ADDR)
+		return full ? truncate_hole(inode, dst, dst + 1) : 0;
 
-			if (dn.data_blkaddr == NULL_ADDR) {
-				f2fs_put_dnode(&dn);
-				f2fs_unlock_op(sbi);
-				continue;
-			} else {
-				truncate_data_blocks_range(&dn, 1);
-			}
+	if (do_replace) {
+		struct page *ipage = get_node_page(sbi, inode->i_ino);
+		struct node_info ni;
 
-			f2fs_put_dnode(&dn);
-		} else {
-			struct page *ipage;
+		if (IS_ERR(ipage)) {
+			ret = PTR_ERR(ipage);
+			goto err_out;
+		}
 
-			ipage = get_node_page(sbi, inode->i_ino);
-			if (IS_ERR(ipage)) {
-				ret = PTR_ERR(ipage);
-				goto out;
-			}
+		set_new_dnode(&dn, inode, ipage, NULL, 0);
+		ret = f2fs_reserve_block(&dn, dst);
+		if (ret)
+			goto err_out;
 
-			set_new_dnode(&dn, inode, ipage, NULL, 0);
-			ret = f2fs_reserve_block(&dn, start);
-			if (ret)
-				goto out;
+		truncate_data_blocks_range(&dn, 1);
 
-			old_addr = dn.data_blkaddr;
-			if (old_addr != NEW_ADDR && new_addr == NEW_ADDR) {
-				dn.data_blkaddr = NULL_ADDR;
-				f2fs_update_extent_cache(&dn);
-				invalidate_blocks(sbi, old_addr);
+		get_node_info(sbi, dn.nid, &ni);
+		f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr,
+				ni.version, true);
+		f2fs_put_dnode(&dn);
+	} else {
+		struct page *psrc, *pdst;
+
+		psrc = get_lock_data_page(inode, src);
+		if (IS_ERR(psrc))
+			return PTR_ERR(psrc);
+		pdst = get_new_data_page(inode, NULL, dst, false);
+		if (IS_ERR(pdst)) {
+			f2fs_put_page(psrc, 1);
+			return PTR_ERR(pdst);
+		}
+		f2fs_copy_page(psrc, pdst);
+		set_page_dirty(pdst);
+		f2fs_put_page(pdst, 1);
+		f2fs_put_page(psrc, 1);
 
-				dn.data_blkaddr = new_addr;
-				set_data_blkaddr(&dn);
-			} else if (new_addr != NEW_ADDR) {
-				struct node_info ni;
+		return truncate_hole(inode, src, src + 1);
+	}
+	return 0;
 
-				get_node_info(sbi, dn.nid, &ni);
-				f2fs_replace_block(sbi, &dn, old_addr, new_addr,
-							ni.version, true);
-			}
+err_out:
+	if (!get_dnode_of_data(&dn, src, LOOKUP_NODE)) {
+		dn.data_blkaddr = new_addr;
+		set_data_blkaddr(&dn);
+		f2fs_update_extent_cache(&dn);
+		f2fs_put_dnode(&dn);
+	}
+	return ret;
+}
 
-			f2fs_put_dnode(&dn);
-		}
+static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE;
+	int ret = 0;
+
+	for (; end < nrpages; start++, end++) {
+		f2fs_balance_fs(sbi);
+		f2fs_lock_op(sbi);
+		ret = __exchange_data_block(inode, end, start, true);
 		f2fs_unlock_op(sbi);
+		if (ret)
+			break;
 	}
-	return 0;
-out:
-	f2fs_unlock_op(sbi);
 	return ret;
 }
 
@@ -944,7 +958,12 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
 	if (ret)
 		return ret;
 
+	/* write out all moved pages, if possible */
+	filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX);
+	truncate_pagecache(inode, offset);
+
 	new_size = i_size_read(inode) - len;
+	truncate_pagecache(inode, new_size);
 
 	ret = truncate_blocks(inode, new_size, true);
 	if (!ret)
@@ -1067,7 +1086,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	pgoff_t pg_start, pg_end, delta, nrpages, idx;
 	loff_t new_size;
-	int ret;
+	int ret = 0;
 
 	new_size = i_size_read(inode) + len;
 	if (new_size > inode->i_sb->s_maxbytes)
@@ -1105,57 +1124,19 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
 	nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE;
 
 	for (idx = nrpages - 1; idx >= pg_start && idx != -1; idx--) {
-		struct dnode_of_data dn;
-		struct page *ipage;
-		block_t new_addr, old_addr;
-
 		f2fs_lock_op(sbi);
-
-		set_new_dnode(&dn, inode, NULL, NULL, 0);
-		ret = get_dnode_of_data(&dn, idx, LOOKUP_NODE_RA);
-		if (ret && ret != -ENOENT) {
-			goto out;
-		} else if (ret == -ENOENT) {
-			goto next;
-		} else if (dn.data_blkaddr == NULL_ADDR) {
-			f2fs_put_dnode(&dn);
-			goto next;
-		} else {
-			new_addr = dn.data_blkaddr;
-			truncate_data_blocks_range(&dn, 1);
-			f2fs_put_dnode(&dn);
-		}
-
-		ipage = get_node_page(sbi, inode->i_ino);
-		if (IS_ERR(ipage)) {
-			ret = PTR_ERR(ipage);
-			goto out;
-		}
-
-		set_new_dnode(&dn, inode, ipage, NULL, 0);
-		ret = f2fs_reserve_block(&dn, idx + delta);
-		if (ret)
-			goto out;
-
-		old_addr = dn.data_blkaddr;
-		f2fs_bug_on(sbi, old_addr != NEW_ADDR);
-
-		if (new_addr != NEW_ADDR) {
-			struct node_info ni;
-
-			get_node_info(sbi, dn.nid, &ni);
-			f2fs_replace_block(sbi, &dn, old_addr, new_addr,
-							ni.version, true);
-		}
-		f2fs_put_dnode(&dn);
-next:
+		ret = __exchange_data_block(inode, idx, idx + delta, false);
 		f2fs_unlock_op(sbi);
+		if (ret)
+			break;
 	}
 
-	i_size_write(inode, new_size);
-	return 0;
-out:
-	f2fs_unlock_op(sbi);
+	/* write out all moved pages, if possible */
+	filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX);
+	truncate_pagecache(inode, offset);
+
+	if (!ret)
+		i_size_write(inode, new_size);
 	return ret;
 }
 
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 1d86a35..581a9af 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -768,6 +768,30 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
 	mutex_unlock(&sit_i->sentry_lock);
 }
 
+bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr)
+{
+	struct sit_info *sit_i = SIT_I(sbi);
+	unsigned int segno, offset;
+	struct seg_entry *se;
+	bool is_cp = false;
+
+	if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR)
+		return true;
+
+	mutex_lock(&sit_i->sentry_lock);
+
+	segno = GET_SEGNO(sbi, blkaddr);
+	se = get_seg_entry(sbi, segno);
+	offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
+
+	if (f2fs_test_bit(offset, se->ckpt_valid_map))
+		is_cp = true;
+
+	mutex_unlock(&sit_i->sentry_lock);
+
+	return is_cp;
+}
+
 /*
  * This function should be resided under the curseg_mutex lock
  */
@@ -1370,7 +1394,14 @@ static void __f2fs_replace_block(struct f2fs_sb_info *sbi,
 	curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr);
 	__add_sum_entry(sbi, type, sum);
 
-	refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
+	if (!recover_curseg)
+		update_sit_entry(sbi, new_blkaddr, 1);
+	if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO)
+		update_sit_entry(sbi, old_blkaddr, -1);
+
+	locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
+	locate_dirty_segment(sbi, GET_SEGNO(sbi, new_blkaddr));
+
 	locate_dirty_segment(sbi, old_cursegno);
 
 	if (recover_curseg) {
-- 
cgit v1.1


From a56c7c6fb3c60857c1335bcb8b914e6f65655486 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Fri, 9 Oct 2015 15:11:38 -0700
Subject: f2fs: set GFP_NOFS for grab_cache_page

For normal inodes, their pages are allocated with __GFP_FS, which can cause
filesystem calls when reclaiming memory.
This can incur a dead lock condition accordingly.

So, this patch addresses this problem by introducing
f2fs_grab_cache_page(.., bool for_write), which calls
grab_cache_page_write_begin() with AOP_FLAG_NOFS.

Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/data.c | 16 +++++++++-------
 fs/f2fs/dir.c  |  6 +++---
 fs/f2fs/f2fs.h | 12 ++++++++++--
 fs/f2fs/file.c |  6 +++---
 fs/f2fs/gc.c   |  6 +++---
 5 files changed, 28 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index bc04e92..d4f1c74 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -275,7 +275,8 @@ int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index)
 	return f2fs_reserve_block(dn, index);
 }
 
-struct page *get_read_data_page(struct inode *inode, pgoff_t index, int rw)
+struct page *get_read_data_page(struct inode *inode, pgoff_t index,
+						int rw, bool for_write)
 {
 	struct address_space *mapping = inode->i_mapping;
 	struct dnode_of_data dn;
@@ -292,7 +293,7 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index, int rw)
 	if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
 		return read_mapping_page(mapping, index, NULL);
 
-	page = grab_cache_page(mapping, index);
+	page = f2fs_grab_cache_page(mapping, index, for_write);
 	if (!page)
 		return ERR_PTR(-ENOMEM);
 
@@ -352,7 +353,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index)
 		return page;
 	f2fs_put_page(page, 0);
 
-	page = get_read_data_page(inode, index, READ_SYNC);
+	page = get_read_data_page(inode, index, READ_SYNC, false);
 	if (IS_ERR(page))
 		return page;
 
@@ -372,12 +373,13 @@ struct page *find_data_page(struct inode *inode, pgoff_t index)
  * Because, the callers, functions in dir.c and GC, should be able to know
  * whether this page exists or not.
  */
-struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
+struct page *get_lock_data_page(struct inode *inode, pgoff_t index,
+							bool for_write)
 {
 	struct address_space *mapping = inode->i_mapping;
 	struct page *page;
 repeat:
-	page = get_read_data_page(inode, index, READ_SYNC);
+	page = get_read_data_page(inode, index, READ_SYNC, for_write);
 	if (IS_ERR(page))
 		return page;
 
@@ -411,7 +413,7 @@ struct page *get_new_data_page(struct inode *inode,
 	struct dnode_of_data dn;
 	int err;
 repeat:
-	page = grab_cache_page(mapping, index);
+	page = f2fs_grab_cache_page(mapping, index, true);
 	if (!page) {
 		/*
 		 * before exiting, we should make sure ipage will be released
@@ -439,7 +441,7 @@ repeat:
 	} else {
 		f2fs_put_page(page, 1);
 
-		page = get_read_data_page(inode, index, READ_SYNC);
+		page = get_read_data_page(inode, index, READ_SYNC, true);
 		if (IS_ERR(page))
 			goto repeat;
 
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 6726c4a..7c1678b 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -258,7 +258,7 @@ struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p)
 	if (f2fs_has_inline_dentry(dir))
 		return f2fs_parent_inline_dir(dir, p);
 
-	page = get_lock_data_page(dir, 0);
+	page = get_lock_data_page(dir, 0, false);
 	if (IS_ERR(page))
 		return NULL;
 
@@ -740,7 +740,7 @@ bool f2fs_empty_dir(struct inode *dir)
 		return f2fs_empty_inline_dir(dir);
 
 	for (bidx = 0; bidx < nblock; bidx++) {
-		dentry_page = get_lock_data_page(dir, bidx);
+		dentry_page = get_lock_data_page(dir, bidx, false);
 		if (IS_ERR(dentry_page)) {
 			if (PTR_ERR(dentry_page) == -ENOENT)
 				continue;
@@ -854,7 +854,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
 				min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES));
 
 	for (; n < npages; n++) {
-		dentry_page = get_lock_data_page(inode, n);
+		dentry_page = get_lock_data_page(inode, n, false);
 		if (IS_ERR(dentry_page))
 			continue;
 
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 6f2310c..6ba5a59 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1233,6 +1233,14 @@ static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi)
 	return sbi->total_valid_inode_count;
 }
 
+static inline struct page *f2fs_grab_cache_page(struct address_space *mapping,
+						pgoff_t index, bool for_write)
+{
+	if (!for_write)
+		return grab_cache_page(mapping, index);
+	return grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
+}
+
 static inline void f2fs_copy_page(struct page *src, struct page *dst)
 {
 	char *src_kaddr = kmap(src);
@@ -1831,9 +1839,9 @@ void set_data_blkaddr(struct dnode_of_data *);
 int reserve_new_block(struct dnode_of_data *);
 int f2fs_get_block(struct dnode_of_data *, pgoff_t);
 int f2fs_reserve_block(struct dnode_of_data *, pgoff_t);
-struct page *get_read_data_page(struct inode *, pgoff_t, int);
+struct page *get_read_data_page(struct inode *, pgoff_t, int, bool);
 struct page *find_data_page(struct inode *, pgoff_t);
-struct page *get_lock_data_page(struct inode *, pgoff_t);
+struct page *get_lock_data_page(struct inode *, pgoff_t, bool);
 struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool);
 int do_write_data_page(struct f2fs_io_info *);
 int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 8d5583b..5d2a2ee 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -505,14 +505,14 @@ static int truncate_partial_data_page(struct inode *inode, u64 from,
 		return 0;
 
 	if (cache_only) {
-		page = grab_cache_page(mapping, index);
+		page = f2fs_grab_cache_page(mapping, index, false);
 		if (page && PageUptodate(page))
 			goto truncate_out;
 		f2fs_put_page(page, 1);
 		return 0;
 	}
 
-	page = get_lock_data_page(inode, index);
+	page = get_lock_data_page(inode, index, true);
 	if (IS_ERR(page))
 		return 0;
 truncate_out:
@@ -879,7 +879,7 @@ static int __exchange_data_block(struct inode *inode, pgoff_t src,
 	} else {
 		struct page *psrc, *pdst;
 
-		psrc = get_lock_data_page(inode, src);
+		psrc = get_lock_data_page(inode, src, true);
 		if (IS_ERR(psrc))
 			return PTR_ERR(psrc);
 		pdst = get_new_data_page(inode, NULL, dst, false);
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index e7cec86..94278db 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -550,7 +550,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx)
 	int err;
 
 	/* do not read out */
-	page = grab_cache_page(inode->i_mapping, bidx);
+	page = f2fs_grab_cache_page(inode->i_mapping, bidx, false);
 	if (!page)
 		return;
 
@@ -620,7 +620,7 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type)
 {
 	struct page *page;
 
-	page = get_lock_data_page(inode, bidx);
+	page = get_lock_data_page(inode, bidx, true);
 	if (IS_ERR(page))
 		return;
 
@@ -714,7 +714,7 @@ next_step:
 
 			start_bidx = start_bidx_of_node(nofs, F2FS_I(inode));
 			data_page = get_read_data_page(inode,
-					start_bidx + ofs_in_node, READA);
+					start_bidx + ofs_in_node, READA, true);
 			if (IS_ERR(data_page)) {
 				iput(inode);
 				continue;
-- 
cgit v1.1


From b8c2940048adf4b2fcc5ae738f2bd4821ebf6a8a Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Mon, 12 Oct 2015 17:02:26 +0800
Subject: f2fs: add a tracepoint for f2fs_read_data_pages

This patch adds a tracepoint for f2fs_read_data_pages to trace when pages
are readahead by VFS.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/data.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index d4f1c74..2285376 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1032,6 +1032,9 @@ static int f2fs_read_data_pages(struct file *file,
 			struct list_head *pages, unsigned nr_pages)
 {
 	struct inode *inode = file->f_mapping->host;
+	struct page *page = list_entry(pages->prev, struct page, lru);
+
+	trace_f2fs_readpages(inode, page, nr_pages);
 
 	/* If the file has inline data, skip readpages */
 	if (f2fs_has_inline_data(inode))
-- 
cgit v1.1


From 2b947003fa98d5a39f3b21214380d0b1daf750b5 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Mon, 12 Oct 2015 17:04:21 +0800
Subject: f2fs: don't tag REQ_META for temporary non-meta pages

In recovery or checkpoint flow, we grab pages temperarily in meta inode's
mapping for caching temperary data, actually, datas in these pages were
not meta data of f2fs, but still we tag them with REQ_META flag. However,
lower device like eMMC may do some optimization for data of such type.
So in order to avoid wrong optimization, we'd better remove such flag
for temperary non-meta pages.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/checkpoint.c | 20 +++++++++++++++++++-
 fs/f2fs/f2fs.h       |  1 +
 fs/f2fs/node.c       |  2 +-
 fs/f2fs/recovery.c   |  4 ++--
 fs/f2fs/segment.c    |  3 +++
 5 files changed, 26 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 0a9ec42..60a9599 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -47,7 +47,8 @@ repeat:
 /*
  * We guarantee no failure on the returned page.
  */
-struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
+static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index,
+							bool is_meta)
 {
 	struct address_space *mapping = META_MAPPING(sbi);
 	struct page *page;
@@ -58,6 +59,9 @@ struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
 		.blk_addr = index,
 		.encrypted_page = NULL,
 	};
+
+	if (unlikely(!is_meta))
+		fio.rw &= ~REQ_META;
 repeat:
 	page = grab_cache_page(mapping, index);
 	if (!page) {
@@ -91,6 +95,17 @@ out:
 	return page;
 }
 
+struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
+{
+	return __get_meta_page(sbi, index, true);
+}
+
+/* for POR only */
+struct page *get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index)
+{
+	return __get_meta_page(sbi, index, false);
+}
+
 bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type)
 {
 	switch (type) {
@@ -137,6 +152,9 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type
 		.encrypted_page = NULL,
 	};
 
+	if (unlikely(type == META_POR))
+		fio.rw &= ~REQ_META;
+
 	for (; nrpages-- > 0; blkno++) {
 
 		if (!is_valid_blkaddr(sbi, blkno, type))
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 6ba5a59..6c0e83c 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1806,6 +1806,7 @@ void destroy_segment_manager_caches(void);
  */
 struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t);
 struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t);
+struct page *get_tmp_page(struct f2fs_sb_info *, pgoff_t);
 bool is_valid_blkaddr(struct f2fs_sb_info *, block_t, int);
 int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int);
 void ra_meta_pages_cond(struct f2fs_sb_info *, pgoff_t);
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 4d9bedf..c61dfb6 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1807,7 +1807,7 @@ int restore_node_summary(struct f2fs_sb_info *sbi,
 		ra_meta_pages(sbi, addr, nrpages, META_POR);
 
 		for (idx = addr; idx < addr + nrpages; idx++) {
-			struct page *page = get_meta_page(sbi, idx);
+			struct page *page = get_tmp_page(sbi, idx);
 
 			rn = F2FS_NODE(page);
 			sum_entry->nid = rn->footer.nid;
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index c5daec5..75dbc07 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -188,7 +188,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
 		if (!is_valid_blkaddr(sbi, blkaddr, META_POR))
 			return 0;
 
-		page = get_meta_page(sbi, blkaddr);
+		page = get_tmp_page(sbi, blkaddr);
 
 		if (cp_ver != cpver_of_node(page))
 			break;
@@ -480,7 +480,7 @@ static int recover_data(struct f2fs_sb_info *sbi,
 
 		ra_meta_pages_cond(sbi, blkaddr);
 
-		page = get_meta_page(sbi, blkaddr);
+		page = get_tmp_page(sbi, blkaddr);
 
 		if (cp_ver != cpver_of_node(page)) {
 			f2fs_put_page(page, 1);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 581a9af..13aa7a6 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -1317,6 +1317,9 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page)
 		.encrypted_page = NULL,
 	};
 
+	if (unlikely(page->index >= MAIN_BLKADDR(sbi)))
+		fio.rw &= ~REQ_META;
+
 	set_page_writeback(page);
 	f2fs_submit_page_mbio(&fio);
 }
-- 
cgit v1.1


From 26879fb101f28c554294eaf25ac7817a2825b180 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Mon, 12 Oct 2015 17:05:59 +0800
Subject: f2fs: support lower priority asynchronous readahead in ra_meta_pages

Now, we use ra_meta_pages to reads continuous physical blocks as much as
possible to improve performance of following reads. However, ra_meta_pages
uses a synchronous readahead approach by submitting bio with READ, as READ
is with high priority, it can not be used in the case of preloading blocks,
and it's not sure when these RAed pages will be used.

This patch supports asynchronous readahead in ra_meta_pages by tagging bio
with READA flag in order to allow preloading.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/checkpoint.c | 9 +++++----
 fs/f2fs/f2fs.h       | 2 +-
 fs/f2fs/gc.c         | 2 +-
 fs/f2fs/node.c       | 5 +++--
 fs/f2fs/recovery.c   | 2 +-
 fs/f2fs/segment.c    | 6 +++---
 6 files changed, 14 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 60a9599..f661d80 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -140,7 +140,8 @@ bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type)
 /*
  * Readahead CP/NAT/SIT/SSA pages
  */
-int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type)
+int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
+							int type, bool sync)
 {
 	block_t prev_blk_addr = 0;
 	struct page *page;
@@ -148,7 +149,7 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type
 	struct f2fs_io_info fio = {
 		.sbi = sbi,
 		.type = META,
-		.rw = READ_SYNC | REQ_META | REQ_PRIO,
+		.rw = sync ? (READ_SYNC | REQ_META | REQ_PRIO) : READA,
 		.encrypted_page = NULL,
 	};
 
@@ -214,7 +215,7 @@ void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index)
 	f2fs_put_page(page, 0);
 
 	if (readahead)
-		ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR);
+		ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR, true);
 }
 
 static int f2fs_write_meta_page(struct page *page,
@@ -521,7 +522,7 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi)
 	start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi);
 	orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi);
 
-	ra_meta_pages(sbi, start_blk, orphan_blocks, META_CP);
+	ra_meta_pages(sbi, start_blk, orphan_blocks, META_CP, true);
 
 	for (i = 0; i < orphan_blocks; i++) {
 		struct page *page = get_meta_page(sbi, start_blk + i);
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 6c0e83c..5f372c0 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1808,7 +1808,7 @@ struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t);
 struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t);
 struct page *get_tmp_page(struct f2fs_sb_info *, pgoff_t);
 bool is_valid_blkaddr(struct f2fs_sb_info *, block_t, int);
-int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int);
+int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int, bool);
 void ra_meta_pages_cond(struct f2fs_sb_info *, pgoff_t);
 long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long);
 void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type);
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 94278db..3f19634 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -840,7 +840,7 @@ gc_more:
 	/* readahead multi ssa blocks those have contiguous address */
 	if (sbi->segs_per_sec > 1)
 		ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), sbi->segs_per_sec,
-								META_SSA);
+							META_SSA, true);
 
 	for (i = 0; i < sbi->segs_per_sec; i++) {
 		/*
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index c61dfb6..ad98e35 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1529,7 +1529,8 @@ static void build_free_nids(struct f2fs_sb_info *sbi)
 		return;
 
 	/* readahead nat pages to be scanned */
-	ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT);
+	ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES,
+							META_NAT, true);
 
 	while (1) {
 		struct page *page = get_current_nat_page(sbi, nid);
@@ -1804,7 +1805,7 @@ int restore_node_summary(struct f2fs_sb_info *sbi,
 		nrpages = min(last_offset - i, bio_blocks);
 
 		/* readahead node pages */
-		ra_meta_pages(sbi, addr, nrpages, META_POR);
+		ra_meta_pages(sbi, addr, nrpages, META_POR, true);
 
 		for (idx = addr; idx < addr + nrpages; idx++) {
 			struct page *page = get_tmp_page(sbi, idx);
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 75dbc07..cbf74f4 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -180,7 +180,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
 	curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
 	blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
 
-	ra_meta_pages(sbi, blkaddr, 1, META_POR);
+	ra_meta_pages(sbi, blkaddr, 1, META_POR, true);
 
 	while (1) {
 		struct fsync_inode_entry *entry;
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 13aa7a6..5337fd6 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -1621,7 +1621,7 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi)
 
 		if (npages >= 2)
 			ra_meta_pages(sbi, start_sum_block(sbi), npages,
-								META_CP);
+							META_CP, true);
 
 		/* restore for compacted data summary */
 		if (read_compacted_summaries(sbi))
@@ -1631,7 +1631,7 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi)
 
 	if (__exist_node_summaries(sbi))
 		ra_meta_pages(sbi, sum_blk_addr(sbi, NR_CURSEG_TYPE, type),
-					NR_CURSEG_TYPE - type, META_CP);
+					NR_CURSEG_TYPE - type, META_CP, true);
 
 	for (; type <= CURSEG_COLD_NODE; type++) {
 		err = read_normal_summaries(sbi, type);
@@ -2118,7 +2118,7 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
 	int nrpages = MAX_BIO_BLOCKS(sbi);
 
 	do {
-		readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT);
+		readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT, true);
 
 		start = start_blk * sit_i->sents_per_block;
 		end = (start_blk + readed) * sit_i->sents_per_block;
-- 
cgit v1.1


From 2db2388fcf0d880bd7160264274ed873baec4f0d Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Mon, 12 Oct 2015 17:07:33 +0800
Subject: f2fs: readahead for free nids building

When there is no free nid in nid cache, all new node allocaters stop their
job to wait for reloading of free nids, however reloading is synchronous as
we will read 4 NAT pages for building nid cache, it cause the long latency.

This patch tries to readahead more NAT pages with READA request flag after
reloading of free nids. It helps to improve performance when users allocate
node id intensively.

Env: Sandisk 32G sd card
time for i in `seq 1 60000`; { echo -n > /mnt/f2fs/$i; echo XXXXXX > /mnt/f2fs/$i;}

Before:
real    0m2.814s
user    0m1.220s
sys     0m1.536s

After:
real    0m2.711s
user    0m1.136s
sys     0m1.568s

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/node.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index ad98e35..14f4606 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1560,6 +1560,9 @@ static void build_free_nids(struct f2fs_sb_info *sbi)
 			remove_free_nid(nm_i, nid);
 	}
 	mutex_unlock(&curseg->curseg_mutex);
+
+	ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid),
+					FREE_NID_PAGES, META_NAT, false);
 }
 
 /*
-- 
cgit v1.1


From ea1a29a0bdfffd56ca98335c0655308e8d7d0e22 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Mon, 12 Oct 2015 17:08:48 +0800
Subject: f2fs: export ra_nid_pages to sysfs

After finishing building free nid cache, we will try to readahead
asynchronously 4 more pages for the next reloading, the count of
readahead nid pages is fixed.

In some case, like SMR drive, read less sectors with fixed count
each time we trigger RA may be low efficient, since we will face
high seeking overhead, so we'd better let user to configure this
parameter from sysfs in specific workload.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/f2fs.h  | 1 +
 fs/f2fs/node.c  | 3 ++-
 fs/f2fs/node.h  | 4 +++-
 fs/f2fs/super.c | 2 ++
 4 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 5f372c0..94cf6bc 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -510,6 +510,7 @@ struct f2fs_nm_info {
 	nid_t available_nids;		/* maximum available node ids */
 	nid_t next_scan_nid;		/* the next nid to be scanned */
 	unsigned int ram_thresh;	/* control the memory footprint */
+	unsigned int ra_nid_pages;	/* # of nid pages to be readaheaded */
 
 	/* NAT cache management */
 	struct radix_tree_root nat_root;/* root of the nat entry cache */
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 14f4606..7bcbc6e 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1562,7 +1562,7 @@ static void build_free_nids(struct f2fs_sb_info *sbi)
 	mutex_unlock(&curseg->curseg_mutex);
 
 	ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid),
-					FREE_NID_PAGES, META_NAT, false);
+					nm_i->ra_nid_pages, META_NAT, false);
 }
 
 /*
@@ -2005,6 +2005,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
 	nm_i->fcnt = 0;
 	nm_i->nat_cnt = 0;
 	nm_i->ram_thresh = DEF_RAM_THRESHOLD;
+	nm_i->ra_nid_pages = DEF_RA_NID_PAGES;
 
 	INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
 	INIT_LIST_HEAD(&nm_i->free_nid_list);
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 7427e95..e4fffd2 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -14,9 +14,11 @@
 /* node block offset on the NAT area dedicated to the given start node id */
 #define	NAT_BLOCK_OFFSET(start_nid) (start_nid / NAT_ENTRY_PER_BLOCK)
 
-/* # of pages to perform readahead before building free nids */
+/* # of pages to perform synchronous readahead before building free nids */
 #define FREE_NID_PAGES 4
 
+#define DEF_RA_NID_PAGES	4	/* # of nid pages to be readaheaded */
+
 /* maximum readahead size for node during getting data blocks */
 #define MAX_RA_NODE		128
 
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index cb23d85..3a65e01 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -213,6 +213,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy);
 F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util);
 F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks);
 F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh);
+F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages);
 F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search);
 F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level);
 F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, cp_interval);
@@ -232,6 +233,7 @@ static struct attribute *f2fs_attrs[] = {
 	ATTR_LIST(max_victim_search),
 	ATTR_LIST(dir_level),
 	ATTR_LIST(ram_thresh),
+	ATTR_LIST(ra_nid_pages),
 	ATTR_LIST(cp_interval),
 	NULL,
 };
-- 
cgit v1.1


From 35a92fe8770ce54c5eb275cd76128645bea2d200 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@poochiereds.net>
Date: Thu, 17 Sep 2015 07:47:08 -0400
Subject: nfsd: serialize state seqid morphing operations

Andrew was seeing a race occur when an OPEN and OPEN_DOWNGRADE were
running in parallel. The server would receive the OPEN_DOWNGRADE first
and check its seqid, but then an OPEN would race in and bump it. The
OPEN_DOWNGRADE would then complete and bump the seqid again.  The result
was that the OPEN_DOWNGRADE would be applied after the OPEN, even though
it should have been rejected since the seqid changed.

The only recourse we have here I think is to serialize operations that
bump the seqid in a stateid, particularly when we're given a seqid in
the call. To address this, we add a new rw_semaphore to the
nfs4_ol_stateid struct. We do a down_write prior to checking the seqid
after looking up the stateid to ensure that nothing else is going to
bump it while we're operating on it.

In the case of OPEN, we do a down_read, as the call doesn't contain a
seqid. Those can run in parallel -- we just need to serialize them when
there is a concurrent OPEN_DOWNGRADE or CLOSE.

LOCK and LOCKU however always take the write lock as there is no
opportunity for parallelizing those.

Reported-and-Tested-by: Andrew W Elble <aweits@rit.edu>
Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
Cc: stable@vger.kernel.org
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 33 ++++++++++++++++++++++++++++-----
 fs/nfsd/state.h     | 19 ++++++++++---------
 2 files changed, 38 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 0f1d569..1b39edf 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -3360,6 +3360,7 @@ static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp,
 	stp->st_access_bmap = 0;
 	stp->st_deny_bmap = 0;
 	stp->st_openstp = NULL;
+	init_rwsem(&stp->st_rwsem);
 	spin_lock(&oo->oo_owner.so_client->cl_lock);
 	list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids);
 	spin_lock(&fp->fi_lock);
@@ -4187,15 +4188,20 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 	 */
 	if (stp) {
 		/* Stateid was found, this is an OPEN upgrade */
+		down_read(&stp->st_rwsem);
 		status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open);
-		if (status)
+		if (status) {
+			up_read(&stp->st_rwsem);
 			goto out;
+		}
 	} else {
 		stp = open->op_stp;
 		open->op_stp = NULL;
 		init_open_stateid(stp, fp, open);
+		down_read(&stp->st_rwsem);
 		status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open);
 		if (status) {
+			up_read(&stp->st_rwsem);
 			release_open_stateid(stp);
 			goto out;
 		}
@@ -4207,6 +4213,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 	}
 	update_stateid(&stp->st_stid.sc_stateid);
 	memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+	up_read(&stp->st_rwsem);
 
 	if (nfsd4_has_session(&resp->cstate)) {
 		if (open->op_deleg_want & NFS4_SHARE_WANT_NO_DELEG) {
@@ -4819,10 +4826,13 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_
 		 * revoked delegations are kept only for free_stateid.
 		 */
 		return nfserr_bad_stateid;
+	down_write(&stp->st_rwsem);
 	status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate));
-	if (status)
-		return status;
-	return nfs4_check_fh(current_fh, &stp->st_stid);
+	if (status == nfs_ok)
+		status = nfs4_check_fh(current_fh, &stp->st_stid);
+	if (status != nfs_ok)
+		up_write(&stp->st_rwsem);
+	return status;
 }
 
 /* 
@@ -4869,6 +4879,7 @@ static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cs
 		return status;
 	oo = openowner(stp->st_stateowner);
 	if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) {
+		up_write(&stp->st_rwsem);
 		nfs4_put_stid(&stp->st_stid);
 		return nfserr_bad_stateid;
 	}
@@ -4899,11 +4910,14 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		goto out;
 	oo = openowner(stp->st_stateowner);
 	status = nfserr_bad_stateid;
-	if (oo->oo_flags & NFS4_OO_CONFIRMED)
+	if (oo->oo_flags & NFS4_OO_CONFIRMED) {
+		up_write(&stp->st_rwsem);
 		goto put_stateid;
+	}
 	oo->oo_flags |= NFS4_OO_CONFIRMED;
 	update_stateid(&stp->st_stid.sc_stateid);
 	memcpy(&oc->oc_resp_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+	up_write(&stp->st_rwsem);
 	dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n",
 		__func__, oc->oc_seqid, STATEID_VAL(&stp->st_stid.sc_stateid));
 
@@ -4982,6 +4996,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
 	memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
 	status = nfs_ok;
 put_stateid:
+	up_write(&stp->st_rwsem);
 	nfs4_put_stid(&stp->st_stid);
 out:
 	nfsd4_bump_seqid(cstate, status);
@@ -5035,6 +5050,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		goto out; 
 	update_stateid(&stp->st_stid.sc_stateid);
 	memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+	up_write(&stp->st_rwsem);
 
 	nfsd4_close_open_stateid(stp);
 
@@ -5260,6 +5276,7 @@ init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo,
 	stp->st_access_bmap = 0;
 	stp->st_deny_bmap = open_stp->st_deny_bmap;
 	stp->st_openstp = open_stp;
+	init_rwsem(&stp->st_rwsem);
 	list_add(&stp->st_locks, &open_stp->st_locks);
 	list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids);
 	spin_lock(&fp->fi_lock);
@@ -5428,6 +5445,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 					&open_stp, nn);
 		if (status)
 			goto out;
+		up_write(&open_stp->st_rwsem);
 		open_sop = openowner(open_stp->st_stateowner);
 		status = nfserr_bad_stateid;
 		if (!same_clid(&open_sop->oo_owner.so_client->cl_clientid,
@@ -5435,6 +5453,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 			goto out;
 		status = lookup_or_create_lock_state(cstate, open_stp, lock,
 							&lock_stp, &new);
+		if (status == nfs_ok)
+			down_write(&lock_stp->st_rwsem);
 	} else {
 		status = nfs4_preprocess_seqid_op(cstate,
 				       lock->lk_old_lock_seqid,
@@ -5540,6 +5560,8 @@ out:
 		    seqid_mutating_err(ntohl(status)))
 			lock_sop->lo_owner.so_seqid++;
 
+		up_write(&lock_stp->st_rwsem);
+
 		/*
 		 * If this is a new, never-before-used stateid, and we are
 		 * returning an error, then just go ahead and release it.
@@ -5709,6 +5731,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 fput:
 	fput(filp);
 put_stateid:
+	up_write(&stp->st_rwsem);
 	nfs4_put_stid(&stp->st_stid);
 out:
 	nfsd4_bump_seqid(cstate, status);
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 583ffc1..31bde12 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -534,15 +534,16 @@ struct nfs4_file {
  * Better suggestions welcome.
  */
 struct nfs4_ol_stateid {
-	struct nfs4_stid    st_stid; /* must be first field */
-	struct list_head              st_perfile;
-	struct list_head              st_perstateowner;
-	struct list_head              st_locks;
-	struct nfs4_stateowner      * st_stateowner;
-	struct nfs4_clnt_odstate    * st_clnt_odstate;
-	unsigned char                 st_access_bmap;
-	unsigned char                 st_deny_bmap;
-	struct nfs4_ol_stateid         * st_openstp;
+	struct nfs4_stid		st_stid;
+	struct list_head		st_perfile;
+	struct list_head		st_perstateowner;
+	struct list_head		st_locks;
+	struct nfs4_stateowner		*st_stateowner;
+	struct nfs4_clnt_odstate	*st_clnt_odstate;
+	unsigned char			st_access_bmap;
+	unsigned char			st_deny_bmap;
+	struct nfs4_ol_stateid		*st_openstp;
+	struct rw_semaphore		st_rwsem;
 };
 
 static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
-- 
cgit v1.1


From e79017ddce8273514d24728523d17033bcbeba6f Mon Sep 17 00:00:00 2001
From: Julia Lawall <Julia.Lawall@lip6.fr>
Date: Sun, 13 Sep 2015 14:15:15 +0200
Subject: nfsd: drop null test before destroy functions

Remove unneeded NULL test.

The semantic patch that makes this change is as follows:
(http://coccinelle.lip6.fr/)

// <smpl>
@@ expression x; @@
-if (x != NULL) {
  \(kmem_cache_destroy\|mempool_destroy\|dma_pool_destroy\)(x);
  x = NULL;
-}
// </smpl>

Signed-off-by: Julia Lawall <Julia.Lawall@lip6.fr>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfscache.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 46ec934..116940c 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -217,10 +217,8 @@ void nfsd_reply_cache_shutdown(void)
 	drc_hashtbl = NULL;
 	drc_hashsize = 0;
 
-	if (drc_slab) {
-		kmem_cache_destroy(drc_slab);
-		drc_slab = NULL;
-	}
+	kmem_cache_destroy(drc_slab);
+	drc_slab = NULL;
 }
 
 /*
-- 
cgit v1.1


From fcaba026a55803dd21523e6e191ba7f59e02a737 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@poochiereds.net>
Date: Thu, 17 Sep 2015 08:28:38 -0400
Subject: nfsd: move svc_fh->fh_maxsize to just after fh_handle

This moves the hole in the struct down below the flags fields, which
allows us to potentially add a new flag without growing the struct.

Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfsfh.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 1e90dad..b1edd6b 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -26,9 +26,9 @@ static inline ino_t u32_to_ino_t(__u32 uino)
  */
 typedef struct svc_fh {
 	struct knfsd_fh		fh_handle;	/* FH data */
+	int			fh_maxsize;	/* max size for fh_handle */
 	struct dentry *		fh_dentry;	/* validated dentry */
 	struct svc_export *	fh_export;	/* export pointer */
-	int			fh_maxsize;	/* max size for fh_handle */
 
 	unsigned char		fh_locked;	/* inode locked by us */
 	unsigned char		fh_want_write;	/* remount protection taken */
-- 
cgit v1.1


From aaf91ec148910e0c2bfd135ea19f870e7196e64f Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@poochiereds.net>
Date: Thu, 17 Sep 2015 08:28:39 -0400
Subject: nfsd: switch unsigned char flags in svc_fh to bools

...just for clarity.

Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs3xdr.c |  4 ++--
 fs/nfsd/nfsfh.c   |  5 +----
 fs/nfsd/nfsfh.h   | 18 +++++++++---------
 fs/nfsd/vfs.c     |  4 ++--
 fs/nfsd/vfs.h     |  4 ++--
 fs/nfsd/xdr4.h    |  2 +-
 6 files changed, 17 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index f6e7cba..00575d7 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -262,11 +262,11 @@ void fill_post_wcc(struct svc_fh *fhp)
 	err = fh_getattr(fhp, &fhp->fh_post_attr);
 	fhp->fh_post_change = d_inode(fhp->fh_dentry)->i_version;
 	if (err) {
-		fhp->fh_post_saved = 0;
+		fhp->fh_post_saved = false;
 		/* Grab the ctime anyway - set_change_info might use it */
 		fhp->fh_post_attr.ctime = d_inode(fhp->fh_dentry)->i_ctime;
 	} else
-		fhp->fh_post_saved = 1;
+		fhp->fh_post_saved = true;
 }
 
 /*
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 350041a..c1681ce 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -631,10 +631,7 @@ fh_put(struct svc_fh *fhp)
 		fh_unlock(fhp);
 		fhp->fh_dentry = NULL;
 		dput(dentry);
-#ifdef CONFIG_NFSD_V3
-		fhp->fh_pre_saved = 0;
-		fhp->fh_post_saved = 0;
-#endif
+		fh_clear_wcc(fhp);
 	}
 	fh_drop_write(fhp);
 	if (exp) {
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index b1edd6b..2087bae 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -30,12 +30,12 @@ typedef struct svc_fh {
 	struct dentry *		fh_dentry;	/* validated dentry */
 	struct svc_export *	fh_export;	/* export pointer */
 
-	unsigned char		fh_locked;	/* inode locked by us */
-	unsigned char		fh_want_write;	/* remount protection taken */
+	bool			fh_locked;	/* inode locked by us */
+	bool			fh_want_write;	/* remount protection taken */
 
 #ifdef CONFIG_NFSD_V3
-	unsigned char		fh_post_saved;	/* post-op attrs saved */
-	unsigned char		fh_pre_saved;	/* pre-op attrs saved */
+	bool			fh_post_saved;	/* post-op attrs saved */
+	bool			fh_pre_saved;	/* pre-op attrs saved */
 
 	/* Pre-op attributes saved during fh_lock */
 	__u64			fh_pre_size;	/* size before operation */
@@ -213,8 +213,8 @@ static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
 static inline void
 fh_clear_wcc(struct svc_fh *fhp)
 {
-	fhp->fh_post_saved = 0;
-	fhp->fh_pre_saved = 0;
+	fhp->fh_post_saved = false;
+	fhp->fh_pre_saved = false;
 }
 
 /*
@@ -231,7 +231,7 @@ fill_pre_wcc(struct svc_fh *fhp)
 		fhp->fh_pre_ctime = inode->i_ctime;
 		fhp->fh_pre_size  = inode->i_size;
 		fhp->fh_pre_change = inode->i_version;
-		fhp->fh_pre_saved = 1;
+		fhp->fh_pre_saved = true;
 	}
 }
 
@@ -267,7 +267,7 @@ fh_lock_nested(struct svc_fh *fhp, unsigned int subclass)
 	inode = d_inode(dentry);
 	mutex_lock_nested(&inode->i_mutex, subclass);
 	fill_pre_wcc(fhp);
-	fhp->fh_locked = 1;
+	fhp->fh_locked = true;
 }
 
 static inline void
@@ -285,7 +285,7 @@ fh_unlock(struct svc_fh *fhp)
 	if (fhp->fh_locked) {
 		fill_post_wcc(fhp);
 		mutex_unlock(&d_inode(fhp->fh_dentry)->i_mutex);
-		fhp->fh_locked = 0;
+		fhp->fh_locked = false;
 	}
 }
 
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 45c0497..994d66f 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1631,7 +1631,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
 	/* cannot use fh_lock as we need deadlock protective ordering
 	 * so do it by hand */
 	trap = lock_rename(tdentry, fdentry);
-	ffhp->fh_locked = tfhp->fh_locked = 1;
+	ffhp->fh_locked = tfhp->fh_locked = true;
 	fill_pre_wcc(ffhp);
 	fill_pre_wcc(tfhp);
 
@@ -1681,7 +1681,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
 	fill_post_wcc(ffhp);
 	fill_post_wcc(tfhp);
 	unlock_rename(tdentry, fdentry);
-	ffhp->fh_locked = tfhp->fh_locked = 0;
+	ffhp->fh_locked = tfhp->fh_locked = false;
 	fh_drop_write(ffhp);
 
 out:
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index fee2451..fcfc48c 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -112,14 +112,14 @@ static inline int fh_want_write(struct svc_fh *fh)
 	int ret = mnt_want_write(fh->fh_export->ex_path.mnt);
 
 	if (!ret)
-		fh->fh_want_write = 1;
+		fh->fh_want_write = true;
 	return ret;
 }
 
 static inline void fh_drop_write(struct svc_fh *fh)
 {
 	if (fh->fh_want_write) {
-		fh->fh_want_write = 0;
+		fh->fh_want_write = false;
 		mnt_drop_write(fh->fh_export->ex_path.mnt);
 	}
 }
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 9f99100..ce7362c 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -632,7 +632,7 @@ static inline void
 set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp)
 {
 	BUG_ON(!fhp->fh_pre_saved);
-	cinfo->atomic = fhp->fh_post_saved;
+	cinfo->atomic = (u32)fhp->fh_post_saved;
 	cinfo->change_supported = IS_I_VERSION(d_inode(fhp->fh_dentry));
 
 	cinfo->before_change = fhp->fh_pre_change;
-- 
cgit v1.1


From 0ad95472bf169a3501991f8f33f5147f792a8116 Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <aryabinin@virtuozzo.com>
Date: Wed, 23 Sep 2015 15:49:29 +0300
Subject: lockd: create NSM handles per net namespace

Commit cb7323fffa85 ("lockd: create and use per-net NSM
 RPC clients on MON/UNMON requests") introduced per-net
NSM RPC clients. Unfortunately this doesn't make any sense
without per-net nsm_handle.

E.g. the following scenario could happen
Two hosts (X and Y) in different namespaces (A and B) share
the same nsm struct.

1. nsm_monitor(host_X) called => NSM rpc client created,
	nsm->sm_monitored bit set.
2. nsm_mointor(host-Y) called => nsm->sm_monitored already set,
	we just exit. Thus in namespace B ln->nsm_clnt == NULL.
3. host X destroyed => nsm->sm_count decremented to 1
4. host Y destroyed => nsm_unmonitor() => nsm_mon_unmon() => NULL-ptr
	dereference of *ln->nsm_clnt

So this could be fixed by making per-net nsm_handles list,
instead of global. Thus different net namespaces will not be able
share the same nsm_handle.

Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/lockd/host.c     |  7 ++++---
 fs/lockd/mon.c      | 36 ++++++++++++++++++++++--------------
 fs/lockd/netns.h    |  1 +
 fs/lockd/svc.c      |  1 +
 fs/lockd/svc4proc.c |  2 +-
 fs/lockd/svcproc.c  |  2 +-
 6 files changed, 30 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 969d589..b5f3c3ab 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -116,7 +116,7 @@ static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni,
 		atomic_inc(&nsm->sm_count);
 	else {
 		host = NULL;
-		nsm = nsm_get_handle(ni->sap, ni->salen,
+		nsm = nsm_get_handle(ni->net, ni->sap, ni->salen,
 					ni->hostname, ni->hostname_len);
 		if (unlikely(nsm == NULL)) {
 			dprintk("lockd: %s failed; no nsm handle\n",
@@ -534,17 +534,18 @@ static struct nlm_host *next_host_state(struct hlist_head *cache,
 
 /**
  * nlm_host_rebooted - Release all resources held by rebooted host
+ * @net:  network namespace
  * @info: pointer to decoded results of NLM_SM_NOTIFY call
  *
  * We were notified that the specified host has rebooted.  Release
  * all resources held by that peer.
  */
-void nlm_host_rebooted(const struct nlm_reboot *info)
+void nlm_host_rebooted(const struct net *net, const struct nlm_reboot *info)
 {
 	struct nsm_handle *nsm;
 	struct nlm_host	*host;
 
-	nsm = nsm_reboot_lookup(info);
+	nsm = nsm_reboot_lookup(net, info);
 	if (unlikely(nsm == NULL))
 		return;
 
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 47a32b6..6c05cd1 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -51,7 +51,6 @@ struct nsm_res {
 };
 
 static const struct rpc_program	nsm_program;
-static				LIST_HEAD(nsm_handles);
 static				DEFINE_SPINLOCK(nsm_lock);
 
 /*
@@ -264,33 +263,35 @@ void nsm_unmonitor(const struct nlm_host *host)
 	}
 }
 
-static struct nsm_handle *nsm_lookup_hostname(const char *hostname,
-					      const size_t len)
+static struct nsm_handle *nsm_lookup_hostname(const struct list_head *nsm_handles,
+					const char *hostname, const size_t len)
 {
 	struct nsm_handle *nsm;
 
-	list_for_each_entry(nsm, &nsm_handles, sm_link)
+	list_for_each_entry(nsm, nsm_handles, sm_link)
 		if (strlen(nsm->sm_name) == len &&
 		    memcmp(nsm->sm_name, hostname, len) == 0)
 			return nsm;
 	return NULL;
 }
 
-static struct nsm_handle *nsm_lookup_addr(const struct sockaddr *sap)
+static struct nsm_handle *nsm_lookup_addr(const struct list_head *nsm_handles,
+					const struct sockaddr *sap)
 {
 	struct nsm_handle *nsm;
 
-	list_for_each_entry(nsm, &nsm_handles, sm_link)
+	list_for_each_entry(nsm, nsm_handles, sm_link)
 		if (rpc_cmp_addr(nsm_addr(nsm), sap))
 			return nsm;
 	return NULL;
 }
 
-static struct nsm_handle *nsm_lookup_priv(const struct nsm_private *priv)
+static struct nsm_handle *nsm_lookup_priv(const struct list_head *nsm_handles,
+					const struct nsm_private *priv)
 {
 	struct nsm_handle *nsm;
 
-	list_for_each_entry(nsm, &nsm_handles, sm_link)
+	list_for_each_entry(nsm, nsm_handles, sm_link)
 		if (memcmp(nsm->sm_priv.data, priv->data,
 					sizeof(priv->data)) == 0)
 			return nsm;
@@ -353,6 +354,7 @@ static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap,
 
 /**
  * nsm_get_handle - Find or create a cached nsm_handle
+ * @net: network namespace
  * @sap: pointer to socket address of handle to find
  * @salen: length of socket address
  * @hostname: pointer to C string containing hostname to find
@@ -365,11 +367,13 @@ static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap,
  * @hostname cannot be found in the handle cache.  Returns NULL if
  * an error occurs.
  */
-struct nsm_handle *nsm_get_handle(const struct sockaddr *sap,
+struct nsm_handle *nsm_get_handle(const struct net *net,
+				  const struct sockaddr *sap,
 				  const size_t salen, const char *hostname,
 				  const size_t hostname_len)
 {
 	struct nsm_handle *cached, *new = NULL;
+	struct lockd_net *ln = net_generic(net, lockd_net_id);
 
 	if (hostname && memchr(hostname, '/', hostname_len) != NULL) {
 		if (printk_ratelimit()) {
@@ -384,9 +388,10 @@ retry:
 	spin_lock(&nsm_lock);
 
 	if (nsm_use_hostnames && hostname != NULL)
-		cached = nsm_lookup_hostname(hostname, hostname_len);
+		cached = nsm_lookup_hostname(&ln->nsm_handles,
+					hostname, hostname_len);
 	else
-		cached = nsm_lookup_addr(sap);
+		cached = nsm_lookup_addr(&ln->nsm_handles, sap);
 
 	if (cached != NULL) {
 		atomic_inc(&cached->sm_count);
@@ -400,7 +405,7 @@ retry:
 	}
 
 	if (new != NULL) {
-		list_add(&new->sm_link, &nsm_handles);
+		list_add(&new->sm_link, &ln->nsm_handles);
 		spin_unlock(&nsm_lock);
 		dprintk("lockd: created nsm_handle for %s (%s)\n",
 				new->sm_name, new->sm_addrbuf);
@@ -417,19 +422,22 @@ retry:
 
 /**
  * nsm_reboot_lookup - match NLMPROC_SM_NOTIFY arguments to an nsm_handle
+ * @net:  network namespace
  * @info: pointer to NLMPROC_SM_NOTIFY arguments
  *
  * Returns a matching nsm_handle if found in the nsm cache. The returned
  * nsm_handle's reference count is bumped. Otherwise returns NULL if some
  * error occurred.
  */
-struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info)
+struct nsm_handle *nsm_reboot_lookup(const struct net *net,
+				const struct nlm_reboot *info)
 {
 	struct nsm_handle *cached;
+	struct lockd_net *ln = net_generic(net, lockd_net_id);
 
 	spin_lock(&nsm_lock);
 
-	cached = nsm_lookup_priv(&info->priv);
+	cached = nsm_lookup_priv(&ln->nsm_handles, &info->priv);
 	if (unlikely(cached == NULL)) {
 		spin_unlock(&nsm_lock);
 		dprintk("lockd: never saw rebooted peer '%.*s' before\n",
diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h
index 097bfa3..89fe011b 100644
--- a/fs/lockd/netns.h
+++ b/fs/lockd/netns.h
@@ -15,6 +15,7 @@ struct lockd_net {
 	spinlock_t nsm_clnt_lock;
 	unsigned int nsm_users;
 	struct rpc_clnt *nsm_clnt;
+	struct list_head nsm_handles;
 };
 
 extern int lockd_net_id;
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index d678bcc..0dff13f 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -593,6 +593,7 @@ static int lockd_init_net(struct net *net)
 	INIT_LIST_HEAD(&ln->lockd_manager.list);
 	ln->lockd_manager.block_opens = false;
 	spin_lock_init(&ln->nsm_clnt_lock);
+	INIT_LIST_HEAD(&ln->nsm_handles);
 	return 0;
 }
 
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index b147d1a..09c576f 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -421,7 +421,7 @@ nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
 		return rpc_system_err;
 	}
 
-	nlm_host_rebooted(argp);
+	nlm_host_rebooted(SVC_NET(rqstp), argp);
 	return rpc_success;
 }
 
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 21171f0..fb26b9f 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -464,7 +464,7 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
 		return rpc_system_err;
 	}
 
-	nlm_host_rebooted(argp);
+	nlm_host_rebooted(SVC_NET(rqstp), argp);
 	return rpc_success;
 }
 
-- 
cgit v1.1


From d906d49fc5f49a0129527e8fbc13312f36b9b9ce Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Fri, 2 Oct 2015 10:47:34 +0100
Subject: Btrfs: send, fix file corruption due to incorrect cloning operations

If we have a file that shares an extent with other files, when processing
the extent item relative to a shared extent, we blindly issue a clone
operation that will target a length matching the length in the extent item
and uses as a source some other file the receiver already has and points
to the same extent. However that range in the other file might not
exclusively point only to the shared extent, and so using that length
will result in the receiver getting a file with different data from the
one in the send snapshot. This issue happened both for incremental and
full send operations.

So fix this by issuing clone operations with lengths that don't cover
regions of the source file that point to different extents (or have holes).

The following test case for fstests reproduces the problem.

  seq=`basename $0`
  seqres=$RESULT_DIR/$seq
  echo "QA output created by $seq"

  tmp=/tmp/$$
  status=1	# failure is the default!
  trap "_cleanup; exit \$status" 0 1 2 3 15

  _cleanup()
  {
      rm -fr $send_files_dir
      rm -f $tmp.*
  }

  # get standard environment, filters and checks
  . ./common/rc
  . ./common/filter

  # real QA test starts here
  _supported_fs btrfs
  _supported_os Linux
  _require_scratch
  _need_to_be_root
  _require_cp_reflink
  _require_xfs_io_command "fpunch"

  send_files_dir=$TEST_DIR/btrfs-test-$seq

  rm -f $seqres.full
  rm -fr $send_files_dir
  mkdir $send_files_dir

  _scratch_mkfs >>$seqres.full 2>&1
  _scratch_mount

  # Create our test file with a single 100K extent.
  $XFS_IO_PROG -f -c "pwrite -S 0xaa 0K 100K" \
     $SCRATCH_MNT/foo | _filter_xfs_io

  # Clone our file into a new file named bar.
  cp --reflink=always $SCRATCH_MNT/foo $SCRATCH_MNT/bar

  # Now overwrite parts of our foo file.
  $XFS_IO_PROG -c "pwrite -S 0xbb 50K 10K" \
     -c "pwrite -S 0xcc 90K 10K" \
     -c "fpunch 70K 10k" \
     $SCRATCH_MNT/foo | _filter_xfs_io

  _run_btrfs_util_prog subvolume snapshot -r $SCRATCH_MNT \
     $SCRATCH_MNT/snap

  echo "File digests in the original filesystem:"
  md5sum $SCRATCH_MNT/snap/foo | _filter_scratch
  md5sum $SCRATCH_MNT/snap/bar | _filter_scratch

  _run_btrfs_util_prog send $SCRATCH_MNT/snap -f $send_files_dir/1.snap

  # Now recreate the filesystem by receiving the send stream and verify
  # we get the same file contents that the original filesystem had.
  _scratch_unmount
  _scratch_mkfs >>$seqres.full 2>&1
  _scratch_mount

  _run_btrfs_util_prog receive $SCRATCH_MNT -f $send_files_dir/1.snap

  # We expect the destination filesystem to have exactly the same file
  # data as the original filesystem.
  # The btrfs send implementation had a bug where it sent a clone
  # operation from file foo into file bar covering the whole [0, 100K[
  # range after creating and writing the file foo. This was incorrect
  # because the file bar now included the updates done to file foo after
  # we cloned foo to bar, breaking the COW nature of reflink copies
  # (cloned extents).
  echo "File digests in the new filesystem:"
  md5sum $SCRATCH_MNT/snap/foo | _filter_scratch
  md5sum $SCRATCH_MNT/snap/bar | _filter_scratch

  status=0
  exit

Another test case that reproduces the problem when we have compressed
extents:

  seq=`basename $0`
  seqres=$RESULT_DIR/$seq
  echo "QA output created by $seq"

  tmp=/tmp/$$
  status=1	# failure is the default!
  trap "_cleanup; exit \$status" 0 1 2 3 15

  _cleanup()
  {
      rm -fr $send_files_dir
      rm -f $tmp.*
  }

  # get standard environment, filters and checks
  . ./common/rc
  . ./common/filter

  # real QA test starts here
  _supported_fs btrfs
  _supported_os Linux
  _require_scratch
  _need_to_be_root
  _require_cp_reflink

  send_files_dir=$TEST_DIR/btrfs-test-$seq

  rm -f $seqres.full
  rm -fr $send_files_dir
  mkdir $send_files_dir

  _scratch_mkfs >>$seqres.full 2>&1
  _scratch_mount "-o compress"

  # Create our file with an extent of 100K starting at file offset 0K.
  $XFS_IO_PROG -f -c "pwrite -S 0xaa 0K 100K"       \
                  -c "fsync"                        \
                  $SCRATCH_MNT/foo | _filter_xfs_io

  # Rewrite part of the previous extent (its first 40K) and write a new
  # 100K extent starting at file offset 100K.
  $XFS_IO_PROG -c "pwrite -S 0xbb 0K 40K"    \
          -c "pwrite -S 0xcc 100K 100K"      \
          $SCRATCH_MNT/foo | _filter_xfs_io

  # Our file foo now has 3 file extent items in its metadata:
  #
  # 1) One covering the file range 0 to 40K;
  # 2) One covering the file range 40K to 100K, which points to the first
  #    extent we wrote to the file and has a data offset field with value
  #    40K (our file no longer uses the first 40K of data from that
  #    extent);
  # 3) One covering the file range 100K to 200K.

  # Now clone our file foo into file bar.
  cp --reflink=always $SCRATCH_MNT/foo $SCRATCH_MNT/bar

  # Create our snapshot for the send operation.
  _run_btrfs_util_prog subvolume snapshot -r $SCRATCH_MNT \
          $SCRATCH_MNT/snap

  echo "File digests in the original filesystem:"
  md5sum $SCRATCH_MNT/snap/foo | _filter_scratch
  md5sum $SCRATCH_MNT/snap/bar | _filter_scratch

  _run_btrfs_util_prog send $SCRATCH_MNT/snap -f $send_files_dir/1.snap

  # Now recreate the filesystem by receiving the send stream and verify we
  # get the same file contents that the original filesystem had.
  # Btrfs send used to issue a clone operation from foo's range
  # [80K, 140K[ to bar's range [40K, 100K[ when cloning the extent pointed
  # to by foo's second file extent item, this was incorrect because of bad
  # accounting of the file extent item's data offset field. The correct
  # range to clone from should have been [40K, 100K[.
  _scratch_unmount
  _scratch_mkfs >>$seqres.full 2>&1
  _scratch_mount "-o compress"

  _run_btrfs_util_prog receive $SCRATCH_MNT -f $send_files_dir/1.snap

  echo "File digests in the new filesystem:"
  # Must match the digests we got in the original filesystem.
  md5sum $SCRATCH_MNT/snap/foo | _filter_scratch
  md5sum $SCRATCH_MNT/snap/bar | _filter_scratch

  status=0
  exit

Signed-off-by: Filipe Manana <fdmanana@suse.com>
---
 fs/btrfs/send.c | 200 ++++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 173 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index b5d47b9..ced935b 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1434,16 +1434,6 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
 	}
 
 	if (cur_clone_root) {
-		if (compressed != BTRFS_COMPRESS_NONE) {
-			/*
-			 * Offsets given by iterate_extent_inodes() are relative
-			 * to the start of the extent, we need to add logical
-			 * offset from the file extent item.
-			 * (See why at backref.c:check_extent_in_eb())
-			 */
-			cur_clone_root->offset += btrfs_file_extent_offset(eb,
-									   fi);
-		}
 		*found = cur_clone_root;
 		ret = 0;
 	} else {
@@ -4687,6 +4677,171 @@ tlv_put_failure:
 	return ret;
 }
 
+static int send_extent_data(struct send_ctx *sctx,
+			    const u64 offset,
+			    const u64 len)
+{
+	u64 sent = 0;
+
+	if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
+		return send_update_extent(sctx, offset, len);
+
+	while (sent < len) {
+		u64 size = len - sent;
+		int ret;
+
+		if (size > BTRFS_SEND_READ_SIZE)
+			size = BTRFS_SEND_READ_SIZE;
+		ret = send_write(sctx, offset + sent, size);
+		if (ret < 0)
+			return ret;
+		if (!ret)
+			break;
+		sent += ret;
+	}
+	return 0;
+}
+
+static int clone_range(struct send_ctx *sctx,
+		       struct clone_root *clone_root,
+		       const u64 disk_byte,
+		       u64 data_offset,
+		       u64 offset,
+		       u64 len)
+{
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	int ret;
+
+	path = alloc_path_for_send();
+	if (!path)
+		return -ENOMEM;
+
+	/*
+	 * We can't send a clone operation for the entire range if we find
+	 * extent items in the respective range in the source file that
+	 * refer to different extents or if we find holes.
+	 * So check for that and do a mix of clone and regular write/copy
+	 * operations if needed.
+	 *
+	 * Example:
+	 *
+	 * mkfs.btrfs -f /dev/sda
+	 * mount /dev/sda /mnt
+	 * xfs_io -f -c "pwrite -S 0xaa 0K 100K" /mnt/foo
+	 * cp --reflink=always /mnt/foo /mnt/bar
+	 * xfs_io -c "pwrite -S 0xbb 50K 50K" /mnt/foo
+	 * btrfs subvolume snapshot -r /mnt /mnt/snap
+	 *
+	 * If when we send the snapshot and we are processing file bar (which
+	 * has a higher inode number than foo) we blindly send a clone operation
+	 * for the [0, 100K[ range from foo to bar, the receiver ends up getting
+	 * a file bar that matches the content of file foo - iow, doesn't match
+	 * the content from bar in the original filesystem.
+	 */
+	key.objectid = clone_root->ino;
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = clone_root->offset;
+	ret = btrfs_search_slot(NULL, clone_root->root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+	if (ret > 0 && path->slots[0] > 0) {
+		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
+		if (key.objectid == clone_root->ino &&
+		    key.type == BTRFS_EXTENT_DATA_KEY)
+			path->slots[0]--;
+	}
+
+	while (true) {
+		struct extent_buffer *leaf = path->nodes[0];
+		int slot = path->slots[0];
+		struct btrfs_file_extent_item *ei;
+		u8 type;
+		u64 ext_len;
+		u64 clone_len;
+
+		if (slot >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(clone_root->root, path);
+			if (ret < 0)
+				goto out;
+			else if (ret > 0)
+				break;
+			continue;
+		}
+
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+
+		/*
+		 * We might have an implicit trailing hole (NO_HOLES feature
+		 * enabled). We deal with it after leaving this loop.
+		 */
+		if (key.objectid != clone_root->ino ||
+		    key.type != BTRFS_EXTENT_DATA_KEY)
+			break;
+
+		ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+		type = btrfs_file_extent_type(leaf, ei);
+		if (type == BTRFS_FILE_EXTENT_INLINE) {
+			ext_len = btrfs_file_extent_inline_len(leaf, slot, ei);
+			ext_len = PAGE_CACHE_ALIGN(ext_len);
+		} else {
+			ext_len = btrfs_file_extent_num_bytes(leaf, ei);
+		}
+
+		if (key.offset + ext_len <= clone_root->offset)
+			goto next;
+
+		if (key.offset > clone_root->offset) {
+			/* Implicit hole, NO_HOLES feature enabled. */
+			u64 hole_len = key.offset - clone_root->offset;
+
+			if (hole_len > len)
+				hole_len = len;
+			ret = send_extent_data(sctx, offset, hole_len);
+			if (ret < 0)
+				goto out;
+
+			len -= hole_len;
+			if (len == 0)
+				break;
+			offset += hole_len;
+			clone_root->offset += hole_len;
+			data_offset += hole_len;
+		}
+
+		if (key.offset >= clone_root->offset + len)
+			break;
+
+		clone_len = min_t(u64, ext_len, len);
+
+		if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte &&
+		    btrfs_file_extent_offset(leaf, ei) == data_offset)
+			ret = send_clone(sctx, offset, clone_len, clone_root);
+		else
+			ret = send_extent_data(sctx, offset, clone_len);
+
+		if (ret < 0)
+			goto out;
+
+		len -= clone_len;
+		if (len == 0)
+			break;
+		offset += clone_len;
+		clone_root->offset += clone_len;
+		data_offset += clone_len;
+next:
+		path->slots[0]++;
+	}
+
+	if (len > 0)
+		ret = send_extent_data(sctx, offset, len);
+	else
+		ret = 0;
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
 static int send_write_or_clone(struct send_ctx *sctx,
 			       struct btrfs_path *path,
 			       struct btrfs_key *key,
@@ -4695,9 +4850,7 @@ static int send_write_or_clone(struct send_ctx *sctx,
 	int ret = 0;
 	struct btrfs_file_extent_item *ei;
 	u64 offset = key->offset;
-	u64 pos = 0;
 	u64 len;
-	u32 l;
 	u8 type;
 	u64 bs = sctx->send_root->fs_info->sb->s_blocksize;
 
@@ -4725,22 +4878,15 @@ static int send_write_or_clone(struct send_ctx *sctx,
 	}
 
 	if (clone_root && IS_ALIGNED(offset + len, bs)) {
-		ret = send_clone(sctx, offset, len, clone_root);
-	} else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) {
-		ret = send_update_extent(sctx, offset, len);
+		u64 disk_byte;
+		u64 data_offset;
+
+		disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei);
+		data_offset = btrfs_file_extent_offset(path->nodes[0], ei);
+		ret = clone_range(sctx, clone_root, disk_byte, data_offset,
+				  offset, len);
 	} else {
-		while (pos < len) {
-			l = len - pos;
-			if (l > BTRFS_SEND_READ_SIZE)
-				l = BTRFS_SEND_READ_SIZE;
-			ret = send_write(sctx, pos + offset, l);
-			if (ret < 0)
-				goto out;
-			if (!ret)
-				break;
-			pos += ret;
-		}
-		ret = 0;
+		ret = send_extent_data(sctx, offset, len);
 	}
 out:
 	return ret;
-- 
cgit v1.1


From 08b39fbd59781729da9fb6367decaf4804a22721 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Thu, 8 Oct 2015 13:27:34 +0800
Subject: f2fs crypto: fix racing of accessing encrypted page among

 different competitors

Since we use different page cache (normally inode's page cache for R/W
and meta inode's page cache for GC) to cache the same physical block
which is belong to an encrypted inode. Writeback of these two page
cache should be exclusive, but now we didn't handle writeback state
well, so there may be potential racing problem:

a)
kworker:				f2fs_gc:
 - f2fs_write_data_pages
  - f2fs_write_data_page
   - do_write_data_page
    - write_data_page
     - f2fs_submit_page_mbio
(page#1 in inode's page cache was queued
in f2fs bio cache, and be ready to write
to new blkaddr)
					 - gc_data_segment
					  - move_encrypted_block
					   - pagecache_get_page
					(page#2 in meta inode's page cache
					was cached with the invalid datas
					of physical block located in new
					blkaddr)
					   - f2fs_submit_page_mbio
					(page#1 was submitted, later, page#2
					with invalid data will be submitted)

b)
f2fs_gc:
 - gc_data_segment
  - move_encrypted_block
   - f2fs_submit_page_mbio
(page#1 in meta inode's page cache was
queued in f2fs bio cache, and be ready
to write to new blkaddr)
					user thread:
					 - f2fs_write_begin
					  - f2fs_submit_page_bio
					(we submit the request to block layer
					to update page#2 in inode's page cache
					with physical block located in new
					blkaddr, so here we may read gabbage
					data from new blkaddr since GC hasn't
					writebacked the page#1 yet)

This patch fixes above potential racing problem for encrypted inode.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/data.c    | 20 +++++++++++---------
 fs/f2fs/f2fs.h    |  1 +
 fs/f2fs/file.c    |  5 +++++
 fs/f2fs/gc.c      | 12 ++++++++++--
 fs/f2fs/segment.c | 17 +++++++++++++++++
 5 files changed, 44 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 2285376..77dfc9e 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -956,21 +956,14 @@ submit_and_realloc:
 
 			if (f2fs_encrypted_inode(inode) &&
 					S_ISREG(inode->i_mode)) {
-				struct page *cpage;
 
 				ctx = f2fs_get_crypto_ctx(inode);
 				if (IS_ERR(ctx))
 					goto set_error_page;
 
 				/* wait the page to be moved by cleaning */
-				cpage = find_lock_page(
-						META_MAPPING(F2FS_I_SB(inode)),
-						block_nr);
-				if (cpage) {
-					f2fs_wait_on_page_writeback(cpage,
-									DATA);
-					f2fs_put_page(cpage, 1);
-				}
+				f2fs_wait_on_encrypted_page_writeback(
+						F2FS_I_SB(inode), block_nr);
 			}
 
 			bio = bio_alloc(GFP_KERNEL,
@@ -1064,6 +1057,11 @@ int do_write_data_page(struct f2fs_io_info *fio)
 	}
 
 	if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
+
+		/* wait for GCed encrypted page writeback */
+		f2fs_wait_on_encrypted_page_writeback(F2FS_I_SB(inode),
+							fio->blk_addr);
+
 		fio->encrypted_page = f2fs_encrypt(inode, fio->page);
 		if (IS_ERR(fio->encrypted_page)) {
 			err = PTR_ERR(fio->encrypted_page);
@@ -1452,6 +1450,10 @@ put_next:
 
 	f2fs_wait_on_page_writeback(page, DATA);
 
+	/* wait for GCed encrypted page writeback */
+	if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
+		f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr);
+
 	if (len == PAGE_CACHE_SIZE)
 		goto out_update;
 	if (PageUptodate(page))
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 94cf6bc..c3443da 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1792,6 +1792,7 @@ void f2fs_replace_block(struct f2fs_sb_info *, struct dnode_of_data *,
 void allocate_data_block(struct f2fs_sb_info *, struct page *,
 		block_t, block_t *, struct f2fs_summary *, int);
 void f2fs_wait_on_page_writeback(struct page *, enum page_type);
+void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *, block_t);
 void write_data_summaries(struct f2fs_sb_info *, block_t);
 void write_node_summaries(struct f2fs_sb_info *, block_t);
 int lookup_journal_in_cursum(struct f2fs_summary_block *,
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 5d2a2ee..91c51a6 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -87,6 +87,11 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
 mapped:
 	/* fill the page */
 	f2fs_wait_on_page_writeback(page, DATA);
+
+	/* wait for GCed encrypted page writeback */
+	if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
+		f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr);
+
 	/* if gced page is attached, don't write to cold segment */
 	clear_cold_data(page);
 out:
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 3f19634..af7c24c 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -559,8 +559,16 @@ static void move_encrypted_block(struct inode *inode, block_t bidx)
 	if (err)
 		goto out;
 
-	if (unlikely(dn.data_blkaddr == NULL_ADDR))
+	if (unlikely(dn.data_blkaddr == NULL_ADDR)) {
+		ClearPageUptodate(page);
 		goto put_out;
+	}
+
+	/*
+	 * don't cache encrypted data into meta inode until previous dirty
+	 * data were writebacked to avoid racing between GC and flush.
+	 */
+	f2fs_wait_on_page_writeback(page, DATA);
 
 	get_node_info(fio.sbi, dn.nid, &ni);
 	set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
@@ -589,7 +597,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx)
 		goto put_page_out;
 
 	set_page_dirty(fio.encrypted_page);
-	f2fs_wait_on_page_writeback(fio.encrypted_page, META);
+	f2fs_wait_on_page_writeback(fio.encrypted_page, DATA);
 	if (clear_page_dirty_for_io(fio.encrypted_page))
 		dec_page_count(fio.sbi, F2FS_DIRTY_META);
 
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 5337fd6..f37c212 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -1484,6 +1484,23 @@ void f2fs_wait_on_page_writeback(struct page *page,
 	}
 }
 
+void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi,
+							block_t blkaddr)
+{
+	struct page *cpage;
+
+	if (blkaddr == NEW_ADDR)
+		return;
+
+	f2fs_bug_on(sbi, blkaddr == NULL_ADDR);
+
+	cpage = find_lock_page(META_MAPPING(sbi), blkaddr);
+	if (cpage) {
+		f2fs_wait_on_page_writeback(cpage, DATA);
+		f2fs_put_page(cpage, 1);
+	}
+}
+
 static int read_compacted_summaries(struct f2fs_sb_info *sbi)
 {
 	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
-- 
cgit v1.1


From 84e4214f0868ae77771837d0ed4cc6eff10738ba Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Tue, 13 Oct 2015 10:00:53 -0700
Subject: f2fs: relocate the tracepoint for background_gc

Once f2fs_gc is done, wait_ms is changed once more.
So, its tracepoint would be located after it.

Reported-by: He YunLei <heyunlei@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/gc.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index af7c24c..fedbf67 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -77,13 +77,13 @@ static int gc_thread_func(void *data)
 
 		stat_inc_bggc_count(sbi);
 
-		trace_f2fs_background_gc(sbi->sb, wait_ms,
-				prefree_segments(sbi), free_segments(sbi));
-
 		/* if return value is not zero, no victim was selected */
 		if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC)))
 			wait_ms = gc_th->no_gc_sleep_time;
 
+		trace_f2fs_background_gc(sbi->sb, wait_ms,
+				prefree_segments(sbi), free_segments(sbi));
+
 		/* balancing f2fs's metadata periodically */
 		f2fs_balance_fs_bg(sbi);
 
-- 
cgit v1.1


From b96b1db039ebc584d03a9933b279e0d3e704c528 Mon Sep 17 00:00:00 2001
From: Robin Ruede <r.ruede@gmail.com>
Date: Wed, 30 Sep 2015 21:23:33 +0200
Subject: btrfs: fix resending received snapshot with parent

This fixes a regression introduced by 37b8d27d between v4.1 and v4.2.

When a snapshot is received, its received_uuid is set to the original
uuid of the subvolume. When that snapshot is then resent to a third
filesystem, it's received_uuid is set to the second uuid
instead of the original one. The same was true for the parent_uuid.
This behaviour was partially changed in 37b8d27d, but in that patch
only the parent_uuid was taken from the real original,
not the uuid itself, causing the search for the parent to fail in
the case below.

This happens for example when trying to send a series of linked
snapshots (e.g. created by snapper) from the backup file system back
to the original one.

The following commands reproduce the issue in v4.2.1
(no error in 4.1.6)

    # setup three test file systems
    for i in 1 2 3; do
	    truncate -s 50M fs$i
	    mkfs.btrfs fs$i
	    mkdir $i
	    mount fs$i $i
    done
    echo "content" > 1/testfile
    btrfs su snapshot -r 1/ 1/snap1
    echo "changed content" > 1/testfile
    btrfs su snapshot -r 1/ 1/snap2

    # works fine:
    btrfs send 1/snap1 | btrfs receive 2/
    btrfs send -p 1/snap1 1/snap2 | btrfs receive 2/

    # ERROR: could not find parent subvolume
    btrfs send 2/snap1 | btrfs receive 3/
    btrfs send -p 2/snap1 2/snap2 | btrfs receive 3/

Signed-off-by: Robin Ruede <rruede+git@gmail.com>
Fixes: 37b8d27de5d0 ("Btrfs: use received_uuid of parent during send")
Cc: stable@vger.kernel.org # v4.2+
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Tested-by: Ed Tomlinson <edt@aei.ca>
---
 fs/btrfs/send.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index ced935b..355a458 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -2343,8 +2343,14 @@ static int send_subvol_begin(struct send_ctx *sctx)
 	}
 
 	TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen);
-	TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
-			sctx->send_root->root_item.uuid);
+
+	if (!btrfs_is_empty_uuid(sctx->send_root->root_item.received_uuid))
+		TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
+			    sctx->send_root->root_item.received_uuid);
+	else
+		TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
+			    sctx->send_root->root_item.uuid);
+
 	TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID,
 		    le64_to_cpu(sctx->send_root->root_item.ctransid));
 	if (parent_root) {
-- 
cgit v1.1


From 8eb934591f8bf584969454a658f629cd06e59f3a Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Mon, 12 Oct 2015 16:55:54 +0200
Subject: btrfs: check unsupported filters in balance arguments

We don't verify that all the balance filter arguments supplemented by
the flags are actually known to the kernel. Thus we let it silently pass
and do nothing.

At the moment this means only the 'limit' filter, but we're going to add
a few more soon so it's better to have that fixed. Also in older stable
kernels so that it works with newer userspace tools.

Cc: stable@vger.kernel.org # 3.16+
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ioctl.c   | 5 +++++
 fs/btrfs/volumes.h | 8 ++++++++
 2 files changed, 13 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 0adf542..3e3e613 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4639,6 +4639,11 @@ locked:
 		bctl->flags |= BTRFS_BALANCE_TYPE_MASK;
 	}
 
+	if (bctl->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) {
+		ret = -EINVAL;
+		goto out_bargs;
+	}
+
 do_balance:
 	/*
 	 * Ownership of bctl and mutually_exclusive_operation_running
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 2ca784a..595279a 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -376,6 +376,14 @@ struct map_lookup {
 #define BTRFS_BALANCE_ARGS_VRANGE	(1ULL << 4)
 #define BTRFS_BALANCE_ARGS_LIMIT	(1ULL << 5)
 
+#define BTRFS_BALANCE_ARGS_MASK			\
+	(BTRFS_BALANCE_ARGS_PROFILES |		\
+	 BTRFS_BALANCE_ARGS_USAGE |		\
+	 BTRFS_BALANCE_ARGS_DEVID | 		\
+	 BTRFS_BALANCE_ARGS_DRANGE |		\
+	 BTRFS_BALANCE_ARGS_VRANGE |		\
+	 BTRFS_BALANCE_ARGS_LIMIT)
+
 /*
  * Profile changing flags.  When SOFT is set we won't relocate chunk if
  * it already has the target profile (even though it may be
-- 
cgit v1.1


From dc6c5fb3b514221f2e9d21ee626a9d95d3418dff Mon Sep 17 00:00:00 2001
From: Chris Mason <clm@fb.com>
Date: Tue, 13 Oct 2015 14:06:48 -0400
Subject: btrfs: fix use after free iterating extrefs

The code for btrfs inode-resolve has never worked properly for
files with enough hard links to trigger extrefs.  It was trying to
get the leaf out of a path after freeing the path:

	btrfs_release_path(path);
	leaf = path->nodes[0];
	item_size = btrfs_item_size_nr(leaf, slot);

The fix here is to use the extent buffer we cloned just a little higher
up to avoid deadlocks caused by using the leaf in the path.

Signed-off-by: Chris Mason <clm@fb.com>
cc: stable@vger.kernel.org # v3.7+
cc: Mark Fasheh <mfasheh@suse.de>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Mark Fasheh <mfasheh@suse.de>
---
 fs/btrfs/backref.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index ecbc63d..9a2ec79 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1828,7 +1828,6 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
 	int found = 0;
 	struct extent_buffer *eb;
 	struct btrfs_inode_extref *extref;
-	struct extent_buffer *leaf;
 	u32 item_size;
 	u32 cur_offset;
 	unsigned long ptr;
@@ -1856,9 +1855,8 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
 		btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
 		btrfs_release_path(path);
 
-		leaf = path->nodes[0];
-		item_size = btrfs_item_size_nr(leaf, slot);
-		ptr = btrfs_item_ptr_offset(leaf, slot);
+		item_size = btrfs_item_size_nr(eb, slot);
+		ptr = btrfs_item_ptr_offset(eb, slot);
 		cur_offset = 0;
 
 		while (cur_offset < item_size) {
@@ -1872,7 +1870,7 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
 			if (ret)
 				break;
 
-			cur_offset += btrfs_inode_extref_name_len(leaf, extref);
+			cur_offset += btrfs_inode_extref_name_len(eb, extref);
 			cur_offset += sizeof(*extref);
 		}
 		btrfs_tree_read_unlock_blocking(eb);
-- 
cgit v1.1


From 8039d87d9e473aeb740d4fdbd59b9d2f89b2ced9 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Tue, 13 Oct 2015 15:15:00 +0100
Subject: Btrfs: fix file corruption and data loss after cloning inline extents

Currently the clone ioctl allows to clone an inline extent from one file
to another that already has other (non-inlined) extents. This is a problem
because btrfs is not designed to deal with files having inline and regular
extents, if a file has an inline extent then it must be the only extent
in the file and must start at file offset 0. Having a file with an inline
extent followed by regular extents results in EIO errors when doing reads
or writes against the first 4K of the file.

Also, the clone ioctl allows one to lose data if the source file consists
of a single inline extent, with a size of N bytes, and the destination
file consists of a single inline extent with a size of M bytes, where we
have M > N. In this case the clone operation removes the inline extent
from the destination file and then copies the inline extent from the
source file into the destination file - we lose the M - N bytes from the
destination file, a read operation will get the value 0x00 for any bytes
in the the range [N, M] (the destination inode's i_size remained as M,
that's why we can read past N bytes).

So fix this by not allowing such destructive operations to happen and
return errno EOPNOTSUPP to user space.

Currently the fstest btrfs/035 tests the data loss case but it totally
ignores this - i.e. expects the operation to succeed and does not check
the we got data loss.

The following test case for fstests exercises all these cases that result
in file corruption and data loss:

  seq=`basename $0`
  seqres=$RESULT_DIR/$seq
  echo "QA output created by $seq"
  tmp=/tmp/$$
  status=1	# failure is the default!
  trap "_cleanup; exit \$status" 0 1 2 3 15

  _cleanup()
  {
      rm -f $tmp.*
  }

  # get standard environment, filters and checks
  . ./common/rc
  . ./common/filter

  # real QA test starts here
  _need_to_be_root
  _supported_fs btrfs
  _supported_os Linux
  _require_scratch
  _require_cloner
  _require_btrfs_fs_feature "no_holes"
  _require_btrfs_mkfs_feature "no-holes"

  rm -f $seqres.full

  test_cloning_inline_extents()
  {
      local mkfs_opts=$1
      local mount_opts=$2

      _scratch_mkfs $mkfs_opts >>$seqres.full 2>&1
      _scratch_mount $mount_opts

      # File bar, the source for all the following clone operations, consists
      # of a single inline extent (50 bytes).
      $XFS_IO_PROG -f -c "pwrite -S 0xbb 0 50" $SCRATCH_MNT/bar \
          | _filter_xfs_io

      # Test cloning into a file with an extent (non-inlined) where the
      # destination offset overlaps that extent. It should not be possible to
      # clone the inline extent from file bar into this file.
      $XFS_IO_PROG -f -c "pwrite -S 0xaa 0K 16K" $SCRATCH_MNT/foo \
          | _filter_xfs_io
      $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo

      # Doing IO against any range in the first 4K of the file should work.
      # Due to a past clone ioctl bug which allowed cloning the inline extent,
      # these operations resulted in EIO errors.
      echo "File foo data after clone operation:"
      # All bytes should have the value 0xaa (clone operation failed and did
      # not modify our file).
      od -t x1 $SCRATCH_MNT/foo
      $XFS_IO_PROG -c "pwrite -S 0xcc 0 100" $SCRATCH_MNT/foo | _filter_xfs_io

      # Test cloning the inline extent against a file which has a hole in its
      # first 4K followed by a non-inlined extent. It should not be possible
      # as well to clone the inline extent from file bar into this file.
      $XFS_IO_PROG -f -c "pwrite -S 0xdd 4K 12K" $SCRATCH_MNT/foo2 \
          | _filter_xfs_io
      $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo2

      # Doing IO against any range in the first 4K of the file should work.
      # Due to a past clone ioctl bug which allowed cloning the inline extent,
      # these operations resulted in EIO errors.
      echo "File foo2 data after clone operation:"
      # All bytes should have the value 0x00 (clone operation failed and did
      # not modify our file).
      od -t x1 $SCRATCH_MNT/foo2
      $XFS_IO_PROG -c "pwrite -S 0xee 0 90" $SCRATCH_MNT/foo2 | _filter_xfs_io

      # Test cloning the inline extent against a file which has a size of zero
      # but has a prealloc extent. It should not be possible as well to clone
      # the inline extent from file bar into this file.
      $XFS_IO_PROG -f -c "falloc -k 0 1M" $SCRATCH_MNT/foo3 | _filter_xfs_io
      $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo3

      # Doing IO against any range in the first 4K of the file should work.
      # Due to a past clone ioctl bug which allowed cloning the inline extent,
      # these operations resulted in EIO errors.
      echo "First 50 bytes of foo3 after clone operation:"
      # Should not be able to read any bytes, file has 0 bytes i_size (the
      # clone operation failed and did not modify our file).
      od -t x1 $SCRATCH_MNT/foo3
      $XFS_IO_PROG -c "pwrite -S 0xff 0 90" $SCRATCH_MNT/foo3 | _filter_xfs_io

      # Test cloning the inline extent against a file which consists of a
      # single inline extent that has a size not greater than the size of
      # bar's inline extent (40 < 50).
      # It should be possible to do the extent cloning from bar to this file.
      $XFS_IO_PROG -f -c "pwrite -S 0x01 0 40" $SCRATCH_MNT/foo4 \
          | _filter_xfs_io
      $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo4

      # Doing IO against any range in the first 4K of the file should work.
      echo "File foo4 data after clone operation:"
      # Must match file bar's content.
      od -t x1 $SCRATCH_MNT/foo4
      $XFS_IO_PROG -c "pwrite -S 0x02 0 90" $SCRATCH_MNT/foo4 | _filter_xfs_io

      # Test cloning the inline extent against a file which consists of a
      # single inline extent that has a size greater than the size of bar's
      # inline extent (60 > 50).
      # It should not be possible to clone the inline extent from file bar
      # into this file.
      $XFS_IO_PROG -f -c "pwrite -S 0x03 0 60" $SCRATCH_MNT/foo5 \
          | _filter_xfs_io
      $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo5

      # Reading the file should not fail.
      echo "File foo5 data after clone operation:"
      # Must have a size of 60 bytes, with all bytes having a value of 0x03
      # (the clone operation failed and did not modify our file).
      od -t x1 $SCRATCH_MNT/foo5

      # Test cloning the inline extent against a file which has no extents but
      # has a size greater than bar's inline extent (16K > 50).
      # It should not be possible to clone the inline extent from file bar
      # into this file.
      $XFS_IO_PROG -f -c "truncate 16K" $SCRATCH_MNT/foo6 | _filter_xfs_io
      $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo6

      # Reading the file should not fail.
      echo "File foo6 data after clone operation:"
      # Must have a size of 16K, with all bytes having a value of 0x00 (the
      # clone operation failed and did not modify our file).
      od -t x1 $SCRATCH_MNT/foo6

      # Test cloning the inline extent against a file which has no extents but
      # has a size not greater than bar's inline extent (30 < 50).
      # It should be possible to clone the inline extent from file bar into
      # this file.
      $XFS_IO_PROG -f -c "truncate 30" $SCRATCH_MNT/foo7 | _filter_xfs_io
      $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo7

      # Reading the file should not fail.
      echo "File foo7 data after clone operation:"
      # Must have a size of 50 bytes, with all bytes having a value of 0xbb.
      od -t x1 $SCRATCH_MNT/foo7

      # Test cloning the inline extent against a file which has a size not
      # greater than the size of bar's inline extent (20 < 50) but has
      # a prealloc extent that goes beyond the file's size. It should not be
      # possible to clone the inline extent from bar into this file.
      $XFS_IO_PROG -f -c "falloc -k 0 1M" \
                      -c "pwrite -S 0x88 0 20" \
                      $SCRATCH_MNT/foo8 | _filter_xfs_io
      $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo8

      echo "File foo8 data after clone operation:"
      # Must have a size of 20 bytes, with all bytes having a value of 0x88
      # (the clone operation did not modify our file).
      od -t x1 $SCRATCH_MNT/foo8

      _scratch_unmount
  }

  echo -e "\nTesting without compression and without the no-holes feature...\n"
  test_cloning_inline_extents

  echo -e "\nTesting with compression and without the no-holes feature...\n"
  test_cloning_inline_extents "" "-o compress"

  echo -e "\nTesting without compression and with the no-holes feature...\n"
  test_cloning_inline_extents "-O no-holes" ""

  echo -e "\nTesting with compression and with the no-holes feature...\n"
  test_cloning_inline_extents "-O no-holes" "-o compress"

  status=0
  exit

Cc: stable@vger.kernel.org
Signed-off-by: Filipe Manana <fdmanana@suse.com>
---
 fs/btrfs/ioctl.c | 195 +++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 152 insertions(+), 43 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 80342d3..55a735a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3328,6 +3328,150 @@ static void clone_update_extent_map(struct inode *inode,
 			&BTRFS_I(inode)->runtime_flags);
 }
 
+/*
+ * Make sure we do not end up inserting an inline extent into a file that has
+ * already other (non-inline) extents. If a file has an inline extent it can
+ * not have any other extents and the (single) inline extent must start at the
+ * file offset 0. Failing to respect these rules will lead to file corruption,
+ * resulting in EIO errors on read/write operations, hitting BUG_ON's in mm, etc
+ *
+ * We can have extents that have been already written to disk or we can have
+ * dirty ranges still in delalloc, in which case the extent maps and items are
+ * created only when we run delalloc, and the delalloc ranges might fall outside
+ * the range we are currently locking in the inode's io tree. So we check the
+ * inode's i_size because of that (i_size updates are done while holding the
+ * i_mutex, which we are holding here).
+ * We also check to see if the inode has a size not greater than "datal" but has
+ * extents beyond it, due to an fallocate with FALLOC_FL_KEEP_SIZE (and we are
+ * protected against such concurrent fallocate calls by the i_mutex).
+ *
+ * If the file has no extents but a size greater than datal, do not allow the
+ * copy because we would need turn the inline extent into a non-inline one (even
+ * with NO_HOLES enabled). If we find our destination inode only has one inline
+ * extent, just overwrite it with the source inline extent if its size is less
+ * than the source extent's size, or we could copy the source inline extent's
+ * data into the destination inode's inline extent if the later is greater then
+ * the former.
+ */
+static int clone_copy_inline_extent(struct inode *src,
+				    struct inode *dst,
+				    struct btrfs_trans_handle *trans,
+				    struct btrfs_path *path,
+				    struct btrfs_key *new_key,
+				    const u64 drop_start,
+				    const u64 datal,
+				    const u64 skip,
+				    const u64 size,
+				    char *inline_data)
+{
+	struct btrfs_root *root = BTRFS_I(dst)->root;
+	const u64 aligned_end = ALIGN(new_key->offset + datal,
+				      root->sectorsize);
+	int ret;
+	struct btrfs_key key;
+
+	if (new_key->offset > 0)
+		return -EOPNOTSUPP;
+
+	key.objectid = btrfs_ino(dst);
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = 0;
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0) {
+		return ret;
+	} else if (ret > 0) {
+		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0)
+				return ret;
+			else if (ret > 0)
+				goto copy_inline_extent;
+		}
+		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+		if (key.objectid == btrfs_ino(dst) &&
+		    key.type == BTRFS_EXTENT_DATA_KEY) {
+			ASSERT(key.offset > 0);
+			return -EOPNOTSUPP;
+		}
+	} else if (i_size_read(dst) <= datal) {
+		struct btrfs_file_extent_item *ei;
+		u64 ext_len;
+
+		/*
+		 * If the file size is <= datal, make sure there are no other
+		 * extents following (can happen do to an fallocate call with
+		 * the flag FALLOC_FL_KEEP_SIZE).
+		 */
+		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+				    struct btrfs_file_extent_item);
+		/*
+		 * If it's an inline extent, it can not have other extents
+		 * following it.
+		 */
+		if (btrfs_file_extent_type(path->nodes[0], ei) ==
+		    BTRFS_FILE_EXTENT_INLINE)
+			goto copy_inline_extent;
+
+		ext_len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
+		if (ext_len > aligned_end)
+			return -EOPNOTSUPP;
+
+		ret = btrfs_next_item(root, path);
+		if (ret < 0) {
+			return ret;
+		} else if (ret == 0) {
+			btrfs_item_key_to_cpu(path->nodes[0], &key,
+					      path->slots[0]);
+			if (key.objectid == btrfs_ino(dst) &&
+			    key.type == BTRFS_EXTENT_DATA_KEY)
+				return -EOPNOTSUPP;
+		}
+	}
+
+copy_inline_extent:
+	/*
+	 * We have no extent items, or we have an extent at offset 0 which may
+	 * or may not be inlined. All these cases are dealt the same way.
+	 */
+	if (i_size_read(dst) > datal) {
+		/*
+		 * If the destination inode has an inline extent...
+		 * This would require copying the data from the source inline
+		 * extent into the beginning of the destination's inline extent.
+		 * But this is really complex, both extents can be compressed
+		 * or just one of them, which would require decompressing and
+		 * re-compressing data (which could increase the new compressed
+		 * size, not allowing the compressed data to fit anymore in an
+		 * inline extent).
+		 * So just don't support this case for now (it should be rare,
+		 * we are not really saving space when cloning inline extents).
+		 */
+		return -EOPNOTSUPP;
+	}
+
+	btrfs_release_path(path);
+	ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1);
+	if (ret)
+		return ret;
+	ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
+	if (ret)
+		return ret;
+
+	if (skip) {
+		const u32 start = btrfs_file_extent_calc_inline_size(0);
+
+		memmove(inline_data + start, inline_data + start + skip, datal);
+	}
+
+	write_extent_buffer(path->nodes[0], inline_data,
+			    btrfs_item_ptr_offset(path->nodes[0],
+						  path->slots[0]),
+			    size);
+	inode_add_bytes(dst, datal);
+
+	return 0;
+}
+
 /**
  * btrfs_clone() - clone a range from inode file to another
  *
@@ -3594,21 +3738,6 @@ process_slot:
 			} else if (type == BTRFS_FILE_EXTENT_INLINE) {
 				u64 skip = 0;
 				u64 trim = 0;
-				u64 aligned_end = 0;
-
-				/*
-				 * Don't copy an inline extent into an offset
-				 * greater than zero. Having an inline extent
-				 * at such an offset results in chaos as btrfs
-				 * isn't prepared for such cases. Just skip
-				 * this case for the same reasons as commented
-				 * at btrfs_ioctl_clone().
-				 */
-				if (last_dest_end > 0) {
-					ret = -EOPNOTSUPP;
-					btrfs_end_transaction(trans, root);
-					goto out;
-				}
 
 				if (off > key.offset) {
 					skip = off - key.offset;
@@ -3626,42 +3755,22 @@ process_slot:
 				size -= skip + trim;
 				datal -= skip + trim;
 
-				aligned_end = ALIGN(new_key.offset + datal,
-						    root->sectorsize);
-				ret = btrfs_drop_extents(trans, root, inode,
-							 drop_start,
-							 aligned_end,
-							 1);
+				ret = clone_copy_inline_extent(src, inode,
+							       trans, path,
+							       &new_key,
+							       drop_start,
+							       datal,
+							       skip, size, buf);
 				if (ret) {
 					if (ret != -EOPNOTSUPP)
 						btrfs_abort_transaction(trans,
-							root, ret);
-					btrfs_end_transaction(trans, root);
-					goto out;
-				}
-
-				ret = btrfs_insert_empty_item(trans, root, path,
-							      &new_key, size);
-				if (ret) {
-					btrfs_abort_transaction(trans, root,
-								ret);
+									root,
+									ret);
 					btrfs_end_transaction(trans, root);
 					goto out;
 				}
-
-				if (skip) {
-					u32 start =
-					  btrfs_file_extent_calc_inline_size(0);
-					memmove(buf+start, buf+start+skip,
-						datal);
-				}
-
 				leaf = path->nodes[0];
 				slot = path->slots[0];
-				write_extent_buffer(leaf, buf,
-					    btrfs_item_ptr_offset(leaf, slot),
-					    size);
-				inode_add_bytes(inode, datal);
 			}
 
 			/* If we have an implicit hole (NO_HOLES feature). */
-- 
cgit v1.1


From 5e6ecb362bd5950a3d8ce19c32829e4f8c7917d9 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Tue, 13 Oct 2015 16:36:09 +0100
Subject: Btrfs: fix double range unlock of hole region when reading page

If when reading a page we find a hole and our caller had already locked
the range (bio flags has the bit EXTENT_BIO_PARENT_LOCKED set), we end
up unlocking the hole's range and then later our caller unlocks it
again, which might have already been locked by some other task once
the first unlock happened.

Currently this can only happen during a call to the extent_same ioctl,
as it's the only caller of __do_readpage() that sets the bit
EXTENT_BIO_PARENT_LOCKED for bio flags.

Fix this by leaving the unlock exclusively to the caller.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
---
 fs/btrfs/extent_io.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index ecb1204..6e6df34 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3070,8 +3070,12 @@ static int __do_readpage(struct extent_io_tree *tree,
 
 			set_extent_uptodate(tree, cur, cur + iosize - 1,
 					    &cached, GFP_NOFS);
-			unlock_extent_cached(tree, cur, cur + iosize - 1,
-			                     &cached, GFP_NOFS);
+			if (parent_locked)
+				free_extent_state(cached);
+			else
+				unlock_extent_cached(tree, cur,
+						     cur + iosize - 1,
+						     &cached, GFP_NOFS);
 			cur = cur + iosize;
 			pg_offset += iosize;
 			continue;
-- 
cgit v1.1


From 870823e629ea194e6cf8e82a9694ac62cad49512 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 3 Oct 2015 15:32:37 +0200
Subject: configfs: add show and store methods to struct configfs_attribute

Add methods to struct configfs_attribute to directly show and store
attributes without adding boilerplate code to every user.  In addition
to the methods this also adds 3 helper macros to define read/write,
read-only and write-only attributes with a single line of code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Nicholas Bellinger <nab@linux-iscsi.org>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 fs/configfs/file.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index 403269f..106ca58 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -74,7 +74,11 @@ static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buf
 	if (!buffer->page)
 		return -ENOMEM;
 
-	count = ops->show_attribute(item,attr,buffer->page);
+	if (ops->show_attribute)
+		count = ops->show_attribute(item, attr, buffer->page);
+	else
+		count = attr->show(item, buffer->page);
+
 	buffer->needs_read_fill = 0;
 	BUG_ON(count > (ssize_t)SIMPLE_ATTR_SIZE);
 	if (count >= 0)
@@ -173,7 +177,9 @@ flush_write_buffer(struct dentry * dentry, struct configfs_buffer * buffer, size
 	struct config_item * item = to_item(dentry->d_parent);
 	struct configfs_item_operations * ops = buffer->ops;
 
-	return ops->store_attribute(item,attr,buffer->page,count);
+	if (ops->store_attribute)
+		return ops->store_attribute(item, attr, buffer->page, count);
+	return attr->store(item, buffer->page, count);
 }
 
 
@@ -237,8 +243,8 @@ static int check_perm(struct inode * inode, struct file * file)
 	 * and we must have a store method.
 	 */
 	if (file->f_mode & FMODE_WRITE) {
-
-		if (!(inode->i_mode & S_IWUGO) || !ops->store_attribute)
+		if (!(inode->i_mode & S_IWUGO) ||
+		    (!ops->store_attribute && !attr->store))
 			goto Eaccess;
 
 	}
@@ -248,7 +254,8 @@ static int check_perm(struct inode * inode, struct file * file)
 	 * must be a show method for it.
 	 */
 	if (file->f_mode & FMODE_READ) {
-		if (!(inode->i_mode & S_IRUGO) || !ops->show_attribute)
+		if (!(inode->i_mode & S_IRUGO) ||
+		    (!ops->show_attribute && !attr->show))
 			goto Eaccess;
 	}
 
-- 
cgit v1.1


From 9ae0f367df5d0d7be09fad1e2e5b080f6a45ca6b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 3 Oct 2015 15:32:53 +0200
Subject: dlm: use per-attribute show and store methods

To simplify the configfs interface and remove boilerplate code that also
causes binary bloat.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: David Teigland <teigland@redhat.com
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 fs/dlm/config.c | 288 +++++++++++++++-----------------------------------------
 1 file changed, 74 insertions(+), 214 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index d521bdd..8e294fb 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -61,35 +61,8 @@ static struct config_item *make_node(struct config_group *, const char *);
 static void drop_node(struct config_group *, struct config_item *);
 static void release_node(struct config_item *);
 
-static ssize_t show_cluster(struct config_item *i, struct configfs_attribute *a,
-			    char *buf);
-static ssize_t store_cluster(struct config_item *i,
-			     struct configfs_attribute *a,
-			     const char *buf, size_t len);
-static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
-			 char *buf);
-static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
-			  const char *buf, size_t len);
-static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
-			 char *buf);
-static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
-			  const char *buf, size_t len);
-
-static ssize_t comm_nodeid_read(struct dlm_comm *cm, char *buf);
-static ssize_t comm_nodeid_write(struct dlm_comm *cm, const char *buf,
-				size_t len);
-static ssize_t comm_local_read(struct dlm_comm *cm, char *buf);
-static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf,
-				size_t len);
-static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf,
-				size_t len);
-static ssize_t comm_addr_list_read(struct dlm_comm *cm, char *buf);
-static ssize_t node_nodeid_read(struct dlm_node *nd, char *buf);
-static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf,
-				size_t len);
-static ssize_t node_weight_read(struct dlm_node *nd, char *buf);
-static ssize_t node_weight_write(struct dlm_node *nd, const char *buf,
-				size_t len);
+static struct configfs_attribute *comm_attrs[];
+static struct configfs_attribute *node_attrs[];
 
 struct dlm_cluster {
 	struct config_group group;
@@ -108,6 +81,12 @@ struct dlm_cluster {
 	char cl_cluster_name[DLM_LOCKSPACE_LEN];
 };
 
+static struct dlm_cluster *config_item_to_cluster(struct config_item *i)
+{
+	return i ? container_of(to_config_group(i), struct dlm_cluster, group) :
+		   NULL;
+}
+
 enum {
 	CLUSTER_ATTR_TCP_PORT = 0,
 	CLUSTER_ATTR_BUFFER_SIZE,
@@ -124,33 +103,24 @@ enum {
 	CLUSTER_ATTR_CLUSTER_NAME,
 };
 
-struct cluster_attribute {
-	struct configfs_attribute attr;
-	ssize_t (*show)(struct dlm_cluster *, char *);
-	ssize_t (*store)(struct dlm_cluster *, const char *, size_t);
-};
-
-static ssize_t cluster_cluster_name_read(struct dlm_cluster *cl, char *buf)
+static ssize_t cluster_cluster_name_show(struct config_item *item, char *buf)
 {
+	struct dlm_cluster *cl = config_item_to_cluster(item);
 	return sprintf(buf, "%s\n", cl->cl_cluster_name);
 }
 
-static ssize_t cluster_cluster_name_write(struct dlm_cluster *cl,
+static ssize_t cluster_cluster_name_store(struct config_item *item,
 					  const char *buf, size_t len)
 {
+	struct dlm_cluster *cl = config_item_to_cluster(item);
+
 	strlcpy(dlm_config.ci_cluster_name, buf,
 				sizeof(dlm_config.ci_cluster_name));
 	strlcpy(cl->cl_cluster_name, buf, sizeof(cl->cl_cluster_name));
 	return len;
 }
 
-static struct cluster_attribute cluster_attr_cluster_name = {
-	.attr   = { .ca_owner = THIS_MODULE,
-                    .ca_name = "cluster_name",
-                    .ca_mode = S_IRUGO | S_IWUSR },
-	.show   = cluster_cluster_name_read,
-	.store  = cluster_cluster_name_write,
-};
+CONFIGFS_ATTR(cluster_, cluster_name);
 
 static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field,
 			   int *info_field, int check_zero,
@@ -175,17 +145,19 @@ static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field,
 }
 
 #define CLUSTER_ATTR(name, check_zero)                                        \
-static ssize_t name##_write(struct dlm_cluster *cl, const char *buf, size_t len) \
+static ssize_t cluster_##name##_store(struct config_item *item, \
+		const char *buf, size_t len) \
 {                                                                             \
+	struct dlm_cluster *cl = config_item_to_cluster(item);		      \
 	return cluster_set(cl, &cl->cl_##name, &dlm_config.ci_##name,         \
 			   check_zero, buf, len);                             \
 }                                                                             \
-static ssize_t name##_read(struct dlm_cluster *cl, char *buf)                 \
+static ssize_t cluster_##name##_show(struct config_item *item, char *buf)     \
 {                                                                             \
+	struct dlm_cluster *cl = config_item_to_cluster(item);		      \
 	return snprintf(buf, PAGE_SIZE, "%u\n", cl->cl_##name);               \
 }                                                                             \
-static struct cluster_attribute cluster_attr_##name =                         \
-__CONFIGFS_ATTR(name, 0644, name##_read, name##_write)
+CONFIGFS_ATTR(cluster_, name);
 
 CLUSTER_ATTR(tcp_port, 1);
 CLUSTER_ATTR(buffer_size, 1);
@@ -201,19 +173,19 @@ CLUSTER_ATTR(new_rsb_count, 0);
 CLUSTER_ATTR(recover_callbacks, 0);
 
 static struct configfs_attribute *cluster_attrs[] = {
-	[CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr,
-	[CLUSTER_ATTR_BUFFER_SIZE] = &cluster_attr_buffer_size.attr,
-	[CLUSTER_ATTR_RSBTBL_SIZE] = &cluster_attr_rsbtbl_size.attr,
-	[CLUSTER_ATTR_RECOVER_TIMER] = &cluster_attr_recover_timer.attr,
-	[CLUSTER_ATTR_TOSS_SECS] = &cluster_attr_toss_secs.attr,
-	[CLUSTER_ATTR_SCAN_SECS] = &cluster_attr_scan_secs.attr,
-	[CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug.attr,
-	[CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol.attr,
-	[CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs.attr,
-	[CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us.attr,
-	[CLUSTER_ATTR_NEW_RSB_COUNT] = &cluster_attr_new_rsb_count.attr,
-	[CLUSTER_ATTR_RECOVER_CALLBACKS] = &cluster_attr_recover_callbacks.attr,
-	[CLUSTER_ATTR_CLUSTER_NAME] = &cluster_attr_cluster_name.attr,
+	[CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port,
+	[CLUSTER_ATTR_BUFFER_SIZE] = &cluster_attr_buffer_size,
+	[CLUSTER_ATTR_RSBTBL_SIZE] = &cluster_attr_rsbtbl_size,
+	[CLUSTER_ATTR_RECOVER_TIMER] = &cluster_attr_recover_timer,
+	[CLUSTER_ATTR_TOSS_SECS] = &cluster_attr_toss_secs,
+	[CLUSTER_ATTR_SCAN_SECS] = &cluster_attr_scan_secs,
+	[CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug,
+	[CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol,
+	[CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs,
+	[CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us,
+	[CLUSTER_ATTR_NEW_RSB_COUNT] = &cluster_attr_new_rsb_count,
+	[CLUSTER_ATTR_RECOVER_CALLBACKS] = &cluster_attr_recover_callbacks,
+	[CLUSTER_ATTR_CLUSTER_NAME] = &cluster_attr_cluster_name,
 	NULL,
 };
 
@@ -224,83 +196,11 @@ enum {
 	COMM_ATTR_ADDR_LIST,
 };
 
-struct comm_attribute {
-	struct configfs_attribute attr;
-	ssize_t (*show)(struct dlm_comm *, char *);
-	ssize_t (*store)(struct dlm_comm *, const char *, size_t);
-};
-
-static struct comm_attribute comm_attr_nodeid = {
-	.attr   = { .ca_owner = THIS_MODULE,
-                    .ca_name = "nodeid",
-                    .ca_mode = S_IRUGO | S_IWUSR },
-	.show   = comm_nodeid_read,
-	.store  = comm_nodeid_write,
-};
-
-static struct comm_attribute comm_attr_local = {
-	.attr   = { .ca_owner = THIS_MODULE,
-                    .ca_name = "local",
-                    .ca_mode = S_IRUGO | S_IWUSR },
-	.show   = comm_local_read,
-	.store  = comm_local_write,
-};
-
-static struct comm_attribute comm_attr_addr = {
-	.attr   = { .ca_owner = THIS_MODULE,
-                    .ca_name = "addr",
-                    .ca_mode = S_IWUSR },
-	.store  = comm_addr_write,
-};
-
-static struct comm_attribute comm_attr_addr_list = {
-	.attr   = { .ca_owner = THIS_MODULE,
-                    .ca_name = "addr_list",
-                    .ca_mode = S_IRUGO },
-	.show   = comm_addr_list_read,
-};
-
-static struct configfs_attribute *comm_attrs[] = {
-	[COMM_ATTR_NODEID] = &comm_attr_nodeid.attr,
-	[COMM_ATTR_LOCAL] = &comm_attr_local.attr,
-	[COMM_ATTR_ADDR] = &comm_attr_addr.attr,
-	[COMM_ATTR_ADDR_LIST] = &comm_attr_addr_list.attr,
-	NULL,
-};
-
 enum {
 	NODE_ATTR_NODEID = 0,
 	NODE_ATTR_WEIGHT,
 };
 
-struct node_attribute {
-	struct configfs_attribute attr;
-	ssize_t (*show)(struct dlm_node *, char *);
-	ssize_t (*store)(struct dlm_node *, const char *, size_t);
-};
-
-static struct node_attribute node_attr_nodeid = {
-	.attr   = { .ca_owner = THIS_MODULE,
-                    .ca_name = "nodeid",
-                    .ca_mode = S_IRUGO | S_IWUSR },
-	.show   = node_nodeid_read,
-	.store  = node_nodeid_write,
-};
-
-static struct node_attribute node_attr_weight = {
-	.attr   = { .ca_owner = THIS_MODULE,
-                    .ca_name = "weight",
-                    .ca_mode = S_IRUGO | S_IWUSR },
-	.show   = node_weight_read,
-	.store  = node_weight_write,
-};
-
-static struct configfs_attribute *node_attrs[] = {
-	[NODE_ATTR_NODEID] = &node_attr_nodeid.attr,
-	[NODE_ATTR_WEIGHT] = &node_attr_weight.attr,
-	NULL,
-};
-
 struct dlm_clusters {
 	struct configfs_subsystem subsys;
 };
@@ -349,8 +249,6 @@ static struct configfs_group_operations clusters_ops = {
 
 static struct configfs_item_operations cluster_ops = {
 	.release = release_cluster,
-	.show_attribute = show_cluster,
-	.store_attribute = store_cluster,
 };
 
 static struct configfs_group_operations spaces_ops = {
@@ -369,8 +267,6 @@ static struct configfs_group_operations comms_ops = {
 
 static struct configfs_item_operations comm_ops = {
 	.release = release_comm,
-	.show_attribute = show_comm,
-	.store_attribute = store_comm,
 };
 
 static struct configfs_group_operations nodes_ops = {
@@ -380,8 +276,6 @@ static struct configfs_group_operations nodes_ops = {
 
 static struct configfs_item_operations node_ops = {
 	.release = release_node,
-	.show_attribute = show_node,
-	.store_attribute = store_node,
 };
 
 static struct config_item_type clusters_type = {
@@ -427,12 +321,6 @@ static struct config_item_type node_type = {
 	.ct_owner = THIS_MODULE,
 };
 
-static struct dlm_cluster *config_item_to_cluster(struct config_item *i)
-{
-	return i ? container_of(to_config_group(i), struct dlm_cluster, group) :
-		   NULL;
-}
-
 static struct dlm_space *config_item_to_space(struct config_item *i)
 {
 	return i ? container_of(to_config_group(i), struct dlm_space, group) :
@@ -687,66 +575,30 @@ void dlm_config_exit(void)
  * Functions for user space to read/write attributes
  */
 
-static ssize_t show_cluster(struct config_item *i, struct configfs_attribute *a,
-			    char *buf)
-{
-	struct dlm_cluster *cl = config_item_to_cluster(i);
-	struct cluster_attribute *cla =
-			container_of(a, struct cluster_attribute, attr);
-	return cla->show ? cla->show(cl, buf) : 0;
-}
-
-static ssize_t store_cluster(struct config_item *i,
-			     struct configfs_attribute *a,
-			     const char *buf, size_t len)
+static ssize_t comm_nodeid_show(struct config_item *item, char *buf)
 {
-	struct dlm_cluster *cl = config_item_to_cluster(i);
-	struct cluster_attribute *cla =
-		container_of(a, struct cluster_attribute, attr);
-	return cla->store ? cla->store(cl, buf, len) : -EINVAL;
-}
-
-static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
-			 char *buf)
-{
-	struct dlm_comm *cm = config_item_to_comm(i);
-	struct comm_attribute *cma =
-			container_of(a, struct comm_attribute, attr);
-	return cma->show ? cma->show(cm, buf) : 0;
-}
-
-static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
-			  const char *buf, size_t len)
-{
-	struct dlm_comm *cm = config_item_to_comm(i);
-	struct comm_attribute *cma =
-		container_of(a, struct comm_attribute, attr);
-	return cma->store ? cma->store(cm, buf, len) : -EINVAL;
+	return sprintf(buf, "%d\n", config_item_to_comm(item)->nodeid);
 }
 
-static ssize_t comm_nodeid_read(struct dlm_comm *cm, char *buf)
-{
-	return sprintf(buf, "%d\n", cm->nodeid);
-}
-
-static ssize_t comm_nodeid_write(struct dlm_comm *cm, const char *buf,
+static ssize_t comm_nodeid_store(struct config_item *item, const char *buf,
 				 size_t len)
 {
-	int rc = kstrtoint(buf, 0, &cm->nodeid);
+	int rc = kstrtoint(buf, 0, &config_item_to_comm(item)->nodeid);
 
 	if (rc)
 		return rc;
 	return len;
 }
 
-static ssize_t comm_local_read(struct dlm_comm *cm, char *buf)
+static ssize_t comm_local_show(struct config_item *item, char *buf)
 {
-	return sprintf(buf, "%d\n", cm->local);
+	return sprintf(buf, "%d\n", config_item_to_comm(item)->local);
 }
 
-static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf,
+static ssize_t comm_local_store(struct config_item *item, const char *buf,
 				size_t len)
 {
+	struct dlm_comm *cm = config_item_to_comm(item);
 	int rc = kstrtoint(buf, 0, &cm->local);
 
 	if (rc)
@@ -756,8 +608,10 @@ static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf,
 	return len;
 }
 
-static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len)
+static ssize_t comm_addr_store(struct config_item *item, const char *buf,
+		size_t len)
 {
+	struct dlm_comm *cm = config_item_to_comm(item);
 	struct sockaddr_storage *addr;
 	int rv;
 
@@ -783,8 +637,9 @@ static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len)
 	return len;
 }
 
-static ssize_t comm_addr_list_read(struct dlm_comm *cm, char *buf)
+static ssize_t comm_addr_list_show(struct config_item *item, char *buf)
 {
+	struct dlm_comm *cm = config_item_to_comm(item);
 	ssize_t s;
 	ssize_t allowance;
 	int i;
@@ -827,32 +682,28 @@ static ssize_t comm_addr_list_read(struct dlm_comm *cm, char *buf)
 	return 4096 - allowance;
 }
 
-static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
-			 char *buf)
-{
-	struct dlm_node *nd = config_item_to_node(i);
-	struct node_attribute *nda =
-			container_of(a, struct node_attribute, attr);
-	return nda->show ? nda->show(nd, buf) : 0;
-}
+CONFIGFS_ATTR(comm_, nodeid);
+CONFIGFS_ATTR(comm_, local);
+CONFIGFS_ATTR_WO(comm_, addr);
+CONFIGFS_ATTR_RO(comm_, addr_list);
 
-static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
-			  const char *buf, size_t len)
-{
-	struct dlm_node *nd = config_item_to_node(i);
-	struct node_attribute *nda =
-		container_of(a, struct node_attribute, attr);
-	return nda->store ? nda->store(nd, buf, len) : -EINVAL;
-}
+static struct configfs_attribute *comm_attrs[] = {
+	[COMM_ATTR_NODEID] = &comm_attr_nodeid,
+	[COMM_ATTR_LOCAL] = &comm_attr_local,
+	[COMM_ATTR_ADDR] = &comm_attr_addr,
+	[COMM_ATTR_ADDR_LIST] = &comm_attr_addr_list,
+	NULL,
+};
 
-static ssize_t node_nodeid_read(struct dlm_node *nd, char *buf)
+static ssize_t node_nodeid_show(struct config_item *item, char *buf)
 {
-	return sprintf(buf, "%d\n", nd->nodeid);
+	return sprintf(buf, "%d\n", config_item_to_node(item)->nodeid);
 }
 
-static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf,
+static ssize_t node_nodeid_store(struct config_item *item, const char *buf,
 				 size_t len)
 {
+	struct dlm_node *nd = config_item_to_node(item);
 	uint32_t seq = 0;
 	int rc = kstrtoint(buf, 0, &nd->nodeid);
 
@@ -863,21 +714,30 @@ static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf,
 	return len;
 }
 
-static ssize_t node_weight_read(struct dlm_node *nd, char *buf)
+static ssize_t node_weight_show(struct config_item *item, char *buf)
 {
-	return sprintf(buf, "%d\n", nd->weight);
+	return sprintf(buf, "%d\n", config_item_to_node(item)->weight);
 }
 
-static ssize_t node_weight_write(struct dlm_node *nd, const char *buf,
+static ssize_t node_weight_store(struct config_item *item, const char *buf,
 				 size_t len)
 {
-	int rc = kstrtoint(buf, 0, &nd->weight);
+	int rc = kstrtoint(buf, 0, &config_item_to_node(item)->weight);
 
 	if (rc)
 		return rc;
 	return len;
 }
 
+CONFIGFS_ATTR(node_, nodeid);
+CONFIGFS_ATTR(node_, weight);
+
+static struct configfs_attribute *node_attrs[] = {
+	[NODE_ATTR_NODEID] = &node_attr_nodeid,
+	[NODE_ATTR_WEIGHT] = &node_attr_weight,
+	NULL,
+};
+
 /*
  * Functions for the dlm to get the info that's been configured
  */
-- 
cgit v1.1


From 13a83fc9096dfaf2a7f4671b5777780bbe1d4a30 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 3 Oct 2015 15:32:57 +0200
Subject: ocfs2/cluster: move locking into attribute store methods

The test and separate set bit scheme was racy to start with, so move to do
a test_and_set_bit after doing the earlier error checks inside the actual
store methods.  Also remove the locking for the local attribute which
already has a different scheme to synchronize.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 fs/ocfs2/cluster/nodemanager.c | 54 +++++++++++++++---------------------------
 1 file changed, 19 insertions(+), 35 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 441c84e..7a398f6 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -188,7 +188,6 @@ enum {
 	O2NM_NODE_ATTR_NUM = 0,
 	O2NM_NODE_ATTR_PORT,
 	O2NM_NODE_ATTR_ADDRESS,
-	O2NM_NODE_ATTR_LOCAL,
 };
 
 static ssize_t o2nm_node_num_write(struct o2nm_node *node, const char *page,
@@ -197,6 +196,7 @@ static ssize_t o2nm_node_num_write(struct o2nm_node *node, const char *page,
 	struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
 	unsigned long tmp;
 	char *p = (char *)page;
+	int ret = 0;
 
 	tmp = simple_strtoul(p, &p, 0);
 	if (!p || (*p && (*p != '\n')))
@@ -215,15 +215,18 @@ static ssize_t o2nm_node_num_write(struct o2nm_node *node, const char *page,
 
 	write_lock(&cluster->cl_nodes_lock);
 	if (cluster->cl_nodes[tmp])
-		p = NULL;
+		ret = -EEXIST;
+	else if (test_and_set_bit(O2NM_NODE_ATTR_NUM,
+			&node->nd_set_attributes))
+		ret = -EBUSY;
 	else  {
 		cluster->cl_nodes[tmp] = node;
 		node->nd_num = tmp;
 		set_bit(tmp, cluster->cl_nodes_bitmap);
 	}
 	write_unlock(&cluster->cl_nodes_lock);
-	if (p == NULL)
-		return -EEXIST;
+	if (ret)
+		return ret;
 
 	return count;
 }
@@ -247,6 +250,8 @@ static ssize_t o2nm_node_ipv4_port_write(struct o2nm_node *node,
 	if (tmp >= (u16)-1)
 		return -ERANGE;
 
+	if (test_and_set_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes))
+		return -EBUSY;
 	node->nd_ipv4_port = htons(tmp);
 
 	return count;
@@ -282,6 +287,9 @@ static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node,
 	write_lock(&cluster->cl_nodes_lock);
 	if (o2nm_node_ip_tree_lookup(cluster, ipv4_addr, &p, &parent))
 		ret = -EEXIST;
+	else if (test_and_set_bit(O2NM_NODE_ATTR_ADDRESS,
+			&node->nd_set_attributes))
+		ret = -EBUSY;
 	else {
 		rb_link_node(&node->nd_ip_node, parent, p);
 		rb_insert_color(&node->nd_ip_node, &cluster->cl_node_ip_tree);
@@ -388,24 +396,13 @@ static struct o2nm_node_attribute o2nm_node_attr_local = {
 };
 
 static struct configfs_attribute *o2nm_node_attrs[] = {
-	[O2NM_NODE_ATTR_NUM] = &o2nm_node_attr_num.attr,
-	[O2NM_NODE_ATTR_PORT] = &o2nm_node_attr_ipv4_port.attr,
-	[O2NM_NODE_ATTR_ADDRESS] = &o2nm_node_attr_ipv4_address.attr,
-	[O2NM_NODE_ATTR_LOCAL] = &o2nm_node_attr_local.attr,
+	&o2nm_node_attr_num.attr,
+	&o2nm_node_attr_ipv4_port.attr,
+	&o2nm_node_attr_ipv4_address.attr,
+	&o2nm_node_attr_local.attr,
 	NULL,
 };
 
-static int o2nm_attr_index(struct configfs_attribute *attr)
-{
-	int i;
-	for (i = 0; i < ARRAY_SIZE(o2nm_node_attrs); i++) {
-		if (attr == o2nm_node_attrs[i])
-			return i;
-	}
-	BUG();
-	return 0;
-}
-
 static ssize_t o2nm_node_show(struct config_item *item,
 			      struct configfs_attribute *attr,
 			      char *page)
@@ -427,24 +424,11 @@ static ssize_t o2nm_node_store(struct config_item *item,
 	struct o2nm_node *node = to_o2nm_node(item);
 	struct o2nm_node_attribute *o2nm_node_attr =
 		container_of(attr, struct o2nm_node_attribute, attr);
-	ssize_t ret;
-	int attr_index = o2nm_attr_index(attr);
 
-	if (o2nm_node_attr->store == NULL) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	if (test_bit(attr_index, &node->nd_set_attributes))
-		return -EBUSY;
-
-	ret = o2nm_node_attr->store(node, page, count);
-	if (ret < count)
-		goto out;
+	if (o2nm_node_attr->store == NULL)
+		return -EINVAL;
 
-	set_bit(attr_index, &node->nd_set_attributes);
-out:
-	return ret;
+	return o2nm_node_attr->store(node, page, count);
 }
 
 static struct configfs_item_operations o2nm_node_item_ops = {
-- 
cgit v1.1


From 45b997737a8025be2825e464e9e9dd5d07160dc3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 3 Oct 2015 15:32:58 +0200
Subject: ocfs2/cluster: use per-attribute show and store methods

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 fs/ocfs2/cluster/heartbeat.c   | 205 +++++++----------------------------
 fs/ocfs2/cluster/nodemanager.c | 241 ++++++++++-------------------------------
 2 files changed, 100 insertions(+), 346 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index fa15deb..e404386 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1473,16 +1473,17 @@ static int o2hb_read_block_input(struct o2hb_region *reg,
 	return 0;
 }
 
-static ssize_t o2hb_region_block_bytes_read(struct o2hb_region *reg,
+static ssize_t o2hb_region_block_bytes_show(struct config_item *item,
 					    char *page)
 {
-	return sprintf(page, "%u\n", reg->hr_block_bytes);
+	return sprintf(page, "%u\n", to_o2hb_region(item)->hr_block_bytes);
 }
 
-static ssize_t o2hb_region_block_bytes_write(struct o2hb_region *reg,
+static ssize_t o2hb_region_block_bytes_store(struct config_item *item,
 					     const char *page,
 					     size_t count)
 {
+	struct o2hb_region *reg = to_o2hb_region(item);
 	int status;
 	unsigned long block_bytes;
 	unsigned int block_bits;
@@ -1501,16 +1502,17 @@ static ssize_t o2hb_region_block_bytes_write(struct o2hb_region *reg,
 	return count;
 }
 
-static ssize_t o2hb_region_start_block_read(struct o2hb_region *reg,
+static ssize_t o2hb_region_start_block_show(struct config_item *item,
 					    char *page)
 {
-	return sprintf(page, "%llu\n", reg->hr_start_block);
+	return sprintf(page, "%llu\n", to_o2hb_region(item)->hr_start_block);
 }
 
-static ssize_t o2hb_region_start_block_write(struct o2hb_region *reg,
+static ssize_t o2hb_region_start_block_store(struct config_item *item,
 					     const char *page,
 					     size_t count)
 {
+	struct o2hb_region *reg = to_o2hb_region(item);
 	unsigned long long tmp;
 	char *p = (char *)page;
 
@@ -1526,16 +1528,16 @@ static ssize_t o2hb_region_start_block_write(struct o2hb_region *reg,
 	return count;
 }
 
-static ssize_t o2hb_region_blocks_read(struct o2hb_region *reg,
-				       char *page)
+static ssize_t o2hb_region_blocks_show(struct config_item *item, char *page)
 {
-	return sprintf(page, "%d\n", reg->hr_blocks);
+	return sprintf(page, "%d\n", to_o2hb_region(item)->hr_blocks);
 }
 
-static ssize_t o2hb_region_blocks_write(struct o2hb_region *reg,
+static ssize_t o2hb_region_blocks_store(struct config_item *item,
 					const char *page,
 					size_t count)
 {
+	struct o2hb_region *reg = to_o2hb_region(item);
 	unsigned long tmp;
 	char *p = (char *)page;
 
@@ -1554,13 +1556,12 @@ static ssize_t o2hb_region_blocks_write(struct o2hb_region *reg,
 	return count;
 }
 
-static ssize_t o2hb_region_dev_read(struct o2hb_region *reg,
-				    char *page)
+static ssize_t o2hb_region_dev_show(struct config_item *item, char *page)
 {
 	unsigned int ret = 0;
 
-	if (reg->hr_bdev)
-		ret = sprintf(page, "%s\n", reg->hr_dev_name);
+	if (to_o2hb_region(item)->hr_bdev)
+		ret = sprintf(page, "%s\n", to_o2hb_region(item)->hr_dev_name);
 
 	return ret;
 }
@@ -1670,10 +1671,11 @@ out:
 }
 
 /* this is acting as commit; we set up all of hr_bdev and hr_task or nothing */
-static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
+static ssize_t o2hb_region_dev_store(struct config_item *item,
 				     const char *page,
 				     size_t count)
 {
+	struct o2hb_region *reg = to_o2hb_region(item);
 	struct task_struct *hb_task;
 	long fd;
 	int sectsize;
@@ -1828,9 +1830,9 @@ out:
 	return ret;
 }
 
-static ssize_t o2hb_region_pid_read(struct o2hb_region *reg,
-                                      char *page)
+static ssize_t o2hb_region_pid_show(struct config_item *item, char *page)
 {
+	struct o2hb_region *reg = to_o2hb_region(item);
 	pid_t pid = 0;
 
 	spin_lock(&o2hb_live_lock);
@@ -1844,92 +1846,23 @@ static ssize_t o2hb_region_pid_read(struct o2hb_region *reg,
 	return sprintf(page, "%u\n", pid);
 }
 
-struct o2hb_region_attribute {
-	struct configfs_attribute attr;
-	ssize_t (*show)(struct o2hb_region *, char *);
-	ssize_t (*store)(struct o2hb_region *, const char *, size_t);
-};
-
-static struct o2hb_region_attribute o2hb_region_attr_block_bytes = {
-	.attr	= { .ca_owner = THIS_MODULE,
-		    .ca_name = "block_bytes",
-		    .ca_mode = S_IRUGO | S_IWUSR },
-	.show	= o2hb_region_block_bytes_read,
-	.store	= o2hb_region_block_bytes_write,
-};
-
-static struct o2hb_region_attribute o2hb_region_attr_start_block = {
-	.attr	= { .ca_owner = THIS_MODULE,
-		    .ca_name = "start_block",
-		    .ca_mode = S_IRUGO | S_IWUSR },
-	.show	= o2hb_region_start_block_read,
-	.store	= o2hb_region_start_block_write,
-};
-
-static struct o2hb_region_attribute o2hb_region_attr_blocks = {
-	.attr	= { .ca_owner = THIS_MODULE,
-		    .ca_name = "blocks",
-		    .ca_mode = S_IRUGO | S_IWUSR },
-	.show	= o2hb_region_blocks_read,
-	.store	= o2hb_region_blocks_write,
-};
-
-static struct o2hb_region_attribute o2hb_region_attr_dev = {
-	.attr	= { .ca_owner = THIS_MODULE,
-		    .ca_name = "dev",
-		    .ca_mode = S_IRUGO | S_IWUSR },
-	.show	= o2hb_region_dev_read,
-	.store	= o2hb_region_dev_write,
-};
-
-static struct o2hb_region_attribute o2hb_region_attr_pid = {
-       .attr   = { .ca_owner = THIS_MODULE,
-                   .ca_name = "pid",
-                   .ca_mode = S_IRUGO | S_IRUSR },
-       .show   = o2hb_region_pid_read,
-};
+CONFIGFS_ATTR(o2hb_region_, block_bytes);
+CONFIGFS_ATTR(o2hb_region_, start_block);
+CONFIGFS_ATTR(o2hb_region_, blocks);
+CONFIGFS_ATTR(o2hb_region_, dev);
+CONFIGFS_ATTR_RO(o2hb_region_, pid);
 
 static struct configfs_attribute *o2hb_region_attrs[] = {
-	&o2hb_region_attr_block_bytes.attr,
-	&o2hb_region_attr_start_block.attr,
-	&o2hb_region_attr_blocks.attr,
-	&o2hb_region_attr_dev.attr,
-	&o2hb_region_attr_pid.attr,
+	&o2hb_region_attr_block_bytes,
+	&o2hb_region_attr_start_block,
+	&o2hb_region_attr_blocks,
+	&o2hb_region_attr_dev,
+	&o2hb_region_attr_pid,
 	NULL,
 };
 
-static ssize_t o2hb_region_show(struct config_item *item,
-				struct configfs_attribute *attr,
-				char *page)
-{
-	struct o2hb_region *reg = to_o2hb_region(item);
-	struct o2hb_region_attribute *o2hb_region_attr =
-		container_of(attr, struct o2hb_region_attribute, attr);
-	ssize_t ret = 0;
-
-	if (o2hb_region_attr->show)
-		ret = o2hb_region_attr->show(reg, page);
-	return ret;
-}
-
-static ssize_t o2hb_region_store(struct config_item *item,
-				 struct configfs_attribute *attr,
-				 const char *page, size_t count)
-{
-	struct o2hb_region *reg = to_o2hb_region(item);
-	struct o2hb_region_attribute *o2hb_region_attr =
-		container_of(attr, struct o2hb_region_attribute, attr);
-	ssize_t ret = -EINVAL;
-
-	if (o2hb_region_attr->store)
-		ret = o2hb_region_attr->store(reg, page, count);
-	return ret;
-}
-
 static struct configfs_item_operations o2hb_region_item_ops = {
 	.release		= o2hb_region_release,
-	.show_attribute		= o2hb_region_show,
-	.store_attribute	= o2hb_region_store,
 };
 
 static struct config_item_type o2hb_region_type = {
@@ -2124,49 +2057,14 @@ unlock:
 	spin_unlock(&o2hb_live_lock);
 }
 
-struct o2hb_heartbeat_group_attribute {
-	struct configfs_attribute attr;
-	ssize_t (*show)(struct o2hb_heartbeat_group *, char *);
-	ssize_t (*store)(struct o2hb_heartbeat_group *, const char *, size_t);
-};
-
-static ssize_t o2hb_heartbeat_group_show(struct config_item *item,
-					 struct configfs_attribute *attr,
-					 char *page)
-{
-	struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item));
-	struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr =
-		container_of(attr, struct o2hb_heartbeat_group_attribute, attr);
-	ssize_t ret = 0;
-
-	if (o2hb_heartbeat_group_attr->show)
-		ret = o2hb_heartbeat_group_attr->show(reg, page);
-	return ret;
-}
-
-static ssize_t o2hb_heartbeat_group_store(struct config_item *item,
-					  struct configfs_attribute *attr,
-					  const char *page, size_t count)
-{
-	struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item));
-	struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr =
-		container_of(attr, struct o2hb_heartbeat_group_attribute, attr);
-	ssize_t ret = -EINVAL;
-
-	if (o2hb_heartbeat_group_attr->store)
-		ret = o2hb_heartbeat_group_attr->store(reg, page, count);
-	return ret;
-}
-
-static ssize_t o2hb_heartbeat_group_threshold_show(struct o2hb_heartbeat_group *group,
-						     char *page)
+static ssize_t o2hb_heartbeat_group_threshold_show(struct config_item *item,
+		char *page)
 {
 	return sprintf(page, "%u\n", o2hb_dead_threshold);
 }
 
-static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group *group,
-						    const char *page,
-						    size_t count)
+static ssize_t o2hb_heartbeat_group_threshold_store(struct config_item *item,
+		const char *page, size_t count)
 {
 	unsigned long tmp;
 	char *p = (char *)page;
@@ -2181,17 +2079,15 @@ static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group
 	return count;
 }
 
-static
-ssize_t o2hb_heartbeat_group_mode_show(struct o2hb_heartbeat_group *group,
-				       char *page)
+static ssize_t o2hb_heartbeat_group_mode_show(struct config_item *item,
+		char *page)
 {
 	return sprintf(page, "%s\n",
 		       o2hb_heartbeat_mode_desc[o2hb_heartbeat_mode]);
 }
 
-static
-ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group,
-					const char *page, size_t count)
+static ssize_t o2hb_heartbeat_group_mode_store(struct config_item *item,
+		const char *page, size_t count)
 {
 	unsigned int i;
 	int ret;
@@ -2216,33 +2112,15 @@ ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group,
 
 }
 
-static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = {
-	.attr	= { .ca_owner = THIS_MODULE,
-		    .ca_name = "dead_threshold",
-		    .ca_mode = S_IRUGO | S_IWUSR },
-	.show	= o2hb_heartbeat_group_threshold_show,
-	.store	= o2hb_heartbeat_group_threshold_store,
-};
-
-static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_mode = {
-	.attr   = { .ca_owner = THIS_MODULE,
-		.ca_name = "mode",
-		.ca_mode = S_IRUGO | S_IWUSR },
-	.show   = o2hb_heartbeat_group_mode_show,
-	.store  = o2hb_heartbeat_group_mode_store,
-};
+CONFIGFS_ATTR(o2hb_heartbeat_group_, threshold);
+CONFIGFS_ATTR(o2hb_heartbeat_group_, mode);
 
 static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = {
-	&o2hb_heartbeat_group_attr_threshold.attr,
-	&o2hb_heartbeat_group_attr_mode.attr,
+	&o2hb_heartbeat_group_attr_threshold,
+	&o2hb_heartbeat_group_attr_mode,
 	NULL,
 };
 
-static struct configfs_item_operations o2hb_heartbeat_group_item_ops = {
-	.show_attribute		= o2hb_heartbeat_group_show,
-	.store_attribute	= o2hb_heartbeat_group_store,
-};
-
 static struct configfs_group_operations o2hb_heartbeat_group_group_ops = {
 	.make_item	= o2hb_heartbeat_group_make_item,
 	.drop_item	= o2hb_heartbeat_group_drop_item,
@@ -2250,7 +2128,6 @@ static struct configfs_group_operations o2hb_heartbeat_group_group_ops = {
 
 static struct config_item_type o2hb_heartbeat_group_type = {
 	.ct_group_ops	= &o2hb_heartbeat_group_group_ops,
-	.ct_item_ops	= &o2hb_heartbeat_group_item_ops,
 	.ct_attrs	= o2hb_heartbeat_group_attrs,
 	.ct_owner	= THIS_MODULE,
 };
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 7a398f6..72afdca 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -172,9 +172,9 @@ static void o2nm_node_release(struct config_item *item)
 	kfree(node);
 }
 
-static ssize_t o2nm_node_num_read(struct o2nm_node *node, char *page)
+static ssize_t o2nm_node_num_show(struct config_item *item, char *page)
 {
-	return sprintf(page, "%d\n", node->nd_num);
+	return sprintf(page, "%d\n", to_o2nm_node(item)->nd_num);
 }
 
 static struct o2nm_cluster *to_o2nm_cluster_from_node(struct o2nm_node *node)
@@ -190,9 +190,10 @@ enum {
 	O2NM_NODE_ATTR_ADDRESS,
 };
 
-static ssize_t o2nm_node_num_write(struct o2nm_node *node, const char *page,
+static ssize_t o2nm_node_num_store(struct config_item *item, const char *page,
 				   size_t count)
 {
+	struct o2nm_node *node = to_o2nm_node(item);
 	struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
 	unsigned long tmp;
 	char *p = (char *)page;
@@ -230,14 +231,15 @@ static ssize_t o2nm_node_num_write(struct o2nm_node *node, const char *page,
 
 	return count;
 }
-static ssize_t o2nm_node_ipv4_port_read(struct o2nm_node *node, char *page)
+static ssize_t o2nm_node_ipv4_port_show(struct config_item *item, char *page)
 {
-	return sprintf(page, "%u\n", ntohs(node->nd_ipv4_port));
+	return sprintf(page, "%u\n", ntohs(to_o2nm_node(item)->nd_ipv4_port));
 }
 
-static ssize_t o2nm_node_ipv4_port_write(struct o2nm_node *node,
+static ssize_t o2nm_node_ipv4_port_store(struct config_item *item,
 					 const char *page, size_t count)
 {
+	struct o2nm_node *node = to_o2nm_node(item);
 	unsigned long tmp;
 	char *p = (char *)page;
 
@@ -257,15 +259,16 @@ static ssize_t o2nm_node_ipv4_port_write(struct o2nm_node *node,
 	return count;
 }
 
-static ssize_t o2nm_node_ipv4_address_read(struct o2nm_node *node, char *page)
+static ssize_t o2nm_node_ipv4_address_show(struct config_item *item, char *page)
 {
-	return sprintf(page, "%pI4\n", &node->nd_ipv4_address);
+	return sprintf(page, "%pI4\n", &to_o2nm_node(item)->nd_ipv4_address);
 }
 
-static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node,
+static ssize_t o2nm_node_ipv4_address_store(struct config_item *item,
 					    const char *page,
 					    size_t count)
 {
+	struct o2nm_node *node = to_o2nm_node(item);
 	struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
 	int ret, i;
 	struct rb_node **p, *parent;
@@ -303,14 +306,15 @@ static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node,
 	return count;
 }
 
-static ssize_t o2nm_node_local_read(struct o2nm_node *node, char *page)
+static ssize_t o2nm_node_local_show(struct config_item *item, char *page)
 {
-	return sprintf(page, "%d\n", node->nd_local);
+	return sprintf(page, "%d\n", to_o2nm_node(item)->nd_local);
 }
 
-static ssize_t o2nm_node_local_write(struct o2nm_node *node, const char *page,
+static ssize_t o2nm_node_local_store(struct config_item *item, const char *page,
 				     size_t count)
 {
+	struct o2nm_node *node = to_o2nm_node(item);
 	struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
 	unsigned long tmp;
 	char *p = (char *)page;
@@ -357,84 +361,21 @@ static ssize_t o2nm_node_local_write(struct o2nm_node *node, const char *page,
 	return count;
 }
 
-struct o2nm_node_attribute {
-	struct configfs_attribute attr;
-	ssize_t (*show)(struct o2nm_node *, char *);
-	ssize_t (*store)(struct o2nm_node *, const char *, size_t);
-};
-
-static struct o2nm_node_attribute o2nm_node_attr_num = {
-	.attr	= { .ca_owner = THIS_MODULE,
-		    .ca_name = "num",
-		    .ca_mode = S_IRUGO | S_IWUSR },
-	.show	= o2nm_node_num_read,
-	.store	= o2nm_node_num_write,
-};
-
-static struct o2nm_node_attribute o2nm_node_attr_ipv4_port = {
-	.attr	= { .ca_owner = THIS_MODULE,
-		    .ca_name = "ipv4_port",
-		    .ca_mode = S_IRUGO | S_IWUSR },
-	.show	= o2nm_node_ipv4_port_read,
-	.store	= o2nm_node_ipv4_port_write,
-};
-
-static struct o2nm_node_attribute o2nm_node_attr_ipv4_address = {
-	.attr	= { .ca_owner = THIS_MODULE,
-		    .ca_name = "ipv4_address",
-		    .ca_mode = S_IRUGO | S_IWUSR },
-	.show	= o2nm_node_ipv4_address_read,
-	.store	= o2nm_node_ipv4_address_write,
-};
-
-static struct o2nm_node_attribute o2nm_node_attr_local = {
-	.attr	= { .ca_owner = THIS_MODULE,
-		    .ca_name = "local",
-		    .ca_mode = S_IRUGO | S_IWUSR },
-	.show	= o2nm_node_local_read,
-	.store	= o2nm_node_local_write,
-};
+CONFIGFS_ATTR(o2nm_node_, num);
+CONFIGFS_ATTR(o2nm_node_, ipv4_port);
+CONFIGFS_ATTR(o2nm_node_, ipv4_address);
+CONFIGFS_ATTR(o2nm_node_, local);
 
 static struct configfs_attribute *o2nm_node_attrs[] = {
-	&o2nm_node_attr_num.attr,
-	&o2nm_node_attr_ipv4_port.attr,
-	&o2nm_node_attr_ipv4_address.attr,
-	&o2nm_node_attr_local.attr,
+	&o2nm_node_attr_num,
+	&o2nm_node_attr_ipv4_port,
+	&o2nm_node_attr_ipv4_address,
+	&o2nm_node_attr_local,
 	NULL,
 };
 
-static ssize_t o2nm_node_show(struct config_item *item,
-			      struct configfs_attribute *attr,
-			      char *page)
-{
-	struct o2nm_node *node = to_o2nm_node(item);
-	struct o2nm_node_attribute *o2nm_node_attr =
-		container_of(attr, struct o2nm_node_attribute, attr);
-	ssize_t ret = 0;
-
-	if (o2nm_node_attr->show)
-		ret = o2nm_node_attr->show(node, page);
-	return ret;
-}
-
-static ssize_t o2nm_node_store(struct config_item *item,
-			       struct configfs_attribute *attr,
-			       const char *page, size_t count)
-{
-	struct o2nm_node *node = to_o2nm_node(item);
-	struct o2nm_node_attribute *o2nm_node_attr =
-		container_of(attr, struct o2nm_node_attribute, attr);
-
-	if (o2nm_node_attr->store == NULL)
-		return -EINVAL;
-
-	return o2nm_node_attr->store(node, page, count);
-}
-
 static struct configfs_item_operations o2nm_node_item_ops = {
 	.release		= o2nm_node_release,
-	.show_attribute		= o2nm_node_show,
-	.store_attribute	= o2nm_node_store,
 };
 
 static struct config_item_type o2nm_node_type = {
@@ -459,12 +400,6 @@ static struct o2nm_node_group *to_o2nm_node_group(struct config_group *group)
 }
 #endif
 
-struct o2nm_cluster_attribute {
-	struct configfs_attribute attr;
-	ssize_t (*show)(struct o2nm_cluster *, char *);
-	ssize_t (*store)(struct o2nm_cluster *, const char *, size_t);
-};
-
 static ssize_t o2nm_cluster_attr_write(const char *page, ssize_t count,
                                        unsigned int *val)
 {
@@ -485,15 +420,16 @@ static ssize_t o2nm_cluster_attr_write(const char *page, ssize_t count,
 	return count;
 }
 
-static ssize_t o2nm_cluster_attr_idle_timeout_ms_read(
-	struct o2nm_cluster *cluster, char *page)
+static ssize_t o2nm_cluster_idle_timeout_ms_show(struct config_item *item,
+	char *page)
 {
-	return sprintf(page, "%u\n", cluster->cl_idle_timeout_ms);
+	return sprintf(page, "%u\n", to_o2nm_cluster(item)->cl_idle_timeout_ms);
 }
 
-static ssize_t o2nm_cluster_attr_idle_timeout_ms_write(
-	struct o2nm_cluster *cluster, const char *page, size_t count)
+static ssize_t o2nm_cluster_idle_timeout_ms_store(struct config_item *item,
+	const char *page, size_t count)
 {
+	struct o2nm_cluster *cluster = to_o2nm_cluster(item);
 	ssize_t ret;
 	unsigned int val;
 
@@ -520,15 +456,17 @@ static ssize_t o2nm_cluster_attr_idle_timeout_ms_write(
 	return ret;
 }
 
-static ssize_t o2nm_cluster_attr_keepalive_delay_ms_read(
-	struct o2nm_cluster *cluster, char *page)
+static ssize_t o2nm_cluster_keepalive_delay_ms_show(
+	struct config_item *item, char *page)
 {
-	return sprintf(page, "%u\n", cluster->cl_keepalive_delay_ms);
+	return sprintf(page, "%u\n",
+			to_o2nm_cluster(item)->cl_keepalive_delay_ms);
 }
 
-static ssize_t o2nm_cluster_attr_keepalive_delay_ms_write(
-	struct o2nm_cluster *cluster, const char *page, size_t count)
+static ssize_t o2nm_cluster_keepalive_delay_ms_store(
+	struct config_item *item, const char *page, size_t count)
 {
+	struct o2nm_cluster *cluster = to_o2nm_cluster(item);
 	ssize_t ret;
 	unsigned int val;
 
@@ -555,22 +493,24 @@ static ssize_t o2nm_cluster_attr_keepalive_delay_ms_write(
 	return ret;
 }
 
-static ssize_t o2nm_cluster_attr_reconnect_delay_ms_read(
-	struct o2nm_cluster *cluster, char *page)
+static ssize_t o2nm_cluster_reconnect_delay_ms_show(
+	struct config_item *item, char *page)
 {
-	return sprintf(page, "%u\n", cluster->cl_reconnect_delay_ms);
+	return sprintf(page, "%u\n",
+			to_o2nm_cluster(item)->cl_reconnect_delay_ms);
 }
 
-static ssize_t o2nm_cluster_attr_reconnect_delay_ms_write(
-	struct o2nm_cluster *cluster, const char *page, size_t count)
+static ssize_t o2nm_cluster_reconnect_delay_ms_store(
+	struct config_item *item, const char *page, size_t count)
 {
 	return o2nm_cluster_attr_write(page, count,
-	                               &cluster->cl_reconnect_delay_ms);
+                               &to_o2nm_cluster(item)->cl_reconnect_delay_ms);
 }
 
-static ssize_t o2nm_cluster_attr_fence_method_read(
-	struct o2nm_cluster *cluster, char *page)
+static ssize_t o2nm_cluster_fence_method_show(
+	struct config_item *item, char *page)
 {
+	struct o2nm_cluster *cluster = to_o2nm_cluster(item);
 	ssize_t ret = 0;
 
 	if (cluster)
@@ -579,8 +519,8 @@ static ssize_t o2nm_cluster_attr_fence_method_read(
 	return ret;
 }
 
-static ssize_t o2nm_cluster_attr_fence_method_write(
-	struct o2nm_cluster *cluster, const char *page, size_t count)
+static ssize_t o2nm_cluster_fence_method_store(
+	struct config_item *item, const char *page, size_t count)
 {
 	unsigned int i;
 
@@ -592,10 +532,10 @@ static ssize_t o2nm_cluster_attr_fence_method_write(
 			continue;
 		if (strncasecmp(page, o2nm_fence_method_desc[i], count - 1))
 			continue;
-		if (cluster->cl_fence_method != i) {
+		if (to_o2nm_cluster(item)->cl_fence_method != i) {
 			printk(KERN_INFO "ocfs2: Changing fence method to %s\n",
 			       o2nm_fence_method_desc[i]);
-			cluster->cl_fence_method = i;
+			to_o2nm_cluster(item)->cl_fence_method = i;
 		}
 		return count;
 	}
@@ -604,79 +544,18 @@ bail:
 	return -EINVAL;
 }
 
-static struct o2nm_cluster_attribute o2nm_cluster_attr_idle_timeout_ms = {
-	.attr	= { .ca_owner = THIS_MODULE,
-		    .ca_name = "idle_timeout_ms",
-		    .ca_mode = S_IRUGO | S_IWUSR },
-	.show	= o2nm_cluster_attr_idle_timeout_ms_read,
-	.store	= o2nm_cluster_attr_idle_timeout_ms_write,
-};
-
-static struct o2nm_cluster_attribute o2nm_cluster_attr_keepalive_delay_ms = {
-	.attr	= { .ca_owner = THIS_MODULE,
-		    .ca_name = "keepalive_delay_ms",
-		    .ca_mode = S_IRUGO | S_IWUSR },
-	.show	= o2nm_cluster_attr_keepalive_delay_ms_read,
-	.store	= o2nm_cluster_attr_keepalive_delay_ms_write,
-};
-
-static struct o2nm_cluster_attribute o2nm_cluster_attr_reconnect_delay_ms = {
-	.attr	= { .ca_owner = THIS_MODULE,
-		    .ca_name = "reconnect_delay_ms",
-		    .ca_mode = S_IRUGO | S_IWUSR },
-	.show	= o2nm_cluster_attr_reconnect_delay_ms_read,
-	.store	= o2nm_cluster_attr_reconnect_delay_ms_write,
-};
-
-static struct o2nm_cluster_attribute o2nm_cluster_attr_fence_method = {
-	.attr	= { .ca_owner = THIS_MODULE,
-		    .ca_name = "fence_method",
-		    .ca_mode = S_IRUGO | S_IWUSR },
-	.show	= o2nm_cluster_attr_fence_method_read,
-	.store	= o2nm_cluster_attr_fence_method_write,
-};
+CONFIGFS_ATTR(o2nm_cluster_, idle_timeout_ms);
+CONFIGFS_ATTR(o2nm_cluster_, keepalive_delay_ms);
+CONFIGFS_ATTR(o2nm_cluster_, reconnect_delay_ms);
+CONFIGFS_ATTR(o2nm_cluster_, fence_method);
 
 static struct configfs_attribute *o2nm_cluster_attrs[] = {
-	&o2nm_cluster_attr_idle_timeout_ms.attr,
-	&o2nm_cluster_attr_keepalive_delay_ms.attr,
-	&o2nm_cluster_attr_reconnect_delay_ms.attr,
-	&o2nm_cluster_attr_fence_method.attr,
+	&o2nm_cluster_attr_idle_timeout_ms,
+	&o2nm_cluster_attr_keepalive_delay_ms,
+	&o2nm_cluster_attr_reconnect_delay_ms,
+	&o2nm_cluster_attr_fence_method,
 	NULL,
 };
-static ssize_t o2nm_cluster_show(struct config_item *item,
-                                 struct configfs_attribute *attr,
-                                 char *page)
-{
-	struct o2nm_cluster *cluster = to_o2nm_cluster(item);
-	struct o2nm_cluster_attribute *o2nm_cluster_attr =
-		container_of(attr, struct o2nm_cluster_attribute, attr);
-	ssize_t ret = 0;
-
-	if (o2nm_cluster_attr->show)
-		ret = o2nm_cluster_attr->show(cluster, page);
-	return ret;
-}
-
-static ssize_t o2nm_cluster_store(struct config_item *item,
-                                  struct configfs_attribute *attr,
-                                  const char *page, size_t count)
-{
-	struct o2nm_cluster *cluster = to_o2nm_cluster(item);
-	struct o2nm_cluster_attribute *o2nm_cluster_attr =
-		container_of(attr, struct o2nm_cluster_attribute, attr);
-	ssize_t ret;
-
-	if (o2nm_cluster_attr->store == NULL) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	ret = o2nm_cluster_attr->store(cluster, page, count);
-	if (ret < count)
-		goto out;
-out:
-	return ret;
-}
 
 static struct config_item *o2nm_node_group_make_item(struct config_group *group,
 						     const char *name)
@@ -757,8 +636,6 @@ static void o2nm_cluster_release(struct config_item *item)
 
 static struct configfs_item_operations o2nm_cluster_item_ops = {
 	.release	= o2nm_cluster_release,
-	.show_attribute		= o2nm_cluster_show,
-	.store_attribute	= o2nm_cluster_store,
 };
 
 static struct config_item_type o2nm_cluster_type = {
-- 
cgit v1.1


From 517982229f78b2aebf00a8a337e84e8eeea70b8e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 3 Oct 2015 15:32:59 +0200
Subject: configfs: remove old API

Remove the old show_attribute and store_attribute methods and update
the documentation.  Also replace the two C samples with a single new
one in the proper samples directory where people expect to find it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 fs/configfs/file.c | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index 106ca58..d39099e 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -65,7 +65,6 @@ static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buf
 {
 	struct configfs_attribute * attr = to_attr(dentry);
 	struct config_item * item = to_item(dentry->d_parent);
-	struct configfs_item_operations * ops = buffer->ops;
 	int ret = 0;
 	ssize_t count;
 
@@ -74,10 +73,7 @@ static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buf
 	if (!buffer->page)
 		return -ENOMEM;
 
-	if (ops->show_attribute)
-		count = ops->show_attribute(item, attr, buffer->page);
-	else
-		count = attr->show(item, buffer->page);
+	count = attr->show(item, buffer->page);
 
 	buffer->needs_read_fill = 0;
 	BUG_ON(count > (ssize_t)SIMPLE_ATTR_SIZE);
@@ -175,10 +171,7 @@ flush_write_buffer(struct dentry * dentry, struct configfs_buffer * buffer, size
 {
 	struct configfs_attribute * attr = to_attr(dentry);
 	struct config_item * item = to_item(dentry->d_parent);
-	struct configfs_item_operations * ops = buffer->ops;
 
-	if (ops->store_attribute)
-		return ops->store_attribute(item, attr, buffer->page, count);
 	return attr->store(item, buffer->page, count);
 }
 
@@ -243,8 +236,7 @@ static int check_perm(struct inode * inode, struct file * file)
 	 * and we must have a store method.
 	 */
 	if (file->f_mode & FMODE_WRITE) {
-		if (!(inode->i_mode & S_IWUGO) ||
-		    (!ops->store_attribute && !attr->store))
+		if (!(inode->i_mode & S_IWUGO) || !attr->store)
 			goto Eaccess;
 
 	}
@@ -254,8 +246,7 @@ static int check_perm(struct inode * inode, struct file * file)
 	 * must be a show method for it.
 	 */
 	if (file->f_mode & FMODE_READ) {
-		if (!(inode->i_mode & S_IRUGO) ||
-		    (!ops->show_attribute && !attr->show))
+		if (!(inode->i_mode & S_IRUGO) || !attr->show)
 			goto Eaccess;
 	}
 
-- 
cgit v1.1


From a7b7617493179a0ff76cbc0cc2eb45ad07074765 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Wed, 22 Apr 2015 14:20:47 +0200
Subject: mm: add architecture primitives for software dirty bit clearing

There are primitives to create and query the software dirty bits
in a pte or pmd. But the clearing of the software dirty bits is done
in common code with x86 specific page table functions.

Add the missing architecture primitives to clear the software dirty
bits to allow the feature to be used on non-x86 systems, e.g. the
s390 architecture.

Acked-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 fs/proc/task_mmu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index e2d46ad..b029d42 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -754,7 +754,7 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
 
 	if (pte_present(ptent)) {
 		ptent = pte_wrprotect(ptent);
-		ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY);
+		ptent = pte_clear_soft_dirty(ptent);
 	} else if (is_swap_pte(ptent)) {
 		ptent = pte_swp_clear_soft_dirty(ptent);
 	}
@@ -768,7 +768,7 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
 	pmd_t pmd = *pmdp;
 
 	pmd = pmd_wrprotect(pmd);
-	pmd = pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY);
+	pmd = pmd_clear_soft_dirty(pmd);
 
 	if (vma->vm_flags & VM_SOFTDIRTY)
 		vma->vm_flags &= ~VM_SOFTDIRTY;
-- 
cgit v1.1


From 6ca7d910121af4dd8c83294b50546f4664b2a932 Mon Sep 17 00:00:00 2001
From: Benjamin Coddington <bcodding@redhat.com>
Date: Thu, 15 Oct 2015 09:07:07 -0400
Subject: locks: Use more file_inode and fix a comment

Signed-off-by: Benjamin Coddington <bcodding@redhat.com>
Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
---
 fs/locks.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/locks.c b/fs/locks.c
index 316e474..c74c9df 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1718,8 +1718,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
 {
 	int error = -EAGAIN;
 	struct file_lock *fl, *victim = NULL;
-	struct dentry *dentry = filp->f_path.dentry;
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 	struct file_lock_context *ctx;
 	LIST_HEAD(dispose);
 
@@ -1759,8 +1758,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
 int generic_setlease(struct file *filp, long arg, struct file_lock **flp,
 			void **priv)
 {
-	struct dentry *dentry = filp->f_path.dentry;
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 	int error;
 
 	if ((!uid_eq(current_fsuid(), inode->i_uid)) && !capable(CAP_LEASE))
@@ -2115,7 +2113,7 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd,
 	return error;
 }
 
-/* Ensure that fl->fl_filp has compatible f_mode for F_SETLK calls */
+/* Ensure that fl->fl_file has compatible f_mode for F_SETLK calls */
 static int
 check_fmode_for_setlk(struct file_lock *fl)
 {
-- 
cgit v1.1


From b90197b655185a11640cce3a0a0bc5d8291b8ad2 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 15 Oct 2015 10:29:05 -0400
Subject: ext4: use private version of page_zero_new_buffers() for data=journal
 mode

If there is a error while copying data from userspace into the page
cache during a write(2) system call, in data=journal mode, in
ext4_journalled_write_end() were using page_zero_new_buffers() from
fs/buffer.c.  Unfortunately, this sets the buffer dirty flag, which is
no good if journalling is enabled.  This is a long-standing bug that
goes back for years and years in ext3, but a combination of (a)
data=journal not being very common, (b) in many case it only results
in a warning message. and (c) only very rarely causes the kernel hang,
means that we only really noticed this as a problem when commit
998ef75ddb caused this failure to happen frequently enough to cause
generic/208 to fail when run in data=journal mode.

The fix is to have our own version of this function that doesn't call
mark_dirty_buffer(), since we will end up calling
ext4_handle_dirty_metadata() on the buffer head(s) in questions very
shortly afterwards in ext4_journalled_write_end().

Thanks to Dave Hansen and Linus Torvalds for helping to identify the
root cause of the problem.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.com>
---
 fs/ext4/inode.c | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index ae52e32..f205ac3 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1181,6 +1181,38 @@ errout:
 	return ret ? ret : copied;
 }
 
+/*
+ * This is a private version of page_zero_new_buffers() which doesn't
+ * set the buffer to be dirty, since in data=journalled mode we need
+ * to call ext4_handle_dirty_metadata() instead.
+ */
+static void zero_new_buffers(struct page *page, unsigned from, unsigned to)
+{
+	unsigned int block_start = 0, block_end;
+	struct buffer_head *head, *bh;
+
+	bh = head = page_buffers(page);
+	do {
+		block_end = block_start + bh->b_size;
+		if (buffer_new(bh)) {
+			if (block_end > from && block_start < to) {
+				if (!PageUptodate(page)) {
+					unsigned start, size;
+
+					start = max(from, block_start);
+					size = min(to, block_end) - start;
+
+					zero_user(page, start, size);
+					set_buffer_uptodate(bh);
+				}
+				clear_buffer_new(bh);
+			}
+		}
+		block_start = block_end;
+		bh = bh->b_this_page;
+	} while (bh != head);
+}
+
 static int ext4_journalled_write_end(struct file *file,
 				     struct address_space *mapping,
 				     loff_t pos, unsigned len, unsigned copied,
@@ -1207,7 +1239,7 @@ static int ext4_journalled_write_end(struct file *file,
 		if (copied < len) {
 			if (!PageUptodate(page))
 				copied = 0;
-			page_zero_new_buffers(page, from+copied, to);
+			zero_new_buffers(page, from+copied, to);
 		}
 
 		ret = ext4_walk_page_buffers(handle, page_buffers(page), from,
-- 
cgit v1.1


From 8595798ca34d186d39abcb277591e541776c0ef5 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Thu, 15 Oct 2015 10:30:36 -0400
Subject: jbd2: gate checksum calculations on crc driver presence, not sb flags

Change the journal's checksum functions to gate on whether or not the
crc32c driver is loaded, and gate the loading on the superblock bits.
This prevents a journal crash if someone loads a journal in no-csum
mode and then randomizes the superblock, thus flipping on the feature
bits.

Tested-By: Nikolay Borisov <kernel@kyup.com>
Reported-by: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/jbd2/journal.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 8270fe9..00f7dbd 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -124,7 +124,7 @@ EXPORT_SYMBOL(__jbd2_debug);
 /* Checksumming functions */
 static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
 {
-	if (!jbd2_journal_has_csum_v2or3(j))
+	if (!jbd2_journal_has_csum_v2or3_feature(j))
 		return 1;
 
 	return sb->s_checksum_type == JBD2_CRC32C_CHKSUM;
@@ -1531,7 +1531,7 @@ static int journal_get_superblock(journal_t *journal)
 		goto out;
 	}
 
-	if (jbd2_journal_has_csum_v2or3(journal) &&
+	if (jbd2_journal_has_csum_v2or3_feature(journal) &&
 	    JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM)) {
 		/* Can't have checksum v1 and v2 on at the same time! */
 		printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 "
@@ -1545,7 +1545,7 @@ static int journal_get_superblock(journal_t *journal)
 	}
 
 	/* Load the checksum driver */
-	if (jbd2_journal_has_csum_v2or3(journal)) {
+	if (jbd2_journal_has_csum_v2or3_feature(journal)) {
 		journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
 		if (IS_ERR(journal->j_chksum_driver)) {
 			printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n");
-- 
cgit v1.1


From 9172796bc3754af73b90cbd70586812ddbc1e0ca Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Thu, 15 Oct 2015 10:33:21 -0400
Subject: ext4: promote ext4 over ext2 in the default probe order

Prevent clean ext3 filesystems from mounting by default with the ext2
driver (with no journal!) by putting ext4 ahead of ext2 in the default
probe order.  This will have the effect of mounting ext2 filesystems
with ext4.ko by default, which is a safer failure than hoping the user
notices that their journalled ext3 is now running without a journal!

Users who require ext2.ko for ext2 can either disable ext4.ko or
explicitly request ext2 via "mount -t ext2" or "rootfstype=ext2".

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/Makefile | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/Makefile b/fs/Makefile
index f79cf40..79f5225 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -63,10 +63,11 @@ obj-$(CONFIG_DLM)		+= dlm/
 # Do not add any filesystems before this line
 obj-$(CONFIG_FSCACHE)		+= fscache/
 obj-$(CONFIG_REISERFS_FS)	+= reiserfs/
-obj-$(CONFIG_EXT2_FS)		+= ext2/
-# We place ext4 after ext2 so plain ext2 root fs's are mounted using ext2
-# unless explicitly requested by rootfstype
 obj-$(CONFIG_EXT4_FS)		+= ext4/
+# We place ext4 before ext2 so that clean ext3 root fs's do NOT mount using the
+# ext2 driver, which doesn't know about journalling!  Explicitly request ext2
+# by giving the rootfstype= parameter.
+obj-$(CONFIG_EXT2_FS)		+= ext2/
 obj-$(CONFIG_JBD2)		+= jbd2/
 obj-$(CONFIG_CRAMFS)		+= cramfs/
 obj-$(CONFIG_SQUASHFS)		+= squashfs/
-- 
cgit v1.1


From 36022770de6cf9a403c40a68712ed2d2ea2746be Mon Sep 17 00:00:00 2001
From: Peng Tao <tao.peng@primarydata.com>
Date: Sat, 26 Sep 2015 02:24:34 +0800
Subject: nfs42: add CLONE xdr functions

xdr definitions per draft-ietf-nfsv4-minorversion2-38.txt

Signed-off-by: Peng Tao <tao.peng@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs42xdr.c | 97 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/nfs/nfs4xdr.c  |  1 +
 2 files changed, 97 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 0eb29e1..0ca482a 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -34,6 +34,12 @@
 					1 /* opaque devaddr4 length */ + \
 					XDR_QUADLEN(PNFS_LAYOUTSTATS_MAXSIZE))
 #define decode_layoutstats_maxsz	(op_decode_hdr_maxsz)
+#define encode_clone_maxsz		(encode_stateid_maxsz + \
+					encode_stateid_maxsz + \
+					2 /* src offset */ + \
+					2 /* dst offset */ + \
+					2 /* count */)
+#define decode_clone_maxsz		(op_decode_hdr_maxsz)
 
 #define NFS4_enc_allocate_sz		(compound_encode_hdr_maxsz + \
 					 encode_putfh_maxsz + \
@@ -65,7 +71,20 @@
 					 decode_sequence_maxsz + \
 					 decode_putfh_maxsz + \
 					 PNFS_LAYOUTSTATS_MAXDEV * decode_layoutstats_maxsz)
-
+#define NFS4_enc_clone_sz		(compound_encode_hdr_maxsz + \
+					 encode_sequence_maxsz + \
+					 encode_putfh_maxsz + \
+					 encode_savefh_maxsz + \
+					 encode_putfh_maxsz + \
+					 encode_clone_maxsz + \
+					 encode_getattr_maxsz)
+#define NFS4_dec_clone_sz		(compound_decode_hdr_maxsz + \
+					 decode_sequence_maxsz + \
+					 decode_putfh_maxsz + \
+					 decode_savefh_maxsz + \
+					 decode_putfh_maxsz + \
+					 decode_clone_maxsz + \
+					 decode_getattr_maxsz)
 
 static void encode_fallocate(struct xdr_stream *xdr,
 			     struct nfs42_falloc_args *args)
@@ -128,6 +147,21 @@ static void encode_layoutstats(struct xdr_stream *xdr,
 		encode_uint32(xdr, 0);
 }
 
+static void encode_clone(struct xdr_stream *xdr,
+			 struct nfs42_clone_args *args,
+			 struct compound_hdr *hdr)
+{
+	__be32 *p;
+
+	encode_op_hdr(xdr, OP_CLONE, decode_clone_maxsz, hdr);
+	encode_nfs4_stateid(xdr, &args->src_stateid);
+	encode_nfs4_stateid(xdr, &args->dst_stateid);
+	p = reserve_space(xdr, 3*8);
+	p = xdr_encode_hyper(p, args->src_offset);
+	p = xdr_encode_hyper(p, args->dst_offset);
+	xdr_encode_hyper(p, args->count);
+}
+
 /*
  * Encode ALLOCATE request
  */
@@ -206,6 +240,27 @@ static void nfs4_xdr_enc_layoutstats(struct rpc_rqst *req,
 	encode_nops(&hdr);
 }
 
+/*
+ * Encode CLONE request
+ */
+static void nfs4_xdr_enc_clone(struct rpc_rqst *req,
+			       struct xdr_stream *xdr,
+			       struct nfs42_clone_args *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->src_fh, &hdr);
+	encode_savefh(xdr, &hdr);
+	encode_putfh(xdr, args->dst_fh, &hdr);
+	encode_clone(xdr, args, &hdr);
+	encode_getfattr(xdr, args->dst_bitmask, &hdr);
+	encode_nops(&hdr);
+}
+
 static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res)
 {
 	return decode_op_hdr(xdr, OP_ALLOCATE);
@@ -243,6 +298,11 @@ static int decode_layoutstats(struct xdr_stream *xdr)
 	return decode_op_hdr(xdr, OP_LAYOUTSTATS);
 }
 
+static int decode_clone(struct xdr_stream *xdr)
+{
+	return decode_op_hdr(xdr, OP_CLONE);
+}
+
 /*
  * Decode ALLOCATE request
  */
@@ -351,4 +411,39 @@ out:
 	return status;
 }
 
+/*
+ * Decode CLONE request
+ */
+static int nfs4_xdr_dec_clone(struct rpc_rqst *rqstp,
+			      struct xdr_stream *xdr,
+			      struct nfs42_clone_res *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	status = decode_savefh(xdr);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	status = decode_clone(xdr);
+	if (status)
+		goto out;
+	status = decode_getfattr(xdr, res->dst_fattr, res->server);
+
+out:
+	res->rpc_status = status;
+	return status;
+}
+
 #endif /* __LINUX_FS_NFS_NFS4_2XDR_H */
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 788adf3..868472b 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -7465,6 +7465,7 @@ struct rpc_procinfo	nfs4_procedures[] = {
 	PROC(ALLOCATE,		enc_allocate,		dec_allocate),
 	PROC(DEALLOCATE,	enc_deallocate,		dec_deallocate),
 	PROC(LAYOUTSTATS,	enc_layoutstats,	dec_layoutstats),
+	PROC(CLONE,		enc_clone,		dec_clone),
 #endif /* CONFIG_NFS_V4_2 */
 };
 
-- 
cgit v1.1


From e5341f3a5762d17be9cdd06257c02c0098bdcab8 Mon Sep 17 00:00:00 2001
From: Peng Tao <tao.peng@primarydata.com>
Date: Sat, 26 Sep 2015 02:24:35 +0800
Subject: nfs42: add CLONE proc functions

Signed-off-by: Peng Tao <tao.peng@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs42.h     |  1 +
 fs/nfs/nfs42proc.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/nfs/nfs4proc.c  |  3 ++-
 3 files changed, 74 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h
index 814c125..b587ccd 100644
--- a/fs/nfs/nfs42.h
+++ b/fs/nfs/nfs42.h
@@ -17,5 +17,6 @@ int nfs42_proc_deallocate(struct file *, loff_t, loff_t);
 loff_t nfs42_proc_llseek(struct file *, loff_t, int);
 int nfs42_proc_layoutstats_generic(struct nfs_server *,
 				   struct nfs42_layoutstat_data *);
+int nfs42_proc_clone(struct file *, struct file *, loff_t, loff_t, loff_t);
 
 #endif /* __LINUX_FS_NFS_NFS4_2_H */
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 0f020e4..3e92a3c 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -271,3 +271,74 @@ int nfs42_proc_layoutstats_generic(struct nfs_server *server,
 		return PTR_ERR(task);
 	return 0;
 }
+
+static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f,
+			     struct file *dst_f, loff_t src_offset,
+			     loff_t dst_offset, loff_t count)
+{
+	struct inode *src_inode = file_inode(src_f);
+	struct inode *dst_inode = file_inode(dst_f);
+	struct nfs_server *server = NFS_SERVER(dst_inode);
+	struct nfs42_clone_args args = {
+		.src_fh = NFS_FH(src_inode),
+		.dst_fh = NFS_FH(dst_inode),
+		.src_offset = src_offset,
+		.dst_offset = dst_offset,
+		.dst_bitmask = server->cache_consistency_bitmask,
+	};
+	struct nfs42_clone_res res = {
+		.server	= server,
+	};
+	int status;
+
+	msg->rpc_argp = &args;
+	msg->rpc_resp = &res;
+
+	status = nfs42_set_rw_stateid(&args.src_stateid, src_f, FMODE_READ);
+	if (status)
+		return status;
+
+	status = nfs42_set_rw_stateid(&args.dst_stateid, dst_f, FMODE_WRITE);
+	if (status)
+		return status;
+
+	res.dst_fattr = nfs_alloc_fattr();
+	if (!res.dst_fattr)
+		return -ENOMEM;
+
+	status = nfs4_call_sync(server->client, server, msg,
+				&args.seq_args, &res.seq_res, 0);
+	if (status == 0)
+		status = nfs_post_op_update_inode(dst_inode, res.dst_fattr);
+
+	kfree(res.dst_fattr);
+	return status;
+}
+
+int nfs42_proc_clone(struct file *src_f, struct file *dst_f,
+		     loff_t src_offset, loff_t dst_offset, loff_t count)
+{
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLONE],
+	};
+	struct inode *inode = file_inode(src_f);
+	struct nfs_server *server = NFS_SERVER(file_inode(src_f));
+	struct nfs4_exception exception = { };
+	int err;
+
+	if (!nfs_server_capable(inode, NFS_CAP_CLONE))
+		return -EOPNOTSUPP;
+
+	do {
+		err = _nfs42_proc_clone(&msg, src_f, dst_f, src_offset,
+					dst_offset, count);
+		if (err == -ENOTSUPP || err == -EOPNOTSUPP) {
+			NFS_SERVER(inode)->caps &= ~NFS_CAP_CLONE;
+			return -EOPNOTSUPP;
+		}
+		err = nfs4_handle_exception(server, err, &exception);
+	} while (exception.retry);
+
+	return err;
+
+}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 5133bb1..9688b1a 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -8729,7 +8729,8 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
 		| NFS_CAP_ALLOCATE
 		| NFS_CAP_DEALLOCATE
 		| NFS_CAP_SEEK
-		| NFS_CAP_LAYOUTSTATS,
+		| NFS_CAP_LAYOUTSTATS
+		| NFS_CAP_CLONE,
 	.init_client = nfs41_init_client,
 	.shutdown_client = nfs41_shutdown_client,
 	.match_stateid = nfs41_match_stateid,
-- 
cgit v1.1


From bea51b30b281039f0f43fb4f42028ddf33fb601f Mon Sep 17 00:00:00 2001
From: Peng Tao <tao.peng@primarydata.com>
Date: Sat, 26 Sep 2015 02:24:36 +0800
Subject: nfs42: add NFS_IOC_CLONE ioctl

It can be called by user space to CLONE two files.
Follow btrfs lead and define NFS_IOC_CLONE same as BTRFS_IOC_CLONE.
Thus we don't mess up userspace with too many ioctls.

Signed-off-by: Peng Tao <tao.peng@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4file.c | 102 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 102 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index b0dbe0a..fc68ba5 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -4,6 +4,7 @@
  *  Copyright (C) 1992  Rick Sladkey
  */
 #include <linux/fs.h>
+#include <linux/file.h>
 #include <linux/falloc.h>
 #include <linux/nfs_fs.h>
 #include "delegation.h"
@@ -192,8 +193,104 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t
 		return nfs42_proc_deallocate(filep, offset, len);
 	return nfs42_proc_allocate(filep, offset, len);
 }
+
+static noinline long
+nfs42_ioctl_clone(struct file *dst_file, unsigned long srcfd,
+		  u64 src_off, u64 dst_off, u64 count)
+{
+	struct inode *dst_inode = file_inode(dst_file);
+	struct fd src_file;
+	struct inode *src_inode;
+	int ret;
+
+	/* dst file must be opened for writing */
+	if (!(dst_file->f_mode & FMODE_WRITE))
+		return -EINVAL;
+
+	ret = mnt_want_write_file(dst_file);
+	if (ret)
+		return ret;
+
+	src_file = fdget(srcfd);
+	if (!src_file.file) {
+		ret = -EBADF;
+		goto out_drop_write;
+	}
+
+	src_inode = file_inode(src_file.file);
+
+	/* src and dst must be different files */
+	ret = -EINVAL;
+	if (src_inode == dst_inode)
+		goto out_fput;
+
+	/* src file must be opened for reading */
+	if (!(src_file.file->f_mode & FMODE_READ))
+		goto out_fput;
+
+	/* src and dst must be regular files */
+	ret = -EISDIR;
+	if (!S_ISREG(src_inode->i_mode) || !S_ISREG(dst_inode->i_mode))
+		goto out_fput;
+
+	ret = -EXDEV;
+	if (src_file.file->f_path.mnt != dst_file->f_path.mnt ||
+	    src_inode->i_sb != dst_inode->i_sb)
+		goto out_fput;
+
+	/* XXX: do we lock at all? what if server needs CB_RECALL_LAYOUT? */
+	if (dst_inode < src_inode) {
+		mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_PARENT);
+		mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_CHILD);
+	} else {
+		mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_PARENT);
+		mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_CHILD);
+	}
+
+	/* flush all pending writes on both src and dst so that server
+	 * has the latest data */
+	ret = nfs_sync_inode(src_inode);
+	if (ret)
+		goto out_unlock;
+	ret = nfs_sync_inode(dst_inode);
+	if (ret)
+		goto out_unlock;
+
+	ret = nfs42_proc_clone(src_file.file, dst_file, src_off, dst_off, count);
+
+	/* truncate inode page cache of the dst range so that future reads can fetch
+	 * new data from server */
+	if (!ret)
+		truncate_inode_pages_range(&dst_inode->i_data, dst_off, dst_off + count - 1);
+
+out_unlock:
+	if (dst_inode < src_inode) {
+		mutex_unlock(&src_inode->i_mutex);
+		mutex_unlock(&dst_inode->i_mutex);
+	} else {
+		mutex_unlock(&dst_inode->i_mutex);
+		mutex_unlock(&src_inode->i_mutex);
+	}
+out_fput:
+	fdput(src_file);
+out_drop_write:
+	mnt_drop_write_file(dst_file);
+	return ret;
+}
 #endif /* CONFIG_NFS_V4_2 */
 
+long nfs4_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	switch (cmd) {
+#ifdef CONFIG_NFS_V4_2
+	case NFS_IOC_CLONE:
+		return nfs42_ioctl_clone(file, arg, 0, 0, 0);
+#endif
+	}
+
+	return -ENOTTY;
+}
+
 const struct file_operations nfs4_file_operations = {
 #ifdef CONFIG_NFS_V4_2
 	.llseek		= nfs4_file_llseek,
@@ -216,4 +313,9 @@ const struct file_operations nfs4_file_operations = {
 #endif /* CONFIG_NFS_V4_2 */
 	.check_flags	= nfs_check_flags,
 	.setlease	= simple_nosetlease,
+#ifdef CONFIG_COMPAT
+	.unlocked_ioctl = nfs4_ioctl,
+#else
+	.compat_ioctl	= nfs4_ioctl,
+#endif /* CONFIG_COMPAT */
 };
-- 
cgit v1.1


From 2a92ee92d4545448066fb664674c0ae5a9d5ea99 Mon Sep 17 00:00:00 2001
From: Peng Tao <tao.peng@primarydata.com>
Date: Sat, 26 Sep 2015 02:24:37 +0800
Subject: nfs: get clone_blksize when probing fsinfo

NFSv42 CLONE operation is supposed to respect it.

Signed-off-by: Peng Tao <tao.peng@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/client.c   |  1 +
 fs/nfs/nfs4proc.c |  1 +
 fs/nfs/nfs4xdr.c  | 25 +++++++++++++++++++++++++
 3 files changed, 27 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 57c5a02..d6d5d2a 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -764,6 +764,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,
 
 	server->time_delta = fsinfo->time_delta;
 
+	server->clone_blksize = fsinfo->clone_blksize;
 	/* We're airborne Set socket buffersize */
 	rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100);
 }
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 9688b1a..8814fbe 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -239,6 +239,7 @@ const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE
 			FATTR4_WORD1_TIME_DELTA
 			| FATTR4_WORD1_FS_LAYOUT_TYPES,
 			FATTR4_WORD2_LAYOUT_BLKSIZE
+			| FATTR4_WORD2_CLONE_BLKSIZE
 };
 
 const u32 nfs4_fs_locations_bitmap[3] = {
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 868472b..9f65679 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4764,6 +4764,28 @@ static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap,
 	return 0;
 }
 
+/*
+ * The granularity of a CLONE operation.
+ */
+static int decode_attr_clone_blksize(struct xdr_stream *xdr, uint32_t *bitmap,
+				     uint32_t *res)
+{
+	__be32 *p;
+
+	dprintk("%s: bitmap is %x\n", __func__, bitmap[2]);
+	*res = 0;
+	if (bitmap[2] & FATTR4_WORD2_CLONE_BLKSIZE) {
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p)) {
+			print_overflow_msg(__func__, xdr);
+			return -EIO;
+		}
+		*res = be32_to_cpup(p);
+		bitmap[2] &= ~FATTR4_WORD2_CLONE_BLKSIZE;
+	}
+	return 0;
+}
+
 static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
 {
 	unsigned int savep;
@@ -4798,6 +4820,9 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
 	status = decode_attr_layout_blksize(xdr, bitmap, &fsinfo->blksize);
 	if (status)
 		goto xdr_error;
+	status = decode_attr_clone_blksize(xdr, bitmap, &fsinfo->clone_blksize);
+	if (status)
+		goto xdr_error;
 
 	status = verify_attr_len(xdr, savep, attrlen);
 xdr_error:
-- 
cgit v1.1


From 811b7b85d6641df580a6c43184cf13d6fcc7498d Mon Sep 17 00:00:00 2001
From: Peng Tao <tao.peng@primarydata.com>
Date: Sat, 26 Sep 2015 02:24:38 +0800
Subject: nfs42: respect clone_blksize

draft-ietf-nfsv4-minorversion2-38.txt says:
   Both cl_src_offset and
   cl_dst_offset must be aligned to the clone block size Section 12.2.1.
   The number of bytes to be cloned must be a multiple of the clone
   block size, except in the case in which cl_src_offset plus the number
   of bytes to be cloned is equal to the source file size.

Signed-off-by: Peng Tao <tao.peng@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4file.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index fc68ba5..4f463dd 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -199,8 +199,10 @@ nfs42_ioctl_clone(struct file *dst_file, unsigned long srcfd,
 		  u64 src_off, u64 dst_off, u64 count)
 {
 	struct inode *dst_inode = file_inode(dst_file);
+	struct nfs_server *server = NFS_SERVER(dst_inode);
 	struct fd src_file;
 	struct inode *src_inode;
+	unsigned int bs = server->clone_blksize;
 	int ret;
 
 	/* dst file must be opened for writing */
@@ -238,6 +240,15 @@ nfs42_ioctl_clone(struct file *dst_file, unsigned long srcfd,
 	    src_inode->i_sb != dst_inode->i_sb)
 		goto out_fput;
 
+	/* check alignment w.r.t. clone_blksize */
+	ret = -EINVAL;
+	if (bs) {
+		if (!IS_ALIGNED(src_off, bs) || !IS_ALIGNED(dst_off, bs))
+			goto out_fput;
+		if (!IS_ALIGNED(count, bs) && i_size_read(src_inode) != (src_off + count))
+			goto out_fput;
+	}
+
 	/* XXX: do we lock at all? what if server needs CB_RECALL_LAYOUT? */
 	if (dst_inode < src_inode) {
 		mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_PARENT);
-- 
cgit v1.1


From a340abcf4173461f688292a6879b4d5bc781c2b1 Mon Sep 17 00:00:00 2001
From: Peng Tao <tao.peng@primarydata.com>
Date: Sat, 26 Sep 2015 02:24:39 +0800
Subject: nfs42: add NFS_IOC_CLONE_RANGE ioctl

It follows btrfs BTRFS_IOC_CLONE_RANGE lead on ioctl number and
arguments.

Signed-off-by: Peng Tao <tao.peng@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4file.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 4f463dd..4384a1d 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -288,14 +288,28 @@ out_drop_write:
 	mnt_drop_write_file(dst_file);
 	return ret;
 }
+
+static long nfs42_ioctl_clone_range(struct file *dst_file, void __user *argp)
+{
+	struct nfs_ioctl_clone_range_args args;
+
+	if (copy_from_user(&args, argp, sizeof(args)))
+		return -EFAULT;
+
+	return nfs42_ioctl_clone(dst_file, args.src_fd, args.src_off, args.dst_off, args.count);
+}
 #endif /* CONFIG_NFS_V4_2 */
 
 long nfs4_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
+	void __user *argp = (void __user *)arg;
+
 	switch (cmd) {
 #ifdef CONFIG_NFS_V4_2
 	case NFS_IOC_CLONE:
 		return nfs42_ioctl_clone(file, arg, 0, 0, 0);
+	case NFS_IOC_CLONE_RANGE:
+		return nfs42_ioctl_clone_range(file, argp);
 #endif
 	}
 
-- 
cgit v1.1


From 275058a2188903786e42b380ea5889ef0a7cdf95 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Thu, 15 Oct 2015 16:20:11 -0400
Subject: NFS: Fix an 'unused variable' complaint when #ifndef CONFIG_NFS_V4_2

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4file.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 4384a1d..4aa5719 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -298,6 +298,17 @@ static long nfs42_ioctl_clone_range(struct file *dst_file, void __user *argp)
 
 	return nfs42_ioctl_clone(dst_file, args.src_fd, args.src_off, args.dst_off, args.count);
 }
+#else
+static long nfs42_ioctl_clone(struct file *dst_file, unsigned long srcfd,
+		u64 src_off, u64 dst_off, u64 count)
+{
+	return -ENOTTY;
+}
+
+static long nfs42_ioctl_clone_range(struct file *dst_file, void __user *argp)
+{
+	return -ENOTTY;
+}
 #endif /* CONFIG_NFS_V4_2 */
 
 long nfs4_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
@@ -305,12 +316,10 @@ long nfs4_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	void __user *argp = (void __user *)arg;
 
 	switch (cmd) {
-#ifdef CONFIG_NFS_V4_2
 	case NFS_IOC_CLONE:
 		return nfs42_ioctl_clone(file, arg, 0, 0, 0);
 	case NFS_IOC_CLONE_RANGE:
 		return nfs42_ioctl_clone_range(file, argp);
-#endif
 	}
 
 	return -ENOTTY;
-- 
cgit v1.1


From 063d99b4fa762cbae9324dbbf9b6bff4b3a8cfdc Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Thu, 15 Oct 2015 15:28:24 -0700
Subject: mm, fs: obey gfp_mapping for add_to_page_cache()

Commit 6afdb859b710 ("mm: do not ignore mapping_gfp_mask in page cache
allocation paths") has caught some users of hardcoded GFP_KERNEL used in
the page cache allocation paths.  This, however, wasn't complete and
there were others which went unnoticed.

Dave Chinner has reported the following deadlock for xfs on loop device:
: With the recent merge of the loop device changes, I'm now seeing
: XFS deadlock on my single CPU, 1GB RAM VM running xfs/073.
:
: The deadlocked is as follows:
:
: kloopd1: loop_queue_read_work
:       xfs_file_iter_read
:       lock XFS inode XFS_IOLOCK_SHARED (on image file)
:       page cache read (GFP_KERNEL)
:       radix tree alloc
:       memory reclaim
:       reclaim XFS inodes
:       log force to unpin inodes
:       <wait for log IO completion>
:
: xfs-cil/loop1: <does log force IO work>
:       xlog_cil_push
:       xlog_write
:       <loop issuing log writes>
:               xlog_state_get_iclog_space()
:               <blocks due to all log buffers under write io>
:               <waits for IO completion>
:
: kloopd1: loop_queue_write_work
:       xfs_file_write_iter
:       lock XFS inode XFS_IOLOCK_EXCL (on image file)
:       <wait for inode to be unlocked>
:
: i.e. the kloopd, with it's split read and write work queues, has
: introduced a dependency through memory reclaim. i.e. that writes
: need to be able to progress for reads make progress.
:
: The problem, fundamentally, is that mpage_readpages() does a
: GFP_KERNEL allocation, rather than paying attention to the inode's
: mapping gfp mask, which is set to GFP_NOFS.
:
: The didn't used to happen, because the loop device used to issue
: reads through the splice path and that does:
:
:       error = add_to_page_cache_lru(page, mapping, index,
:                       GFP_KERNEL & mapping_gfp_mask(mapping));

This has changed by commit aa4d86163e4 ("block: loop: switch to VFS
ITER_BVEC").

This patch changes mpage_readpage{s} to follow gfp mask set for the
mapping.  There are, however, other places which are doing basically the
same.

lustre:ll_dir_filler is doing GFP_KERNEL from the function which
apparently uses GFP_NOFS for other allocations so let's make this
consistent.

cifs:readpages_get_pages is called from cifs_readpages and
__cifs_readpages_from_fscache called from the same path obeys mapping
gfp.

ramfs_nommu_expand_for_mapping is hardcoding GFP_KERNEL as well
regardless it uses mapping_gfp_mask for the page allocation.

ext4_mpage_readpages is the called from the page cache allocation path
same as read_pages and read_cache_pages

As I've noticed in my previous post I cannot say I would be happy about
sprinkling mapping_gfp_mask all over the place and it sounds like we
should drop gfp_mask argument altogether and use it internally in
__add_to_page_cache_locked that would require all the filesystems to use
mapping gfp consistently which I am not sure is the case here.  From a
quick glance it seems that some file system use it all the time while
others are selective.

Signed-off-by: Michal Hocko <mhocko@suse.com>
Reported-by: Dave Chinner <david@fromorbit.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Ming Lei <ming.lei@canonical.com>
Cc: Andreas Dilger <andreas.dilger@intel.com>
Cc: Oleg Drokin <oleg.drokin@intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/cifs/file.c        |  6 +++---
 fs/ext4/readpage.c    |  4 ++--
 fs/mpage.c            | 15 +++++++++------
 fs/ramfs/file-nommu.c |  5 +++--
 4 files changed, 17 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index e2a6af1..62203c3 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3380,6 +3380,7 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
 	struct page *page, *tpage;
 	unsigned int expected_index;
 	int rc;
+	gfp_t gfp = GFP_KERNEL & mapping_gfp_mask(mapping);
 
 	INIT_LIST_HEAD(tmplist);
 
@@ -3392,7 +3393,7 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
 	 */
 	__set_page_locked(page);
 	rc = add_to_page_cache_locked(page, mapping,
-				      page->index, GFP_KERNEL);
+				      page->index, gfp);
 
 	/* give up if we can't stick it in the cache */
 	if (rc) {
@@ -3418,8 +3419,7 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
 			break;
 
 		__set_page_locked(page);
-		if (add_to_page_cache_locked(page, mapping, page->index,
-								GFP_KERNEL)) {
+		if (add_to_page_cache_locked(page, mapping, page->index, gfp)) {
 			__clear_page_locked(page);
 			break;
 		}
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index e26803f..560af04 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -165,8 +165,8 @@ int ext4_mpage_readpages(struct address_space *mapping,
 		if (pages) {
 			page = list_entry(pages->prev, struct page, lru);
 			list_del(&page->lru);
-			if (add_to_page_cache_lru(page, mapping,
-						  page->index, GFP_KERNEL))
+			if (add_to_page_cache_lru(page, mapping, page->index,
+					GFP_KERNEL & mapping_gfp_mask(mapping)))
 				goto next_page;
 		}
 
diff --git a/fs/mpage.c b/fs/mpage.c
index 778a4dd..a7c3427 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -139,7 +139,8 @@ map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block)
 static struct bio *
 do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
 		sector_t *last_block_in_bio, struct buffer_head *map_bh,
-		unsigned long *first_logical_block, get_block_t get_block)
+		unsigned long *first_logical_block, get_block_t get_block,
+		gfp_t gfp)
 {
 	struct inode *inode = page->mapping->host;
 	const unsigned blkbits = inode->i_blkbits;
@@ -277,8 +278,7 @@ alloc_new:
 				goto out;
 		}
 		bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
-				min_t(int, nr_pages, BIO_MAX_PAGES),
-				GFP_KERNEL);
+				min_t(int, nr_pages, BIO_MAX_PAGES), gfp);
 		if (bio == NULL)
 			goto confused;
 	}
@@ -361,6 +361,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
 	sector_t last_block_in_bio = 0;
 	struct buffer_head map_bh;
 	unsigned long first_logical_block = 0;
+	gfp_t gfp = GFP_KERNEL & mapping_gfp_mask(mapping);
 
 	map_bh.b_state = 0;
 	map_bh.b_size = 0;
@@ -370,12 +371,13 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
 		prefetchw(&page->flags);
 		list_del(&page->lru);
 		if (!add_to_page_cache_lru(page, mapping,
-					page->index, GFP_KERNEL)) {
+					page->index,
+					gfp)) {
 			bio = do_mpage_readpage(bio, page,
 					nr_pages - page_idx,
 					&last_block_in_bio, &map_bh,
 					&first_logical_block,
-					get_block);
+					get_block, gfp);
 		}
 		page_cache_release(page);
 	}
@@ -395,11 +397,12 @@ int mpage_readpage(struct page *page, get_block_t get_block)
 	sector_t last_block_in_bio = 0;
 	struct buffer_head map_bh;
 	unsigned long first_logical_block = 0;
+	gfp_t gfp = GFP_KERNEL & mapping_gfp_mask(page->mapping);
 
 	map_bh.b_state = 0;
 	map_bh.b_size = 0;
 	bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio,
-			&map_bh, &first_logical_block, get_block);
+			&map_bh, &first_logical_block, get_block, gfp);
 	if (bio)
 		mpage_bio_submit(READ, bio);
 	return 0;
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index ba1323a..a586467 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -70,6 +70,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
 	unsigned order;
 	void *data;
 	int ret;
+	gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
 
 	/* make various checks */
 	order = get_order(newsize);
@@ -84,7 +85,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
 
 	/* allocate enough contiguous pages to be able to satisfy the
 	 * request */
-	pages = alloc_pages(mapping_gfp_mask(inode->i_mapping), order);
+	pages = alloc_pages(gfp, order);
 	if (!pages)
 		return -ENOMEM;
 
@@ -108,7 +109,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
 		struct page *page = pages + loop;
 
 		ret = add_to_page_cache_lru(page, inode->i_mapping, loop,
-					GFP_KERNEL);
+					gfp);
 		if (ret < 0)
 			goto add_error;
 
-- 
cgit v1.1


From 0f90cc6609c72b0bdf2aad0cb0456194dd896e19 Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Thu, 15 Oct 2015 15:28:32 -0700
Subject: mm, dax: fix DAX deadlocks

The following two locking commits in the DAX code:

commit 843172978bb9 ("dax: fix race between simultaneous faults")
commit 46c043ede471 ("mm: take i_mmap_lock in unmap_mapping_range() for DAX")

introduced a number of deadlocks and other issues which need to be fixed
for the v4.3 kernel.  The list of issues in DAX after these commits
(some newly introduced by the commits, some preexisting) can be found
here:

  https://lkml.org/lkml/2015/9/25/602 (Subject: "Re: [PATCH] dax: fix deadlock in __dax_fault").

This undoes most of the changes introduced by those two commits,
essentially returning us to the DAX locking scheme that was used in
v4.2.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Dan Williams <dan.j.williams@intel.com>
Tested-by: Dave Chinner <dchinner@redhat.com>
Cc: Jan Kara <jack@suse.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Matthew Wilcox <matthew.r.wilcox@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/dax.c | 70 +++++++++++++++++++++++++++-------------------------------------
 1 file changed, 29 insertions(+), 41 deletions(-)

(limited to 'fs')

diff --git a/fs/dax.c b/fs/dax.c
index bcfb14b..a86d3cc 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -285,6 +285,7 @@ static int copy_user_bh(struct page *to, struct buffer_head *bh,
 static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
 			struct vm_area_struct *vma, struct vm_fault *vmf)
 {
+	struct address_space *mapping = inode->i_mapping;
 	sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
 	unsigned long vaddr = (unsigned long)vmf->virtual_address;
 	void __pmem *addr;
@@ -292,6 +293,8 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
 	pgoff_t size;
 	int error;
 
+	i_mmap_lock_read(mapping);
+
 	/*
 	 * Check truncate didn't happen while we were allocating a block.
 	 * If it did, this block may or may not be still allocated to the
@@ -321,6 +324,8 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
 	error = vm_insert_mixed(vma, vaddr, pfn);
 
  out:
+	i_mmap_unlock_read(mapping);
+
 	return error;
 }
 
@@ -382,17 +387,15 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 			 * from a read fault and we've raced with a truncate
 			 */
 			error = -EIO;
-			goto unlock;
+			goto unlock_page;
 		}
-	} else {
-		i_mmap_lock_write(mapping);
 	}
 
 	error = get_block(inode, block, &bh, 0);
 	if (!error && (bh.b_size < PAGE_SIZE))
 		error = -EIO;		/* fs corruption? */
 	if (error)
-		goto unlock;
+		goto unlock_page;
 
 	if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
 		if (vmf->flags & FAULT_FLAG_WRITE) {
@@ -403,9 +406,8 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 			if (!error && (bh.b_size < PAGE_SIZE))
 				error = -EIO;
 			if (error)
-				goto unlock;
+				goto unlock_page;
 		} else {
-			i_mmap_unlock_write(mapping);
 			return dax_load_hole(mapping, page, vmf);
 		}
 	}
@@ -417,15 +419,17 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		else
 			clear_user_highpage(new_page, vaddr);
 		if (error)
-			goto unlock;
+			goto unlock_page;
 		vmf->page = page;
 		if (!page) {
+			i_mmap_lock_read(mapping);
 			/* Check we didn't race with truncate */
 			size = (i_size_read(inode) + PAGE_SIZE - 1) >>
 								PAGE_SHIFT;
 			if (vmf->pgoff >= size) {
+				i_mmap_unlock_read(mapping);
 				error = -EIO;
-				goto unlock;
+				goto out;
 			}
 		}
 		return VM_FAULT_LOCKED;
@@ -461,8 +465,6 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 			WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
 	}
 
-	if (!page)
-		i_mmap_unlock_write(mapping);
  out:
 	if (error == -ENOMEM)
 		return VM_FAULT_OOM | major;
@@ -471,14 +473,11 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		return VM_FAULT_SIGBUS | major;
 	return VM_FAULT_NOPAGE | major;
 
- unlock:
+ unlock_page:
 	if (page) {
 		unlock_page(page);
 		page_cache_release(page);
-	} else {
-		i_mmap_unlock_write(mapping);
 	}
-
 	goto out;
 }
 EXPORT_SYMBOL(__dax_fault);
@@ -556,10 +555,10 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
 
 	bh.b_size = PMD_SIZE;
-	i_mmap_lock_write(mapping);
 	length = get_block(inode, block, &bh, write);
 	if (length)
 		return VM_FAULT_SIGBUS;
+	i_mmap_lock_read(mapping);
 
 	/*
 	 * If the filesystem isn't willing to tell us the length of a hole,
@@ -569,36 +568,14 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE)
 		goto fallback;
 
-	sector = bh.b_blocknr << (blkbits - 9);
-
-	if (buffer_unwritten(&bh) || buffer_new(&bh)) {
-		int i;
-
-		length = bdev_direct_access(bh.b_bdev, sector, &kaddr, &pfn,
-						bh.b_size);
-		if (length < 0) {
-			result = VM_FAULT_SIGBUS;
-			goto out;
-		}
-		if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR))
-			goto fallback;
-
-		for (i = 0; i < PTRS_PER_PMD; i++)
-			clear_pmem(kaddr + i * PAGE_SIZE, PAGE_SIZE);
-		wmb_pmem();
-		count_vm_event(PGMAJFAULT);
-		mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
-		result |= VM_FAULT_MAJOR;
-	}
-
 	/*
 	 * If we allocated new storage, make sure no process has any
 	 * zero pages covering this hole
 	 */
 	if (buffer_new(&bh)) {
-		i_mmap_unlock_write(mapping);
+		i_mmap_unlock_read(mapping);
 		unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0);
-		i_mmap_lock_write(mapping);
+		i_mmap_lock_read(mapping);
 	}
 
 	/*
@@ -635,6 +612,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 		result = VM_FAULT_NOPAGE;
 		spin_unlock(ptl);
 	} else {
+		sector = bh.b_blocknr << (blkbits - 9);
 		length = bdev_direct_access(bh.b_bdev, sector, &kaddr, &pfn,
 						bh.b_size);
 		if (length < 0) {
@@ -644,15 +622,25 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 		if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR))
 			goto fallback;
 
+		if (buffer_unwritten(&bh) || buffer_new(&bh)) {
+			int i;
+			for (i = 0; i < PTRS_PER_PMD; i++)
+				clear_pmem(kaddr + i * PAGE_SIZE, PAGE_SIZE);
+			wmb_pmem();
+			count_vm_event(PGMAJFAULT);
+			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+			result |= VM_FAULT_MAJOR;
+		}
+
 		result |= vmf_insert_pfn_pmd(vma, address, pmd, pfn, write);
 	}
 
  out:
+	i_mmap_unlock_read(mapping);
+
 	if (buffer_unwritten(&bh))
 		complete_unwritten(&bh, !(result & VM_FAULT_ERROR));
 
-	i_mmap_unlock_write(mapping);
-
 	return result;
 
  fallback:
-- 
cgit v1.1


From 0305cd5f7fca85dae392b9ba85b116896eb7c1c7 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Fri, 16 Oct 2015 12:34:25 +0100
Subject: Btrfs: fix truncation of compressed and inlined extents

When truncating a file to a smaller size which consists of an inline
extent that is compressed, we did not discard (or made unusable) the
data between the new file size and the old file size, wasting metadata
space and allowing for the truncated data to be leaked and the data
corruption/loss mentioned below.
We were also not correctly decrementing the number of bytes used by the
inode, we were setting it to zero, giving a wrong report for callers of
the stat(2) syscall. The fsck tool also reported an error about a mismatch
between the nbytes of the file versus the real space used by the file.

Now because we weren't discarding the truncated region of the file, it
was possible for a caller of the clone ioctl to actually read the data
that was truncated, allowing for a security breach without requiring root
access to the system, using only standard filesystem operations. The
scenario is the following:

   1) User A creates a file which consists of an inline and compressed
      extent with a size of 2000 bytes - the file is not accessible to
      any other users (no read, write or execution permission for anyone
      else);

   2) The user truncates the file to a size of 1000 bytes;

   3) User A makes the file world readable;

   4) User B creates a file consisting of an inline extent of 2000 bytes;

   5) User B issues a clone operation from user A's file into its own
      file (using a length argument of 0, clone the whole range);

   6) User B now gets to see the 1000 bytes that user A truncated from
      its file before it made its file world readbale. User B also lost
      the bytes in the range [1000, 2000[ bytes from its own file, but
      that might be ok if his/her intention was reading stale data from
      user A that was never supposed to be public.

Note that this contrasts with the case where we truncate a file from 2000
bytes to 1000 bytes and then truncate it back from 1000 to 2000 bytes. In
this case reading any byte from the range [1000, 2000[ will return a value
of 0x00, instead of the original data.

This problem exists since the clone ioctl was added and happens both with
and without my recent data loss and file corruption fixes for the clone
ioctl (patch "Btrfs: fix file corruption and data loss after cloning
inline extents").

So fix this by truncating the compressed inline extents as we do for the
non-compressed case, which involves decompressing, if the data isn't already
in the page cache, compressing the truncated version of the extent, writing
the compressed content into the inline extent and then truncate it.

The following test case for fstests reproduces the problem. In order for
the test to pass both this fix and my previous fix for the clone ioctl
that forbids cloning a smaller inline extent into a larger one,
which is titled "Btrfs: fix file corruption and data loss after cloning
inline extents", are needed. Without that other fix the test fails in a
different way that does not leak the truncated data, instead part of
destination file gets replaced with zeroes (because the destination file
has a larger inline extent than the source).

  seq=`basename $0`
  seqres=$RESULT_DIR/$seq
  echo "QA output created by $seq"
  tmp=/tmp/$$
  status=1	# failure is the default!
  trap "_cleanup; exit \$status" 0 1 2 3 15

  _cleanup()
  {
      rm -f $tmp.*
  }

  # get standard environment, filters and checks
  . ./common/rc
  . ./common/filter

  # real QA test starts here
  _need_to_be_root
  _supported_fs btrfs
  _supported_os Linux
  _require_scratch
  _require_cloner

  rm -f $seqres.full

  _scratch_mkfs >>$seqres.full 2>&1
  _scratch_mount "-o compress"

  # Create our test files. File foo is going to be the source of a clone operation
  # and consists of a single inline extent with an uncompressed size of 512 bytes,
  # while file bar consists of a single inline extent with an uncompressed size of
  # 256 bytes. For our test's purpose, it's important that file bar has an inline
  # extent with a size smaller than foo's inline extent.
  $XFS_IO_PROG -f -c "pwrite -S 0xa1 0 128"   \
          -c "pwrite -S 0x2a 128 384" \
          $SCRATCH_MNT/foo | _filter_xfs_io
  $XFS_IO_PROG -f -c "pwrite -S 0xbb 0 256" $SCRATCH_MNT/bar | _filter_xfs_io

  # Now durably persist all metadata and data. We do this to make sure that we get
  # on disk an inline extent with a size of 512 bytes for file foo.
  sync

  # Now truncate our file foo to a smaller size. Because it consists of a
  # compressed and inline extent, btrfs did not shrink the inline extent to the
  # new size (if the extent was not compressed, btrfs would shrink it to 128
  # bytes), it only updates the inode's i_size to 128 bytes.
  $XFS_IO_PROG -c "truncate 128" $SCRATCH_MNT/foo

  # Now clone foo's inline extent into bar.
  # This clone operation should fail with errno EOPNOTSUPP because the source
  # file consists only of an inline extent and the file's size is smaller than
  # the inline extent of the destination (128 bytes < 256 bytes). However the
  # clone ioctl was not prepared to deal with a file that has a size smaller
  # than the size of its inline extent (something that happens only for compressed
  # inline extents), resulting in copying the full inline extent from the source
  # file into the destination file.
  #
  # Note that btrfs' clone operation for inline extents consists of removing the
  # inline extent from the destination inode and copy the inline extent from the
  # source inode into the destination inode, meaning that if the destination
  # inode's inline extent is larger (N bytes) than the source inode's inline
  # extent (M bytes), some bytes (N - M bytes) will be lost from the destination
  # file. Btrfs could copy the source inline extent's data into the destination's
  # inline extent so that we would not lose any data, but that's currently not
  # done due to the complexity that would be needed to deal with such cases
  # (specially when one or both extents are compressed), returning EOPNOTSUPP, as
  # it's normally not a very common case to clone very small files (only case
  # where we get inline extents) and copying inline extents does not save any
  # space (unlike for normal, non-inlined extents).
  $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/foo $SCRATCH_MNT/bar

  # Now because the above clone operation used to succeed, and due to foo's inline
  # extent not being shinked by the truncate operation, our file bar got the whole
  # inline extent copied from foo, making us lose the last 128 bytes from bar
  # which got replaced by the bytes in range [128, 256[ from foo before foo was
  # truncated - in other words, data loss from bar and being able to read old and
  # stale data from foo that should not be possible to read anymore through normal
  # filesystem operations. Contrast with the case where we truncate a file from a
  # size N to a smaller size M, truncate it back to size N and then read the range
  # [M, N[, we should always get the value 0x00 for all the bytes in that range.

  # We expected the clone operation to fail with errno EOPNOTSUPP and therefore
  # not modify our file's bar data/metadata. So its content should be 256 bytes
  # long with all bytes having the value 0xbb.
  #
  # Without the btrfs bug fix, the clone operation succeeded and resulted in
  # leaking truncated data from foo, the bytes that belonged to its range
  # [128, 256[, and losing data from bar in that same range. So reading the
  # file gave us the following content:
  #
  # 0000000 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1
  # *
  # 0000200 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a
  # *
  # 0000400
  echo "File bar's content after the clone operation:"
  od -t x1 $SCRATCH_MNT/bar

  # Also because the foo's inline extent was not shrunk by the truncate
  # operation, btrfs' fsck, which is run by the fstests framework everytime a
  # test completes, failed reporting the following error:
  #
  #  root 5 inode 257 errors 400, nbytes wrong

  status=0
  exit

Cc: stable@vger.kernel.org
Signed-off-by: Filipe Manana <fdmanana@suse.com>
---
 fs/btrfs/inode.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 68 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 208db4e..cbb4286 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4217,6 +4217,47 @@ static int truncate_space_check(struct btrfs_trans_handle *trans,
 
 }
 
+static int truncate_inline_extent(struct inode *inode,
+				  struct btrfs_path *path,
+				  struct btrfs_key *found_key,
+				  const u64 item_end,
+				  const u64 new_size)
+{
+	struct extent_buffer *leaf = path->nodes[0];
+	int slot = path->slots[0];
+	struct btrfs_file_extent_item *fi;
+	u32 size = (u32)(new_size - found_key->offset);
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+
+	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+
+	if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE) {
+		loff_t offset = new_size;
+		loff_t page_end = ALIGN(offset, PAGE_CACHE_SIZE);
+
+		/*
+		 * Zero out the remaining of the last page of our inline extent,
+		 * instead of directly truncating our inline extent here - that
+		 * would be much more complex (decompressing all the data, then
+		 * compressing the truncated data, which might be bigger than
+		 * the size of the inline extent, resize the extent, etc).
+		 * We release the path because to get the page we might need to
+		 * read the extent item from disk (data not in the page cache).
+		 */
+		btrfs_release_path(path);
+		return btrfs_truncate_page(inode, offset, page_end - offset, 0);
+	}
+
+	btrfs_set_file_extent_ram_bytes(leaf, fi, size);
+	size = btrfs_file_extent_calc_inline_size(size);
+	btrfs_truncate_item(root, path, size, 1);
+
+	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+		inode_sub_bytes(inode, item_end + 1 - new_size);
+
+	return 0;
+}
+
 /*
  * this can truncate away extent items, csum items and directory items.
  * It starts at a high offset and removes keys until it can't find
@@ -4411,27 +4452,40 @@ search_again:
 			 * special encodings
 			 */
 			if (!del_item &&
-			    btrfs_file_extent_compression(leaf, fi) == 0 &&
 			    btrfs_file_extent_encryption(leaf, fi) == 0 &&
 			    btrfs_file_extent_other_encoding(leaf, fi) == 0) {
-				u32 size = new_size - found_key.offset;
-
-				if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
-					inode_sub_bytes(inode, item_end + 1 -
-							new_size);
 
 				/*
-				 * update the ram bytes to properly reflect
-				 * the new size of our item
+				 * Need to release path in order to truncate a
+				 * compressed extent. So delete any accumulated
+				 * extent items so far.
 				 */
-				btrfs_set_file_extent_ram_bytes(leaf, fi, size);
-				size =
-				    btrfs_file_extent_calc_inline_size(size);
-				btrfs_truncate_item(root, path, size, 1);
+				if (btrfs_file_extent_compression(leaf, fi) !=
+				    BTRFS_COMPRESS_NONE && pending_del_nr) {
+					err = btrfs_del_items(trans, root, path,
+							      pending_del_slot,
+							      pending_del_nr);
+					if (err) {
+						btrfs_abort_transaction(trans,
+									root,
+									err);
+						goto error;
+					}
+					pending_del_nr = 0;
+				}
+
+				err = truncate_inline_extent(inode, path,
+							     &found_key,
+							     item_end,
+							     new_size);
+				if (err) {
+					btrfs_abort_transaction(trans,
+								root, err);
+					goto error;
+				}
 			} else if (test_bit(BTRFS_ROOT_REF_COWS,
 					    &root->state)) {
-				inode_sub_bytes(inode, item_end + 1 -
-						found_key.offset);
+				inode_sub_bytes(inode, item_end + 1 - new_size);
 			}
 		}
 delete:
-- 
cgit v1.1


From 8b4953e13f4c5d9a3c869f5fca7d51e1700e7db0 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 17 Oct 2015 16:15:18 -0400
Subject: ext4: reserve code points for the project quota feature

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 3f248c9..320f10e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -374,6 +374,7 @@ struct flex_groups {
 #define EXT4_EA_INODE_FL	        0x00200000 /* Inode used for large EA */
 #define EXT4_EOFBLOCKS_FL		0x00400000 /* Blocks allocated beyond EOF */
 #define EXT4_INLINE_DATA_FL		0x10000000 /* Inode has inline data. */
+#define EXT4_PROJINHERIT_FL		0x20000000 /* Create with parents projid */
 #define EXT4_RESERVED_FL		0x80000000 /* reserved for ext4 lib */
 
 #define EXT4_FL_USER_VISIBLE		0x004BDFFF /* User visible flags */
@@ -431,6 +432,7 @@ enum {
 	EXT4_INODE_EA_INODE	= 21,	/* Inode used for large EA */
 	EXT4_INODE_EOFBLOCKS	= 22,	/* Blocks allocated beyond EOF */
 	EXT4_INODE_INLINE_DATA	= 28,	/* Data in inode. */
+	EXT4_INODE_PROJINHERIT	= 29,	/* Create with parents projid */
 	EXT4_INODE_RESERVED	= 31,	/* reserved for ext4 lib */
 };
 
@@ -475,6 +477,7 @@ static inline void ext4_check_flag_values(void)
 	CHECK_FLAG_VALUE(EA_INODE);
 	CHECK_FLAG_VALUE(EOFBLOCKS);
 	CHECK_FLAG_VALUE(INLINE_DATA);
+	CHECK_FLAG_VALUE(PROJINHERIT);
 	CHECK_FLAG_VALUE(RESERVED);
 }
 
@@ -692,6 +695,7 @@ struct ext4_inode {
 	__le32  i_crtime;       /* File Creation time */
 	__le32  i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */
 	__le32  i_version_hi;	/* high 32 bits for 64-bit version */
+	__le32	i_projid;	/* Project ID */
 };
 
 struct move_extent {
@@ -1179,7 +1183,8 @@ struct ext4_super_block {
 	__u8	s_encrypt_algos[4];	/* Encryption algorithms in use  */
 	__u8	s_encrypt_pw_salt[16];	/* Salt used for string2key algorithm */
 	__le32	s_lpf_ino;		/* Location of the lost+found inode */
-	__le32	s_reserved[100];	/* Padding to the end of the block */
+	__le32	s_prj_quota_inum;	/* inode for tracking project quota */
+	__le32	s_reserved[99];		/* Padding to the end of the block */
 	__le32	s_checksum;		/* crc32c(superblock) */
 };
 
@@ -1566,6 +1571,7 @@ static inline int ext4_encrypted_inode(struct inode *inode)
  */
 #define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM	0x0400
 #define EXT4_FEATURE_RO_COMPAT_READONLY		0x1000
+#define EXT4_FEATURE_RO_COMPAT_PROJECT		0x2000
 
 #define EXT4_FEATURE_INCOMPAT_COMPRESSION	0x0001
 #define EXT4_FEATURE_INCOMPAT_FILETYPE		0x0002
-- 
cgit v1.1


From 8c81bd8f586c46eaf114758a78d82895a2b081c2 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Sat, 17 Oct 2015 16:16:02 -0400
Subject: ext4: store checksum seed in superblock

Allow the filesystem to store the metadata checksum seed in the
superblock and add an incompat feature to say that we're using it.
This enables tune2fs to change the UUID on a mounted metadata_csum
FS without having to (racy!) rewrite all disk metadata.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4.h  | 8 +++++---
 fs/ext4/super.c | 4 +++-
 fs/ext4/sysfs.c | 2 ++
 3 files changed, 10 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 320f10e..cd832b9 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1184,7 +1184,8 @@ struct ext4_super_block {
 	__u8	s_encrypt_pw_salt[16];	/* Salt used for string2key algorithm */
 	__le32	s_lpf_ino;		/* Location of the lost+found inode */
 	__le32	s_prj_quota_inum;	/* inode for tracking project quota */
-	__le32	s_reserved[99];		/* Padding to the end of the block */
+	__le32	s_checksum_seed;	/* crc32c(uuid) if csum_seed set */
+	__le32	s_reserved[98];		/* Padding to the end of the block */
 	__le32	s_checksum;		/* crc32c(superblock) */
 };
 
@@ -1584,7 +1585,7 @@ static inline int ext4_encrypted_inode(struct inode *inode)
 #define EXT4_FEATURE_INCOMPAT_FLEX_BG		0x0200
 #define EXT4_FEATURE_INCOMPAT_EA_INODE		0x0400 /* EA in inode */
 #define EXT4_FEATURE_INCOMPAT_DIRDATA		0x1000 /* data in dirent */
-#define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM	0x2000 /* use crc32c for bg */
+#define EXT4_FEATURE_INCOMPAT_CSUM_SEED		0x2000
 #define EXT4_FEATURE_INCOMPAT_LARGEDIR		0x4000 /* >2GB or 3-lvl htree */
 #define EXT4_FEATURE_INCOMPAT_INLINE_DATA	0x8000 /* data in inode */
 #define EXT4_FEATURE_INCOMPAT_ENCRYPT		0x10000
@@ -1613,7 +1614,8 @@ static inline int ext4_encrypted_inode(struct inode *inode)
 					 EXT4_FEATURE_INCOMPAT_FLEX_BG| \
 					 EXT4_FEATURE_INCOMPAT_MMP | \
 					 EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
-					 EXT4_FEATURE_INCOMPAT_ENCRYPT)
+					 EXT4_FEATURE_INCOMPAT_ENCRYPT | \
+					 EXT4_FEATURE_INCOMPAT_CSUM_SEED)
 #define EXT4_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
 					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
 					 EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 7ef3fa5..43c0cc8 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3196,7 +3196,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	}
 
 	/* Precompute checksum seed for all metadata */
-	if (ext4_has_metadata_csum(sb))
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_CSUM_SEED))
+		sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed);
+	else if (ext4_has_metadata_csum(sb))
 		sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
 					       sizeof(es->s_uuid));
 
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 62bef0f..1b57c72 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -224,12 +224,14 @@ EXT4_ATTR_FEATURE(lazy_itable_init);
 EXT4_ATTR_FEATURE(batched_discard);
 EXT4_ATTR_FEATURE(meta_bg_resize);
 EXT4_ATTR_FEATURE(encryption);
+EXT4_ATTR_FEATURE(metadata_csum_seed);
 
 static struct attribute *ext4_feat_attrs[] = {
 	ATTR_LIST(lazy_itable_init),
 	ATTR_LIST(batched_discard),
 	ATTR_LIST(meta_bg_resize),
 	ATTR_LIST(encryption),
+	ATTR_LIST(metadata_csum_seed),
 	NULL,
 };
 
-- 
cgit v1.1


From 6a797d2737838906f2ea0a31686e87c3151e21ca Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Sat, 17 Oct 2015 16:16:04 -0400
Subject: ext4: call out CRC and corruption errors with specific error codes

Instead of overloading EIO for CRC errors and corrupt structures,
return the same error codes that XFS returns for the same issues.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/balloc.c         |  2 +-
 fs/ext4/block_validity.c |  2 +-
 fs/ext4/dir.c            |  4 +--
 fs/ext4/ext4.h           |  3 ++
 fs/ext4/extents.c        | 75 ++++++++++++++++++++++++------------------------
 fs/ext4/ialloc.c         |  1 +
 fs/ext4/indirect.c       |  2 +-
 fs/ext4/inode.c          | 16 +++++------
 fs/ext4/mmp.c            |  8 ++++--
 fs/ext4/namei.c          | 28 +++++++++---------
 fs/ext4/super.c          | 10 ++++++-
 fs/ext4/symlink.c        |  2 +-
 fs/ext4/xattr.c          | 28 +++++++++---------
 fs/jbd2/journal.c        |  3 +-
 fs/jbd2/recovery.c       |  8 +++---
 15 files changed, 104 insertions(+), 88 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index cd6ea29..f831e10 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -203,7 +203,7 @@ static int ext4_init_block_bitmap(struct super_block *sb,
 					   count);
 		}
 		set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
-		return -EIO;
+		return -EFSBADCRC;
 	}
 	memset(bh->b_data, 0, sb->s_blocksize);
 
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 3522340..02ddec6 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -234,7 +234,7 @@ int ext4_check_blockref(const char *function, unsigned int line,
 			es->s_last_error_block = cpu_to_le64(blk);
 			ext4_error_inode(inode, function, line, blk,
 					 "invalid block");
-			return -EIO;
+			return -EFSCORRUPTED;
 		}
 	}
 	return 0;
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index f9e1491..b29cb70 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -621,14 +621,14 @@ int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf,
 	while ((char *) de < top) {
 		if (ext4_check_dir_entry(dir, NULL, de, bh,
 					 buf, buf_size, offset))
-			return -EIO;
+			return -EFSCORRUPTED;
 		nlen = EXT4_DIR_REC_LEN(de->name_len);
 		rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
 		de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
 		offset += rlen;
 	}
 	if ((char *) de > top)
-		return -EIO;
+		return -EFSCORRUPTED;
 
 	return 0;
 }
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index cd832b9..6c07973 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3064,4 +3064,7 @@ extern void ext4_resize_end(struct super_block *sb);
 
 #endif	/* __KERNEL__ */
 
+#define EFSBADCRC	EBADMSG		/* Bad CRC detected */
+#define EFSCORRUPTED	EUCLEAN		/* Filesystem is corrupted */
+
 #endif	/* _EXT4_H */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7f486e3..faeaba8 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -442,7 +442,7 @@ static int __ext4_ext_check(const char *function, unsigned int line,
 			    int depth, ext4_fsblk_t pblk)
 {
 	const char *error_msg;
-	int max = 0;
+	int max = 0, err = -EFSCORRUPTED;
 
 	if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
 		error_msg = "invalid magic";
@@ -473,6 +473,7 @@ static int __ext4_ext_check(const char *function, unsigned int line,
 	if (ext_depth(inode) != depth &&
 	    !ext4_extent_block_csum_verify(inode, eh)) {
 		error_msg = "extent tree corrupted";
+		err = -EFSBADCRC;
 		goto corrupted;
 	}
 	return 0;
@@ -485,7 +486,7 @@ corrupted:
 			 le16_to_cpu(eh->eh_magic),
 			 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
 			 max, le16_to_cpu(eh->eh_depth), depth);
-	return -EIO;
+	return err;
 }
 
 #define ext4_ext_check(inode, eh, depth, pblk)			\
@@ -910,7 +911,7 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block,
 			put_bh(bh);
 			EXT4_ERROR_INODE(inode,
 					 "ppos %d > depth %d", ppos, depth);
-			ret = -EIO;
+			ret = -EFSCORRUPTED;
 			goto err;
 		}
 		path[ppos].p_bh = bh;
@@ -959,7 +960,7 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
 		EXT4_ERROR_INODE(inode,
 				 "logical %d == ei_block %d!",
 				 logical, le32_to_cpu(curp->p_idx->ei_block));
-		return -EIO;
+		return -EFSCORRUPTED;
 	}
 
 	if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
@@ -968,7 +969,7 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
 				 "eh_entries %d >= eh_max %d!",
 				 le16_to_cpu(curp->p_hdr->eh_entries),
 				 le16_to_cpu(curp->p_hdr->eh_max));
-		return -EIO;
+		return -EFSCORRUPTED;
 	}
 
 	if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
@@ -992,7 +993,7 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
 
 	if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
 		EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
-		return -EIO;
+		return -EFSCORRUPTED;
 	}
 
 	ix->ei_block = cpu_to_le32(logical);
@@ -1001,7 +1002,7 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
 
 	if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
 		EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
-		return -EIO;
+		return -EFSCORRUPTED;
 	}
 
 	err = ext4_ext_dirty(handle, inode, curp);
@@ -1042,7 +1043,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 	 * border from split point */
 	if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
 		EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
-		return -EIO;
+		return -EFSCORRUPTED;
 	}
 	if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
 		border = path[depth].p_ext[1].ee_block;
@@ -1086,7 +1087,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 	newblock = ablocks[--a];
 	if (unlikely(newblock == 0)) {
 		EXT4_ERROR_INODE(inode, "newblock == 0!");
-		err = -EIO;
+		err = -EFSCORRUPTED;
 		goto cleanup;
 	}
 	bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS);
@@ -1112,7 +1113,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 		EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
 				 path[depth].p_hdr->eh_entries,
 				 path[depth].p_hdr->eh_max);
-		err = -EIO;
+		err = -EFSCORRUPTED;
 		goto cleanup;
 	}
 	/* start copy from next extent */
@@ -1151,7 +1152,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 	k = depth - at - 1;
 	if (unlikely(k < 0)) {
 		EXT4_ERROR_INODE(inode, "k %d < 0!", k);
-		err = -EIO;
+		err = -EFSCORRUPTED;
 		goto cleanup;
 	}
 	if (k)
@@ -1191,7 +1192,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 			EXT4_ERROR_INODE(inode,
 					 "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
 					 le32_to_cpu(path[i].p_ext->ee_block));
-			err = -EIO;
+			err = -EFSCORRUPTED;
 			goto cleanup;
 		}
 		/* start copy indexes */
@@ -1425,7 +1426,7 @@ static int ext4_ext_search_left(struct inode *inode,
 
 	if (unlikely(path == NULL)) {
 		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
-		return -EIO;
+		return -EFSCORRUPTED;
 	}
 	depth = path->p_depth;
 	*phys = 0;
@@ -1444,7 +1445,7 @@ static int ext4_ext_search_left(struct inode *inode,
 			EXT4_ERROR_INODE(inode,
 					 "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
 					 *logical, le32_to_cpu(ex->ee_block));
-			return -EIO;
+			return -EFSCORRUPTED;
 		}
 		while (--depth >= 0) {
 			ix = path[depth].p_idx;
@@ -1455,7 +1456,7 @@ static int ext4_ext_search_left(struct inode *inode,
 				  EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
 		le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0,
 				  depth);
-				return -EIO;
+				return -EFSCORRUPTED;
 			}
 		}
 		return 0;
@@ -1465,7 +1466,7 @@ static int ext4_ext_search_left(struct inode *inode,
 		EXT4_ERROR_INODE(inode,
 				 "logical %d < ee_block %d + ee_len %d!",
 				 *logical, le32_to_cpu(ex->ee_block), ee_len);
-		return -EIO;
+		return -EFSCORRUPTED;
 	}
 
 	*logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
@@ -1495,7 +1496,7 @@ static int ext4_ext_search_right(struct inode *inode,
 
 	if (unlikely(path == NULL)) {
 		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
-		return -EIO;
+		return -EFSCORRUPTED;
 	}
 	depth = path->p_depth;
 	*phys = 0;
@@ -1514,7 +1515,7 @@ static int ext4_ext_search_right(struct inode *inode,
 			EXT4_ERROR_INODE(inode,
 					 "first_extent(path[%d].p_hdr) != ex",
 					 depth);
-			return -EIO;
+			return -EFSCORRUPTED;
 		}
 		while (--depth >= 0) {
 			ix = path[depth].p_idx;
@@ -1522,7 +1523,7 @@ static int ext4_ext_search_right(struct inode *inode,
 				EXT4_ERROR_INODE(inode,
 						 "ix != EXT_FIRST_INDEX *logical %d!",
 						 *logical);
-				return -EIO;
+				return -EFSCORRUPTED;
 			}
 		}
 		goto found_extent;
@@ -1532,7 +1533,7 @@ static int ext4_ext_search_right(struct inode *inode,
 		EXT4_ERROR_INODE(inode,
 				 "logical %d < ee_block %d + ee_len %d!",
 				 *logical, le32_to_cpu(ex->ee_block), ee_len);
-		return -EIO;
+		return -EFSCORRUPTED;
 	}
 
 	if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
@@ -1670,7 +1671,7 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
 	if (unlikely(ex == NULL || eh == NULL)) {
 		EXT4_ERROR_INODE(inode,
 				 "ex %p == NULL or eh %p == NULL", ex, eh);
-		return -EIO;
+		return -EFSCORRUPTED;
 	}
 
 	if (depth == 0) {
@@ -1938,14 +1939,14 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 		mb_flags |= EXT4_MB_DELALLOC_RESERVED;
 	if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
 		EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
-		return -EIO;
+		return -EFSCORRUPTED;
 	}
 	depth = ext_depth(inode);
 	ex = path[depth].p_ext;
 	eh = path[depth].p_hdr;
 	if (unlikely(path[depth].p_hdr == NULL)) {
 		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
-		return -EIO;
+		return -EFSCORRUPTED;
 	}
 
 	/* try to insert block into found extent and return */
@@ -2172,7 +2173,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
 		if (unlikely(path[depth].p_hdr == NULL)) {
 			up_read(&EXT4_I(inode)->i_data_sem);
 			EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
-			err = -EIO;
+			err = -EFSCORRUPTED;
 			break;
 		}
 		ex = path[depth].p_ext;
@@ -2241,7 +2242,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
 
 		if (unlikely(es.es_len == 0)) {
 			EXT4_ERROR_INODE(inode, "es.es_len == 0");
-			err = -EIO;
+			err = -EFSCORRUPTED;
 			break;
 		}
 
@@ -2264,7 +2265,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
 						 "next extent == %u, next "
 						 "delalloc extent = %u",
 						 next, next_del);
-				err = -EIO;
+				err = -EFSCORRUPTED;
 				break;
 			}
 		}
@@ -2363,7 +2364,7 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
 	leaf = ext4_idx_pblock(path->p_idx);
 	if (unlikely(path->p_hdr->eh_entries == 0)) {
 		EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
-		return -EIO;
+		return -EFSCORRUPTED;
 	}
 	err = ext4_ext_get_access(handle, inode, path);
 	if (err)
@@ -2612,7 +2613,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 	eh = path[depth].p_hdr;
 	if (unlikely(path[depth].p_hdr == NULL)) {
 		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
-		return -EIO;
+		return -EFSCORRUPTED;
 	}
 	/* find where to start removing */
 	ex = path[depth].p_ext;
@@ -2666,7 +2667,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 					 "on extent %u:%u",
 					 start, end, ex_ee_block,
 					 ex_ee_block + ex_ee_len - 1);
-			err = -EIO;
+			err = -EFSCORRUPTED;
 			goto out;
 		} else if (a != ex_ee_block) {
 			/* remove tail of the extent */
@@ -2841,7 +2842,7 @@ again:
 				EXT4_ERROR_INODE(inode,
 						 "path[%d].p_hdr == NULL",
 						 depth);
-				err = -EIO;
+				err = -EFSCORRUPTED;
 			}
 			goto out;
 		}
@@ -2920,7 +2921,7 @@ again:
 		i = 0;
 
 		if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) {
-			err = -EIO;
+			err = -EFSCORRUPTED;
 			goto out;
 		}
 	}
@@ -2978,7 +2979,7 @@ again:
 			 * Should be a no-op if we did IO above. */
 			cond_resched();
 			if (WARN_ON(i + 1 > depth)) {
-				err = -EIO;
+				err = -EFSCORRUPTED;
 				break;
 			}
 			path[i + 1].p_bh = bh;
@@ -3345,7 +3346,7 @@ static int ext4_split_extent(handle_t *handle,
 	if (!ex) {
 		EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
 				 (unsigned long) map->m_lblk);
-		return -EIO;
+		return -EFSCORRUPTED;
 	}
 	unwritten = ext4_ext_is_unwritten(ex);
 	split_flag1 = 0;
@@ -3973,7 +3974,7 @@ convert_initialized_extent(handle_t *handle, struct inode *inode,
 		if (!ex) {
 			EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
 					 (unsigned long) map->m_lblk);
-			return -EIO;
+			return -EFSCORRUPTED;
 		}
 	}
 
@@ -4311,7 +4312,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 				 "lblock: %lu, depth: %d pblock %lld",
 				 (unsigned long) map->m_lblk, depth,
 				 path[depth].p_block);
-		err = -EIO;
+		err = -EFSCORRUPTED;
 		goto out2;
 	}
 
@@ -5274,7 +5275,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
 		if (depth == path->p_depth) {
 			ex_start = path[depth].p_ext;
 			if (!ex_start)
-				return -EIO;
+				return -EFSCORRUPTED;
 
 			ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
 
@@ -5414,7 +5415,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
 		if (!extent) {
 			EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
 					 (unsigned long) *iterator);
-			return -EIO;
+			return -EFSCORRUPTED;
 		}
 		if (SHIFT == SHIFT_LEFT && *iterator >
 		    le32_to_cpu(extent->ee_block)) {
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 619bfc1..f34b1aa 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1116,6 +1116,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
 	/* Error cases - e2fsck has already cleaned up for us */
 	if (ino > max_ino) {
 		ext4_warning(sb, "bad orphan ino %lu!  e2fsck was run?", ino);
+		err = -EFSCORRUPTED;
 		goto error;
 	}
 
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 2468261..d96ea53 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -566,7 +566,7 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
 				       EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
 		EXT4_ERROR_INODE(inode, "Can't allocate blocks for "
 				 "non-extent mapped inodes with bigalloc");
-		return -EUCLEAN;
+		return -EFSCORRUPTED;
 	}
 
 	/* Set up for the direct block allocation */
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f205ac3..2b92788 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -378,7 +378,7 @@ static int __check_block_validity(struct inode *inode, const char *func,
 				 "lblock %lu mapped to illegal pblock "
 				 "(length %d)", (unsigned long) map->m_lblk,
 				 map->m_len);
-		return -EIO;
+		return -EFSCORRUPTED;
 	}
 	return 0;
 }
@@ -480,7 +480,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 
 	/* We can handle the block number less than EXT_MAX_BLOCKS */
 	if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS))
-		return -EIO;
+		return -EFSCORRUPTED;
 
 	/* Lookup extent status tree firstly */
 	if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
@@ -3863,7 +3863,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
 
 	iloc->bh = NULL;
 	if (!ext4_valid_inum(sb, inode->i_ino))
-		return -EIO;
+		return -EFSCORRUPTED;
 
 	iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
 	gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
@@ -4111,7 +4111,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 			EXT4_ERROR_INODE(inode, "bad extra_isize (%u != %u)",
 				EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize,
 				EXT4_INODE_SIZE(inode->i_sb));
-			ret = -EIO;
+			ret = -EFSCORRUPTED;
 			goto bad_inode;
 		}
 	} else
@@ -4131,7 +4131,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 
 	if (!ext4_inode_csum_verify(inode, raw_inode, ei)) {
 		EXT4_ERROR_INODE(inode, "checksum invalid");
-		ret = -EIO;
+		ret = -EFSBADCRC;
 		goto bad_inode;
 	}
 
@@ -4246,7 +4246,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	    !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
 		EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",
 				 ei->i_file_acl);
-		ret = -EIO;
+		ret = -EFSCORRUPTED;
 		goto bad_inode;
 	} else if (!ext4_has_inline_data(inode)) {
 		if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
@@ -4297,7 +4297,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	} else if (ino == EXT4_BOOT_LOADER_INO) {
 		make_bad_inode(inode);
 	} else {
-		ret = -EIO;
+		ret = -EFSCORRUPTED;
 		EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
 		goto bad_inode;
 	}
@@ -4315,7 +4315,7 @@ bad_inode:
 struct inode *ext4_iget_normal(struct super_block *sb, unsigned long ino)
 {
 	if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)
-		return ERR_PTR(-EIO);
+		return ERR_PTR(-EFSCORRUPTED);
 	return ext4_iget(sb, ino);
 }
 
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 6eb1a61..0a512aa 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -98,10 +98,12 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
 	}
 
 	mmp = (struct mmp_struct *)((*bh)->b_data);
-	if (le32_to_cpu(mmp->mmp_magic) == EXT4_MMP_MAGIC &&
-	    ext4_mmp_csum_verify(sb, mmp))
+	if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC)
+		ret = -EFSCORRUPTED;
+	else if (!ext4_mmp_csum_verify(sb, mmp))
+		ret = -EFSBADCRC;
+	else
 		return 0;
-	ret = -EINVAL;
 
 warn_exit:
 	ext4_warning(sb, "Error %d while reading MMP block %llu",
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 9f61e76..8fd8e0d 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -109,7 +109,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
 	if (!bh) {
 		ext4_error_inode(inode, func, line, block,
 				 "Directory hole found");
-		return ERR_PTR(-EIO);
+		return ERR_PTR(-EFSCORRUPTED);
 	}
 	dirent = (struct ext4_dir_entry *) bh->b_data;
 	/* Determine whether or not we have an index block */
@@ -124,7 +124,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
 	if (!is_dx_block && type == INDEX) {
 		ext4_error_inode(inode, func, line, block,
 		       "directory leaf block found instead of index block");
-		return ERR_PTR(-EIO);
+		return ERR_PTR(-EFSCORRUPTED);
 	}
 	if (!ext4_has_metadata_csum(inode->i_sb) ||
 	    buffer_verified(bh))
@@ -142,7 +142,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
 			ext4_error_inode(inode, func, line, block,
 					 "Directory index failed checksum");
 			brelse(bh);
-			return ERR_PTR(-EIO);
+			return ERR_PTR(-EFSBADCRC);
 		}
 	}
 	if (!is_dx_block) {
@@ -152,7 +152,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
 			ext4_error_inode(inode, func, line, block,
 					 "Directory block failed checksum");
 			brelse(bh);
-			return ERR_PTR(-EIO);
+			return ERR_PTR(-EFSBADCRC);
 		}
 	}
 	return bh;
@@ -1570,19 +1570,19 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
 		brelse(bh);
 		if (!ext4_valid_inum(dir->i_sb, ino)) {
 			EXT4_ERROR_INODE(dir, "bad inode number: %u", ino);
-			return ERR_PTR(-EIO);
+			return ERR_PTR(-EFSCORRUPTED);
 		}
 		if (unlikely(ino == dir->i_ino)) {
 			EXT4_ERROR_INODE(dir, "'%pd' linked to parent dir",
 					 dentry);
-			return ERR_PTR(-EIO);
+			return ERR_PTR(-EFSCORRUPTED);
 		}
 		inode = ext4_iget_normal(dir->i_sb, ino);
 		if (inode == ERR_PTR(-ESTALE)) {
 			EXT4_ERROR_INODE(dir,
 					 "deleted inode referenced: %u",
 					 ino);
-			return ERR_PTR(-EIO);
+			return ERR_PTR(-EFSCORRUPTED);
 		}
 		if (!IS_ERR(inode) && ext4_encrypted_inode(dir) &&
 		    (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
@@ -1619,7 +1619,7 @@ struct dentry *ext4_get_parent(struct dentry *child)
 	if (!ext4_valid_inum(d_inode(child)->i_sb, ino)) {
 		EXT4_ERROR_INODE(d_inode(child),
 				 "bad parent inode number: %u", ino);
-		return ERR_PTR(-EIO);
+		return ERR_PTR(-EFSCORRUPTED);
 	}
 
 	return d_obtain_alias(ext4_iget_normal(d_inode(child)->i_sb, ino));
@@ -1807,7 +1807,7 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode,
 	while ((char *) de <= top) {
 		if (ext4_check_dir_entry(dir, NULL, de, bh,
 					 buf, buf_size, offset)) {
-			res = -EIO;
+			res = -EFSCORRUPTED;
 			goto return_result;
 		}
 		/* Provide crypto context and crypto buffer to ext4 match */
@@ -1967,7 +1967,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
 	if ((char *) de >= (((char *) root) + blocksize)) {
 		EXT4_ERROR_INODE(dir, "invalid rec_len for '..'");
 		brelse(bh);
-		return -EIO;
+		return -EFSCORRUPTED;
 	}
 	len = ((char *) root) + (blocksize - csum_size) - (char *) de;
 
@@ -2315,7 +2315,7 @@ int ext4_generic_delete_entry(handle_t *handle,
 	while (i < buf_size - csum_size) {
 		if (ext4_check_dir_entry(dir, NULL, de, bh,
 					 bh->b_data, bh->b_size, i))
-			return -EIO;
+			return -EFSCORRUPTED;
 		if (de == de_del)  {
 			if (pde)
 				pde->rec_len = ext4_rec_len_to_disk(
@@ -2934,7 +2934,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
 
 	inode = d_inode(dentry);
 
-	retval = -EIO;
+	retval = -EFSCORRUPTED;
 	if (le32_to_cpu(de->inode) != inode->i_ino)
 		goto end_rmdir;
 
@@ -3008,7 +3008,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
 
 	inode = d_inode(dentry);
 
-	retval = -EIO;
+	retval = -EFSCORRUPTED;
 	if (le32_to_cpu(de->inode) != inode->i_ino)
 		goto end_unlink;
 
@@ -3310,7 +3310,7 @@ static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent)
 	if (!ent->dir_bh)
 		return retval;
 	if (le32_to_cpu(ent->parent_de->inode) != ent->dir->i_ino)
-		return -EIO;
+		return -EFSCORRUPTED;
 	BUFFER_TRACE(ent->dir_bh, "get_write_access");
 	return ext4_journal_get_write_access(handle, ent->dir_bh);
 }
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 43c0cc8..f7519ad 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -490,6 +490,12 @@ const char *ext4_decode_error(struct super_block *sb, int errno,
 	char *errstr = NULL;
 
 	switch (errno) {
+	case -EFSCORRUPTED:
+		errstr = "Corrupt filesystem";
+		break;
+	case -EFSBADCRC:
+		errstr = "Filesystem failed CRC";
+		break;
 	case -EIO:
 		errstr = "IO failure";
 		break;
@@ -3192,6 +3198,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
 			 "invalid superblock checksum.  Run e2fsck?");
 		silent = 1;
+		ret = -EFSBADCRC;
 		goto cantfind_ext4;
 	}
 
@@ -3612,6 +3619,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	}
 	if (!ext4_check_descriptors(sb, &first_not_zeroed)) {
 		ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
+		ret = -EFSCORRUPTED;
 		goto failed_mount2;
 	}
 
@@ -4664,7 +4672,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	       "ext4_remount: Checksum for group %u failed (%u!=%u)",
 		g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)),
 					       le16_to_cpu(gdp->bg_checksum));
-					err = -EINVAL;
+					err = -EFSBADCRC;
 					goto restore_opts;
 				}
 			}
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index c677f2c..abe2401 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -57,7 +57,7 @@ static const char *ext4_encrypted_follow_link(struct dentry *dentry, void **cook
 	     sizeof(struct ext4_encrypted_symlink_data) - 1) >
 	    max_size) {
 		/* Symlink data on the disk is corrupted */
-		res = -EIO;
+		res = -EFSCORRUPTED;
 		goto errout;
 	}
 	plen = (cstr.len < EXT4_FNAME_CRYPTO_DIGEST_SIZE*2) ?
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 16e28c0..7649422 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -195,7 +195,7 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end,
 	while (!IS_LAST_ENTRY(e)) {
 		struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e);
 		if ((void *)next >= end)
-			return -EIO;
+			return -EFSCORRUPTED;
 		e = next;
 	}
 
@@ -205,7 +205,7 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end,
 		     (void *)e + sizeof(__u32) ||
 		     value_start + le16_to_cpu(entry->e_value_offs) +
 		    le32_to_cpu(entry->e_value_size) > end))
-			return -EIO;
+			return -EFSCORRUPTED;
 		entry = EXT4_XATTR_NEXT(entry);
 	}
 
@@ -222,9 +222,9 @@ ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh)
 
 	if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
 	    BHDR(bh)->h_blocks != cpu_to_le32(1))
-		return -EIO;
+		return -EFSCORRUPTED;
 	if (!ext4_xattr_block_csum_verify(inode, bh->b_blocknr, BHDR(bh)))
-		return -EIO;
+		return -EFSBADCRC;
 	error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size,
 				       bh->b_data);
 	if (!error)
@@ -239,7 +239,7 @@ ext4_xattr_check_entry(struct ext4_xattr_entry *entry, size_t size)
 
 	if (entry->e_value_block != 0 || value_size > size ||
 	    le16_to_cpu(entry->e_value_offs) + value_size > size)
-		return -EIO;
+		return -EFSCORRUPTED;
 	return 0;
 }
 
@@ -266,7 +266,7 @@ ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index,
 	}
 	*pentry = entry;
 	if (!cmp && ext4_xattr_check_entry(entry, size))
-			return -EIO;
+		return -EFSCORRUPTED;
 	return cmp ? -ENODATA : 0;
 }
 
@@ -297,13 +297,13 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
 bad_block:
 		EXT4_ERROR_INODE(inode, "bad block %llu",
 				 EXT4_I(inode)->i_file_acl);
-		error = -EIO;
+		error = -EFSCORRUPTED;
 		goto cleanup;
 	}
 	ext4_xattr_cache_insert(ext4_mb_cache, bh);
 	entry = BFIRST(bh);
 	error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1);
-	if (error == -EIO)
+	if (error == -EFSCORRUPTED)
 		goto bad_block;
 	if (error)
 		goto cleanup;
@@ -445,7 +445,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 	if (ext4_xattr_check_block(inode, bh)) {
 		EXT4_ERROR_INODE(inode, "bad block %llu",
 				 EXT4_I(inode)->i_file_acl);
-		error = -EIO;
+		error = -EFSCORRUPTED;
 		goto cleanup;
 	}
 	ext4_xattr_cache_insert(ext4_mb_cache, bh);
@@ -751,7 +751,7 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
 		if (ext4_xattr_check_block(inode, bs->bh)) {
 			EXT4_ERROR_INODE(inode, "bad block %llu",
 					 EXT4_I(inode)->i_file_acl);
-			error = -EIO;
+			error = -EFSCORRUPTED;
 			goto cleanup;
 		}
 		/* Find the named attribute. */
@@ -811,7 +811,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 					bs->bh);
 			}
 			unlock_buffer(bs->bh);
-			if (error == -EIO)
+			if (error == -EFSCORRUPTED)
 				goto bad_block;
 			if (!error)
 				error = ext4_handle_dirty_xattr_block(handle,
@@ -855,7 +855,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 	}
 
 	error = ext4_xattr_set_entry(i, s);
-	if (error == -EIO)
+	if (error == -EFSCORRUPTED)
 		goto bad_block;
 	if (error)
 		goto cleanup;
@@ -1314,7 +1314,7 @@ retry:
 		if (ext4_xattr_check_block(inode, bh)) {
 			EXT4_ERROR_INODE(inode, "bad block %llu",
 					 EXT4_I(inode)->i_file_acl);
-			error = -EIO;
+			error = -EFSCORRUPTED;
 			goto cleanup;
 		}
 		base = BHDR(bh);
@@ -1579,7 +1579,7 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1,
 		    memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
 			return 1;
 		if (entry1->e_value_block != 0 || entry2->e_value_block != 0)
-			return -EIO;
+			return -EFSCORRUPTED;
 		if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
 			   (char *)header2 + le16_to_cpu(entry2->e_value_offs),
 			   le32_to_cpu(entry1->e_value_size)))
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 00f7dbd..474c178 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1558,6 +1558,7 @@ static int journal_get_superblock(journal_t *journal)
 	/* Check superblock checksum */
 	if (!jbd2_superblock_csum_verify(journal, sb)) {
 		printk(KERN_ERR "JBD2: journal checksum error\n");
+		err = -EFSBADCRC;
 		goto out;
 	}
 
@@ -1649,7 +1650,7 @@ int jbd2_journal_load(journal_t *journal)
 		printk(KERN_ERR "JBD2: journal transaction %u on %s "
 		       "is corrupt.\n", journal->j_failed_commit,
 		       journal->j_devname);
-		return -EIO;
+		return -EFSCORRUPTED;
 	}
 
 	/* OK, we've finished with the dynamic journal bits:
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index a9079d0..5c836d7 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -140,7 +140,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
 
 	if (offset >= journal->j_maxlen) {
 		printk(KERN_ERR "JBD2: corrupted journal superblock\n");
-		return -EIO;
+		return -EFSCORRUPTED;
 	}
 
 	err = jbd2_journal_bmap(journal, offset, &blocknr);
@@ -527,7 +527,7 @@ static int do_one_pass(journal_t *journal,
 				printk(KERN_ERR "JBD2: Invalid checksum "
 				       "recovering block %lu in log\n",
 				       next_log_block);
-				err = -EIO;
+				err = -EFSBADCRC;
 				brelse(bh);
 				goto failed;
 			}
@@ -602,7 +602,7 @@ static int do_one_pass(journal_t *journal,
 						journal, tag, obh->b_data,
 						be32_to_cpu(tmp->h_sequence))) {
 						brelse(obh);
-						success = -EIO;
+						success = -EFSBADCRC;
 						printk(KERN_ERR "JBD2: Invalid "
 						       "checksum recovering "
 						       "block %llu in log\n",
@@ -851,7 +851,7 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
 	rcount = be32_to_cpu(header->r_count);
 
 	if (!jbd2_revoke_block_csum_verify(journal, header))
-		return -EINVAL;
+		return -EFSBADCRC;
 
 	if (jbd2_journal_has_csum_v2or3(journal))
 		csum_size = sizeof(struct jbd2_journal_revoke_tail);
-- 
cgit v1.1


From e2b911c53584a92266943f3b7f2cdbc19c1a4e80 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Sat, 17 Oct 2015 16:18:43 -0400
Subject: ext4: clean up feature test macros with predicate functions

Create separate predicate functions to test/set/clear feature flags,
thereby replacing the wordy old macros.  Furthermore, clean out the
places where we open-coded feature tests.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/ext4/balloc.c    |  16 +++--
 fs/ext4/dir.c       |   3 +-
 fs/ext4/ext4.h      | 146 +++++++++++++++++++++++++++++++++++++++++-----
 fs/ext4/ext4_jbd2.h |  10 ++--
 fs/ext4/extents.c   |   4 +-
 fs/ext4/ialloc.c    |   4 +-
 fs/ext4/indirect.c  |   3 +-
 fs/ext4/inline.c    |   3 +-
 fs/ext4/inode.c     |  16 ++---
 fs/ext4/ioctl.c     |  15 ++---
 fs/ext4/migrate.c   |   9 +--
 fs/ext4/namei.c     |   8 +--
 fs/ext4/resize.c    |  30 ++++------
 fs/ext4/super.c     | 165 ++++++++++++++++++++++------------------------------
 fs/ext4/xattr.c     |   4 +-
 15 files changed, 252 insertions(+), 184 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index f831e10..32184c1 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -213,7 +213,7 @@ static int ext4_init_block_bitmap(struct super_block *sb,
 
 	start = ext4_group_first_block_no(sb, block_group);
 
-	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
+	if (ext4_has_feature_flex_bg(sb))
 		flex_bg = 1;
 
 	/* Set bits for block and inode bitmaps, and inode table */
@@ -322,7 +322,7 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb,
 	ext4_fsblk_t blk;
 	ext4_fsblk_t group_first_block;
 
-	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
+	if (ext4_has_feature_flex_bg(sb)) {
 		/* with FLEX_BG, the inode/block bitmaps and itable
 		 * blocks may not be in the group at all
 		 * so the bitmap validation will be skipped for those groups
@@ -740,14 +740,13 @@ int ext4_bg_has_super(struct super_block *sb, ext4_group_t group)
 
 	if (group == 0)
 		return 1;
-	if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_SPARSE_SUPER2)) {
+	if (ext4_has_feature_sparse_super2(sb)) {
 		if (group == le32_to_cpu(es->s_backup_bgs[0]) ||
 		    group == le32_to_cpu(es->s_backup_bgs[1]))
 			return 1;
 		return 0;
 	}
-	if ((group <= 1) || !EXT4_HAS_RO_COMPAT_FEATURE(sb,
-					EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER))
+	if ((group <= 1) || !ext4_has_feature_sparse_super(sb))
 		return 1;
 	if (!(group & 1))
 		return 0;
@@ -776,7 +775,7 @@ static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb,
 	if (!ext4_bg_has_super(sb, group))
 		return 0;
 
-	if (EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG))
+	if (ext4_has_feature_meta_bg(sb))
 		return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
 	else
 		return EXT4_SB(sb)->s_gdb_count;
@@ -797,8 +796,7 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
 			le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
 	unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb);
 
-	if (!EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG) ||
-			metagroup < first_meta_bg)
+	if (!ext4_has_feature_meta_bg(sb) || metagroup < first_meta_bg)
 		return ext4_bg_num_gdb_nometa(sb, group);
 
 	return ext4_bg_num_gdb_meta(sb,group);
@@ -818,7 +816,7 @@ static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
 	/* Check for superblock and gdt backups in this group */
 	num = ext4_bg_has_super(sb, block_group);
 
-	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
+	if (!ext4_has_feature_meta_bg(sb) ||
 	    block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) *
 			  sbi->s_desc_per_block) {
 		if (num) {
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index b29cb70..1d1bca7 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -40,8 +40,7 @@ static int is_dx_dir(struct inode *inode)
 {
 	struct super_block *sb = inode->i_sb;
 
-	if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
-		     EXT4_FEATURE_COMPAT_DIR_INDEX) &&
+	if (ext4_has_feature_dir_index(inode->i_sb) &&
 	    ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
 	     ((inode->i_size >> sb->s_blocksize_bits) == 1) ||
 	     ext4_has_inline_data(inode)))
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6c07973..b1d04aa7 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1528,6 +1528,7 @@ static inline int ext4_encrypted_inode(struct inode *inode)
  * Feature set definitions
  */
 
+/* Use the ext4_{has,set,clear}_feature_* helpers; these will be removed */
 #define EXT4_HAS_COMPAT_FEATURE(sb,mask)			\
 	((EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) != 0)
 #define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask)			\
@@ -1590,6 +1591,94 @@ static inline int ext4_encrypted_inode(struct inode *inode)
 #define EXT4_FEATURE_INCOMPAT_INLINE_DATA	0x8000 /* data in inode */
 #define EXT4_FEATURE_INCOMPAT_ENCRYPT		0x10000
 
+#define EXT4_FEATURE_COMPAT_FUNCS(name, flagname) \
+static inline bool ext4_has_feature_##name(struct super_block *sb) \
+{ \
+	return ((EXT4_SB(sb)->s_es->s_feature_compat & \
+		cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname)) != 0); \
+} \
+static inline void ext4_set_feature_##name(struct super_block *sb) \
+{ \
+	EXT4_SB(sb)->s_es->s_feature_compat |= \
+		cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \
+} \
+static inline void ext4_clear_feature_##name(struct super_block *sb) \
+{ \
+	EXT4_SB(sb)->s_es->s_feature_compat &= \
+		~cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \
+}
+
+#define EXT4_FEATURE_RO_COMPAT_FUNCS(name, flagname) \
+static inline bool ext4_has_feature_##name(struct super_block *sb) \
+{ \
+	return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \
+		cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname)) != 0); \
+} \
+static inline void ext4_set_feature_##name(struct super_block *sb) \
+{ \
+	EXT4_SB(sb)->s_es->s_feature_ro_compat |= \
+		cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \
+} \
+static inline void ext4_clear_feature_##name(struct super_block *sb) \
+{ \
+	EXT4_SB(sb)->s_es->s_feature_ro_compat &= \
+		~cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \
+}
+
+#define EXT4_FEATURE_INCOMPAT_FUNCS(name, flagname) \
+static inline bool ext4_has_feature_##name(struct super_block *sb) \
+{ \
+	return ((EXT4_SB(sb)->s_es->s_feature_incompat & \
+		cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname)) != 0); \
+} \
+static inline void ext4_set_feature_##name(struct super_block *sb) \
+{ \
+	EXT4_SB(sb)->s_es->s_feature_incompat |= \
+		cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \
+} \
+static inline void ext4_clear_feature_##name(struct super_block *sb) \
+{ \
+	EXT4_SB(sb)->s_es->s_feature_incompat &= \
+		~cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \
+}
+
+EXT4_FEATURE_COMPAT_FUNCS(dir_prealloc,		DIR_PREALLOC)
+EXT4_FEATURE_COMPAT_FUNCS(imagic_inodes,	IMAGIC_INODES)
+EXT4_FEATURE_COMPAT_FUNCS(journal,		HAS_JOURNAL)
+EXT4_FEATURE_COMPAT_FUNCS(xattr,		EXT_ATTR)
+EXT4_FEATURE_COMPAT_FUNCS(resize_inode,		RESIZE_INODE)
+EXT4_FEATURE_COMPAT_FUNCS(dir_index,		DIR_INDEX)
+EXT4_FEATURE_COMPAT_FUNCS(sparse_super2,	SPARSE_SUPER2)
+
+EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super,	SPARSE_SUPER)
+EXT4_FEATURE_RO_COMPAT_FUNCS(large_file,	LARGE_FILE)
+EXT4_FEATURE_RO_COMPAT_FUNCS(btree_dir,		BTREE_DIR)
+EXT4_FEATURE_RO_COMPAT_FUNCS(huge_file,		HUGE_FILE)
+EXT4_FEATURE_RO_COMPAT_FUNCS(gdt_csum,		GDT_CSUM)
+EXT4_FEATURE_RO_COMPAT_FUNCS(dir_nlink,		DIR_NLINK)
+EXT4_FEATURE_RO_COMPAT_FUNCS(extra_isize,	EXTRA_ISIZE)
+EXT4_FEATURE_RO_COMPAT_FUNCS(quota,		QUOTA)
+EXT4_FEATURE_RO_COMPAT_FUNCS(bigalloc,		BIGALLOC)
+EXT4_FEATURE_RO_COMPAT_FUNCS(metadata_csum,	METADATA_CSUM)
+EXT4_FEATURE_RO_COMPAT_FUNCS(readonly,		READONLY)
+EXT4_FEATURE_RO_COMPAT_FUNCS(project,		PROJECT)
+
+EXT4_FEATURE_INCOMPAT_FUNCS(compression,	COMPRESSION)
+EXT4_FEATURE_INCOMPAT_FUNCS(filetype,		FILETYPE)
+EXT4_FEATURE_INCOMPAT_FUNCS(journal_needs_recovery,	RECOVER)
+EXT4_FEATURE_INCOMPAT_FUNCS(journal_dev,	JOURNAL_DEV)
+EXT4_FEATURE_INCOMPAT_FUNCS(meta_bg,		META_BG)
+EXT4_FEATURE_INCOMPAT_FUNCS(extents,		EXTENTS)
+EXT4_FEATURE_INCOMPAT_FUNCS(64bit,		64BIT)
+EXT4_FEATURE_INCOMPAT_FUNCS(mmp,		MMP)
+EXT4_FEATURE_INCOMPAT_FUNCS(flex_bg,		FLEX_BG)
+EXT4_FEATURE_INCOMPAT_FUNCS(ea_inode,		EA_INODE)
+EXT4_FEATURE_INCOMPAT_FUNCS(dirdata,		DIRDATA)
+EXT4_FEATURE_INCOMPAT_FUNCS(csum_seed,		CSUM_SEED)
+EXT4_FEATURE_INCOMPAT_FUNCS(largedir,		LARGEDIR)
+EXT4_FEATURE_INCOMPAT_FUNCS(inline_data,	INLINE_DATA)
+EXT4_FEATURE_INCOMPAT_FUNCS(encrypt,		ENCRYPT)
+
 #define EXT2_FEATURE_COMPAT_SUPP	EXT4_FEATURE_COMPAT_EXT_ATTR
 #define EXT2_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
 					 EXT4_FEATURE_INCOMPAT_META_BG)
@@ -1605,7 +1694,7 @@ static inline int ext4_encrypted_inode(struct inode *inode)
 					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
 					 EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
 
-#define EXT4_FEATURE_COMPAT_SUPP	EXT2_FEATURE_COMPAT_EXT_ATTR
+#define EXT4_FEATURE_COMPAT_SUPP	EXT4_FEATURE_COMPAT_EXT_ATTR
 #define EXT4_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
 					 EXT4_FEATURE_INCOMPAT_RECOVER| \
 					 EXT4_FEATURE_INCOMPAT_META_BG| \
@@ -1627,6 +1716,40 @@ static inline int ext4_encrypted_inode(struct inode *inode)
 					 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\
 					 EXT4_FEATURE_RO_COMPAT_QUOTA)
 
+#define EXTN_FEATURE_FUNCS(ver) \
+static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \
+{ \
+	return ((EXT4_SB(sb)->s_es->s_feature_compat & \
+		cpu_to_le32(~EXT##ver##_FEATURE_COMPAT_SUPP)) != 0); \
+} \
+static inline bool ext4_has_unknown_ext##ver##_ro_compat_features(struct super_block *sb) \
+{ \
+	return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \
+		cpu_to_le32(~EXT##ver##_FEATURE_RO_COMPAT_SUPP)) != 0); \
+} \
+static inline bool ext4_has_unknown_ext##ver##_incompat_features(struct super_block *sb) \
+{ \
+	return ((EXT4_SB(sb)->s_es->s_feature_incompat & \
+		cpu_to_le32(~EXT##ver##_FEATURE_INCOMPAT_SUPP)) != 0); \
+}
+
+EXTN_FEATURE_FUNCS(2)
+EXTN_FEATURE_FUNCS(3)
+EXTN_FEATURE_FUNCS(4)
+
+static inline bool ext4_has_compat_features(struct super_block *sb)
+{
+	return (EXT4_SB(sb)->s_es->s_feature_compat != 0);
+}
+static inline bool ext4_has_ro_compat_features(struct super_block *sb)
+{
+	return (EXT4_SB(sb)->s_es->s_feature_ro_compat != 0);
+}
+static inline bool ext4_has_incompat_features(struct super_block *sb)
+{
+	return (EXT4_SB(sb)->s_es->s_feature_incompat != 0);
+}
+
 /*
  * Default values for user and/or group using reserved blocks
  */
@@ -1777,8 +1900,7 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
  * (c) Daniel Phillips, 2001
  */
 
-#define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \
-				      EXT4_FEATURE_COMPAT_DIR_INDEX) && \
+#define is_dx(dir) (ext4_has_feature_dir_index((dir)->i_sb) && \
 		    ext4_test_inode_flag((dir), EXT4_INODE_INDEX))
 #define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX)
 #define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
@@ -2079,7 +2201,7 @@ int ext4_init_crypto(void);
 void ext4_exit_crypto(void);
 static inline int ext4_sb_has_crypto(struct super_block *sb)
 {
-	return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT);
+	return ext4_has_feature_encrypt(sb);
 }
 #else
 static inline int ext4_init_crypto(void) { return 0; }
@@ -2200,8 +2322,7 @@ int ext4_insert_dentry(struct inode *dir,
 		       struct ext4_filename *fname);
 static inline void ext4_update_dx_flag(struct inode *inode)
 {
-	if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
-				     EXT4_FEATURE_COMPAT_DIR_INDEX))
+	if (!ext4_has_feature_dir_index(inode->i_sb))
 		ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
 }
 static unsigned char ext4_filetype_table[] = {
@@ -2210,8 +2331,7 @@ static unsigned char ext4_filetype_table[] = {
 
 static inline  unsigned char get_dtype(struct super_block *sb, int filetype)
 {
-	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) ||
-	    (filetype >= EXT4_FT_MAX))
+	if (!ext4_has_feature_filetype(sb) || filetype >= EXT4_FT_MAX)
 		return DT_UNKNOWN;
 
 	return ext4_filetype_table[filetype];
@@ -2543,15 +2663,13 @@ extern int ext4_register_li_request(struct super_block *sb,
 
 static inline int ext4_has_group_desc_csum(struct super_block *sb)
 {
-	return EXT4_HAS_RO_COMPAT_FEATURE(sb,
-					  EXT4_FEATURE_RO_COMPAT_GDT_CSUM) ||
-	       (EXT4_SB(sb)->s_chksum_driver != NULL);
+	return ext4_has_feature_gdt_csum(sb) ||
+	       EXT4_SB(sb)->s_chksum_driver != NULL;
 }
 
 static inline int ext4_has_metadata_csum(struct super_block *sb)
 {
-	WARN_ON_ONCE(EXT4_HAS_RO_COMPAT_FEATURE(sb,
-			EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
+	WARN_ON_ONCE(ext4_has_feature_metadata_csum(sb) &&
 		     !EXT4_SB(sb)->s_chksum_driver);
 
 	return (EXT4_SB(sb)->s_chksum_driver != NULL);
@@ -2898,7 +3016,7 @@ static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
 static inline void ext4_set_de_type(struct super_block *sb,
 				struct ext4_dir_entry_2 *de,
 				umode_t mode) {
-	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE))
+	if (ext4_has_feature_filetype(sb))
 		de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
 }
 
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 9c5b49f..5f58462 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -34,8 +34,7 @@
  */
 
 #define EXT4_SINGLEDATA_TRANS_BLOCKS(sb)				\
-	(EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)   \
-	 ? 20U : 8U)
+	(ext4_has_feature_extents(sb) ? 20U : 8U)
 
 /* Extended attribute operations touch at most two data buffers,
  * two bitmap buffers, and two group summaries, in addition to the inode
@@ -84,17 +83,16 @@
 /* Amount of blocks needed for quota update - we know that the structure was
  * allocated so we need to update only data block */
 #define EXT4_QUOTA_TRANS_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
-		EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ?\
-		1 : 0)
+		ext4_has_feature_quota(sb)) ? 1 : 0)
 /* Amount of blocks needed for quota insert/delete - we do some block writes
  * but inode, sb and group updates are done only once */
 #define EXT4_QUOTA_INIT_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
-		EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ?\
+		ext4_has_feature_quota(sb)) ?\
 		(DQUOT_INIT_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\
 		 +3+DQUOT_INIT_REWRITE) : 0)
 
 #define EXT4_QUOTA_DEL_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
-		EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ?\
+		ext4_has_feature_quota(sb)) ?\
 		(DQUOT_DEL_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\
 		 +3+DQUOT_DEL_REWRITE) : 0)
 #else
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index faeaba8..3a6197a 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3055,7 +3055,7 @@ void ext4_ext_init(struct super_block *sb)
 	 * possible initialization would be here
 	 */
 
-	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+	if (ext4_has_feature_extents(sb)) {
 #if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
 		printk(KERN_INFO "EXT4-fs: file extents enabled"
 #ifdef AGGRESSIVE_TEST
@@ -3082,7 +3082,7 @@ void ext4_ext_init(struct super_block *sb)
  */
 void ext4_ext_release(struct super_block *sb)
 {
-	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
+	if (!ext4_has_feature_extents(sb))
 		return;
 
 #ifdef EXTENTS_STATS
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index f34b1aa..0d7f06e 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1045,7 +1045,7 @@ got:
 
 	ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
 	ei->i_inline_off = 0;
-	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_INLINE_DATA))
+	if (ext4_has_feature_inline_data(sb))
 		ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
 	ret = inode;
 	err = dquot_alloc_inode(inode);
@@ -1060,7 +1060,7 @@ got:
 	if (err)
 		goto fail_free_drop;
 
-	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+	if (ext4_has_feature_extents(sb)) {
 		/* set extent flag only for directory, file and normal symlink*/
 		if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
 			ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index d96ea53..355ef9c 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -562,8 +562,7 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
 	/*
 	 * Okay, we need to do block allocation.
 	*/
-	if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
-				       EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+	if (ext4_has_feature_bigalloc(inode->i_sb)) {
 		EXT4_ERROR_INODE(inode, "Can't allocate blocks for "
 				 "non-extent mapped inodes with bigalloc");
 		return -EFSCORRUPTED;
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index cd944a7..d884989 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -434,8 +434,7 @@ static int ext4_destroy_inline_data_nolock(handle_t *handle,
 	memset((void *)ext4_raw_inode(&is.iloc)->i_block,
 		0, EXT4_MIN_INLINE_DATA_SIZE);
 
-	if (EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
-				      EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+	if (ext4_has_feature_extents(inode->i_sb)) {
 		if (S_ISDIR(inode->i_mode) ||
 		    S_ISREG(inode->i_mode) || S_ISLNK(inode->i_mode)) {
 			ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2b92788..e8d620a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2642,8 +2642,7 @@ static int ext4_nonda_switch(struct super_block *sb)
 /* We always reserve for an inode update; the superblock could be there too */
 static int ext4_da_write_credits(struct inode *inode, loff_t pos, unsigned len)
 {
-	if (likely(EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
-				EXT4_FEATURE_RO_COMPAT_LARGE_FILE)))
+	if (likely(ext4_has_feature_large_file(inode->i_sb)))
 		return 1;
 
 	if (pos + len <= 0x7fffffffULL)
@@ -4049,8 +4048,7 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
 	struct inode *inode = &(ei->vfs_inode);
 	struct super_block *sb = inode->i_sb;
 
-	if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
-				EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
+	if (ext4_has_feature_huge_file(sb)) {
 		/* we are using combined 48 bit field */
 		i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
 					le32_to_cpu(raw_inode->i_blocks_lo);
@@ -4173,7 +4171,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	ei->i_flags = le32_to_cpu(raw_inode->i_flags);
 	inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
 	ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
-	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
+	if (ext4_has_feature_64bit(sb))
 		ei->i_file_acl |=
 			((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
 	inode->i_size = ext4_isize(raw_inode);
@@ -4337,7 +4335,7 @@ static int ext4_inode_blocks_set(handle_t *handle,
 		ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
 		return 0;
 	}
-	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
+	if (!ext4_has_feature_huge_file(sb))
 		return -EFBIG;
 
 	if (i_blocks <= 0xffffffffffffULL) {
@@ -4498,8 +4496,7 @@ static int ext4_do_update_inode(handle_t *handle,
 		need_datasync = 1;
 	}
 	if (ei->i_disksize > 0x7fffffffULL) {
-		if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
-				EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
+		if (!ext4_has_feature_large_file(sb) ||
 				EXT4_SB(sb)->s_es->s_rev_level ==
 		    cpu_to_le32(EXT4_GOOD_OLD_REV))
 			set_large_file = 1;
@@ -4548,8 +4545,7 @@ static int ext4_do_update_inode(handle_t *handle,
 		if (err)
 			goto out_brelse;
 		ext4_update_dynamic_rev(sb);
-		EXT4_SET_RO_COMPAT_FEATURE(sb,
-					   EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
+		ext4_set_feature_large_file(sb);
 		ext4_handle_sync(handle);
 		err = ext4_handle_dirty_super(handle, sb);
 	}
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 1346cfa..5e872fd 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -145,8 +145,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
 		inode_bl->i_version = 1;
 		i_size_write(inode_bl, 0);
 		inode_bl->i_mode = S_IFREG;
-		if (EXT4_HAS_INCOMPAT_FEATURE(sb,
-					      EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+		if (ext4_has_feature_extents(sb)) {
 			ext4_set_inode_flag(inode_bl, EXT4_INODE_EXTENTS);
 			ext4_ext_tree_init(handle, inode_bl);
 		} else
@@ -383,8 +382,7 @@ setversion_out:
 			goto group_extend_out;
 		}
 
-		if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
-			       EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+		if (ext4_has_feature_bigalloc(sb)) {
 			ext4_msg(sb, KERN_ERR,
 				 "Online resizing not supported with bigalloc");
 			err = -EOPNOTSUPP;
@@ -432,8 +430,7 @@ group_extend_out:
 			goto mext_out;
 		}
 
-		if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
-			       EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+		if (ext4_has_feature_bigalloc(sb)) {
 			ext4_msg(sb, KERN_ERR,
 				 "Online defrag not supported with bigalloc");
 			err = -EOPNOTSUPP;
@@ -470,8 +467,7 @@ mext_out:
 			goto group_add_out;
 		}
 
-		if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
-			       EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+		if (ext4_has_feature_bigalloc(sb)) {
 			ext4_msg(sb, KERN_ERR,
 				 "Online resizing not supported with bigalloc");
 			err = -EOPNOTSUPP;
@@ -553,8 +549,7 @@ group_add_out:
 		int err = 0, err2 = 0;
 		ext4_group_t o_group = EXT4_SB(sb)->s_groups_count;
 
-		if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
-			       EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+		if (ext4_has_feature_bigalloc(sb)) {
 			ext4_msg(sb, KERN_ERR,
 				 "Online resizing not (yet) supported with bigalloc");
 			return -EOPNOTSUPP;
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 6163ad2..a465189 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -448,8 +448,7 @@ int ext4_ext_migrate(struct inode *inode)
 	 * If the filesystem does not support extents, or the inode
 	 * already is extent-based, error out.
 	 */
-	if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
-				       EXT4_FEATURE_INCOMPAT_EXTENTS) ||
+	if (!ext4_has_feature_extents(inode->i_sb) ||
 	    (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
 		return -EINVAL;
 
@@ -625,13 +624,11 @@ int ext4_ind_migrate(struct inode *inode)
 	handle_t			*handle;
 	int				ret;
 
-	if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
-				       EXT4_FEATURE_INCOMPAT_EXTENTS) ||
+	if (!ext4_has_feature_extents(inode->i_sb) ||
 	    (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
 		return -EINVAL;
 
-	if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
-				       EXT4_FEATURE_RO_COMPAT_BIGALLOC))
+	if (ext4_has_feature_bigalloc(inode->i_sb))
 		return -EOPNOTSUPP;
 
 	/*
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 8fd8e0d..52a79d4 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2118,7 +2118,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
 			goto out;
 
 		if (blocks == 1 && !dx_fallback &&
-		    EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) {
+		    ext4_has_feature_dir_index(sb)) {
 			retval = make_indexed_dir(handle, &fname, dentry,
 						  inode, bh);
 			bh = NULL; /* make_indexed_dir releases bh */
@@ -2388,8 +2388,7 @@ static void ext4_inc_count(handle_t *handle, struct inode *inode)
 		/* limit is 16-bit i_links_count */
 		if (inode->i_nlink >= EXT4_LINK_MAX || inode->i_nlink == 2) {
 			set_nlink(inode, 1);
-			EXT4_SET_RO_COMPAT_FEATURE(inode->i_sb,
-					      EXT4_FEATURE_RO_COMPAT_DIR_NLINK);
+			ext4_set_feature_dir_nlink(inode->i_sb);
 		}
 	}
 }
@@ -3352,8 +3351,7 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent,
 	if (retval)
 		return retval;
 	ent->de->inode = cpu_to_le32(ino);
-	if (EXT4_HAS_INCOMPAT_FEATURE(ent->dir->i_sb,
-				      EXT4_FEATURE_INCOMPAT_FILETYPE))
+	if (ext4_has_feature_filetype(ent->dir->i_sb))
 		ent->de->file_type = file_type;
 	ent->dir->i_version++;
 	ent->dir->i_ctime = ent->dir->i_mtime =
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index cf0c472..469eecc 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -490,7 +490,7 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
 	       group_data[0].group != sbi->s_groups_count);
 
 	reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks);
-	meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
+	meta_bg = ext4_has_feature_meta_bg(sb);
 
 	/* This transaction may be extended/restarted along the way */
 	handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, EXT4_MAX_TRANS_DATA);
@@ -680,8 +680,7 @@ static unsigned ext4_list_backups(struct super_block *sb, unsigned *three,
 	int mult = 3;
 	unsigned ret;
 
-	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
-					EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
+	if (!ext4_has_feature_sparse_super(sb)) {
 		ret = *min;
 		*min += 1;
 		return ret;
@@ -1158,7 +1157,7 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,
 	int i, gdb_off, gdb_num, err = 0;
 	int meta_bg;
 
-	meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
+	meta_bg = ext4_has_feature_meta_bg(sb);
 	for (i = 0; i < count; i++, group++) {
 		int reserved_gdb = ext4_bg_has_super(sb, group) ?
 			le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
@@ -1381,9 +1380,7 @@ static void ext4_update_super(struct super_block *sb,
 
 	ext4_debug("free blocks count %llu",
 		   percpu_counter_read(&sbi->s_freeclusters_counter));
-	if (EXT4_HAS_INCOMPAT_FEATURE(sb,
-				      EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
-	    sbi->s_log_groups_per_flex) {
+	if (ext4_has_feature_flex_bg(sb) && sbi->s_log_groups_per_flex) {
 		ext4_group_t flex_group;
 		flex_group = ext4_flex_group(sbi, group_data[0].group);
 		atomic64_add(EXT4_NUM_B2C(sbi, free_blocks),
@@ -1476,8 +1473,7 @@ exit_journal:
 		int gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
 		int gdb_num_end = ((group + flex_gd->count - 1) /
 				   EXT4_DESC_PER_BLOCK(sb));
-		int meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb,
-				EXT4_FEATURE_INCOMPAT_META_BG);
+		int meta_bg = ext4_has_feature_meta_bg(sb);
 		sector_t old_gdb = 0;
 
 		update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
@@ -1585,8 +1581,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 
 	gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb);
 
-	if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb,
-					EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
+	if (gdb_off == 0 && !ext4_has_feature_sparse_super(sb)) {
 		ext4_warning(sb, "Can't resize non-sparse filesystem further");
 		return -EPERM;
 	}
@@ -1604,9 +1599,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	}
 
 	if (reserved_gdb || gdb_off == 0) {
-		if (!EXT4_HAS_COMPAT_FEATURE(sb,
-					     EXT4_FEATURE_COMPAT_RESIZE_INODE)
-		    || !le16_to_cpu(es->s_reserved_gdt_blocks)) {
+		if (ext4_has_feature_resize_inode(sb) ||
+		    !le16_to_cpu(es->s_reserved_gdt_blocks)) {
 			ext4_warning(sb,
 				     "No reserved GDT blocks, can't resize");
 			return -EPERM;
@@ -1825,8 +1819,8 @@ static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode)
 	if (err)
 		goto errout;
 
-	EXT4_CLEAR_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE);
-	EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
+	ext4_clear_feature_resize_inode(sb);
+	ext4_set_feature_meta_bg(sb);
 	sbi->s_es->s_first_meta_bg =
 		cpu_to_le32(num_desc_blocks(sb, sbi->s_groups_count));
 
@@ -1918,9 +1912,9 @@ retry:
 	n_desc_blocks = num_desc_blocks(sb, n_group + 1);
 	o_desc_blocks = num_desc_blocks(sb, sbi->s_groups_count);
 
-	meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
+	meta_bg = ext4_has_feature_meta_bg(sb);
 
-	if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE)) {
+	if (ext4_has_feature_resize_inode(sb)) {
 		if (meta_bg) {
 			ext4_error(sb, "resize_inode and meta_bg enabled "
 				   "simultaneously");
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index f7519ad..49fa4d4 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -110,8 +110,7 @@ MODULE_ALIAS("ext3");
 static int ext4_verify_csum_type(struct super_block *sb,
 				 struct ext4_super_block *es)
 {
-	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
-					EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+	if (!ext4_has_feature_metadata_csum(sb))
 		return 1;
 
 	return es->s_checksum_type == EXT4_CRC32C_CHKSUM;
@@ -810,7 +809,7 @@ static void ext4_put_super(struct super_block *sb)
 	ext4_xattr_put_super(sb);
 
 	if (!(sb->s_flags & MS_RDONLY)) {
-		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+		ext4_clear_feature_journal_needs_recovery(sb);
 		es->s_state = cpu_to_le16(sbi->s_mount_state);
 	}
 	if (!(sb->s_flags & MS_RDONLY))
@@ -1284,7 +1283,7 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
 			"quota options when quota turned on");
 		return -1;
 	}
-	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) {
+	if (ext4_has_feature_quota(sb)) {
 		ext4_msg(sb, KERN_ERR, "Cannot set journaled quota options "
 			 "when QUOTA feature is enabled");
 		return -1;
@@ -1643,8 +1642,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
 				 "quota options when quota turned on");
 			return -1;
 		}
-		if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
-					       EXT4_FEATURE_RO_COMPAT_QUOTA)) {
+		if (ext4_has_feature_quota(sb)) {
 			ext4_msg(sb, KERN_ERR,
 				 "Cannot set journaled quota options "
 				 "when QUOTA feature is enabled");
@@ -1703,7 +1701,7 @@ static int parse_options(char *options, struct super_block *sb,
 			return 0;
 	}
 #ifdef CONFIG_QUOTA
-	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
+	if (ext4_has_feature_quota(sb) &&
 	    (test_opt(sb, USRQUOTA) || test_opt(sb, GRPQUOTA))) {
 		ext4_msg(sb, KERN_ERR, "Cannot set quota options when QUOTA "
 			 "feature is enabled");
@@ -1927,7 +1925,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
 	es->s_mtime = cpu_to_le32(get_seconds());
 	ext4_update_dynamic_rev(sb);
 	if (sbi->s_journal)
-		EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+		ext4_set_feature_journal_needs_recovery(sb);
 
 	ext4_commit_super(sb, 1);
 done:
@@ -2010,12 +2008,13 @@ failed:
 	return 0;
 }
 
-static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
+static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group,
 				   struct ext4_group_desc *gdp)
 {
 	int offset;
 	__u16 crc = 0;
 	__le32 le_group = cpu_to_le32(block_group);
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
 	if (ext4_has_metadata_csum(sbi->s_sb)) {
 		/* Use new metadata_csum algorithm */
@@ -2035,8 +2034,7 @@ static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
 	}
 
 	/* old crc16 code */
-	if (!(sbi->s_es->s_feature_ro_compat &
-	      cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)))
+	if (!ext4_has_feature_gdt_csum(sb))
 		return 0;
 
 	offset = offsetof(struct ext4_group_desc, bg_checksum);
@@ -2046,8 +2044,7 @@ static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
 	crc = crc16(crc, (__u8 *)gdp, offset);
 	offset += sizeof(gdp->bg_checksum); /* skip checksum */
 	/* for checksum of struct ext4_group_desc do the rest...*/
-	if ((sbi->s_es->s_feature_incompat &
-	     cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT)) &&
+	if (ext4_has_feature_64bit(sb) &&
 	    offset < le16_to_cpu(sbi->s_es->s_desc_size))
 		crc = crc16(crc, (__u8 *)gdp + offset,
 			    le16_to_cpu(sbi->s_es->s_desc_size) -
@@ -2061,8 +2058,7 @@ int ext4_group_desc_csum_verify(struct super_block *sb, __u32 block_group,
 				struct ext4_group_desc *gdp)
 {
 	if (ext4_has_group_desc_csum(sb) &&
-	    (gdp->bg_checksum != ext4_group_desc_csum(EXT4_SB(sb),
-						      block_group, gdp)))
+	    (gdp->bg_checksum != ext4_group_desc_csum(sb, block_group, gdp)))
 		return 0;
 
 	return 1;
@@ -2073,7 +2069,7 @@ void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group,
 {
 	if (!ext4_has_group_desc_csum(sb))
 		return;
-	gdp->bg_checksum = ext4_group_desc_csum(EXT4_SB(sb), block_group, gdp);
+	gdp->bg_checksum = ext4_group_desc_csum(sb, block_group, gdp);
 }
 
 /* Called at mount-time, super-block is locked */
@@ -2089,7 +2085,7 @@ static int ext4_check_descriptors(struct super_block *sb,
 	int flexbg_flag = 0;
 	ext4_group_t i, grp = sbi->s_groups_count;
 
-	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
+	if (ext4_has_feature_flex_bg(sb))
 		flexbg_flag = 1;
 
 	ext4_debug("Checking group descriptors");
@@ -2133,7 +2129,7 @@ static int ext4_check_descriptors(struct super_block *sb,
 		if (!ext4_group_desc_csum_verify(sb, i, gdp)) {
 			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
 				 "Checksum for group %u failed (%u!=%u)",
-				 i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
+				 i, le16_to_cpu(ext4_group_desc_csum(sb, i,
 				     gdp)), le16_to_cpu(gdp->bg_checksum));
 			if (!(sb->s_flags & MS_RDONLY)) {
 				ext4_unlock_group(sb, i);
@@ -2396,8 +2392,7 @@ static ext4_fsblk_t descriptor_loc(struct super_block *sb,
 
 	first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
 
-	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
-	    nr < first_meta_bg)
+	if (!ext4_has_feature_meta_bg(sb) || nr < first_meta_bg)
 		return logical_sb_block + nr + 1;
 	bg = sbi->s_desc_per_block * nr;
 	if (ext4_bg_has_super(sb, bg))
@@ -2461,7 +2456,7 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
  */
 static int ext4_feature_set_ok(struct super_block *sb, int readonly)
 {
-	if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) {
+	if (ext4_has_unknown_ext4_incompat_features(sb)) {
 		ext4_msg(sb, KERN_ERR,
 			"Couldn't mount because of "
 			"unsupported optional features (%x)",
@@ -2473,14 +2468,14 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
 	if (readonly)
 		return 1;
 
-	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_READONLY)) {
+	if (ext4_has_feature_readonly(sb)) {
 		ext4_msg(sb, KERN_INFO, "filesystem is read-only");
 		sb->s_flags |= MS_RDONLY;
 		return 1;
 	}
 
 	/* Check that feature set is OK for a read-write mount */
-	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) {
+	if (ext4_has_unknown_ext4_ro_compat_features(sb)) {
 		ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of "
 			 "unsupported optional features (%x)",
 			 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
@@ -2491,7 +2486,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
 	 * Large file size enabled file system can only be mounted
 	 * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF
 	 */
-	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
+	if (ext4_has_feature_huge_file(sb)) {
 		if (sizeof(blkcnt_t) < sizeof(u64)) {
 			ext4_msg(sb, KERN_ERR, "Filesystem with huge files "
 				 "cannot be mounted RDWR without "
@@ -2499,8 +2494,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
 			return 0;
 		}
 	}
-	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC) &&
-	    !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+	if (ext4_has_feature_bigalloc(sb) && !ext4_has_feature_extents(sb)) {
 		ext4_msg(sb, KERN_ERR,
 			 "Can't support bigalloc feature without "
 			 "extents feature\n");
@@ -2508,8 +2502,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
 	}
 
 #ifndef CONFIG_QUOTA
-	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
-	    !readonly) {
+	if (ext4_has_feature_quota(sb) && !readonly) {
 		ext4_msg(sb, KERN_ERR,
 			 "Filesystem with quota feature cannot be mounted RDWR "
 			 "without CONFIG_QUOTA");
@@ -2966,7 +2959,7 @@ static int count_overhead(struct super_block *sb, ext4_group_t grp,
 	ext4_group_t		i, ngroups = ext4_get_groups_count(sb);
 	int			s, j, count = 0;
 
-	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC))
+	if (!ext4_has_feature_bigalloc(sb))
 		return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) +
 			sbi->s_itb_per_group + 2);
 
@@ -3068,7 +3061,7 @@ static void ext4_set_resv_clusters(struct super_block *sb)
 	 * hole punching doesn't need new metadata... This is needed especially
 	 * to keep ext2/3 backward compatibility.
 	 */
-	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
+	if (!ext4_has_feature_extents(sb))
 		return;
 	/*
 	 * By default we reserve 2% or 4096 clusters, whichever is smaller.
@@ -3167,9 +3160,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
 
 	/* Warn if metadata_csum and gdt_csum are both set. */
-	if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
-				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
-	    EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
+	if (ext4_has_feature_metadata_csum(sb) &&
+	    ext4_has_feature_gdt_csum(sb))
 		ext4_warning(sb, "metadata_csum and uninit_bg are "
 			     "redundant flags; please run fsck.");
 
@@ -3182,8 +3174,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	}
 
 	/* Load the checksum driver */
-	if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
-				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
+	if (ext4_has_feature_metadata_csum(sb)) {
 		sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
 		if (IS_ERR(sbi->s_chksum_driver)) {
 			ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
@@ -3203,7 +3194,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	}
 
 	/* Precompute checksum seed for all metadata */
-	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_CSUM_SEED))
+	if (ext4_has_feature_csum_seed(sb))
 		sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed);
 	else if (ext4_has_metadata_csum(sb))
 		sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
@@ -3308,17 +3299,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		(test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
 
 	if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
-	    (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) ||
-	     EXT4_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
-	     EXT4_HAS_INCOMPAT_FEATURE(sb, ~0U)))
+	    (ext4_has_compat_features(sb) ||
+	     ext4_has_ro_compat_features(sb) ||
+	     ext4_has_incompat_features(sb)))
 		ext4_msg(sb, KERN_WARNING,
 		       "feature flags set on rev 0 fs, "
 		       "running e2fsck is recommended");
 
 	if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) {
 		set_opt2(sb, HURD_COMPAT);
-		if (EXT4_HAS_INCOMPAT_FEATURE(sb,
-					      EXT4_FEATURE_INCOMPAT_64BIT)) {
+		if (ext4_has_feature_64bit(sb)) {
 			ext4_msg(sb, KERN_ERR,
 				 "The Hurd can't support 64-bit file systems");
 			goto failed_mount;
@@ -3376,8 +3366,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		}
 	}
 
-	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT) &&
-	    es->s_encryption_level) {
+	if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) {
 		ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d",
 			 es->s_encryption_level);
 		goto failed_mount;
@@ -3409,8 +3398,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		}
 	}
 
-	has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
-				EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
+	has_huge_files = ext4_has_feature_huge_file(sb);
 	sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
 						      has_huge_files);
 	sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
@@ -3434,7 +3422,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	}
 
 	sbi->s_desc_size = le16_to_cpu(es->s_desc_size);
-	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) {
+	if (ext4_has_feature_64bit(sb)) {
 		if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT ||
 		    sbi->s_desc_size > EXT4_MAX_DESC_SIZE ||
 		    !is_power_of_2(sbi->s_desc_size)) {
@@ -3465,7 +3453,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	for (i = 0; i < 4; i++)
 		sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
 	sbi->s_def_hash_version = es->s_def_hash_version;
-	if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) {
+	if (ext4_has_feature_dir_index(sb)) {
 		i = le32_to_cpu(es->s_flags);
 		if (i & EXT2_FLAGS_UNSIGNED_HASH)
 			sbi->s_hash_unsigned = 3;
@@ -3485,8 +3473,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 
 	/* Handle clustersize */
 	clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size);
-	has_bigalloc = EXT4_HAS_RO_COMPAT_FEATURE(sb,
-				EXT4_FEATURE_RO_COMPAT_BIGALLOC);
+	has_bigalloc = ext4_has_feature_bigalloc(sb);
 	if (has_bigalloc) {
 		if (clustersize < blocksize) {
 			ext4_msg(sb, KERN_ERR,
@@ -3645,7 +3632,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_xattr = ext4_xattr_handlers;
 #ifdef CONFIG_QUOTA
 	sb->dq_op = &ext4_quota_operations;
-	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
+	if (ext4_has_feature_quota(sb))
 		sb->s_qcop = &dquot_quotactl_sysfile_ops;
 	else
 		sb->s_qcop = &ext4_qctl_operations;
@@ -3659,11 +3646,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_root = NULL;
 
 	needs_recovery = (es->s_last_orphan != 0 ||
-			  EXT4_HAS_INCOMPAT_FEATURE(sb,
-				    EXT4_FEATURE_INCOMPAT_RECOVER));
+			  ext4_has_feature_journal_needs_recovery(sb));
 
-	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) &&
-	    !(sb->s_flags & MS_RDONLY))
+	if (ext4_has_feature_mmp(sb) && !(sb->s_flags & MS_RDONLY))
 		if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
 			goto failed_mount3a;
 
@@ -3671,12 +3656,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	 * The first inode we look at is the journal inode.  Don't try
 	 * root first: it may be modified in the journal!
 	 */
-	if (!test_opt(sb, NOLOAD) &&
-	    EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
+	if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) {
 		if (ext4_load_journal(sb, es, journal_devnum))
 			goto failed_mount3a;
 	} else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
-	      EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
+		   ext4_has_feature_journal_needs_recovery(sb)) {
 		ext4_msg(sb, KERN_ERR, "required journal recovery "
 		       "suppressed and not mounted read-only");
 		goto failed_mount_wq;
@@ -3687,7 +3671,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		goto no_journal;
 	}
 
-	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT) &&
+	if (ext4_has_feature_64bit(sb) &&
 	    !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
 				       JBD2_FEATURE_INCOMPAT_64BIT)) {
 		ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
@@ -3739,18 +3723,16 @@ no_journal:
 		}
 	}
 
-	if ((DUMMY_ENCRYPTION_ENABLED(sbi) ||
-	     EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) &&
+	if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) &&
 	    (blocksize != PAGE_CACHE_SIZE)) {
 		ext4_msg(sb, KERN_ERR,
 			 "Unsupported blocksize for fs encryption");
 		goto failed_mount_wq;
 	}
 
-	if (DUMMY_ENCRYPTION_ENABLED(sbi) &&
-	    !(sb->s_flags & MS_RDONLY) &&
-	    !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) {
-		EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT);
+	if (DUMMY_ENCRYPTION_ENABLED(sbi) && !(sb->s_flags & MS_RDONLY) &&
+	    !ext4_has_feature_encrypt(sb)) {
+		ext4_set_feature_encrypt(sb);
 		ext4_commit_super(sb, 1);
 	}
 
@@ -3809,8 +3791,7 @@ no_journal:
 	if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
 		sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
 						     EXT4_GOOD_OLD_INODE_SIZE;
-		if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
-				       EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE)) {
+		if (ext4_has_feature_extra_isize(sb)) {
 			if (sbi->s_want_extra_isize <
 			    le16_to_cpu(es->s_want_extra_isize))
 				sbi->s_want_extra_isize =
@@ -3869,7 +3850,7 @@ no_journal:
 		goto failed_mount6;
 	}
 
-	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
+	if (ext4_has_feature_flex_bg(sb))
 		if (!ext4_fill_flex_info(sb)) {
 			ext4_msg(sb, KERN_ERR,
 			       "unable to initialize "
@@ -3887,8 +3868,7 @@ no_journal:
 
 #ifdef CONFIG_QUOTA
 	/* Enable quota usage during mount. */
-	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
-	    !(sb->s_flags & MS_RDONLY)) {
+	if (ext4_has_feature_quota(sb) && !(sb->s_flags & MS_RDONLY)) {
 		err = ext4_enable_quotas(sb);
 		if (err)
 			goto failed_mount8;
@@ -4029,7 +4009,7 @@ static journal_t *ext4_get_journal(struct super_block *sb,
 	struct inode *journal_inode;
 	journal_t *journal;
 
-	BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
+	BUG_ON(!ext4_has_feature_journal(sb));
 
 	/* First, test for the existence of a valid inode on disk.  Bad
 	 * things happen if we iget() an unused inode, as the subsequent
@@ -4079,7 +4059,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
 	struct ext4_super_block *es;
 	struct block_device *bdev;
 
-	BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
+	BUG_ON(!ext4_has_feature_journal(sb));
 
 	bdev = ext4_blkdev_get(j_dev, sb);
 	if (bdev == NULL)
@@ -4171,7 +4151,7 @@ static int ext4_load_journal(struct super_block *sb,
 	int err = 0;
 	int really_read_only;
 
-	BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
+	BUG_ON(!ext4_has_feature_journal(sb));
 
 	if (journal_devnum &&
 	    journal_devnum != le32_to_cpu(es->s_journal_dev)) {
@@ -4188,7 +4168,7 @@ static int ext4_load_journal(struct super_block *sb,
 	 * crash?  For recovery, we need to check in advance whether we
 	 * can get read-write access to the device.
 	 */
-	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
+	if (ext4_has_feature_journal_needs_recovery(sb)) {
 		if (sb->s_flags & MS_RDONLY) {
 			ext4_msg(sb, KERN_INFO, "INFO: recovery "
 					"required on readonly filesystem");
@@ -4219,7 +4199,7 @@ static int ext4_load_journal(struct super_block *sb,
 	if (!(journal->j_flags & JBD2_BARRIER))
 		ext4_msg(sb, KERN_INFO, "barriers disabled");
 
-	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER))
+	if (!ext4_has_feature_journal_needs_recovery(sb))
 		err = jbd2_journal_wipe(journal, !really_read_only);
 	if (!err) {
 		char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL);
@@ -4333,7 +4313,7 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
 {
 	journal_t *journal = EXT4_SB(sb)->s_journal;
 
-	if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
+	if (!ext4_has_feature_journal(sb)) {
 		BUG_ON(journal != NULL);
 		return;
 	}
@@ -4341,9 +4321,9 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
 	if (jbd2_journal_flush(journal) < 0)
 		goto out;
 
-	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) &&
+	if (ext4_has_feature_journal_needs_recovery(sb) &&
 	    sb->s_flags & MS_RDONLY) {
-		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+		ext4_clear_feature_journal_needs_recovery(sb);
 		ext4_commit_super(sb, 1);
 	}
 
@@ -4363,7 +4343,7 @@ static void ext4_clear_journal_err(struct super_block *sb,
 	int j_errno;
 	const char *errstr;
 
-	BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
+	BUG_ON(!ext4_has_feature_journal(sb));
 
 	journal = EXT4_SB(sb)->s_journal;
 
@@ -4478,7 +4458,7 @@ static int ext4_freeze(struct super_block *sb)
 			goto out;
 
 		/* Journal blocked and flushed, clear needs_recovery flag. */
-		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+		ext4_clear_feature_journal_needs_recovery(sb);
 	}
 
 	error = ext4_commit_super(sb, 1);
@@ -4500,7 +4480,7 @@ static int ext4_unfreeze(struct super_block *sb)
 
 	if (EXT4_SB(sb)->s_journal) {
 		/* Reset the needs_recovery flag before the fs is unlocked. */
-		EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+		ext4_set_feature_journal_needs_recovery(sb);
 	}
 
 	ext4_commit_super(sb, 1);
@@ -4653,8 +4633,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 				ext4_mark_recovery_complete(sb, es);
 		} else {
 			/* Make sure we can mount this feature set readwrite */
-			if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
-					EXT4_FEATURE_RO_COMPAT_READONLY) ||
+			if (ext4_has_feature_readonly(sb) ||
 			    !ext4_feature_set_ok(sb, 0)) {
 				err = -EROFS;
 				goto restore_opts;
@@ -4670,7 +4649,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 				if (!ext4_group_desc_csum_verify(sb, g, gdp)) {
 					ext4_msg(sb, KERN_ERR,
 	       "ext4_remount: Checksum for group %u failed (%u!=%u)",
-		g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)),
+		g, le16_to_cpu(ext4_group_desc_csum(sb, g, gdp)),
 					       le16_to_cpu(gdp->bg_checksum));
 					err = -EFSBADCRC;
 					goto restore_opts;
@@ -4702,8 +4681,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 			sbi->s_mount_state = le16_to_cpu(es->s_state);
 			if (!ext4_setup_super(sb, es, 0))
 				sb->s_flags &= ~MS_RDONLY;
-			if (EXT4_HAS_INCOMPAT_FEATURE(sb,
-						     EXT4_FEATURE_INCOMPAT_MMP))
+			if (ext4_has_feature_mmp(sb))
 				if (ext4_multi_mount_protect(sb,
 						le64_to_cpu(es->s_mmp_block))) {
 					err = -EROFS;
@@ -4736,8 +4714,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	if (enable_quota) {
 		if (sb_any_quota_suspended(sb))
 			dquot_resume(sb, -1);
-		else if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
-					EXT4_FEATURE_RO_COMPAT_QUOTA)) {
+		else if (ext4_has_feature_quota(sb)) {
 			err = ext4_enable_quotas(sb);
 			if (err)
 				goto restore_opts;
@@ -4881,7 +4858,7 @@ static int ext4_mark_dquot_dirty(struct dquot *dquot)
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
 	/* Are we journaling quotas? */
-	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) ||
+	if (ext4_has_feature_quota(sb) ||
 	    sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
 		dquot_mark_dquot_dirty(dquot);
 		return ext4_write_dquot(dquot);
@@ -4969,7 +4946,7 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
 		le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum)
 	};
 
-	BUG_ON(!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA));
+	BUG_ON(!ext4_has_feature_quota(sb));
 
 	if (!qf_inums[type])
 		return -EPERM;
@@ -5163,11 +5140,11 @@ static inline void unregister_as_ext2(void)
 
 static inline int ext2_feature_set_ok(struct super_block *sb)
 {
-	if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP))
+	if (ext4_has_unknown_ext2_incompat_features(sb))
 		return 0;
 	if (sb->s_flags & MS_RDONLY)
 		return 1;
-	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP))
+	if (ext4_has_unknown_ext2_ro_compat_features(sb))
 		return 0;
 	return 1;
 }
@@ -5192,13 +5169,13 @@ static inline void unregister_as_ext3(void)
 
 static inline int ext3_feature_set_ok(struct super_block *sb)
 {
-	if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP))
+	if (ext4_has_unknown_ext3_incompat_features(sb))
 		return 0;
-	if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
+	if (!ext4_has_feature_journal(sb))
 		return 0;
 	if (sb->s_flags & MS_RDONLY)
 		return 1;
-	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP))
+	if (ext4_has_unknown_ext3_ro_compat_features(sb))
 		return 0;
 	return 1;
 }
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 7649422..984448c 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -525,12 +525,12 @@ errout:
 static void ext4_xattr_update_super_block(handle_t *handle,
 					  struct super_block *sb)
 {
-	if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR))
+	if (ext4_has_feature_xattr(sb))
 		return;
 
 	BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
 	if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) {
-		EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR);
+		ext4_set_feature_xattr(sb);
 		ext4_handle_dirty_super(handle, sb);
 	}
 }
-- 
cgit v1.1


From 56316a0d28f251dae6a3bc2b6d50e7c25389871f Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Sat, 17 Oct 2015 16:18:45 -0400
Subject: jbd2: clean up feature test macros with predicate functions

Create separate predicate functions to test/set/clear feature flags,
thereby replacing the wordy old macros.  Furthermore, clean out the
places where we open-coded feature tests.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/jbd2/commit.c   | 22 ++++++++--------------
 fs/jbd2/journal.c  | 12 ++++++------
 fs/jbd2/recovery.c | 18 +++++++-----------
 fs/jbd2/revoke.c   |  4 ++--
 4 files changed, 23 insertions(+), 33 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 362e5f6..36345fe 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -142,8 +142,7 @@ static int journal_submit_commit_record(journal_t *journal,
 	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
 	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
 
-	if (JBD2_HAS_COMPAT_FEATURE(journal,
-				    JBD2_FEATURE_COMPAT_CHECKSUM)) {
+	if (jbd2_has_feature_checksum(journal)) {
 		tmp->h_chksum_type 	= JBD2_CRC32_CHKSUM;
 		tmp->h_chksum_size 	= JBD2_CRC32_CHKSUM_SIZE;
 		tmp->h_chksum[0] 	= cpu_to_be32(crc32_sum);
@@ -157,8 +156,7 @@ static int journal_submit_commit_record(journal_t *journal,
 	bh->b_end_io = journal_end_buffer_io_sync;
 
 	if (journal->j_flags & JBD2_BARRIER &&
-	    !JBD2_HAS_INCOMPAT_FEATURE(journal,
-				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
+	    !jbd2_has_feature_async_commit(journal))
 		ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh);
 	else
 		ret = submit_bh(WRITE_SYNC, bh);
@@ -317,7 +315,7 @@ static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
 				   unsigned long long block)
 {
 	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
-	if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_64BIT))
+	if (jbd2_has_feature_64bit(j))
 		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
 }
 
@@ -356,7 +354,7 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
 			     bh->b_size);
 	kunmap_atomic(addr);
 
-	if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3))
+	if (jbd2_has_feature_csum3(j))
 		tag3->t_checksum = cpu_to_be32(csum32);
 	else
 		tag->t_checksum = cpu_to_be16(csum32);
@@ -730,8 +728,7 @@ start_journal_io:
 				/*
 				 * Compute checksum.
 				 */
-				if (JBD2_HAS_COMPAT_FEATURE(journal,
-					JBD2_FEATURE_COMPAT_CHECKSUM)) {
+				if (jbd2_has_feature_checksum(journal)) {
 					crc32_sum =
 					    jbd2_checksum_data(crc32_sum, bh);
 				}
@@ -797,8 +794,7 @@ start_journal_io:
 		blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);
 
 	/* Done it all: now write the commit record asynchronously. */
-	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
-				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+	if (jbd2_has_feature_async_commit(journal)) {
 		err = journal_submit_commit_record(journal, commit_transaction,
 						 &cbh, crc32_sum);
 		if (err)
@@ -889,8 +885,7 @@ start_journal_io:
 	commit_transaction->t_state = T_COMMIT_JFLUSH;
 	write_unlock(&journal->j_state_lock);
 
-	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
-				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+	if (!jbd2_has_feature_async_commit(journal)) {
 		err = journal_submit_commit_record(journal, commit_transaction,
 						&cbh, crc32_sum);
 		if (err)
@@ -898,8 +893,7 @@ start_journal_io:
 	}
 	if (cbh)
 		err = journal_wait_on_commit_record(journal, cbh);
-	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
-				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
+	if (jbd2_has_feature_async_commit(journal) &&
 	    journal->j_flags & JBD2_BARRIER) {
 		blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL);
 	}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 474c178..eb8a754 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1523,8 +1523,8 @@ static int journal_get_superblock(journal_t *journal)
 		goto out;
 	}
 
-	if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2) &&
-	    JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
+	if (jbd2_has_feature_csum2(journal) &&
+	    jbd2_has_feature_csum3(journal)) {
 		/* Can't have checksum v2 and v3 at the same time! */
 		printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 "
 		       "at the same time!\n");
@@ -1532,7 +1532,7 @@ static int journal_get_superblock(journal_t *journal)
 	}
 
 	if (jbd2_journal_has_csum_v2or3_feature(journal) &&
-	    JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM)) {
+	    jbd2_has_feature_checksum(journal)) {
 		/* Can't have checksum v1 and v2 on at the same time! */
 		printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 "
 		       "at the same time!\n");
@@ -2198,15 +2198,15 @@ size_t journal_tag_bytes(journal_t *journal)
 {
 	size_t sz;
 
-	if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3))
+	if (jbd2_has_feature_csum3(journal))
 		return sizeof(journal_block_tag3_t);
 
 	sz = sizeof(journal_block_tag_t);
 
-	if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+	if (jbd2_has_feature_csum2(journal))
 		sz += sizeof(__u16);
 
-	if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
+	if (jbd2_has_feature_64bit(journal))
 		return sz;
 	else
 		return sz - sizeof(__u32);
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 5c836d7..7f277e4 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -342,7 +342,7 @@ static inline unsigned long long read_tag_block(journal_t *journal,
 						journal_block_tag_t *tag)
 {
 	unsigned long long block = be32_to_cpu(tag->t_blocknr);
-	if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
+	if (jbd2_has_feature_64bit(journal))
 		block |= (u64)be32_to_cpu(tag->t_blocknr_high) << 32;
 	return block;
 }
@@ -411,7 +411,7 @@ static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
 	csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
 	csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize);
 
-	if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3))
+	if (jbd2_has_feature_csum3(j))
 		return tag3->t_checksum == cpu_to_be32(csum32);
 	else
 		return tag->t_checksum == cpu_to_be16(csum32);
@@ -538,8 +538,7 @@ static int do_one_pass(journal_t *journal,
 			 * just skip over the blocks it describes. */
 			if (pass != PASS_REPLAY) {
 				if (pass == PASS_SCAN &&
-				    JBD2_HAS_COMPAT_FEATURE(journal,
-					    JBD2_FEATURE_COMPAT_CHECKSUM) &&
+				    jbd2_has_feature_checksum(journal) &&
 				    !info->end_transaction) {
 					if (calc_chksums(journal, bh,
 							&next_log_block,
@@ -694,8 +693,7 @@ static int do_one_pass(journal_t *journal,
 			 * much to do other than move on to the next sequence
 			 * number. */
 			if (pass == PASS_SCAN &&
-			    JBD2_HAS_COMPAT_FEATURE(journal,
-				    JBD2_FEATURE_COMPAT_CHECKSUM)) {
+			    jbd2_has_feature_checksum(journal)) {
 				int chksum_err, chksum_seen;
 				struct commit_header *cbh =
 					(struct commit_header *)bh->b_data;
@@ -735,8 +733,7 @@ static int do_one_pass(journal_t *journal,
 				if (chksum_err) {
 					info->end_transaction = next_commit_ID;
 
-					if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
-					   JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)){
+					if (!jbd2_has_feature_async_commit(journal)) {
 						journal->j_failed_commit =
 							next_commit_ID;
 						brelse(bh);
@@ -750,8 +747,7 @@ static int do_one_pass(journal_t *journal,
 							   bh->b_data)) {
 				info->end_transaction = next_commit_ID;
 
-				if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
-				     JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+				if (!jbd2_has_feature_async_commit(journal)) {
 					journal->j_failed_commit =
 						next_commit_ID;
 					brelse(bh);
@@ -859,7 +855,7 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
 		return -EINVAL;
 	max = rcount;
 
-	if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
+	if (jbd2_has_feature_64bit(journal))
 		record_len = 8;
 
 	while (offset + record_len <= max) {
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 0abf2e7f7..705ae57 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -589,7 +589,7 @@ static void write_one_revoke_record(journal_t *journal,
 	if (jbd2_journal_has_csum_v2or3(journal))
 		csum_size = sizeof(struct jbd2_journal_revoke_tail);
 
-	if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
+	if (jbd2_has_feature_64bit(journal))
 		sz = 8;
 	else
 		sz = 4;
@@ -619,7 +619,7 @@ static void write_one_revoke_record(journal_t *journal,
 		*descriptorp = descriptor;
 	}
 
-	if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
+	if (jbd2_has_feature_64bit(journal))
 		* ((__be64 *)(&descriptor->b_data[offset])) =
 			cpu_to_be64(record->blocknr);
 	else
-- 
cgit v1.1


From 9008a58e5dcee014f5de69d154e2620870f9224e Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Sat, 17 Oct 2015 21:33:24 -0400
Subject: ext4: make the bitmap read routines return real error codes

Make the bitmap reaading routines return real error codes (EIO,
EFSCORRUPTED, EFSBADCRC) which can then be reflected back to
userspace for more precise diagnosis work.

In particular, this means that mballoc no longer claims that we're out
of memory if the block bitmaps become corrupt.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/balloc.c  |  74 ++++++++++++++++++++-----------------
 fs/ext4/ialloc.c  | 108 ++++++++++++++++++++++++++++++++++++------------------
 fs/ext4/mballoc.c |  46 +++++++++++++++--------
 3 files changed, 143 insertions(+), 85 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 32184c1..ec0668a 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -191,6 +191,7 @@ static int ext4_init_block_bitmap(struct super_block *sb,
 	/* If checksum is bad mark all blocks used to prevent allocation
 	 * essentially implementing a per-group read-only flag. */
 	if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
+		ext4_error(sb, "Checksum bad for group %u", block_group);
 		grp = ext4_get_group_info(sb, block_group);
 		if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
 			percpu_counter_sub(&sbi->s_freeclusters_counter,
@@ -360,42 +361,45 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb,
 	return 0;
 }
 
-static void ext4_validate_block_bitmap(struct super_block *sb,
-				       struct ext4_group_desc *desc,
-				       ext4_group_t block_group,
-				       struct buffer_head *bh)
+static int ext4_validate_block_bitmap(struct super_block *sb,
+				      struct ext4_group_desc *desc,
+				      ext4_group_t block_group,
+				      struct buffer_head *bh)
 {
 	ext4_fsblk_t	blk;
 	struct ext4_group_info *grp = ext4_get_group_info(sb, block_group);
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
-	if (buffer_verified(bh) || EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
-		return;
+	if (buffer_verified(bh))
+		return 0;
+	if (EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+		return -EFSCORRUPTED;
 
 	ext4_lock_group(sb, block_group);
-	blk = ext4_valid_block_bitmap(sb, desc, block_group, bh);
-	if (unlikely(blk != 0)) {
+	if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group,
+			desc, bh))) {
 		ext4_unlock_group(sb, block_group);
-		ext4_error(sb, "bg %u: block %llu: invalid block bitmap",
-			   block_group, blk);
+		ext4_error(sb, "bg %u: bad block bitmap checksum", block_group);
 		if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
 			percpu_counter_sub(&sbi->s_freeclusters_counter,
 					   grp->bb_free);
 		set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
-		return;
+		return -EFSBADCRC;
 	}
-	if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group,
-			desc, bh))) {
+	blk = ext4_valid_block_bitmap(sb, desc, block_group, bh);
+	if (unlikely(blk != 0)) {
 		ext4_unlock_group(sb, block_group);
-		ext4_error(sb, "bg %u: bad block bitmap checksum", block_group);
+		ext4_error(sb, "bg %u: block %llu: invalid block bitmap",
+			   block_group, blk);
 		if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
 			percpu_counter_sub(&sbi->s_freeclusters_counter,
 					   grp->bb_free);
 		set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
-		return;
+		return -EFSCORRUPTED;
 	}
 	set_buffer_verified(bh);
 	ext4_unlock_group(sb, block_group);
+	return 0;
 }
 
 /**
@@ -414,17 +418,18 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
 	struct ext4_group_desc *desc;
 	struct buffer_head *bh;
 	ext4_fsblk_t bitmap_blk;
+	int err;
 
 	desc = ext4_get_group_desc(sb, block_group, NULL);
 	if (!desc)
-		return NULL;
+		return ERR_PTR(-EFSCORRUPTED);
 	bitmap_blk = ext4_block_bitmap(sb, desc);
 	bh = sb_getblk(sb, bitmap_blk);
 	if (unlikely(!bh)) {
 		ext4_error(sb, "Cannot get buffer for block bitmap - "
 			   "block_group = %u, block_bitmap = %llu",
 			   block_group, bitmap_blk);
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 	}
 
 	if (bitmap_uptodate(bh))
@@ -437,7 +442,6 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
 	}
 	ext4_lock_group(sb, block_group);
 	if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
-		int err;
 
 		err = ext4_init_block_bitmap(sb, bh, block_group, desc);
 		set_bitmap_uptodate(bh);
@@ -445,7 +449,7 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
 		ext4_unlock_group(sb, block_group);
 		unlock_buffer(bh);
 		if (err)
-			ext4_error(sb, "Checksum bad for grp %u", block_group);
+			goto out;
 		goto verify;
 	}
 	ext4_unlock_group(sb, block_group);
@@ -468,11 +472,13 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
 	submit_bh(READ | REQ_META | REQ_PRIO, bh);
 	return bh;
 verify:
-	ext4_validate_block_bitmap(sb, desc, block_group, bh);
-	if (buffer_verified(bh))
-		return bh;
+	err = ext4_validate_block_bitmap(sb, desc, block_group, bh);
+	if (err)
+		goto out;
+	return bh;
+out:
 	put_bh(bh);
-	return NULL;
+	return ERR_PTR(err);
 }
 
 /* Returns 0 on success, 1 on error */
@@ -485,32 +491,32 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
 		return 0;
 	desc = ext4_get_group_desc(sb, block_group, NULL);
 	if (!desc)
-		return 1;
+		return -EFSCORRUPTED;
 	wait_on_buffer(bh);
 	if (!buffer_uptodate(bh)) {
 		ext4_error(sb, "Cannot read block bitmap - "
 			   "block_group = %u, block_bitmap = %llu",
 			   block_group, (unsigned long long) bh->b_blocknr);
-		return 1;
+		return -EIO;
 	}
 	clear_buffer_new(bh);
 	/* Panic or remount fs read-only if block bitmap is invalid */
-	ext4_validate_block_bitmap(sb, desc, block_group, bh);
-	/* ...but check for error just in case errors=continue. */
-	return !buffer_verified(bh);
+	return ext4_validate_block_bitmap(sb, desc, block_group, bh);
 }
 
 struct buffer_head *
 ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 {
 	struct buffer_head *bh;
+	int err;
 
 	bh = ext4_read_block_bitmap_nowait(sb, block_group);
-	if (!bh)
-		return NULL;
-	if (ext4_wait_block_bitmap(sb, block_group, bh)) {
+	if (IS_ERR(bh))
+		return bh;
+	err = ext4_wait_block_bitmap(sb, block_group, bh);
+	if (err) {
 		put_bh(bh);
-		return NULL;
+		return ERR_PTR(err);
 	}
 	return bh;
 }
@@ -681,8 +687,10 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
 			desc_count += ext4_free_group_clusters(sb, gdp);
 		brelse(bitmap_bh);
 		bitmap_bh = ext4_read_block_bitmap(sb, i);
-		if (bitmap_bh == NULL)
+		if (IS_ERR(bitmap_bh)) {
+			bitmap_bh = NULL;
 			continue;
+		}
 
 		x = ext4_count_free(bitmap_bh->b_data,
 				    EXT4_CLUSTERS_PER_GROUP(sb) / 8);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 0d7f06e..1b8024d 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -64,7 +64,7 @@ void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
 }
 
 /* Initializes an uninitialized inode bitmap */
-static unsigned ext4_init_inode_bitmap(struct super_block *sb,
+static int ext4_init_inode_bitmap(struct super_block *sb,
 				       struct buffer_head *bh,
 				       ext4_group_t block_group,
 				       struct ext4_group_desc *gdp)
@@ -89,7 +89,7 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb,
 					   count);
 		}
 		set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
-		return 0;
+		return -EFSBADCRC;
 	}
 
 	memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
@@ -99,7 +99,7 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb,
 				   EXT4_INODES_PER_GROUP(sb) / 8);
 	ext4_group_desc_csum_set(sb, block_group, gdp);
 
-	return EXT4_INODES_PER_GROUP(sb);
+	return 0;
 }
 
 void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate)
@@ -112,6 +112,42 @@ void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate)
 	put_bh(bh);
 }
 
+static int ext4_validate_inode_bitmap(struct super_block *sb,
+				      struct ext4_group_desc *desc,
+				      ext4_group_t block_group,
+				      struct buffer_head *bh)
+{
+	ext4_fsblk_t	blk;
+	struct ext4_group_info *grp = ext4_get_group_info(sb, block_group);
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	if (buffer_verified(bh))
+		return 0;
+	if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
+		return -EFSCORRUPTED;
+
+	ext4_lock_group(sb, block_group);
+	blk = ext4_inode_bitmap(sb, desc);
+	if (!ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh,
+					   EXT4_INODES_PER_GROUP(sb) / 8)) {
+		ext4_unlock_group(sb, block_group);
+		ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
+			   "inode_bitmap = %llu", block_group, blk);
+		grp = ext4_get_group_info(sb, block_group);
+		if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
+			int count;
+			count = ext4_free_inodes_count(sb, desc);
+			percpu_counter_sub(&sbi->s_freeinodes_counter,
+					   count);
+		}
+		set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
+		return -EFSBADCRC;
+	}
+	set_buffer_verified(bh);
+	ext4_unlock_group(sb, block_group);
+	return 0;
+}
+
 /*
  * Read the inode allocation bitmap for a given block_group, reading
  * into the specified slot in the superblock's bitmap cache.
@@ -124,12 +160,11 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
 	struct ext4_group_desc *desc;
 	struct buffer_head *bh = NULL;
 	ext4_fsblk_t bitmap_blk;
-	struct ext4_group_info *grp;
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	int err;
 
 	desc = ext4_get_group_desc(sb, block_group, NULL);
 	if (!desc)
-		return NULL;
+		return ERR_PTR(-EFSCORRUPTED);
 
 	bitmap_blk = ext4_inode_bitmap(sb, desc);
 	bh = sb_getblk(sb, bitmap_blk);
@@ -137,7 +172,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
 		ext4_error(sb, "Cannot read inode bitmap - "
 			    "block_group = %u, inode_bitmap = %llu",
 			    block_group, bitmap_blk);
-		return NULL;
+		return ERR_PTR(-EIO);
 	}
 	if (bitmap_uptodate(bh))
 		goto verify;
@@ -150,12 +185,14 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
 
 	ext4_lock_group(sb, block_group);
 	if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
-		ext4_init_inode_bitmap(sb, bh, block_group, desc);
+		err = ext4_init_inode_bitmap(sb, bh, block_group, desc);
 		set_bitmap_uptodate(bh);
 		set_buffer_uptodate(bh);
 		set_buffer_verified(bh);
 		ext4_unlock_group(sb, block_group);
 		unlock_buffer(bh);
+		if (err)
+			goto out;
 		return bh;
 	}
 	ext4_unlock_group(sb, block_group);
@@ -182,31 +219,17 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
 		ext4_error(sb, "Cannot read inode bitmap - "
 			   "block_group = %u, inode_bitmap = %llu",
 			   block_group, bitmap_blk);
-		return NULL;
+		return ERR_PTR(-EIO);
 	}
 
 verify:
-	ext4_lock_group(sb, block_group);
-	if (!buffer_verified(bh) &&
-	    !ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh,
-					   EXT4_INODES_PER_GROUP(sb) / 8)) {
-		ext4_unlock_group(sb, block_group);
-		put_bh(bh);
-		ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
-			   "inode_bitmap = %llu", block_group, bitmap_blk);
-		grp = ext4_get_group_info(sb, block_group);
-		if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
-			int count;
-			count = ext4_free_inodes_count(sb, desc);
-			percpu_counter_sub(&sbi->s_freeinodes_counter,
-					   count);
-		}
-		set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
-		return NULL;
-	}
-	ext4_unlock_group(sb, block_group);
-	set_buffer_verified(bh);
+	err = ext4_validate_inode_bitmap(sb, desc, block_group, bh);
+	if (err)
+		goto out;
 	return bh;
+out:
+	put_bh(bh);
+	return ERR_PTR(err);
 }
 
 /*
@@ -286,8 +309,15 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 	bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
 	/* Don't bother if the inode bitmap is corrupt. */
 	grp = ext4_get_group_info(sb, block_group);
-	if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) || !bitmap_bh)
+	if (IS_ERR(bitmap_bh)) {
+		fatal = PTR_ERR(bitmap_bh);
+		bitmap_bh = NULL;
+		goto error_return;
+	}
+	if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) {
+		fatal = -EFSCORRUPTED;
 		goto error_return;
+	}
 
 	BUFFER_TRACE(bitmap_bh, "get_write_access");
 	fatal = ext4_journal_get_write_access(handle, bitmap_bh);
@@ -826,7 +856,9 @@ got_group:
 		brelse(inode_bitmap_bh);
 		inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
 		/* Skip groups with suspicious inode tables */
-		if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) || !inode_bitmap_bh) {
+		if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) ||
+		    IS_ERR(inode_bitmap_bh)) {
+			inode_bitmap_bh = NULL;
 			if (++group == ngroups)
 				group = 0;
 			continue;
@@ -902,8 +934,8 @@ got:
 		struct buffer_head *block_bitmap_bh;
 
 		block_bitmap_bh = ext4_read_block_bitmap(sb, group);
-		if (!block_bitmap_bh) {
-			err = -EIO;
+		if (IS_ERR(block_bitmap_bh)) {
+			err = PTR_ERR(block_bitmap_bh);
 			goto out;
 		}
 		BUFFER_TRACE(block_bitmap_bh, "get block bitmap access");
@@ -1123,8 +1155,10 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
 	block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
 	bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
 	bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
-	if (!bitmap_bh) {
-		ext4_warning(sb, "inode bitmap error for orphan %lu", ino);
+	if (IS_ERR(bitmap_bh)) {
+		err = PTR_ERR(bitmap_bh);
+		ext4_warning(sb, "inode bitmap error %ld for orphan %lu",
+			     ino, err);
 		goto error;
 	}
 
@@ -1199,8 +1233,10 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
 		desc_count += ext4_free_inodes_count(sb, gdp);
 		brelse(bitmap_bh);
 		bitmap_bh = ext4_read_inode_bitmap(sb, i);
-		if (!bitmap_bh)
+		if (IS_ERR(bitmap_bh)) {
+			bitmap_bh = NULL;
 			continue;
+		}
 
 		x = ext4_count_free(bitmap_bh->b_data,
 				    EXT4_INODES_PER_GROUP(sb) / 8);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b0f7ee5..6794ff4 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -874,8 +874,10 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 			bh[i] = NULL;
 			continue;
 		}
-		if (!(bh[i] = ext4_read_block_bitmap_nowait(sb, group))) {
-			err = -ENOMEM;
+		bh[i] = ext4_read_block_bitmap_nowait(sb, group);
+		if (IS_ERR(bh[i])) {
+			err = PTR_ERR(bh[i]);
+			bh[i] = NULL;
 			goto out;
 		}
 		mb_debug(1, "read bitmap for group %u\n", group);
@@ -883,8 +885,13 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 
 	/* wait for I/O completion */
 	for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
-		if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i]))
-			err = -EIO;
+		int err2;
+
+		if (!bh[i])
+			continue;
+		err2 = ext4_wait_block_bitmap(sb, group, bh[i]);
+		if (!err)
+			err = err2;
 	}
 
 	first_block = page->index * blocks_per_page;
@@ -2447,7 +2454,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
 			kmalloc(sb->s_blocksize, GFP_NOFS);
 		BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
 		bh = ext4_read_block_bitmap(sb, group);
-		BUG_ON(bh == NULL);
+		BUG_ON(IS_ERR_OR_NULL(bh));
 		memcpy(meta_group_info[i]->bb_bitmap, bh->b_data,
 			sb->s_blocksize);
 		put_bh(bh);
@@ -2889,10 +2896,12 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 	sb = ac->ac_sb;
 	sbi = EXT4_SB(sb);
 
-	err = -EIO;
 	bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
-	if (!bitmap_bh)
+	if (IS_ERR(bitmap_bh)) {
+		err = PTR_ERR(bitmap_bh);
+		bitmap_bh = NULL;
 		goto out_err;
+	}
 
 	BUFFER_TRACE(bitmap_bh, "getting write access");
 	err = ext4_journal_get_write_access(handle, bitmap_bh);
@@ -3836,8 +3845,10 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
 		return 0;
 
 	bitmap_bh = ext4_read_block_bitmap(sb, group);
-	if (bitmap_bh == NULL) {
-		ext4_error(sb, "Error reading block bitmap for %u", group);
+	if (IS_ERR(bitmap_bh)) {
+		err = PTR_ERR(bitmap_bh);
+		ext4_error(sb, "Error %d reading block bitmap for %u",
+			   err, group);
 		return 0;
 	}
 
@@ -4008,9 +4019,10 @@ repeat:
 		}
 
 		bitmap_bh = ext4_read_block_bitmap(sb, group);
-		if (bitmap_bh == NULL) {
-			ext4_error(sb, "Error reading block bitmap for %u",
-					group);
+		if (IS_ERR(bitmap_bh)) {
+			err = PTR_ERR(bitmap_bh);
+			ext4_error(sb, "Error %d reading block bitmap for %u",
+					err, group);
 			ext4_mb_unload_buddy(&e4b);
 			continue;
 		}
@@ -4754,8 +4766,9 @@ do_more:
 	}
 	count_clusters = EXT4_NUM_B2C(sbi, count);
 	bitmap_bh = ext4_read_block_bitmap(sb, block_group);
-	if (!bitmap_bh) {
-		err = -EIO;
+	if (IS_ERR(bitmap_bh)) {
+		err = PTR_ERR(bitmap_bh);
+		bitmap_bh = NULL;
 		goto error_return;
 	}
 	gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
@@ -4924,8 +4937,9 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
 	}
 
 	bitmap_bh = ext4_read_block_bitmap(sb, block_group);
-	if (!bitmap_bh) {
-		err = -EIO;
+	if (IS_ERR(bitmap_bh)) {
+		err = PTR_ERR(bitmap_bh);
+		bitmap_bh = NULL;
 		goto error_return;
 	}
 
-- 
cgit v1.1


From 9c02ac97989d544c89f049e8add7149aaea01671 Mon Sep 17 00:00:00 2001
From: Daeho Jeong <daeho.jeong@samsung.com>
Date: Sat, 17 Oct 2015 22:28:21 -0400
Subject: ext4: fix xfstest generic/269 double revoked buffer bug with bigalloc

When you repeatly execute xfstest generic/269 with bigalloc_1k option
enabled using the below command:

"./kvm-xfstests -c bigalloc_1k -m nodelalloc -C 1000 generic/269"

you can easily see the below bug message.

"JBD2 unexpected failure: jbd2_journal_revoke: !buffer_revoked(bh);"

This means that an already revoked buffer is erroneously revoked again
and it is caused by doing revoke for the buffer at the wrong position
in ext4_free_blocks(). We need to re-position the buffer revoke
procedure for an unspecified buffer after checking the cluster boundary
for bigalloc option. If not, some part of the cluster can be doubly
revoked.

Signed-off-by: Daeho Jeong <daeho.jeong@samsung.com>
---
 fs/ext4/mballoc.c | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 6794ff4..61eaf74 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4687,22 +4687,11 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 	ext4_debug("freeing block %llu\n", block);
 	trace_ext4_free_blocks(inode, block, count, flags);
 
-	if (flags & EXT4_FREE_BLOCKS_FORGET) {
-		struct buffer_head *tbh = bh;
-		int i;
-
-		BUG_ON(bh && (count > 1));
+	if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
+		BUG_ON(count > 1);
 
-		for (i = 0; i < count; i++) {
-			cond_resched();
-			if (!bh)
-				tbh = sb_find_get_block(inode->i_sb,
-							block + i);
-			if (!tbh)
-				continue;
-			ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
-				    inode, tbh, block + i);
-		}
+		ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
+			    inode, bh, block);
 	}
 
 	/*
@@ -4747,6 +4736,19 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 			count += sbi->s_cluster_ratio - overflow;
 	}
 
+	if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
+		int i;
+
+		for (i = 0; i < count; i++) {
+			cond_resched();
+			bh = sb_find_get_block(inode->i_sb, block + i);
+			if (!bh)
+				continue;
+			ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
+				    inode, bh, block + i);
+		}
+	}
+
 do_more:
 	overflow = 0;
 	ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
-- 
cgit v1.1


From 33d14975e5ac469963d5d63856b61698ad0bff07 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.com>
Date: Sat, 17 Oct 2015 22:35:09 -0400
Subject: jbd2: fix checkpoint list cleanup

Unlike comments and expectation of callers journal_clean_one_cp_list()
returned 1 not only if it freed the transaction but also if it freed
some buffers in the transaction. That could make
__jbd2_journal_clean_checkpoint_list() skip processing
t_checkpoint_io_list and continue with processing the next transaction.
This is mostly a cosmetic issue since the only result is we can
sometimes free less memory than we could. But it's still worth fixing.
Fix journal_clean_one_cp_list() to return 1 only if the transaction was
really freed.

Fixes: 50849db32a9f529235a84bcc84a6b8e631b1d0ec
Signed-off-by: Jan Kara <jack@suse.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@vger.kernel.org
---
 fs/jbd2/checkpoint.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 8c44654..684996c 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -427,7 +427,6 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy)
 	struct journal_head *last_jh;
 	struct journal_head *next_jh = jh;
 	int ret;
-	int freed = 0;
 
 	if (!jh)
 		return 0;
@@ -441,10 +440,9 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy)
 		else
 			ret = __jbd2_journal_remove_checkpoint(jh) + 1;
 		if (!ret)
-			return freed;
+			return 0;
 		if (ret == 2)
 			return 1;
-		freed = 1;
 		/*
 		 * This function only frees up some memory
 		 * if possible so we dont have an obligation
@@ -452,10 +450,10 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy)
 		 * requested:
 		 */
 		if (need_resched())
-			return freed;
+			return 0;
 	} while (jh != last_jh);
 
-	return freed;
+	return 0;
 }
 
 /*
-- 
cgit v1.1


From 6934da9238da947628be83635e365df41064b09b Mon Sep 17 00:00:00 2001
From: Lukas Czerner <lczerner@redhat.com>
Date: Sat, 17 Oct 2015 22:57:06 -0400
Subject: ext4: fix potential use after free in __ext4_journal_stop

There is a use-after-free possibility in __ext4_journal_stop() in the
case that we free the handle in the first jbd2_journal_stop() because
we're referencing handle->h_err afterwards. This was introduced in
9705acd63b125dee8b15c705216d7186daea4625 and it is wrong. Fix it by
storing the handle->h_err value beforehand and avoid referencing
potentially freed handle.

Fixes: 9705acd63b125dee8b15c705216d7186daea4625
Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
Cc: stable@vger.kernel.org
---
 fs/ext4/ext4_jbd2.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index d418431..e770c1ee 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -88,13 +88,13 @@ int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
 		return 0;
 	}
 
+	err = handle->h_err;
 	if (!handle->h_transaction) {
-		err = jbd2_journal_stop(handle);
-		return handle->h_err ? handle->h_err : err;
+		rc = jbd2_journal_stop(handle);
+		return err ? err : rc;
 	}
 
 	sb = handle->h_transaction->t_journal->j_private;
-	err = handle->h_err;
 	rc = jbd2_journal_stop(handle);
 
 	if (!err)
-- 
cgit v1.1


From 904dad4742d211b7a8910e92695c0fa957483836 Mon Sep 17 00:00:00 2001
From: Andy Leiserson <andy@leiserson.org>
Date: Sun, 18 Oct 2015 00:36:29 -0400
Subject: [PATCH] fix calculation of meta_bg descriptor backups

"group" is the group where the backup will be placed, and is
initialized to zero in the declaration. This meant that backups for
meta_bg descriptors were erroneously written to the backup block group
descriptors in groups 1 and (desc_per_block-1).

Reproduction information:
  mke2fs -Fq -t ext4 -b 1024 -O ^resize_inode /tmp/foo.img 16G
  truncate -s 24G /tmp/foo.img
  losetup /dev/loop0 /tmp/foo.img
  mount /dev/loop0 /mnt
  resize2fs /dev/loop0
  umount /dev/loop0
  dd if=/dev/zero of=/dev/loop0 bs=1024 count=2
  e2fsck -fy /dev/loop0
  losetup -d /dev/loop0

Signed-off-by: Andy Leiserson <andy@leiserson.org>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@vger.kernel.org
---
 fs/ext4/resize.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 469eecc..ad62d7a 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1039,7 +1039,7 @@ exit_free:
  * do not copy the full number of backups at this time.  The resize
  * which changed s_groups_count will backup again.
  */
-static void update_backups(struct super_block *sb, int blk_off, char *data,
+static void update_backups(struct super_block *sb, sector_t blk_off, char *data,
 			   int size, int meta_bg)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -1064,7 +1064,7 @@ static void update_backups(struct super_block *sb, int blk_off, char *data,
 		group = ext4_list_backups(sb, &three, &five, &seven);
 		last = sbi->s_groups_count;
 	} else {
-		group = ext4_meta_bg_first_group(sb, group) + 1;
+		group = ext4_get_group_number(sb, blk_off) + 1;
 		last = (ext4_group_t)(group + EXT4_DESC_PER_BLOCK(sb) - 2);
 	}
 
-- 
cgit v1.1


From b97f679954b7771183f3c0a6a47b17e185e64a8a Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@codeaurora.org>
Date: Mon, 12 Oct 2015 18:09:09 -0700
Subject: debugfs: Consolidate file mode checks in debugfs_create_*()

The code that creates debugfs file with different file ops based
on the file mode is duplicated in each debugfs_create_*() API.
Consolidate that code into debugfs_create_mode(), that takes
three file ops structures so that we don't have to keep
copy/pasting that logic.

Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
Reviewed-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/debugfs/file.c | 98 ++++++++++++++++++-------------------------------------
 1 file changed, 32 insertions(+), 66 deletions(-)

(limited to 'fs')

diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index b70c20f..5d594ef 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -42,6 +42,22 @@ const struct file_operations debugfs_file_operations = {
 	.llseek =	noop_llseek,
 };
 
+static struct dentry *debugfs_create_mode(const char *name, umode_t mode,
+					  struct dentry *parent, void *value,
+				          const struct file_operations *fops,
+				          const struct file_operations *fops_ro,
+				          const struct file_operations *fops_wo)
+{
+	/* if there are no write bits set, make read only */
+	if (!(mode & S_IWUGO))
+		return debugfs_create_file(name, mode, parent, value, fops_ro);
+	/* if there are no read bits set, make write only */
+	if (!(mode & S_IRUGO))
+		return debugfs_create_file(name, mode, parent, value, fops_wo);
+
+	return debugfs_create_file(name, mode, parent, value, fops);
+}
+
 static int debugfs_u8_set(void *data, u64 val)
 {
 	*(u8 *)data = val;
@@ -83,14 +99,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n");
 struct dentry *debugfs_create_u8(const char *name, umode_t mode,
 				 struct dentry *parent, u8 *value)
 {
-	/* if there are no write bits set, make read only */
-	if (!(mode & S_IWUGO))
-		return debugfs_create_file(name, mode, parent, value, &fops_u8_ro);
-	/* if there are no read bits set, make write only */
-	if (!(mode & S_IRUGO))
-		return debugfs_create_file(name, mode, parent, value, &fops_u8_wo);
-
-	return debugfs_create_file(name, mode, parent, value, &fops_u8);
+	return debugfs_create_mode(name, mode, parent, value, &fops_u8,
+				   &fops_u8_ro, &fops_u8_wo);
 }
 EXPORT_SYMBOL_GPL(debugfs_create_u8);
 
@@ -135,14 +145,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n");
 struct dentry *debugfs_create_u16(const char *name, umode_t mode,
 				  struct dentry *parent, u16 *value)
 {
-	/* if there are no write bits set, make read only */
-	if (!(mode & S_IWUGO))
-		return debugfs_create_file(name, mode, parent, value, &fops_u16_ro);
-	/* if there are no read bits set, make write only */
-	if (!(mode & S_IRUGO))
-		return debugfs_create_file(name, mode, parent, value, &fops_u16_wo);
-
-	return debugfs_create_file(name, mode, parent, value, &fops_u16);
+	return debugfs_create_mode(name, mode, parent, value, &fops_u16,
+				   &fops_u16_ro, &fops_u16_wo);
 }
 EXPORT_SYMBOL_GPL(debugfs_create_u16);
 
@@ -187,14 +191,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n");
 struct dentry *debugfs_create_u32(const char *name, umode_t mode,
 				 struct dentry *parent, u32 *value)
 {
-	/* if there are no write bits set, make read only */
-	if (!(mode & S_IWUGO))
-		return debugfs_create_file(name, mode, parent, value, &fops_u32_ro);
-	/* if there are no read bits set, make write only */
-	if (!(mode & S_IRUGO))
-		return debugfs_create_file(name, mode, parent, value, &fops_u32_wo);
-
-	return debugfs_create_file(name, mode, parent, value, &fops_u32);
+	return debugfs_create_mode(name, mode, parent, value, &fops_u32,
+				   &fops_u32_ro, &fops_u32_wo);
 }
 EXPORT_SYMBOL_GPL(debugfs_create_u32);
 
@@ -240,14 +238,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n");
 struct dentry *debugfs_create_u64(const char *name, umode_t mode,
 				 struct dentry *parent, u64 *value)
 {
-	/* if there are no write bits set, make read only */
-	if (!(mode & S_IWUGO))
-		return debugfs_create_file(name, mode, parent, value, &fops_u64_ro);
-	/* if there are no read bits set, make write only */
-	if (!(mode & S_IRUGO))
-		return debugfs_create_file(name, mode, parent, value, &fops_u64_wo);
-
-	return debugfs_create_file(name, mode, parent, value, &fops_u64);
+	return debugfs_create_mode(name, mode, parent, value, &fops_u64,
+				   &fops_u64_ro, &fops_u64_wo);
 }
 EXPORT_SYMBOL_GPL(debugfs_create_u64);
 
@@ -286,14 +278,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n
 struct dentry *debugfs_create_x8(const char *name, umode_t mode,
 				 struct dentry *parent, u8 *value)
 {
-	/* if there are no write bits set, make read only */
-	if (!(mode & S_IWUGO))
-		return debugfs_create_file(name, mode, parent, value, &fops_x8_ro);
-	/* if there are no read bits set, make write only */
-	if (!(mode & S_IRUGO))
-		return debugfs_create_file(name, mode, parent, value, &fops_x8_wo);
-
-	return debugfs_create_file(name, mode, parent, value, &fops_x8);
+	return debugfs_create_mode(name, mode, parent, value, &fops_x8,
+				   &fops_x8_ro, &fops_x8_wo);
 }
 EXPORT_SYMBOL_GPL(debugfs_create_x8);
 
@@ -310,14 +296,8 @@ EXPORT_SYMBOL_GPL(debugfs_create_x8);
 struct dentry *debugfs_create_x16(const char *name, umode_t mode,
 				 struct dentry *parent, u16 *value)
 {
-	/* if there are no write bits set, make read only */
-	if (!(mode & S_IWUGO))
-		return debugfs_create_file(name, mode, parent, value, &fops_x16_ro);
-	/* if there are no read bits set, make write only */
-	if (!(mode & S_IRUGO))
-		return debugfs_create_file(name, mode, parent, value, &fops_x16_wo);
-
-	return debugfs_create_file(name, mode, parent, value, &fops_x16);
+	return debugfs_create_mode(name, mode, parent, value, &fops_x16,
+				   &fops_x16_ro, &fops_x16_wo);
 }
 EXPORT_SYMBOL_GPL(debugfs_create_x16);
 
@@ -334,14 +314,8 @@ EXPORT_SYMBOL_GPL(debugfs_create_x16);
 struct dentry *debugfs_create_x32(const char *name, umode_t mode,
 				 struct dentry *parent, u32 *value)
 {
-	/* if there are no write bits set, make read only */
-	if (!(mode & S_IWUGO))
-		return debugfs_create_file(name, mode, parent, value, &fops_x32_ro);
-	/* if there are no read bits set, make write only */
-	if (!(mode & S_IRUGO))
-		return debugfs_create_file(name, mode, parent, value, &fops_x32_wo);
-
-	return debugfs_create_file(name, mode, parent, value, &fops_x32);
+	return debugfs_create_mode(name, mode, parent, value, &fops_x32,
+				   &fops_x32_ro, &fops_x32_wo);
 }
 EXPORT_SYMBOL_GPL(debugfs_create_x32);
 
@@ -422,16 +396,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, "%lld\n");
 struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode,
 				 struct dentry *parent, atomic_t *value)
 {
-	/* if there are no write bits set, make read only */
-	if (!(mode & S_IWUGO))
-		return debugfs_create_file(name, mode, parent, value,
-					&fops_atomic_t_ro);
-	/* if there are no read bits set, make write only */
-	if (!(mode & S_IRUGO))
-		return debugfs_create_file(name, mode, parent, value,
-					&fops_atomic_t_wo);
-
-	return debugfs_create_file(name, mode, parent, value, &fops_atomic_t);
+	return debugfs_create_mode(name, mode, parent, value, &fops_atomic_t,
+				   &fops_atomic_t_ro, &fops_atomic_t_wo);
 }
 EXPORT_SYMBOL_GPL(debugfs_create_atomic_t);
 
-- 
cgit v1.1


From 82b7d4fb4e0bcb8fe0d49edea21f595664f83c2f Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@codeaurora.org>
Date: Mon, 12 Oct 2015 18:09:10 -0700
Subject: debugfs: Add read-only/write-only x64 file ops

There aren't any read-only or write-only x64 file ops, but there
is a caller of debugfs_create_x64() that calls it with mode equal
to S_IRUGO. This leads to the possibility of userspace modifying
the file, so let's use the newly created debugfs_create_mode()
helper here to fix this.

Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
Reviewed-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/debugfs/file.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 5d594ef..f69d42e 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -256,6 +256,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n");
 DEFINE_SIMPLE_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n");
 
 DEFINE_SIMPLE_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_x64_ro, debugfs_u64_get, NULL, "0x%016llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_x64_wo, NULL, debugfs_u64_set, "0x%016llx\n");
 
 /*
  * debugfs_create_x{8,16,32,64} - create a debugfs file that is used to read and write an unsigned {8,16,32,64}-bit value
@@ -332,7 +334,8 @@ EXPORT_SYMBOL_GPL(debugfs_create_x32);
 struct dentry *debugfs_create_x64(const char *name, umode_t mode,
 				 struct dentry *parent, u64 *value)
 {
-	return debugfs_create_file(name, mode, parent, value, &fops_x64);
+	return debugfs_create_mode(name, mode, parent, value, &fops_x64,
+				   &fops_x64_ro, &fops_x64_wo);
 }
 EXPORT_SYMBOL_GPL(debugfs_create_x64);
 
-- 
cgit v1.1


From 6db6652abc3d81e155b6c8740065e9a4927825d9 Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@codeaurora.org>
Date: Mon, 12 Oct 2015 18:09:11 -0700
Subject: debugfs: Add read-only/write-only size_t file ops

There aren't any read-only or write-only size_t file ops, but there
is a caller of debugfs_create_size_t() that calls it with mode
equal to 0400. This leads to the possibility of userspace
modifying the file, so let's use the newly created
debugfs_create_mode() helper here to fix this.

Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
Reviewed-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/debugfs/file.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index f69d42e..e8e73ae 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -352,6 +352,8 @@ static int debugfs_size_t_get(void *data, u64 *val)
 }
 DEFINE_SIMPLE_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set,
 			"%llu\n");	/* %llu and %zu are more or less the same */
+DEFINE_SIMPLE_ATTRIBUTE(fops_size_t_ro, debugfs_size_t_get, NULL, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_size_t_wo, NULL, debugfs_size_t_set, "%llu\n");
 
 /**
  * debugfs_create_size_t - create a debugfs file that is used to read and write an size_t value
@@ -366,7 +368,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set,
 struct dentry *debugfs_create_size_t(const char *name, umode_t mode,
 				     struct dentry *parent, size_t *value)
 {
-	return debugfs_create_file(name, mode, parent, value, &fops_size_t);
+	return debugfs_create_mode(name, mode, parent, value, &fops_size_t,
+				   &fops_size_t_ro, &fops_size_t_wo);
 }
 EXPORT_SYMBOL_GPL(debugfs_create_size_t);
 
-- 
cgit v1.1


From 6713e8fb541656d00249dca6627395b461c39ece Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@codeaurora.org>
Date: Mon, 12 Oct 2015 18:09:12 -0700
Subject: debugfs: Add read-only/write-only bool file ops

There aren't any read-only or write-only bool file ops, but there
is a caller of debugfs_create_bool() that calls it with mode
equal to 0400. This leads to the possibility of userspace
modifying the file, so let's use the newly created
debugfs_create_mode() helper here to fix this.

Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
Reviewed-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/debugfs/file.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index e8e73ae..8450549 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -450,6 +450,18 @@ static const struct file_operations fops_bool = {
 	.llseek =	default_llseek,
 };
 
+static const struct file_operations fops_bool_ro = {
+	.read =		debugfs_read_file_bool,
+	.open =		simple_open,
+	.llseek =	default_llseek,
+};
+
+static const struct file_operations fops_bool_wo = {
+	.write =	debugfs_write_file_bool,
+	.open =		simple_open,
+	.llseek =	default_llseek,
+};
+
 /**
  * debugfs_create_bool - create a debugfs file that is used to read and write a boolean value
  * @name: a pointer to a string containing the name of the file to create.
@@ -477,7 +489,8 @@ static const struct file_operations fops_bool = {
 struct dentry *debugfs_create_bool(const char *name, umode_t mode,
 				   struct dentry *parent, bool *value)
 {
-	return debugfs_create_file(name, mode, parent, value, &fops_bool);
+	return debugfs_create_mode(name, mode, parent, value, &fops_bool,
+				   &fops_bool_ro, &fops_bool_wo);
 }
 EXPORT_SYMBOL_GPL(debugfs_create_bool);
 
-- 
cgit v1.1


From c23fe83138ed7b11ad763cbe8bf98e5378c04bd6 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Sun, 18 Oct 2015 22:43:19 +0530
Subject: debugfs: Add debugfs_create_ulong()

Add debugfs_create_ulong() for the users of type 'unsigned long'. These
will be 32 bits long on a 32 bit machine and 64 bits long on a 64 bit
machine.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/debugfs/file.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

(limited to 'fs')

diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 8450549..d2ba12e 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -243,6 +243,54 @@ struct dentry *debugfs_create_u64(const char *name, umode_t mode,
 }
 EXPORT_SYMBOL_GPL(debugfs_create_u64);
 
+static int debugfs_ulong_set(void *data, u64 val)
+{
+	*(unsigned long *)data = val;
+	return 0;
+}
+
+static int debugfs_ulong_get(void *data, u64 *val)
+{
+	*val = *(unsigned long *)data;
+	return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(fops_ulong, debugfs_ulong_get, debugfs_ulong_set, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_ulong_ro, debugfs_ulong_get, NULL, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_ulong_wo, NULL, debugfs_ulong_set, "%llu\n");
+
+/**
+ * debugfs_create_ulong - create a debugfs file that is used to read and write
+ * an unsigned long value.
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *          directory dentry if set.  If this parameter is %NULL, then the
+ *          file will be created in the root of the debugfs filesystem.
+ * @value: a pointer to the variable that the file should read to and write
+ *         from.
+ *
+ * This function creates a file in debugfs with the given name that
+ * contains the value of the variable @value.  If the @mode variable is so
+ * set, it can be read from, and written to.
+ *
+ * This function will return a pointer to a dentry if it succeeds.  This
+ * pointer must be passed to the debugfs_remove() function when the file is
+ * to be removed (no automatic cleanup happens if your module is unloaded,
+ * you are responsible here.)  If an error occurs, %NULL will be returned.
+ *
+ * If debugfs is not enabled in the kernel, the value -%ENODEV will be
+ * returned.  It is not wise to check for this value, but rather, check for
+ * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
+ * code.
+ */
+struct dentry *debugfs_create_ulong(const char *name, umode_t mode,
+				    struct dentry *parent, unsigned long *value)
+{
+	return debugfs_create_mode(name, mode, parent, value, &fops_ulong,
+				   &fops_ulong_ro, &fops_ulong_wo);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_ulong);
+
 DEFINE_SIMPLE_ATTRIBUTE(fops_x8, debugfs_u8_get, debugfs_u8_set, "0x%02llx\n");
 DEFINE_SIMPLE_ATTRIBUTE(fops_x8_ro, debugfs_u8_get, NULL, "0x%02llx\n");
 DEFINE_SIMPLE_ATTRIBUTE(fops_x8_wo, NULL, debugfs_u8_set, "0x%02llx\n");
-- 
cgit v1.1


From 4327ba52afd03fc4b5afa0ee1d774c9c5b0e85c5 Mon Sep 17 00:00:00 2001
From: Daeho Jeong <daeho.jeong@samsung.com>
Date: Sun, 18 Oct 2015 17:02:56 -0400
Subject: ext4, jbd2: ensure entering into panic after recording an error in
 superblock

If a EXT4 filesystem utilizes JBD2 journaling and an error occurs, the
journaling will be aborted first and the error number will be recorded
into JBD2 superblock and, finally, the system will enter into the
panic state in "errors=panic" option.  But, in the rare case, this
sequence is little twisted like the below figure and it will happen
that the system enters into panic state, which means the system reset
in mobile environment, before completion of recording an error in the
journal superblock. In this case, e2fsck cannot recognize that the
filesystem failure occurred in the previous run and the corruption
wouldn't be fixed.

Task A                        Task B
ext4_handle_error()
-> jbd2_journal_abort()
  -> __journal_abort_soft()
    -> __jbd2_journal_abort_hard()
    | -> journal->j_flags |= JBD2_ABORT;
    |
    |                         __ext4_abort()
    |                         -> jbd2_journal_abort()
    |                         | -> __journal_abort_soft()
    |                         |   -> if (journal->j_flags & JBD2_ABORT)
    |                         |           return;
    |                         -> panic()
    |
    -> jbd2_journal_update_sb_errno()

Tested-by: Hobin Woo <hobin.woo@samsung.com>
Signed-off-by: Daeho Jeong <daeho.jeong@samsung.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@vger.kernel.org
---
 fs/ext4/super.c   | 12 ++++++++++--
 fs/jbd2/journal.c |  6 +++++-
 2 files changed, 15 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 49fa4d4..639613f 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -388,9 +388,13 @@ static void ext4_handle_error(struct super_block *sb)
 		smp_wmb();
 		sb->s_flags |= MS_RDONLY;
 	}
-	if (test_opt(sb, ERRORS_PANIC))
+	if (test_opt(sb, ERRORS_PANIC)) {
+		if (EXT4_SB(sb)->s_journal &&
+		  !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR))
+			return;
 		panic("EXT4-fs (device %s): panic forced after error\n",
 			sb->s_id);
+	}
 }
 
 #define ext4_error_ratelimit(sb)					\
@@ -585,8 +589,12 @@ void __ext4_abort(struct super_block *sb, const char *function,
 			jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
 		save_error_info(sb, function, line);
 	}
-	if (test_opt(sb, ERRORS_PANIC))
+	if (test_opt(sb, ERRORS_PANIC)) {
+		if (EXT4_SB(sb)->s_journal &&
+		  !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR))
+			return;
 		panic("EXT4-fs panic from previous error\n");
+	}
 }
 
 void __ext4_msg(struct super_block *sb,
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index eb8a754..81e6226 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -2072,8 +2072,12 @@ static void __journal_abort_soft (journal_t *journal, int errno)
 
 	__jbd2_journal_abort_hard(journal);
 
-	if (errno)
+	if (errno) {
 		jbd2_journal_update_sb_errno(journal);
+		write_lock(&journal->j_state_lock);
+		journal->j_flags |= JBD2_REC_ERR;
+		write_unlock(&journal->j_state_lock);
+	}
 }
 
 /**
-- 
cgit v1.1


From 985ef4dcf94ac0113e2b32ef86bdd5bc47c016b3 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 19 Oct 2015 08:42:46 +1100
Subject: xfs: stats are no longer dependent on CONFIG_PROC_FS

So we need to fix the makefile to understand this, otherwise build
errors with CONFIG_PROC_FS=n occur.

Reported-and-tested-by: Jim Davis <jim.epost@gmail.com>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/Makefile    | 2 +-
 fs/xfs/xfs_stats.c | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index a096841..f646391 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -84,6 +84,7 @@ xfs-y				+= xfs_aops.o \
 				   xfs_message.o \
 				   xfs_mount.o \
 				   xfs_mru_cache.o \
+				   xfs_stats.o \
 				   xfs_super.o \
 				   xfs_symlink.o \
 				   xfs_sysfs.o \
@@ -118,7 +119,6 @@ xfs-$(CONFIG_XFS_QUOTA)		+= xfs_dquot.o \
 xfs-$(CONFIG_XFS_RT)		+= xfs_rtalloc.o
 
 xfs-$(CONFIG_XFS_POSIX_ACL)	+= xfs_acl.o
-xfs-$(CONFIG_PROC_FS)		+= xfs_stats.o
 xfs-$(CONFIG_SYSCTL)		+= xfs_sysctl.o
 xfs-$(CONFIG_COMPAT)		+= xfs_ioctl32.o
 xfs-$(CONFIG_NFSD_PNFS)		+= xfs_pnfs.o
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index bd50619..8686df6 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -161,6 +161,7 @@ static const struct file_operations xqmstat_proc_fops = {
 };
 #endif /* CONFIG_XFS_QUOTA */
 
+#ifdef CONFIG_PROC_FS
 int
 xfs_init_procfs(void)
 {
@@ -191,3 +192,4 @@ xfs_cleanup_procfs(void)
 {
 	remove_proc_subtree("fs/xfs", NULL);
 }
+#endif /* CONFIG_PROC_FS */
-- 
cgit v1.1


From f9d460b341f23a9bb7df8868975fdfcc2e71aa9b Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Mon, 19 Oct 2015 08:42:47 +1100
Subject: xfs: fix an error code in xfs_fs_fill_super()

If alloc_percpu() fails, we accidentally return PTR_ERR(NULL), which
means success, but we intended to return -ENOMEM.

Fixes: 225e4635580c ('xfs: per-filesystem stats in sysfs')
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Reviewed-by: Bill O'Donnell <billodo@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 368c55a..b2c252c 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1477,7 +1477,7 @@ xfs_fs_fill_super(
 	/* Allocate stats memory before we do operations that might use it */
 	mp->m_stats.xs_stats = alloc_percpu(struct xfsstats);
 	if (!mp->m_stats.xs_stats) {
-		error = PTR_ERR(mp->m_stats.xs_stats);
+		error = -ENOMEM;
 		goto out_destroy_counters;
 	}
 
-- 
cgit v1.1


From 37c1c04cca920de8a68285751b2c7b3d937ad50c Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
Date: Wed, 22 Apr 2015 19:36:06 +0300
Subject: sysfs: added __compat_only_sysfs_link_entry_to_kobj()

Added a new function __compat_only_sysfs_link_group_to_kobj() that adds
a symlink from attribute or group to a kobject. This needed for
maintaining backwards compatibility with PPI attributes in the TPM
driver.

Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
Signed-off-by: Peter Huewe <peterhuewe@gmx.de>
---
 fs/sysfs/group.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

(limited to 'fs')

diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 39a0199..e123659 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -352,3 +352,47 @@ void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name,
 	}
 }
 EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group);
+
+/**
+ * __compat_only_sysfs_link_entry_to_kobj - add a symlink to a kobject pointing
+ * to a group or an attribute
+ * @kobj:		The kobject containing the group.
+ * @target_kobj:	The target kobject.
+ * @target_name:	The name of the target group or attribute.
+ */
+int __compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj,
+				      struct kobject *target_kobj,
+				      const char *target_name)
+{
+	struct kernfs_node *target;
+	struct kernfs_node *entry;
+	struct kernfs_node *link;
+
+	/*
+	 * We don't own @target_kobj and it may be removed at any time.
+	 * Synchronize using sysfs_symlink_target_lock. See sysfs_remove_dir()
+	 * for details.
+	 */
+	spin_lock(&sysfs_symlink_target_lock);
+	target = target_kobj->sd;
+	if (target)
+		kernfs_get(target);
+	spin_unlock(&sysfs_symlink_target_lock);
+	if (!target)
+		return -ENOENT;
+
+	entry = kernfs_find_and_get(target_kobj->sd, target_name);
+	if (!entry) {
+		kernfs_put(target);
+		return -ENOENT;
+	}
+
+	link = kernfs_create_link(kobj->sd, target_name, entry);
+	if (IS_ERR(link) && PTR_ERR(link) == -EEXIST)
+		sysfs_warn_dup(kobj->sd, target_name);
+
+	kernfs_put(entry);
+	kernfs_put(target);
+	return IS_ERR(link) ? PTR_ERR(link) : 0;
+}
+EXPORT_SYMBOL_GPL(__compat_only_sysfs_link_entry_to_kobj);
-- 
cgit v1.1


From c93cf2d75779d73376e84e2547de1304113d3f1c Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Sun, 18 Oct 2015 23:35:32 -0400
Subject: ext4: explicit mount options parsing cleanup

Currently MOPT_EXPLICIT treated as EXPLICIT_DELALLOC which may be changed
in future. Let's fix it now.

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/super.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 639613f..db4be39 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1516,8 +1516,12 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
 		return -1;
 	if (args->from && (m->flags & MOPT_GTE0) && (arg < 0))
 		return -1;
-	if (m->flags & MOPT_EXPLICIT)
-		set_opt2(sb, EXPLICIT_DELALLOC);
+	if (m->flags & MOPT_EXPLICIT) {
+		if (m->mount_opt & EXT4_MOUNT_DELALLOC) {
+			set_opt2(sb, EXPLICIT_DELALLOC);
+		} else
+			return -1;
+	}
 	if (m->flags & MOPT_CLEAR_ERR)
 		clear_opt(sb, ERRORS_MASK);
 	if (token == Opt_noquota && sb_any_quota_loaded(sb)) {
-- 
cgit v1.1


From 1e381f60dad913625956ed3a9c5714d9fc7e865f Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Sun, 18 Oct 2015 23:50:26 -0400
Subject: ext4: do not allow journal_opts for fs w/o journal

It is appeared that we can pass journal related mount options and such options
be shown in /proc/mounts

Example:
#mkfs.ext4 -F /dev/vdb
#tune2fs -O ^has_journal /dev/vdb
#mount /dev/vdb /mnt/  -ocommit=20,journal_async_commit
#cat /proc/mounts  | grep /mnt
 /dev/vdb /mnt ext4 rw,relatime,journal_checksum,journal_async_commit,commit=20,data=ordered 0 0

But options:"journal_checksum,journal_async_commit,commit=20,data=ordered" has
nothing with reality because there is no journal at all.

This patch disallow following options for journalless configurations:
 - journal_checksum
 - journal_async_commit
 - commit=%ld
 - data={writeback,ordered,journal}

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
---
 fs/ext4/ext4.h  |  3 +++
 fs/ext4/super.c | 31 +++++++++++++++++++++++++++++--
 2 files changed, 32 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b1d04aa7..750063f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1023,6 +1023,9 @@ struct ext4_inode_info {
 #define EXT4_MOUNT2_HURD_COMPAT		0x00000004 /* Support HURD-castrated
 						      file systems */
 
+#define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM	0x00000008 /* User explicitly
+						specified journal checksum */
+
 #define clear_opt(sb, opt)		EXT4_SB(sb)->s_mount_opt &= \
 						~EXT4_MOUNT_##opt
 #define set_opt(sb, opt)		EXT4_SB(sb)->s_mount_opt |= \
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index db4be39..04d0f1b3 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1384,10 +1384,10 @@ static const struct mount_opts {
 	{Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
 	 MOPT_EXT4_ONLY | MOPT_CLEAR},
 	{Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
-	 MOPT_EXT4_ONLY | MOPT_SET},
+	 MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
 	{Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
 				    EXT4_MOUNT_JOURNAL_CHECKSUM),
-	 MOPT_EXT4_ONLY | MOPT_SET},
+	 MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
 	{Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET},
 	{Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR},
 	{Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR},
@@ -1519,6 +1519,8 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
 	if (m->flags & MOPT_EXPLICIT) {
 		if (m->mount_opt & EXT4_MOUNT_DELALLOC) {
 			set_opt2(sb, EXPLICIT_DELALLOC);
+		} else if (m->mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) {
+			set_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM);
 		} else
 			return -1;
 	}
@@ -3677,6 +3679,31 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		       "suppressed and not mounted read-only");
 		goto failed_mount_wq;
 	} else {
+		/* Nojournal mode, all journal mount options are illegal */
+		if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) {
+			ext4_msg(sb, KERN_ERR, "can't mount with "
+				 "journal_checksum, fs mounted w/o journal");
+			goto failed_mount_wq;
+		}
+		if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
+			ext4_msg(sb, KERN_ERR, "can't mount with "
+				 "journal_async_commit, fs mounted w/o journal");
+			goto failed_mount_wq;
+		}
+		if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
+			ext4_msg(sb, KERN_ERR, "can't mount with "
+				 "commit=%lu, fs mounted w/o journal",
+				 sbi->s_commit_interval / HZ);
+			goto failed_mount_wq;
+		}
+		if (EXT4_MOUNT_DATA_FLAGS &
+		    (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) {
+			ext4_msg(sb, KERN_ERR, "can't mount with "
+				 "data=, fs mounted w/o journal");
+			goto failed_mount_wq;
+		}
+		sbi->s_def_mount_opt &= EXT4_MOUNT_JOURNAL_CHECKSUM;
+		clear_opt(sb, JOURNAL_CHECKSUM);
 		clear_opt(sb, DATA_FLAGS);
 		sbi->s_journal = NULL;
 		needs_recovery = 0;
-- 
cgit v1.1


From 16175039e6c6527453a23b4949c2a68e82750895 Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Mon, 19 Oct 2015 00:01:05 -0400
Subject: ext4: fix abs() usage in ext4_mb_check_group_pa

The ext4_fsblk_t type is a long long, which should not be used
with abs(), as is done in ext4_mb_check_group_pa().

This patch modifies ext4_mb_check_group_pa() to use abs64()
instead.

Signed-off-by: John Stultz <john.stultz@linaro.org>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/mballoc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 61eaf74..b4b3c1f 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3333,8 +3333,8 @@ ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
 		atomic_inc(&pa->pa_count);
 		return pa;
 	}
-	cur_distance = abs(goal_block - cpa->pa_pstart);
-	new_distance = abs(goal_block - pa->pa_pstart);
+	cur_distance = abs64(goal_block - cpa->pa_pstart);
+	new_distance = abs64(goal_block - pa->pa_pstart);
 
 	if (cur_distance <= new_distance)
 		return cpa;
-- 
cgit v1.1


From 5726b27b09cc92452b543764899a07e7c8037edd Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Tue, 13 Oct 2015 16:25:37 -0600
Subject: ext2: Add locking for DAX faults

Add locking to ensure that DAX faults are isolated from ext2 operations
that modify the data blocks allocation for an inode.  This is intended to
be analogous to the work being done in XFS by Dave Chinner:

http://www.spinics.net/lists/linux-fsdevel/msg90260.html

Compared with XFS the ext2 case is greatly simplified by the fact that ext2
already allocates and zeros new blocks before they are returned as part of
ext2_get_block(), so DAX doesn't need to worry about getting unmapped or
unwritten buffer heads.

This means that the only work we need to do in ext2 is to isolate the DAX
faults from inode block allocation changes.  I believe this just means that
we need to isolate the DAX faults from truncate operations.

The newly introduced dax_sem is intended to replicate the protection
offered by i_mmaplock in XFS.  In addition to truncate the i_mmaplock also
protects XFS operations like hole punching, fallocate down, extent
manipulation IOCTLS like xfs_ioc_space() and extent swapping.  Truncate is
the only one of these operations supported by ext2.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Jan Kara <jack@suse.com>
---
 fs/ext2/ext2.h  | 11 ++++++++
 fs/ext2/file.c  | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 fs/ext2/inode.c | 10 +++++++
 fs/ext2/super.c |  3 +++
 4 files changed, 104 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 8d15feb..4c69c94 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -684,6 +684,9 @@ struct ext2_inode_info {
 	struct rw_semaphore xattr_sem;
 #endif
 	rwlock_t i_meta_lock;
+#ifdef CONFIG_FS_DAX
+	struct rw_semaphore dax_sem;
+#endif
 
 	/*
 	 * truncate_mutex is for serialising ext2_truncate() against
@@ -699,6 +702,14 @@ struct ext2_inode_info {
 #endif
 };
 
+#ifdef CONFIG_FS_DAX
+#define dax_sem_down_write(ext2_inode)	down_write(&(ext2_inode)->dax_sem)
+#define dax_sem_up_write(ext2_inode)	up_write(&(ext2_inode)->dax_sem)
+#else
+#define dax_sem_down_write(ext2_inode)
+#define dax_sem_up_write(ext2_inode)
+#endif
+
 /*
  * Inode dynamic state flags
  */
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 1982c3f..11a42c5 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -27,27 +27,103 @@
 #include "acl.h"
 
 #ifdef CONFIG_FS_DAX
+/*
+ * The lock ordering for ext2 DAX fault paths is:
+ *
+ * mmap_sem (MM)
+ *   sb_start_pagefault (vfs, freeze)
+ *     ext2_inode_info->dax_sem
+ *       address_space->i_mmap_rwsem or page_lock (mutually exclusive in DAX)
+ *         ext2_inode_info->truncate_mutex
+ *
+ * The default page_lock and i_size verification done by non-DAX fault paths
+ * is sufficient because ext2 doesn't support hole punching.
+ */
 static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	return dax_fault(vma, vmf, ext2_get_block, NULL);
+	struct inode *inode = file_inode(vma->vm_file);
+	struct ext2_inode_info *ei = EXT2_I(inode);
+	int ret;
+
+	if (vmf->flags & FAULT_FLAG_WRITE) {
+		sb_start_pagefault(inode->i_sb);
+		file_update_time(vma->vm_file);
+	}
+	down_read(&ei->dax_sem);
+
+	ret = __dax_fault(vma, vmf, ext2_get_block, NULL);
+
+	up_read(&ei->dax_sem);
+	if (vmf->flags & FAULT_FLAG_WRITE)
+		sb_end_pagefault(inode->i_sb);
+	return ret;
 }
 
 static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
 						pmd_t *pmd, unsigned int flags)
 {
-	return dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block, NULL);
+	struct inode *inode = file_inode(vma->vm_file);
+	struct ext2_inode_info *ei = EXT2_I(inode);
+	int ret;
+
+	if (flags & FAULT_FLAG_WRITE) {
+		sb_start_pagefault(inode->i_sb);
+		file_update_time(vma->vm_file);
+	}
+	down_read(&ei->dax_sem);
+
+	ret = __dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block, NULL);
+
+	up_read(&ei->dax_sem);
+	if (flags & FAULT_FLAG_WRITE)
+		sb_end_pagefault(inode->i_sb);
+	return ret;
 }
 
 static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	return dax_mkwrite(vma, vmf, ext2_get_block, NULL);
+	struct inode *inode = file_inode(vma->vm_file);
+	struct ext2_inode_info *ei = EXT2_I(inode);
+	int ret;
+
+	sb_start_pagefault(inode->i_sb);
+	file_update_time(vma->vm_file);
+	down_read(&ei->dax_sem);
+
+	ret = __dax_mkwrite(vma, vmf, ext2_get_block, NULL);
+
+	up_read(&ei->dax_sem);
+	sb_end_pagefault(inode->i_sb);
+	return ret;
+}
+
+static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma,
+		struct vm_fault *vmf)
+{
+	struct inode *inode = file_inode(vma->vm_file);
+	struct ext2_inode_info *ei = EXT2_I(inode);
+	int ret = VM_FAULT_NOPAGE;
+	loff_t size;
+
+	sb_start_pagefault(inode->i_sb);
+	file_update_time(vma->vm_file);
+	down_read(&ei->dax_sem);
+
+	/* check that the faulting page hasn't raced with truncate */
+	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	if (vmf->pgoff >= size)
+		ret = VM_FAULT_SIGBUS;
+
+	up_read(&ei->dax_sem);
+	sb_end_pagefault(inode->i_sb);
+	return ret;
 }
 
 static const struct vm_operations_struct ext2_dax_vm_ops = {
 	.fault		= ext2_dax_fault,
 	.pmd_fault	= ext2_dax_pmd_fault,
 	.page_mkwrite	= ext2_dax_mkwrite,
-	.pfn_mkwrite	= dax_pfn_mkwrite,
+	.pfn_mkwrite	= ext2_dax_pfn_mkwrite,
 };
 
 static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index c60a248..0aa9bf6 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1085,6 +1085,7 @@ static void ext2_free_branches(struct inode *inode, __le32 *p, __le32 *q, int de
 		ext2_free_data(inode, p, q);
 }
 
+/* dax_sem must be held when calling this function */
 static void __ext2_truncate_blocks(struct inode *inode, loff_t offset)
 {
 	__le32 *i_data = EXT2_I(inode)->i_data;
@@ -1100,6 +1101,10 @@ static void __ext2_truncate_blocks(struct inode *inode, loff_t offset)
 	blocksize = inode->i_sb->s_blocksize;
 	iblock = (offset + blocksize-1) >> EXT2_BLOCK_SIZE_BITS(inode->i_sb);
 
+#ifdef CONFIG_FS_DAX
+	WARN_ON(!rwsem_is_locked(&ei->dax_sem));
+#endif
+
 	n = ext2_block_to_path(inode, iblock, offsets, NULL);
 	if (n == 0)
 		return;
@@ -1185,7 +1190,10 @@ static void ext2_truncate_blocks(struct inode *inode, loff_t offset)
 		return;
 	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
 		return;
+
+	dax_sem_down_write(EXT2_I(inode));
 	__ext2_truncate_blocks(inode, offset);
+	dax_sem_up_write(EXT2_I(inode));
 }
 
 static int ext2_setsize(struct inode *inode, loff_t newsize)
@@ -1213,8 +1221,10 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
 	if (error)
 		return error;
 
+	dax_sem_down_write(EXT2_I(inode));
 	truncate_setsize(inode, newsize);
 	__ext2_truncate_blocks(inode, newsize);
+	dax_sem_up_write(EXT2_I(inode));
 
 	inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
 	if (inode_needs_sync(inode)) {
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 900e19c..3a71cea 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -192,6 +192,9 @@ static void init_once(void *foo)
 	init_rwsem(&ei->xattr_sem);
 #endif
 	mutex_init(&ei->truncate_mutex);
+#ifdef CONFIG_FS_DAX
+	init_rwsem(&ei->dax_sem);
+#endif
 	inode_init_once(&ei->vfs_inode);
 }
 
-- 
cgit v1.1


From 1d373a0ef7a7bc08f95ca820c627e961fb21e188 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Mon, 19 Oct 2015 10:29:51 -0700
Subject: f2fs: flush dirty data for bmap

Users expect bmap will give allocated block addresses.
Let's play likewise ext4.

Reviewed-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/data.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 77dfc9e..b052e7c 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1667,12 +1667,13 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block)
 {
 	struct inode *inode = mapping->host;
 
-	/* we don't need to use inline_data strictly */
-	if (f2fs_has_inline_data(inode)) {
-		int err = f2fs_convert_inline_inode(inode);
-		if (err)
-			return err;
-	}
+	if (f2fs_has_inline_data(inode))
+		return 0;
+
+	/* make sure allocating whole blocks */
+	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
+		filemap_write_and_wait(mapping);
+
 	return generic_block_bmap(mapping, block, get_data_block_bmap);
 }
 
-- 
cgit v1.1


From 67f8cf3cee6f398d05de8333c04fea2ddb59c805 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Thu, 15 Oct 2015 11:34:49 -0700
Subject: f2fs: support fiemap for inline_data

There is a FIEMAP_EXTENT_INLINE_DATA, pointed out by Marc.

Reviewed-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/data.c   |  6 ++++++
 fs/f2fs/f2fs.h   |  2 ++
 fs/f2fs/inline.c | 36 ++++++++++++++++++++++++++++++++++++
 3 files changed, 44 insertions(+)

(limited to 'fs')

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index b052e7c..972eab7 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -771,6 +771,12 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	if (ret)
 		return ret;
 
+	if (f2fs_has_inline_data(inode)) {
+		ret = f2fs_inline_data_fiemap(inode, fieinfo, start, len);
+		if (ret != -EAGAIN)
+			return ret;
+	}
+
 	mutex_lock(&inode->i_mutex);
 
 	if (len >= isize) {
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index c3443da..9db5500 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -2054,6 +2054,8 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *, struct page *,
 bool f2fs_empty_inline_dir(struct inode *);
 int f2fs_read_inline_dir(struct file *, struct dir_context *,
 						struct f2fs_str *);
+int f2fs_inline_data_fiemap(struct inode *,
+		struct fiemap_extent_info *, __u64, __u64);
 
 /*
  * shrinker.c
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 3b18e23..bda7126 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -12,6 +12,7 @@
 #include <linux/f2fs_fs.h>
 
 #include "f2fs.h"
+#include "node.h"
 
 bool f2fs_may_inline_data(struct inode *inode)
 {
@@ -570,3 +571,38 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx,
 	f2fs_put_page(ipage, 1);
 	return 0;
 }
+
+int f2fs_inline_data_fiemap(struct inode *inode,
+		struct fiemap_extent_info *fieinfo, __u64 start, __u64 len)
+{
+	__u64 byteaddr, ilen;
+	__u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED |
+		FIEMAP_EXTENT_LAST;
+	struct node_info ni;
+	struct page *ipage;
+	int err = 0;
+
+	ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
+	if (IS_ERR(ipage))
+		return PTR_ERR(ipage);
+
+	if (!f2fs_has_inline_data(inode)) {
+		err = -EAGAIN;
+		goto out;
+	}
+
+	ilen = min_t(size_t, MAX_INLINE_DATA, i_size_read(inode));
+	if (start >= ilen)
+		goto out;
+	if (start + len < ilen)
+		ilen = start + len;
+	ilen -= start;
+
+	get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni);
+	byteaddr = (__u64)ni.blk_addr << inode->i_sb->s_blocksize_bits;
+	byteaddr += (char *)inline_data_addr(ipage) - (char *)F2FS_INODE(ipage);
+	err = fiemap_fill_next_extent(fieinfo, start, byteaddr, ilen, flags);
+out:
+	f2fs_put_page(ipage, 1);
+	return err;
+}
-- 
cgit v1.1


From 0f6925fa2907df58496cabc33fa4677c635e2223 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <quwenruo@cn.fujitsu.com>
Date: Wed, 14 Oct 2015 15:26:13 +0800
Subject: btrfs: Avoid truncate tailing page if fallocate range doesn't exceed
 inode size

Current code will always truncate tailing page if its alloc_start is
smaller than inode size.

For example, the file extent layout is like:
0	4K	8K	16K	32K
|<-----Extent A---------------->|
|<--Inode size: 18K---------->|

But if calling fallocate even for range [0,4K), it will cause btrfs to
re-truncate the range [16,32K), causing COW and a new extent.

0	4K	8K	16K	32K
|///////|	<- Fallocate call range
|<-----Extent A-------->|<--B-->|

The cause is quite easy, just a careless btrfs_truncate_inode() in a
else branch without extra judgment.
Fix it by add judgment on whether the fallocate range is beyond isize.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index b823fac9..8c6f247 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2584,7 +2584,7 @@ static long btrfs_fallocate(struct file *file, int mode,
 					alloc_start);
 		if (ret)
 			goto out;
-	} else {
+	} else if (offset + len > inode->i_size) {
 		/*
 		 * If we are fallocating from the end of the file onward we
 		 * need to zero out the end of the page if i_size lands in the
-- 
cgit v1.1


From 146aa8b1453bd8f1ff2304ffb71b4ee0eb9acdcc Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 21 Oct 2015 14:04:48 +0100
Subject: KEYS: Merge the type-specific data with the payload data

Merge the type-specific data with the payload data into one four-word chunk
as it seems pointless to keep them separate.

Use user_key_payload() for accessing the payloads of overloaded
user-defined keys.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: linux-cifs@vger.kernel.org
cc: ecryptfs@vger.kernel.org
cc: linux-ext4@vger.kernel.org
cc: linux-f2fs-devel@lists.sourceforge.net
cc: linux-nfs@vger.kernel.org
cc: ceph-devel@vger.kernel.org
cc: linux-ima-devel@lists.sourceforge.net
---
 fs/cifs/cifs_spnego.c         |  6 +++---
 fs/cifs/cifsacl.c             | 25 ++++++++++++-------------
 fs/cifs/connect.c             |  9 +++++----
 fs/cifs/sess.c                |  2 +-
 fs/cifs/smb2pdu.c             |  2 +-
 fs/ecryptfs/ecryptfs_kernel.h |  5 ++---
 fs/ext4/crypto_key.c          |  4 ++--
 fs/f2fs/crypto_key.c          |  4 ++--
 fs/fscache/object-list.c      |  4 ++--
 fs/nfs/nfs4idmap.c            |  4 ++--
 10 files changed, 32 insertions(+), 33 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index f4cf200..6908080 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -42,7 +42,7 @@ cifs_spnego_key_instantiate(struct key *key, struct key_preparsed_payload *prep)
 		goto error;
 
 	/* attach the data */
-	key->payload.data = payload;
+	key->payload.data[0] = payload;
 	ret = 0;
 
 error:
@@ -52,7 +52,7 @@ error:
 static void
 cifs_spnego_key_destroy(struct key *key)
 {
-	kfree(key->payload.data);
+	kfree(key->payload.data[0]);
 }
 
 
@@ -167,7 +167,7 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo)
 
 #ifdef CONFIG_CIFS_DEBUG2
 	if (cifsFYI && !IS_ERR(spnego_key)) {
-		struct cifs_spnego_msg *msg = spnego_key->payload.data;
+		struct cifs_spnego_msg *msg = spnego_key->payload.data[0];
 		cifs_dump_mem("SPNEGO reply blob:", msg->data, min(1024U,
 				msg->secblob_len + msg->sesskey_len));
 	}
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 1ea780b..3f93125 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -58,16 +58,15 @@ cifs_idmap_key_instantiate(struct key *key, struct key_preparsed_payload *prep)
 	 * dereference payload.data!
 	 */
 	if (prep->datalen <= sizeof(key->payload)) {
-		key->payload.value = 0;
-		memcpy(&key->payload.value, prep->data, prep->datalen);
-		key->datalen = prep->datalen;
-		return 0;
+		key->payload.data[0] = NULL;
+		memcpy(&key->payload, prep->data, prep->datalen);
+	} else {
+		payload = kmemdup(prep->data, prep->datalen, GFP_KERNEL);
+		if (!payload)
+			return -ENOMEM;
+		key->payload.data[0] = payload;
 	}
-	payload = kmemdup(prep->data, prep->datalen, GFP_KERNEL);
-	if (!payload)
-		return -ENOMEM;
 
-	key->payload.data = payload;
 	key->datalen = prep->datalen;
 	return 0;
 }
@@ -76,7 +75,7 @@ static inline void
 cifs_idmap_key_destroy(struct key *key)
 {
 	if (key->datalen > sizeof(key->payload))
-		kfree(key->payload.data);
+		kfree(key->payload.data[0]);
 }
 
 static struct key_type cifs_idmap_key_type = {
@@ -233,8 +232,8 @@ id_to_sid(unsigned int cid, uint sidtype, struct cifs_sid *ssid)
 	 * it could be.
 	 */
 	ksid = sidkey->datalen <= sizeof(sidkey->payload) ?
-		(struct cifs_sid *)&sidkey->payload.value :
-		(struct cifs_sid *)sidkey->payload.data;
+		(struct cifs_sid *)&sidkey->payload :
+		(struct cifs_sid *)sidkey->payload.data[0];
 
 	ksid_size = CIFS_SID_BASE_SIZE + (ksid->num_subauth * sizeof(__le32));
 	if (ksid_size > sidkey->datalen) {
@@ -307,14 +306,14 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid,
 	if (sidtype == SIDOWNER) {
 		kuid_t uid;
 		uid_t id;
-		memcpy(&id, &sidkey->payload.value, sizeof(uid_t));
+		memcpy(&id, &sidkey->payload.data[0], sizeof(uid_t));
 		uid = make_kuid(&init_user_ns, id);
 		if (uid_valid(uid))
 			fuid = uid;
 	} else {
 		kgid_t gid;
 		gid_t id;
-		memcpy(&id, &sidkey->payload.value, sizeof(gid_t));
+		memcpy(&id, &sidkey->payload.data[0], sizeof(gid_t));
 		gid = make_kgid(&init_user_ns, id);
 		if (gid_valid(gid))
 			fgid = gid;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 773f4dc..3f22285 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2325,13 +2325,14 @@ static int
 cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses)
 {
 	int rc = 0;
-	char *desc, *delim, *payload;
+	const char *delim, *payload;
+	char *desc;
 	ssize_t len;
 	struct key *key;
 	struct TCP_Server_Info *server = ses->server;
 	struct sockaddr_in *sa;
 	struct sockaddr_in6 *sa6;
-	struct user_key_payload *upayload;
+	const struct user_key_payload *upayload;
 
 	desc = kmalloc(CIFSCREDS_DESC_SIZE, GFP_KERNEL);
 	if (!desc)
@@ -2374,14 +2375,14 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses)
 	}
 
 	down_read(&key->sem);
-	upayload = key->payload.data;
+	upayload = user_key_payload(key);
 	if (IS_ERR_OR_NULL(upayload)) {
 		rc = upayload ? PTR_ERR(upayload) : -EINVAL;
 		goto out_key_put;
 	}
 
 	/* find first : in payload */
-	payload = (char *)upayload->data;
+	payload = upayload->data;
 	delim = strnchr(payload, upayload->datalen, ':');
 	cifs_dbg(FYI, "payload=%s\n", payload);
 	if (!delim) {
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index bce6fdc..59727e3 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -988,7 +988,7 @@ sess_auth_kerberos(struct sess_data *sess_data)
 		goto out;
 	}
 
-	msg = spnego_key->payload.data;
+	msg = spnego_key->payload.data[0];
 	/*
 	 * check version field to make sure that cifs.upcall is
 	 * sending us a response in an expected form
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index ce83e2e..52d14c9 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -660,7 +660,7 @@ ssetup_ntlmssp_authenticate:
 			goto ssetup_exit;
 		}
 
-		msg = spnego_key->payload.data;
+		msg = spnego_key->payload.data[0];
 		/*
 		 * check version field to make sure that cifs.upcall is
 		 * sending us a response in an expected form
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 5ba029e..7b39260 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -86,7 +86,7 @@ ecryptfs_get_encrypted_key_payload_data(struct key *key)
 {
 	if (key->type == &key_type_encrypted)
 		return (struct ecryptfs_auth_tok *)
-			(&((struct encrypted_key_payload *)key->payload.data)->payload_data);
+			(&((struct encrypted_key_payload *)key->payload.data[0])->payload_data);
 	else
 		return NULL;
 }
@@ -117,8 +117,7 @@ ecryptfs_get_key_payload_data(struct key *key)
 
 	auth_tok = ecryptfs_get_encrypted_key_payload_data(key);
 	if (!auth_tok)
-		return (struct ecryptfs_auth_tok *)
-			(((struct user_key_payload *)key->payload.data)->data);
+		return (struct ecryptfs_auth_tok *)user_key_payload(key)->data;
 	else
 		return auth_tok;
 }
diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c
index 1d510c1..5c52c79 100644
--- a/fs/ext4/crypto_key.c
+++ b/fs/ext4/crypto_key.c
@@ -121,7 +121,7 @@ int _ext4_get_encryption_info(struct inode *inode)
 	struct key *keyring_key = NULL;
 	struct ext4_encryption_key *master_key;
 	struct ext4_encryption_context ctx;
-	struct user_key_payload *ukp;
+	const struct user_key_payload *ukp;
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	struct crypto_ablkcipher *ctfm;
 	const char *cipher_str;
@@ -209,7 +209,7 @@ retry:
 	}
 	crypt_info->ci_keyring_key = keyring_key;
 	BUG_ON(keyring_key->type != &key_type_logon);
-	ukp = ((struct user_key_payload *)keyring_key->payload.data);
+	ukp = user_key_payload(keyring_key);
 	if (ukp->datalen != sizeof(struct ext4_encryption_key)) {
 		res = -EINVAL;
 		goto out;
diff --git a/fs/f2fs/crypto_key.c b/fs/f2fs/crypto_key.c
index 9f77de2..5de2d86 100644
--- a/fs/f2fs/crypto_key.c
+++ b/fs/f2fs/crypto_key.c
@@ -122,7 +122,7 @@ int _f2fs_get_encryption_info(struct inode *inode)
 	struct key *keyring_key = NULL;
 	struct f2fs_encryption_key *master_key;
 	struct f2fs_encryption_context ctx;
-	struct user_key_payload *ukp;
+	const struct user_key_payload *ukp;
 	struct crypto_ablkcipher *ctfm;
 	const char *cipher_str;
 	char raw_key[F2FS_MAX_KEY_SIZE];
@@ -199,7 +199,7 @@ retry:
 	}
 	crypt_info->ci_keyring_key = keyring_key;
 	BUG_ON(keyring_key->type != &key_type_logon);
-	ukp = ((struct user_key_payload *)keyring_key->payload.data);
+	ukp = user_key_payload(keyring_key);
 	if (ukp->datalen != sizeof(struct f2fs_encryption_key)) {
 		res = -EINVAL;
 		goto out;
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index 51dde81..6b028b7 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -316,7 +316,7 @@ static const struct seq_operations fscache_objlist_ops = {
 static void fscache_objlist_config(struct fscache_objlist_data *data)
 {
 #ifdef CONFIG_KEYS
-	struct user_key_payload *confkey;
+	const struct user_key_payload *confkey;
 	unsigned long config;
 	struct key *key;
 	const char *buf;
@@ -329,7 +329,7 @@ static void fscache_objlist_config(struct fscache_objlist_data *data)
 	config = 0;
 	rcu_read_lock();
 
-	confkey = key->payload.data;
+	confkey = user_key_payload(key);
 	buf = confkey->data;
 
 	for (len = confkey->datalen - 1; len >= 0; len--) {
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index 2e49022..5ba22c6 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -297,7 +297,7 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen,
 {
 	const struct cred *saved_cred;
 	struct key *rkey;
-	struct user_key_payload *payload;
+	const struct user_key_payload *payload;
 	ssize_t ret;
 
 	saved_cred = override_creds(id_resolver_cache);
@@ -316,7 +316,7 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen,
 	if (ret < 0)
 		goto out_up;
 
-	payload = rcu_dereference(rkey->payload.rcudata);
+	payload = user_key_payload(rkey);
 	if (IS_ERR_OR_NULL(payload)) {
 		ret = PTR_ERR(payload);
 		goto out_up;
-- 
cgit v1.1


From 549b39a9e7ef3eee2d0895530f277c012abc7b43 Mon Sep 17 00:00:00 2001
From: Geliang Tang <geliangtang@163.com>
Date: Tue, 20 Oct 2015 00:39:01 -0700
Subject: pstore: add vmalloc error check

If vmalloc fails, make write_pmsg return -ENOMEM.

Signed-off-by: Geliang Tang <geliangtang@163.com>
Acked-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 fs/pstore/pmsg.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c
index feb5dd2..5a2f05a 100644
--- a/fs/pstore/pmsg.c
+++ b/fs/pstore/pmsg.c
@@ -37,6 +37,8 @@ static ssize_t write_pmsg(struct file *file, const char __user *buf,
 	if (buffer_size > PMSG_MAX_BOUNCE_BUFFER_SIZE)
 		buffer_size = PMSG_MAX_BOUNCE_BUFFER_SIZE;
 	buffer = vmalloc(buffer_size);
+	if (!buffer)
+		return -ENOMEM;
 
 	mutex_lock(&pmsg_lock);
 	for (i = 0; i < count; ) {
-- 
cgit v1.1


From 1873041152d2e324e25ddeec5541b90c22c2dc0f Mon Sep 17 00:00:00 2001
From: Geliang Tang <geliangtang@163.com>
Date: Tue, 20 Oct 2015 00:39:02 -0700
Subject: pstore: add a helper function pstore_register_kmsg

Add a new wrapper function pstore_register_kmsg to keep the
consistency with other similar pstore_register_* functions.

Signed-off-by: Geliang Tang <geliangtang@163.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 fs/pstore/platform.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 791743d..1b11249 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -353,6 +353,11 @@ static struct kmsg_dumper pstore_dumper = {
 	.dump = pstore_dump,
 };
 
+static void pstore_register_kmsg(void)
+{
+	kmsg_dump_register(&pstore_dumper);
+}
+
 #ifdef CONFIG_PSTORE_CONSOLE
 static void pstore_console_write(struct console *con, const char *s, unsigned c)
 {
@@ -442,7 +447,7 @@ int pstore_register(struct pstore_info *psi)
 	if (pstore_is_mounted())
 		pstore_get_records(0);
 
-	kmsg_dump_register(&pstore_dumper);
+	pstore_register_kmsg();
 
 	if ((psi->flags & PSTORE_FLAGS_FRAGILE) == 0) {
 		pstore_register_console();
-- 
cgit v1.1


From 0d51e28a11ad428ac8545a3ec8724395fa1c71a7 Mon Sep 17 00:00:00 2001
From: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Date: Mon, 27 Jul 2015 15:26:43 +0530
Subject: Btrfs: btrfs_submit_bio_hook: Use btrfs_wq_endio_type values instead
 of integer constants

btrfs_submit_bio_hook() uses integer constants instead of values from "enum
btrfs_wq_endio_type". Fix this.

Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 611b66d..dc3a9fc 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1861,15 +1861,15 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 			  u64 bio_offset)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
+	enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
 	int ret = 0;
 	int skip_sum;
-	int metadata = 0;
 	int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
 
 	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 
 	if (btrfs_is_free_space_inode(inode))
-		metadata = 2;
+		metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
 
 	if (!(rw & REQ_WRITE)) {
 		ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
-- 
cgit v1.1


From ce0eac2a1d82eb841eae357bcef7ac39df131b21 Mon Sep 17 00:00:00 2001
From: Alexandru Moise <00moses.alexander00@gmail.com>
Date: Sun, 23 Aug 2015 16:01:42 +0000
Subject: btrfs: Fixed dsize and last_off declarations

The return values of btrfs_item_offset_nr and btrfs_item_size_nr are of
type u32. To avoid mixing signed and unsigned integers we should also
declare dsize and last_off to be of type u32.

Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: Alexandru Moise <00moses.alexander00@gmail.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 5f745ea..f553913 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -4940,8 +4940,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 {
 	struct extent_buffer *leaf;
 	struct btrfs_item *item;
-	int last_off;
-	int dsize = 0;
+	u32 last_off;
+	u32 dsize = 0;
 	int ret = 0;
 	int wret;
 	int i;
-- 
cgit v1.1


From 0412e58c6d62168643a285f99b2354fa97df6cf3 Mon Sep 17 00:00:00 2001
From: Alexandru Moise <00moses.alexander00@gmail.com>
Date: Mon, 24 Aug 2015 21:56:28 +0000
Subject: btrfs: Fixed declaration of old_len

old_len is used to store the return value of btrfs_item_size_nr().
The return value of btrfs_item_size_nr() is of type u32.
To improve code correctness and avoid mixing signed and unsigned
integers I've changed old_len to be of type u32 as well.

Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: Alexandru Moise <00moses.alexander00@gmail.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/root-tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 360a728..024ea4f 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -141,7 +141,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
 	int ret;
 	int slot;
 	unsigned long ptr;
-	int old_len;
+	u32 old_len;
 
 	path = btrfs_alloc_path();
 	if (!path)
-- 
cgit v1.1


From f2f767e7345dfe56102d6809f647ba38a238f718 Mon Sep 17 00:00:00 2001
From: Alexandru Moise <00moses.alexander00@gmail.com>
Date: Thu, 27 Aug 2015 23:53:45 +0000
Subject: btrfs: trimming some start_transaction() code away

Just call kmem_cache_zalloc() instead of calling kmem_cache_alloc().
We're just initializing most fields to 0, false and NULL later on
_anyway_, so to make the code mode readable and potentially gain
a bit of performance (completely untested claim), we should fill our
btrfs_trans_handle with zeros on allocation then just initialize
those five remaining fields (not counting the list_heads) as normal.

Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: Alexandru Moise <00moses.alexander00@gmail.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/transaction.c | 15 +--------------
 1 file changed, 1 insertion(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index a5b0644..c191606 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -502,7 +502,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
 			goto reserve_fail;
 	}
 again:
-	h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
+	h = kmem_cache_zalloc(btrfs_trans_handle_cachep, GFP_NOFS);
 	if (!h) {
 		ret = -ENOMEM;
 		goto alloc_fail;
@@ -543,23 +543,10 @@ again:
 
 	h->transid = cur_trans->transid;
 	h->transaction = cur_trans;
-	h->blocks_used = 0;
-	h->bytes_reserved = 0;
-	h->chunk_bytes_reserved = 0;
 	h->root = root;
-	h->delayed_ref_updates = 0;
 	h->use_count = 1;
-	h->adding_csums = 0;
-	h->block_rsv = NULL;
-	h->orig_rsv = NULL;
-	h->aborted = 0;
-	h->qgroup_reserved = 0;
-	h->delayed_ref_elem.seq = 0;
 	h->type = type;
-	h->allocating_chunk = false;
 	h->can_flush_pending_bgs = true;
-	h->reloc_reserved = false;
-	h->sync = false;
 	INIT_LIST_HEAD(&h->qgroup_ref_list);
 	INIT_LIST_HEAD(&h->new_bgs);
 	INIT_LIST_HEAD(&h->ordered);
-- 
cgit v1.1


From d7641a49a54f66e1a323d0de6b42caeee6d33aa5 Mon Sep 17 00:00:00 2001
From: Byongho Lee <bhlee.kernel@gmail.com>
Date: Tue, 1 Sep 2015 23:10:57 +0900
Subject: btrfs: replace unnecessary list_for_each_entry_safe to
 list_for_each_entry

There is no removing list element while iterating over list.
So, replace list_for_each_entry_safe to list_for_each_entry.

Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: Byongho Lee <bhlee.kernel@gmail.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ioctl.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 0adf542..9181e64 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2699,7 +2699,6 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
 {
 	struct btrfs_ioctl_fs_info_args *fi_args;
 	struct btrfs_device *device;
-	struct btrfs_device *next;
 	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
 	int ret = 0;
 
@@ -2711,7 +2710,7 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
 	fi_args->num_devices = fs_devices->num_devices;
 	memcpy(&fi_args->fsid, root->fs_info->fsid, sizeof(fi_args->fsid));
 
-	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
+	list_for_each_entry(device, &fs_devices->devices, dev_list) {
 		if (device->devid > fi_args->max_id)
 			fi_args->max_id = device->devid;
 	}
-- 
cgit v1.1


From 568b1c9cca82623af764ee6ea65dc41a7079171c Mon Sep 17 00:00:00 2001
From: Byongho Lee <bhlee.kernel@gmail.com>
Date: Tue, 1 Sep 2015 23:36:28 +0900
Subject: btrfs: remove unnecessary list_del

We can safely iterate whole list items, without using list_del macro.
So remove the list_del call.

Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: Byongho Lee <bhlee.kernel@gmail.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index dc3a9fc..c4a97dc 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2599,7 +2599,6 @@ static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
 		return;
 
 	list_for_each_entry_safe(old, tmp, &new->head, list) {
-		list_del(&old->list);
 		kfree(old);
 	}
 	kfree(new);
-- 
cgit v1.1


From a099d0fdb34f00cee346703036a0a90d267e77d7 Mon Sep 17 00:00:00 2001
From: Alexandru Moise <00moses.alexander00@gmail.com>
Date: Mon, 7 Sep 2015 17:24:37 +0300
Subject: btrfs: memset cur_trans->delayed_refs to zero

Use memset() to null out the btrfs_delayed_ref_root of
btrfs_transaction instead of setting all the members to 0 by hand.

Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: Alexandru Moise <00moses.alexander00@gmail.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/transaction.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index c191606..c2f84e2 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -236,15 +236,11 @@ loop:
 	cur_trans->start_time = get_seconds();
 	cur_trans->dirty_bg_run = 0;
 
+	memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs));
+
 	cur_trans->delayed_refs.href_root = RB_ROOT;
 	cur_trans->delayed_refs.dirty_extent_root = RB_ROOT;
 	atomic_set(&cur_trans->delayed_refs.num_entries, 0);
-	cur_trans->delayed_refs.num_heads_ready = 0;
-	cur_trans->delayed_refs.pending_csums = 0;
-	cur_trans->delayed_refs.num_heads = 0;
-	cur_trans->delayed_refs.flushing = 0;
-	cur_trans->delayed_refs.run_delayed_start = 0;
-	cur_trans->delayed_refs.qgroup_to_skip = 0;
 
 	/*
 	 * although the tree mod log is per file system and not per transaction,
-- 
cgit v1.1


From e9cf439f0dc824cca6b2dc0dbb14e17f3f80b7f1 Mon Sep 17 00:00:00 2001
From: Alexandru Moise <00moses.alexander00@gmail.com>
Date: Wed, 9 Sep 2015 00:18:50 +0000
Subject: btrfs: use a single if() statement for one outcome in get_block_rsv()

Rather than have three separate if() statements for the same outcome
we should just OR them together in the same if() statement.

Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: Alexandru Moise <00moses.alexander00@gmail.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent-tree.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 601d7d4..8a1759b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4891,13 +4891,9 @@ static struct btrfs_block_rsv *get_block_rsv(
 {
 	struct btrfs_block_rsv *block_rsv = NULL;
 
-	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
-		block_rsv = trans->block_rsv;
-
-	if (root == root->fs_info->csum_root && trans->adding_csums)
-		block_rsv = trans->block_rsv;
-
-	if (root == root->fs_info->uuid_root)
+	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
+	    (root == root->fs_info->csum_root && trans->adding_csums) ||
+	     (root == root->fs_info->uuid_root))
 		block_rsv = trans->block_rsv;
 
 	if (!block_rsv)
-- 
cgit v1.1


From af902047508d3619499cbbc2f7c4c5e16428b2e3 Mon Sep 17 00:00:00 2001
From: Zhao Lei <zhaolei@cn.fujitsu.com>
Date: Tue, 15 Sep 2015 21:08:06 +0800
Subject: btrfs: Move btrfs_raid_array to public

This array is used to record attributes of each raid type,
make it public, and many functions will benifit with this array.

For example, num_tolerated_disk_barrier_failures(), we can
avoid complex conditions in this function, and get raid attribute
simply by accessing above array.

It can also make code logic simple, reduce duplication code, and
increase maintainability.

Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 128 +++++++++++++++++++++++++++++------------------------
 fs/btrfs/volumes.h |   4 ++
 2 files changed, 73 insertions(+), 59 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 6fc73586..2a65d2d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -42,6 +42,75 @@
 #include "dev-replace.h"
 #include "sysfs.h"
 
+const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
+	[BTRFS_RAID_RAID10] = {
+		.sub_stripes	= 2,
+		.dev_stripes	= 1,
+		.devs_max	= 0,	/* 0 == as many as possible */
+		.devs_min	= 4,
+		.devs_increment	= 2,
+		.ncopies	= 2,
+	},
+	[BTRFS_RAID_RAID1] = {
+		.sub_stripes	= 1,
+		.dev_stripes	= 1,
+		.devs_max	= 2,
+		.devs_min	= 2,
+		.devs_increment	= 2,
+		.ncopies	= 2,
+	},
+	[BTRFS_RAID_DUP] = {
+		.sub_stripes	= 1,
+		.dev_stripes	= 2,
+		.devs_max	= 1,
+		.devs_min	= 1,
+		.devs_increment	= 1,
+		.ncopies	= 2,
+	},
+	[BTRFS_RAID_RAID0] = {
+		.sub_stripes	= 1,
+		.dev_stripes	= 1,
+		.devs_max	= 0,
+		.devs_min	= 2,
+		.devs_increment	= 1,
+		.ncopies	= 1,
+	},
+	[BTRFS_RAID_SINGLE] = {
+		.sub_stripes	= 1,
+		.dev_stripes	= 1,
+		.devs_max	= 1,
+		.devs_min	= 1,
+		.devs_increment	= 1,
+		.ncopies	= 1,
+	},
+	[BTRFS_RAID_RAID5] = {
+		.sub_stripes	= 1,
+		.dev_stripes	= 1,
+		.devs_max	= 0,
+		.devs_min	= 2,
+		.devs_increment	= 1,
+		.ncopies	= 2,
+	},
+	[BTRFS_RAID_RAID6] = {
+		.sub_stripes	= 1,
+		.dev_stripes	= 1,
+		.devs_max	= 0,
+		.devs_min	= 3,
+		.devs_increment	= 1,
+		.ncopies	= 3,
+	},
+};
+
+const u64 const btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {
+	[BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10,
+	[BTRFS_RAID_RAID1]  = BTRFS_BLOCK_GROUP_RAID1,
+	[BTRFS_RAID_DUP]    = BTRFS_BLOCK_GROUP_DUP,
+	[BTRFS_RAID_RAID0]  = BTRFS_BLOCK_GROUP_RAID0,
+	[BTRFS_RAID_SINGLE] = 0,
+	[BTRFS_RAID_RAID5]  = BTRFS_BLOCK_GROUP_RAID5,
+	[BTRFS_RAID_RAID6]  = BTRFS_BLOCK_GROUP_RAID6,
+};
+
 static int init_first_rw_device(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
 				struct btrfs_device *device);
@@ -4285,65 +4354,6 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
 	return 0;
 }
 
-static const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
-	[BTRFS_RAID_RAID10] = {
-		.sub_stripes	= 2,
-		.dev_stripes	= 1,
-		.devs_max	= 0,	/* 0 == as many as possible */
-		.devs_min	= 4,
-		.devs_increment	= 2,
-		.ncopies	= 2,
-	},
-	[BTRFS_RAID_RAID1] = {
-		.sub_stripes	= 1,
-		.dev_stripes	= 1,
-		.devs_max	= 2,
-		.devs_min	= 2,
-		.devs_increment	= 2,
-		.ncopies	= 2,
-	},
-	[BTRFS_RAID_DUP] = {
-		.sub_stripes	= 1,
-		.dev_stripes	= 2,
-		.devs_max	= 1,
-		.devs_min	= 1,
-		.devs_increment	= 1,
-		.ncopies	= 2,
-	},
-	[BTRFS_RAID_RAID0] = {
-		.sub_stripes	= 1,
-		.dev_stripes	= 1,
-		.devs_max	= 0,
-		.devs_min	= 2,
-		.devs_increment	= 1,
-		.ncopies	= 1,
-	},
-	[BTRFS_RAID_SINGLE] = {
-		.sub_stripes	= 1,
-		.dev_stripes	= 1,
-		.devs_max	= 1,
-		.devs_min	= 1,
-		.devs_increment	= 1,
-		.ncopies	= 1,
-	},
-	[BTRFS_RAID_RAID5] = {
-		.sub_stripes	= 1,
-		.dev_stripes	= 1,
-		.devs_max	= 0,
-		.devs_min	= 2,
-		.devs_increment	= 1,
-		.ncopies	= 2,
-	},
-	[BTRFS_RAID_RAID6] = {
-		.sub_stripes	= 1,
-		.dev_stripes	= 1,
-		.devs_max	= 0,
-		.devs_min	= 3,
-		.devs_increment	= 1,
-		.ncopies	= 3,
-	},
-};
-
 static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
 {
 	/* TODO allow them to set a preferred stripe size */
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 2ca784a..75d6083 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -338,6 +338,10 @@ struct btrfs_raid_attr {
 	int ncopies;		/* how many copies to data has */
 };
 
+extern const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES];
+
+extern const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES];
+
 struct map_lookup {
 	u64 type;
 	int io_align;
-- 
cgit v1.1


From 8789f4fe608922f484369382a8e507ca12df7f4e Mon Sep 17 00:00:00 2001
From: Zhao Lei <zhaolei@cn.fujitsu.com>
Date: Tue, 15 Sep 2015 21:08:07 +0800
Subject: btrfs: use btrfs_raid_array for
 btrfs_get_num_tolerated_disk_barrier_failures()

btrfs_raid_array[] is used to define all raid attributes, use it
to get tolerated_failures in btrfs_get_num_tolerated_disk_barrier_failures(),
instead of complex condition in function.

It can make code simple and auto-support other possible raid-type in
future.

Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c | 35 ++++++++++++++++++++++-------------
 fs/btrfs/volumes.c |  7 +++++++
 fs/btrfs/volumes.h |  1 +
 3 files changed, 30 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1e60d00..322fd6d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3449,22 +3449,31 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 
 int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
 {
-	if ((flags & (BTRFS_BLOCK_GROUP_DUP |
-		      BTRFS_BLOCK_GROUP_RAID0 |
-		      BTRFS_AVAIL_ALLOC_BIT_SINGLE)) ||
-	    ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0))
-		return 0;
+	int raid_type;
+	int min_tolerated = INT_MAX;
 
-	if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
-		     BTRFS_BLOCK_GROUP_RAID5 |
-		     BTRFS_BLOCK_GROUP_RAID10))
-		return 1;
+	if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 ||
+	    (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE))
+		min_tolerated = min(min_tolerated,
+				    btrfs_raid_array[BTRFS_RAID_SINGLE].
+				    tolerated_failures);
 
-	if (flags & BTRFS_BLOCK_GROUP_RAID6)
-		return 2;
+	for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
+		if (raid_type == BTRFS_RAID_SINGLE)
+			continue;
+		if (!(flags & btrfs_raid_group[raid_type]))
+			continue;
+		min_tolerated = min(min_tolerated,
+				    btrfs_raid_array[raid_type].
+				    tolerated_failures);
+	}
 
-	pr_warn("BTRFS: unknown raid type: %llu\n", flags);
-	return 0;
+	if (min_tolerated == INT_MAX) {
+		pr_warn("BTRFS: unknown raid flag: %llu\n", flags);
+		min_tolerated = 0;
+	}
+
+	return min_tolerated;
 }
 
 int btrfs_calc_num_tolerated_disk_barrier_failures(
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 2a65d2d..7b53390 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -48,6 +48,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
 		.dev_stripes	= 1,
 		.devs_max	= 0,	/* 0 == as many as possible */
 		.devs_min	= 4,
+		.tolerated_failures = 1,
 		.devs_increment	= 2,
 		.ncopies	= 2,
 	},
@@ -56,6 +57,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
 		.dev_stripes	= 1,
 		.devs_max	= 2,
 		.devs_min	= 2,
+		.tolerated_failures = 1,
 		.devs_increment	= 2,
 		.ncopies	= 2,
 	},
@@ -64,6 +66,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
 		.dev_stripes	= 2,
 		.devs_max	= 1,
 		.devs_min	= 1,
+		.tolerated_failures = 0,
 		.devs_increment	= 1,
 		.ncopies	= 2,
 	},
@@ -72,6 +75,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
 		.dev_stripes	= 1,
 		.devs_max	= 0,
 		.devs_min	= 2,
+		.tolerated_failures = 0,
 		.devs_increment	= 1,
 		.ncopies	= 1,
 	},
@@ -80,6 +84,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
 		.dev_stripes	= 1,
 		.devs_max	= 1,
 		.devs_min	= 1,
+		.tolerated_failures = 0,
 		.devs_increment	= 1,
 		.ncopies	= 1,
 	},
@@ -88,6 +93,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
 		.dev_stripes	= 1,
 		.devs_max	= 0,
 		.devs_min	= 2,
+		.tolerated_failures = 1,
 		.devs_increment	= 1,
 		.ncopies	= 2,
 	},
@@ -96,6 +102,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
 		.dev_stripes	= 1,
 		.devs_max	= 0,
 		.devs_min	= 3,
+		.tolerated_failures = 2,
 		.devs_increment	= 1,
 		.ncopies	= 3,
 	},
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 75d6083..04ff0ef 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -334,6 +334,7 @@ struct btrfs_raid_attr {
 	int dev_stripes;	/* stripes per dev */
 	int devs_max;		/* max devs to use */
 	int devs_min;		/* min devs needed */
+	int tolerated_failures; /* max tolerated fail devs */
 	int devs_increment;	/* ndevs has to be a multiple of this */
 	int ncopies;		/* how many copies to data has */
 };
-- 
cgit v1.1


From 9c170b2644610a62b107c12a007e065d4456798d Mon Sep 17 00:00:00 2001
From: Zhao Lei <zhaolei@cn.fujitsu.com>
Date: Tue, 15 Sep 2015 21:08:08 +0800
Subject: btrfs: use btrfs_raid_array in btrfs_reduce_alloc_profile

btrfs_raid_array[] holds attributes of all raid types.

Use btrfs_raid_array[].devs_min is best way for request
in btrfs_reduce_alloc_profile(), instead of use complex
condition of each raid types.

Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent-tree.c | 48 ++++++++++++++++++++++--------------------------
 1 file changed, 22 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 8a1759b..2b29800 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3822,7 +3822,8 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 {
 	u64 num_devices = root->fs_info->fs_devices->rw_devices;
 	u64 target;
-	u64 tmp;
+	u64 raid_type;
+	u64 allowed = 0;
 
 	/*
 	 * see if restripe for this chunk_type is in progress, if so
@@ -3840,31 +3841,26 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 	spin_unlock(&root->fs_info->balance_lock);
 
 	/* First, mask out the RAID levels which aren't possible */
-	if (num_devices == 1)
-		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
-			   BTRFS_BLOCK_GROUP_RAID5);
-	if (num_devices < 3)
-		flags &= ~BTRFS_BLOCK_GROUP_RAID6;
-	if (num_devices < 4)
-		flags &= ~BTRFS_BLOCK_GROUP_RAID10;
-
-	tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
-		       BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
-		       BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
-	flags &= ~tmp;
-
-	if (tmp & BTRFS_BLOCK_GROUP_RAID6)
-		tmp = BTRFS_BLOCK_GROUP_RAID6;
-	else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
-		tmp = BTRFS_BLOCK_GROUP_RAID5;
-	else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
-		tmp = BTRFS_BLOCK_GROUP_RAID10;
-	else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
-		tmp = BTRFS_BLOCK_GROUP_RAID1;
-	else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
-		tmp = BTRFS_BLOCK_GROUP_RAID0;
-
-	return extended_to_chunk(flags | tmp);
+	for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
+		if (num_devices >= btrfs_raid_array[raid_type].devs_min)
+			allowed |= btrfs_raid_group[raid_type];
+	}
+	allowed &= flags;
+
+	if (allowed & BTRFS_BLOCK_GROUP_RAID6)
+		allowed = BTRFS_BLOCK_GROUP_RAID6;
+	else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
+		allowed = BTRFS_BLOCK_GROUP_RAID5;
+	else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
+		allowed = BTRFS_BLOCK_GROUP_RAID10;
+	else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
+		allowed = BTRFS_BLOCK_GROUP_RAID1;
+	else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
+		allowed = BTRFS_BLOCK_GROUP_RAID0;
+
+	flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+	return extended_to_chunk(flags | allowed);
 }
 
 static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags)
-- 
cgit v1.1


From bb78915203c0c1b07096e50d12f6e8fe54666f4f Mon Sep 17 00:00:00 2001
From: Shan Hai <shan.hai@windriver.com>
Date: Mon, 21 Sep 2015 11:40:57 +0800
Subject: btrfs/file.c: remove an unsed varialbe first_index

The commit b37392ea86761 ("Btrfs: cleanup unnecessary parameter
and variant of prepare_pages()") makes it redundant.

Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: Shan Hai <haishan.bai@hotmail.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/file.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index b823fac9..b6695c4 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1469,7 +1469,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 	u64 release_bytes = 0;
 	u64 lockstart;
 	u64 lockend;
-	unsigned long first_index;
 	size_t num_written = 0;
 	int nrptrs;
 	int ret = 0;
@@ -1485,8 +1484,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 	if (!pages)
 		return -ENOMEM;
 
-	first_index = pos >> PAGE_CACHE_SHIFT;
-
 	while (iov_iter_count(i) > 0) {
 		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
 		size_t write_bytes = min(iov_iter_count(i),
-- 
cgit v1.1


From bdcd3c97d1cb4b8de8b57caab56eb0bb44d1f0f5 Mon Sep 17 00:00:00 2001
From: Alexandru Moise <00moses.alexander00@gmail.com>
Date: Tue, 22 Sep 2015 20:02:25 +0000
Subject: btrfs: cleanup btrfs_balance profile validity checks

Improve readability by generalizing the profile validity checks.

Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: Alexandru Moise <00moses.alexander00@gmail.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 7b53390..59538a78 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3542,6 +3542,15 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info)
 	atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
 }
 
+/* Non-zero return value signifies invalidity */
+static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg,
+		u64 allowed)
+{
+	return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+		(!alloc_profile_is_valid(bctl_arg->target, 1) ||
+		 (bctl_arg->target & ~allowed)));
+}
+
 /*
  * Should be called with both balance and volume mutexes held
  */
@@ -3599,27 +3608,21 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 	if (num_devices > 3)
 		allowed |= (BTRFS_BLOCK_GROUP_RAID10 |
 			    BTRFS_BLOCK_GROUP_RAID6);
-	if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
-	    (!alloc_profile_is_valid(bctl->data.target, 1) ||
-	     (bctl->data.target & ~allowed))) {
+	if (validate_convert_profile(&bctl->data, allowed)) {
 		btrfs_err(fs_info, "unable to start balance with target "
 			   "data profile %llu",
 		       bctl->data.target);
 		ret = -EINVAL;
 		goto out;
 	}
-	if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
-	    (!alloc_profile_is_valid(bctl->meta.target, 1) ||
-	     (bctl->meta.target & ~allowed))) {
+	if (validate_convert_profile(&bctl->meta, allowed)) {
 		btrfs_err(fs_info,
 			   "unable to start balance with target metadata profile %llu",
 		       bctl->meta.target);
 		ret = -EINVAL;
 		goto out;
 	}
-	if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
-	    (!alloc_profile_is_valid(bctl->sys.target, 1) ||
-	     (bctl->sys.target & ~allowed))) {
+	if (validate_convert_profile(&bctl->sys, allowed)) {
 		btrfs_err(fs_info,
 			   "unable to start balance with target system profile %llu",
 		       bctl->sys.target);
-- 
cgit v1.1


From 5aed1dd8b458aa63aa9b7c1c5dd78f54de143c6f Mon Sep 17 00:00:00 2001
From: Alexandru Moise <00moses.alexander00@gmail.com>
Date: Tue, 22 Sep 2015 20:59:15 +0000
Subject: btrfs: change num_items type from u64 to unsigned int

The value of num_items that start_transaction() ultimately
always takes is a small one, so a 64 bit integer is overkill.

Also change num_items for btrfs_start_transaction() and
btrfs_start_transaction_lflush() as well.

Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: Alexandru Moise <00moses.alexander00@gmail.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/transaction.c | 9 +++++----
 fs/btrfs/transaction.h | 5 +++--
 2 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index c2f84e2..0c04245 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -443,8 +443,8 @@ static inline bool need_reserve_reloc_root(struct btrfs_root *root)
 }
 
 static struct btrfs_trans_handle *
-start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
-		  enum btrfs_reserve_flush_enum flush)
+start_transaction(struct btrfs_root *root, unsigned int num_items,
+		  unsigned int type, enum btrfs_reserve_flush_enum flush)
 {
 	struct btrfs_trans_handle *h;
 	struct btrfs_transaction *cur_trans;
@@ -586,14 +586,15 @@ reserve_fail:
 }
 
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
-						   int num_items)
+						   unsigned int num_items)
 {
 	return start_transaction(root, num_items, TRANS_START,
 				 BTRFS_RESERVE_FLUSH_ALL);
 }
 
 struct btrfs_trans_handle *btrfs_start_transaction_lflush(
-					struct btrfs_root *root, int num_items)
+					struct btrfs_root *root,
+					unsigned int num_items)
 {
 	return start_transaction(root, num_items, TRANS_START,
 				 BTRFS_RESERVE_FLUSH_LIMIT);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index a994bb0..db6bfd9 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -185,9 +185,10 @@ static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans)
 int btrfs_end_transaction(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
-						   int num_items);
+						   unsigned int num_items);
 struct btrfs_trans_handle *btrfs_start_transaction_lflush(
-					struct btrfs_root *root, int num_items);
+					struct btrfs_root *root,
+					unsigned int num_items);
 struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root);
-- 
cgit v1.1


From 6e4d6fa12ceb22f87d49e043e5a7636abdac970f Mon Sep 17 00:00:00 2001
From: Alexandru Moise <00moses.alexander00@gmail.com>
Date: Tue, 22 Sep 2015 21:00:07 +0000
Subject: btrfs: declare rsv_count as unsigned int instead of int

rsv_count ultimately gets passed to start_transaction() which
now takes an unsigned int as its num_items parameter.
The value of rsv_count should always be positive so declare it
as being unsigned.

Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: Alexandru Moise <00moses.alexander00@gmail.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index b6695c4..7031e96 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2263,7 +2263,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 	u64 drop_end;
 	int ret = 0;
 	int err = 0;
-	int rsv_count;
+	unsigned int rsv_count;
 	bool same_page;
 	bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
 	u64 ino_size;
-- 
cgit v1.1


From 8cd1e73111f95ca915f37ed534bbfbdf24fc1ca8 Mon Sep 17 00:00:00 2001
From: Geliang Tang <geliangtang@163.com>
Date: Sun, 4 Oct 2015 17:05:32 +0800
Subject: btrfs: fix a comment typo

Just fix a typo in the code comment.

Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: Geliang Tang <geliangtang@163.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/free-space-cache.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index abe3a66..7332995 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1215,7 +1215,7 @@ out:
  * @offset - the offset for the key we'll insert
  *
  * This function writes out a free space cache struct to disk for quick recovery
- * on mount.  This will return 0 if it was successfull in writing the cache out,
+ * on mount.  This will return 0 if it was successful in writing the cache out,
  * or an errno if it was not.
  */
 static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
-- 
cgit v1.1


From 619ed39242d901339c09faa1b52dad47b0476f6f Mon Sep 17 00:00:00 2001
From: Byongho Lee <bhlee.kernel@gmail.com>
Date: Thu, 8 Oct 2015 20:49:34 +0900
Subject: btrfs: cleanup iterating over prop_handlers array

This patch eliminates the last item of prop_handlers array which is used
to check end of array and instead uses ARRAY_SIZE macro.
Though this is a very tiny optimization, using ARRAY_SIZE macro is a
good practice to iterate array.

Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: Byongho Lee <bhlee.kernel@gmail.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/props.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index dca137b..f9e6023 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -49,18 +49,16 @@ static struct prop_handler prop_handlers[] = {
 		.extract = prop_compression_extract,
 		.inheritable = 1
 	},
-	{
-		.xattr_name = NULL
-	}
 };
 
 void __init btrfs_props_init(void)
 {
-	struct prop_handler *p;
+	int i;
 
 	hash_init(prop_handlers_ht);
 
-	for (p = &prop_handlers[0]; p->xattr_name; p++) {
+	for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) {
+		struct prop_handler *p = &prop_handlers[i];
 		u64 h = btrfs_name_hash(p->xattr_name, strlen(p->xattr_name));
 
 		hash_add(prop_handlers_ht, &p->node, h);
@@ -301,15 +299,16 @@ static int inherit_props(struct btrfs_trans_handle *trans,
 			 struct inode *inode,
 			 struct inode *parent)
 {
-	const struct prop_handler *h;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret;
+	int i;
 
 	if (!test_bit(BTRFS_INODE_HAS_PROPS,
 		      &BTRFS_I(parent)->runtime_flags))
 		return 0;
 
-	for (h = &prop_handlers[0]; h->xattr_name; h++) {
+	for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) {
+		const struct prop_handler *h = &prop_handlers[i];
 		const char *value;
 		u64 num_bytes;
 
-- 
cgit v1.1


From d91876496bcf2236efb75ef12378c964bbe8c970 Mon Sep 17 00:00:00 2001
From: Byongho Lee <bhlee.kernel@gmail.com>
Date: Wed, 14 Oct 2015 14:05:24 +0900
Subject: btrfs: compress: put variables defined per compress type in struct to
 make cache friendly

Below variables are defined per compress type.
 - struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES]
 - spinlock_t comp_workspace_lock[BTRFS_COMPRESS_TYPES]
 - int comp_num_workspace[BTRFS_COMPRESS_TYPES]
 - atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES]
 - wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES]

BTW, while accessing one compress type of these variables, the next or
before address is other compress types of it.
So this patch puts these variables in a struct to make cache friendly.

Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: Byongho Lee <bhlee.kernel@gmail.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/compression.c | 94 ++++++++++++++++++++++++++------------------------
 1 file changed, 48 insertions(+), 46 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 57ee8ca..b524e02 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -745,11 +745,13 @@ out:
 	return ret;
 }
 
-static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES];
-static spinlock_t comp_workspace_lock[BTRFS_COMPRESS_TYPES];
-static int comp_num_workspace[BTRFS_COMPRESS_TYPES];
-static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES];
-static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES];
+static struct {
+	struct list_head idle_ws;
+	spinlock_t ws_lock;
+	int num_ws;
+	atomic_t alloc_ws;
+	wait_queue_head_t ws_wait;
+} btrfs_comp_ws[BTRFS_COMPRESS_TYPES];
 
 static const struct btrfs_compress_op * const btrfs_compress_op[] = {
 	&btrfs_zlib_compress,
@@ -761,10 +763,10 @@ void __init btrfs_init_compress(void)
 	int i;
 
 	for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
-		INIT_LIST_HEAD(&comp_idle_workspace[i]);
-		spin_lock_init(&comp_workspace_lock[i]);
-		atomic_set(&comp_alloc_workspace[i], 0);
-		init_waitqueue_head(&comp_workspace_wait[i]);
+		INIT_LIST_HEAD(&btrfs_comp_ws[i].idle_ws);
+		spin_lock_init(&btrfs_comp_ws[i].ws_lock);
+		atomic_set(&btrfs_comp_ws[i].alloc_ws, 0);
+		init_waitqueue_head(&btrfs_comp_ws[i].ws_wait);
 	}
 }
 
@@ -778,38 +780,38 @@ static struct list_head *find_workspace(int type)
 	int cpus = num_online_cpus();
 	int idx = type - 1;
 
-	struct list_head *idle_workspace	= &comp_idle_workspace[idx];
-	spinlock_t *workspace_lock		= &comp_workspace_lock[idx];
-	atomic_t *alloc_workspace		= &comp_alloc_workspace[idx];
-	wait_queue_head_t *workspace_wait	= &comp_workspace_wait[idx];
-	int *num_workspace			= &comp_num_workspace[idx];
+	struct list_head *idle_ws	= &btrfs_comp_ws[idx].idle_ws;
+	spinlock_t *ws_lock		= &btrfs_comp_ws[idx].ws_lock;
+	atomic_t *alloc_ws		= &btrfs_comp_ws[idx].alloc_ws;
+	wait_queue_head_t *ws_wait	= &btrfs_comp_ws[idx].ws_wait;
+	int *num_ws			= &btrfs_comp_ws[idx].num_ws;
 again:
-	spin_lock(workspace_lock);
-	if (!list_empty(idle_workspace)) {
-		workspace = idle_workspace->next;
+	spin_lock(ws_lock);
+	if (!list_empty(idle_ws)) {
+		workspace = idle_ws->next;
 		list_del(workspace);
-		(*num_workspace)--;
-		spin_unlock(workspace_lock);
+		(*num_ws)--;
+		spin_unlock(ws_lock);
 		return workspace;
 
 	}
-	if (atomic_read(alloc_workspace) > cpus) {
+	if (atomic_read(alloc_ws) > cpus) {
 		DEFINE_WAIT(wait);
 
-		spin_unlock(workspace_lock);
-		prepare_to_wait(workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
-		if (atomic_read(alloc_workspace) > cpus && !*num_workspace)
+		spin_unlock(ws_lock);
+		prepare_to_wait(ws_wait, &wait, TASK_UNINTERRUPTIBLE);
+		if (atomic_read(alloc_ws) > cpus && !*num_ws)
 			schedule();
-		finish_wait(workspace_wait, &wait);
+		finish_wait(ws_wait, &wait);
 		goto again;
 	}
-	atomic_inc(alloc_workspace);
-	spin_unlock(workspace_lock);
+	atomic_inc(alloc_ws);
+	spin_unlock(ws_lock);
 
 	workspace = btrfs_compress_op[idx]->alloc_workspace();
 	if (IS_ERR(workspace)) {
-		atomic_dec(alloc_workspace);
-		wake_up(workspace_wait);
+		atomic_dec(alloc_ws);
+		wake_up(ws_wait);
 	}
 	return workspace;
 }
@@ -821,27 +823,27 @@ again:
 static void free_workspace(int type, struct list_head *workspace)
 {
 	int idx = type - 1;
-	struct list_head *idle_workspace	= &comp_idle_workspace[idx];
-	spinlock_t *workspace_lock		= &comp_workspace_lock[idx];
-	atomic_t *alloc_workspace		= &comp_alloc_workspace[idx];
-	wait_queue_head_t *workspace_wait	= &comp_workspace_wait[idx];
-	int *num_workspace			= &comp_num_workspace[idx];
-
-	spin_lock(workspace_lock);
-	if (*num_workspace < num_online_cpus()) {
-		list_add(workspace, idle_workspace);
-		(*num_workspace)++;
-		spin_unlock(workspace_lock);
+	struct list_head *idle_ws	= &btrfs_comp_ws[idx].idle_ws;
+	spinlock_t *ws_lock		= &btrfs_comp_ws[idx].ws_lock;
+	atomic_t *alloc_ws		= &btrfs_comp_ws[idx].alloc_ws;
+	wait_queue_head_t *ws_wait	= &btrfs_comp_ws[idx].ws_wait;
+	int *num_ws			= &btrfs_comp_ws[idx].num_ws;
+
+	spin_lock(ws_lock);
+	if (*num_ws < num_online_cpus()) {
+		list_add(workspace, idle_ws);
+		(*num_ws)++;
+		spin_unlock(ws_lock);
 		goto wake;
 	}
-	spin_unlock(workspace_lock);
+	spin_unlock(ws_lock);
 
 	btrfs_compress_op[idx]->free_workspace(workspace);
-	atomic_dec(alloc_workspace);
+	atomic_dec(alloc_ws);
 wake:
 	smp_mb();
-	if (waitqueue_active(workspace_wait))
-		wake_up(workspace_wait);
+	if (waitqueue_active(ws_wait))
+		wake_up(ws_wait);
 }
 
 /*
@@ -853,11 +855,11 @@ static void free_workspaces(void)
 	int i;
 
 	for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
-		while (!list_empty(&comp_idle_workspace[i])) {
-			workspace = comp_idle_workspace[i].next;
+		while (!list_empty(&btrfs_comp_ws[i].idle_ws)) {
+			workspace = btrfs_comp_ws[i].idle_ws.next;
 			list_del(workspace);
 			btrfs_compress_op[i]->free_workspace(workspace);
-			atomic_dec(&comp_alloc_workspace[i]);
+			atomic_dec(&btrfs_comp_ws[i].alloc_ws);
 		}
 	}
 }
-- 
cgit v1.1


From 0b8d8ce0297c4376d50318692e0aa53a3aaed90c Mon Sep 17 00:00:00 2001
From: Luis de Bethencourt <luisbg@osg.samsung.com>
Date: Tue, 20 Oct 2015 14:56:22 +0100
Subject: btrfs: check-integrity: Fix returned errno codes

check-integrity is using -1 instead of the -ENOMEM defined macro to
specify that a buffer allocation failed. Since the error number is
propagated, the caller will get a -EPERM which is the wrong error
condition.

Also, the smatch tool complains with the following warnings:
btrfsic_process_superblock() warn: returning -1 instead of -ENOMEM is sloppy
btrfsic_read_block() warn: returning -1 instead of -ENOMEM is sloppy

Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: Luis de Bethencourt <luisbg@osg.samsung.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/check-integrity.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 541fbfa..9cacd06 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -667,7 +667,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
 	selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS);
 	if (NULL == selected_super) {
 		printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
-		return -1;
+		return -ENOMEM;
 	}
 
 	list_for_each_entry(device, dev_head, dev_list) {
@@ -1660,7 +1660,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
 					  sizeof(*block_ctx->pagev)) *
 					 num_pages, GFP_NOFS);
 	if (!block_ctx->mem_to_free)
-		return -1;
+		return -ENOMEM;
 	block_ctx->datav = block_ctx->mem_to_free;
 	block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages);
 	for (i = 0; i < num_pages; i++) {
-- 
cgit v1.1


From ddd664f4478a4aaf0ac67a4297e33f1992bc9be2 Mon Sep 17 00:00:00 2001
From: Luis de Bethencourt <luisbg@osg.samsung.com>
Date: Tue, 20 Oct 2015 14:56:23 +0100
Subject: btrfs: reada: Fix returned errno code

reada is using -1 instead of the -ENOMEM defined macro to specify that
a buffer allocation failed. Since the error number is propagated, the
caller will get a -EPERM which is the wrong error condition.

Also, updating the caller to return the exact value from
reada_add_block.

Smatch tool warning:
reada_add_block() warn: returning -1 instead of -ENOMEM is sloppy

Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: Luis de Bethencourt <luisbg@osg.samsung.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/reada.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 4645cd1..619f929 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -569,7 +569,7 @@ static int reada_add_block(struct reada_control *rc, u64 logical,
 	rec = kzalloc(sizeof(*rec), GFP_NOFS);
 	if (!rec) {
 		reada_extent_put(root->fs_info, re);
-		return -1;
+		return -ENOMEM;
 	}
 
 	rec->rc = rc;
@@ -918,6 +918,7 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
 	u64 start;
 	u64 generation;
 	int level;
+	int ret;
 	struct extent_buffer *node;
 	static struct btrfs_key max_key = {
 		.objectid = (u64)-1,
@@ -943,9 +944,10 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
 	generation = btrfs_header_generation(node);
 	free_extent_buffer(node);
 
-	if (reada_add_block(rc, start, &max_key, level, generation)) {
+	ret = reada_add_block(rc, start, &max_key, level, generation);
+	if (ret) {
 		kfree(rc);
-		return ERR_PTR(-ENOMEM);
+		return ERR_PTR(ret);
 	}
 
 	reada_start_machine(root->fs_info);
-- 
cgit v1.1


From 25520d55cdb6ee289abc68f553d364d22478ff54 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Wed, 21 Oct 2015 13:19:49 -0400
Subject: block: Inline blk_integrity in struct gendisk

Up until now the_integrity profile has been dynamically allocated and
attached to struct gendisk after the disk has been made active.

This causes problems because NVMe devices need to register the profile
prior to the partition table being read due to a mandatory metadata
buffer requirement. In addition, DM goes through hoops to deal with
preallocating, but not initializing integrity profiles.

Since the integrity profile is small (4 bytes + a pointer), Christoph
suggested moving it to struct gendisk proper. This requires several
changes:

 - Moving the blk_integrity definition to genhd.h.

 - Inlining blk_integrity in struct gendisk.

 - Removing the dynamic allocation code.

 - Adding helper functions which allow gendisk to set up and tear down
   the integrity sysfs dir when a disk is added/deleted.

 - Adding a blk_integrity_revalidate() callback for updating the stable
   pages bdi setting.

 - The calls that depend on whether a device has an integrity profile or
   not now key off of the bi->profile pointer.

 - Simplifying the integrity support routines in DM (Mike Snitzer).

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Reported-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/block_dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 073bb57..0a793c7 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1075,7 +1075,7 @@ int revalidate_disk(struct gendisk *disk)
 
 	if (disk->fops->revalidate_disk)
 		ret = disk->fops->revalidate_disk(disk);
-
+	blk_integrity_revalidate(disk);
 	bdev = bdget_disk(disk, 0);
 	if (!bdev)
 		return ret;
-- 
cgit v1.1


From c646619355d196293daffdbb4fd877c8f5048e49 Mon Sep 17 00:00:00 2001
From: Li RongQing <roy.qing.li@gmail.com>
Date: Thu, 24 Sep 2015 10:19:09 +0800
Subject: nfsroot: make nfsroot to accept the 1024 bytes long directory name

although NFS_MAXPATHLEN is defined to 1024, nfs client hopes to accept
a 1024 byte path, but nfs_root_parms is limited to 256, and the nfs path
will truncated when a user inputs nfs path from kernel cmdline

enlarge nfs_root_parms to 1024, to make it accept the 1024 bytes long
directory name, since nfs_root_parms is defined as _initdata, it will
be released after system bootup

Signed-off-by: Li RongQing <roy.qing.li@gmail.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfsroot.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 9bc9f04..89a15db 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -90,7 +90,7 @@
 #define NFS_DEF_OPTIONS		"vers=2,udp,rsize=4096,wsize=4096"
 
 /* Parameters passed from the kernel command line */
-static char nfs_root_parms[256] __initdata = "";
+static char nfs_root_parms[NFS_MAXPATHLEN + 1] __initdata = "";
 
 /* Text-based mount options passed to super.c */
 static char nfs_root_options[256] __initdata = NFS_DEF_OPTIONS;
-- 
cgit v1.1


From d4e2ce096101bad9ca5bad3c27488905f91e30cb Mon Sep 17 00:00:00 2001
From: Kinglong Mee <kinglongmee@gmail.com>
Date: Thu, 24 Sep 2015 20:55:21 +0800
Subject: NFS: Get rid of the unneeded addr stored in callback arguments

Commit c36fca52f5 "NFS refactor nfs_find_client and reference client
across callback processing" has store clp in cb_process_state
which is set in cb_sequence.

So that, it's unneeded to store address pointer in any callback arguments.

Signed-off-by: Kinglong Mee <kinglongmee@gmail.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/callback.h     | 5 -----
 fs/nfs/callback_xdr.c | 5 -----
 2 files changed, 10 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 84326e9..0a590f0 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -61,7 +61,6 @@ struct cb_compound_hdr_res {
 };
 
 struct cb_getattrargs {
-	struct sockaddr *addr;
 	struct nfs_fh fh;
 	uint32_t bitmap[2];
 };
@@ -76,7 +75,6 @@ struct cb_getattrres {
 };
 
 struct cb_recallargs {
-	struct sockaddr *addr;
 	struct nfs_fh fh;
 	nfs4_stateid stateid;
 	uint32_t truncate;
@@ -134,7 +132,6 @@ extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation,
 #define RCA4_TYPE_MASK_ALL 0xf31f
 
 struct cb_recallanyargs {
-	struct sockaddr	*craa_addr;
 	uint32_t	craa_objs_to_keep;
 	uint32_t	craa_type_mask;
 };
@@ -144,7 +141,6 @@ extern __be32 nfs4_callback_recallany(struct cb_recallanyargs *args,
 					struct cb_process_state *cps);
 
 struct cb_recallslotargs {
-	struct sockaddr	*crsa_addr;
 	uint32_t	crsa_target_highest_slotid;
 };
 extern __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args,
@@ -152,7 +148,6 @@ extern __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args,
 					 struct cb_process_state *cps);
 
 struct cb_layoutrecallargs {
-	struct sockaddr		*cbl_addr;
 	uint32_t		cbl_recall_type;
 	uint32_t		cbl_layout_type;
 	uint32_t		cbl_layoutchanged;
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 6b1697a..e87d7f0 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -198,7 +198,6 @@ static __be32 decode_getattr_args(struct svc_rqst *rqstp, struct xdr_stream *xdr
 	status = decode_fh(xdr, &args->fh);
 	if (unlikely(status != 0))
 		goto out;
-	args->addr = svc_addr(rqstp);
 	status = decode_bitmap(xdr, args->bitmap);
 out:
 	dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
@@ -210,7 +209,6 @@ static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr,
 	__be32 *p;
 	__be32 status;
 
-	args->addr = svc_addr(rqstp);
 	status = decode_stateid(xdr, &args->stateid);
 	if (unlikely(status != 0))
 		goto out;
@@ -236,7 +234,6 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
 	__be32 status = 0;
 	uint32_t iomode;
 
-	args->cbl_addr = svc_addr(rqstp);
 	p = read_buf(xdr, 4 * sizeof(uint32_t));
 	if (unlikely(p == NULL)) {
 		status = htonl(NFS4ERR_BADXDR);
@@ -500,7 +497,6 @@ static __be32 decode_recallany_args(struct svc_rqst *rqstp,
 	uint32_t bitmap[2];
 	__be32 *p, status;
 
-	args->craa_addr = svc_addr(rqstp);
 	p = read_buf(xdr, 4);
 	if (unlikely(p == NULL))
 		return htonl(NFS4ERR_BADXDR);
@@ -519,7 +515,6 @@ static __be32 decode_recallslot_args(struct svc_rqst *rqstp,
 {
 	__be32 *p;
 
-	args->crsa_addr = svc_addr(rqstp);
 	p = read_buf(xdr, 4);
 	if (unlikely(p == NULL))
 		return htonl(NFS4ERR_BADXDR);
-- 
cgit v1.1


From 8c163d8e5ad2bd7982904bbe568706e1b0bbf60a Mon Sep 17 00:00:00 2001
From: Kinglong Mee <kinglongmee@gmail.com>
Date: Thu, 24 Sep 2015 20:55:59 +0800
Subject: NFS: Remove the left global variable nfs_callback_tcpport

Commit bbe0a3aa4e22 "NFS: make nfs_callback_tcpport per network context" has
make nfs_callback_tcpport per network, but left the global nfs_callback_tcpport,
remove it.

Signed-off-by: Kinglong Mee <kinglongmee@gmail.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/callback.h | 1 -
 fs/nfs/super.c    | 2 --
 2 files changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 0a590f0..cbcc903 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -204,6 +204,5 @@ extern int nfs4_set_callback_sessionid(struct nfs_client *clp);
 #define NFS41_BC_MAX_CALLBACKS 1
 
 extern unsigned int nfs_callback_set_tcpport;
-extern unsigned short nfs_callback_tcpport;
 
 #endif /* __LINUX_FS_NFS_CALLBACK_H */
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 383a027..f126828 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2816,7 +2816,6 @@ out_invalid_transport_udp:
  * NFS client for backwards compatibility
  */
 unsigned int nfs_callback_set_tcpport;
-unsigned short nfs_callback_tcpport;
 /* Default cache timeout is 10 minutes */
 unsigned int nfs_idmap_cache_timeout = 600;
 /* Turn off NFSv4 uid/gid mapping when using AUTH_SYS */
@@ -2827,7 +2826,6 @@ char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN] = "";
 bool recover_lost_locks = false;
 
 EXPORT_SYMBOL_GPL(nfs_callback_set_tcpport);
-EXPORT_SYMBOL_GPL(nfs_callback_tcpport);
 EXPORT_SYMBOL_GPL(nfs_idmap_cache_timeout);
 EXPORT_SYMBOL_GPL(nfs4_disable_idmapping);
 EXPORT_SYMBOL_GPL(max_session_slots);
-- 
cgit v1.1


From f765bf762bb57d8bda825e5440a4801232cd835f Mon Sep 17 00:00:00 2001
From: Kinglong Mee <kinglongmee@gmail.com>
Date: Thu, 24 Sep 2015 20:56:29 +0800
Subject: NFS: Remove the left function defines in callback.h

Commit 778be232a207 "NFS do not find client in NFSv4 pg_authenticate" has remove
the define and using of nfs4_set_callback_sessionid(), and
commit 36281caa839f "NFSv4: Further clean-ups of delegation stateid validation"
has update the checking of stateid, and move the code to nfs4proc.c.

This patch remove those function defines left in callback.h

Signed-off-by: Kinglong Mee <kinglongmee@gmail.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/callback.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index cbcc903..ff8195b 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -117,9 +117,6 @@ extern __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
 				       struct cb_sequenceres *res,
 				       struct cb_process_state *cps);
 
-extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation,
-					     const nfs4_stateid *stateid);
-
 #define RCA4_TYPE_MASK_RDATA_DLG	0
 #define RCA4_TYPE_MASK_WDATA_DLG	1
 #define RCA4_TYPE_MASK_DIR_DLG         2
@@ -191,9 +188,6 @@ extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
 #if IS_ENABLED(CONFIG_NFS_V4)
 extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt);
 extern void nfs_callback_down(int minorversion, struct net *net);
-extern int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation,
-					    const nfs4_stateid *stateid);
-extern int nfs4_set_callback_sessionid(struct nfs_client *clp);
 #endif /* CONFIG_NFS_V4 */
 /*
  * nfs41: Callbacks are expected to not cause substantial latency,
-- 
cgit v1.1


From 39de493e8801812cae076a02d84b4f80b88f94b9 Mon Sep 17 00:00:00 2001
From: Kinglong Mee <kinglongmee@gmail.com>
Date: Thu, 24 Sep 2015 20:57:03 +0800
Subject: NFS: Remove unneeded NFS_DEBUG checking before define NFSDBG_FACILITY

It's not needed to checking NFS_DEBUG before define NFSDBG_FACILITY, remove it.

Signed-off-by: Kinglong Mee <kinglongmee@gmail.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/callback_proc.c | 2 --
 fs/nfs/mount_clnt.c    | 4 +---
 2 files changed, 1 insertion(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index b85cf7a..807eb6e 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -17,9 +17,7 @@
 #include "nfs4session.h"
 #include "nfs4trace.h"
 
-#ifdef NFS_DEBUG
 #define NFSDBG_FACILITY NFSDBG_CALLBACK
-#endif
 
 __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
 			     struct cb_getattrres *res,
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 99a4528..09b1900 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -16,9 +16,7 @@
 #include <linux/nfs_fs.h>
 #include "internal.h"
 
-#ifdef NFS_DEBUG
-# define NFSDBG_FACILITY	NFSDBG_MOUNT
-#endif
+#define NFSDBG_FACILITY	NFSDBG_MOUNT
 
 /*
  * Defined by RFC 1094, section A.3; and RFC 1813, section 5.1.4
-- 
cgit v1.1


From 590184a6cea32c1f864a8e614f44e35a81a53fe0 Mon Sep 17 00:00:00 2001
From: Kinglong Mee <kinglongmee@gmail.com>
Date: Thu, 24 Sep 2015 20:57:37 +0800
Subject: NFS: Use NFS4_MAX_SESSIONID_LEN directly for decode/encode sessionid

It's no need to define a temporary variables for NFS4_MAX_SESSIONID_LEN.

Signed-off-by: Kinglong Mee <kinglongmee@gmail.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/callback_xdr.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index e87d7f0..09aeb9a 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -380,13 +380,12 @@ static __be32 decode_sessionid(struct xdr_stream *xdr,
 				 struct nfs4_sessionid *sid)
 {
 	__be32 *p;
-	int len = NFS4_MAX_SESSIONID_LEN;
 
-	p = read_buf(xdr, len);
+	p = read_buf(xdr, NFS4_MAX_SESSIONID_LEN);
 	if (unlikely(p == NULL))
 		return htonl(NFS4ERR_RESOURCE);
 
-	memcpy(sid->data, p, len);
+	memcpy(sid->data, p, NFS4_MAX_SESSIONID_LEN);
 	return 0;
 }
 
@@ -679,13 +678,12 @@ static __be32 encode_sessionid(struct xdr_stream *xdr,
 				 const struct nfs4_sessionid *sid)
 {
 	__be32 *p;
-	int len = NFS4_MAX_SESSIONID_LEN;
 
-	p = xdr_reserve_space(xdr, len);
+	p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN);
 	if (unlikely(p == NULL))
 		return htonl(NFS4ERR_RESOURCE);
 
-	memcpy(p, sid, len);
+	memcpy(p, sid, NFS4_MAX_SESSIONID_LEN);
 	return 0;
 }
 
-- 
cgit v1.1


From 45724e8a5b2a69b9524fd16ff73345fab9aae279 Mon Sep 17 00:00:00 2001
From: Kinglong Mee <kinglongmee@gmail.com>
Date: Thu, 24 Sep 2015 20:57:58 +0800
Subject: NFS: Fix bad defines of callback response maxsize

As CB_OP_TAGLEN_MAXSZ, all XXX_MAXSZ should be defined as bit.
Each operation should not cantains CB_OP_TAGLEN_MAXSZ.

Signed-off-by: Kinglong Mee <kinglongmee@gmail.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/callback_xdr.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 09aeb9a..9f0f0f6 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -18,19 +18,21 @@
 #include "internal.h"
 #include "nfs4session.h"
 
-#define CB_OP_TAGLEN_MAXSZ	(512)
-#define CB_OP_HDR_RES_MAXSZ	(2 + CB_OP_TAGLEN_MAXSZ)
-#define CB_OP_GETATTR_BITMAP_MAXSZ	(4)
-#define CB_OP_GETATTR_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ + \
-				CB_OP_GETATTR_BITMAP_MAXSZ + \
-				2 + 2 + 3 + 3)
-#define CB_OP_RECALL_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ)
+#define CB_OP_TAGLEN_MAXSZ		(512)
+#define CB_OP_HDR_RES_MAXSZ		(2 * 4) // opcode, status
+#define CB_OP_GETATTR_BITMAP_MAXSZ	(4 * 4) // bitmap length, 3 bitmaps
+#define CB_OP_GETATTR_RES_MAXSZ		(CB_OP_HDR_RES_MAXSZ + \
+					 CB_OP_GETATTR_BITMAP_MAXSZ + \
+					 /* change, size, ctime, mtime */\
+					 (2 + 2 + 3 + 3) * 4)
+#define CB_OP_RECALL_RES_MAXSZ		(CB_OP_HDR_RES_MAXSZ)
 
 #if defined(CONFIG_NFS_V4_1)
 #define CB_OP_LAYOUTRECALL_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ)
 #define CB_OP_DEVICENOTIFY_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ)
 #define CB_OP_SEQUENCE_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ + \
-					4 + 1 + 3)
+					 NFS4_MAX_SESSIONID_LEN + \
+					 (1 + 3) * 4) // seqid, 3 slotids
 #define CB_OP_RECALLANY_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ)
 #define CB_OP_RECALLSLOT_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ)
 #endif /* CONFIG_NFS_V4_1 */
-- 
cgit v1.1


From 403889c0399c01a12877e3736ae1e96c9ded27be Mon Sep 17 00:00:00 2001
From: Kinglong Mee <kinglongmee@gmail.com>
Date: Thu, 24 Sep 2015 20:58:16 +0800
Subject: NFS: Fix bad checking of max taglen in callback request

The taglen should be checked with CB_OP_TAGLEN_MAXSZ directly.

Signed-off-by: Kinglong Mee <kinglongmee@gmail.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/callback_xdr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 9f0f0f6..4ad39fe 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -159,7 +159,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound
 	if (unlikely(status != 0))
 		return status;
 	/* We do not like overly long tags! */
-	if (hdr->taglen > CB_OP_TAGLEN_MAXSZ - 12) {
+	if (hdr->taglen > CB_OP_TAGLEN_MAXSZ) {
 		printk("NFS: NFSv4 CALLBACK %s: client sent tag of length %u\n",
 				__func__, hdr->taglen);
 		return htonl(NFS4ERR_RESOURCE);
-- 
cgit v1.1


From e0a63c0bfcc6f8283377258b5d892f88abd913c1 Mon Sep 17 00:00:00 2001
From: Kinglong Mee <kinglongmee@gmail.com>
Date: Thu, 24 Sep 2015 20:58:38 +0800
Subject: NFS: Return directly if encode_sessionid fail

encode_sessionid() may return error, nfs needs process the return value.

Signed-off-by: Kinglong Mee <kinglongmee@gmail.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/callback_xdr.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 4ad39fe..646cdac 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -699,7 +699,9 @@ static __be32 encode_cb_sequence_res(struct svc_rqst *rqstp,
 	if (unlikely(status != 0))
 		goto out;
 
-	encode_sessionid(xdr, &res->csr_sessionid);
+	status = encode_sessionid(xdr, &res->csr_sessionid);
+	if (status)
+		goto out;
 
 	p = xdr_reserve_space(xdr, 4 * sizeof(uint32_t));
 	if (unlikely(p == NULL))
-- 
cgit v1.1


From 15ae2c7bdc9a5a46999319b88a465a64d265dc40 Mon Sep 17 00:00:00 2001
From: Kinglong Mee <kinglongmee@gmail.com>
Date: Fri, 16 Oct 2015 17:22:50 +0800
Subject: nfs/blocklayout: Fix bad using of page offset in bl_read_pagelist

Blocklayout uses file offset for the read-back page's offset of first writing,
it's definitely wrong, it writes data to bad address of page that cause userspace
application segment fault. It must be the page base stored in header->args.pgbase.

Also, the pg_offset has no influence with isect and extent length.

Note: The offset of the non-first page is always zero.

Ps: A test program will segment fault at read() as,
#define _GNU_SOURCE

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>

int main(int argc, char **argv)
{
        char buf[2049];
        char *filename = NULL;
        int fd = -1;

        if (argc < 2) {
                printf("Usage: %s filename\n", argv[0]);
                return 0;
        }

        filename = argv[1];
        fd = open(filename, O_RDONLY | O_DIRECT);
        if (fd < 0) {
                printf("Open %s fail: %m\n", filename);
                return 1;
        }

        lseek(fd, 2048, SEEK_SET);
        if (read(fd, buf, sizeof(buf) - 1) != (sizeof(buf) - 1))
                printf("Read 4096 bityes data from %s fail: %m\n", filename);
out:
        close(fd);
        return 0;
}

Signed-off-by: Kinglong Mee <kinglongmee@gmail.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/blocklayout/blocklayout.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 9cd4eb3..ddd0138 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -229,7 +229,7 @@ bl_read_pagelist(struct nfs_pgio_header *header)
 	struct parallel_io *par;
 	loff_t f_offset = header->args.offset;
 	size_t bytes_left = header->args.count;
-	unsigned int pg_offset, pg_len;
+	unsigned int pg_offset = header->args.pgbase, pg_len;
 	struct page **pages = header->args.pages;
 	int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
 	const bool is_dio = (header->dreq != NULL);
@@ -262,7 +262,6 @@ bl_read_pagelist(struct nfs_pgio_header *header)
 			extent_length = be.be_length - (isect - be.be_f_offset);
 		}
 
-		pg_offset = f_offset & ~PAGE_CACHE_MASK;
 		if (is_dio) {
 			if (pg_offset + bytes_left > PAGE_CACHE_SIZE)
 				pg_len = PAGE_CACHE_SIZE - pg_offset;
@@ -273,9 +272,6 @@ bl_read_pagelist(struct nfs_pgio_header *header)
 			pg_len = PAGE_CACHE_SIZE;
 		}
 
-		isect += (pg_offset >> SECTOR_SHIFT);
-		extent_length -= (pg_offset >> SECTOR_SHIFT);
-
 		if (is_hole(&be)) {
 			bio = bl_submit_bio(READ, bio);
 			/* Fill hole w/ zeroes w/o accessing device */
@@ -301,6 +297,7 @@ bl_read_pagelist(struct nfs_pgio_header *header)
 		extent_length -= (pg_len >> SECTOR_SHIFT);
 		f_offset += pg_len;
 		bytes_left -= pg_len;
+		pg_offset = 0;
 	}
 	if ((isect << SECTOR_SHIFT) >= header->inode->i_size) {
 		header->res.eof = 1;
-- 
cgit v1.1


From f8417b481cce2bed4744fda733f2ff22278bd7ce Mon Sep 17 00:00:00 2001
From: Kinglong Mee <kinglongmee@gmail.com>
Date: Fri, 16 Oct 2015 17:23:29 +0800
Subject: NFSv4.1/pnfs: Retry through MDS when getting bad length of data

If non rpc-based layout driver return bad length of data, nfs retries
by calling rpc_restart_call_prepare() that cause an NULL reference panic.

This patch lets nfs retry through MDS for non rpc-based layout driver
return bad length of data.

[13034.883329] BUG: unable to handle kernel NULL pointer dereference at           (null)
[13034.884902] IP: [<ffffffffa00db372>] rpc_restart_call_prepare+0x62/0x90 [sunrpc]
[13034.886558] PGD 0
[13034.888126] Oops: 0000 [#1] KASAN
[13034.889710] Modules linked in: blocklayoutdriver(OE) nfsv4(OE) nfs(OE) fscache(E) nfsd(OE) xfs libcrc32c coretemp btrfs crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel ppdev vmw_balloon auth_rpcgss shpchp nfs_acl lockd vmw_vmci parport_pc xor raid6_pq grace parport sunrpc i2c_piix4 vmwgfx drm_kms_helper ttm drm mptspi e1000 serio_raw scsi_transport_spi mptscsih mptbase ata_generic pata_acpi [last unloaded: fscache]
[13034.898260] CPU: 0 PID: 10112 Comm: kworker/0:1 Tainted: G           OE   4.3.0-rc5+ #279
[13034.899932] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/02/2015
[13034.903342] Workqueue: events bl_read_cleanup [blocklayoutdriver]
[13034.905059] task: ffff88006a9148c0 ti: ffff880035e90000 task.ti: ffff880035e90000
[13034.906827] RIP: 0010:[<ffffffffa00db372>]  [<ffffffffa00db372>] rpc_restart_call_prepare+0x62/0x90 [sunrpc]
[13034.910522] RSP: 0018:ffff880035e97b58  EFLAGS: 00010282
[13034.912378] RAX: fffffbfff04a5a94 RBX: ffff880068fe4858 RCX: 0000000000000003
[13034.914339] RDX: dffffc0000000000 RSI: 0000000000000003 RDI: 0000000000000282
[13034.916236] RBP: ffff880035e97b68 R08: 0000000000000001 R09: 0000000000000001
[13034.918229] R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000
[13034.920007] R13: ffff880068fe4858 R14: ffff880068fe4a60 R15: 0000000000001000
[13034.921845] FS:  0000000000000000(0000) GS:ffffffff82247000(0000) knlGS:0000000000000000
[13034.923645] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[13034.925525] CR2: 0000000000000000 CR3: 00000000063dd000 CR4: 00000000001406f0
[13034.932808] Stack:
[13034.934813]  ffff880068fe4780 0000000000001000 ffff880035e97ba8 ffffffffa08800d2
[13034.936675]  ffffffffa088029d ffff880068fe4780 ffff880068fe4858 ffffffffa089c0a0
[13034.938593]  ffff880068fe47e0 ffff88005d59faf0 ffff880035e97be0 ffffffffa087e08f
[13034.940454] Call Trace:
[13034.942388]  [<ffffffffa08800d2>] nfs_readpage_result+0x112/0x200 [nfs]
[13034.944317]  [<ffffffffa088029d>] ? nfs_readpage_done+0xdd/0x160 [nfs]
[13034.946267]  [<ffffffffa087e08f>] nfs_pgio_result+0x9f/0x120 [nfs]
[13034.948166]  [<ffffffffa09266cc>] pnfs_ld_read_done+0x7c/0x1e0 [nfsv4]
[13034.950247]  [<ffffffffa03b07ee>] bl_read_cleanup+0x2e/0x60 [blocklayoutdriver]
[13034.952156]  [<ffffffff810ebf62>] process_one_work+0x412/0x870
[13034.954102]  [<ffffffff810ebe84>] ? process_one_work+0x334/0x870
[13034.955949]  [<ffffffff810ebb50>] ? queue_delayed_work_on+0x40/0x40
[13034.957985]  [<ffffffff810ec441>] worker_thread+0x81/0x6a0
[13034.959817]  [<ffffffff810ec3c0>] ? process_one_work+0x870/0x870
[13034.961785]  [<ffffffff810f43bd>] kthread+0x17d/0x1a0
[13034.963544]  [<ffffffff810f4240>] ? kthread_create_on_node+0x330/0x330
[13034.965479]  [<ffffffff81100428>] ? finish_task_switch+0x88/0x220
[13034.967223]  [<ffffffff810f4240>] ? kthread_create_on_node+0x330/0x330
[13034.968929]  [<ffffffff81b6ae5f>] ret_from_fork+0x3f/0x70
[13034.970534]  [<ffffffff810f4240>] ? kthread_create_on_node+0x330/0x330
[13034.972176] Code: c7 43 50 40 84 0d a0 e8 3d fe 1c e1 48 8d 7b 58 c7 83 e4 00 00 00 00 00 00 00 e8 ca fe 1c e1 4c 8b 63 58 4c 89 e7 e8 be fe 1c e1 <49> 83 3c 24 00 74 12 48 c7 43 50 f0 a2 0e a0 b8 01 00 00 00 5b
[13034.977148] RIP  [<ffffffffa00db372>] rpc_restart_call_prepare+0x62/0x90 [sunrpc]
[13034.978780]  RSP <ffff880035e97b58>
[13034.980399] CR2: 0000000000000000

Signed-off-by: Kinglong Mee <kinglongmee@gmail.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/pnfs.c  | 12 +++++++-----
 fs/nfs/read.c  |  9 ++++++++-
 fs/nfs/write.c |  7 +++++++
 3 files changed, 22 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 8abe271..93496c0 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1912,12 +1912,13 @@ static void pnfs_ld_handle_write_error(struct nfs_pgio_header *hdr)
  */
 void pnfs_ld_write_done(struct nfs_pgio_header *hdr)
 {
-	trace_nfs4_pnfs_write(hdr, hdr->pnfs_error);
-	if (!hdr->pnfs_error) {
+	if (likely(!hdr->pnfs_error)) {
 		pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
 				hdr->mds_offset + hdr->res.count);
 		hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
-	} else
+	}
+	trace_nfs4_pnfs_write(hdr, hdr->pnfs_error);
+	if (unlikely(hdr->pnfs_error))
 		pnfs_ld_handle_write_error(hdr);
 	hdr->mds_ops->rpc_release(hdr);
 }
@@ -2028,11 +2029,12 @@ static void pnfs_ld_handle_read_error(struct nfs_pgio_header *hdr)
  */
 void pnfs_ld_read_done(struct nfs_pgio_header *hdr)
 {
-	trace_nfs4_pnfs_read(hdr, hdr->pnfs_error);
 	if (likely(!hdr->pnfs_error)) {
 		__nfs4_read_done_cb(hdr);
 		hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
-	} else
+	}
+	trace_nfs4_pnfs_read(hdr, hdr->pnfs_error);
+	if (unlikely(hdr->pnfs_error))
 		pnfs_ld_handle_read_error(hdr);
 	hdr->mds_ops->rpc_release(hdr);
 }
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 01b8cc8..0a5e33f 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -246,6 +246,13 @@ static void nfs_readpage_retry(struct rpc_task *task,
 		nfs_set_pgio_error(hdr, -EIO, argp->offset);
 		return;
 	}
+
+	/* For non rpc-based layout drivers, retry-through-MDS */
+	if (!task->tk_ops) {
+		hdr->pnfs_error = -EAGAIN;
+		return;
+	}
+
 	/* Yes, so retry the read at the end of the hdr */
 	hdr->mds_offset += resp->count;
 	argp->offset += resp->count;
@@ -268,7 +275,7 @@ static void nfs_readpage_result(struct rpc_task *task,
 			hdr->good_bytes = bound - hdr->io_start;
 		}
 		spin_unlock(&hdr->lock);
-	} else if (hdr->res.count != hdr->args.count)
+	} else if (hdr->res.count < hdr->args.count)
 		nfs_readpage_retry(task, hdr);
 }
 
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 75ab762..7b93164 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1505,6 +1505,13 @@ static void nfs_writeback_result(struct rpc_task *task,
 			task->tk_status = -EIO;
 			return;
 		}
+
+		/* For non rpc-based layout drivers, retry-through-MDS */
+		if (!task->tk_ops) {
+			hdr->pnfs_error = -EAGAIN;
+			return;
+		}
+
 		/* Was this an NFSv2 write or an NFSv3 stable write? */
 		if (resp->verf->committed != NFS_UNSTABLE) {
 			/* Resend from where the server left off */
-- 
cgit v1.1


From f96999c35f46fa9bce8a3a2812cd0a28fcde5903 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Tue, 20 Oct 2015 15:17:19 -0700
Subject: f2fs: refactor __find_rev_next_{zero}_bit

This patch refactors __find_rev_next_{zero}_bit which was disabled previously
due to bugs.

Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/segment.c | 106 +++++++++++++++++++++++++-----------------------------
 1 file changed, 49 insertions(+), 57 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index f37c212..7835e41 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -29,6 +29,21 @@ static struct kmem_cache *discard_entry_slab;
 static struct kmem_cache *sit_entry_set_slab;
 static struct kmem_cache *inmem_entry_slab;
 
+static unsigned long __reverse_ulong(unsigned char *str)
+{
+	unsigned long tmp = 0;
+	int shift = 24, idx = 0;
+
+#if BITS_PER_LONG == 64
+	shift = 56;
+#endif
+	while (shift >= 0) {
+		tmp |= (unsigned long)str[idx++] << shift;
+		shift -= BITS_PER_BYTE;
+	}
+	return tmp;
+}
+
 /*
  * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since
  * MSB and LSB are reversed in a byte by f2fs_set_bit.
@@ -38,27 +53,31 @@ static inline unsigned long __reverse_ffs(unsigned long word)
 	int num = 0;
 
 #if BITS_PER_LONG == 64
-	if ((word & 0xffffffff) == 0) {
+	if ((word & 0xffffffff00000000UL) == 0)
 		num += 32;
+	else
 		word >>= 32;
-	}
 #endif
-	if ((word & 0xffff) == 0) {
+	if ((word & 0xffff0000) == 0)
 		num += 16;
+	else
 		word >>= 16;
-	}
-	if ((word & 0xff) == 0) {
+
+	if ((word & 0xff00) == 0)
 		num += 8;
+	else
 		word >>= 8;
-	}
+
 	if ((word & 0xf0) == 0)
 		num += 4;
 	else
 		word >>= 4;
+
 	if ((word & 0xc) == 0)
 		num += 2;
 	else
 		word >>= 2;
+
 	if ((word & 0x2) == 0)
 		num += 1;
 	return num;
@@ -68,26 +87,16 @@ static inline unsigned long __reverse_ffs(unsigned long word)
  * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c because
  * f2fs_set_bit makes MSB and LSB reversed in a byte.
  * Example:
- *                             LSB <--> MSB
- *   f2fs_set_bit(0, bitmap) => 0000 0001
- *   f2fs_set_bit(7, bitmap) => 1000 0000
+ *                             MSB <--> LSB
+ *   f2fs_set_bit(0, bitmap) => 1000 0000
+ *   f2fs_set_bit(7, bitmap) => 0000 0001
  */
 static unsigned long __find_rev_next_bit(const unsigned long *addr,
 			unsigned long size, unsigned long offset)
 {
-	while (!f2fs_test_bit(offset, (unsigned char *)addr))
-		offset++;
-
-	if (offset > size)
-		offset = size;
-
-	return offset;
-#if 0
 	const unsigned long *p = addr + BIT_WORD(offset);
 	unsigned long result = offset & ~(BITS_PER_LONG - 1);
 	unsigned long tmp;
-	unsigned long mask, submask;
-	unsigned long quot, rest;
 
 	if (offset >= size)
 		return size;
@@ -97,14 +106,9 @@ static unsigned long __find_rev_next_bit(const unsigned long *addr,
 	if (!offset)
 		goto aligned;
 
-	tmp = *(p++);
-	quot = (offset >> 3) << 3;
-	rest = offset & 0x7;
-	mask = ~0UL << quot;
-	submask = (unsigned char)(0xff << rest) >> rest;
-	submask <<= quot;
-	mask &= submask;
-	tmp &= mask;
+	tmp = __reverse_ulong((unsigned char *)p);
+	tmp &= ~0UL >> offset;
+
 	if (size < BITS_PER_LONG)
 		goto found_first;
 	if (tmp)
@@ -112,42 +116,34 @@ static unsigned long __find_rev_next_bit(const unsigned long *addr,
 
 	size -= BITS_PER_LONG;
 	result += BITS_PER_LONG;
+	p++;
 aligned:
 	while (size & ~(BITS_PER_LONG-1)) {
-		tmp = *(p++);
+		tmp = __reverse_ulong((unsigned char *)p);
 		if (tmp)
 			goto found_middle;
 		result += BITS_PER_LONG;
 		size -= BITS_PER_LONG;
+		p++;
 	}
 	if (!size)
 		return result;
-	tmp = *p;
+
+	tmp = __reverse_ulong((unsigned char *)p);
 found_first:
-	tmp &= (~0UL >> (BITS_PER_LONG - size));
-	if (tmp == 0UL)		/* Are any bits set? */
+	tmp &= (~0UL << (BITS_PER_LONG - size));
+	if (!tmp)		/* Are any bits set? */
 		return result + size;   /* Nope. */
 found_middle:
 	return result + __reverse_ffs(tmp);
-#endif
 }
 
 static unsigned long __find_rev_next_zero_bit(const unsigned long *addr,
 			unsigned long size, unsigned long offset)
 {
-	while (f2fs_test_bit(offset, (unsigned char *)addr))
-		offset++;
-
-	if (offset > size)
-		offset = size;
-
-	return offset;
-#if 0
 	const unsigned long *p = addr + BIT_WORD(offset);
 	unsigned long result = offset & ~(BITS_PER_LONG - 1);
 	unsigned long tmp;
-	unsigned long mask, submask;
-	unsigned long quot, rest;
 
 	if (offset >= size)
 		return size;
@@ -157,40 +153,36 @@ static unsigned long __find_rev_next_zero_bit(const unsigned long *addr,
 	if (!offset)
 		goto aligned;
 
-	tmp = *(p++);
-	quot = (offset >> 3) << 3;
-	rest = offset & 0x7;
-	mask = ~(~0UL << quot);
-	submask = (unsigned char)~((unsigned char)(0xff << rest) >> rest);
-	submask <<= quot;
-	mask += submask;
-	tmp |= mask;
+	tmp = __reverse_ulong((unsigned char *)p);
+	tmp |= ~((~0UL << offset) >> offset);
+
 	if (size < BITS_PER_LONG)
 		goto found_first;
-	if (~tmp)
+	if (tmp != ~0UL)
 		goto found_middle;
 
 	size -= BITS_PER_LONG;
 	result += BITS_PER_LONG;
+	p++;
 aligned:
 	while (size & ~(BITS_PER_LONG - 1)) {
-		tmp = *(p++);
-		if (~tmp)
+		tmp = __reverse_ulong((unsigned char *)p);
+		if (tmp != ~0UL)
 			goto found_middle;
 		result += BITS_PER_LONG;
 		size -= BITS_PER_LONG;
+		p++;
 	}
 	if (!size)
 		return result;
-	tmp = *p;
 
+	tmp = __reverse_ulong((unsigned char *)p);
 found_first:
-	tmp |= ~0UL << size;
-	if (tmp == ~0UL)        /* Are any bits zero? */
+	tmp |= ~(~0UL << (BITS_PER_LONG - size));
+	if (tmp == ~0UL)	/* Are any bits zero? */
 		return result + size;   /* Nope. */
 found_middle:
 	return result + __reverse_ffz(tmp);
-#endif
 }
 
 void register_inmem_page(struct inode *inode, struct page *page)
-- 
cgit v1.1


From 0f89abf56abbd0e1c6e3cef9813e6d9f05383c1e Mon Sep 17 00:00:00 2001
From: Christian Engelmayer <cengelma@gmx.at>
Date: Wed, 21 Oct 2015 00:50:06 +0200
Subject: btrfs: fix possible leak in btrfs_ioctl_balance()

Commit 8eb934591f8b ("btrfs: check unsupported filters in balance
arguments") adds a jump to exit label out_bargs in case the argument
check fails. At this point in addition to the bargs memory, the
memory for struct btrfs_balance_control has already been allocated.
Ownership of bctl is passed to btrfs_balance() in the good case,
thus the memory is not freed due to the introduced jump. Make sure
that the memory gets freed in any case as necessary. Detected by
Coverity CID 1328378.

Signed-off-by: Christian Engelmayer <cengelma@gmx.at>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ioctl.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 3e3e613..8d20f3b 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4641,7 +4641,7 @@ locked:
 
 	if (bctl->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) {
 		ret = -EINVAL;
-		goto out_bargs;
+		goto out_bctl;
 	}
 
 do_balance:
@@ -4655,12 +4655,15 @@ do_balance:
 	need_unlock = false;
 
 	ret = btrfs_balance(bctl, bargs);
+	bctl = NULL;
 
 	if (arg) {
 		if (copy_to_user(arg, bargs, sizeof(*bargs)))
 			ret = -EFAULT;
 	}
 
+out_bctl:
+	kfree(bctl);
 out_bargs:
 	kfree(bargs);
 out_unlock:
-- 
cgit v1.1


From ac46777213e00e26b9210060586f473368c54da2 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <quwenruo@cn.fujitsu.com>
Date: Mon, 12 Oct 2015 12:08:16 +0800
Subject: btrfs: extent_io: Introduce needed structure for recoding set/clear
 bits

Add a new structure, extent_change_set, to record how many bytes are
changed in one set/clear_extent_bits() operation, with detailed changed
ranges info.

This provides the needed facilities for later qgroup reserve framework.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent_io.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index c668f36..3107a6e 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -2,6 +2,7 @@
 #define __EXTENTIO__
 
 #include <linux/rbtree.h>
+#include "ulist.h"
 
 /* bits for the extent state */
 #define EXTENT_DIRTY		(1U << 0)
@@ -161,6 +162,17 @@ struct extent_buffer {
 #endif
 };
 
+/*
+ * Structure to record how many bytes and which ranges are set/cleared
+ */
+struct extent_changeset {
+	/* How many bytes are set/cleared in this operation */
+	u64 bytes_changed;
+
+	/* Changed ranges */
+	struct ulist *range_changed;
+};
+
 static inline void extent_set_compress_type(unsigned long *bio_flags,
 					    int compress_type)
 {
-- 
cgit v1.1


From d38ed27f0442c8cd520e093081127949d4bcf9bc Mon Sep 17 00:00:00 2001
From: Qu Wenruo <quwenruo@cn.fujitsu.com>
Date: Mon, 12 Oct 2015 14:53:37 +0800
Subject: btrfs: extent_io: Introduce new function set_record_extent_bits

Introduce new function set_record_extent_bits(), which will not only set
given bits, but also record how many bytes are changed, and detailed
range info.

This is quite important for later qgroup reserve framework.
The number of bytes will be used to do qgroup reserve, and detailed
range info will be used to cleanup for EQUOT case.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent_io.c | 71 +++++++++++++++++++++++++++++++++++++++-------------
 fs/btrfs/extent_io.h |  3 +++
 2 files changed, 56 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 6e6df34..9c066d6 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -131,6 +131,23 @@ struct extent_page_data {
 	unsigned int sync_io:1;
 };
 
+static void add_extent_changeset(struct extent_state *state, unsigned bits,
+				 struct extent_changeset *changeset,
+				 int set)
+{
+	int ret;
+
+	if (!changeset)
+		return;
+	if (set && (state->state & bits) == bits)
+		return;
+	changeset->bytes_changed += state->end - state->start + 1;
+	ret = ulist_add(changeset->range_changed, state->start, state->end,
+			GFP_ATOMIC);
+	/* ENOMEM */
+	BUG_ON(ret < 0);
+}
+
 static noinline void flush_write_bio(void *data);
 static inline struct btrfs_fs_info *
 tree_fs_info(struct extent_io_tree *tree)
@@ -410,7 +427,8 @@ static void clear_state_cb(struct extent_io_tree *tree,
 }
 
 static void set_state_bits(struct extent_io_tree *tree,
-			   struct extent_state *state, unsigned *bits);
+			   struct extent_state *state, unsigned *bits,
+			   struct extent_changeset *changeset);
 
 /*
  * insert an extent_state struct into the tree.  'bits' are set on the
@@ -426,7 +444,7 @@ static int insert_state(struct extent_io_tree *tree,
 			struct extent_state *state, u64 start, u64 end,
 			struct rb_node ***p,
 			struct rb_node **parent,
-			unsigned *bits)
+			unsigned *bits, struct extent_changeset *changeset)
 {
 	struct rb_node *node;
 
@@ -436,7 +454,7 @@ static int insert_state(struct extent_io_tree *tree,
 	state->start = start;
 	state->end = end;
 
-	set_state_bits(tree, state, bits);
+	set_state_bits(tree, state, bits, changeset);
 
 	node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
 	if (node) {
@@ -789,7 +807,7 @@ out:
 
 static void set_state_bits(struct extent_io_tree *tree,
 			   struct extent_state *state,
-			   unsigned *bits)
+			   unsigned *bits, struct extent_changeset *changeset)
 {
 	unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
 
@@ -798,6 +816,7 @@ static void set_state_bits(struct extent_io_tree *tree,
 		u64 range = state->end - state->start + 1;
 		tree->dirty_bytes += range;
 	}
+	add_extent_changeset(state, bits_to_set, changeset, 1);
 	state->state |= bits_to_set;
 }
 
@@ -835,7 +854,7 @@ static int __must_check
 __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 		 unsigned bits, unsigned exclusive_bits,
 		 u64 *failed_start, struct extent_state **cached_state,
-		 gfp_t mask)
+		 gfp_t mask, struct extent_changeset *changeset)
 {
 	struct extent_state *state;
 	struct extent_state *prealloc = NULL;
@@ -873,7 +892,7 @@ again:
 		prealloc = alloc_extent_state_atomic(prealloc);
 		BUG_ON(!prealloc);
 		err = insert_state(tree, prealloc, start, end,
-				   &p, &parent, &bits);
+				   &p, &parent, &bits, changeset);
 		if (err)
 			extent_io_tree_panic(tree, err);
 
@@ -899,7 +918,7 @@ hit_next:
 			goto out;
 		}
 
-		set_state_bits(tree, state, &bits);
+		set_state_bits(tree, state, &bits, changeset);
 		cache_state(state, cached_state);
 		merge_state(tree, state);
 		if (last_end == (u64)-1)
@@ -945,7 +964,7 @@ hit_next:
 		if (err)
 			goto out;
 		if (state->end <= end) {
-			set_state_bits(tree, state, &bits);
+			set_state_bits(tree, state, &bits, changeset);
 			cache_state(state, cached_state);
 			merge_state(tree, state);
 			if (last_end == (u64)-1)
@@ -980,7 +999,7 @@ hit_next:
 		 * the later extent.
 		 */
 		err = insert_state(tree, prealloc, start, this_end,
-				   NULL, NULL, &bits);
+				   NULL, NULL, &bits, changeset);
 		if (err)
 			extent_io_tree_panic(tree, err);
 
@@ -1008,7 +1027,7 @@ hit_next:
 		if (err)
 			extent_io_tree_panic(tree, err);
 
-		set_state_bits(tree, prealloc, &bits);
+		set_state_bits(tree, prealloc, &bits, changeset);
 		cache_state(prealloc, cached_state);
 		merge_state(tree, prealloc);
 		prealloc = NULL;
@@ -1038,7 +1057,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 		   struct extent_state **cached_state, gfp_t mask)
 {
 	return __set_extent_bit(tree, start, end, bits, 0, failed_start,
-				cached_state, mask);
+				cached_state, mask, NULL);
 }
 
 
@@ -1111,7 +1130,7 @@ again:
 			goto out;
 		}
 		err = insert_state(tree, prealloc, start, end,
-				   &p, &parent, &bits);
+				   &p, &parent, &bits, NULL);
 		if (err)
 			extent_io_tree_panic(tree, err);
 		cache_state(prealloc, cached_state);
@@ -1130,7 +1149,7 @@ hit_next:
 	 * Just lock what we found and keep going
 	 */
 	if (state->start == start && state->end <= end) {
-		set_state_bits(tree, state, &bits);
+		set_state_bits(tree, state, &bits, NULL);
 		cache_state(state, cached_state);
 		state = clear_state_bit(tree, state, &clear_bits, 0);
 		if (last_end == (u64)-1)
@@ -1171,7 +1190,7 @@ hit_next:
 		if (err)
 			goto out;
 		if (state->end <= end) {
-			set_state_bits(tree, state, &bits);
+			set_state_bits(tree, state, &bits, NULL);
 			cache_state(state, cached_state);
 			state = clear_state_bit(tree, state, &clear_bits, 0);
 			if (last_end == (u64)-1)
@@ -1208,7 +1227,7 @@ hit_next:
 		 * the later extent.
 		 */
 		err = insert_state(tree, prealloc, start, this_end,
-				   NULL, NULL, &bits);
+				   NULL, NULL, &bits, NULL);
 		if (err)
 			extent_io_tree_panic(tree, err);
 		cache_state(prealloc, cached_state);
@@ -1233,7 +1252,7 @@ hit_next:
 		if (err)
 			extent_io_tree_panic(tree, err);
 
-		set_state_bits(tree, prealloc, &bits);
+		set_state_bits(tree, prealloc, &bits, NULL);
 		cache_state(prealloc, cached_state);
 		clear_state_bit(tree, prealloc, &clear_bits, 0);
 		prealloc = NULL;
@@ -1274,6 +1293,22 @@ int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 			      NULL, mask);
 }
 
+int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+			   unsigned bits, gfp_t mask,
+			   struct extent_changeset *changeset)
+{
+	/*
+	 * We don't support EXTENT_LOCKED yet, as current changeset will
+	 * record any bits changed, so for EXTENT_LOCKED case, it will
+	 * either fail with -EEXIST or changeset will record the whole
+	 * range.
+	 */
+	BUG_ON(bits & EXTENT_LOCKED);
+
+	return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, mask,
+				changeset);
+}
+
 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 		      unsigned bits, gfp_t mask)
 {
@@ -1343,7 +1378,7 @@ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 	while (1) {
 		err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
 				       EXTENT_LOCKED, &failed_start,
-				       cached_state, GFP_NOFS);
+				       cached_state, GFP_NOFS, NULL);
 		if (err == -EEXIST) {
 			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
 			start = failed_start;
@@ -1365,7 +1400,7 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
 	u64 failed_start;
 
 	err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
-			       &failed_start, NULL, GFP_NOFS);
+			       &failed_start, NULL, GFP_NOFS, NULL);
 	if (err == -EEXIST) {
 		if (failed_start > start)
 			clear_extent_bit(tree, start, failed_start - 1,
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 3107a6e..4a7c9d9 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -227,6 +227,9 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 		     struct extent_state **cached, gfp_t mask);
 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 		    unsigned bits, gfp_t mask);
+int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+			   unsigned bits, gfp_t mask,
+			   struct extent_changeset *changeset);
 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 		   unsigned bits, u64 *failed_start,
 		   struct extent_state **cached_state, gfp_t mask);
-- 
cgit v1.1


From fefdc55702a5f9f99778b6bdce4c4e1185ff943f Mon Sep 17 00:00:00 2001
From: Qu Wenruo <quwenruo@cn.fujitsu.com>
Date: Mon, 12 Oct 2015 15:35:38 +0800
Subject: btrfs: extent_io: Introduce new function clear_record_extent_bits()

Introduce new function clear_record_extent_bits(), which will clear bits
for given range and record the details about which ranges are cleared
and how many bytes in total it changes.

This provides the basis for later qgroup reserve codes.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent_io.c | 50 +++++++++++++++++++++++++++++++++++++++-----------
 fs/btrfs/extent_io.h |  3 +++
 2 files changed, 42 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 9c066d6..33a01ea 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -141,6 +141,8 @@ static void add_extent_changeset(struct extent_state *state, unsigned bits,
 		return;
 	if (set && (state->state & bits) == bits)
 		return;
+	if (!set && (state->state & bits) == 0)
+		return;
 	changeset->bytes_changed += state->end - state->start + 1;
 	ret = ulist_add(changeset->range_changed, state->start, state->end,
 			GFP_ATOMIC);
@@ -529,7 +531,8 @@ static struct extent_state *next_state(struct extent_state *state)
  */
 static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
 					    struct extent_state *state,
-					    unsigned *bits, int wake)
+					    unsigned *bits, int wake,
+					    struct extent_changeset *changeset)
 {
 	struct extent_state *next;
 	unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS;
@@ -540,6 +543,7 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
 		tree->dirty_bytes -= range;
 	}
 	clear_state_cb(tree, state, bits);
+	add_extent_changeset(state, bits_to_clear, changeset, 0);
 	state->state &= ~bits_to_clear;
 	if (wake)
 		wake_up(&state->wq);
@@ -587,10 +591,10 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
  *
  * This takes the tree lock, and returns 0 on success and < 0 on error.
  */
-int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-		     unsigned bits, int wake, int delete,
-		     struct extent_state **cached_state,
-		     gfp_t mask)
+static int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+			      unsigned bits, int wake, int delete,
+			      struct extent_state **cached_state,
+			      gfp_t mask, struct extent_changeset *changeset)
 {
 	struct extent_state *state;
 	struct extent_state *cached;
@@ -689,7 +693,8 @@ hit_next:
 		if (err)
 			goto out;
 		if (state->end <= end) {
-			state = clear_state_bit(tree, state, &bits, wake);
+			state = clear_state_bit(tree, state, &bits, wake,
+						changeset);
 			goto next;
 		}
 		goto search_again;
@@ -710,13 +715,13 @@ hit_next:
 		if (wake)
 			wake_up(&state->wq);
 
-		clear_state_bit(tree, prealloc, &bits, wake);
+		clear_state_bit(tree, prealloc, &bits, wake, changeset);
 
 		prealloc = NULL;
 		goto out;
 	}
 
-	state = clear_state_bit(tree, state, &bits, wake);
+	state = clear_state_bit(tree, state, &bits, wake, changeset);
 next:
 	if (last_end == (u64)-1)
 		goto out;
@@ -1151,7 +1156,7 @@ hit_next:
 	if (state->start == start && state->end <= end) {
 		set_state_bits(tree, state, &bits, NULL);
 		cache_state(state, cached_state);
-		state = clear_state_bit(tree, state, &clear_bits, 0);
+		state = clear_state_bit(tree, state, &clear_bits, 0, NULL);
 		if (last_end == (u64)-1)
 			goto out;
 		start = last_end + 1;
@@ -1192,7 +1197,8 @@ hit_next:
 		if (state->end <= end) {
 			set_state_bits(tree, state, &bits, NULL);
 			cache_state(state, cached_state);
-			state = clear_state_bit(tree, state, &clear_bits, 0);
+			state = clear_state_bit(tree, state, &clear_bits, 0,
+						NULL);
 			if (last_end == (u64)-1)
 				goto out;
 			start = last_end + 1;
@@ -1254,7 +1260,7 @@ hit_next:
 
 		set_state_bits(tree, prealloc, &bits, NULL);
 		cache_state(prealloc, cached_state);
-		clear_state_bit(tree, prealloc, &clear_bits, 0);
+		clear_state_bit(tree, prealloc, &clear_bits, 0, NULL);
 		prealloc = NULL;
 		goto out;
 	}
@@ -1309,6 +1315,14 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 				changeset);
 }
 
+int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+		     unsigned bits, int wake, int delete,
+		     struct extent_state **cached, gfp_t mask)
+{
+	return __clear_extent_bit(tree, start, end, bits, wake, delete,
+				  cached, mask, NULL);
+}
+
 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 		      unsigned bits, gfp_t mask)
 {
@@ -1320,6 +1334,20 @@ int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 	return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask);
 }
 
+int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+			     unsigned bits, gfp_t mask,
+			     struct extent_changeset *changeset)
+{
+	/*
+	 * Don't support EXTENT_LOCKED case, same reason as
+	 * set_record_extent_bits().
+	 */
+	BUG_ON(bits & EXTENT_LOCKED);
+
+	return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask,
+				  changeset);
+}
+
 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
 			struct extent_state **cached_state, gfp_t mask)
 {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 4a7c9d9..51e1b71 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -222,6 +222,9 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
 		   struct extent_state *cached_state);
 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 		      unsigned bits, gfp_t mask);
+int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+			     unsigned bits, gfp_t mask,
+			     struct extent_changeset *changeset);
 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 		     unsigned bits, int wake, int delete,
 		     struct extent_state **cached, gfp_t mask);
-- 
cgit v1.1


From 524725537023bb25a371722b1329446e5a2adcdb Mon Sep 17 00:00:00 2001
From: Qu Wenruo <quwenruo@cn.fujitsu.com>
Date: Mon, 12 Oct 2015 16:05:40 +0800
Subject: btrfs: qgroup: Introduce btrfs_qgroup_reserve_data function

Introduce a new function, btrfs_qgroup_reserve_data(), which will use
io_tree to accurate qgroup reserve, to avoid reserved space leaking.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent_io.h |  1 +
 fs/btrfs/qgroup.c    | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/qgroup.h    |  2 ++
 3 files changed, 52 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 51e1b71..f4c1ae1 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -19,6 +19,7 @@
 #define EXTENT_NEED_WAIT	(1U << 13)
 #define EXTENT_DAMAGED		(1U << 14)
 #define EXTENT_NORESERVE	(1U << 15)
+#define EXTENT_QGROUP_RESERVED	(1U << 16)
 #define EXTENT_IOBITS		(EXTENT_LOCKED | EXTENT_WRITEBACK)
 #define EXTENT_CTLBITS		(EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
 
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index d904ee1..5066274 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2486,3 +2486,52 @@ btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
 		btrfs_queue_work(fs_info->qgroup_rescan_workers,
 				 &fs_info->qgroup_rescan_work);
 }
+
+/*
+ * Reserve qgroup space for range [start, start + len).
+ *
+ * This function will either reserve space from related qgroups or doing
+ * nothing if the range is already reserved.
+ *
+ * Return 0 for successful reserve
+ * Return <0 for error (including -EQUOT)
+ *
+ * NOTE: this function may sleep for memory allocation.
+ */
+int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_changeset changeset;
+	struct ulist_node *unode;
+	struct ulist_iterator uiter;
+	int ret;
+
+	if (!root->fs_info->quota_enabled || !is_fstree(root->objectid) ||
+	    len == 0)
+		return 0;
+
+	changeset.bytes_changed = 0;
+	changeset.range_changed = ulist_alloc(GFP_NOFS);
+
+	ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
+			start + len -1, EXTENT_QGROUP_RESERVED, GFP_NOFS,
+			&changeset);
+	if (ret < 0)
+		goto cleanup;
+	ret = btrfs_qgroup_reserve(root, changeset.bytes_changed);
+	if (ret < 0)
+		goto cleanup;
+
+	ulist_free(changeset.range_changed);
+	return ret;
+
+cleanup:
+	/* cleanup already reserved ranges */
+	ULIST_ITER_INIT(&uiter);
+	while ((unode = ulist_next(changeset.range_changed, &uiter)))
+		clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val,
+				 unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL,
+				 GFP_NOFS);
+	ulist_free(changeset.range_changed);
+	return ret;
+}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 6387dcf..bd17cc2 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -81,4 +81,6 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
 			       u64 rfer, u64 excl);
 #endif
 
+/* New io_tree based accurate qgroup reserve API */
+int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len);
 #endif /* __BTRFS_QGROUP__ */
-- 
cgit v1.1


From f695fdcef83ac6972e0eda1d6588a56348f521a2 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <quwenruo@cn.fujitsu.com>
Date: Mon, 12 Oct 2015 16:28:06 +0800
Subject: btrfs: qgroup: Introduce functions to release/free qgroup reserve
 data space

Introduce functions btrfs_qgroup_release/free_data() to release/free
reserved data range.

Release means, just remove the data range from io_tree, but doesn't
free the reserved space.
This is for normal buffered write case, when data is written into disc
and its metadata is added into tree, its reserved space should still be
kept until commit_trans().
So in that case, we only release dirty range, but keep the reserved
space recorded some other place until commit_tran().

Free means not only remove data range, but also free reserved space.
This is used for case for cleanup and invalidate page.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/qgroup.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/qgroup.h |  2 ++
 2 files changed, 62 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 5066274..e05d1f6 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2535,3 +2535,63 @@ cleanup:
 	ulist_free(changeset.range_changed);
 	return ret;
 }
+
+static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len,
+				       int free)
+{
+	struct extent_changeset changeset;
+	int ret;
+
+	changeset.bytes_changed = 0;
+	changeset.range_changed = ulist_alloc(GFP_NOFS);
+	if (!changeset.range_changed)
+		return -ENOMEM;
+
+	ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start, 
+			start + len -1, EXTENT_QGROUP_RESERVED, GFP_NOFS,
+			&changeset);
+	if (ret < 0)
+		goto out;
+
+	if (free)
+		btrfs_qgroup_free(BTRFS_I(inode)->root,
+				  changeset.bytes_changed);
+out:
+	ulist_free(changeset.range_changed);
+	return ret;
+}
+
+/*
+ * Free a reserved space range from io_tree and related qgroups
+ *
+ * Should be called when a range of pages get invalidated before reaching disk.
+ * Or for error cleanup case.
+ *
+ * For data written to disk, use btrfs_qgroup_release_data().
+ *
+ * NOTE: This function may sleep for memory allocation.
+ */
+int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len)
+{
+	return __btrfs_qgroup_release_data(inode, start, len, 1);
+}
+
+/*
+ * Release a reserved space range from io_tree only.
+ *
+ * Should be called when a range of pages get written to disk and corresponding
+ * FILE_EXTENT is inserted into corresponding root.
+ *
+ * Since new qgroup accounting framework will only update qgroup numbers at
+ * commit_transaction() time, its reserved space shouldn't be freed from
+ * related qgroups.
+ *
+ * But we should release the range from io_tree, to allow further write to be
+ * COWed.
+ *
+ * NOTE: This function may sleep for memory allocation.
+ */
+int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len)
+{
+	return __btrfs_qgroup_release_data(inode, start, len, 0);
+}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index bd17cc2..564eb21 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -83,4 +83,6 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
 
 /* New io_tree based accurate qgroup reserve API */
 int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len);
+int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len);
+int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len);
 #endif /* __BTRFS_QGROUP__ */
-- 
cgit v1.1


From f64d5ca868214f9beb60108c82fcb96940132bfb Mon Sep 17 00:00:00 2001
From: Qu Wenruo <quwenruo@cn.fujitsu.com>
Date: Tue, 8 Sep 2015 17:08:36 +0800
Subject: btrfs: delayed_ref: Add new function to record reserved space into
 delayed ref

Add new function btrfs_add_delayed_qgroup_reserve() function to record
how much space is reserved for that extent.

As btrfs only accounts qgroup at run_delayed_refs() time, so newly
allocated extent should keep the reserved space until then.

So add needed function with related members to do it.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/delayed-ref.c | 29 +++++++++++++++++++++++++++++
 fs/btrfs/delayed-ref.h | 14 ++++++++++++++
 2 files changed, 43 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index ac3e81d..bd9b63b 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -476,6 +476,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 	INIT_LIST_HEAD(&head_ref->ref_list);
 	head_ref->processing = 0;
 	head_ref->total_ref_mod = count_mod;
+	head_ref->qgroup_reserved = 0;
+	head_ref->qgroup_ref_root = 0;
 
 	/* Record qgroup extent info if provided */
 	if (qrecord) {
@@ -746,6 +748,33 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 	return 0;
 }
 
+int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info,
+				     struct btrfs_trans_handle *trans,
+				     u64 ref_root, u64 bytenr, u64 num_bytes)
+{
+	struct btrfs_delayed_ref_root *delayed_refs;
+	struct btrfs_delayed_ref_head *ref_head;
+	int ret = 0;
+
+	if (!fs_info->quota_enabled || !is_fstree(ref_root))
+		return 0;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+
+	spin_lock(&delayed_refs->lock);
+	ref_head = find_ref_head(&delayed_refs->href_root, bytenr, 0);
+	if (!ref_head) {
+		ret = -ENOENT;
+		goto out;
+	}
+	WARN_ON(ref_head->qgroup_reserved || ref_head->qgroup_ref_root);
+	ref_head->qgroup_ref_root = ref_root;
+	ref_head->qgroup_reserved = num_bytes;
+out:
+	spin_unlock(&delayed_refs->lock);
+	return ret;
+}
+
 int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
 				struct btrfs_trans_handle *trans,
 				u64 bytenr, u64 num_bytes,
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 13fb5e6..d4c41e2 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -113,6 +113,17 @@ struct btrfs_delayed_ref_head {
 	int total_ref_mod;
 
 	/*
+	 * For qgroup reserved space freeing.
+	 *
+	 * ref_root and reserved will be recorded after
+	 * BTRFS_ADD_DELAYED_EXTENT is called.
+	 * And will be used to free reserved qgroup space at
+	 * run_delayed_refs() time.
+	 */
+	u64 qgroup_ref_root;
+	u64 qgroup_reserved;
+
+	/*
 	 * when a new extent is allocated, it is just reserved in memory
 	 * The actual extent isn't inserted into the extent allocation tree
 	 * until the delayed ref is processed.  must_insert_reserved is
@@ -242,6 +253,9 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 			       u64 owner, u64 offset, int action,
 			       struct btrfs_delayed_extent_op *extent_op,
 			       int no_quota);
+int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info,
+				     struct btrfs_trans_handle *trans,
+				     u64 ref_root, u64 bytenr, u64 num_bytes);
 int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
 				struct btrfs_trans_handle *trans,
 				u64 bytenr, u64 num_bytes,
-- 
cgit v1.1


From 297d750b9f8d7e6f2dbdf8abc5aa3b5c656affdc Mon Sep 17 00:00:00 2001
From: Qu Wenruo <quwenruo@cn.fujitsu.com>
Date: Tue, 8 Sep 2015 17:08:37 +0800
Subject: btrfs: delayed_ref: release and free qgroup reserved at proper timing

Qgroup reserved space needs to be released from inode dirty map and get
freed at different timing:

1) Release when the metadata is written into tree
After corresponding metadata is written into tree, any newer write will
be COWed(don't include NOCOW case yet).
So we must release its range from inode dirty range map, or we will
forget to reserve needed range, causing accounting exceeding the limit.

2) Free reserved bytes when delayed ref is run
When delayed refs are run, qgroup accounting will follow soon and turn
the reserved bytes into rfer/excl numbers.
As run_delayed_refs and qgroup accounting are all done at
commit_transaction() time, we are safe to free reserved space in
run_delayed_ref time().

With these timing to release/free reserved space, we should be able to
resolve the long existing qgroup reserve space leak problem.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent-tree.c |  5 +++++
 fs/btrfs/inode.c       | 10 ++++++++++
 fs/btrfs/qgroup.c      |  5 ++---
 fs/btrfs/qgroup.h      | 18 +++++++++++++++++-
 4 files changed, 34 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 2df4bc7..6c7927c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2345,6 +2345,11 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
 						      node->num_bytes);
 			}
 		}
+
+		/* Also free its reserved qgroup space */
+		btrfs_qgroup_free_delayed_ref(root->fs_info,
+					      head->qgroup_ref_root,
+					      head->qgroup_reserved);
 		return ret;
 	}
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5ce55f6..c21ed7b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2119,6 +2119,16 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 	ret = btrfs_alloc_reserved_file_extent(trans, root,
 					root->root_key.objectid,
 					btrfs_ino(inode), file_pos, &ins);
+	if (ret < 0)
+		goto out;
+	/*
+	 * Release the reserved range from inode dirty range map, and
+	 * move it to delayed ref codes, as now accounting only happens at
+	 * commit_transaction() time.
+	 */
+	btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
+	ret = btrfs_add_delayed_qgroup_reserve(root->fs_info, trans,
+			root->objectid, disk_bytenr, ram_bytes);
 out:
 	btrfs_free_path(path);
 
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index e05d1f6..75d8e58 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2116,14 +2116,13 @@ out:
 	return ret;
 }
 
-void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
+void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
+			       u64 ref_root, u64 num_bytes)
 {
 	struct btrfs_root *quota_root;
 	struct btrfs_qgroup *qgroup;
-	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct ulist_node *unode;
 	struct ulist_iterator uiter;
-	u64 ref_root = root->root_key.objectid;
 	int ret = 0;
 
 	if (!is_fstree(ref_root))
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 564eb21..80924ae 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -72,7 +72,23 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
 			 struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
 			 struct btrfs_qgroup_inherit *inherit);
 int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes);
-void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes);
+void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
+			       u64 ref_root, u64 num_bytes);
+static inline void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
+{
+	return btrfs_qgroup_free_refroot(root->fs_info, root->objectid,
+					 num_bytes);
+}
+
+/*
+ * TODO: Add proper trace point for it, as btrfs_qgroup_free() is
+ * called by everywhere, can't provide good trace for delayed ref case.
+ */
+static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info,
+						 u64 ref_root, u64 num_bytes)
+{
+	btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes);
+}
 
 void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
 
-- 
cgit v1.1


From 55eeaf0578038c40baaf3cf9408c23e42cd2a2b8 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <quwenruo@cn.fujitsu.com>
Date: Tue, 8 Sep 2015 17:08:38 +0800
Subject: btrfs: qgroup: Introduce new functions to reserve/free metadata

Introduce new functions btrfs_qgroup_reserve/free_meta() to reserve/free
metadata reserved space.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ctree.h   |  3 +++
 fs/btrfs/disk-io.c |  1 +
 fs/btrfs/qgroup.c  | 40 ++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/qgroup.h  |  4 ++++
 4 files changed, 48 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 49bc792..9df8f01 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1943,6 +1943,9 @@ struct btrfs_root {
 	int send_in_progress;
 	struct btrfs_subvolume_writers *subv_writers;
 	atomic_t will_be_snapshoted;
+
+	/* For qgroup metadata space reserve */
+	atomic_t qgroup_meta_rsv;
 };
 
 struct btrfs_ioctl_defrag_range_args {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index bcbb596..0e0544e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1267,6 +1267,7 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize,
 	atomic_set(&root->orphan_inodes, 0);
 	atomic_set(&root->refs, 1);
 	atomic_set(&root->will_be_snapshoted, 0);
+	atomic_set(&root->qgroup_meta_rsv, 0);
 	root->log_transid = 0;
 	root->log_transid_committed = -1;
 	root->last_log_commit = 0;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 75d8e58..897a49d 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2594,3 +2594,43 @@ int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len)
 {
 	return __btrfs_qgroup_release_data(inode, start, len, 0);
 }
+
+int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes)
+{
+	int ret;
+
+	if (!root->fs_info->quota_enabled || !is_fstree(root->objectid) ||
+	    num_bytes == 0)
+		return 0;
+
+	BUG_ON(num_bytes != round_down(num_bytes, root->nodesize));
+	ret = btrfs_qgroup_reserve(root, num_bytes);
+	if (ret < 0)
+		return ret;
+	atomic_add(num_bytes, &root->qgroup_meta_rsv);
+	return ret;
+}
+
+void btrfs_qgroup_free_meta_all(struct btrfs_root *root)
+{
+	int reserved;
+
+	if (!root->fs_info->quota_enabled || !is_fstree(root->objectid))
+		return;
+
+	reserved = atomic_xchg(&root->qgroup_meta_rsv, 0);
+	if (reserved == 0)
+		return;
+	btrfs_qgroup_free(root, reserved);
+}
+
+void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes)
+{
+	if (!root->fs_info->quota_enabled || !is_fstree(root->objectid))
+		return;
+
+	BUG_ON(num_bytes != round_down(num_bytes, root->nodesize));
+	WARN_ON(atomic_read(&root->qgroup_meta_rsv) < num_bytes);
+	atomic_sub(num_bytes, &root->qgroup_meta_rsv);
+	btrfs_qgroup_free(root, num_bytes);
+}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 80924ae..7d1c87c 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -101,4 +101,8 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
 int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len);
 int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len);
 int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len);
+
+int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes);
+void btrfs_qgroup_free_meta_all(struct btrfs_root *root);
+void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes);
 #endif /* __BTRFS_QGROUP__ */
-- 
cgit v1.1


From 7174109c6548c4db85a383b8ae9d01469cddd110 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <quwenruo@cn.fujitsu.com>
Date: Tue, 8 Sep 2015 17:22:41 +0800
Subject: btrfs: qgroup: Use new metadata reservation.

As we have the new metadata reservation functions, use them to replace
the old btrfs_qgroup_reserve() call for metadata.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent-tree.c | 14 ++++++--------
 fs/btrfs/transaction.c | 34 +++++++---------------------------
 fs/btrfs/transaction.h |  1 -
 3 files changed, 13 insertions(+), 36 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 6c7927c..130df70 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5337,7 +5337,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
 	if (root->fs_info->quota_enabled) {
 		/* One for parent inode, two for dir entries */
 		num_bytes = 3 * root->nodesize;
-		ret = btrfs_qgroup_reserve(root, num_bytes);
+		ret = btrfs_qgroup_reserve_meta(root, num_bytes);
 		if (ret)
 			return ret;
 	} else {
@@ -5355,10 +5355,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
 	if (ret == -ENOSPC && use_global_rsv)
 		ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes);
 
-	if (ret) {
-		if (*qgroup_reserved)
-			btrfs_qgroup_free(root, *qgroup_reserved);
-	}
+	if (ret && *qgroup_reserved)
+		btrfs_qgroup_free_meta(root, *qgroup_reserved);
 
 	return ret;
 }
@@ -5519,15 +5517,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 	spin_unlock(&BTRFS_I(inode)->lock);
 
 	if (root->fs_info->quota_enabled) {
-		ret = btrfs_qgroup_reserve(root, nr_extents * root->nodesize);
+		ret = btrfs_qgroup_reserve_meta(root,
+				nr_extents * root->nodesize);
 		if (ret)
 			goto out_fail;
 	}
 
 	ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
 	if (unlikely(ret)) {
-		if (root->fs_info->quota_enabled)
-			btrfs_qgroup_free(root, nr_extents * root->nodesize);
+		btrfs_qgroup_free_meta(root, nr_extents * root->nodesize);
 		goto out_fail;
 	}
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 9354e7a..e377d7b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -480,13 +480,10 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
 	 * the appropriate flushing if need be.
 	 */
 	if (num_items > 0 && root != root->fs_info->chunk_root) {
-		if (root->fs_info->quota_enabled &&
-		    is_fstree(root->root_key.objectid)) {
-			qgroup_reserved = num_items * root->nodesize;
-			ret = btrfs_qgroup_reserve(root, qgroup_reserved);
-			if (ret)
-				return ERR_PTR(ret);
-		}
+		qgroup_reserved = num_items * root->nodesize;
+		ret = btrfs_qgroup_reserve_meta(root, qgroup_reserved);
+		if (ret)
+			return ERR_PTR(ret);
 
 		num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
 		/*
@@ -547,6 +544,7 @@ again:
 	h->transaction = cur_trans;
 	h->root = root;
 	h->use_count = 1;
+
 	h->type = type;
 	h->can_flush_pending_bgs = true;
 	INIT_LIST_HEAD(&h->qgroup_ref_list);
@@ -568,7 +566,6 @@ again:
 		h->bytes_reserved = num_bytes;
 		h->reloc_reserved = reloc_reserved;
 	}
-	h->qgroup_reserved = qgroup_reserved;
 
 got_it:
 	btrfs_record_root_in_trans(h, root);
@@ -586,8 +583,7 @@ alloc_fail:
 		btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
 					num_bytes);
 reserve_fail:
-	if (qgroup_reserved)
-		btrfs_qgroup_free(root, qgroup_reserved);
+	btrfs_qgroup_free_meta(root, qgroup_reserved);
 	return ERR_PTR(ret);
 }
 
@@ -805,15 +801,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 			must_run_delayed_refs = 2;
 	}
 
-	if (trans->qgroup_reserved) {
-		/*
-		 * the same root has to be passed here between start_transaction
-		 * and end_transaction. Subvolume quota depends on this.
-		 */
-		btrfs_qgroup_free(trans->root, trans->qgroup_reserved);
-		trans->qgroup_reserved = 0;
-	}
-
 	btrfs_trans_release_metadata(trans, root);
 	trans->block_rsv = NULL;
 
@@ -1231,6 +1218,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
 			spin_lock(&fs_info->fs_roots_radix_lock);
 			if (err)
 				break;
+			btrfs_qgroup_free_meta_all(root);
 		}
 	}
 	spin_unlock(&fs_info->fs_roots_radix_lock);
@@ -1835,10 +1823,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 	btrfs_trans_release_metadata(trans, root);
 	trans->block_rsv = NULL;
-	if (trans->qgroup_reserved) {
-		btrfs_qgroup_free(root, trans->qgroup_reserved);
-		trans->qgroup_reserved = 0;
-	}
 
 	cur_trans = trans->transaction;
 
@@ -2191,10 +2175,6 @@ cleanup_transaction:
 	btrfs_trans_release_metadata(trans, root);
 	btrfs_trans_release_chunk_metadata(trans);
 	trans->block_rsv = NULL;
-	if (trans->qgroup_reserved) {
-		btrfs_qgroup_free(root, trans->qgroup_reserved);
-		trans->qgroup_reserved = 0;
-	}
 	btrfs_warn(root->fs_info, "Skipping commit of aborted transaction.");
 	if (current->journal_info == trans)
 		current->journal_info = NULL;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index db6bfd9..54b7dea 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -107,7 +107,6 @@ struct btrfs_trans_handle {
 	u64 transid;
 	u64 bytes_reserved;
 	u64 chunk_bytes_reserved;
-	u64 qgroup_reserved;
 	unsigned long use_count;
 	unsigned long blocks_reserved;
 	unsigned long blocks_used;
-- 
cgit v1.1


From 4ceff0792d36256a5f879cec51c56e44db90b8ec Mon Sep 17 00:00:00 2001
From: Qu Wenruo <quwenruo@cn.fujitsu.com>
Date: Tue, 8 Sep 2015 17:22:42 +0800
Subject: btrfs: extent-tree: Add new version of btrfs_check_data_free_space
 and btrfs_free_reserved_data_space.

Add new functions __btrfs_check_data_free_space() and
__btrfs_free_reserved_data_space() to work with new accurate qgroup
reserved space framework.

The new function will replace old btrfs_check_data_free_space() and
btrfs_free_reserved_data_space() respectively, but until all the change
is done, let's just use the new name.

Also, export internal use function btrfs_alloc_data_chunk_ondemand(), as
now qgroup reserve requires precious bytes, some operation can't get the
accurate number in advance(like fallocate).
But data space info check and data chunk allocate doesn't need to be
that accurate, and can be called at the beginning.

So export it for later operations.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ctree.h       |  3 ++
 fs/btrfs/extent-tree.c | 85 ++++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 79 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9df8f01..a55d199 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3453,7 +3453,10 @@ enum btrfs_reserve_flush_enum {
 };
 
 int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes);
+int __btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len);
+int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes);
 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
+void __btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len);
 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root);
 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 130df70..71c172e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3904,11 +3904,7 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
 	return ret;
 }
 
-/*
- * This will check the space that the inode allocates from to make sure we have
- * enough space for bytes.
- */
-int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes)
+int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes)
 {
 	struct btrfs_space_info *data_sinfo;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -4029,19 +4025,55 @@ commit_trans:
 					      data_sinfo->flags, bytes, 1);
 		return -ENOSPC;
 	}
-	ret = btrfs_qgroup_reserve(root, write_bytes);
-	if (ret)
-		goto out;
 	data_sinfo->bytes_may_use += bytes;
 	trace_btrfs_space_reservation(root->fs_info, "space_info",
 				      data_sinfo->flags, bytes, 1);
-out:
 	spin_unlock(&data_sinfo->lock);
 
 	return ret;
 }
 
 /*
+ * This will check the space that the inode allocates from to make sure we have
+ * enough space for bytes.
+ */
+int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret;
+
+	ret = btrfs_alloc_data_chunk_ondemand(inode, bytes);
+	if (ret < 0)
+		return ret;
+	ret = btrfs_qgroup_reserve(root, write_bytes);
+	return ret;
+}
+
+/*
+ * New check_data_free_space() with ability for precious data reservation
+ * Will replace old btrfs_check_data_free_space(), but for patch split,
+ * add a new function first and then replace it.
+ */
+int __btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret;
+
+	/* align the range */
+	len = round_up(start + len, root->sectorsize) -
+	      round_down(start, root->sectorsize);
+	start = round_down(start, root->sectorsize);
+
+	ret = btrfs_alloc_data_chunk_ondemand(inode, len);
+	if (ret < 0)
+		return ret;
+
+	/* Use new btrfs_qgroup_reserve_data to reserve precious data space */
+	ret = btrfs_qgroup_reserve_data(inode, start, len);
+	return ret;
+}
+
+/*
  * Called if we need to clear a data reservation for this inode.
  */
 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
@@ -4061,6 +4093,41 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
 	spin_unlock(&data_sinfo->lock);
 }
 
+/*
+ * Called if we need to clear a data reservation for this inode
+ * Normally in a error case.
+ *
+ * This one will handle the per-indoe data rsv map for accurate reserved
+ * space framework.
+ */
+void __btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_space_info *data_sinfo;
+
+	/* Make sure the range is aligned to sectorsize */
+	len = round_up(start + len, root->sectorsize) -
+	      round_down(start, root->sectorsize);
+	start = round_down(start, root->sectorsize);
+
+	/*
+	 * Free any reserved qgroup data space first
+	 * As it will alloc memory, we can't do it with data sinfo
+	 * spinlock hold.
+	 */
+	btrfs_qgroup_free_data(inode, start, len);
+
+	data_sinfo = root->fs_info->data_sinfo;
+	spin_lock(&data_sinfo->lock);
+	if (WARN_ON(data_sinfo->bytes_may_use < len))
+		data_sinfo->bytes_may_use = 0;
+	else
+		data_sinfo->bytes_may_use -= len;
+	trace_btrfs_space_reservation(root->fs_info, "space_info",
+				      data_sinfo->flags, len, 0);
+	spin_unlock(&data_sinfo->lock);
+}
+
 static void force_metadata_allocation(struct btrfs_fs_info *info)
 {
 	struct list_head *head = &info->space_info;
-- 
cgit v1.1


From d9d8b2a51a404c2d45b9dc4c755f62cb3ddb7c79 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <quwenruo@cn.fujitsu.com>
Date: Tue, 8 Sep 2015 17:22:43 +0800
Subject: btrfs: extent-tree: Switch to new check_data_free_space and
 free_reserved_data_space

Use new reserve/free for buffered write and inode cache.

For buffered write case, as nodatacow write won't increase quota account,
so unlike old behavior which does reserve before check nocow, now we
check nocow first and then only reserve data if we can't do nocow write.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent-tree.c |  4 ++--
 fs/btrfs/file.c        | 34 +++++++++++++++++++++-------------
 fs/btrfs/relocation.c  |  8 ++++----
 3 files changed, 27 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 71c172e..3c58fc3 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3356,7 +3356,7 @@ again:
 	num_pages *= 16;
 	num_pages *= PAGE_CACHE_SIZE;
 
-	ret = btrfs_check_data_free_space(inode, num_pages, num_pages);
+	ret = __btrfs_check_data_free_space(inode, 0, num_pages);
 	if (ret)
 		goto out_put;
 
@@ -3365,7 +3365,7 @@ again:
 					      &alloc_hint);
 	if (!ret)
 		dcs = BTRFS_DC_SETUP;
-	btrfs_free_reserved_data_space(inode, num_pages);
+	__btrfs_free_reserved_data_space(inode, 0, num_pages);
 
 out_put:
 	iput(inode);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 7031e96..61fb78f 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1507,12 +1507,17 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 		}
 
 		reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
-		ret = btrfs_check_data_free_space(inode, reserve_bytes, write_bytes);
-		if (ret == -ENOSPC &&
-		    (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
-					      BTRFS_INODE_PREALLOC))) {
+
+		if (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
+					     BTRFS_INODE_PREALLOC)) {
 			ret = check_can_nocow(inode, pos, &write_bytes);
+			if (ret < 0)
+				break;
 			if (ret > 0) {
+				/*
+				 * For nodata cow case, no need to reserve
+				 * data space.
+				 */
 				only_release_metadata = true;
 				/*
 				 * our prealloc extent may be smaller than
@@ -1521,20 +1526,19 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 				num_pages = DIV_ROUND_UP(write_bytes + offset,
 							 PAGE_CACHE_SIZE);
 				reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
-				ret = 0;
-			} else {
-				ret = -ENOSPC;
+				goto reserve_metadata;
 			}
 		}
-
-		if (ret)
+		ret = __btrfs_check_data_free_space(inode, pos, write_bytes);
+		if (ret < 0)
 			break;
 
+reserve_metadata:
 		ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes);
 		if (ret) {
 			if (!only_release_metadata)
-				btrfs_free_reserved_data_space(inode,
-							       reserve_bytes);
+				__btrfs_free_reserved_data_space(inode, pos,
+							         write_bytes);
 			else
 				btrfs_end_write_no_snapshoting(root);
 			break;
@@ -2566,8 +2570,11 @@ static long btrfs_fallocate(struct file *file, int mode,
 	/*
 	 * Make sure we have enough space before we do the
 	 * allocation.
+	 * XXX: The behavior must be changed to do accurate check first
+	 * and then check data reserved space.
 	 */
-	ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start, alloc_end - alloc_start);
+	ret = btrfs_check_data_free_space(inode, alloc_start,
+					  alloc_end - alloc_start);
 	if (ret)
 		return ret;
 
@@ -2700,7 +2707,8 @@ static long btrfs_fallocate(struct file *file, int mode,
 out:
 	mutex_unlock(&inode->i_mutex);
 	/* Let go of our reservation. */
-	btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
+	__btrfs_free_reserved_data_space(inode, alloc_start,
+					 alloc_end - alloc_start);
 	return ret;
 }
 
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 58ede0a..490c9e2 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3034,8 +3034,8 @@ int prealloc_file_extent_cluster(struct inode *inode,
 	BUG_ON(cluster->start != cluster->boundary[0]);
 	mutex_lock(&inode->i_mutex);
 
-	ret = btrfs_check_data_free_space(inode, cluster->end +
-					  1 - cluster->start, 0);
+	ret = __btrfs_check_data_free_space(inode, cluster->start,
+					    cluster->end + 1 - cluster->start);
 	if (ret)
 		goto out;
 
@@ -3056,8 +3056,8 @@ int prealloc_file_extent_cluster(struct inode *inode,
 			break;
 		nr++;
 	}
-	btrfs_free_reserved_data_space(inode, cluster->end +
-				       1 - cluster->start);
+	__btrfs_free_reserved_data_space(inode, cluster->start,
+					 cluster->end + 1 - cluster->start);
 out:
 	mutex_unlock(&inode->i_mutex);
 	return ret;
-- 
cgit v1.1


From 1ada3a62b56605befdfc34d6d1796601c0869103 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <quwenruo@cn.fujitsu.com>
Date: Tue, 8 Sep 2015 17:25:53 +0800
Subject: btrfs: extent-tree: Add new version of
 btrfs_delalloc_reserve/release_space

Add new version of btrfs_delalloc_reserve_space() and
btrfs_delalloc_release_space() functions, which supports accurate qgroup
reserve.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ctree.h       |  2 ++
 fs/btrfs/extent-tree.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 61 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index a55d199..0f1ade1 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3473,7 +3473,9 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root,
 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
 int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
+int __btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len);
 void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
+void __btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len);
 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
 					      unsigned short type);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3c58fc3..8c8d8b3 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5715,6 +5715,44 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
 }
 
 /**
+ * __btrfs_delalloc_reserve_space - reserve data and metadata space for
+ * delalloc
+ * @inode: inode we're writing to
+ * @start: start range we are writing to
+ * @len: how long the range we are writing to
+ *
+ * TODO: This function will finally replace old btrfs_delalloc_reserve_space()
+ *
+ * This will do the following things
+ *
+ * o reserve space in data space info for num bytes
+ *   and reserve precious corresponding qgroup space
+ *   (Done in check_data_free_space)
+ *
+ * o reserve space for metadata space, based on the number of outstanding
+ *   extents and how much csums will be needed
+ *   also reserve metadata space in a per root over-reserve method.
+ * o add to the inodes->delalloc_bytes
+ * o add it to the fs_info's delalloc inodes list.
+ *   (Above 3 all done in delalloc_reserve_metadata)
+ *
+ * Return 0 for success
+ * Return <0 for error(-ENOSPC or -EQUOT)
+ */
+int __btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len)
+{
+	int ret;
+
+	ret = __btrfs_check_data_free_space(inode, start, len);
+	if (ret < 0)
+		return ret;
+	ret = btrfs_delalloc_reserve_metadata(inode, len);
+	if (ret < 0)
+		__btrfs_free_reserved_data_space(inode, start, len);
+	return ret;
+}
+
+/**
  * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
  * @inode: inode we're writing to
  * @num_bytes: the number of bytes we want to allocate
@@ -5747,6 +5785,27 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
 }
 
 /**
+ * __btrfs_delalloc_release_space - release data and metadata space for delalloc
+ * @inode: inode we're releasing space for
+ * @start: start position of the space already reserved
+ * @len: the len of the space already reserved
+ *
+ * This must be matched with a call to btrfs_delalloc_reserve_space.  This is
+ * called in the case that we don't need the metadata AND data reservations
+ * anymore.  So if there is an error or we insert an inline extent.
+ *
+ * This function will release the metadata space that was not used and will
+ * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
+ * list if there are no delalloc bytes left.
+ * Also it will handle the qgroup reserved space.
+ */
+void __btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len)
+{
+	btrfs_delalloc_release_metadata(inode, len);
+	__btrfs_free_reserved_data_space(inode, start, len);
+}
+
+/**
  * btrfs_delalloc_release_space - release data and metadata space for delalloc
  * @inode: inode we're releasing space for
  * @num_bytes: the number of bytes we want to free up
-- 
cgit v1.1


From df480633b891cf03301d87e56024a8ec3251da5b Mon Sep 17 00:00:00 2001
From: Qu Wenruo <quwenruo@cn.fujitsu.com>
Date: Tue, 8 Sep 2015 17:25:54 +0800
Subject: btrfs: extent-tree: Switch to new delalloc space reserve and release

Use new __btrfs_delalloc_reserve_space() and
__btrfs_delalloc_release_space() to reserve and release space for
delalloc.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/file.c      |  5 +++--
 fs/btrfs/inode-map.c |  6 +++---
 fs/btrfs/inode.c     | 38 +++++++++++++++++++++++---------------
 fs/btrfs/ioctl.c     | 14 +++++++++-----
 4 files changed, 38 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 61fb78f..29b3702 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1608,7 +1608,7 @@ again:
 				btrfs_delalloc_release_metadata(inode,
 								release_bytes);
 			else
-				btrfs_delalloc_release_space(inode,
+				__btrfs_delalloc_release_space(inode, pos,
 							     release_bytes);
 		}
 
@@ -1661,7 +1661,8 @@ again:
 			btrfs_end_write_no_snapshoting(root);
 			btrfs_delalloc_release_metadata(inode, release_bytes);
 		} else {
-			btrfs_delalloc_release_space(inode, release_bytes);
+			__btrfs_delalloc_release_space(inode, pos,
+						       release_bytes);
 		}
 	}
 
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index d4a582a..78bc09c 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -488,17 +488,17 @@ again:
 	/* Just to make sure we have enough space */
 	prealloc += 8 * PAGE_CACHE_SIZE;
 
-	ret = btrfs_delalloc_reserve_space(inode, prealloc);
+	ret = __btrfs_delalloc_reserve_space(inode, 0, prealloc);
 	if (ret)
 		goto out_put;
 
 	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
 					      prealloc, prealloc, &alloc_hint);
 	if (ret) {
-		btrfs_delalloc_release_space(inode, prealloc);
+		__btrfs_delalloc_release_space(inode, 0, prealloc);
 		goto out_put;
 	}
-	btrfs_free_reserved_data_space(inode, prealloc);
+	__btrfs_free_reserved_data_space(inode, 0, prealloc);
 
 	ret = btrfs_write_out_ino_cache(root, trans, path, inode);
 out_put:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c21ed7b..a3942a7 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1769,7 +1769,8 @@ static void btrfs_clear_bit_hook(struct inode *inode,
 
 		if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
 		    && do_list && !(state->state & EXTENT_NORESERVE))
-			btrfs_free_reserved_data_space(inode, len);
+			__btrfs_free_reserved_data_space(inode, state->start,
+							 len);
 
 		__percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
 				     root->fs_info->delalloc_batch);
@@ -1992,7 +1993,8 @@ again:
 		goto again;
 	}
 
-	ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+	ret = __btrfs_delalloc_reserve_space(inode, page_start,
+					     PAGE_CACHE_SIZE);
 	if (ret) {
 		mapping_set_error(page->mapping, ret);
 		end_extent_writepage(page, ret, page_start, page_end);
@@ -4638,14 +4640,17 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
 	if ((offset & (blocksize - 1)) == 0 &&
 	    (!len || ((len & (blocksize - 1)) == 0)))
 		goto out;
-	ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+	ret = __btrfs_delalloc_reserve_space(inode,
+			round_down(from, PAGE_CACHE_SIZE), PAGE_CACHE_SIZE);
 	if (ret)
 		goto out;
 
 again:
 	page = find_or_create_page(mapping, index, mask);
 	if (!page) {
-		btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+		__btrfs_delalloc_release_space(inode,
+				round_down(from, PAGE_CACHE_SIZE),
+				PAGE_CACHE_SIZE);
 		ret = -ENOMEM;
 		goto out;
 	}
@@ -4713,7 +4718,8 @@ again:
 
 out_unlock:
 	if (ret)
-		btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+		__btrfs_delalloc_release_space(inode, page_start,
+					       PAGE_CACHE_SIZE);
 	unlock_page(page);
 	page_cache_release(page);
 out:
@@ -7644,7 +7650,7 @@ unlock:
 			spin_unlock(&BTRFS_I(inode)->lock);
 		}
 
-		btrfs_free_reserved_data_space(inode, len);
+		__btrfs_free_reserved_data_space(inode, start, len);
 		WARN_ON(dio_data->reserve < len);
 		dio_data->reserve -= len;
 		current->journal_info = dio_data;
@@ -8434,7 +8440,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 			mutex_unlock(&inode->i_mutex);
 			relock = true;
 		}
-		ret = btrfs_delalloc_reserve_space(inode, count);
+		ret = __btrfs_delalloc_reserve_space(inode, offset, count);
 		if (ret)
 			goto out;
 		dio_data.outstanding_extents = div64_u64(count +
@@ -8463,11 +8469,11 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 		current->journal_info = NULL;
 		if (ret < 0 && ret != -EIOCBQUEUED) {
 			if (dio_data.reserve)
-				btrfs_delalloc_release_space(inode,
-							dio_data.reserve);
+				__btrfs_delalloc_release_space(inode, offset,
+						dio_data.reserve);
 		} else if (ret >= 0 && (size_t)ret < count)
-			btrfs_delalloc_release_space(inode,
-						     count - (size_t)ret);
+			__btrfs_delalloc_release_space(inode, offset,
+						       count - (size_t)ret);
 	}
 out:
 	if (wakeup)
@@ -8675,7 +8681,11 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	u64 page_end;
 
 	sb_start_pagefault(inode->i_sb);
-	ret  = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+	page_start = page_offset(page);
+	page_end = page_start + PAGE_CACHE_SIZE - 1;
+
+	ret = __btrfs_delalloc_reserve_space(inode, page_start,
+					     PAGE_CACHE_SIZE);
 	if (!ret) {
 		ret = file_update_time(vma->vm_file);
 		reserved = 1;
@@ -8694,8 +8704,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 again:
 	lock_page(page);
 	size = i_size_read(inode);
-	page_start = page_offset(page);
-	page_end = page_start + PAGE_CACHE_SIZE - 1;
 
 	if ((page->mapping != inode->i_mapping) ||
 	    (page_start >= size)) {
@@ -8772,7 +8780,7 @@ out_unlock:
 	}
 	unlock_page(page);
 out:
-	btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+	__btrfs_delalloc_release_space(inode, page_start, PAGE_CACHE_SIZE);
 out_noreserve:
 	sb_end_pagefault(inode->i_sb);
 	return ret;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 685df7e..70732e6 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1119,8 +1119,9 @@ static int cluster_pages_for_defrag(struct inode *inode,
 
 	page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
 
-	ret = btrfs_delalloc_reserve_space(inode,
-					   page_cnt << PAGE_CACHE_SHIFT);
+	ret = __btrfs_delalloc_reserve_space(inode,
+			start_index << PAGE_CACHE_SHIFT,
+			page_cnt << PAGE_CACHE_SHIFT);
 	if (ret)
 		return ret;
 	i_done = 0;
@@ -1209,8 +1210,9 @@ again:
 		spin_lock(&BTRFS_I(inode)->lock);
 		BTRFS_I(inode)->outstanding_extents++;
 		spin_unlock(&BTRFS_I(inode)->lock);
-		btrfs_delalloc_release_space(inode,
-				     (page_cnt - i_done) << PAGE_CACHE_SHIFT);
+		__btrfs_delalloc_release_space(inode,
+				start_index << PAGE_CACHE_SHIFT,
+				(page_cnt - i_done) << PAGE_CACHE_SHIFT);
 	}
 
 
@@ -1235,7 +1237,9 @@ out:
 		unlock_page(pages[i]);
 		page_cache_release(pages[i]);
 	}
-	btrfs_delalloc_release_space(inode, page_cnt << PAGE_CACHE_SHIFT);
+	__btrfs_delalloc_release_space(inode,
+			start_index << PAGE_CACHE_SHIFT,
+			page_cnt << PAGE_CACHE_SHIFT);
 	return ret;
 
 }
-- 
cgit v1.1


From 7cf5b97650f2ecefbd5afa2d58b61b289b6e3750 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <quwenruo@cn.fujitsu.com>
Date: Tue, 8 Sep 2015 17:25:55 +0800
Subject: btrfs: qgroup: Cleanup old inaccurate facilities

Cleanup the old facilities which use old btrfs_qgroup_reserve() function
call, replace them with the newer version, and remove the "__" prefix in
them.

Also, make btrfs_qgroup_reserve/free() functions private, as they are
now only used inside qgroup codes.

Now, the whole btrfs qgroup is swithed to use the new reserve facilities.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ctree.h       |  12 ++----
 fs/btrfs/extent-tree.c | 109 +++++--------------------------------------------
 fs/btrfs/file.c        |  15 ++++---
 fs/btrfs/inode-map.c   |   6 +--
 fs/btrfs/inode.c       |  34 +++++++--------
 fs/btrfs/ioctl.c       |   6 +--
 fs/btrfs/qgroup.c      |  18 ++++----
 fs/btrfs/qgroup.h      |   8 ----
 fs/btrfs/relocation.c  |   8 ++--
 9 files changed, 60 insertions(+), 156 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0f1ade1..0d0f5d2 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3452,11 +3452,9 @@ enum btrfs_reserve_flush_enum {
 	BTRFS_RESERVE_FLUSH_ALL,
 };
 
-int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes);
-int __btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len);
+int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len);
 int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes);
-void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
-void __btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len);
+void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len);
 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root);
 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
@@ -3472,10 +3470,8 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root,
 				      u64 qgroup_reserved);
 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
-int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
-int __btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len);
-void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
-void __btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len);
+int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len);
+void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len);
 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
 					      unsigned short type);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 8c8d8b3..3a23225 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3356,7 +3356,7 @@ again:
 	num_pages *= 16;
 	num_pages *= PAGE_CACHE_SIZE;
 
-	ret = __btrfs_check_data_free_space(inode, 0, num_pages);
+	ret = btrfs_check_data_free_space(inode, 0, num_pages);
 	if (ret)
 		goto out_put;
 
@@ -3365,7 +3365,7 @@ again:
 					      &alloc_hint);
 	if (!ret)
 		dcs = BTRFS_DC_SETUP;
-	__btrfs_free_reserved_data_space(inode, 0, num_pages);
+	btrfs_free_reserved_data_space(inode, 0, num_pages);
 
 out_put:
 	iput(inode);
@@ -4034,27 +4034,11 @@ commit_trans:
 }
 
 /*
- * This will check the space that the inode allocates from to make sure we have
- * enough space for bytes.
- */
-int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes)
-{
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	int ret;
-
-	ret = btrfs_alloc_data_chunk_ondemand(inode, bytes);
-	if (ret < 0)
-		return ret;
-	ret = btrfs_qgroup_reserve(root, write_bytes);
-	return ret;
-}
-
-/*
  * New check_data_free_space() with ability for precious data reservation
  * Will replace old btrfs_check_data_free_space(), but for patch split,
  * add a new function first and then replace it.
  */
-int __btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
+int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret;
@@ -4074,33 +4058,13 @@ int __btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
 }
 
 /*
- * Called if we need to clear a data reservation for this inode.
- */
-void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
-{
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_space_info *data_sinfo;
-
-	/* make sure bytes are sectorsize aligned */
-	bytes = ALIGN(bytes, root->sectorsize);
-
-	data_sinfo = root->fs_info->data_sinfo;
-	spin_lock(&data_sinfo->lock);
-	WARN_ON(data_sinfo->bytes_may_use < bytes);
-	data_sinfo->bytes_may_use -= bytes;
-	trace_btrfs_space_reservation(root->fs_info, "space_info",
-				      data_sinfo->flags, bytes, 0);
-	spin_unlock(&data_sinfo->lock);
-}
-
-/*
  * Called if we need to clear a data reservation for this inode
  * Normally in a error case.
  *
  * This one will handle the per-indoe data rsv map for accurate reserved
  * space framework.
  */
-void __btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
+void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_space_info *data_sinfo;
@@ -5715,7 +5679,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
 }
 
 /**
- * __btrfs_delalloc_reserve_space - reserve data and metadata space for
+ * btrfs_delalloc_reserve_space - reserve data and metadata space for
  * delalloc
  * @inode: inode we're writing to
  * @start: start range we are writing to
@@ -5739,53 +5703,21 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
  * Return 0 for success
  * Return <0 for error(-ENOSPC or -EQUOT)
  */
-int __btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len)
+int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len)
 {
 	int ret;
 
-	ret = __btrfs_check_data_free_space(inode, start, len);
+	ret = btrfs_check_data_free_space(inode, start, len);
 	if (ret < 0)
 		return ret;
 	ret = btrfs_delalloc_reserve_metadata(inode, len);
 	if (ret < 0)
-		__btrfs_free_reserved_data_space(inode, start, len);
+		btrfs_free_reserved_data_space(inode, start, len);
 	return ret;
 }
 
 /**
- * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
- * @inode: inode we're writing to
- * @num_bytes: the number of bytes we want to allocate
- *
- * This will do the following things
- *
- * o reserve space in the data space info for num_bytes
- * o reserve space in the metadata space info based on number of outstanding
- *   extents and how much csums will be needed
- * o add to the inodes ->delalloc_bytes
- * o add it to the fs_info's delalloc inodes list.
- *
- * This will return 0 for success and -ENOSPC if there is no space left.
- */
-int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
-{
-	int ret;
-
-	ret = btrfs_check_data_free_space(inode, num_bytes, num_bytes);
-	if (ret)
-		return ret;
-
-	ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
-	if (ret) {
-		btrfs_free_reserved_data_space(inode, num_bytes);
-		return ret;
-	}
-
-	return 0;
-}
-
-/**
- * __btrfs_delalloc_release_space - release data and metadata space for delalloc
+ * btrfs_delalloc_release_space - release data and metadata space for delalloc
  * @inode: inode we're releasing space for
  * @start: start position of the space already reserved
  * @len: the len of the space already reserved
@@ -5799,29 +5731,10 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
  * list if there are no delalloc bytes left.
  * Also it will handle the qgroup reserved space.
  */
-void __btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len)
+void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len)
 {
 	btrfs_delalloc_release_metadata(inode, len);
-	__btrfs_free_reserved_data_space(inode, start, len);
-}
-
-/**
- * btrfs_delalloc_release_space - release data and metadata space for delalloc
- * @inode: inode we're releasing space for
- * @num_bytes: the number of bytes we want to free up
- *
- * This must be matched with a call to btrfs_delalloc_reserve_space.  This is
- * called in the case that we don't need the metadata AND data reservations
- * anymore.  So if there is an error or we insert an inline extent.
- *
- * This function will release the metadata space that was not used and will
- * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
- * list if there are no delalloc bytes left.
- */
-void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
-{
-	btrfs_delalloc_release_metadata(inode, num_bytes);
-	btrfs_free_reserved_data_space(inode, num_bytes);
+	btrfs_free_reserved_data_space(inode, start, len);
 }
 
 static int update_block_group(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 29b3702..47785c8 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1529,7 +1529,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 				goto reserve_metadata;
 			}
 		}
-		ret = __btrfs_check_data_free_space(inode, pos, write_bytes);
+		ret = btrfs_check_data_free_space(inode, pos, write_bytes);
 		if (ret < 0)
 			break;
 
@@ -1537,8 +1537,8 @@ reserve_metadata:
 		ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes);
 		if (ret) {
 			if (!only_release_metadata)
-				__btrfs_free_reserved_data_space(inode, pos,
-							         write_bytes);
+				btrfs_free_reserved_data_space(inode, pos,
+							       write_bytes);
 			else
 				btrfs_end_write_no_snapshoting(root);
 			break;
@@ -1608,7 +1608,7 @@ again:
 				btrfs_delalloc_release_metadata(inode,
 								release_bytes);
 			else
-				__btrfs_delalloc_release_space(inode, pos,
+				btrfs_delalloc_release_space(inode, pos,
 							     release_bytes);
 		}
 
@@ -1661,8 +1661,7 @@ again:
 			btrfs_end_write_no_snapshoting(root);
 			btrfs_delalloc_release_metadata(inode, release_bytes);
 		} else {
-			__btrfs_delalloc_release_space(inode, pos,
-						       release_bytes);
+			btrfs_delalloc_release_space(inode, pos, release_bytes);
 		}
 	}
 
@@ -2708,8 +2707,8 @@ static long btrfs_fallocate(struct file *file, int mode,
 out:
 	mutex_unlock(&inode->i_mutex);
 	/* Let go of our reservation. */
-	__btrfs_free_reserved_data_space(inode, alloc_start,
-					 alloc_end - alloc_start);
+	btrfs_free_reserved_data_space(inode, alloc_start,
+				       alloc_end - alloc_start);
 	return ret;
 }
 
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 78bc09c..767a605 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -488,17 +488,17 @@ again:
 	/* Just to make sure we have enough space */
 	prealloc += 8 * PAGE_CACHE_SIZE;
 
-	ret = __btrfs_delalloc_reserve_space(inode, 0, prealloc);
+	ret = btrfs_delalloc_reserve_space(inode, 0, prealloc);
 	if (ret)
 		goto out_put;
 
 	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
 					      prealloc, prealloc, &alloc_hint);
 	if (ret) {
-		__btrfs_delalloc_release_space(inode, 0, prealloc);
+		btrfs_delalloc_release_space(inode, 0, prealloc);
 		goto out_put;
 	}
-	__btrfs_free_reserved_data_space(inode, 0, prealloc);
+	btrfs_free_reserved_data_space(inode, 0, prealloc);
 
 	ret = btrfs_write_out_ino_cache(root, trans, path, inode);
 out_put:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a3942a7..a34e5a9 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1769,8 +1769,8 @@ static void btrfs_clear_bit_hook(struct inode *inode,
 
 		if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
 		    && do_list && !(state->state & EXTENT_NORESERVE))
-			__btrfs_free_reserved_data_space(inode, state->start,
-							 len);
+			btrfs_free_reserved_data_space(inode, state->start,
+						       len);
 
 		__percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
 				     root->fs_info->delalloc_batch);
@@ -1993,8 +1993,8 @@ again:
 		goto again;
 	}
 
-	ret = __btrfs_delalloc_reserve_space(inode, page_start,
-					     PAGE_CACHE_SIZE);
+	ret = btrfs_delalloc_reserve_space(inode, page_start,
+					   PAGE_CACHE_SIZE);
 	if (ret) {
 		mapping_set_error(page->mapping, ret);
 		end_extent_writepage(page, ret, page_start, page_end);
@@ -4640,7 +4640,7 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
 	if ((offset & (blocksize - 1)) == 0 &&
 	    (!len || ((len & (blocksize - 1)) == 0)))
 		goto out;
-	ret = __btrfs_delalloc_reserve_space(inode,
+	ret = btrfs_delalloc_reserve_space(inode,
 			round_down(from, PAGE_CACHE_SIZE), PAGE_CACHE_SIZE);
 	if (ret)
 		goto out;
@@ -4648,7 +4648,7 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
 again:
 	page = find_or_create_page(mapping, index, mask);
 	if (!page) {
-		__btrfs_delalloc_release_space(inode,
+		btrfs_delalloc_release_space(inode,
 				round_down(from, PAGE_CACHE_SIZE),
 				PAGE_CACHE_SIZE);
 		ret = -ENOMEM;
@@ -4718,8 +4718,8 @@ again:
 
 out_unlock:
 	if (ret)
-		__btrfs_delalloc_release_space(inode, page_start,
-					       PAGE_CACHE_SIZE);
+		btrfs_delalloc_release_space(inode, page_start,
+					     PAGE_CACHE_SIZE);
 	unlock_page(page);
 	page_cache_release(page);
 out:
@@ -7650,7 +7650,7 @@ unlock:
 			spin_unlock(&BTRFS_I(inode)->lock);
 		}
 
-		__btrfs_free_reserved_data_space(inode, start, len);
+		btrfs_free_reserved_data_space(inode, start, len);
 		WARN_ON(dio_data->reserve < len);
 		dio_data->reserve -= len;
 		current->journal_info = dio_data;
@@ -8440,7 +8440,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 			mutex_unlock(&inode->i_mutex);
 			relock = true;
 		}
-		ret = __btrfs_delalloc_reserve_space(inode, offset, count);
+		ret = btrfs_delalloc_reserve_space(inode, offset, count);
 		if (ret)
 			goto out;
 		dio_data.outstanding_extents = div64_u64(count +
@@ -8469,11 +8469,11 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 		current->journal_info = NULL;
 		if (ret < 0 && ret != -EIOCBQUEUED) {
 			if (dio_data.reserve)
-				__btrfs_delalloc_release_space(inode, offset,
-						dio_data.reserve);
+				btrfs_delalloc_release_space(inode, offset,
+							     dio_data.reserve);
 		} else if (ret >= 0 && (size_t)ret < count)
-			__btrfs_delalloc_release_space(inode, offset,
-						       count - (size_t)ret);
+			btrfs_delalloc_release_space(inode, offset,
+						     count - (size_t)ret);
 	}
 out:
 	if (wakeup)
@@ -8684,8 +8684,8 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	page_start = page_offset(page);
 	page_end = page_start + PAGE_CACHE_SIZE - 1;
 
-	ret = __btrfs_delalloc_reserve_space(inode, page_start,
-					     PAGE_CACHE_SIZE);
+	ret = btrfs_delalloc_reserve_space(inode, page_start,
+					   PAGE_CACHE_SIZE);
 	if (!ret) {
 		ret = file_update_time(vma->vm_file);
 		reserved = 1;
@@ -8780,7 +8780,7 @@ out_unlock:
 	}
 	unlock_page(page);
 out:
-	__btrfs_delalloc_release_space(inode, page_start, PAGE_CACHE_SIZE);
+	btrfs_delalloc_release_space(inode, page_start, PAGE_CACHE_SIZE);
 out_noreserve:
 	sb_end_pagefault(inode->i_sb);
 	return ret;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 70732e6..7ed033a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1119,7 +1119,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
 
 	page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
 
-	ret = __btrfs_delalloc_reserve_space(inode,
+	ret = btrfs_delalloc_reserve_space(inode,
 			start_index << PAGE_CACHE_SHIFT,
 			page_cnt << PAGE_CACHE_SHIFT);
 	if (ret)
@@ -1210,7 +1210,7 @@ again:
 		spin_lock(&BTRFS_I(inode)->lock);
 		BTRFS_I(inode)->outstanding_extents++;
 		spin_unlock(&BTRFS_I(inode)->lock);
-		__btrfs_delalloc_release_space(inode,
+		btrfs_delalloc_release_space(inode,
 				start_index << PAGE_CACHE_SHIFT,
 				(page_cnt - i_done) << PAGE_CACHE_SHIFT);
 	}
@@ -1237,7 +1237,7 @@ out:
 		unlock_page(pages[i]);
 		page_cache_release(pages[i]);
 	}
-	__btrfs_delalloc_release_space(inode,
+	btrfs_delalloc_release_space(inode,
 			start_index << PAGE_CACHE_SHIFT,
 			page_cnt << PAGE_CACHE_SHIFT);
 	return ret;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 897a49d..7d5339d 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2035,7 +2035,7 @@ out:
 	return ret;
 }
 
-int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
+static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
 {
 	struct btrfs_root *quota_root;
 	struct btrfs_qgroup *qgroup;
@@ -2168,6 +2168,11 @@ out:
 	spin_unlock(&fs_info->qgroup_lock);
 }
 
+static inline void qgroup_free(struct btrfs_root *root, u64 num_bytes)
+{
+	return btrfs_qgroup_free_refroot(root->fs_info, root->objectid,
+					 num_bytes);
+}
 void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
 {
 	if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq)
@@ -2517,7 +2522,7 @@ int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len)
 			&changeset);
 	if (ret < 0)
 		goto cleanup;
-	ret = btrfs_qgroup_reserve(root, changeset.bytes_changed);
+	ret = qgroup_reserve(root, changeset.bytes_changed);
 	if (ret < 0)
 		goto cleanup;
 
@@ -2553,8 +2558,7 @@ static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len,
 		goto out;
 
 	if (free)
-		btrfs_qgroup_free(BTRFS_I(inode)->root,
-				  changeset.bytes_changed);
+		qgroup_free(BTRFS_I(inode)->root, changeset.bytes_changed);
 out:
 	ulist_free(changeset.range_changed);
 	return ret;
@@ -2604,7 +2608,7 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes)
 		return 0;
 
 	BUG_ON(num_bytes != round_down(num_bytes, root->nodesize));
-	ret = btrfs_qgroup_reserve(root, num_bytes);
+	ret = qgroup_reserve(root, num_bytes);
 	if (ret < 0)
 		return ret;
 	atomic_add(num_bytes, &root->qgroup_meta_rsv);
@@ -2621,7 +2625,7 @@ void btrfs_qgroup_free_meta_all(struct btrfs_root *root)
 	reserved = atomic_xchg(&root->qgroup_meta_rsv, 0);
 	if (reserved == 0)
 		return;
-	btrfs_qgroup_free(root, reserved);
+	qgroup_free(root, reserved);
 }
 
 void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes)
@@ -2632,5 +2636,5 @@ void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes)
 	BUG_ON(num_bytes != round_down(num_bytes, root->nodesize));
 	WARN_ON(atomic_read(&root->qgroup_meta_rsv) < num_bytes);
 	atomic_sub(num_bytes, &root->qgroup_meta_rsv);
-	btrfs_qgroup_free(root, num_bytes);
+	qgroup_free(root, num_bytes);
 }
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 7d1c87c..adb03da 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -71,15 +71,8 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
 int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
 			 struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
 			 struct btrfs_qgroup_inherit *inherit);
-int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes);
 void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
 			       u64 ref_root, u64 num_bytes);
-static inline void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
-{
-	return btrfs_qgroup_free_refroot(root->fs_info, root->objectid,
-					 num_bytes);
-}
-
 /*
  * TODO: Add proper trace point for it, as btrfs_qgroup_free() is
  * called by everywhere, can't provide good trace for delayed ref case.
@@ -89,7 +82,6 @@ static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info,
 {
 	btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes);
 }
-
 void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
 
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 490c9e2..a7dc456 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3034,8 +3034,8 @@ int prealloc_file_extent_cluster(struct inode *inode,
 	BUG_ON(cluster->start != cluster->boundary[0]);
 	mutex_lock(&inode->i_mutex);
 
-	ret = __btrfs_check_data_free_space(inode, cluster->start,
-					    cluster->end + 1 - cluster->start);
+	ret = btrfs_check_data_free_space(inode, cluster->start,
+					  cluster->end + 1 - cluster->start);
 	if (ret)
 		goto out;
 
@@ -3056,8 +3056,8 @@ int prealloc_file_extent_cluster(struct inode *inode,
 			break;
 		nr++;
 	}
-	__btrfs_free_reserved_data_space(inode, cluster->start,
-					 cluster->end + 1 - cluster->start);
+	btrfs_free_reserved_data_space(inode, cluster->start,
+				       cluster->end + 1 - cluster->start);
 out:
 	mutex_unlock(&inode->i_mutex);
 	return ret;
-- 
cgit v1.1


From 94ed938aba557aa798acf496f09afb289b619fcd Mon Sep 17 00:00:00 2001
From: Qu Wenruo <quwenruo@cn.fujitsu.com>
Date: Tue, 8 Sep 2015 17:25:56 +0800
Subject: btrfs: qgroup: Add handler for NOCOW and inline

For NOCOW and inline case, there will be no delayed_ref created for
them, so we should free their reserved data space at proper
time(finish_ordered_io for NOCOW and cow_file_inline for inline).

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent-tree.c |  7 ++++++-
 fs/btrfs/inode.c       | 15 +++++++++++++++
 2 files changed, 21 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3a23225..aaaa774 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4052,7 +4052,12 @@ int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
 	if (ret < 0)
 		return ret;
 
-	/* Use new btrfs_qgroup_reserve_data to reserve precious data space */
+	/*
+	 * Use new btrfs_qgroup_reserve_data to reserve precious data space
+	 *
+	 * TODO: Find a good method to avoid reserve data space for NOCOW
+	 * range, but don't impact performance on quota disable case.
+	 */
 	ret = btrfs_qgroup_reserve_data(inode, start, len);
 	return ret;
 }
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a34e5a9..ef5d369 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -310,6 +310,13 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
 	btrfs_delalloc_release_metadata(inode, end + 1 - start);
 	btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
 out:
+	/*
+	 * Don't forget to free the reserved space, as for inlined extent
+	 * it won't count as data extent, free them directly here.
+	 * And at reserve time, it's always aligned to page size, so
+	 * just free one page here.
+	 */
+	btrfs_qgroup_free_data(inode, 0, PAGE_CACHE_SIZE);
 	btrfs_free_path(path);
 	btrfs_end_transaction(trans, root);
 	return ret;
@@ -2838,6 +2845,14 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 
 	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
 		BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
+
+		/*
+		 * For mwrite(mmap + memset to write) case, we still reserve
+		 * space for NOCOW range.
+		 * As NOCOW won't cause a new delayed ref, just free the space
+		 */
+		btrfs_qgroup_free_data(inode, ordered_extent->file_offset,
+				       ordered_extent->len);
 		btrfs_ordered_update_i_size(inode, 0, ordered_extent);
 		if (nolock)
 			trans = btrfs_join_transaction_nolock(root);
-- 
cgit v1.1


From b9d0b38928e21560550bd3c1a278d6e004d3bde6 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <quwenruo@cn.fujitsu.com>
Date: Tue, 29 Sep 2015 10:35:16 +0800
Subject: btrfs: Add handler for invalidate page

For btrfs_invalidatepage() and its variant evict_inode_truncate_page(),
there will be pages don't reach disk.
In that case, their reserved space won't be release nor freed by
finish_ordered_io() nor delayed_ref handler.

So we must free their qgroup reserved space, or we will leaking reserved
space again.

So this will patch will call btrfs_qgroup_free_data() for
invalidatepage() and its variant evict_inode_truncate_page().

And due to the nature of new btrfs_qgroup_reserve/free_data() reserved
space will only be reserved or freed once, so for pages which are
already flushed to disk, their reserved space will be released and freed
by delayed_ref handler.

Double free won't be a problem.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/inode.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ef5d369..6670792 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5132,6 +5132,18 @@ static void evict_inode_truncate_pages(struct inode *inode)
 		spin_unlock(&io_tree->lock);
 
 		lock_extent_bits(io_tree, start, end, 0, &cached_state);
+
+		/*
+		 * If still has DELALLOC flag, the extent didn't reach disk,
+		 * and its reserved space won't be freed by delayed_ref.
+		 * So we need to free its reserved space here.
+		 * (Refer to comment in btrfs_invalidatepage, case 2)
+		 *
+		 * Note, end is the bytenr of last byte, so we need + 1 here.
+		 */
+		if (state->state & EXTENT_DELALLOC)
+			btrfs_qgroup_free_data(inode, start, end - start + 1);
+
 		clear_extent_bit(io_tree, start, end,
 				 EXTENT_LOCKED | EXTENT_DIRTY |
 				 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
@@ -8646,6 +8658,18 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
 		}
 	}
 
+	/*
+	 * Qgroup reserved space handler
+	 * Page here will be either
+	 * 1) Already written to disk
+	 *    In this case, its reserved space is released from data rsv map
+	 *    and will be freed by delayed_ref handler finally.
+	 *    So even we call qgroup_free_data(), it won't decrease reserved
+	 *    space.
+	 * 2) Not written to disk
+	 *    This means the reserved space should be freed here.
+	 */
+	btrfs_qgroup_free_data(inode, page_start, PAGE_CACHE_SIZE);
 	if (!inode_evicting) {
 		clear_extent_bit(tree, page_start, page_end,
 				 EXTENT_LOCKED | EXTENT_DIRTY |
-- 
cgit v1.1


From 81fb6f77a02678ddb0755c24f83c4ed5207da046 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <quwenruo@cn.fujitsu.com>
Date: Mon, 28 Sep 2015 16:57:53 +0800
Subject: btrfs: qgroup: Add new trace point for qgroup data reserve

Now each qgroup reserve for data will has its ftrace event for better
debugging.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/qgroup.c | 11 +++++++++--
 fs/btrfs/qgroup.h |  8 ++++++++
 2 files changed, 17 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 7d5339d..879343c 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2516,10 +2516,12 @@ int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len)
 
 	changeset.bytes_changed = 0;
 	changeset.range_changed = ulist_alloc(GFP_NOFS);
-
 	ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
 			start + len -1, EXTENT_QGROUP_RESERVED, GFP_NOFS,
 			&changeset);
+	trace_btrfs_qgroup_reserve_data(inode, start, len,
+					changeset.bytes_changed,
+					QGROUP_RESERVE);
 	if (ret < 0)
 		goto cleanup;
 	ret = qgroup_reserve(root, changeset.bytes_changed);
@@ -2544,6 +2546,7 @@ static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len,
 				       int free)
 {
 	struct extent_changeset changeset;
+	int trace_op = QGROUP_RELEASE;
 	int ret;
 
 	changeset.bytes_changed = 0;
@@ -2557,8 +2560,12 @@ static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len,
 	if (ret < 0)
 		goto out;
 
-	if (free)
+	if (free) {
 		qgroup_free(BTRFS_I(inode)->root, changeset.bytes_changed);
+		trace_op = QGROUP_FREE;
+	}
+	trace_btrfs_qgroup_release_data(inode, start, len,
+					changeset.bytes_changed, trace_op);
 out:
 	ulist_free(changeset.range_changed);
 	return ret;
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index adb03da..686b60f 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -33,6 +33,13 @@ struct btrfs_qgroup_extent_record {
 	struct ulist *old_roots;
 };
 
+/*
+ * For qgroup event trace points only
+ */
+#define QGROUP_RESERVE		(1<<0)
+#define QGROUP_RELEASE		(1<<1)
+#define QGROUP_FREE		(1<<2)
+
 int btrfs_quota_enable(struct btrfs_trans_handle *trans,
 		       struct btrfs_fs_info *fs_info);
 int btrfs_quota_disable(struct btrfs_trans_handle *trans,
@@ -81,6 +88,7 @@ static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info,
 						 u64 ref_root, u64 num_bytes)
 {
 	btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes);
+	trace_btrfs_qgroup_free_delayed_ref(ref_root, num_bytes);
 }
 void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
 
-- 
cgit v1.1


From 14524a846eb52c18438e9bd5eb8cf1431fd57b44 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <quwenruo@cn.fujitsu.com>
Date: Tue, 8 Sep 2015 17:22:44 +0800
Subject: btrfs: fallocate: Add support to accurate qgroup reserve

Now fallocate will do accurate qgroup reserve space check, unlike old
method, which will always reserve the whole length of the range.

With this patch, fallocate will:
1) Iterate the desired range and mark in data rsv map
   Only range which is going to be allocated will be recorded in data
   rsv map and reserve the space.
   For already allocated range (normal/prealloc extent) they will be
   skipped.
   Also, record the marked range into a new list for later use.

2) If 1) succeeded, do real file extent allocate.
   And at file extent allocation time, corresponding range will be
   removed from the range in data rsv map.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/file.c | 161 ++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 117 insertions(+), 44 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 47785c8..1243205 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2542,17 +2542,61 @@ out_only_mutex:
 	return err;
 }
 
+/* Helper structure to record which range is already reserved */
+struct falloc_range {
+	struct list_head list;
+	u64 start;
+	u64 len;
+};
+
+/*
+ * Helper function to add falloc range
+ *
+ * Caller should have locked the larger range of extent containing
+ * [start, len)
+ */
+static int add_falloc_range(struct list_head *head, u64 start, u64 len)
+{
+	struct falloc_range *prev = NULL;
+	struct falloc_range *range = NULL;
+
+	if (list_empty(head))
+		goto insert;
+
+	/*
+	 * As fallocate iterate by bytenr order, we only need to check
+	 * the last range.
+	 */
+	prev = list_entry(head->prev, struct falloc_range, list);
+	if (prev->start + prev->len == start) {
+		prev->len += len;
+		return 0;
+	}
+insert:
+	range = kmalloc(sizeof(*range), GFP_NOFS);
+	if (!range)
+		return -ENOMEM;
+	range->start = start;
+	range->len = len;
+	list_add_tail(&range->list, head);
+	return 0;
+}
+
 static long btrfs_fallocate(struct file *file, int mode,
 			    loff_t offset, loff_t len)
 {
 	struct inode *inode = file_inode(file);
 	struct extent_state *cached_state = NULL;
+	struct falloc_range *range;
+	struct falloc_range *tmp;
+	struct list_head reserve_list;
 	u64 cur_offset;
 	u64 last_byte;
 	u64 alloc_start;
 	u64 alloc_end;
 	u64 alloc_hint = 0;
 	u64 locked_end;
+	u64 actual_end = 0;
 	struct extent_map *em;
 	int blocksize = BTRFS_I(inode)->root->sectorsize;
 	int ret;
@@ -2568,14 +2612,12 @@ static long btrfs_fallocate(struct file *file, int mode,
 		return btrfs_punch_hole(inode, offset, len);
 
 	/*
-	 * Make sure we have enough space before we do the
-	 * allocation.
-	 * XXX: The behavior must be changed to do accurate check first
-	 * and then check data reserved space.
+	 * Only trigger disk allocation, don't trigger qgroup reserve
+	 *
+	 * For qgroup space, it will be checked later.
 	 */
-	ret = btrfs_check_data_free_space(inode, alloc_start,
-					  alloc_end - alloc_start);
-	if (ret)
+	ret = btrfs_alloc_data_chunk_ondemand(inode, alloc_end - alloc_start);
+	if (ret < 0)
 		return ret;
 
 	mutex_lock(&inode->i_mutex);
@@ -2583,6 +2625,13 @@ static long btrfs_fallocate(struct file *file, int mode,
 	if (ret)
 		goto out;
 
+	/*
+	 * TODO: Move these two operations after we have checked
+	 * accurate reserved space, or fallocate can still fail but
+	 * with page truncated or size expanded.
+	 *
+	 * But that's a minor problem and won't do much harm BTW.
+	 */
 	if (alloc_start > inode->i_size) {
 		ret = btrfs_cont_expand(inode, i_size_read(inode),
 					alloc_start);
@@ -2641,10 +2690,10 @@ static long btrfs_fallocate(struct file *file, int mode,
 		}
 	}
 
+	/* First, check if we exceed the qgroup limit */
+	INIT_LIST_HEAD(&reserve_list);
 	cur_offset = alloc_start;
 	while (1) {
-		u64 actual_end;
-
 		em = btrfs_get_extent(inode, NULL, 0, cur_offset,
 				      alloc_end - cur_offset, 0);
 		if (IS_ERR_OR_NULL(em)) {
@@ -2657,54 +2706,78 @@ static long btrfs_fallocate(struct file *file, int mode,
 		last_byte = min(extent_map_end(em), alloc_end);
 		actual_end = min_t(u64, extent_map_end(em), offset + len);
 		last_byte = ALIGN(last_byte, blocksize);
-
 		if (em->block_start == EXTENT_MAP_HOLE ||
 		    (cur_offset >= inode->i_size &&
 		     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
-			ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
-							last_byte - cur_offset,
-							1 << inode->i_blkbits,
-							offset + len,
-							&alloc_hint);
-		} else if (actual_end > inode->i_size &&
-			   !(mode & FALLOC_FL_KEEP_SIZE)) {
-			struct btrfs_trans_handle *trans;
-			struct btrfs_root *root = BTRFS_I(inode)->root;
-
-			/*
-			 * We didn't need to allocate any more space, but we
-			 * still extended the size of the file so we need to
-			 * update i_size and the inode item.
-			 */
-			trans = btrfs_start_transaction(root, 1);
-			if (IS_ERR(trans)) {
-				ret = PTR_ERR(trans);
-			} else {
-				inode->i_ctime = CURRENT_TIME;
-				i_size_write(inode, actual_end);
-				btrfs_ordered_update_i_size(inode, actual_end,
-							    NULL);
-				ret = btrfs_update_inode(trans, root, inode);
-				if (ret)
-					btrfs_end_transaction(trans, root);
-				else
-					ret = btrfs_end_transaction(trans,
-								    root);
+			ret = add_falloc_range(&reserve_list, cur_offset,
+					       last_byte - cur_offset);
+			if (ret < 0) {
+				free_extent_map(em);
+				break;
 			}
+			ret = btrfs_qgroup_reserve_data(inode, cur_offset,
+					last_byte - cur_offset);
+			if (ret < 0)
+				break;
 		}
 		free_extent_map(em);
-		if (ret < 0)
-			break;
-
 		cur_offset = last_byte;
-		if (cur_offset >= alloc_end) {
-			ret = 0;
+		if (cur_offset >= alloc_end)
 			break;
+	}
+
+	/*
+	 * If ret is still 0, means we're OK to fallocate.
+	 * Or just cleanup the list and exit.
+	 */
+	list_for_each_entry_safe(range, tmp, &reserve_list, list) {
+		if (!ret)
+			ret = btrfs_prealloc_file_range(inode, mode,
+					range->start,
+					range->len, 1 << inode->i_blkbits,
+					offset + len, &alloc_hint);
+		list_del(&range->list);
+		kfree(range);
+	}
+	if (ret < 0)
+		goto out_unlock;
+
+	if (actual_end > inode->i_size &&
+	    !(mode & FALLOC_FL_KEEP_SIZE)) {
+		struct btrfs_trans_handle *trans;
+		struct btrfs_root *root = BTRFS_I(inode)->root;
+
+		/*
+		 * We didn't need to allocate any more space, but we
+		 * still extended the size of the file so we need to
+		 * update i_size and the inode item.
+		 */
+		trans = btrfs_start_transaction(root, 1);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+		} else {
+			inode->i_ctime = CURRENT_TIME;
+			i_size_write(inode, actual_end);
+			btrfs_ordered_update_i_size(inode, actual_end, NULL);
+			ret = btrfs_update_inode(trans, root, inode);
+			if (ret)
+				btrfs_end_transaction(trans, root);
+			else
+				ret = btrfs_end_transaction(trans, root);
 		}
 	}
+out_unlock:
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
 			     &cached_state, GFP_NOFS);
 out:
+	/*
+	 * As we waited the extent range, the data_rsv_map must be empty
+	 * in the range, as written data range will be released from it.
+	 * And for prealloacted extent, it will also be released when
+	 * its metadata is written.
+	 * So this is completely used as cleanup.
+	 */
+	btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start);
 	mutex_unlock(&inode->i_mutex);
 	/* Let go of our reservation. */
 	btrfs_free_reserved_data_space(inode, alloc_start,
-- 
cgit v1.1


From 51773bec7ea352f3b9afa11ecfc72324c7977335 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <quwenruo@cn.fujitsu.com>
Date: Thu, 8 Oct 2015 18:19:37 +0800
Subject: btrfs: qgroup: Avoid calling btrfs_free_reserved_data_space in
 clear_bit_hook

In clear_bit_hook, qgroup reserved data is already handled quite well,
either released by finish_ordered_io or invalidatepage.

So calling btrfs_qgroup_free_data() here is completely meaningless, and
since btrfs_qgroup_free_data() will lock io_tree, so it can't be called
with io_tree lock hold.

This patch will add a new function
btrfs_free_reserved_data_space_noquota() for clear_bit_hook() to cease
the lockdep warning.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ctree.h       |  2 ++
 fs/btrfs/extent-tree.c | 28 ++++++++++++++++++----------
 fs/btrfs/inode.c       |  4 ++--
 3 files changed, 22 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0d0f5d2..2135b82 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3455,6 +3455,8 @@ enum btrfs_reserve_flush_enum {
 int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len);
 int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes);
 void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len);
+void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
+					    u64 len);
 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root);
 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index aaaa774..4660960 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4066,10 +4066,12 @@ int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
  * Called if we need to clear a data reservation for this inode
  * Normally in a error case.
  *
- * This one will handle the per-indoe data rsv map for accurate reserved
- * space framework.
+ * This one will *NOT* use accurate qgroup reserved space API, just for case
+ * which we can't sleep and is sure it won't affect qgroup reserved space.
+ * Like clear_bit_hook().
  */
-void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
+void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
+					    u64 len)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_space_info *data_sinfo;
@@ -4079,13 +4081,6 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
 	      round_down(start, root->sectorsize);
 	start = round_down(start, root->sectorsize);
 
-	/*
-	 * Free any reserved qgroup data space first
-	 * As it will alloc memory, we can't do it with data sinfo
-	 * spinlock hold.
-	 */
-	btrfs_qgroup_free_data(inode, start, len);
-
 	data_sinfo = root->fs_info->data_sinfo;
 	spin_lock(&data_sinfo->lock);
 	if (WARN_ON(data_sinfo->bytes_may_use < len))
@@ -4097,6 +4092,19 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
 	spin_unlock(&data_sinfo->lock);
 }
 
+/*
+ * Called if we need to clear a data reservation for this inode
+ * Normally in a error case.
+ *
+ * This one will handle the per-indoe data rsv map for accurate reserved
+ * space framework.
+ */
+void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
+{
+	btrfs_free_reserved_data_space_noquota(inode, start, len);
+	btrfs_qgroup_free_data(inode, start, len);
+}
+
 static void force_metadata_allocation(struct btrfs_fs_info *info)
 {
 	struct list_head *head = &info->space_info;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6670792..3229c13 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1776,8 +1776,8 @@ static void btrfs_clear_bit_hook(struct inode *inode,
 
 		if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
 		    && do_list && !(state->state & EXTENT_NORESERVE))
-			btrfs_free_reserved_data_space(inode, state->start,
-						       len);
+			btrfs_free_reserved_data_space_noquota(inode,
+					state->start, len);
 
 		__percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
 				     root->fs_info->delalloc_batch);
-- 
cgit v1.1


From 56fa9d0762ed17153c1bdff3c0aeeecbe522b504 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <quwenruo@cn.fujitsu.com>
Date: Tue, 13 Oct 2015 09:53:10 +0800
Subject: btrfs: qgroup: Check if qgroup reserved space leaked

Add check at btrfs_destroy_inode() time to detect qgroup reserved space
leak.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/inode.c  |  1 +
 fs/btrfs/qgroup.c | 32 ++++++++++++++++++++++++++++++++
 fs/btrfs/qgroup.h |  1 +
 3 files changed, 34 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3229c13..df6b93f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9108,6 +9108,7 @@ void btrfs_destroy_inode(struct inode *inode)
 			btrfs_put_ordered_extent(ordered);
 		}
 	}
+	btrfs_qgroup_check_reserved_leak(inode);
 	inode_tree_del(inode);
 	btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
 free:
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 879343c..158633c 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2645,3 +2645,35 @@ void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes)
 	atomic_sub(num_bytes, &root->qgroup_meta_rsv);
 	qgroup_free(root, num_bytes);
 }
+
+/*
+ * Check qgroup reserved space leaking, normally at destory inode
+ * time
+ */
+void btrfs_qgroup_check_reserved_leak(struct inode *inode)
+{
+	struct extent_changeset changeset;
+	struct ulist_node *unode;
+	struct ulist_iterator iter;
+	int ret;
+
+	changeset.bytes_changed = 0;
+	changeset.range_changed = ulist_alloc(GFP_NOFS);
+	if (WARN_ON(!changeset.range_changed))
+		return;
+
+	ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
+			EXTENT_QGROUP_RESERVED, GFP_NOFS, &changeset);
+
+	WARN_ON(ret < 0);
+	if (WARN_ON(changeset.bytes_changed)) {
+		ULIST_ITER_INIT(&iter);
+		while ((unode = ulist_next(changeset.range_changed, &iter))) {
+			btrfs_warn(BTRFS_I(inode)->root->fs_info,
+				"leaking qgroup reserved space, ino: %lu, start: %llu, end: %llu",
+				inode->i_ino, unode->val, unode->aux);
+		}
+		qgroup_free(BTRFS_I(inode)->root, changeset.bytes_changed);
+	}
+	ulist_free(changeset.range_changed);
+}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 686b60f..ecb2c14 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -105,4 +105,5 @@ int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len);
 int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes);
 void btrfs_qgroup_free_meta_all(struct btrfs_root *root);
 void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes);
+void btrfs_qgroup_check_reserved_leak(struct inode *inode);
 #endif /* __BTRFS_QGROUP__ */
-- 
cgit v1.1


From 161c3549b45aeef05451b6822d8aaaf39c7bedce Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Thu, 24 Sep 2015 16:17:39 -0400
Subject: Btrfs: change how we wait for pending ordered extents

We have a mechanism to make sure we don't lose updates for ordered extents that
were logged in the transaction that is currently running.  We add the ordered
extent to a transaction list and then the transaction waits on all the ordered
extents in that list.  However are substantially large file systems this list
can be extremely large, and can give us soft lockups, since the ordered extents
don't remove themselves from the list when they do complete.

To fix this we simply add a counter to the transaction that is incremented any
time we have a logged extent that needs to be completed in the current
transaction.  Then when the ordered extent finally completes it decrements the
per transaction counter and wakes up the transaction if we are the last ones.
This will eliminate the softlockup.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/disk-io.c      | 20 ----------------
 fs/btrfs/ordered-data.c | 64 ++++++++++++++++++++++++++++++++++++++-----------
 fs/btrfs/ordered-data.h |  2 ++
 fs/btrfs/transaction.c  | 34 +++++---------------------
 fs/btrfs/transaction.h  |  4 ++--
 5 files changed, 60 insertions(+), 64 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index bcbb596..dceabb1 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -4326,25 +4326,6 @@ again:
 	return 0;
 }
 
-static void btrfs_free_pending_ordered(struct btrfs_transaction *cur_trans,
-				       struct btrfs_fs_info *fs_info)
-{
-	struct btrfs_ordered_extent *ordered;
-
-	spin_lock(&fs_info->trans_lock);
-	while (!list_empty(&cur_trans->pending_ordered)) {
-		ordered = list_first_entry(&cur_trans->pending_ordered,
-					   struct btrfs_ordered_extent,
-					   trans_list);
-		list_del_init(&ordered->trans_list);
-		spin_unlock(&fs_info->trans_lock);
-
-		btrfs_put_ordered_extent(ordered);
-		spin_lock(&fs_info->trans_lock);
-	}
-	spin_unlock(&fs_info->trans_lock);
-}
-
 void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
 				   struct btrfs_root *root)
 {
@@ -4356,7 +4337,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
 	cur_trans->state = TRANS_STATE_UNBLOCKED;
 	wake_up(&root->fs_info->transaction_wait);
 
-	btrfs_free_pending_ordered(cur_trans, root->fs_info);
 	btrfs_destroy_delayed_inodes(root);
 	btrfs_assert_delayed_root_empty(root);
 
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 071005f..8c27292 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -490,15 +490,16 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
 
 	spin_lock_irq(&log->log_extents_lock[index]);
 	while (!list_empty(&log->logged_list[index])) {
+		struct inode *inode;
 		ordered = list_first_entry(&log->logged_list[index],
 					   struct btrfs_ordered_extent,
 					   log_list);
 		list_del_init(&ordered->log_list);
+		inode = ordered->inode;
 		spin_unlock_irq(&log->log_extents_lock[index]);
 
 		if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
 		    !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
-			struct inode *inode = ordered->inode;
 			u64 start = ordered->file_offset;
 			u64 end = ordered->file_offset + ordered->len - 1;
 
@@ -509,20 +510,25 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
 						   &ordered->flags));
 
 		/*
-		 * If our ordered extent completed it means it updated the
-		 * fs/subvol and csum trees already, so no need to make the
-		 * current transaction's commit wait for it, as we end up
-		 * holding memory unnecessarily and delaying the inode's iput
-		 * until the transaction commit (we schedule an iput for the
-		 * inode when the ordered extent's refcount drops to 0), which
-		 * prevents it from being evictable until the transaction
-		 * commits.
+		 * In order to keep us from losing our ordered extent
+		 * information when committing the transaction we have to make
+		 * sure that any logged extents are completed when we go to
+		 * commit the transaction.  To do this we simply increase the
+		 * current transactions pending_ordered counter and decrement it
+		 * when the ordered extent completes.
 		 */
-		if (test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags))
-			btrfs_put_ordered_extent(ordered);
-		else
-			list_add_tail(&ordered->trans_list, &trans->ordered);
-
+		if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
+			struct btrfs_ordered_inode_tree *tree;
+
+			tree = &BTRFS_I(inode)->ordered_tree;
+			spin_lock_irq(&tree->lock);
+			if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
+				set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
+				atomic_inc(&trans->transaction->pending_ordered);
+			}
+			spin_unlock_irq(&tree->lock);
+		}
+		btrfs_put_ordered_extent(ordered);
 		spin_lock_irq(&log->log_extents_lock[index]);
 	}
 	spin_unlock_irq(&log->log_extents_lock[index]);
@@ -584,6 +590,7 @@ void btrfs_remove_ordered_extent(struct inode *inode,
 	struct btrfs_ordered_inode_tree *tree;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct rb_node *node;
+	bool dec_pending_ordered = false;
 
 	tree = &BTRFS_I(inode)->ordered_tree;
 	spin_lock_irq(&tree->lock);
@@ -593,8 +600,37 @@ void btrfs_remove_ordered_extent(struct inode *inode,
 	if (tree->last == node)
 		tree->last = NULL;
 	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
+	if (test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags))
+		dec_pending_ordered = true;
 	spin_unlock_irq(&tree->lock);
 
+	/*
+	 * The current running transaction is waiting on us, we need to let it
+	 * know that we're complete and wake it up.
+	 */
+	if (dec_pending_ordered) {
+		struct btrfs_transaction *trans;
+
+		/*
+		 * The checks for trans are just a formality, it should be set,
+		 * but if it isn't we don't want to deref/assert under the spin
+		 * lock, so be nice and check if trans is set, but ASSERT() so
+		 * if it isn't set a developer will notice.
+		 */
+		spin_lock(&root->fs_info->trans_lock);
+		trans = root->fs_info->running_transaction;
+		if (trans)
+			atomic_inc(&trans->use_count);
+		spin_unlock(&root->fs_info->trans_lock);
+
+		ASSERT(trans);
+		if (trans) {
+			if (atomic_dec_and_test(&trans->pending_ordered))
+				wake_up(&trans->pending_wait);
+			btrfs_put_transaction(trans);
+		}
+	}
+
 	spin_lock(&root->ordered_extent_lock);
 	list_del_init(&entry->root_extent_list);
 	root->nr_ordered_extents--;
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 7176cc0..23c9605 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -73,6 +73,8 @@ struct btrfs_ordered_sum {
 
 #define BTRFS_ORDERED_LOGGED 10 /* Set when we've waited on this ordered extent
 				 * in the logging code. */
+#define BTRFS_ORDERED_PENDING 11 /* We are waiting for this ordered extent to
+				  * complete in the current transaction. */
 struct btrfs_ordered_extent {
 	/* logical offset in the file */
 	u64 file_offset;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 9354e7a..68a56c3 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -232,6 +232,7 @@ loop:
 	extwriter_counter_init(cur_trans, type);
 	init_waitqueue_head(&cur_trans->writer_wait);
 	init_waitqueue_head(&cur_trans->commit_wait);
+	init_waitqueue_head(&cur_trans->pending_wait);
 	cur_trans->state = TRANS_STATE_RUNNING;
 	/*
 	 * One for this trans handle, one so it will live on until we
@@ -239,6 +240,7 @@ loop:
 	 */
 	atomic_set(&cur_trans->use_count, 2);
 	cur_trans->have_free_bgs = 0;
+	atomic_set(&cur_trans->pending_ordered, 0);
 	cur_trans->start_time = get_seconds();
 	cur_trans->dirty_bg_run = 0;
 
@@ -266,7 +268,6 @@ loop:
 	INIT_LIST_HEAD(&cur_trans->pending_snapshots);
 	INIT_LIST_HEAD(&cur_trans->pending_chunks);
 	INIT_LIST_HEAD(&cur_trans->switch_commits);
-	INIT_LIST_HEAD(&cur_trans->pending_ordered);
 	INIT_LIST_HEAD(&cur_trans->dirty_bgs);
 	INIT_LIST_HEAD(&cur_trans->io_bgs);
 	INIT_LIST_HEAD(&cur_trans->dropped_roots);
@@ -551,7 +552,6 @@ again:
 	h->can_flush_pending_bgs = true;
 	INIT_LIST_HEAD(&h->qgroup_ref_list);
 	INIT_LIST_HEAD(&h->new_bgs);
-	INIT_LIST_HEAD(&h->ordered);
 
 	smp_mb();
 	if (cur_trans->state >= TRANS_STATE_BLOCKED &&
@@ -784,12 +784,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 	if (!list_empty(&trans->new_bgs))
 		btrfs_create_pending_block_groups(trans, root);
 
-	if (!list_empty(&trans->ordered)) {
-		spin_lock(&info->trans_lock);
-		list_splice_init(&trans->ordered, &cur_trans->pending_ordered);
-		spin_unlock(&info->trans_lock);
-	}
-
 	trans->delayed_ref_updates = 0;
 	if (!trans->sync) {
 		must_run_delayed_refs =
@@ -1788,25 +1782,10 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
 }
 
 static inline void
-btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans,
-			   struct btrfs_fs_info *fs_info)
+btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans)
 {
-	struct btrfs_ordered_extent *ordered;
-
-	spin_lock(&fs_info->trans_lock);
-	while (!list_empty(&cur_trans->pending_ordered)) {
-		ordered = list_first_entry(&cur_trans->pending_ordered,
-					   struct btrfs_ordered_extent,
-					   trans_list);
-		list_del_init(&ordered->trans_list);
-		spin_unlock(&fs_info->trans_lock);
-
-		wait_event(ordered->wait, test_bit(BTRFS_ORDERED_COMPLETE,
-						   &ordered->flags));
-		btrfs_put_ordered_extent(ordered);
-		spin_lock(&fs_info->trans_lock);
-	}
-	spin_unlock(&fs_info->trans_lock);
+	wait_event(cur_trans->pending_wait,
+		   atomic_read(&cur_trans->pending_ordered) == 0);
 }
 
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
@@ -1890,7 +1869,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	}
 
 	spin_lock(&root->fs_info->trans_lock);
-	list_splice_init(&trans->ordered, &cur_trans->pending_ordered);
 	if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
 		spin_unlock(&root->fs_info->trans_lock);
 		atomic_inc(&cur_trans->use_count);
@@ -1949,7 +1927,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 	btrfs_wait_delalloc_flush(root->fs_info);
 
-	btrfs_wait_pending_ordered(cur_trans, root->fs_info);
+	btrfs_wait_pending_ordered(cur_trans);
 
 	btrfs_scrub_pause(root);
 	/*
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index db6bfd9..bf7b1dd 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -46,6 +46,7 @@ struct btrfs_transaction {
 	 */
 	atomic_t num_writers;
 	atomic_t use_count;
+	atomic_t pending_ordered;
 
 	/*
 	 * true if there is free bgs operations in this transaction
@@ -59,9 +60,9 @@ struct btrfs_transaction {
 	unsigned long start_time;
 	wait_queue_head_t writer_wait;
 	wait_queue_head_t commit_wait;
+	wait_queue_head_t pending_wait;
 	struct list_head pending_snapshots;
 	struct list_head pending_chunks;
-	struct list_head pending_ordered;
 	struct list_head switch_commits;
 	struct list_head dirty_bgs;
 	struct list_head io_bgs;
@@ -129,7 +130,6 @@ struct btrfs_trans_handle {
 	 */
 	struct btrfs_root *root;
 	struct seq_list delayed_ref_elem;
-	struct list_head ordered;
 	struct list_head qgroup_ref_list;
 	struct list_head new_bgs;
 };
-- 
cgit v1.1


From d9ee522ba3ab51b7e3c6dfcf3743216371bc810f Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Mon, 5 Oct 2015 10:35:29 -0400
Subject: Btrfs: fix qgroup sanity tests

With my changes to allow us to find old roots when resolving indirect refs I
introduced a regression to the sanity tests.  Since we don't really care to go
down into the fs roots we just need to have the old behavior of returning ENOENT
for dummy roots for the sanity tests.  In the future if we want to get fancy we
can populate the test fs trees with the references as well.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/backref.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index ecbc63d..2adc152a 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -362,6 +362,12 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
 		goto out;
 	}
 
+	if (btrfs_test_is_dummy_root(root)) {
+		srcu_read_unlock(&fs_info->subvol_srcu, index);
+		ret = -ENOENT;
+		goto out;
+	}
+
 	if (path->search_commit_root)
 		root_level = btrfs_header_level(root->commit_root);
 	else if (time_seq == (u64)-1)
-- 
cgit v1.1


From d0bd456074dca089579818312da7cbe726ad2ff9 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Wed, 23 Sep 2015 14:54:14 -0400
Subject: Btrfs: add fragment=* debug mount option

In tracking down these weird bitmap problems it was helpful to artificially
create an extremely fragmented file system.  These mount options let us either
fragment data or metadata or both.  With these options I could reproduce all
sorts of weird latencies and hangs that occur under extreme fragmentation and
get them fixed.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ctree.h                  | 14 +++++++
 fs/btrfs/extent-tree.c            | 82 ++++++++++++++++++++++++++++++++++++---
 fs/btrfs/free-space-cache.c       |  9 ++++-
 fs/btrfs/super.c                  | 30 ++++++++++++++
 fs/btrfs/tests/free-space-tests.c | 22 ++++++++++-
 5 files changed, 150 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 49bc792..1638423 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2145,6 +2145,8 @@ struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
 #define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR	(1 << 22)
 #define BTRFS_MOUNT_RESCAN_UUID_TREE	(1 << 23)
+#define BTRFS_MOUNT_FRAGMENT_DATA	(1 << 24)
+#define BTRFS_MOUNT_FRAGMENT_METADATA	(1 << 25)
 
 #define BTRFS_DEFAULT_COMMIT_INTERVAL	(30)
 #define BTRFS_DEFAULT_MAX_INLINE	(8192)
@@ -2169,6 +2171,18 @@ struct btrfs_ioctl_defrag_range_args {
 	btrfs_clear_opt(root->fs_info->mount_opt, opt);			\
 }
 
+#ifdef CONFIG_BTRFS_DEBUG
+static inline int
+btrfs_should_fragment_free_space(struct btrfs_root *root,
+				 struct btrfs_block_group_cache *block_group)
+{
+	return (btrfs_test_opt(root, FRAGMENT_METADATA) &&
+		block_group->flags & BTRFS_BLOCK_GROUP_METADATA) ||
+	       (btrfs_test_opt(root, FRAGMENT_DATA) &&
+		block_group->flags &  BTRFS_BLOCK_GROUP_DATA);
+}
+#endif
+
 /*
  * Requests for changes that need to be done during transaction commit.
  *
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 2df4bc7..0e32abf 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -332,6 +332,27 @@ static void put_caching_control(struct btrfs_caching_control *ctl)
 		kfree(ctl);
 }
 
+#ifdef CONFIG_BTRFS_DEBUG
+static void fragment_free_space(struct btrfs_root *root,
+				struct btrfs_block_group_cache *block_group)
+{
+	u64 start = block_group->key.objectid;
+	u64 len = block_group->key.offset;
+	u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
+		root->nodesize : root->sectorsize;
+	u64 step = chunk << 1;
+
+	while (len > chunk) {
+		btrfs_remove_free_space(block_group, start, chunk);
+		start += step;
+		if (len < step)
+			len = 0;
+		else
+			len -= step;
+	}
+}
+#endif
+
 /*
  * this is only called by cache_block_group, since we could have freed extents
  * we need to check the pinned_extents for any extents that can't be used yet
@@ -388,6 +409,7 @@ static noinline void caching_thread(struct btrfs_work *work)
 	u64 last = 0;
 	u32 nritems;
 	int ret = -ENOMEM;
+	bool wakeup = true;
 
 	caching_ctl = container_of(work, struct btrfs_caching_control, work);
 	block_group = caching_ctl->block_group;
@@ -400,6 +422,15 @@ static noinline void caching_thread(struct btrfs_work *work)
 
 	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
 
+#ifdef CONFIG_BTRFS_DEBUG
+	/*
+	 * If we're fragmenting we don't want to make anybody think we can
+	 * allocate from this block group until we've had a chance to fragment
+	 * the free space.
+	 */
+	if (btrfs_should_fragment_free_space(extent_root, block_group))
+		wakeup = false;
+#endif
 	/*
 	 * We don't want to deadlock with somebody trying to allocate a new
 	 * extent for the extent root while also trying to search the extent
@@ -441,7 +472,8 @@ next:
 
 			if (need_resched() ||
 			    rwsem_is_contended(&fs_info->commit_root_sem)) {
-				caching_ctl->progress = last;
+				if (wakeup)
+					caching_ctl->progress = last;
 				btrfs_release_path(path);
 				up_read(&fs_info->commit_root_sem);
 				mutex_unlock(&caching_ctl->mutex);
@@ -464,7 +496,8 @@ next:
 			key.offset = 0;
 			key.type = BTRFS_EXTENT_ITEM_KEY;
 
-			caching_ctl->progress = last;
+			if (wakeup)
+				caching_ctl->progress = last;
 			btrfs_release_path(path);
 			goto next;
 		}
@@ -491,7 +524,8 @@ next:
 
 			if (total_found > (1024 * 1024 * 2)) {
 				total_found = 0;
-				wake_up(&caching_ctl->wait);
+				if (wakeup)
+					wake_up(&caching_ctl->wait);
 			}
 		}
 		path->slots[0]++;
@@ -501,13 +535,27 @@ next:
 	total_found += add_new_free_space(block_group, fs_info, last,
 					  block_group->key.objectid +
 					  block_group->key.offset);
-	caching_ctl->progress = (u64)-1;
-
 	spin_lock(&block_group->lock);
 	block_group->caching_ctl = NULL;
 	block_group->cached = BTRFS_CACHE_FINISHED;
 	spin_unlock(&block_group->lock);
 
+#ifdef CONFIG_BTRFS_DEBUG
+	if (btrfs_should_fragment_free_space(extent_root, block_group)) {
+		u64 bytes_used;
+
+		spin_lock(&block_group->space_info->lock);
+		spin_lock(&block_group->lock);
+		bytes_used = block_group->key.offset -
+			btrfs_block_group_used(&block_group->item);
+		block_group->space_info->bytes_used += bytes_used >> 1;
+		spin_unlock(&block_group->lock);
+		spin_unlock(&block_group->space_info->lock);
+		fragment_free_space(extent_root, block_group);
+	}
+#endif
+
+	caching_ctl->progress = (u64)-1;
 err:
 	btrfs_free_path(path);
 	up_read(&fs_info->commit_root_sem);
@@ -607,6 +655,22 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
 			}
 		}
 		spin_unlock(&cache->lock);
+#ifdef CONFIG_BTRFS_DEBUG
+		if (ret == 1 &&
+		    btrfs_should_fragment_free_space(fs_info->extent_root,
+						     cache)) {
+			u64 bytes_used;
+
+			spin_lock(&cache->space_info->lock);
+			spin_lock(&cache->lock);
+			bytes_used = cache->key.offset -
+				btrfs_block_group_used(&cache->item);
+			cache->space_info->bytes_used += bytes_used >> 1;
+			spin_unlock(&cache->lock);
+			spin_unlock(&cache->space_info->lock);
+			fragment_free_space(fs_info->extent_root, cache);
+		}
+#endif
 		mutex_unlock(&caching_ctl->mutex);
 
 		wake_up(&caching_ctl->wait);
@@ -9624,6 +9688,14 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 
 	free_excluded_extents(root, cache);
 
+#ifdef CONFIG_BTRFS_DEBUG
+	if (btrfs_should_fragment_free_space(root, cache)) {
+		u64 new_bytes_used = size - bytes_used;
+
+		bytes_used += new_bytes_used >> 1;
+		fragment_free_space(root, cache);
+	}
+#endif
 	/*
 	 * Call to ensure the corresponding space_info object is created and
 	 * assigned to our block group, but don't update its counters just yet.
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 5a9431d..c0eb84e 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1951,12 +1951,19 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
 		      struct btrfs_free_space *info)
 {
 	struct btrfs_block_group_cache *block_group = ctl->private;
+	bool forced = false;
+
+#ifdef CONFIG_BTRFS_DEBUG
+	if (btrfs_should_fragment_free_space(block_group->fs_info->extent_root,
+					     block_group))
+		forced = true;
+#endif
 
 	/*
 	 * If we are below the extents threshold then we can add this as an
 	 * extent, and don't have to deal with the bitmap
 	 */
-	if (ctl->free_extents < ctl->extents_thresh) {
+	if (!forced && ctl->free_extents < ctl->extents_thresh) {
 		/*
 		 * If this block group has some small extents we don't want to
 		 * use up all of our free slots in the cache with them, we want
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index b23d49d..24154e4 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -303,6 +303,9 @@ enum {
 	Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard,
 	Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow,
 	Opt_datasum, Opt_treelog, Opt_noinode_cache,
+#ifdef CONFIG_BTRFS_DEBUG
+	Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
+#endif
 	Opt_err,
 };
 
@@ -355,6 +358,11 @@ static match_table_t tokens = {
 	{Opt_rescan_uuid_tree, "rescan_uuid_tree"},
 	{Opt_fatal_errors, "fatal_errors=%s"},
 	{Opt_commit_interval, "commit=%d"},
+#ifdef CONFIG_BTRFS_DEBUG
+	{Opt_fragment_data, "fragment=data"},
+	{Opt_fragment_metadata, "fragment=metadata"},
+	{Opt_fragment_all, "fragment=all"},
+#endif
 	{Opt_err, NULL},
 };
 
@@ -721,6 +729,22 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 				info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
 			}
 			break;
+#ifdef CONFIG_BTRFS_DEBUG
+		case Opt_fragment_all:
+			btrfs_info(root->fs_info, "fragmenting all space");
+			btrfs_set_opt(info->mount_opt, FRAGMENT_DATA);
+			btrfs_set_opt(info->mount_opt, FRAGMENT_METADATA);
+			break;
+		case Opt_fragment_metadata:
+			btrfs_info(root->fs_info, "fragmenting metadata");
+			btrfs_set_opt(info->mount_opt,
+				      FRAGMENT_METADATA);
+			break;
+		case Opt_fragment_data:
+			btrfs_info(root->fs_info, "fragmenting data");
+			btrfs_set_opt(info->mount_opt, FRAGMENT_DATA);
+			break;
+#endif
 		case Opt_err:
 			btrfs_info(root->fs_info, "unrecognized mount option '%s'", p);
 			ret = -EINVAL;
@@ -1172,6 +1196,12 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
 		seq_puts(seq, ",fatal_errors=panic");
 	if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL)
 		seq_printf(seq, ",commit=%d", info->commit_interval);
+#ifdef CONFIG_BTRFS_DEBUG
+	if (btrfs_test_opt(root, FRAGMENT_DATA))
+		seq_puts(seq, ",fragment=data");
+	if (btrfs_test_opt(root, FRAGMENT_METADATA))
+		seq_puts(seq, ",fragment=metadata");
+#endif
 	seq_printf(seq, ",subvolid=%llu",
 		  BTRFS_I(d_inode(dentry))->root->root_key.objectid);
 	seq_puts(seq, ",subvol=");
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index 2299bfd..c8c3d70 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -19,6 +19,7 @@
 #include <linux/slab.h>
 #include "btrfs-tests.h"
 #include "../ctree.h"
+#include "../disk-io.h"
 #include "../free-space-cache.h"
 
 #define BITS_PER_BITMAP		(PAGE_CACHE_SIZE * 8)
@@ -35,6 +36,12 @@ static struct btrfs_block_group_cache *init_test_block_group(void)
 		kfree(cache);
 		return NULL;
 	}
+	cache->fs_info = btrfs_alloc_dummy_fs_info();
+	if (!cache->fs_info) {
+		kfree(cache->free_space_ctl);
+		kfree(cache);
+		return NULL;
+	}
 
 	cache->key.objectid = 0;
 	cache->key.offset = 1024 * 1024 * 1024;
@@ -879,7 +886,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
 int btrfs_test_free_space_cache(void)
 {
 	struct btrfs_block_group_cache *cache;
-	int ret;
+	struct btrfs_root *root = NULL;
+	int ret = -ENOMEM;
 
 	test_msg("Running btrfs free space cache tests\n");
 
@@ -889,6 +897,17 @@ int btrfs_test_free_space_cache(void)
 		return 0;
 	}
 
+	root = btrfs_alloc_dummy_root();
+	if (!root)
+		goto out;
+
+	root->fs_info = btrfs_alloc_dummy_fs_info();
+	if (!root->fs_info)
+		goto out;
+
+	root->fs_info->extent_root = root;
+	cache->fs_info = root->fs_info;
+
 	ret = test_extents(cache);
 	if (ret)
 		goto out;
@@ -904,6 +923,7 @@ out:
 	__btrfs_remove_free_space_cache(cache->free_space_ctl);
 	kfree(cache->free_space_ctl);
 	kfree(cache);
+	btrfs_free_dummy_root(root);
 	test_msg("Free space cache tests finished\n");
 	return ret;
 }
-- 
cgit v1.1


From 0b670dc44c91bd1e5fac15b5ac4c98c8bd255ca2 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Wed, 23 Sep 2015 17:11:16 -0400
Subject: Btrfs: fix prealloc under heavy fragmentation conditions

If we are heavily fragmented we will continually try to prealloc the largest
extent size we can every time we call btrfs_reserve_extent.  This can be very
expensive when we are heavily fragmented, burning lots of CPU cycles and loops
through the allocator.  So instead notice when we get a smaller chunk from the
allocator than what we specified and use this as the new maximum size we try to
allocate.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/inode.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5ce55f6..a3e0783 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9687,6 +9687,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
 	u64 cur_offset = start;
 	u64 i_size;
 	u64 cur_bytes;
+	u64 last_alloc = (u64)-1;
 	int ret = 0;
 	bool own_trans = true;
 
@@ -9703,6 +9704,13 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
 
 		cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
 		cur_bytes = max(cur_bytes, min_size);
+		/*
+		 * If we are severely fragmented we could end up with really
+		 * small allocations, so if the allocator is returning small
+		 * chunks lets make its job easier by only searching for those
+		 * sized chunks.
+		 */
+		cur_bytes = min(cur_bytes, last_alloc);
 		ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
 					   *alloc_hint, &ins, 1, 0);
 		if (ret) {
@@ -9711,6 +9719,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
 			break;
 		}
 
+		last_alloc = ins.offset;
 		ret = insert_reserved_file_extent(trans, inode,
 						  cur_offset, ins.objectid,
 						  ins.offset, ins.offset,
-- 
cgit v1.1


From 3204d33cda40d9bc97f257c441225d3713916661 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Thu, 24 Sep 2015 10:46:10 -0400
Subject: Btrfs: add a flags field to btrfs_transaction

I want to set some per transaction flags, so instead of adding yet another int
lets just convert the current two int indicators to flags and add a flags field
for future use.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent-tree.c |  5 +++--
 fs/btrfs/transaction.c | 18 ++++++++----------
 fs/btrfs/transaction.h |  9 ++++-----
 fs/btrfs/volumes.c     |  2 +-
 4 files changed, 16 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0e32abf..e0dfe35 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4066,7 +4066,8 @@ commit_trans:
 			if (IS_ERR(trans))
 				return PTR_ERR(trans);
 			if (have_pinned_space >= 0 ||
-			    trans->transaction->have_free_bgs ||
+			    test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
+				     &trans->transaction->flags) ||
 			    need_commit > 0) {
 				ret = btrfs_commit_transaction(trans, root);
 				if (ret)
@@ -8938,7 +8939,7 @@ again:
 	 * back off and let this transaction commit
 	 */
 	mutex_lock(&root->fs_info->ro_block_group_mutex);
-	if (trans->transaction->dirty_bg_run) {
+	if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
 		u64 transid = trans->transid;
 
 		mutex_unlock(&root->fs_info->ro_block_group_mutex);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 68a56c3..222f9a9 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -239,10 +239,9 @@ loop:
 	 * commit the transaction.
 	 */
 	atomic_set(&cur_trans->use_count, 2);
-	cur_trans->have_free_bgs = 0;
 	atomic_set(&cur_trans->pending_ordered, 0);
+	cur_trans->flags = 0;
 	cur_trans->start_time = get_seconds();
-	cur_trans->dirty_bg_run = 0;
 
 	memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs));
 
@@ -1837,7 +1836,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 		return ret;
 	}
 
-	if (!cur_trans->dirty_bg_run) {
+	if (!test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) {
 		int run_it = 0;
 
 		/* this mutex is also taken before trying to set
@@ -1846,18 +1845,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 		 * after a extents from that block group have been
 		 * allocated for cache files.  btrfs_set_block_group_ro
 		 * will wait for the transaction to commit if it
-		 * finds dirty_bg_run = 1
+		 * finds BTRFS_TRANS_DIRTY_BG_RUN set.
 		 *
-		 * The dirty_bg_run flag is also used to make sure only
-		 * one process starts all the block group IO.  It wouldn't
+		 * The BTRFS_TRANS_DIRTY_BG_RUN flag is also used to make sure
+		 * only one process starts all the block group IO.  It wouldn't
 		 * hurt to have more than one go through, but there's no
 		 * real advantage to it either.
 		 */
 		mutex_lock(&root->fs_info->ro_block_group_mutex);
-		if (!cur_trans->dirty_bg_run) {
+		if (!test_and_set_bit(BTRFS_TRANS_DIRTY_BG_RUN,
+				      &cur_trans->flags))
 			run_it = 1;
-			cur_trans->dirty_bg_run = 1;
-		}
 		mutex_unlock(&root->fs_info->ro_block_group_mutex);
 
 		if (run_it)
@@ -2127,7 +2125,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 	btrfs_finish_extent_commit(trans, root);
 
-	if (cur_trans->have_free_bgs)
+	if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags))
 		btrfs_clear_space_info_full(root->fs_info);
 
 	root->fs_info->last_trans_committed = cur_trans->transid;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index bf7b1dd..109b8b7 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -32,6 +32,9 @@ enum btrfs_trans_state {
 	TRANS_STATE_MAX			= 6,
 };
 
+#define BTRFS_TRANS_HAVE_FREE_BGS	0
+#define BTRFS_TRANS_DIRTY_BG_RUN	1
+
 struct btrfs_transaction {
 	u64 transid;
 	/*
@@ -48,10 +51,7 @@ struct btrfs_transaction {
 	atomic_t use_count;
 	atomic_t pending_ordered;
 
-	/*
-	 * true if there is free bgs operations in this transaction
-	 */
-	int have_free_bgs;
+	unsigned long flags;
 
 	/* Be protected by fs_info->trans_lock when we want to change it. */
 	enum btrfs_trans_state state;
@@ -81,7 +81,6 @@ struct btrfs_transaction {
 	spinlock_t dropped_roots_lock;
 	struct btrfs_delayed_ref_root delayed_refs;
 	int aborted;
-	int dirty_bg_run;
 };
 
 #define __TRANS_FREEZABLE	(1U << 0)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e875b6c..f86d838 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1462,7 +1462,7 @@ again:
 		btrfs_std_error(root->fs_info, ret,
 			    "Failed to remove dev extent item");
 	} else {
-		trans->transaction->have_free_bgs = 1;
+		set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
 	}
 out:
 	btrfs_free_path(path);
-- 
cgit v1.1


From 36af4e0737f6aa494e43497a5a34588a1d5cb12f Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Fri, 25 Sep 2015 16:13:11 -0400
Subject: Btrfs: don't loop in allocator for space cache

The space cache needs to have contiguous allocations, and the allocator tries to
make allocations by reducing the amount of bytes requested and re-searching.
But this just makes us waste time when we are very fragmented, so if we can't
find our space just exit, don't bother trying to search again.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent-tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e0dfe35..d6b5ef4 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -7337,7 +7337,7 @@ int btrfs_reserve_extent(struct btrfs_root *root,
 			 u64 empty_size, u64 hint_byte,
 			 struct btrfs_key *ins, int is_data, int delalloc)
 {
-	bool final_tried = false;
+	bool final_tried = num_bytes == min_alloc_size;
 	u64 flags;
 	int ret;
 
-- 
cgit v1.1


From 4f4db2174d8d6cdc093cbb79d17fbfe0f4d9fbde Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Tue, 29 Sep 2015 11:40:47 -0400
Subject: Btrfs: keep track of max_extent_size per space_info

When we are heavily fragmented we can induce a lot of latency trying to make an
allocation happen that is simply not going to happen.  Thankfully we keep track
of our max_extent_size when going through the allocator, so if we get to the
point where we are exiting find_free_extent with ENOSPC then set our
space_info->max_extent_size so we can keep future allocations from having to pay
this cost.  We reset the max_extent_size whenever we release pinned bytes back
into this space info so we can redo all the work.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ctree.h       |  4 ++++
 fs/btrfs/extent-tree.c | 31 ++++++++++++++++++++++++++++++-
 2 files changed, 34 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1638423..78a8f56 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1154,6 +1154,10 @@ struct btrfs_space_info {
 				   delalloc/allocations */
 	u64 bytes_readonly;	/* total bytes that are read only */
 
+	u64 max_extent_size;	/* This will hold the maximum extent size of
+				   the space info if we had an ENOSPC in the
+				   allocator. */
+
 	unsigned int full:1;	/* indicates that we cannot allocate any more
 				   chunks for this space */
 	unsigned int chunk_alloc:1;	/* set if we are allocating a chunk */
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d6b5ef4..febb5bc 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3810,6 +3810,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	found->bytes_readonly = 0;
 	found->bytes_may_use = 0;
 	found->full = 0;
+	found->max_extent_size = 0;
 	found->force_alloc = CHUNK_ALLOC_NO_FORCE;
 	found->chunk_alloc = 0;
 	found->flush = 0;
@@ -6158,6 +6159,7 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
 		spin_lock(&cache->lock);
 		cache->pinned -= len;
 		space_info->bytes_pinned -= len;
+		space_info->max_extent_size = 0;
 		percpu_counter_add(&space_info->total_bytes_pinned, -len);
 		if (cache->ro) {
 			space_info->bytes_readonly += len;
@@ -6915,6 +6917,29 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
 	}
 
 	/*
+	 * If our free space is heavily fragmented we may not be able to make
+	 * big contiguous allocations, so instead of doing the expensive search
+	 * for free space, simply return ENOSPC with our max_extent_size so we
+	 * can go ahead and search for a more manageable chunk.
+	 *
+	 * If our max_extent_size is large enough for our allocation simply
+	 * disable clustering since we will likely not be able to find enough
+	 * space to create a cluster and induce latency trying.
+	 */
+	if (unlikely(space_info->max_extent_size)) {
+		spin_lock(&space_info->lock);
+		if (space_info->max_extent_size &&
+		    num_bytes > space_info->max_extent_size) {
+			ins->offset = space_info->max_extent_size;
+			spin_unlock(&space_info->lock);
+			return -ENOSPC;
+		} else if (space_info->max_extent_size) {
+			use_cluster = false;
+		}
+		spin_unlock(&space_info->lock);
+	}
+
+	/*
 	 * If the space info is for both data and metadata it means we have a
 	 * small filesystem and we can't use the clustering stuff.
 	 */
@@ -7287,8 +7312,12 @@ loop:
 		ret = 0;
 	}
 out:
-	if (ret == -ENOSPC)
+	if (ret == -ENOSPC) {
+		spin_lock(&space_info->lock);
+		space_info->max_extent_size = max_extent_size;
+		spin_unlock(&space_info->lock);
 		ins->offset = max_extent_size;
+	}
 	return ret;
 }
 
-- 
cgit v1.1


From 2968b1f48bd7366dd7310acde1ee6d1bf7791142 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Thu, 1 Oct 2015 12:55:18 -0400
Subject: Btrfs: don't continue setting up space cache when enospc

If we hit ENOSPC when setting up a space cache don't bother setting up any of
the other space cache's in this transaction, it'll just induce unnecessary
latency.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent-tree.c | 19 +++++++++++++++++++
 fs/btrfs/transaction.h |  1 +
 2 files changed, 20 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index febb5bc..3185c45 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3403,6 +3403,15 @@ again:
 	spin_unlock(&block_group->lock);
 
 	/*
+	 * We hit an ENOSPC when setting up the cache in this transaction, just
+	 * skip doing the setup, we've already cleared the cache so we're safe.
+	 */
+	if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
+		ret = -ENOSPC;
+		goto out_put;
+	}
+
+	/*
 	 * Try to preallocate enough space based on how big the block group is.
 	 * Keep in mind this has to include any pinned space which could end up
 	 * taking up quite a bit since it's not folded into the other space
@@ -3422,8 +3431,18 @@ again:
 	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
 					      num_pages, num_pages,
 					      &alloc_hint);
+	/*
+	 * Our cache requires contiguous chunks so that we don't modify a bunch
+	 * of metadata or split extents when writing the cache out, which means
+	 * we can enospc if we are heavily fragmented in addition to just normal
+	 * out of space conditions.  So if we hit this just skip setting up any
+	 * other block groups for this transaction, maybe we'll unpin enough
+	 * space the next time around.
+	 */
 	if (!ret)
 		dcs = BTRFS_DC_SETUP;
+	else if (ret == -ENOSPC)
+		set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
 	btrfs_free_reserved_data_space(inode, num_pages);
 
 out_put:
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 109b8b7..30ae750 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -34,6 +34,7 @@ enum btrfs_trans_state {
 
 #define BTRFS_TRANS_HAVE_FREE_BGS	0
 #define BTRFS_TRANS_DIRTY_BG_RUN	1
+#define BTRFS_TRANS_CACHE_ENOSPC	2
 
 struct btrfs_transaction {
 	u64 transid;
-- 
cgit v1.1


From a5e681d9bd641c4f0677e87d3a0c92a8f4f16293 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Thu, 1 Oct 2015 14:54:10 -0400
Subject: Btrfs: cut down on loops through the allocator

We try really really hard to make allocations, but sometimes it is just not
going to happen, especially when free space is extremely fragmented.  So add a
few short cuts through the looping states.  For example if we couldn't allocate
a chunk, just go straight to the NO_EMPTY_SIZE loop.  If there are no uncached
block groups and we've done a full search, go straight to the ALLOC_CHUNK stage.
And finally if we already have empty_size and empty_cluster set to 0 go ahead
and return -ENOSPC.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent-tree.c | 39 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 36 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3185c45..9f18eb0 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -6921,6 +6921,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
 	bool failed_alloc = false;
 	bool use_cluster = true;
 	bool have_caching_bg = false;
+	bool full_search = false;
 
 	WARN_ON(num_bytes < root->sectorsize);
 	ins->type = BTRFS_EXTENT_ITEM_KEY;
@@ -7023,6 +7024,8 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
 	}
 search:
 	have_caching_bg = false;
+	if (index == 0 || index == __get_raid_index(flags))
+		full_search = true;
 	down_read(&space_info->groups_sem);
 	list_for_each_entry(block_group, &space_info->block_groups[index],
 			    list) {
@@ -7056,6 +7059,7 @@ search:
 have_block_group:
 		cached = block_group_cache_done(block_group);
 		if (unlikely(!cached)) {
+			have_caching_bg = true;
 			ret = cache_block_group(block_group, 0);
 			BUG_ON(ret < 0);
 			ret = 0;
@@ -7228,8 +7232,6 @@ unclustered_alloc:
 			failed_alloc = true;
 			goto have_block_group;
 		} else if (!offset) {
-			if (!cached)
-				have_caching_bg = true;
 			goto loop;
 		}
 checks:
@@ -7286,7 +7288,20 @@ loop:
 	 */
 	if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
 		index = 0;
-		loop++;
+		if (loop == LOOP_CACHING_NOWAIT) {
+			/*
+			 * We want to skip the LOOP_CACHING_WAIT step if we
+			 * don't have any unached bgs and we've alrelady done a
+			 * full search through.
+			 */
+			if (have_caching_bg || !full_search)
+				loop = LOOP_CACHING_WAIT;
+			else
+				loop = LOOP_ALLOC_CHUNK;
+		} else {
+			loop++;
+		}
+
 		if (loop == LOOP_ALLOC_CHUNK) {
 			struct btrfs_trans_handle *trans;
 			int exist = 0;
@@ -7304,6 +7319,15 @@ loop:
 
 			ret = do_chunk_alloc(trans, root, flags,
 					     CHUNK_ALLOC_FORCE);
+
+			/*
+			 * If we can't allocate a new chunk we've already looped
+			 * through at least once, move on to the NO_EMPTY_SIZE
+			 * case.
+			 */
+			if (ret == -ENOSPC)
+				loop = LOOP_NO_EMPTY_SIZE;
+
 			/*
 			 * Do not bail out on ENOSPC since we
 			 * can do more things.
@@ -7320,6 +7344,15 @@ loop:
 		}
 
 		if (loop == LOOP_NO_EMPTY_SIZE) {
+			/*
+			 * Don't loop again if we already have no empty_size and
+			 * no empty_cluster.
+			 */
+			if (empty_size == 0 &&
+			    empty_cluster == 0) {
+				ret = -ENOSPC;
+				goto out;
+			}
 			empty_size = 0;
 			empty_cluster = 0;
 		}
-- 
cgit v1.1


From c759c4e16179e47e099f491011e6acd7858f8625 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Fri, 2 Oct 2015 15:25:10 -0400
Subject: Btrfs: don't keep trying to build clusters if we are fragmented

If we are extremely fragmented then we won't be able to create a free_cluster.
So if this happens set last_ptr->fragmented so that all future allcations will
give up trying to create a cluster.  When we unpin extents we will unset
->fragmented if we free up a sufficient amount of space in a block group.
Thanks,

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ctree.h            |   3 ++
 fs/btrfs/extent-tree.c      | 102 +++++++++++++++++++++++++++++++++-----------
 fs/btrfs/free-space-cache.c |   1 +
 3 files changed, 82 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 78a8f56..ced08c9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1232,6 +1232,9 @@ struct btrfs_free_cluster {
 	/* first extent starting offset */
 	u64 window_start;
 
+	/* We did a full search and couldn't create a cluster */
+	bool fragmented;
+
 	struct btrfs_block_group_cache *block_group;
 	/*
 	 * when a cluster is allocated from a block group, we put the
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9f18eb0..4757645 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -6142,6 +6142,34 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
 	update_global_block_rsv(fs_info);
 }
 
+/*
+ * Returns the free cluster for the given space info and sets empty_cluster to
+ * what it should be based on the mount options.
+ */
+static struct btrfs_free_cluster *
+fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info,
+		   u64 *empty_cluster)
+{
+	struct btrfs_free_cluster *ret = NULL;
+	bool ssd = btrfs_test_opt(root, SSD);
+
+	*empty_cluster = 0;
+	if (btrfs_mixed_space_info(space_info))
+		return ret;
+
+	if (ssd)
+		*empty_cluster = 2 * 1024 * 1024;
+	if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
+		ret = &root->fs_info->meta_alloc_cluster;
+		if (!ssd)
+			*empty_cluster = 64 * 1024;
+	} else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) {
+		ret = &root->fs_info->data_alloc_cluster;
+	}
+
+	return ret;
+}
+
 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
 			      const bool return_free_space)
 {
@@ -6149,7 +6177,10 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
 	struct btrfs_block_group_cache *cache = NULL;
 	struct btrfs_space_info *space_info;
 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+	struct btrfs_free_cluster *cluster = NULL;
 	u64 len;
+	u64 total_unpinned = 0;
+	u64 empty_cluster = 0;
 	bool readonly;
 
 	while (start <= end) {
@@ -6158,8 +6189,14 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
 		    start >= cache->key.objectid + cache->key.offset) {
 			if (cache)
 				btrfs_put_block_group(cache);
+			total_unpinned = 0;
 			cache = btrfs_lookup_block_group(fs_info, start);
 			BUG_ON(!cache); /* Logic error */
+
+			cluster = fetch_cluster_info(root,
+						     cache->space_info,
+						     &empty_cluster);
+			empty_cluster <<= 1;
 		}
 
 		len = cache->key.objectid + cache->key.offset - start;
@@ -6172,8 +6209,22 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
 		}
 
 		start += len;
+		total_unpinned += len;
 		space_info = cache->space_info;
 
+		/*
+		 * If this space cluster has been marked as fragmented and we've
+		 * unpinned enough in this block group to potentially allow a
+		 * cluster to be created inside of it go ahead and clear the
+		 * fragmented check.
+		 */
+		if (cluster && cluster->fragmented &&
+		    total_unpinned > empty_cluster) {
+			spin_lock(&cluster->lock);
+			cluster->fragmented = 0;
+			spin_unlock(&cluster->lock);
+		}
+
 		spin_lock(&space_info->lock);
 		spin_lock(&cache->lock);
 		cache->pinned -= len;
@@ -6911,7 +6962,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
 	struct btrfs_block_group_cache *block_group = NULL;
 	u64 search_start = 0;
 	u64 max_extent_size = 0;
-	int empty_cluster = 2 * 1024 * 1024;
+	u64 empty_cluster = 0;
 	struct btrfs_space_info *space_info;
 	int loop = 0;
 	int index = __get_raid_index(flags);
@@ -6959,37 +7010,25 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
 		spin_unlock(&space_info->lock);
 	}
 
-	/*
-	 * If the space info is for both data and metadata it means we have a
-	 * small filesystem and we can't use the clustering stuff.
-	 */
-	if (btrfs_mixed_space_info(space_info))
-		use_cluster = false;
-
-	if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
-		last_ptr = &root->fs_info->meta_alloc_cluster;
-		if (!btrfs_test_opt(root, SSD))
-			empty_cluster = 64 * 1024;
-	}
-
-	if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
-	    btrfs_test_opt(root, SSD)) {
-		last_ptr = &root->fs_info->data_alloc_cluster;
-	}
-
+	last_ptr = fetch_cluster_info(orig_root, space_info, &empty_cluster);
 	if (last_ptr) {
 		spin_lock(&last_ptr->lock);
 		if (last_ptr->block_group)
 			hint_byte = last_ptr->window_start;
+		if (last_ptr->fragmented) {
+			/*
+			 * We still set window_start so we can keep track of the
+			 * last place we found an allocation to try and save
+			 * some time.
+			 */
+			hint_byte = last_ptr->window_start;
+			use_cluster = false;
+		}
 		spin_unlock(&last_ptr->lock);
 	}
 
 	search_start = max(search_start, first_logical_byte(root, 0));
 	search_start = max(search_start, hint_byte);
-
-	if (!last_ptr)
-		empty_cluster = 0;
-
 	if (search_start == hint_byte) {
 		block_group = btrfs_lookup_block_group(root->fs_info,
 						       search_start);
@@ -7074,7 +7113,7 @@ have_block_group:
 		 * Ok we want to try and use the cluster allocator, so
 		 * lets look there
 		 */
-		if (last_ptr) {
+		if (last_ptr && use_cluster) {
 			struct btrfs_block_group_cache *used_block_group;
 			unsigned long aligned_cluster;
 			/*
@@ -7200,6 +7239,16 @@ refill_cluster:
 		}
 
 unclustered_alloc:
+		/*
+		 * We are doing an unclustered alloc, set the fragmented flag so
+		 * we don't bother trying to setup a cluster again until we get
+		 * more space.
+		 */
+		if (unlikely(last_ptr)) {
+			spin_lock(&last_ptr->lock);
+			last_ptr->fragmented = 1;
+			spin_unlock(&last_ptr->lock);
+		}
 		spin_lock(&block_group->free_space_ctl->tree_lock);
 		if (cached &&
 		    block_group->free_space_ctl->free_space <
@@ -7361,6 +7410,11 @@ loop:
 	} else if (!ins->objectid) {
 		ret = -ENOSPC;
 	} else if (ins->objectid) {
+		if (!use_cluster && last_ptr) {
+			spin_lock(&last_ptr->lock);
+			last_ptr->window_start = ins->objectid;
+			spin_unlock(&last_ptr->lock);
+		}
 		ret = 0;
 	}
 out:
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index c0eb84e..a5922e8 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -3063,6 +3063,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
 	spin_lock_init(&cluster->refill_lock);
 	cluster->root = RB_ROOT;
 	cluster->max_size = 0;
+	cluster->fragmented = false;
 	INIT_LIST_HEAD(&cluster->block_group_list);
 	cluster->block_group = NULL;
 }
-- 
cgit v1.1


From cef404837002103584c7c82f1e3fc3ec5961f47b Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Fri, 2 Oct 2015 16:09:42 -0400
Subject: Btrfs: keep track of largest extent in bitmaps

We can waste a lot of time searching through bitmaps when we are heavily
fragmented trying to find large contiguous areas that don't exist in the bitmap.
So keep track of the max extent size when we do a full search of a bitmap so
that next time around we can just skip the expensive searching if our max size
is less than what we are looking for.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/free-space-cache.c | 39 ++++++++++++++++++++++++++++++++++++++-
 fs/btrfs/free-space-cache.h |  1 +
 2 files changed, 39 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index a5922e8..e10c966 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1738,6 +1738,16 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
 	unsigned long next_zero;
 	unsigned long extent_bits;
 
+	/*
+	 * Skip searching the bitmap if we don't have a contiguous section that
+	 * is large enough for this allocation.
+	 */
+	if (bitmap_info->max_extent_size &&
+	    bitmap_info->max_extent_size < *bytes) {
+		*bytes = bitmap_info->max_extent_size;
+		return -1;
+	}
+
 	i = offset_to_bit(bitmap_info->offset, ctl->unit,
 			  max_t(u64, *offset, bitmap_info->offset));
 	bits = bytes_to_bits(*bytes, ctl->unit);
@@ -1762,6 +1772,7 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
 	}
 
 	*bytes = (u64)(max_bits) * ctl->unit;
+	bitmap_info->max_extent_size = *bytes;
 	return -1;
 }
 
@@ -1943,6 +1954,12 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
 
 	bitmap_set_bits(ctl, info, offset, bytes_to_set);
 
+	/*
+	 * We set some bytes, we have no idea what the max extent size is
+	 * anymore.
+	 */
+	info->max_extent_size = 0;
+
 	return bytes_to_set;
 
 }
@@ -2782,6 +2799,7 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
 	unsigned long want_bits;
 	unsigned long min_bits;
 	unsigned long found_bits;
+	unsigned long max_bits = 0;
 	unsigned long start = 0;
 	unsigned long total_found = 0;
 	int ret;
@@ -2791,6 +2809,13 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
 	want_bits = bytes_to_bits(bytes, ctl->unit);
 	min_bits = bytes_to_bits(min_bytes, ctl->unit);
 
+	/*
+	 * Don't bother looking for a cluster in this bitmap if it's heavily
+	 * fragmented.
+	 */
+	if (entry->max_extent_size &&
+	    entry->max_extent_size < cont1_bytes)
+		return -ENOSPC;
 again:
 	found_bits = 0;
 	for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
@@ -2798,13 +2823,19 @@ again:
 					       BITS_PER_BITMAP, i);
 		if (next_zero - i >= min_bits) {
 			found_bits = next_zero - i;
+			if (found_bits > max_bits)
+				max_bits = found_bits;
 			break;
 		}
+		if (next_zero - i > max_bits)
+			max_bits = next_zero - i;
 		i = next_zero;
 	}
 
-	if (!found_bits)
+	if (!found_bits) {
+		entry->max_extent_size = (u64)max_bits * ctl->unit;
 		return -ENOSPC;
+	}
 
 	if (!total_found) {
 		start = i;
@@ -3540,6 +3571,7 @@ again:
 		spin_lock(&ctl->tree_lock);
 		info->offset = offset;
 		info->bytes = bytes;
+		info->max_extent_size = 0;
 		ret = link_free_space(ctl, info);
 		spin_unlock(&ctl->tree_lock);
 		if (ret)
@@ -3567,6 +3599,11 @@ again:
 	}
 
 	bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
+
+	/* We used the newly allocated info, set the max_extent_size to bytes */
+	if (!info)
+		bitmap_info->max_extent_size = bytes_added;
+
 	bytes -= bytes_added;
 	offset += bytes_added;
 	spin_unlock(&ctl->tree_lock);
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index a16a029..f251865 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -23,6 +23,7 @@ struct btrfs_free_space {
 	struct rb_node offset_index;
 	u64 offset;
 	u64 bytes;
+	u64 max_extent_size;
 	unsigned long *bitmap;
 	struct list_head list;
 };
-- 
cgit v1.1


From 0584f718ed1f351fca5047a4b1ebba9b5ea41215 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Fri, 2 Oct 2015 16:12:23 -0400
Subject: Btrfs: don't do extra bitmap search in one bit case

When we make ctl->unit allocations from a bitmap there is no point in searching
for the next 0 in the bitmap.  If we've found a bit we're done and can just exit
the loop.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/free-space-cache.c | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index e10c966..0948d34 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1730,7 +1730,7 @@ static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
  */
 static int search_bitmap(struct btrfs_free_space_ctl *ctl,
 			 struct btrfs_free_space *bitmap_info, u64 *offset,
-			 u64 *bytes)
+			 u64 *bytes, bool for_alloc)
 {
 	unsigned long found_bits = 0;
 	unsigned long max_bits = 0;
@@ -1742,7 +1742,8 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
 	 * Skip searching the bitmap if we don't have a contiguous section that
 	 * is large enough for this allocation.
 	 */
-	if (bitmap_info->max_extent_size &&
+	if (for_alloc &&
+	    bitmap_info->max_extent_size &&
 	    bitmap_info->max_extent_size < *bytes) {
 		*bytes = bitmap_info->max_extent_size;
 		return -1;
@@ -1753,6 +1754,10 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
 	bits = bytes_to_bits(*bytes, ctl->unit);
 
 	for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
+		if (for_alloc && bits == 1) {
+			found_bits = 1;
+			break;
+		}
 		next_zero = find_next_zero_bit(bitmap_info->bitmap,
 					       BITS_PER_BITMAP, i);
 		extent_bits = next_zero - i;
@@ -1824,7 +1829,7 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
 		if (entry->bitmap) {
 			u64 size = *bytes;
 
-			ret = search_bitmap(ctl, entry, &tmp, &size);
+			ret = search_bitmap(ctl, entry, &tmp, &size, true);
 			if (!ret) {
 				*offset = tmp;
 				*bytes = size;
@@ -1885,7 +1890,8 @@ again:
 	search_start = *offset;
 	search_bytes = ctl->unit;
 	search_bytes = min(search_bytes, end - search_start + 1);
-	ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes);
+	ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes,
+			    false);
 	if (ret < 0 || search_start != *offset)
 		return -EINVAL;
 
@@ -1930,7 +1936,7 @@ again:
 		search_start = *offset;
 		search_bytes = ctl->unit;
 		ret = search_bitmap(ctl, bitmap_info, &search_start,
-				    &search_bytes);
+				    &search_bytes, false);
 		if (ret < 0 || search_start != *offset)
 			return -EAGAIN;
 
@@ -2685,7 +2691,7 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
 	search_start = min_start;
 	search_bytes = bytes;
 
-	err = search_bitmap(ctl, entry, &search_start, &search_bytes);
+	err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
 	if (err) {
 		if (search_bytes > *max_extent_size)
 			*max_extent_size = search_bytes;
@@ -3262,7 +3268,7 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
 		}
 
 		bytes = minlen;
-		ret2 = search_bitmap(ctl, entry, &start, &bytes);
+		ret2 = search_bitmap(ctl, entry, &start, &bytes, false);
 		if (ret2 || start >= end) {
 			spin_unlock(&ctl->tree_lock);
 			mutex_unlock(&ctl->cache_writeout_mutex);
@@ -3415,7 +3421,7 @@ u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root)
 		u64 count = 1;
 		int ret;
 
-		ret = search_bitmap(ctl, entry, &offset, &count);
+		ret = search_bitmap(ctl, entry, &offset, &count, true);
 		/* Logic error; Should be empty if it can't find anything */
 		ASSERT(!ret);
 
@@ -3600,10 +3606,6 @@ again:
 
 	bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
 
-	/* We used the newly allocated info, set the max_extent_size to bytes */
-	if (!info)
-		bitmap_info->max_extent_size = bytes_added;
-
 	bytes -= bytes_added;
 	offset += bytes_added;
 	spin_unlock(&ctl->tree_lock);
@@ -3647,7 +3649,7 @@ have_info:
 
 		bit_off = offset;
 		bit_bytes = ctl->unit;
-		ret = search_bitmap(ctl, info, &bit_off, &bit_bytes);
+		ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false);
 		if (!ret) {
 			if (bit_off == offset) {
 				ret = 1;
-- 
cgit v1.1


From d7b8b384b02f52b20a538e741c75c5d50c0e131f Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Wed, 21 Oct 2015 18:49:50 -0700
Subject: f2fs: fix leakage of inmemory atomic pages

If we got failure during commit_atomic_write, abort_volatile_write will be
called, but will not drop the inmemory pages due to no FI_ATOMIC_FILE.
Actually, there is no reason to check the flag in abort_volatile_write.

Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/file.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 91c51a6..a197215 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -1443,13 +1443,9 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
 
 	f2fs_balance_fs(F2FS_I_SB(inode));
 
-	if (f2fs_is_atomic_file(inode)) {
-		clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
-		commit_inmem_pages(inode, true);
-	}
-
-	if (f2fs_is_volatile_file(inode))
-		clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
+	clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
+	clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
+	commit_inmem_pages(inode, true);
 
 	mnt_drop_write_file(filp);
 	return ret;
-- 
cgit v1.1


From 2b246fb0f60d318cec6901a0326b94b50d5e1dcb Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Wed, 21 Oct 2015 19:00:31 -0700
Subject: f2fs: don't need to submit bio on error case

If commit_atomic_write is failed, we don't need to submit any bio.

Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/segment.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 7835e41..7efd96ad 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -249,11 +249,11 @@ int commit_inmem_pages(struct inode *inode, bool abort)
 				trace_f2fs_commit_inmem_page(cur->page, INMEM);
 				fio.page = cur->page;
 				err = do_write_data_page(&fio);
-				submit_bio = true;
 				if (err) {
 					unlock_page(cur->page);
 					break;
 				}
+				submit_bio = true;
 			}
 		} else {
 			trace_f2fs_commit_inmem_page(cur->page, INMEM_DROP);
-- 
cgit v1.1


From ee1d267423a1f8041e2b1a33fc23e4393c67677e Mon Sep 17 00:00:00 2001
From: Geliang Tang <geliangtang@163.com>
Date: Tue, 20 Oct 2015 00:39:03 -0700
Subject: pstore: add pstore unregister

pstore doesn't support unregistering yet. It was marked as TODO.
This patch adds some code to fix it:
 1) Add functions to unregister kmsg/console/ftrace/pmsg.
 2) Add a function to free compression buffer.
 3) Unmap the memory and free it.
 4) Add a function to unregister pstore filesystem.

Signed-off-by: Geliang Tang <geliangtang@163.com>
Acked-by: Kees Cook <keescook@chromium.org>
[Removed __exit annotation from ramoops_remove(). Reported by Arnd Bergmann]
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 fs/pstore/Kconfig    |  2 +-
 fs/pstore/Makefile   |  6 +++---
 fs/pstore/ftrace.c   | 25 +++++++++++++++++++------
 fs/pstore/inode.c    |  9 +++++++++
 fs/pstore/internal.h |  4 ++++
 fs/pstore/platform.c | 35 +++++++++++++++++++++++++++++++++++
 fs/pstore/pmsg.c     |  7 +++++++
 fs/pstore/ram.c      | 19 ++++++++-----------
 8 files changed, 86 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig
index 916b8e2..360ae43 100644
--- a/fs/pstore/Kconfig
+++ b/fs/pstore/Kconfig
@@ -1,5 +1,5 @@
 config PSTORE
-	bool "Persistent store support"
+	tristate "Persistent store support"
 	default n
 	select ZLIB_DEFLATE
 	select ZLIB_INFLATE
diff --git a/fs/pstore/Makefile b/fs/pstore/Makefile
index e647d8e..b8803cc 100644
--- a/fs/pstore/Makefile
+++ b/fs/pstore/Makefile
@@ -2,12 +2,12 @@
 # Makefile for the linux pstorefs routines.
 #
 
-obj-y += pstore.o
+obj-$(CONFIG_PSTORE) += pstore.o
 
 pstore-objs += inode.o platform.o
-obj-$(CONFIG_PSTORE_FTRACE)	+= ftrace.o
+pstore-$(CONFIG_PSTORE_FTRACE)	+= ftrace.o
 
-obj-$(CONFIG_PSTORE_PMSG)	+= pmsg.o
+pstore-$(CONFIG_PSTORE_PMSG)	+= pmsg.o
 
 ramoops-objs += ram.o ram_core.o
 obj-$(CONFIG_PSTORE_RAM)	+= ramoops.o
diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c
index 76a4eeb..d488770 100644
--- a/fs/pstore/ftrace.c
+++ b/fs/pstore/ftrace.c
@@ -104,22 +104,23 @@ static const struct file_operations pstore_knob_fops = {
 	.write	= pstore_ftrace_knob_write,
 };
 
+static struct dentry *pstore_ftrace_dir;
+
 void pstore_register_ftrace(void)
 {
-	struct dentry *dir;
 	struct dentry *file;
 
 	if (!psinfo->write_buf)
 		return;
 
-	dir = debugfs_create_dir("pstore", NULL);
-	if (!dir) {
+	pstore_ftrace_dir = debugfs_create_dir("pstore", NULL);
+	if (!pstore_ftrace_dir) {
 		pr_err("%s: unable to create pstore directory\n", __func__);
 		return;
 	}
 
-	file = debugfs_create_file("record_ftrace", 0600, dir, NULL,
-				   &pstore_knob_fops);
+	file = debugfs_create_file("record_ftrace", 0600, pstore_ftrace_dir,
+				   NULL, &pstore_knob_fops);
 	if (!file) {
 		pr_err("%s: unable to create record_ftrace file\n", __func__);
 		goto err_file;
@@ -127,5 +128,17 @@ void pstore_register_ftrace(void)
 
 	return;
 err_file:
-	debugfs_remove(dir);
+	debugfs_remove(pstore_ftrace_dir);
+}
+
+void pstore_unregister_ftrace(void)
+{
+	mutex_lock(&pstore_ftrace_lock);
+	if (pstore_ftrace_enabled) {
+		unregister_ftrace_function(&pstore_ftrace_ops);
+		pstore_ftrace_enabled = 0;
+	}
+	mutex_unlock(&pstore_ftrace_lock);
+
+	debugfs_remove_recursive(pstore_ftrace_dir);
 }
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 3adcc46..3586491 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -178,6 +178,7 @@ static loff_t pstore_file_llseek(struct file *file, loff_t off, int whence)
 }
 
 static const struct file_operations pstore_file_operations = {
+	.owner		= THIS_MODULE,
 	.open		= pstore_file_open,
 	.read		= pstore_file_read,
 	.llseek		= pstore_file_llseek,
@@ -456,6 +457,7 @@ static void pstore_kill_sb(struct super_block *sb)
 }
 
 static struct file_system_type pstore_fs_type = {
+	.owner          = THIS_MODULE,
 	.name		= "pstore",
 	.mount		= pstore_mount,
 	.kill_sb	= pstore_kill_sb,
@@ -479,5 +481,12 @@ out:
 }
 module_init(init_pstore_fs)
 
+static void __exit exit_pstore_fs(void)
+{
+	unregister_filesystem(&pstore_fs_type);
+	sysfs_remove_mount_point(fs_kobj, "pstore");
+}
+module_exit(exit_pstore_fs)
+
 MODULE_AUTHOR("Tony Luck <tony.luck@intel.com>");
 MODULE_LICENSE("GPL");
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
index c36ba2c..96253c4 100644
--- a/fs/pstore/internal.h
+++ b/fs/pstore/internal.h
@@ -41,14 +41,18 @@ pstore_ftrace_decode_cpu(struct pstore_ftrace_record *rec)
 
 #ifdef CONFIG_PSTORE_FTRACE
 extern void pstore_register_ftrace(void);
+extern void pstore_unregister_ftrace(void);
 #else
 static inline void pstore_register_ftrace(void) {}
+static inline void pstore_unregister_ftrace(void) {}
 #endif
 
 #ifdef CONFIG_PSTORE_PMSG
 extern void pstore_register_pmsg(void);
+extern void pstore_unregister_pmsg(void);
 #else
 static inline void pstore_register_pmsg(void) {}
+static inline void pstore_unregister_pmsg(void) {}
 #endif
 
 extern struct pstore_info *psinfo;
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 1b11249..0aab920 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -237,6 +237,14 @@ static void allocate_buf_for_compression(void)
 
 }
 
+static void free_buf_for_compression(void)
+{
+	kfree(stream.workspace);
+	stream.workspace = NULL;
+	kfree(big_oops_buf);
+	big_oops_buf = NULL;
+}
+
 /*
  * Called when compression fails, since the printk buffer
  * would be fetched for compression calling it again when
@@ -358,6 +366,11 @@ static void pstore_register_kmsg(void)
 	kmsg_dump_register(&pstore_dumper);
 }
 
+static void pstore_unregister_kmsg(void)
+{
+	kmsg_dump_unregister(&pstore_dumper);
+}
+
 #ifdef CONFIG_PSTORE_CONSOLE
 static void pstore_console_write(struct console *con, const char *s, unsigned c)
 {
@@ -395,8 +408,14 @@ static void pstore_register_console(void)
 {
 	register_console(&pstore_console);
 }
+
+static void pstore_unregister_console(void)
+{
+	unregister_console(&pstore_console);
+}
 #else
 static void pstore_register_console(void) {}
+static void pstore_unregister_console(void) {}
 #endif
 
 static int pstore_write_compat(enum pstore_type_id type,
@@ -467,12 +486,28 @@ int pstore_register(struct pstore_info *psi)
 	 */
 	backend = psi->name;
 
+	module_put(owner);
+
 	pr_info("Registered %s as persistent store backend\n", psi->name);
 
 	return 0;
 }
 EXPORT_SYMBOL_GPL(pstore_register);
 
+void pstore_unregister(struct pstore_info *psi)
+{
+	pstore_unregister_pmsg();
+	pstore_unregister_ftrace();
+	pstore_unregister_console();
+	pstore_unregister_kmsg();
+
+	free_buf_for_compression();
+
+	psinfo = NULL;
+	backend = NULL;
+}
+EXPORT_SYMBOL_GPL(pstore_unregister);
+
 /*
  * Read all the records from the persistent store. Create
  * files in our filesystem.  Don't warn about -EEXIST errors
diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c
index 5a2f05a..7de20cd 100644
--- a/fs/pstore/pmsg.c
+++ b/fs/pstore/pmsg.c
@@ -114,3 +114,10 @@ err_class:
 err:
 	return;
 }
+
+void pstore_unregister_pmsg(void)
+{
+	device_destroy(pmsg_class, MKDEV(pmsg_major, 0));
+	class_destroy(pmsg_class);
+	unregister_chrdev(pmsg_major, PMSG_NAME);
+}
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 6c26c4d..319c3a6 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -578,30 +578,27 @@ fail_out:
 	return err;
 }
 
-static int __exit ramoops_remove(struct platform_device *pdev)
+static int ramoops_remove(struct platform_device *pdev)
 {
-#if 0
-	/* TODO(kees): We cannot unload ramoops since pstore doesn't support
-	 * unregistering yet.
-	 */
 	struct ramoops_context *cxt = &oops_cxt;
 
-	iounmap(cxt->virt_addr);
-	release_mem_region(cxt->phys_addr, cxt->size);
+	pstore_unregister(&cxt->pstore);
 	cxt->max_dump_cnt = 0;
 
-	/* TODO(kees): When pstore supports unregistering, call it here. */
 	kfree(cxt->pstore.buf);
 	cxt->pstore.bufsize = 0;
 
+	persistent_ram_free(cxt->mprz);
+	persistent_ram_free(cxt->fprz);
+	persistent_ram_free(cxt->cprz);
+	ramoops_free_przs(cxt);
+
 	return 0;
-#endif
-	return -EBUSY;
 }
 
 static struct platform_driver ramoops_driver = {
 	.probe		= ramoops_probe,
-	.remove		= __exit_p(ramoops_remove),
+	.remove		= ramoops_remove,
 	.driver		= {
 		.name	= "ramoops",
 	},
-- 
cgit v1.1


From 7fee740697e0d9a57d618b6fec79e4c4e09fd606 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Thu, 22 Oct 2015 18:18:11 +0800
Subject: f2fs: fix to clear GCed flag for atomic written page

Atomic write page can be GCed, after committing this kind of page, we should
clear the GCed flag for it.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/segment.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 7efd96ad..f77b325 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -253,6 +253,7 @@ int commit_inmem_pages(struct inode *inode, bool abort)
 					unlock_page(cur->page);
 					break;
 				}
+				clear_cold_data(cur->page);
 				submit_bio = true;
 			}
 		} else {
-- 
cgit v1.1


From a6be014e1d28339ba7c745fc4ac1efdbf6e2c1a2 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Thu, 22 Oct 2015 18:23:08 +0800
Subject: f2fs: fix error path of ->symlink

Now, in ->symlink of f2fs, we kept the fixed invoking order between
f2fs_add_link and page_symlink since we should init node info firstly
in f2fs_add_link, then such node info can be used in page_symlink.

But we didn't fix to release meta info which was done before page_symlink
in our error path, so this will leave us corrupt symlink entry in its
parent's dentry page. Fix this issue by adding f2fs_unlink in the error
path for removing such linking.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/namei.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index dfa01c8..e48b80c 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -410,11 +410,14 @@ err_out:
 	 * If the symlink path is stored into inline_data, there is no
 	 * performance regression.
 	 */
-	if (!err)
+	if (!err) {
 		filemap_write_and_wait_range(inode->i_mapping, 0, p_len - 1);
 
-	if (IS_DIRSYNC(dir))
-		f2fs_sync_fs(sbi->sb, 1);
+		if (IS_DIRSYNC(dir))
+			f2fs_sync_fs(sbi->sb, 1);
+	} else {
+		f2fs_unlink(dir, dentry);
+	}
 
 	kfree(sd);
 	f2fs_fname_crypto_free_buffer(&disk_link);
-- 
cgit v1.1


From beaa57dd986d4f398728c060692fc2452895cfd8 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Thu, 22 Oct 2015 18:24:12 +0800
Subject: f2fs: fix to skip shrinking extent nodes

In f2fs_shrink_extent_tree we should stop shrink flow if we have already
shrunk enough nodes in extent cache.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/extent_cache.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index a38ee9b..7ddba81 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -620,7 +620,7 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
 			write_unlock(&et->lock);
 
 			if (node_cnt + tree_cnt >= nr_shrink)
-				break;
+				goto unlock_out;
 		}
 	}
 unlock_out:
-- 
cgit v1.1


From 7e26e9ff0a9301398bd7cf8d896536da3a54aa9a Mon Sep 17 00:00:00 2001
From: Geliang Tang <geliangtang@163.com>
Date: Thu, 22 Oct 2015 01:02:33 -0700
Subject: pstore: Fix return type of pstore_is_mounted()

This patch changes return type of pstore_is_mounted from int to bool.

Signed-off-by: Geliang Tang <geliangtang@163.com>
Acked-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 fs/pstore/inode.c    | 2 +-
 fs/pstore/internal.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 3586491..d8c439d 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -288,7 +288,7 @@ static const struct super_operations pstore_ops = {
 
 static struct super_block *pstore_sb;
 
-int pstore_is_mounted(void)
+bool pstore_is_mounted(void)
 {
 	return pstore_sb != NULL;
 }
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
index 96253c4..e38a22b 100644
--- a/fs/pstore/internal.h
+++ b/fs/pstore/internal.h
@@ -63,6 +63,6 @@ extern int	pstore_mkfile(enum pstore_type_id, char *psname, u64 id,
 			      int count, char *data, bool compressed,
 			      size_t size, struct timespec time,
 			      struct pstore_info *psi);
-extern int	pstore_is_mounted(void);
+extern bool	pstore_is_mounted(void);
 
 #endif
-- 
cgit v1.1


From e55c34a66f87e78fb1fc6b623b78c5ad74b475af Mon Sep 17 00:00:00 2001
From: Benjamin Coddington <bcodding@redhat.com>
Date: Thu, 22 Oct 2015 13:38:13 -0400
Subject: locks: introduce locks_lock_inode_wait()

Users of the locks API commonly call either posix_lock_file_wait() or
flock_lock_file_wait() depending upon the lock type.  Add a new function
locks_lock_inode_wait() which will check and call the correct function for
the type of lock passed in.

Signed-off-by: Benjamin Coddington <bcodding@redhat.com>
Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
---
 fs/locks.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'fs')

diff --git a/fs/locks.c b/fs/locks.c
index c74c9df..c174511 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1882,6 +1882,30 @@ int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl)
 EXPORT_SYMBOL(flock_lock_inode_wait);
 
 /**
+ * locks_lock_inode_wait - Apply a lock to an inode
+ * @inode: inode of the file to apply to
+ * @fl: The lock to be applied
+ *
+ * Apply a POSIX or FLOCK style lock request to an inode.
+ */
+int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl)
+{
+	int res = 0;
+	switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) {
+		case FL_POSIX:
+			res = posix_lock_inode_wait(inode, fl);
+			break;
+		case FL_FLOCK:
+			res = flock_lock_inode_wait(inode, fl);
+			break;
+		default:
+			BUG();
+	}
+	return res;
+}
+EXPORT_SYMBOL(locks_lock_inode_wait);
+
+/**
  *	sys_flock: - flock() system call.
  *	@fd: the file descriptor to lock.
  *	@cmd: the type of lock to apply.
-- 
cgit v1.1


From 4f6563677ae833baad8003e14353241bc25da4fc Mon Sep 17 00:00:00 2001
From: Benjamin Coddington <bcodding@redhat.com>
Date: Thu, 22 Oct 2015 13:38:14 -0400
Subject: Move locks API users to locks_lock_inode_wait()

Instead of having users check for FL_POSIX or FL_FLOCK to call the correct
locks API function, use the check within locks_lock_inode_wait().  This
allows for some later cleanup.

Signed-off-by: Benjamin Coddington <bcodding@redhat.com>
Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
---
 fs/9p/vfs_file.c    |  4 ++--
 fs/ceph/locks.c     |  4 ++--
 fs/cifs/file.c      |  2 +-
 fs/dlm/plock.c      |  4 ++--
 fs/fuse/file.c      |  2 +-
 fs/gfs2/file.c      |  8 ++++----
 fs/lockd/clntproc.c | 13 +------------
 fs/locks.c          |  2 +-
 fs/nfs/file.c       | 13 +------------
 fs/nfs/nfs4proc.c   | 13 +------------
 fs/ocfs2/locks.c    |  8 ++++----
 11 files changed, 20 insertions(+), 53 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 3abc447..f23fd86 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -161,7 +161,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
 	if ((fl->fl_flags & FL_POSIX) != FL_POSIX)
 		BUG();
 
-	res = posix_lock_file_wait(filp, fl);
+	res = locks_lock_file_wait(filp, fl);
 	if (res < 0)
 		goto out;
 
@@ -231,7 +231,7 @@ out_unlock:
 	if (res < 0 && fl->fl_type != F_UNLCK) {
 		fl_type = fl->fl_type;
 		fl->fl_type = F_UNLCK;
-		res = posix_lock_file_wait(filp, fl);
+		res = locks_lock_file_wait(filp, fl);
 		fl->fl_type = fl_type;
 	}
 out:
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 6706bde..a2cb0c2 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -228,12 +228,12 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
 	err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
 				file, lock_cmd, wait, fl);
 	if (!err) {
-		err = flock_lock_file_wait(file, fl);
+		err = locks_lock_file_wait(file, fl);
 		if (err) {
 			ceph_lock_message(CEPH_LOCK_FLOCK,
 					  CEPH_MDS_OP_SETFILELOCK,
 					  file, CEPH_LOCK_UNLOCK, 0, fl);
-			dout("got %d on flock_lock_file_wait, undid lock", err);
+			dout("got %d on locks_lock_file_wait, undid lock", err);
 		}
 	}
 	return err;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index e2a6af1..6afdad7 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1553,7 +1553,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
 
 out:
 	if (flock->fl_flags & FL_POSIX && !rc)
-		rc = posix_lock_file_wait(file, flock);
+		rc = locks_lock_file_wait(file, flock);
 	return rc;
 }
 
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index 5532f09..3585cc0 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -172,7 +172,7 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
 	rv = op->info.rv;
 
 	if (!rv) {
-		if (posix_lock_file_wait(file, fl) < 0)
+		if (locks_lock_file_wait(file, fl) < 0)
 			log_error(ls, "dlm_posix_lock: vfs lock error %llx",
 				  (unsigned long long)number);
 	}
@@ -262,7 +262,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
 	/* cause the vfs unlock to return ENOENT if lock is not found */
 	fl->fl_flags |= FL_EXISTS;
 
-	rv = posix_lock_file_wait(file, fl);
+	rv = locks_lock_file_wait(file, fl);
 	if (rv == -ENOENT) {
 		rv = 0;
 		goto out_free;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index f523f2f..e0faf8f 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -2189,7 +2189,7 @@ static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl)
 	int err;
 
 	if (fc->no_flock) {
-		err = flock_lock_file_wait(file, fl);
+		err = locks_lock_file_wait(file, fl);
 	} else {
 		struct fuse_file *ff = file->private_data;
 
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index cf4ab89..9287a2d 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -1000,7 +1000,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
 	}
 	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
 		if (fl->fl_type == F_UNLCK)
-			posix_lock_file_wait(file, fl);
+			locks_lock_file_wait(file, fl);
 		return -EIO;
 	}
 	if (IS_GETLK(cmd))
@@ -1031,7 +1031,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
 	if (gl) {
 		if (fl_gh->gh_state == state)
 			goto out;
-		flock_lock_file_wait(file,
+		locks_lock_file_wait(file,
 				     &(struct file_lock){.fl_type = F_UNLCK});
 		gfs2_glock_dq(fl_gh);
 		gfs2_holder_reinit(state, flags, fl_gh);
@@ -1056,7 +1056,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
 		if (error == GLR_TRYFAILED)
 			error = -EAGAIN;
 	} else {
-		error = flock_lock_file_wait(file, fl);
+		error = locks_lock_file_wait(file, fl);
 		gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
 	}
 
@@ -1071,7 +1071,7 @@ static void do_unflock(struct file *file, struct file_lock *fl)
 	struct gfs2_holder *fl_gh = &fp->f_fl_gh;
 
 	mutex_lock(&fp->f_fl_mutex);
-	flock_lock_file_wait(file, fl);
+	locks_lock_file_wait(file, fl);
 	if (fl_gh->gh_gl) {
 		gfs2_glock_dq(fl_gh);
 		gfs2_holder_uninit(fl_gh);
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index acd3947..1129520 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -474,18 +474,7 @@ static void nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *ho
 
 static int do_vfs_lock(struct file_lock *fl)
 {
-	int res = 0;
-	switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) {
-		case FL_POSIX:
-			res = posix_lock_file_wait(fl->fl_file, fl);
-			break;
-		case FL_FLOCK:
-			res = flock_lock_file_wait(fl->fl_file, fl);
-			break;
-		default:
-			BUG();
-	}
-	return res;
+	return locks_lock_file_wait(fl->fl_file, fl);
 }
 
 /*
diff --git a/fs/locks.c b/fs/locks.c
index c174511..4667f82 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1961,7 +1961,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
 					  (can_sleep) ? F_SETLKW : F_SETLK,
 					  lock);
 	else
-		error = flock_lock_file_wait(f.file, lock);
+		error = locks_lock_file_wait(f.file, lock);
 
  out_free:
 	locks_free_lock(lock);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index c0f9b1e..37f639d 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -738,18 +738,7 @@ out_noconflict:
 
 static int do_vfs_lock(struct file *file, struct file_lock *fl)
 {
-	int res = 0;
-	switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) {
-		case FL_POSIX:
-			res = posix_lock_file_wait(file, fl);
-			break;
-		case FL_FLOCK:
-			res = flock_lock_file_wait(file, fl);
-			break;
-		default:
-			BUG();
-	}
-	return res;
+	return locks_lock_file_wait(file, fl);
 }
 
 static int
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 693b903..ce64a45 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5472,18 +5472,7 @@ static int nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *
 
 static int do_vfs_lock(struct inode *inode, struct file_lock *fl)
 {
-	int res = 0;
-	switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) {
-		case FL_POSIX:
-			res = posix_lock_inode_wait(inode, fl);
-			break;
-		case FL_FLOCK:
-			res = flock_lock_inode_wait(inode, fl);
-			break;
-		default:
-			BUG();
-	}
-	return res;
+	return locks_lock_inode_wait(inode, fl);
 }
 
 struct nfs4_unlockdata {
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index 6b6d092..652ece4 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -66,7 +66,7 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode,
 		 * level.
 		 */
 
-		flock_lock_file_wait(file,
+		locks_lock_file_wait(file,
 				     &(struct file_lock){.fl_type = F_UNLCK});
 
 		ocfs2_file_unlock(file);
@@ -81,7 +81,7 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode,
 		goto out;
 	}
 
-	ret = flock_lock_file_wait(file, fl);
+	ret = locks_lock_file_wait(file, fl);
 	if (ret)
 		ocfs2_file_unlock(file);
 
@@ -98,7 +98,7 @@ static int ocfs2_do_funlock(struct file *file, int cmd, struct file_lock *fl)
 
 	mutex_lock(&fp->fp_mutex);
 	ocfs2_file_unlock(file);
-	ret = flock_lock_file_wait(file, fl);
+	ret = locks_lock_file_wait(file, fl);
 	mutex_unlock(&fp->fp_mutex);
 
 	return ret;
@@ -119,7 +119,7 @@ int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl)
 
 	if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) ||
 	    ocfs2_mount_local(osb))
-		return flock_lock_file_wait(file, fl);
+		return locks_lock_file_wait(file, fl);
 
 	if (fl->fl_type == F_UNLCK)
 		return ocfs2_do_funlock(file, cmd, fl);
-- 
cgit v1.1


From 616fb38fa7a9599293e05ae1fa9acfaf73922434 Mon Sep 17 00:00:00 2001
From: Benjamin Coddington <bcodding@redhat.com>
Date: Thu, 22 Oct 2015 13:38:15 -0400
Subject: locks: cleanup posix_lock_inode_wait and flock_lock_inode_wait

All callers use locks_lock_inode_wait() instead.

Signed-off-by: Benjamin Coddington <bcodding@redhat.com>
Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
---
 fs/locks.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/locks.c b/fs/locks.c
index 4667f82..0d2b326 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1171,10 +1171,9 @@ EXPORT_SYMBOL(posix_lock_file);
  * @inode: inode of file to which lock request should be applied
  * @fl: The lock to be applied
  *
- * Variant of posix_lock_file_wait that does not take a filp, and so can be
- * used after the filp has already been torn down.
+ * Apply a POSIX style lock request to an inode.
  */
-int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl)
+static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl)
 {
 	int error;
 	might_sleep ();
@@ -1191,7 +1190,6 @@ int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl)
 	}
 	return error;
 }
-EXPORT_SYMBOL(posix_lock_inode_wait);
 
 /**
  * locks_mandatory_locked - Check for an active lock
@@ -1862,7 +1860,7 @@ int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
  *
  * Apply a FLOCK style lock request to an inode.
  */
-int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl)
+static int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl)
 {
 	int error;
 	might_sleep();
@@ -1879,7 +1877,6 @@ int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl)
 	}
 	return error;
 }
-EXPORT_SYMBOL(flock_lock_inode_wait);
 
 /**
  * locks_lock_inode_wait - Apply a lock to an inode
-- 
cgit v1.1


From b67de018b37a97548645a879c627d4188518e907 Mon Sep 17 00:00:00 2001
From: Joseph Qi <joseph.qi@huawei.com>
Date: Thu, 22 Oct 2015 13:32:29 -0700
Subject: ocfs2/dlm: unlock lockres spinlock before dlm_lockres_put

dlm_lockres_put will call dlm_lockres_release if it is the last
reference, and then it may call dlm_print_one_lock_resource and
take lockres spinlock.

So unlock lockres spinlock before dlm_lockres_put to avoid deadlock.

Signed-off-by: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/dlm/dlmmaster.c   | 3 ++-
 fs/ocfs2/dlm/dlmrecovery.c | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index ee5aa4d..ce38b4c 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -1658,12 +1658,13 @@ send_response:
 		if (ret < 0) {
 			mlog(ML_ERROR, "failed to dispatch assert master work\n");
 			response = DLM_MASTER_RESP_ERROR;
+			spin_unlock(&res->spinlock);
 			dlm_lockres_put(res);
 		} else {
 			dispatched = 1;
 			__dlm_lockres_grab_inflight_worker(dlm, res);
+			spin_unlock(&res->spinlock);
 		}
-		spin_unlock(&res->spinlock);
 	} else {
 		if (res)
 			dlm_lockres_put(res);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 3d90ad7..58eaa5c 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1723,8 +1723,8 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
 			} else {
 				dispatched = 1;
 				__dlm_lockres_grab_inflight_worker(dlm, res);
+				spin_unlock(&res->spinlock);
 			}
-			spin_unlock(&res->spinlock);
 		} else {
 			/* put.. incase we are not the master */
 			spin_unlock(&res->spinlock);
-- 
cgit v1.1


From 0d0f4aab4e4d290138a4ae7f2ef8469e48c9a669 Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <aryabinin@virtuozzo.com>
Date: Wed, 7 Oct 2015 14:39:55 +0300
Subject: lockd: get rid of reference-counted NSM RPC clients

Currently we have reference-counted per-net NSM RPC client
which created on the first monitor request and destroyed
after the last unmonitor request. It's needed because
RPC client need to know 'utsname()->nodename', but utsname()
might be NULL when nsm_unmonitor() called.

So instead of holding the rpc client we could just save nodename
in struct nlm_host and pass it to the rpc_create().
Thus ther is no need in keeping rpc client until last
unmonitor request. We could create separate RPC clients
for each monitor/unmonitor requests.

Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/lockd/host.c  |  1 +
 fs/lockd/mon.c   | 89 ++++++++++----------------------------------------------
 fs/lockd/netns.h |  3 --
 fs/lockd/svc.c   |  1 -
 4 files changed, 16 insertions(+), 78 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index b5f3c3ab..d716c99 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -161,6 +161,7 @@ static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni,
 	host->h_nsmhandle  = nsm;
 	host->h_addrbuf    = nsm->sm_addrbuf;
 	host->net	   = ni->net;
+	strlcpy(host->nodename, utsname()->nodename, sizeof(host->nodename));
 
 out:
 	return host;
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 6c05cd1..19166d4 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -42,7 +42,7 @@ struct nsm_args {
 	u32			proc;
 
 	char			*mon_name;
-	char			*nodename;
+	const char		*nodename;
 };
 
 struct nsm_res {
@@ -86,69 +86,18 @@ static struct rpc_clnt *nsm_create(struct net *net, const char *nodename)
 	return rpc_create(&args);
 }
 
-static struct rpc_clnt *nsm_client_set(struct lockd_net *ln,
-		struct rpc_clnt *clnt)
-{
-	spin_lock(&ln->nsm_clnt_lock);
-	if (ln->nsm_users == 0) {
-		if (clnt == NULL)
-			goto out;
-		ln->nsm_clnt = clnt;
-	}
-	clnt = ln->nsm_clnt;
-	ln->nsm_users++;
-out:
-	spin_unlock(&ln->nsm_clnt_lock);
-	return clnt;
-}
-
-static struct rpc_clnt *nsm_client_get(struct net *net, const char *nodename)
-{
-	struct rpc_clnt	*clnt, *new;
-	struct lockd_net *ln = net_generic(net, lockd_net_id);
-
-	clnt = nsm_client_set(ln, NULL);
-	if (clnt != NULL)
-		goto out;
-
-	clnt = new = nsm_create(net, nodename);
-	if (IS_ERR(clnt))
-		goto out;
-
-	clnt = nsm_client_set(ln, new);
-	if (clnt != new)
-		rpc_shutdown_client(new);
-out:
-	return clnt;
-}
-
-static void nsm_client_put(struct net *net)
-{
-	struct lockd_net *ln = net_generic(net, lockd_net_id);
-	struct rpc_clnt	*clnt = NULL;
-
-	spin_lock(&ln->nsm_clnt_lock);
-	ln->nsm_users--;
-	if (ln->nsm_users == 0) {
-		clnt = ln->nsm_clnt;
-		ln->nsm_clnt = NULL;
-	}
-	spin_unlock(&ln->nsm_clnt_lock);
-	if (clnt != NULL)
-		rpc_shutdown_client(clnt);
-}
-
 static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res,
-			 struct rpc_clnt *clnt)
+			 const struct nlm_host *host)
 {
 	int		status;
+	struct rpc_clnt *clnt;
 	struct nsm_args args = {
 		.priv		= &nsm->sm_priv,
 		.prog		= NLM_PROGRAM,
 		.vers		= 3,
 		.proc		= NLMPROC_NSM_NOTIFY,
 		.mon_name	= nsm->sm_mon_name,
-		.nodename	= clnt->cl_nodename,
+		.nodename	= host->nodename,
 	};
 	struct rpc_message msg = {
 		.rpc_argp	= &args,
@@ -157,6 +106,13 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res,
 
 	memset(res, 0, sizeof(*res));
 
+	clnt = nsm_create(host->net, host->nodename);
+	if (IS_ERR(clnt)) {
+		dprintk("lockd: failed to create NSM upcall transport, "
+			"status=%ld, net=%p\n", PTR_ERR(clnt), host->net);
+		return PTR_ERR(clnt);
+	}
+
 	msg.rpc_proc = &clnt->cl_procinfo[proc];
 	status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN);
 	if (status == -ECONNREFUSED) {
@@ -170,6 +126,8 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res,
 				status);
 	else
 		status = 0;
+
+	rpc_shutdown_client(clnt);
 	return status;
 }
 
@@ -189,32 +147,19 @@ int nsm_monitor(const struct nlm_host *host)
 	struct nsm_handle *nsm = host->h_nsmhandle;
 	struct nsm_res	res;
 	int		status;
-	struct rpc_clnt *clnt;
-	const char *nodename = NULL;
 
 	dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name);
 
 	if (nsm->sm_monitored)
 		return 0;
 
-	if (host->h_rpcclnt)
-		nodename = host->h_rpcclnt->cl_nodename;
-
 	/*
 	 * Choose whether to record the caller_name or IP address of
 	 * this peer in the local rpc.statd's database.
 	 */
 	nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf;
 
-	clnt = nsm_client_get(host->net, nodename);
-	if (IS_ERR(clnt)) {
-		status = PTR_ERR(clnt);
-		dprintk("lockd: failed to create NSM upcall transport, "
-				"status=%d, net=%p\n", status, host->net);
-		return status;
-	}
-
-	status = nsm_mon_unmon(nsm, NSMPROC_MON, &res, clnt);
+	status = nsm_mon_unmon(nsm, NSMPROC_MON, &res, host);
 	if (unlikely(res.status != 0))
 		status = -EIO;
 	if (unlikely(status < 0)) {
@@ -246,11 +191,9 @@ void nsm_unmonitor(const struct nlm_host *host)
 
 	if (atomic_read(&nsm->sm_count) == 1
 	 && nsm->sm_monitored && !nsm->sm_sticky) {
-		struct lockd_net *ln = net_generic(host->net, lockd_net_id);
-
 		dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name);
 
-		status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res, ln->nsm_clnt);
+		status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res, host);
 		if (res.status != 0)
 			status = -EIO;
 		if (status < 0)
@@ -258,8 +201,6 @@ void nsm_unmonitor(const struct nlm_host *host)
 					nsm->sm_name);
 		else
 			nsm->sm_monitored = 0;
-
-		nsm_client_put(host->net);
 	}
 }
 
diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h
index 89fe011b..5426189 100644
--- a/fs/lockd/netns.h
+++ b/fs/lockd/netns.h
@@ -12,9 +12,6 @@ struct lockd_net {
 	struct delayed_work grace_period_end;
 	struct lock_manager lockd_manager;
 
-	spinlock_t nsm_clnt_lock;
-	unsigned int nsm_users;
-	struct rpc_clnt *nsm_clnt;
 	struct list_head nsm_handles;
 };
 
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 0dff13f..5f31ebd 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -592,7 +592,6 @@ static int lockd_init_net(struct net *net)
 	INIT_DELAYED_WORK(&ln->grace_period_end, grace_ender);
 	INIT_LIST_HEAD(&ln->lockd_manager.list);
 	ln->lockd_manager.block_opens = false;
-	spin_lock_init(&ln->nsm_clnt_lock);
 	INIT_LIST_HEAD(&ln->nsm_handles);
 	return 0;
 }
-- 
cgit v1.1


From 825213e59ec24110b0a0f94456db42621f928421 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jeff.layton@primarydata.com>
Date: Sat, 3 Oct 2015 08:19:57 -0400
Subject: nfsd: move include of state.h from trace.c to trace.h

Any file which includes trace.h will need to include state.h, even if
they aren't using any state tracepoints. Ensure that we include any
headers that might be needed in trace.h instead of relying on the
*.c files to have the right ones.

Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/trace.c | 2 --
 fs/nfsd/trace.h | 2 ++
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/trace.c b/fs/nfsd/trace.c
index 82f8907..9096746 100644
--- a/fs/nfsd/trace.c
+++ b/fs/nfsd/trace.c
@@ -1,5 +1,3 @@
 
-#include "state.h"
-
 #define CREATE_TRACE_POINTS
 #include "trace.h"
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index c668520..0befe76 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -9,6 +9,8 @@
 
 #include <linux/tracepoint.h>
 
+#include "state.h"
+
 DECLARE_EVENT_CLASS(nfsd_stateid_class,
 	TP_PROTO(stateid_t *stp),
 	TP_ARGS(stp),
-- 
cgit v1.1


From 2b63482185e6054cc11ca6d6c073f90160c161fd Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 15 Oct 2015 15:33:23 -0400
Subject: nfsd: fix clid_inuse on mount with security change

In bakeathon testing Solaris client was getting CLID_INUSE error when
doing a krb5 mount soon after an auth_sys mount, or vice versa.

That's not really necessary since in this case the old client doesn't
have any state any more:

	http://tools.ietf.org/html/rfc7530#page-103

	"when the server gets a SETCLIENTID for a client ID that
	currently has no state, or it has state but the lease has
	expired, rather than returning NFS4ERR_CLID_INUSE, the server
	MUST allow the SETCLIENTID and confirm the new client ID if
	followed by the appropriate SETCLIENTID_CONFIRM."

This doesn't fix the problem completely since our client_has_state()
check counts openowners left around to handle close replays, which we
should probably just remove in this case.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 1b39edf..bac6207 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2262,7 +2262,8 @@ static bool client_has_state(struct nfs4_client *clp)
 	 * Note clp->cl_openowners check isn't quite right: there's no
 	 * need to count owners without stateid's.
 	 *
-	 * Also note we should probably be using this in 4.0 case too.
+	 * Also note in 4.0 case should also be checking for openowners
+	 * kept around just for close handling.
 	 */
 	return !list_empty(&clp->cl_openowners)
 #ifdef CONFIG_NFSD_PNFS
@@ -3049,7 +3050,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	/* Cases below refer to rfc 3530 section 14.2.33: */
 	spin_lock(&nn->client_lock);
 	conf = find_confirmed_client_by_name(&clname, nn);
-	if (conf) {
+	if (conf && client_has_state(conf)) {
 		/* case 0: */
 		status = nfserr_clid_inuse;
 		if (clp_used_exchangeid(conf))
@@ -3136,6 +3137,11 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 	} else { /* case 3: normal case; new or rebooted client */
 		old = find_confirmed_client_by_name(&unconf->cl_name, nn);
 		if (old) {
+			status = nfserr_clid_inuse;
+			if (client_has_state(old)
+					&& !same_creds(&unconf->cl_cred,
+							&old->cl_cred))
+				goto out;
 			status = mark_client_expired_locked(old);
 			if (status) {
 				old = NULL;
-- 
cgit v1.1


From 4eaea13425078272895ec37814c6878d78b8db9f Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 15 Oct 2015 16:11:01 -0400
Subject: nfsd: improve client_has_state to check for unused openowners

At least in the v4.0 case openowners can hang around for a while after
last close, but they shouldn't really block (for example), a new mount
with a different principal.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index bac6207..f2ea343 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2256,16 +2256,20 @@ nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid)
 	clid->flags = new->cl_exchange_flags;
 }
 
+static bool client_has_openowners(struct nfs4_client *clp)
+{
+	struct nfs4_openowner *oo;
+
+	list_for_each_entry(oo, &clp->cl_openowners, oo_perclient) {
+		if (!list_empty(&oo->oo_owner.so_stateids))
+			return true;
+	}
+	return false;
+}
+
 static bool client_has_state(struct nfs4_client *clp)
 {
-	/*
-	 * Note clp->cl_openowners check isn't quite right: there's no
-	 * need to count owners without stateid's.
-	 *
-	 * Also note in 4.0 case should also be checking for openowners
-	 * kept around just for close handling.
-	 */
-	return !list_empty(&clp->cl_openowners)
+	return client_has_openowners(clp)
 #ifdef CONFIG_NFSD_PNFS
 		|| !list_empty(&clp->cl_lo_states)
 #endif
-- 
cgit v1.1


From cc8a55320b5f1196bee5bd14e4bb2ebd3b983317 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@poochiereds.net>
Date: Thu, 17 Sep 2015 07:58:24 -0400
Subject: nfsd: serialize layout stateid morphing operations

In order to allow the client to make a sane determination of what
happened with racing LAYOUTGET/LAYOUTRETURN/CB_LAYOUTRECALL calls, we
must ensure that the seqids return accurately represent the order of
operations. The simplest way to do that is to ensure that operations on
a single stateid are serialized.

This patch adds a mutex to the layout stateid, and locks it when
checking the layout stateid's seqid. The mutex is held over the entire
operation and released after the seqid is bumped.

Note that in the case of CB_LAYOUTRECALL we must move the increment of
the seqid and setting into a new cb "prepare" operation. The lease
infrastructure will call the lm_break callback with a spinlock held, so
and we can't take the mutex in that codepath.

Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4layouts.c | 25 +++++++++++++++++++++----
 fs/nfsd/nfs4proc.c    |  4 ++++
 fs/nfsd/state.h       |  1 +
 3 files changed, 26 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index ebf90e4..4a68ab9 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -201,6 +201,7 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
 	INIT_LIST_HEAD(&ls->ls_perfile);
 	spin_lock_init(&ls->ls_lock);
 	INIT_LIST_HEAD(&ls->ls_layouts);
+	mutex_init(&ls->ls_mutex);
 	ls->ls_layout_type = layout_type;
 	nfsd4_init_cb(&ls->ls_recall, clp, &nfsd4_cb_layout_ops,
 			NFSPROC4_CLNT_CB_LAYOUT);
@@ -262,19 +263,23 @@ nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
 		status = nfserr_jukebox;
 		if (!ls)
 			goto out;
+		mutex_lock(&ls->ls_mutex);
 	} else {
 		ls = container_of(stid, struct nfs4_layout_stateid, ls_stid);
 
 		status = nfserr_bad_stateid;
+		mutex_lock(&ls->ls_mutex);
 		if (stateid->si_generation > stid->sc_stateid.si_generation)
-			goto out_put_stid;
+			goto out_unlock_stid;
 		if (layout_type != ls->ls_layout_type)
-			goto out_put_stid;
+			goto out_unlock_stid;
 	}
 
 	*lsp = ls;
 	return 0;
 
+out_unlock_stid:
+	mutex_unlock(&ls->ls_mutex);
 out_put_stid:
 	nfs4_put_stid(stid);
 out:
@@ -296,8 +301,6 @@ nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls)
 	trace_layout_recall(&ls->ls_stid.sc_stateid);
 
 	atomic_inc(&ls->ls_stid.sc_count);
-	update_stateid(&ls->ls_stid.sc_stateid);
-	memcpy(&ls->ls_recall_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t));
 	nfsd4_run_cb(&ls->ls_recall);
 
 out_unlock:
@@ -494,6 +497,7 @@ nfsd4_return_file_layouts(struct svc_rqst *rqstp,
 	}
 	spin_unlock(&ls->ls_lock);
 
+	mutex_unlock(&ls->ls_mutex);
 	nfs4_put_stid(&ls->ls_stid);
 	nfsd4_free_layouts(&reaplist);
 	return nfs_ok;
@@ -608,6 +612,17 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls)
 	}
 }
 
+static void
+nfsd4_cb_layout_prepare(struct nfsd4_callback *cb)
+{
+	struct nfs4_layout_stateid *ls =
+		container_of(cb, struct nfs4_layout_stateid, ls_recall);
+
+	mutex_lock(&ls->ls_mutex);
+	update_stateid(&ls->ls_stid.sc_stateid);
+	memcpy(&ls->ls_recall_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t));
+}
+
 static int
 nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
 {
@@ -649,12 +664,14 @@ nfsd4_cb_layout_release(struct nfsd4_callback *cb)
 
 	trace_layout_recall_release(&ls->ls_stid.sc_stateid);
 
+	mutex_unlock(&ls->ls_mutex);
 	nfsd4_return_all_layouts(ls, &reaplist);
 	nfsd4_free_layouts(&reaplist);
 	nfs4_put_stid(&ls->ls_stid);
 }
 
 static struct nfsd4_callback_ops nfsd4_cb_layout_ops = {
+	.prepare	= nfsd4_cb_layout_prepare,
 	.done		= nfsd4_cb_layout_done,
 	.release	= nfsd4_cb_layout_release,
 };
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 4ce6b97..a9f096c 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1309,6 +1309,7 @@ nfsd4_layoutget(struct svc_rqst *rqstp,
 	nfserr = nfsd4_insert_layout(lgp, ls);
 
 out_put_stid:
+	mutex_unlock(&ls->ls_mutex);
 	nfs4_put_stid(&ls->ls_stid);
 out:
 	return nfserr;
@@ -1362,6 +1363,9 @@ nfsd4_layoutcommit(struct svc_rqst *rqstp,
 		goto out;
 	}
 
+	/* LAYOUTCOMMIT does not require any serialization */
+	mutex_unlock(&ls->ls_mutex);
+
 	if (new_size > i_size_read(inode)) {
 		lcp->lc_size_chg = 1;
 		lcp->lc_newsize = new_size;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 31bde12..1fa0f38 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -562,6 +562,7 @@ struct nfs4_layout_stateid {
 	struct nfsd4_callback		ls_recall;
 	stateid_t			ls_recall_sid;
 	bool				ls_recalled;
+	struct mutex			ls_mutex;
 };
 
 static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s)
-- 
cgit v1.1


From 9767feb2c64b29775f1ea683130b44f95f67d169 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@poochiereds.net>
Date: Thu, 1 Oct 2015 09:05:50 -0400
Subject: nfsd: ensure that seqid morphing operations are atomic wrt to copies

Bruce points out that the increment of the seqid in stateids is not
serialized in any way, so it's possible for racing calls to bump it
twice and end up sending the same stateid. While we don't have any
reports of this problem it _is_ theoretically possible, and could lead
to spurious state recovery by the client.

In the current code, update_stateid is always followed by a memcpy of
that stateid, so we can combine the two operations. For better
atomicity, we add a spinlock to the nfs4_stid and hold that when bumping
the seqid and copying the stateid.

Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4layouts.c | 13 ++++---------
 fs/nfsd/nfs4state.c   | 34 +++++++++++++++++++---------------
 fs/nfsd/state.h       | 23 ++++++++---------------
 3 files changed, 31 insertions(+), 39 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index 4a68ab9..9ffef06 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -409,8 +409,7 @@ nfsd4_insert_layout(struct nfsd4_layoutget *lgp, struct nfs4_layout_stateid *ls)
 	list_add_tail(&new->lo_perstate, &ls->ls_layouts);
 	new = NULL;
 done:
-	update_stateid(&ls->ls_stid.sc_stateid);
-	memcpy(&lgp->lg_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t));
+	nfs4_inc_and_copy_stateid(&lgp->lg_sid, &ls->ls_stid);
 	spin_unlock(&ls->ls_lock);
 out:
 	spin_unlock(&fp->fi_lock);
@@ -484,11 +483,8 @@ nfsd4_return_file_layouts(struct svc_rqst *rqstp,
 		}
 	}
 	if (!list_empty(&ls->ls_layouts)) {
-		if (found) {
-			update_stateid(&ls->ls_stid.sc_stateid);
-			memcpy(&lrp->lr_sid, &ls->ls_stid.sc_stateid,
-				sizeof(stateid_t));
-		}
+		if (found)
+			nfs4_inc_and_copy_stateid(&lrp->lr_sid, &ls->ls_stid);
 		lrp->lrs_present = 1;
 	} else {
 		trace_layoutstate_unhash(&ls->ls_stid.sc_stateid);
@@ -619,8 +615,7 @@ nfsd4_cb_layout_prepare(struct nfsd4_callback *cb)
 		container_of(cb, struct nfs4_layout_stateid, ls_recall);
 
 	mutex_lock(&ls->ls_mutex);
-	update_stateid(&ls->ls_stid.sc_stateid);
-	memcpy(&ls->ls_recall_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t));
+	nfs4_inc_and_copy_stateid(&ls->ls_recall_sid, &ls->ls_stid);
 }
 
 static int
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index f2ea343..0a69715 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -575,6 +575,7 @@ struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
 	stid->sc_stateid.si_opaque.so_clid = cl->cl_clientid;
 	/* Will be incremented before return to client: */
 	atomic_set(&stid->sc_count, 1);
+	spin_lock_init(&stid->sc_lock);
 
 	/*
 	 * It shouldn't be a problem to reuse an opaque stateid value.
@@ -745,6 +746,18 @@ nfs4_put_stid(struct nfs4_stid *s)
 		put_nfs4_file(fp);
 }
 
+void
+nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid)
+{
+	stateid_t *src = &stid->sc_stateid;
+
+	spin_lock(&stid->sc_lock);
+	if (unlikely(++src->si_generation == 0))
+		src->si_generation = 1;
+	memcpy(dst, src, sizeof(*dst));
+	spin_unlock(&stid->sc_lock);
+}
+
 static void nfs4_put_deleg_lease(struct nfs4_file *fp)
 {
 	struct file *filp = NULL;
@@ -4221,8 +4234,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 		if (stp->st_clnt_odstate == open->op_odstate)
 			open->op_odstate = NULL;
 	}
-	update_stateid(&stp->st_stid.sc_stateid);
-	memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+	nfs4_inc_and_copy_stateid(&open->op_stateid, &stp->st_stid);
 	up_read(&stp->st_rwsem);
 
 	if (nfsd4_has_session(&resp->cstate)) {
@@ -4925,8 +4937,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		goto put_stateid;
 	}
 	oo->oo_flags |= NFS4_OO_CONFIRMED;
-	update_stateid(&stp->st_stid.sc_stateid);
-	memcpy(&oc->oc_resp_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+	nfs4_inc_and_copy_stateid(&oc->oc_resp_stateid, &stp->st_stid);
 	up_write(&stp->st_rwsem);
 	dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n",
 		__func__, oc->oc_seqid, STATEID_VAL(&stp->st_stid.sc_stateid));
@@ -4999,11 +5010,8 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
 		goto put_stateid;
 	}
 	nfs4_stateid_downgrade(stp, od->od_share_access);
-
 	reset_union_bmap_deny(od->od_share_deny, stp);
-
-	update_stateid(&stp->st_stid.sc_stateid);
-	memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+	nfs4_inc_and_copy_stateid(&od->od_stateid, &stp->st_stid);
 	status = nfs_ok;
 put_stateid:
 	up_write(&stp->st_rwsem);
@@ -5058,8 +5066,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	nfsd4_bump_seqid(cstate, status);
 	if (status)
 		goto out; 
-	update_stateid(&stp->st_stid.sc_stateid);
-	memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+	nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid);
 	up_write(&stp->st_rwsem);
 
 	nfsd4_close_open_stateid(stp);
@@ -5542,9 +5549,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	err = vfs_lock_file(filp, F_SETLK, file_lock, conflock);
 	switch (-err) {
 	case 0: /* success! */
-		update_stateid(&lock_stp->st_stid.sc_stateid);
-		memcpy(&lock->lk_resp_stateid, &lock_stp->st_stid.sc_stateid, 
-				sizeof(stateid_t));
+		nfs4_inc_and_copy_stateid(&lock->lk_resp_stateid, &lock_stp->st_stid);
 		status = 0;
 		break;
 	case (EAGAIN):		/* conflock holds conflicting lock */
@@ -5736,8 +5741,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		dprintk("NFSD: nfs4_locku: vfs_lock_file failed!\n");
 		goto out_nfserr;
 	}
-	update_stateid(&stp->st_stid.sc_stateid);
-	memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+	nfs4_inc_and_copy_stateid(&locku->lu_stateid, &stp->st_stid);
 fput:
 	fput(filp);
 put_stateid:
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 1fa0f38..77fdf4d 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -84,7 +84,7 @@ struct nfsd4_callback_ops {
  * fields that are of general use to any stateid.
  */
 struct nfs4_stid {
-	atomic_t sc_count;
+	atomic_t		sc_count;
 #define NFS4_OPEN_STID 1
 #define NFS4_LOCK_STID 2
 #define NFS4_DELEG_STID 4
@@ -94,11 +94,12 @@ struct nfs4_stid {
 #define NFS4_REVOKED_DELEG_STID 16
 #define NFS4_CLOSED_DELEG_STID 32
 #define NFS4_LAYOUT_STID 64
-	unsigned char sc_type;
-	stateid_t sc_stateid;
-	struct nfs4_client *sc_client;
-	struct nfs4_file *sc_file;
-	void (*sc_free)(struct nfs4_stid *);
+	unsigned char		sc_type;
+	stateid_t		sc_stateid;
+	spinlock_t		sc_lock;
+	struct nfs4_client	*sc_client;
+	struct nfs4_file	*sc_file;
+	void			(*sc_free)(struct nfs4_stid *);
 };
 
 /*
@@ -364,15 +365,6 @@ struct nfs4_client_reclaim {
 	char			cr_recdir[HEXDIR_LEN]; /* recover dir */
 };
 
-static inline void
-update_stateid(stateid_t *stateid)
-{
-	stateid->si_generation++;
-	/* Wraparound recommendation from 3530bis-13 9.1.3.2: */
-	if (stateid->si_generation == 0)
-		stateid->si_generation = 1;
-}
-
 /* A reasonable value for REPLAY_ISIZE was estimated as follows:  
  * The OPEN response, typically the largest, requires 
  *   4(status) + 8(stateid) + 20(changeinfo) + 4(rflags) +  8(verifier) + 
@@ -595,6 +587,7 @@ struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
 		struct kmem_cache *slab);
 void nfs4_unhash_stid(struct nfs4_stid *s);
 void nfs4_put_stid(struct nfs4_stid *s);
+void nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid);
 void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *);
 extern void nfs4_release_reclaim(struct nfsd_net *);
 extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir,
-- 
cgit v1.1


From c57d3e7a9391c03ae7ee5572be850284393f5bef Mon Sep 17 00:00:00 2001
From: Jean Delvare <jdelvare@suse.de>
Date: Tue, 8 Sep 2015 11:05:49 +0200
Subject: i2c-dev: Fix typo in ioctl name reference

The ioctl is named I2C_RDWR for "I2C read/write". But references to it
were misspelled "rdrw". Fix them.

Signed-off-by: Jean Delvare <jdelvare@suse.de>
Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
---
 fs/compat_ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 48851f6..dcf2653 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -686,7 +686,7 @@ static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd,
 
 	if (get_user(nmsgs, &udata->nmsgs))
 		return -EFAULT;
-	if (nmsgs > I2C_RDRW_IOCTL_MAX_MSGS)
+	if (nmsgs > I2C_RDWR_IOCTL_MAX_MSGS)
 		return -EINVAL;
 
 	if (get_user(datap, &udata->msgs))
-- 
cgit v1.1


From 2c3cf7d5f6105bb957df125dfce61d4483b8742d Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 22 Oct 2015 09:47:34 +0100
Subject: Btrfs: fix regression when running delayed references
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In the kernel 4.2 merge window we had a refactoring/rework of the delayed
references implementation in order to fix certain problems with qgroups.
However that rework introduced one more regression that leads to the
following trace when running delayed references for metadata:

[35908.064664] kernel BUG at fs/btrfs/extent-tree.c:1832!
[35908.065201] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
[35908.065201] Modules linked in: dm_flakey dm_mod btrfs crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse parport_pc psmouse i2
[35908.065201] CPU: 14 PID: 15014 Comm: kworker/u32:9 Tainted: G        W       4.3.0-rc5-btrfs-next-17+ #1
[35908.065201] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014
[35908.065201] Workqueue: btrfs-extent-refs btrfs_extent_refs_helper [btrfs]
[35908.065201] task: ffff880114b7d780 ti: ffff88010c4c8000 task.ti: ffff88010c4c8000
[35908.065201] RIP: 0010:[<ffffffffa04928b5>]  [<ffffffffa04928b5>] insert_inline_extent_backref+0x52/0xb1 [btrfs]
[35908.065201] RSP: 0018:ffff88010c4cbb08  EFLAGS: 00010293
[35908.065201] RAX: 0000000000000000 RBX: ffff88008a661000 RCX: 0000000000000000
[35908.065201] RDX: ffffffffa04dd58f RSI: 0000000000000001 RDI: 0000000000000000
[35908.065201] RBP: ffff88010c4cbb40 R08: 0000000000001000 R09: ffff88010c4cb9f8
[35908.065201] R10: 0000000000000000 R11: 000000000000002c R12: 0000000000000000
[35908.065201] R13: ffff88020a74c578 R14: 0000000000000000 R15: 0000000000000000
[35908.065201] FS:  0000000000000000(0000) GS:ffff88023edc0000(0000) knlGS:0000000000000000
[35908.065201] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[35908.065201] CR2: 00000000015e8708 CR3: 0000000102185000 CR4: 00000000000006e0
[35908.065201] Stack:
[35908.065201]  ffff88010c4cbb18 0000000000000f37 ffff88020a74c578 ffff88015a408000
[35908.065201]  ffff880154a44000 0000000000000000 0000000000000005 ffff88010c4cbbd8
[35908.065201]  ffffffffa0492b9a 0000000000000005 0000000000000000 0000000000000000
[35908.065201] Call Trace:
[35908.065201]  [<ffffffffa0492b9a>] __btrfs_inc_extent_ref+0x8b/0x208 [btrfs]
[35908.065201]  [<ffffffffa0497117>] ? __btrfs_run_delayed_refs+0x4d4/0xd33 [btrfs]
[35908.065201]  [<ffffffffa049773d>] __btrfs_run_delayed_refs+0xafa/0xd33 [btrfs]
[35908.065201]  [<ffffffffa04a976a>] ? join_transaction.isra.10+0x25/0x41f [btrfs]
[35908.065201]  [<ffffffffa04a97ed>] ? join_transaction.isra.10+0xa8/0x41f [btrfs]
[35908.065201]  [<ffffffffa049914d>] btrfs_run_delayed_refs+0x75/0x1dd [btrfs]
[35908.065201]  [<ffffffffa04992f1>] delayed_ref_async_start+0x3c/0x7b [btrfs]
[35908.065201]  [<ffffffffa04d4b4f>] normal_work_helper+0x14c/0x32a [btrfs]
[35908.065201]  [<ffffffffa04d4e93>] btrfs_extent_refs_helper+0x12/0x14 [btrfs]
[35908.065201]  [<ffffffff81063b23>] process_one_work+0x24a/0x4ac
[35908.065201]  [<ffffffff81064285>] worker_thread+0x206/0x2c2
[35908.065201]  [<ffffffff8106407f>] ? rescuer_thread+0x2cb/0x2cb
[35908.065201]  [<ffffffff8106407f>] ? rescuer_thread+0x2cb/0x2cb
[35908.065201]  [<ffffffff8106904d>] kthread+0xef/0xf7
[35908.065201]  [<ffffffff81068f5e>] ? kthread_parkme+0x24/0x24
[35908.065201]  [<ffffffff8147d10f>] ret_from_fork+0x3f/0x70
[35908.065201]  [<ffffffff81068f5e>] ? kthread_parkme+0x24/0x24
[35908.065201] Code: 6a 01 41 56 41 54 ff 75 10 41 51 4d 89 c1 49 89 c8 48 8d 4d d0 e8 f6 f1 ff ff 48 83 c4 28 85 c0 75 2c 49 81 fc ff 00 00 00 77 02 <0f> 0b 4c 8b 45 30 8b 4d 28 45 31
[35908.065201] RIP  [<ffffffffa04928b5>] insert_inline_extent_backref+0x52/0xb1 [btrfs]
[35908.065201]  RSP <ffff88010c4cbb08>
[35908.310885] ---[ end trace fe4299baf0666457 ]---

This happens because the new delayed references code no longer merges
delayed references that have different sequence values. The following
steps are an example sequence leading to this issue:

1) Transaction N starts, fs_info->tree_mod_seq has value 0;

2) Extent buffer (btree node) A is allocated, delayed reference Ref1 for
   bytenr A is created, with a value of 1 and a seq value of 0;

3) fs_info->tree_mod_seq is incremented to 1;

4) Extent buffer A is deleted through btrfs_del_items(), which calls
   btrfs_del_leaf(), which in turn calls btrfs_free_tree_block(). The
   later returns the metadata extent associated to extent buffer A to
   the free space cache (the range is not pinned), because the extent
   buffer was created in the current transaction (N) and writeback never
   happened for the extent buffer (flag BTRFS_HEADER_FLAG_WRITTEN not set
   in the extent buffer).
   This creates the delayed reference Ref2 for bytenr A, with a value
   of -1 and a seq value of 1;

5) Delayed reference Ref2 is not merged with Ref1 when we create it,
   because they have different sequence numbers (decided at
   add_delayed_ref_tail_merge());

6) fs_info->tree_mod_seq is incremented to 2;

7) Some task attempts to allocate a new extent buffer (done at
   extent-tree.c:find_free_extent()), but due to heavy fragmentation
   and running low on metadata space the clustered allocation fails
   and we fall back to unclustered allocation, which finds the
   extent at offset A, so a new extent buffer at offset A is allocated.
   This creates delayed reference Ref3 for bytenr A, with a value of 1
   and a seq value of 2;

8) Ref3 is not merged neither with Ref2 nor Ref1, again because they
   all have different seq values;

9) We start running the delayed references (__btrfs_run_delayed_refs());

10) The delayed Ref1 is the first one being applied, which ends up
    creating an inline extent backref in the extent tree;

10) Next the delayed reference Ref3 is selected for execution, and not
    Ref2, because select_delayed_ref() always gives a preference for
    positive references (that have an action of BTRFS_ADD_DELAYED_REF);

11) When running Ref3 we encounter alreay the inline extent backref
    in the extent tree at insert_inline_extent_backref(), which makes
    us hit the following BUG_ON:

        BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);

    This is always true because owner corresponds to the level of the
    extent buffer/btree node in the btree.

For the scenario described above we hit the BUG_ON because we never merge
references that have different seq values.

We used to do the merging before the 4.2 kernel, more specifically, before
the commmits:

  c6fc24549960 ("btrfs: delayed-ref: Use list to replace the ref_root in ref_head.")
  c43d160fcd5e ("btrfs: delayed-ref: Cleanup the unneeded functions.")

This issue became more exposed after the following change that was added
to 4.2 as well:

  cffc3374e567 ("Btrfs: fix order by which delayed references are run")

Which in turn fixed another regression by the two commits previously
mentioned.

So fix this by bringing back the delayed reference merge code, with the
proper adaptations so that it operates against the new data structure
(linked list vs old red black tree implementation).

This issue was hit running fstest btrfs/063 in a loop. Several people have
reported this issue in the mailing list when running on kernels 4.2+.

Very special thanks to Stéphane Lesimple for helping debugging this issue
and testing this fix on his multi terabyte filesystem (which took more
than one day to balance alone, plus fsck, etc).

Fixes: c6fc24549960 ("btrfs: delayed-ref: Use list to replace the ref_root in ref_head.")
Reported-by: Peter Becker <floyd.net@gmail.com>
Reported-by: Stéphane Lesimple <stephane_btrfs@lesimple.fr>
Tested-by: Stéphane Lesimple <stephane_btrfs@lesimple.fr>
Reported-by: Malte Schröder <malte@tnxip.de>
Reported-by: Derek Dongray <derek@valedon.co.uk>
Reported-by: Erkki Seppala <flux-btrfs@inside.org>
Cc: stable@vger.kernel.org  # 4.2+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
---
 fs/btrfs/delayed-ref.c | 113 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/extent-tree.c |  14 ++++++
 2 files changed, 127 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index bd9b63b..2f41580 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -197,6 +197,119 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
 		trans->delayed_ref_updates--;
 }
 
+static bool merge_ref(struct btrfs_trans_handle *trans,
+		      struct btrfs_delayed_ref_root *delayed_refs,
+		      struct btrfs_delayed_ref_head *head,
+		      struct btrfs_delayed_ref_node *ref,
+		      u64 seq)
+{
+	struct btrfs_delayed_ref_node *next;
+	bool done = false;
+
+	next = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node,
+				list);
+	while (!done && &next->list != &head->ref_list) {
+		int mod;
+		struct btrfs_delayed_ref_node *next2;
+
+		next2 = list_next_entry(next, list);
+
+		if (next == ref)
+			goto next;
+
+		if (seq && next->seq >= seq)
+			goto next;
+
+		if (next->type != ref->type || next->no_quota != ref->no_quota)
+			goto next;
+
+		if ((ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
+		     ref->type == BTRFS_SHARED_BLOCK_REF_KEY) &&
+		    comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref),
+				   btrfs_delayed_node_to_tree_ref(next),
+				   ref->type))
+			goto next;
+		if ((ref->type == BTRFS_EXTENT_DATA_REF_KEY ||
+		     ref->type == BTRFS_SHARED_DATA_REF_KEY) &&
+		    comp_data_refs(btrfs_delayed_node_to_data_ref(ref),
+				   btrfs_delayed_node_to_data_ref(next)))
+			goto next;
+
+		if (ref->action == next->action) {
+			mod = next->ref_mod;
+		} else {
+			if (ref->ref_mod < next->ref_mod) {
+				swap(ref, next);
+				done = true;
+			}
+			mod = -next->ref_mod;
+		}
+
+		drop_delayed_ref(trans, delayed_refs, head, next);
+		ref->ref_mod += mod;
+		if (ref->ref_mod == 0) {
+			drop_delayed_ref(trans, delayed_refs, head, ref);
+			done = true;
+		} else {
+			/*
+			 * Can't have multiples of the same ref on a tree block.
+			 */
+			WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
+				ref->type == BTRFS_SHARED_BLOCK_REF_KEY);
+		}
+next:
+		next = next2;
+	}
+
+	return done;
+}
+
+void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
+			      struct btrfs_fs_info *fs_info,
+			      struct btrfs_delayed_ref_root *delayed_refs,
+			      struct btrfs_delayed_ref_head *head)
+{
+	struct btrfs_delayed_ref_node *ref;
+	u64 seq = 0;
+
+	assert_spin_locked(&head->lock);
+
+	if (list_empty(&head->ref_list))
+		return;
+
+	/* We don't have too many refs to merge for data. */
+	if (head->is_data)
+		return;
+
+	spin_lock(&fs_info->tree_mod_seq_lock);
+	if (!list_empty(&fs_info->tree_mod_seq_list)) {
+		struct seq_list *elem;
+
+		elem = list_first_entry(&fs_info->tree_mod_seq_list,
+					struct seq_list, list);
+		seq = elem->seq;
+	}
+	spin_unlock(&fs_info->tree_mod_seq_lock);
+
+	ref = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node,
+			       list);
+	while (&ref->list != &head->ref_list) {
+		if (seq && ref->seq >= seq)
+			goto next;
+
+		if (merge_ref(trans, delayed_refs, head, ref, seq)) {
+			if (list_empty(&head->ref_list))
+				break;
+			ref = list_first_entry(&head->ref_list,
+					       struct btrfs_delayed_ref_node,
+					       list);
+			continue;
+		}
+next:
+		ref = list_next_entry(ref, list);
+	}
+}
+
 int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
 			    struct btrfs_delayed_ref_root *delayed_refs,
 			    u64 seq)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 92fdbc6..c0f30f5 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2502,7 +2502,21 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 			}
 		}
 
+		/*
+		 * We need to try and merge add/drops of the same ref since we
+		 * can run into issues with relocate dropping the implicit ref
+		 * and then it being added back again before the drop can
+		 * finish.  If we merged anything we need to re-loop so we can
+		 * get a good ref.
+		 * Or we can get node references of the same type that weren't
+		 * merged when created due to bumps in the tree mod seq, and
+		 * we need to merge them to prevent adding an inline extent
+		 * backref before dropping it (triggering a BUG_ON at
+		 * insert_inline_extent_backref()).
+		 */
 		spin_lock(&locked_ref->lock);
+		btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
+					 locked_ref);
 
 		/*
 		 * locked_ref is the head node, so we have to go one
-- 
cgit v1.1


From b06c4bf5c874a57254b197f53ddf588e7a24a2bf Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Fri, 23 Oct 2015 07:52:54 +0100
Subject: Btrfs: fix regression running delayed references when using qgroups
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In the kernel 4.2 merge window we had a big changes to the implementation
of delayed references and qgroups which made the no_quota field of delayed
references not used anymore. More specifically the no_quota field is not
used anymore as of:

  commit 0ed4792af0e8 ("btrfs: qgroup: Switch to new extent-oriented qgroup mechanism.")

Leaving the no_quota field actually prevents delayed references from
getting merged, which in turn cause the following BUG_ON(), at
fs/btrfs/extent-tree.c, to be hit when qgroups are enabled:

  static int run_delayed_tree_ref(...)
  {
     (...)
     BUG_ON(node->ref_mod != 1);
     (...)
  }

This happens on a scenario like the following:

  1) Ref1 bytenr X, action = BTRFS_ADD_DELAYED_REF, no_quota = 1, added.

  2) Ref2 bytenr X, action = BTRFS_DROP_DELAYED_REF, no_quota = 0, added.
     It's not merged with Ref1 because Ref1->no_quota != Ref2->no_quota.

  3) Ref3 bytenr X, action = BTRFS_ADD_DELAYED_REF, no_quota = 1, added.
     It's not merged with the reference at the tail of the list of refs
     for bytenr X because the reference at the tail, Ref2 is incompatible
     due to Ref2->no_quota != Ref3->no_quota.

  4) Ref4 bytenr X, action = BTRFS_DROP_DELAYED_REF, no_quota = 0, added.
     It's not merged with the reference at the tail of the list of refs
     for bytenr X because the reference at the tail, Ref3 is incompatible
     due to Ref3->no_quota != Ref4->no_quota.

  5) We run delayed references, trigger merging of delayed references,
     through __btrfs_run_delayed_refs() -> btrfs_merge_delayed_refs().

  6) Ref1 and Ref3 are merged as Ref1->no_quota = Ref3->no_quota and
     all other conditions are satisfied too. So Ref1 gets a ref_mod
     value of 2.

  7) Ref2 and Ref4 are merged as Ref2->no_quota = Ref4->no_quota and
     all other conditions are satisfied too. So Ref2 gets a ref_mod
     value of 2.

  8) Ref1 and Ref2 aren't merged, because they have different values
     for their no_quota field.

  9) Delayed reference Ref1 is picked for running (select_delayed_ref()
     always prefers references with an action == BTRFS_ADD_DELAYED_REF).
     So run_delayed_tree_ref() is called for Ref1 which triggers the
     BUG_ON because Ref1->red_mod != 1 (equals 2).

So fix this by removing the no_quota field, as it's not used anymore as
of commit 0ed4792af0e8 ("btrfs: qgroup: Switch to new extent-oriented
qgroup mechanism.").

The use of no_quota was also buggy in at least two places:

1) At delayed-refs.c:btrfs_add_delayed_tree_ref() - we were setting
   no_quota to 0 instead of 1 when the following condition was true:
   is_fstree(ref_root) || !fs_info->quota_enabled

2) At extent-tree.c:__btrfs_inc_extent_ref() - we were attempting to
   reset a node's no_quota when the condition "!is_fstree(root_objectid)
   || !root->fs_info->quota_enabled" was true but we did it only in
   an unused local stack variable, that is, we never reset the no_quota
   value in the node itself.

This fixes the remainder of problems several people have been having when
running delayed references, mostly while a balance is running in parallel,
on a 4.2+ kernel.

Very special thanks to Stéphane Lesimple for helping debugging this issue
and testing this fix on his multi terabyte filesystem (which took more
than one day to balance alone, plus fsck, etc).

Also, this fixes deadlock issue when using the clone ioctl with qgroups
enabled, as reported by Elias Probst in the mailing list. The deadlock
happens because after calling btrfs_insert_empty_item we have our path
holding a write lock on a leaf of the fs/subvol tree and then before
releasing the path we called check_ref() which did backref walking, when
qgroups are enabled, and tried to read lock the same leaf. The trace for
this case is the following:

  INFO: task systemd-nspawn:6095 blocked for more than 120 seconds.
  (...)
  Call Trace:
    [<ffffffff86999201>] schedule+0x74/0x83
    [<ffffffff863ef64c>] btrfs_tree_read_lock+0xc0/0xea
    [<ffffffff86137ed7>] ? wait_woken+0x74/0x74
    [<ffffffff8639f0a7>] btrfs_search_old_slot+0x51a/0x810
    [<ffffffff863a129b>] btrfs_next_old_leaf+0xdf/0x3ce
    [<ffffffff86413a00>] ? ulist_add_merge+0x1b/0x127
    [<ffffffff86411688>] __resolve_indirect_refs+0x62a/0x667
    [<ffffffff863ef546>] ? btrfs_clear_lock_blocking_rw+0x78/0xbe
    [<ffffffff864122d3>] find_parent_nodes+0xaf3/0xfc6
    [<ffffffff86412838>] __btrfs_find_all_roots+0x92/0xf0
    [<ffffffff864128f2>] btrfs_find_all_roots+0x45/0x65
    [<ffffffff8639a75b>] ? btrfs_get_tree_mod_seq+0x2b/0x88
    [<ffffffff863e852e>] check_ref+0x64/0xc4
    [<ffffffff863e9e01>] btrfs_clone+0x66e/0xb5d
    [<ffffffff863ea77f>] btrfs_ioctl_clone+0x48f/0x5bb
    [<ffffffff86048a68>] ? native_sched_clock+0x28/0x77
    [<ffffffff863ed9b0>] btrfs_ioctl+0xabc/0x25cb
  (...)

The problem goes away by eleminating check_ref(), which no longer is
needed as its purpose was to get a value for the no_quota field of
a delayed reference (this patch removes the no_quota field as mentioned
earlier).

Reported-by: Stéphane Lesimple <stephane_btrfs@lesimple.fr>
Tested-by: Stéphane Lesimple <stephane_btrfs@lesimple.fr>
Reported-by: Elias Probst <mail@eliasprobst.eu>
Reported-by: Peter Becker <floyd.net@gmail.com>
Reported-by: Malte Schröder <malte@tnxip.de>
Reported-by: Derek Dongray <derek@valedon.co.uk>
Reported-by: Erkki Seppala <flux-btrfs@inside.org>
Cc: stable@vger.kernel.org  # 4.2+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
---
 fs/btrfs/ctree.h       |  4 ++--
 fs/btrfs/delayed-ref.c | 28 +++++++----------------
 fs/btrfs/delayed-ref.h |  7 ++----
 fs/btrfs/extent-tree.c | 45 +++++++++++++-----------------------
 fs/btrfs/file.c        | 10 ++++----
 fs/btrfs/inode.c       |  4 ++--
 fs/btrfs/ioctl.c       | 62 +-------------------------------------------------
 fs/btrfs/relocation.c  | 16 ++++++-------
 fs/btrfs/tree-log.c    |  2 +-
 9 files changed, 44 insertions(+), 134 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index bc3c711..3fa3c3b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3422,7 +3422,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
 int btrfs_free_extent(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root,
 		      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
-		      u64 owner, u64 offset, int no_quota);
+		      u64 owner, u64 offset);
 
 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len,
 			       int delalloc);
@@ -3435,7 +3435,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root,
 			 u64 bytenr, u64 num_bytes, u64 parent,
-			 u64 root_objectid, u64 owner, u64 offset, int no_quota);
+			 u64 root_objectid, u64 owner, u64 offset);
 
 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 2f41580..1c3588a 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -220,7 +220,7 @@ static bool merge_ref(struct btrfs_trans_handle *trans,
 		if (seq && next->seq >= seq)
 			goto next;
 
-		if (next->type != ref->type || next->no_quota != ref->no_quota)
+		if (next->type != ref->type)
 			goto next;
 
 		if ((ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
@@ -405,8 +405,7 @@ add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans,
 	exist = list_entry(href->ref_list.prev, struct btrfs_delayed_ref_node,
 			   list);
 	/* No need to compare bytenr nor is_head */
-	if (exist->type != ref->type || exist->no_quota != ref->no_quota ||
-	    exist->seq != ref->seq)
+	if (exist->type != ref->type || exist->seq != ref->seq)
 		goto add_tail;
 
 	if ((exist->type == BTRFS_TREE_BLOCK_REF_KEY ||
@@ -639,7 +638,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 		     struct btrfs_delayed_ref_head *head_ref,
 		     struct btrfs_delayed_ref_node *ref, u64 bytenr,
 		     u64 num_bytes, u64 parent, u64 ref_root, int level,
-		     int action, int no_quota)
+		     int action)
 {
 	struct btrfs_delayed_tree_ref *full_ref;
 	struct btrfs_delayed_ref_root *delayed_refs;
@@ -661,7 +660,6 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 	ref->action = action;
 	ref->is_head = 0;
 	ref->in_tree = 1;
-	ref->no_quota = no_quota;
 	ref->seq = seq;
 
 	full_ref = btrfs_delayed_node_to_tree_ref(ref);
@@ -694,7 +692,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 		     struct btrfs_delayed_ref_head *head_ref,
 		     struct btrfs_delayed_ref_node *ref, u64 bytenr,
 		     u64 num_bytes, u64 parent, u64 ref_root, u64 owner,
-		     u64 offset, int action, int no_quota)
+		     u64 offset, int action)
 {
 	struct btrfs_delayed_data_ref *full_ref;
 	struct btrfs_delayed_ref_root *delayed_refs;
@@ -717,7 +715,6 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 	ref->action = action;
 	ref->is_head = 0;
 	ref->in_tree = 1;
-	ref->no_quota = no_quota;
 	ref->seq = seq;
 
 	full_ref = btrfs_delayed_node_to_data_ref(ref);
@@ -748,17 +745,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 			       struct btrfs_trans_handle *trans,
 			       u64 bytenr, u64 num_bytes, u64 parent,
 			       u64 ref_root,  int level, int action,
-			       struct btrfs_delayed_extent_op *extent_op,
-			       int no_quota)
+			       struct btrfs_delayed_extent_op *extent_op)
 {
 	struct btrfs_delayed_tree_ref *ref;
 	struct btrfs_delayed_ref_head *head_ref;
 	struct btrfs_delayed_ref_root *delayed_refs;
 	struct btrfs_qgroup_extent_record *record = NULL;
 
-	if (!is_fstree(ref_root) || !fs_info->quota_enabled)
-		no_quota = 0;
-
 	BUG_ON(extent_op && extent_op->is_data);
 	ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
 	if (!ref)
@@ -787,8 +780,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 					bytenr, num_bytes, action, 0);
 
 	add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr,
-				   num_bytes, parent, ref_root, level, action,
-				   no_quota);
+			     num_bytes, parent, ref_root, level, action);
 	spin_unlock(&delayed_refs->lock);
 
 	return 0;
@@ -809,17 +801,13 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 			       u64 bytenr, u64 num_bytes,
 			       u64 parent, u64 ref_root,
 			       u64 owner, u64 offset, int action,
-			       struct btrfs_delayed_extent_op *extent_op,
-			       int no_quota)
+			       struct btrfs_delayed_extent_op *extent_op)
 {
 	struct btrfs_delayed_data_ref *ref;
 	struct btrfs_delayed_ref_head *head_ref;
 	struct btrfs_delayed_ref_root *delayed_refs;
 	struct btrfs_qgroup_extent_record *record = NULL;
 
-	if (!is_fstree(ref_root) || !fs_info->quota_enabled)
-		no_quota = 0;
-
 	BUG_ON(extent_op && !extent_op->is_data);
 	ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
 	if (!ref)
@@ -855,7 +843,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 
 	add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr,
 				   num_bytes, parent, ref_root, owner, offset,
-				   action, no_quota);
+				   action);
 	spin_unlock(&delayed_refs->lock);
 
 	return 0;
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index d4c41e2..f9cf234 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -68,7 +68,6 @@ struct btrfs_delayed_ref_node {
 
 	unsigned int action:8;
 	unsigned int type:8;
-	unsigned int no_quota:1;
 	/* is this node still in the rbtree? */
 	unsigned int is_head:1;
 	unsigned int in_tree:1;
@@ -244,15 +243,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 			       struct btrfs_trans_handle *trans,
 			       u64 bytenr, u64 num_bytes, u64 parent,
 			       u64 ref_root, int level, int action,
-			       struct btrfs_delayed_extent_op *extent_op,
-			       int no_quota);
+			       struct btrfs_delayed_extent_op *extent_op);
 int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 			       struct btrfs_trans_handle *trans,
 			       u64 bytenr, u64 num_bytes,
 			       u64 parent, u64 ref_root,
 			       u64 owner, u64 offset, int action,
-			       struct btrfs_delayed_extent_op *extent_op,
-			       int no_quota);
+			       struct btrfs_delayed_extent_op *extent_op);
 int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info,
 				     struct btrfs_trans_handle *trans,
 				     u64 ref_root, u64 bytenr, u64 num_bytes);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c0f30f5..c1f8c7e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -95,8 +95,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root,
 				     u64 parent, u64 root_objectid,
 				     u64 flags, struct btrfs_disk_key *key,
-				     int level, struct btrfs_key *ins,
-				     int no_quota);
+				     int level, struct btrfs_key *ins);
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *extent_root, u64 flags,
 			  int force);
@@ -2073,8 +2072,7 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root,
 			 u64 bytenr, u64 num_bytes, u64 parent,
-			 u64 root_objectid, u64 owner, u64 offset,
-			 int no_quota)
+			 u64 root_objectid, u64 owner, u64 offset)
 {
 	int ret;
 	struct btrfs_fs_info *fs_info = root->fs_info;
@@ -2086,12 +2084,12 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 		ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
 					num_bytes,
 					parent, root_objectid, (int)owner,
-					BTRFS_ADD_DELAYED_REF, NULL, no_quota);
+					BTRFS_ADD_DELAYED_REF, NULL);
 	} else {
 		ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
 					num_bytes,
 					parent, root_objectid, owner, offset,
-					BTRFS_ADD_DELAYED_REF, NULL, no_quota);
+					BTRFS_ADD_DELAYED_REF, NULL);
 	}
 	return ret;
 }
@@ -2112,15 +2110,11 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	u64 num_bytes = node->num_bytes;
 	u64 refs;
 	int ret;
-	int no_quota = node->no_quota;
 
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 
-	if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled)
-		no_quota = 1;
-
 	path->reada = 1;
 	path->leave_spinning = 1;
 	/* this will setup the path even if it fails to insert the back ref */
@@ -2355,8 +2349,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
 						parent, ref_root,
 						extent_op->flags_to_set,
 						&extent_op->key,
-						ref->level, &ins,
-						node->no_quota);
+						ref->level, &ins);
 	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
 		ret = __btrfs_inc_extent_ref(trans, root, node,
 					     parent, ref_root,
@@ -3192,7 +3185,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
 	int level;
 	int ret = 0;
 	int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
-			    u64, u64, u64, u64, u64, u64, int);
+			    u64, u64, u64, u64, u64, u64);
 
 
 	if (btrfs_test_is_dummy_root(root))
@@ -3233,15 +3226,14 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
 			key.offset -= btrfs_file_extent_offset(buf, fi);
 			ret = process_func(trans, root, bytenr, num_bytes,
 					   parent, ref_root, key.objectid,
-					   key.offset, 1);
+					   key.offset);
 			if (ret)
 				goto fail;
 		} else {
 			bytenr = btrfs_node_blockptr(buf, i);
 			num_bytes = root->nodesize;
 			ret = process_func(trans, root, bytenr, num_bytes,
-					   parent, ref_root, level - 1, 0,
-					   1);
+					   parent, ref_root, level - 1, 0);
 			if (ret)
 				goto fail;
 		}
@@ -6431,7 +6423,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 	int extent_slot = 0;
 	int found_extent = 0;
 	int num_to_del = 1;
-	int no_quota = node->no_quota;
 	u32 item_size;
 	u64 refs;
 	u64 bytenr = node->bytenr;
@@ -6440,9 +6431,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
 						 SKINNY_METADATA);
 
-	if (!info->quota_enabled || !is_fstree(root_objectid))
-		no_quota = 1;
-
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -6768,7 +6756,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 					buf->start, buf->len,
 					parent, root->root_key.objectid,
 					btrfs_header_level(buf),
-					BTRFS_DROP_DELAYED_REF, NULL, 0);
+					BTRFS_DROP_DELAYED_REF, NULL);
 		BUG_ON(ret); /* -ENOMEM */
 	}
 
@@ -6816,7 +6804,7 @@ out:
 /* Can return -ENOMEM */
 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
-		      u64 owner, u64 offset, int no_quota)
+		      u64 owner, u64 offset)
 {
 	int ret;
 	struct btrfs_fs_info *fs_info = root->fs_info;
@@ -6839,13 +6827,13 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
 					num_bytes,
 					parent, root_objectid, (int)owner,
-					BTRFS_DROP_DELAYED_REF, NULL, no_quota);
+					BTRFS_DROP_DELAYED_REF, NULL);
 	} else {
 		ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
 						num_bytes,
 						parent, root_objectid, owner,
 						offset, BTRFS_DROP_DELAYED_REF,
-						NULL, no_quota);
+						NULL);
 	}
 	return ret;
 }
@@ -7690,8 +7678,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root,
 				     u64 parent, u64 root_objectid,
 				     u64 flags, struct btrfs_disk_key *key,
-				     int level, struct btrfs_key *ins,
-				     int no_quota)
+				     int level, struct btrfs_key *ins)
 {
 	int ret;
 	struct btrfs_fs_info *fs_info = root->fs_info;
@@ -7781,7 +7768,7 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
 	ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
 					 ins->offset, 0,
 					 root_objectid, owner, offset,
-					 BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
+					 BTRFS_ADD_DELAYED_EXTENT, NULL);
 	return ret;
 }
 
@@ -7995,7 +7982,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
 						 ins.objectid, ins.offset,
 						 parent, root_objectid, level,
 						 BTRFS_ADD_DELAYED_EXTENT,
-						 extent_op, 0);
+						 extent_op);
 		if (ret)
 			goto out_free_delayed;
 	}
@@ -8544,7 +8531,7 @@ skip:
 			}
 		}
 		ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
-				root->root_key.objectid, level - 1, 0, 0);
+				root->root_key.objectid, level - 1, 0);
 		BUG_ON(ret); /* -ENOMEM */
 	}
 	btrfs_tree_unlock(next);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1243205..381be79 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -847,7 +847,7 @@ next_slot:
 						disk_bytenr, num_bytes, 0,
 						root->root_key.objectid,
 						new_key.objectid,
-						start - extent_offset, 1);
+						start - extent_offset);
 				BUG_ON(ret); /* -ENOMEM */
 			}
 			key.offset = start;
@@ -925,7 +925,7 @@ delete_extent_item:
 						disk_bytenr, num_bytes, 0,
 						root->root_key.objectid,
 						key.objectid, key.offset -
-						extent_offset, 0);
+						extent_offset);
 				BUG_ON(ret); /* -ENOMEM */
 				inode_sub_bytes(inode,
 						extent_end - key.offset);
@@ -1204,7 +1204,7 @@ again:
 
 		ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
 					   root->root_key.objectid,
-					   ino, orig_offset, 1);
+					   ino, orig_offset);
 		BUG_ON(ret); /* -ENOMEM */
 
 		if (split == start) {
@@ -1231,7 +1231,7 @@ again:
 		del_nr++;
 		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
 					0, root->root_key.objectid,
-					ino, orig_offset, 0);
+					ino, orig_offset);
 		BUG_ON(ret); /* -ENOMEM */
 	}
 	other_start = 0;
@@ -1248,7 +1248,7 @@ again:
 		del_nr++;
 		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
 					0, root->root_key.objectid,
-					ino, orig_offset, 0);
+					ino, orig_offset);
 		BUG_ON(ret); /* -ENOMEM */
 	}
 	if (del_nr == 0) {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a018e47..6f030c2 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2595,7 +2595,7 @@ again:
 	ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
 			new->disk_len, 0,
 			backref->root_id, backref->inum,
-			new->file_pos, 0);	/* start - extent_offset */
+			new->file_pos);	/* start - extent_offset */
 	if (ret) {
 		btrfs_abort_transaction(trans, root, ret);
 		goto out_free_path;
@@ -4541,7 +4541,7 @@ delete:
 			ret = btrfs_free_extent(trans, root, extent_start,
 						extent_num_bytes, 0,
 						btrfs_header_owner(leaf),
-						ino, extent_offset, 0);
+						ino, extent_offset);
 			BUG_ON(ret);
 			if (btrfs_should_throttle_delayed_refs(trans, root))
 				btrfs_async_run_delayed_refs(root,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 7ed033a..4df0f2b 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3206,41 +3206,6 @@ out:
 	return ret;
 }
 
-/* Helper to check and see if this root currently has a ref on the given disk
- * bytenr.  If it does then we need to update the quota for this root.  This
- * doesn't do anything if quotas aren't enabled.
- */
-static int check_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		     u64 disko)
-{
-	struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
-	struct ulist *roots;
-	struct ulist_iterator uiter;
-	struct ulist_node *root_node = NULL;
-	int ret;
-
-	if (!root->fs_info->quota_enabled)
-		return 1;
-
-	btrfs_get_tree_mod_seq(root->fs_info, &tree_mod_seq_elem);
-	ret = btrfs_find_all_roots(trans, root->fs_info, disko,
-				   tree_mod_seq_elem.seq, &roots);
-	if (ret < 0)
-		goto out;
-	ret = 0;
-	ULIST_ITER_INIT(&uiter);
-	while ((root_node = ulist_next(roots, &uiter))) {
-		if (root_node->val == root->objectid) {
-			ret = 1;
-			break;
-		}
-	}
-	ulist_free(roots);
-out:
-	btrfs_put_tree_mod_seq(root->fs_info, &tree_mod_seq_elem);
-	return ret;
-}
-
 static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
 				     struct inode *inode,
 				     u64 endoff,
@@ -3499,9 +3464,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
 	u32 nritems;
 	int slot;
 	int ret;
-	int no_quota;
 	const u64 len = olen_aligned;
-	u64 last_disko = 0;
 	u64 last_dest_end = destoff;
 
 	ret = -ENOMEM;
@@ -3547,7 +3510,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
 
 		nritems = btrfs_header_nritems(path->nodes[0]);
 process_slot:
-		no_quota = 1;
 		if (path->slots[0] >= nritems) {
 			ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
 			if (ret < 0)
@@ -3699,35 +3661,13 @@ process_slot:
 				btrfs_set_file_extent_num_bytes(leaf, extent,
 								datal);
 
-				/*
-				 * We need to look up the roots that point at
-				 * this bytenr and see if the new root does.  If
-				 * it does not we need to make sure we update
-				 * quotas appropriately.
-				 */
-				if (disko && root != BTRFS_I(src)->root &&
-				    disko != last_disko) {
-					no_quota = check_ref(trans, root,
-							     disko);
-					if (no_quota < 0) {
-						btrfs_abort_transaction(trans,
-									root,
-									ret);
-						btrfs_end_transaction(trans,
-								      root);
-						ret = no_quota;
-						goto out;
-					}
-				}
-
 				if (disko) {
 					inode_add_bytes(inode, datal);
 					ret = btrfs_inc_extent_ref(trans, root,
 							disko, diskl, 0,
 							root->root_key.objectid,
 							btrfs_ino(inode),
-							new_key.offset - datao,
-							no_quota);
+							new_key.offset - datao);
 					if (ret) {
 						btrfs_abort_transaction(trans,
 									root,
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index a7dc456..b4ca545 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1716,7 +1716,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
 		ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
 					   num_bytes, parent,
 					   btrfs_header_owner(leaf),
-					   key.objectid, key.offset, 1);
+					   key.objectid, key.offset);
 		if (ret) {
 			btrfs_abort_transaction(trans, root, ret);
 			break;
@@ -1724,7 +1724,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
 
 		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
 					parent, btrfs_header_owner(leaf),
-					key.objectid, key.offset, 1);
+					key.objectid, key.offset);
 		if (ret) {
 			btrfs_abort_transaction(trans, root, ret);
 			break;
@@ -1900,23 +1900,21 @@ again:
 
 		ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize,
 					path->nodes[level]->start,
-					src->root_key.objectid, level - 1, 0,
-					1);
+					src->root_key.objectid, level - 1, 0);
 		BUG_ON(ret);
 		ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize,
 					0, dest->root_key.objectid, level - 1,
-					0, 1);
+					0);
 		BUG_ON(ret);
 
 		ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
 					path->nodes[level]->start,
-					src->root_key.objectid, level - 1, 0,
-					1);
+					src->root_key.objectid, level - 1, 0);
 		BUG_ON(ret);
 
 		ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
 					0, dest->root_key.objectid, level - 1,
-					0, 1);
+					0);
 		BUG_ON(ret);
 
 		btrfs_unlock_up_safe(path, 0);
@@ -2745,7 +2743,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
 						node->eb->start, blocksize,
 						upper->eb->start,
 						btrfs_header_owner(upper->eb),
-						node->level, 0, 1);
+						node->level, 0);
 			BUG_ON(ret);
 
 			ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 1fffe88..323e12c 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -693,7 +693,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 				ret = btrfs_inc_extent_ref(trans, root,
 						ins.objectid, ins.offset,
 						0, root->root_key.objectid,
-						key->objectid, offset, 0);
+						key->objectid, offset);
 				if (ret)
 					goto out;
 			} else {
-- 
cgit v1.1


From 4ac313111018cb44ecc250445de5ccb93026a980 Mon Sep 17 00:00:00 2001
From: Rich Felker <dalias@libc.org>
Date: Thu, 20 Aug 2015 15:11:06 -0400
Subject: fs/binfmt_elf_fdpic.c: fix brk area overlap with stack on NOMMU

On NOMMU archs, the FDPIC ELF loader sets up the usable brk range to
overlap with all but the last PAGE_SIZE bytes of the stack. This leads
to catastrophic memory reuse/corruption if brk is used. Fix by setting
the brk area to zero size to disable its use.

Signed-off-by: Rich Felker <dalias@libc.org>
Acked-by: David Howells <dhowells@redhat.com>
Signed-off-by: Greg Ungerer <gerg@uclinux.org>
---
 fs/binfmt_elf_fdpic.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index d3634bf..d2b079a 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -374,10 +374,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
 		PAGE_ALIGN(current->mm->start_brk);
 
 #else
-	/* create a stack and brk area big enough for everyone
-	 * - the brk heap starts at the bottom and works up
-	 * - the stack starts at the top and works down
-	 */
+	/* create a stack area and zero-size brk area */
 	stack_size = (stack_size + PAGE_SIZE - 1) & PAGE_MASK;
 	if (stack_size < PAGE_SIZE * 2)
 		stack_size = PAGE_SIZE * 2;
@@ -400,8 +397,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
 
 	current->mm->brk = current->mm->start_brk;
 	current->mm->context.end_brk = current->mm->start_brk;
-	current->mm->context.end_brk +=
-		(stack_size > PAGE_SIZE) ? (stack_size - PAGE_SIZE) : 0;
 	current->mm->start_stack = current->mm->start_brk + stack_size;
 #endif
 
-- 
cgit v1.1


From 849ef9286f30c88113906dc35f44a499c0cb385d Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Mon, 12 Oct 2015 16:55:54 +0200
Subject: btrfs: check unsupported filters in balance arguments

We don't verify that all the balance filter arguments supplemented by
the flags are actually known to the kernel. Thus we let it silently pass
and do nothing.

At the moment this means only the 'limit' filter, but we're going to add
a few more soon so it's better to have that fixed. Also in older stable
kernels so that it works with newer userspace tools.

Cc: stable@vger.kernel.org # 3.16+
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ioctl.c   | 5 +++++
 fs/btrfs/volumes.h | 8 ++++++++
 2 files changed, 13 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 4df0f2b..7375cf2 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4691,6 +4691,11 @@ locked:
 		bctl->flags |= BTRFS_BALANCE_TYPE_MASK;
 	}
 
+	if (bctl->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) {
+		ret = -EINVAL;
+		goto out_bargs;
+	}
+
 do_balance:
 	/*
 	 * Ownership of bctl and mutually_exclusive_operation_running
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 99c208f..b8e64ea 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -381,6 +381,14 @@ struct map_lookup {
 #define BTRFS_BALANCE_ARGS_VRANGE	(1ULL << 4)
 #define BTRFS_BALANCE_ARGS_LIMIT	(1ULL << 5)
 
+#define BTRFS_BALANCE_ARGS_MASK			\
+	(BTRFS_BALANCE_ARGS_PROFILES |		\
+	 BTRFS_BALANCE_ARGS_USAGE |		\
+	 BTRFS_BALANCE_ARGS_DEVID | 		\
+	 BTRFS_BALANCE_ARGS_DRANGE |		\
+	 BTRFS_BALANCE_ARGS_VRANGE |		\
+	 BTRFS_BALANCE_ARGS_LIMIT)
+
 /*
  * Profile changing flags.  When SOFT is set we won't relocate chunk if
  * it already has the target profile (even though it may be
-- 
cgit v1.1


From 2849a854224487bc578b73b64422c3cb3ef93ff5 Mon Sep 17 00:00:00 2001
From: Chris Mason <clm@fb.com>
Date: Tue, 13 Oct 2015 14:06:48 -0400
Subject: btrfs: fix use after free iterating extrefs

The code for btrfs inode-resolve has never worked properly for
files with enough hard links to trigger extrefs.  It was trying to
get the leaf out of a path after freeing the path:

	btrfs_release_path(path);
	leaf = path->nodes[0];
	item_size = btrfs_item_size_nr(leaf, slot);

The fix here is to use the extent buffer we cloned just a little higher
up to avoid deadlocks caused by using the leaf in the path.

Signed-off-by: Chris Mason <clm@fb.com>
cc: stable@vger.kernel.org # v3.7+
cc: Mark Fasheh <mfasheh@suse.de>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Mark Fasheh <mfasheh@suse.de>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/backref.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 2adc152a..6dcdb2e 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1834,7 +1834,6 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
 	int found = 0;
 	struct extent_buffer *eb;
 	struct btrfs_inode_extref *extref;
-	struct extent_buffer *leaf;
 	u32 item_size;
 	u32 cur_offset;
 	unsigned long ptr;
@@ -1862,9 +1861,8 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
 		btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
 		btrfs_release_path(path);
 
-		leaf = path->nodes[0];
-		item_size = btrfs_item_size_nr(leaf, slot);
-		ptr = btrfs_item_ptr_offset(leaf, slot);
+		item_size = btrfs_item_size_nr(eb, slot);
+		ptr = btrfs_item_ptr_offset(eb, slot);
 		cur_offset = 0;
 
 		while (cur_offset < item_size) {
@@ -1878,7 +1876,7 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
 			if (ret)
 				break;
 
-			cur_offset += btrfs_inode_extref_name_len(leaf, extref);
+			cur_offset += btrfs_inode_extref_name_len(eb, extref);
 			cur_offset += sizeof(*extref);
 		}
 		btrfs_tree_read_unlock_blocking(eb);
-- 
cgit v1.1


From 12907fc79818a62a2478f84f7795afa774bf7f9c Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Sat, 10 Oct 2015 17:16:50 +0200
Subject: btrfs: extend balance filter limit to take minimum and maximum

The 'limit' filter is underdesigned, it should have been a range for
[min,max], with some relaxed semantics when one of the bounds is
missing. Besides that, using a full u64 for a single value is a waste of
bytes.

Let's fix both by extending the use of the u64 bytes for the [min,max]
range. This can be done in a backward compatible way, the range will be
interpreted only if the appropriate flag is set
(BTRFS_BALANCE_ARGS_LIMIT_RANGE).

Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ctree.h   | 14 ++++++++++++--
 fs/btrfs/volumes.c | 42 ++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/volumes.h |  1 +
 3 files changed, 55 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 3fa3c3b..460cd69 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -846,8 +846,18 @@ struct btrfs_disk_balance_args {
 	/* BTRFS_BALANCE_ARGS_* */
 	__le64 flags;
 
-	/* BTRFS_BALANCE_ARGS_LIMIT value */
-	__le64 limit;
+	/*
+	 * BTRFS_BALANCE_ARGS_LIMIT with value 'limit'
+	 * BTRFS_BALANCE_ARGS_LIMIT_RANGE - the extend version can use minimum
+	 * and maximum
+	 */
+	union {
+		__le64 limit;
+		struct {
+			__le32 limit_min;
+			__le32 limit_max;
+		};
+	};
 
 	__le64 unused[7];
 } __attribute__ ((__packed__));
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f86d838..e2fdd2e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3300,6 +3300,16 @@ static int should_balance_chunk(struct btrfs_root *root,
 			return 0;
 		else
 			bargs->limit--;
+	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
+		/*
+		 * Same logic as the 'limit' filter; the minimum cannot be
+		 * determined here because we do not have the global informatoin
+		 * about the count of all chunks that satisfy the filters.
+		 */
+		if (bargs->limit_max == 0)
+			return 0;
+		else
+			bargs->limit_max--;
 	}
 
 	return 1;
@@ -3314,6 +3324,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
 	struct btrfs_device *device;
 	u64 old_size;
 	u64 size_to_free;
+	u64 chunk_type;
 	struct btrfs_chunk *chunk;
 	struct btrfs_path *path;
 	struct btrfs_key key;
@@ -3324,9 +3335,13 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
 	int ret;
 	int enospc_errors = 0;
 	bool counting = true;
+	/* The single value limit and min/max limits use the same bytes in the */
 	u64 limit_data = bctl->data.limit;
 	u64 limit_meta = bctl->meta.limit;
 	u64 limit_sys = bctl->sys.limit;
+	u32 count_data = 0;
+	u32 count_meta = 0;
+	u32 count_sys = 0;
 
 	/* step one make some room on all the devices */
 	devices = &fs_info->fs_devices->devices;
@@ -3367,6 +3382,10 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
 	spin_unlock(&fs_info->balance_lock);
 again:
 	if (!counting) {
+		/*
+		 * The single value limit and min/max limits use the same bytes
+		 * in the
+		 */
 		bctl->data.limit = limit_data;
 		bctl->meta.limit = limit_meta;
 		bctl->sys.limit = limit_sys;
@@ -3414,6 +3433,7 @@ again:
 		}
 
 		chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
+		chunk_type = btrfs_chunk_type(leaf, chunk);
 
 		if (!counting) {
 			spin_lock(&fs_info->balance_lock);
@@ -3434,6 +3454,28 @@ again:
 			spin_lock(&fs_info->balance_lock);
 			bctl->stat.expected++;
 			spin_unlock(&fs_info->balance_lock);
+
+			if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
+				count_data++;
+			else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
+				count_sys++;
+			else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
+				count_meta++;
+
+			goto loop;
+		}
+
+		/*
+		 * Apply limit_min filter, no need to check if the LIMITS
+		 * filter is used, limit_min is 0 by default
+		 */
+		if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
+					count_data < bctl->data.limit_min)
+				|| ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
+					count_meta < bctl->meta.limit_min)
+				|| ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
+					count_sys < bctl->sys.limit_min)) {
+			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
 			goto loop;
 		}
 
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index b8e64ea..628d4e1 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -380,6 +380,7 @@ struct map_lookup {
 #define BTRFS_BALANCE_ARGS_DRANGE	(1ULL << 3)
 #define BTRFS_BALANCE_ARGS_VRANGE	(1ULL << 4)
 #define BTRFS_BALANCE_ARGS_LIMIT	(1ULL << 5)
+#define BTRFS_BALANCE_ARGS_LIMIT_RANGE	(1ULL << 6)
 
 #define BTRFS_BALANCE_ARGS_MASK			\
 	(BTRFS_BALANCE_ARGS_PROFILES |		\
-- 
cgit v1.1


From dee32d0ac3719ef8d640efaf0884111df444730f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gabr=C3=ADel=20Arth=C3=BAr=20P=C3=A9tursson?=
 <gabriel@system.is>
Date: Mon, 28 Sep 2015 22:32:41 +0000
Subject: btrfs: add balance filter for stripes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Balance block groups which have the given number of stripes, defined by
a range min..max. This is useful to selectively rebalance only chunks
that do not span enough devices, applies to RAID0/10/5/6.

Signed-off-by: Gabríel Arthúr Pétursson <gabriel@system.is>
[ renamed bargs members, added to the UAPI, wrote the changelog ]
Signed-off-by: David Sterba <dsterba@suse.com>

Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ctree.h   |  9 ++++++++-
 fs/btrfs/volumes.c | 19 +++++++++++++++++++
 fs/btrfs/volumes.h |  1 +
 3 files changed, 28 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 460cd69..8ccbf43 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -859,7 +859,14 @@ struct btrfs_disk_balance_args {
 		};
 	};
 
-	__le64 unused[7];
+	/*
+	 * Process chunks that cross stripes_min..stripes_max devices,
+	 * BTRFS_BALANCE_ARGS_STRIPES_RANGE
+	 */
+	__le32 stripes_min;
+	__le32 stripes_max;
+
+	__le64 unused[6];
 } __attribute__ ((__packed__));
 
 /*
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e2fdd2e..9f5d167 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3220,6 +3220,19 @@ static int chunk_vrange_filter(struct extent_buffer *leaf,
 	return 1;
 }
 
+static int chunk_stripes_range_filter(struct extent_buffer *leaf,
+			       struct btrfs_chunk *chunk,
+			       struct btrfs_balance_args *bargs)
+{
+	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+
+	if (bargs->stripes_min <= num_stripes
+			&& num_stripes <= bargs->stripes_max)
+		return 0;
+
+	return 1;
+}
+
 static int chunk_soft_convert_filter(u64 chunk_type,
 				     struct btrfs_balance_args *bargs)
 {
@@ -3286,6 +3299,12 @@ static int should_balance_chunk(struct btrfs_root *root,
 		return 0;
 	}
 
+	/* stripes filter */
+	if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
+	    chunk_stripes_range_filter(leaf, chunk, bargs)) {
+		return 0;
+	}
+
 	/* soft profile changing mode */
 	if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
 	    chunk_soft_convert_filter(chunk_type, bargs)) {
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 628d4e1..a5bf224 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -381,6 +381,7 @@ struct map_lookup {
 #define BTRFS_BALANCE_ARGS_VRANGE	(1ULL << 4)
 #define BTRFS_BALANCE_ARGS_LIMIT	(1ULL << 5)
 #define BTRFS_BALANCE_ARGS_LIMIT_RANGE	(1ULL << 6)
+#define BTRFS_BALANCE_ARGS_STRIPES_RANGE (1ULL << 7)
 
 #define BTRFS_BALANCE_ARGS_MASK			\
 	(BTRFS_BALANCE_ARGS_PROFILES |		\
-- 
cgit v1.1


From bc3094673f22d00794a8606200788e411b4ec80d Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Tue, 20 Oct 2015 18:22:13 +0200
Subject: btrfs: extend balance filter usage to take minimum and maximum

Similar to the 'limit' filter, we can enhance the 'usage' filter to
accept a range. The change is backward compatible, the range is applied
only in connection with the BTRFS_BALANCE_ARGS_USAGE_RANGE flag.

We don't have a usecase yet, the current syntax has been sufficient. The
enhancement should provide parity with other range-like filters.

Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ctree.h   | 14 ++++++++++++--
 fs/btrfs/volumes.c | 41 ++++++++++++++++++++++++++++++++++++++++-
 fs/btrfs/volumes.h |  1 +
 3 files changed, 53 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8ccbf43..4001585 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -823,8 +823,18 @@ struct btrfs_disk_balance_args {
 	 */
 	__le64 profiles;
 
-	/* usage filter */
-	__le64 usage;
+	/*
+	 * usage filter
+	 * BTRFS_BALANCE_ARGS_USAGE with a single value means '0..N'
+	 * BTRFS_BALANCE_ARGS_USAGE_RANGE - range syntax, min..max
+	 */
+	union {
+		__le64 usage;
+		struct {
+			__le32 usage_min;
+			__le32 usage_max;
+		};
+	};
 
 	/* devid filter */
 	__le64 devid;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9f5d167..17ed76d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3059,16 +3059,19 @@ static void update_balance_args(struct btrfs_balance_control *bctl)
 	 * (albeit full) chunks.
 	 */
 	if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+	    !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
 	    !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
 		bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
 		bctl->data.usage = 90;
 	}
 	if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+	    !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
 	    !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
 		bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
 		bctl->sys.usage = 90;
 	}
 	if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+	    !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
 	    !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
 		bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
 		bctl->meta.usage = 90;
@@ -3124,13 +3127,46 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
 			      struct btrfs_balance_args *bargs)
 {
 	struct btrfs_block_group_cache *cache;
+	u64 chunk_used;
+	u64 user_thresh_min;
+	u64 user_thresh_max;
+	int ret = 1;
+
+	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
+	chunk_used = btrfs_block_group_used(&cache->item);
+
+	if (bargs->usage_min == 0)
+		user_thresh_min = 0;
+	else
+		user_thresh_min = div_factor_fine(cache->key.offset,
+					bargs->usage_min);
+
+	if (bargs->usage_max == 0)
+		user_thresh_max = 1;
+	else if (bargs->usage_max > 100)
+		user_thresh_max = cache->key.offset;
+	else
+		user_thresh_max = div_factor_fine(cache->key.offset,
+					bargs->usage_max);
+
+	if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
+		ret = 0;
+
+	btrfs_put_block_group(cache);
+	return ret;
+}
+
+static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info,
+		u64 chunk_offset, struct btrfs_balance_args *bargs)
+{
+	struct btrfs_block_group_cache *cache;
 	u64 chunk_used, user_thresh;
 	int ret = 1;
 
 	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
 	chunk_used = btrfs_block_group_used(&cache->item);
 
-	if (bargs->usage == 0)
+	if (bargs->usage_min == 0)
 		user_thresh = 1;
 	else if (bargs->usage > 100)
 		user_thresh = cache->key.offset;
@@ -3279,6 +3315,9 @@ static int should_balance_chunk(struct btrfs_root *root,
 	if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
 	    chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
 		return 0;
+	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
+	    chunk_usage_range_filter(bctl->fs_info, chunk_offset, bargs)) {
+		return 0;
 	}
 
 	/* devid filter */
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index a5bf224..2a396cb 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -382,6 +382,7 @@ struct map_lookup {
 #define BTRFS_BALANCE_ARGS_LIMIT	(1ULL << 5)
 #define BTRFS_BALANCE_ARGS_LIMIT_RANGE	(1ULL << 6)
 #define BTRFS_BALANCE_ARGS_STRIPES_RANGE (1ULL << 7)
+#define BTRFS_BALANCE_ARGS_USAGE_RANGE	(1ULL << 8)
 
 #define BTRFS_BALANCE_ARGS_MASK			\
 	(BTRFS_BALANCE_ARGS_PROFILES |		\
-- 
cgit v1.1


From b66d62ba1e0d7ada5b89afffe19d12662b5c92c9 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Fri, 16 Oct 2015 19:15:50 +0200
Subject: btrfs: add balance filters limits, stripes and usage to supported
 mask

Enable the extended 'limit' syntax (a range), the new 'stripes' and
extended 'usage' syntax (a range) filters in the filters mask. The patch
comes separate and not within the series that introduced the new filters
because the patch adding the mask was merged in a late rc. The
integration branch was based on an older rc and could not merge the
patch due to the missing changes.

Prerequisities:
* btrfs: check unsupported filters in balance arguments
* btrfs: extend balance filter limit to take minimum and maximum
* btrfs: add balance filter for stripes
* btrfs: extend balance filter usage to take minimum and maximum

Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/volumes.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 2a396cb..ec57123 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -390,7 +390,10 @@ struct map_lookup {
 	 BTRFS_BALANCE_ARGS_DEVID | 		\
 	 BTRFS_BALANCE_ARGS_DRANGE |		\
 	 BTRFS_BALANCE_ARGS_VRANGE |		\
-	 BTRFS_BALANCE_ARGS_LIMIT)
+	 BTRFS_BALANCE_ARGS_LIMIT |		\
+	 BTRFS_BALANCE_ARGS_LIMIT_RANGE |	\
+	 BTRFS_BALANCE_ARGS_STRIPES_RANGE |	\
+	 BTRFS_BALANCE_ARGS_USAGE_RANGE)
 
 /*
  * Profile changing flags.  When SOFT is set we won't relocate chunk if
-- 
cgit v1.1


From 0a0e8b89389266bed9cc074c57ea662e4b9b2621 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <quwenruo@cn.fujitsu.com>
Date: Mon, 26 Oct 2015 09:19:43 +0800
Subject: btrfs: qgroup: Don't copy extent buffer to do qgroup rescan

Ancient qgroup code call memcpy() on a extent buffer and use it for leaf
iteration.

As extent buffer contains lock, pointers to pages, it's never sane to do
such copy.

The following bug may be caused by this insane operation:
[92098.841309] general protection fault: 0000 [#1] SMP
[92098.841338] Modules linked in: ...
[92098.841814] CPU: 1 PID: 24655 Comm: kworker/u4:12 Not tainted
4.3.0-rc1 #1
[92098.841868] Workqueue: btrfs-qgroup-rescan btrfs_qgroup_rescan_helper
[btrfs]
[92098.842261] Call Trace:
[92098.842277]  [<ffffffffc035a5d8>] ? read_extent_buffer+0xb8/0x110
[btrfs]
[92098.842304]  [<ffffffffc0396d00>] ? btrfs_find_all_roots+0x60/0x70
[btrfs]
[92098.842329]  [<ffffffffc039af3d>]
btrfs_qgroup_rescan_worker+0x28d/0x5a0 [btrfs]

Where btrfs_qgroup_rescan_worker+0x28d is btrfs_disk_key_to_cpu(),
called in reading key from the copied extent_buffer.

This patch will use btrfs_clone_extent_buffer() to a better copy of
extent buffer to deal such case.

Reported-by: Stephane Lesimple <stephane_btrfs@lesimple.fr>
Suggested-by: Filipe Manana <fdmanana@kernel.org>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/qgroup.c | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 158633c..31d1934 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2192,10 +2192,10 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
  */
 static int
 qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
-		   struct btrfs_trans_handle *trans,
-		   struct extent_buffer *scratch_leaf)
+		   struct btrfs_trans_handle *trans)
 {
 	struct btrfs_key found;
+	struct extent_buffer *scratch_leaf = NULL;
 	struct ulist *roots = NULL;
 	struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
 	u64 num_bytes;
@@ -2233,7 +2233,15 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
 	fs_info->qgroup_rescan_progress.objectid = found.objectid + 1;
 
 	btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
-	memcpy(scratch_leaf, path->nodes[0], sizeof(*scratch_leaf));
+	scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]);
+	if (!scratch_leaf) {
+		ret = -ENOMEM;
+		mutex_unlock(&fs_info->qgroup_rescan_lock);
+		goto out;
+	}
+	extent_buffer_get(scratch_leaf);
+	btrfs_tree_read_lock(scratch_leaf);
+	btrfs_set_lock_blocking_rw(scratch_leaf, BTRFS_READ_LOCK);
 	slot = path->slots[0];
 	btrfs_release_path(path);
 	mutex_unlock(&fs_info->qgroup_rescan_lock);
@@ -2259,6 +2267,10 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
 			goto out;
 	}
 out:
+	if (scratch_leaf) {
+		btrfs_tree_read_unlock_blocking(scratch_leaf);
+		free_extent_buffer(scratch_leaf);
+	}
 	btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
 
 	return ret;
@@ -2270,16 +2282,12 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
 						     qgroup_rescan_work);
 	struct btrfs_path *path;
 	struct btrfs_trans_handle *trans = NULL;
-	struct extent_buffer *scratch_leaf = NULL;
 	int err = -ENOMEM;
 	int ret = 0;
 
 	path = btrfs_alloc_path();
 	if (!path)
 		goto out;
-	scratch_leaf = kmalloc(sizeof(*scratch_leaf), GFP_NOFS);
-	if (!scratch_leaf)
-		goto out;
 
 	err = 0;
 	while (!err) {
@@ -2291,8 +2299,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
 		if (!fs_info->quota_enabled) {
 			err = -EINTR;
 		} else {
-			err = qgroup_rescan_leaf(fs_info, path, trans,
-						 scratch_leaf);
+			err = qgroup_rescan_leaf(fs_info, path, trans);
 		}
 		if (err > 0)
 			btrfs_commit_transaction(trans, fs_info->fs_root);
@@ -2301,7 +2308,6 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
 	}
 
 out:
-	kfree(scratch_leaf);
 	btrfs_free_path(path);
 
 	mutex_lock(&fs_info->qgroup_rescan_lock);
-- 
cgit v1.1


From 696249132158014d594896df3a81390616069c5c Mon Sep 17 00:00:00 2001
From: Jiri Kosina <jkosina@suse.cz>
Date: Mon, 26 Oct 2015 15:06:19 +0900
Subject: btrfs: clear PF_NOFREEZE in cleaner_kthread()

cleaner_kthread() kthread calls try_to_freeze() at the beginning of every
cleanup attempt. This operation can't ever succeed though, as the kthread
hasn't marked itself as freezable.

Before (hopefully eventually) kthread freezing gets converted to fileystem
freezing, we'd rather mark cleaner_kthread() freezable (as my
understanding is that it can generate filesystem I/O during suspend).

Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/disk-io.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 86a11a9..2d46675 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1762,6 +1762,7 @@ static int cleaner_kthread(void *arg)
 	int again;
 	struct btrfs_trans_handle *trans;
 
+	set_freezable();
 	do {
 		again = 0;
 
-- 
cgit v1.1


From 5846a3c26873e86b034c702a8bc202aa76082369 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <quwenruo@cn.fujitsu.com>
Date: Mon, 26 Oct 2015 14:11:18 +0800
Subject: btrfs: qgroup: Fix a race in delayed_ref which leads to abort trans

Between btrfs_allocerved_file_extent() and
btrfs_add_delayed_qgroup_reserve(), there is a window that delayed_refs
are run and delayed ref head maybe freed before
btrfs_add_delayed_qgroup_reserve().

This will cause btrfs_dad_delayed_qgroup_reserve() to return -ENOENT,
and cause transaction to be aborted.

This patch will record qgroup reserve space info into delayed_ref_head
at btrfs_add_delayed_ref(), to eliminate the race window.

Reported-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ctree.h       |  3 ++-
 fs/btrfs/delayed-ref.c | 22 +++++++++++++++++-----
 fs/btrfs/delayed-ref.h |  2 +-
 fs/btrfs/extent-tree.c | 14 ++++++++------
 fs/btrfs/inode.c       | 12 ++++--------
 5 files changed, 32 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4001585..a2e73f6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3430,7 +3430,8 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root,
 				     u64 root_objectid, u64 owner,
-				     u64 offset, struct btrfs_key *ins);
+				     u64 offset, u64 ram_bytes,
+				     struct btrfs_key *ins);
 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root,
 				   u64 root_objectid, u64 owner, u64 offset,
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 1c3588a..e06dd75 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -535,7 +535,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 		     struct btrfs_trans_handle *trans,
 		     struct btrfs_delayed_ref_node *ref,
 		     struct btrfs_qgroup_extent_record *qrecord,
-		     u64 bytenr, u64 num_bytes, int action, int is_data)
+		     u64 bytenr, u64 num_bytes, u64 ref_root, u64 reserved,
+		     int action, int is_data)
 {
 	struct btrfs_delayed_ref_head *existing;
 	struct btrfs_delayed_ref_head *head_ref = NULL;
@@ -544,6 +545,9 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 	int count_mod = 1;
 	int must_insert_reserved = 0;
 
+	/* If reserved is provided, it must be a data extent. */
+	BUG_ON(!is_data && reserved);
+
 	/*
 	 * the head node stores the sum of all the mods, so dropping a ref
 	 * should drop the sum in the head node by one.
@@ -593,6 +597,11 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 
 	/* Record qgroup extent info if provided */
 	if (qrecord) {
+		if (ref_root && reserved) {
+			head_ref->qgroup_ref_root = ref_root;
+			head_ref->qgroup_reserved = reserved;
+		}
+
 		qrecord->bytenr = bytenr;
 		qrecord->num_bytes = num_bytes;
 		qrecord->old_roots = NULL;
@@ -611,6 +620,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 	existing = htree_insert(&delayed_refs->href_root,
 				&head_ref->href_node);
 	if (existing) {
+		WARN_ON(ref_root && reserved && existing->qgroup_ref_root
+			&& existing->qgroup_reserved);
 		update_existing_head_ref(delayed_refs, &existing->node, ref);
 		/*
 		 * we've updated the existing ref, free the newly
@@ -777,7 +788,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 	 * the spin lock
 	 */
 	head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record,
-					bytenr, num_bytes, action, 0);
+					bytenr, num_bytes, 0, 0, action, 0);
 
 	add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr,
 			     num_bytes, parent, ref_root, level, action);
@@ -800,7 +811,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 			       struct btrfs_trans_handle *trans,
 			       u64 bytenr, u64 num_bytes,
 			       u64 parent, u64 ref_root,
-			       u64 owner, u64 offset, int action,
+			       u64 owner, u64 offset, u64 reserved, int action,
 			       struct btrfs_delayed_extent_op *extent_op)
 {
 	struct btrfs_delayed_data_ref *ref;
@@ -839,7 +850,8 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 	 * the spin lock
 	 */
 	head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record,
-					bytenr, num_bytes, action, 1);
+					bytenr, num_bytes, ref_root, reserved,
+					action, 1);
 
 	add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr,
 				   num_bytes, parent, ref_root, owner, offset,
@@ -894,7 +906,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
 	spin_lock(&delayed_refs->lock);
 
 	add_delayed_ref_head(fs_info, trans, &head_ref->node, NULL, bytenr,
-			     num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
+			     num_bytes, 0, 0, BTRFS_UPDATE_DELAYED_HEAD,
 			     extent_op->is_data);
 
 	spin_unlock(&delayed_refs->lock);
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index f9cf234..00ed02c 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -248,7 +248,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 			       struct btrfs_trans_handle *trans,
 			       u64 bytenr, u64 num_bytes,
 			       u64 parent, u64 ref_root,
-			       u64 owner, u64 offset, int action,
+			       u64 owner, u64 offset, u64 reserved, int action,
 			       struct btrfs_delayed_extent_op *extent_op);
 int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info,
 				     struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c1f8c7e..f50c7c2 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2087,8 +2087,8 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 					BTRFS_ADD_DELAYED_REF, NULL);
 	} else {
 		ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
-					num_bytes,
-					parent, root_objectid, owner, offset,
+					num_bytes, parent, root_objectid,
+					owner, offset, 0,
 					BTRFS_ADD_DELAYED_REF, NULL);
 	}
 	return ret;
@@ -6832,8 +6832,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
 						num_bytes,
 						parent, root_objectid, owner,
-						offset, BTRFS_DROP_DELAYED_REF,
-						NULL);
+						offset, 0,
+						BTRFS_DROP_DELAYED_REF, NULL);
 	}
 	return ret;
 }
@@ -7759,7 +7759,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root,
 				     u64 root_objectid, u64 owner,
-				     u64 offset, struct btrfs_key *ins)
+				     u64 offset, u64 ram_bytes,
+				     struct btrfs_key *ins)
 {
 	int ret;
 
@@ -7768,7 +7769,8 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
 	ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
 					 ins->offset, 0,
 					 root_objectid, owner, offset,
-					 BTRFS_ADD_DELAYED_EXTENT, NULL);
+					 ram_bytes, BTRFS_ADD_DELAYED_EXTENT,
+					 NULL);
 	return ret;
 }
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6f030c2..4439fbb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2127,17 +2127,13 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 	ins.type = BTRFS_EXTENT_ITEM_KEY;
 	ret = btrfs_alloc_reserved_file_extent(trans, root,
 					root->root_key.objectid,
-					btrfs_ino(inode), file_pos, &ins);
-	if (ret < 0)
-		goto out;
+					btrfs_ino(inode), file_pos,
+					ram_bytes, &ins);
 	/*
-	 * Release the reserved range from inode dirty range map, and
-	 * move it to delayed ref codes, as now accounting only happens at
-	 * commit_transaction() time.
+	 * Release the reserved range from inode dirty range map, as it is
+	 * already moved into delayed_ref_head
 	 */
 	btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
-	ret = btrfs_add_delayed_qgroup_reserve(root->fs_info, trans,
-			root->objectid, disk_bytenr, ram_bytes);
 out:
 	btrfs_free_path(path);
 
-- 
cgit v1.1


From 90ce321da8ce3d00e19586993887346286208386 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <quwenruo@cn.fujitsu.com>
Date: Mon, 26 Oct 2015 14:11:19 +0800
Subject: btrfs: qgroup: Fix a rebase bug which will cause qgroup double free

When rebasing my patchset, I forgot to pick up a cleanup patch to remove
old hotfix in 4.2 release.

Witouth the cleanup, it will screw up new qgroup reserve framework and
always cause minus reserved number.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/qgroup.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 31d1934..46476c2 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1652,10 +1652,6 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
 			}
 		}
 
-		/* For exclusive extent, free its reserved bytes too */
-		if (nr_old_roots == 0 && nr_new_roots == 1 &&
-		    cur_new_count == nr_new_roots)
-			qg->reserved -= num_bytes;
 		if (dirty)
 			qgroup_dirty(fs_info, qg);
 	}
-- 
cgit v1.1


From f2ca379642d7a843be972ea4167abdd3c8c9e5d1 Mon Sep 17 00:00:00 2001
From: Dirk Steinmetz <public@rsjtdrjgfuzkfg.com>
Date: Tue, 20 Oct 2015 16:09:19 +0200
Subject: namei: permit linking with CAP_FOWNER in userns

Attempting to hardlink to an unsafe file (e.g. a setuid binary) from
within an unprivileged user namespace fails, even if CAP_FOWNER is held
within the namespace. This may cause various failures, such as a gentoo
installation within a lxc container failing to build and install specific
packages.

This change permits hardlinking of files owned by mapped uids, if
CAP_FOWNER is held for that namespace. Furthermore, it improves consistency
by using the existing inode_owner_or_capable(), which is aware of
namespaced capabilities as of 23adbe12ef7d3 ("fs,userns: Change
inode_capable to capable_wrt_inode_uidgid").

Signed-off-by: Dirk Steinmetz <public@rsjtdrjgfuzkfg.com>

This is hitting us in Ubuntu during some dpkg upgrades in containers.
When upgrading a file dpkg creates a hard link to the old file to back
it up before overwriting it. When packages upgrade suid files owned by a
non-root user the link isn't permitted, and the package upgrade fails.
This patch fixes our problem.

Tested-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/namei.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 726d211..29fc6a6 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -955,26 +955,23 @@ static bool safe_hardlink_source(struct inode *inode)
  *  - sysctl_protected_hardlinks enabled
  *  - fsuid does not match inode
  *  - hardlink source is unsafe (see safe_hardlink_source() above)
- *  - not CAP_FOWNER
+ *  - not CAP_FOWNER in a namespace with the inode owner uid mapped
  *
  * Returns 0 if successful, -ve on error.
  */
 static int may_linkat(struct path *link)
 {
-	const struct cred *cred;
 	struct inode *inode;
 
 	if (!sysctl_protected_hardlinks)
 		return 0;
 
-	cred = current_cred();
 	inode = link->dentry->d_inode;
 
 	/* Source inode owner (or CAP_FOWNER) can hardlink all they like,
 	 * otherwise, it must be a safe source.
 	 */
-	if (uid_eq(cred->fsuid, inode->i_uid) || safe_hardlink_source(inode) ||
-	    capable(CAP_FOWNER))
+	if (inode_owner_or_capable(inode) || safe_hardlink_source(inode))
 		return 0;
 
 	audit_log_link_denied("linkat", link);
-- 
cgit v1.1


From b33e18f61bd18227a456016a77b1a968f5bc1d65 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 27 Oct 2015 14:19:39 +0900
Subject: fs/writeback, rcu: Don't use list_entry_rcu() for pointer offsetting
 in bdi_split_work_to_wbs()

bdi_split_work_to_wbs() uses list_for_each_entry_rcu_continue()
to walk @bdi->wb_list.  To set up the initial iteration
condition, it uses list_entry_rcu() to calculate the entry
pointer corresponding to the list head; however, this isn't an
actual RCU dereference and using list_entry_rcu() for it ended
up breaking a proposed list_entry_rcu() change because it was
feeding an non-lvalue pointer into the macro.

Don't use the RCU variant for simple pointer offsetting.  Use
list_entry() instead.

Reported-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Darren Hart <dvhart@linux.intel.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Dipankar Sarma <dipankar@in.ibm.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Patrick Marlier <patrick.marlier@gmail.com>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: pranith kumar <bobby.prani@gmail.com>
Link: http://lkml.kernel.org/r/20151027051939.GA19355@mtj.duckdns.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 fs/fs-writeback.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 29e4599..7378169 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -779,8 +779,8 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
 				  bool skip_if_busy)
 {
 	struct bdi_writeback *last_wb = NULL;
-	struct bdi_writeback *wb = list_entry_rcu(&bdi->wb_list,
-						struct bdi_writeback, bdi_node);
+	struct bdi_writeback *wb = list_entry(&bdi->wb_list,
+					      struct bdi_writeback, bdi_node);
 
 	might_sleep();
 restart:
-- 
cgit v1.1


From f3dd1649122b9e73f869e5304d5fa8554e128b7a Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Thu, 29 Oct 2015 10:58:09 -0500
Subject: gfs2: Remove gl_spin define

Commit e66cf161 replaced the gl_spin spinlock in struct gfs2_glock with a
gl_lockref lockref and defined gl_spin as gl_lockref.lock (the spinlock in
gl_lockref).  Remove that define to make the references to gl_lockref.lock more
obvious.

Signed-off-by: Andreas Gruenbacher <andreas.gruenbacher@gmail.com>
Signed-off-by: Bob Peterson <rpeterso@redhat.com>
---
 fs/gfs2/glock.c  | 84 ++++++++++++++++++++++++++++----------------------------
 fs/gfs2/glock.h  |  4 +--
 fs/gfs2/glops.c  | 10 +++----
 fs/gfs2/incore.h |  3 +-
 fs/gfs2/main.c   |  2 +-
 fs/gfs2/rgrp.c   |  4 +--
 6 files changed, 53 insertions(+), 54 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 9bd1244..32e7471 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -246,8 +246,8 @@ static inline void do_error(struct gfs2_glock *gl, const int ret)
  */
 
 static int do_promote(struct gfs2_glock *gl)
-__releases(&gl->gl_spin)
-__acquires(&gl->gl_spin)
+__releases(&gl->gl_lockref.lock)
+__acquires(&gl->gl_lockref.lock)
 {
 	const struct gfs2_glock_operations *glops = gl->gl_ops;
 	struct gfs2_holder *gh, *tmp;
@@ -260,10 +260,10 @@ restart:
 		if (may_grant(gl, gh)) {
 			if (gh->gh_list.prev == &gl->gl_holders &&
 			    glops->go_lock) {
-				spin_unlock(&gl->gl_spin);
+				spin_unlock(&gl->gl_lockref.lock);
 				/* FIXME: eliminate this eventually */
 				ret = glops->go_lock(gh);
-				spin_lock(&gl->gl_spin);
+				spin_lock(&gl->gl_lockref.lock);
 				if (ret) {
 					if (ret == 1)
 						return 2;
@@ -361,7 +361,7 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret)
 	unsigned state = ret & LM_OUT_ST_MASK;
 	int rv;
 
-	spin_lock(&gl->gl_spin);
+	spin_lock(&gl->gl_lockref.lock);
 	trace_gfs2_glock_state_change(gl, state);
 	state_change(gl, state);
 	gh = find_first_waiter(gl);
@@ -405,7 +405,7 @@ retry:
 			pr_err("wanted %u got %u\n", gl->gl_target, state);
 			GLOCK_BUG_ON(gl, 1);
 		}
-		spin_unlock(&gl->gl_spin);
+		spin_unlock(&gl->gl_lockref.lock);
 		return;
 	}
 
@@ -414,9 +414,9 @@ retry:
 		gfs2_demote_wake(gl);
 	if (state != LM_ST_UNLOCKED) {
 		if (glops->go_xmote_bh) {
-			spin_unlock(&gl->gl_spin);
+			spin_unlock(&gl->gl_lockref.lock);
 			rv = glops->go_xmote_bh(gl, gh);
-			spin_lock(&gl->gl_spin);
+			spin_lock(&gl->gl_lockref.lock);
 			if (rv) {
 				do_error(gl, rv);
 				goto out;
@@ -429,7 +429,7 @@ retry:
 out:
 	clear_bit(GLF_LOCK, &gl->gl_flags);
 out_locked:
-	spin_unlock(&gl->gl_spin);
+	spin_unlock(&gl->gl_lockref.lock);
 }
 
 /**
@@ -441,8 +441,8 @@ out_locked:
  */
 
 static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target)
-__releases(&gl->gl_spin)
-__acquires(&gl->gl_spin)
+__releases(&gl->gl_lockref.lock)
+__acquires(&gl->gl_lockref.lock)
 {
 	const struct gfs2_glock_operations *glops = gl->gl_ops;
 	struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
@@ -464,7 +464,7 @@ __acquires(&gl->gl_spin)
 	    (gl->gl_state == LM_ST_EXCLUSIVE) ||
 	    (lck_flags & (LM_FLAG_TRY|LM_FLAG_TRY_1CB)))
 		clear_bit(GLF_BLOCKING, &gl->gl_flags);
-	spin_unlock(&gl->gl_spin);
+	spin_unlock(&gl->gl_lockref.lock);
 	if (glops->go_sync)
 		glops->go_sync(gl);
 	if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
@@ -485,7 +485,7 @@ __acquires(&gl->gl_spin)
 			gfs2_glock_put(gl);
 	}
 
-	spin_lock(&gl->gl_spin);
+	spin_lock(&gl->gl_lockref.lock);
 }
 
 /**
@@ -513,8 +513,8 @@ static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
  */
 
 static void run_queue(struct gfs2_glock *gl, const int nonblock)
-__releases(&gl->gl_spin)
-__acquires(&gl->gl_spin)
+__releases(&gl->gl_lockref.lock)
+__acquires(&gl->gl_lockref.lock)
 {
 	struct gfs2_holder *gh = NULL;
 	int ret;
@@ -596,7 +596,7 @@ static void glock_work_func(struct work_struct *work)
 		finish_xmote(gl, gl->gl_reply);
 		drop_ref = 1;
 	}
-	spin_lock(&gl->gl_spin);
+	spin_lock(&gl->gl_lockref.lock);
 	if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
 	    gl->gl_state != LM_ST_UNLOCKED &&
 	    gl->gl_demote_state != LM_ST_EXCLUSIVE) {
@@ -612,7 +612,7 @@ static void glock_work_func(struct work_struct *work)
 		}
 	}
 	run_queue(gl, 0);
-	spin_unlock(&gl->gl_spin);
+	spin_unlock(&gl->gl_lockref.lock);
 	if (!delay)
 		gfs2_glock_put(gl);
 	else {
@@ -876,8 +876,8 @@ void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
  */
 
 static inline void add_to_queue(struct gfs2_holder *gh)
-__releases(&gl->gl_spin)
-__acquires(&gl->gl_spin)
+__releases(&gl->gl_lockref.lock)
+__acquires(&gl->gl_lockref.lock)
 {
 	struct gfs2_glock *gl = gh->gh_gl;
 	struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
@@ -926,10 +926,10 @@ fail:
 do_cancel:
 	gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
 	if (!(gh->gh_flags & LM_FLAG_PRIORITY)) {
-		spin_unlock(&gl->gl_spin);
+		spin_unlock(&gl->gl_lockref.lock);
 		if (sdp->sd_lockstruct.ls_ops->lm_cancel)
 			sdp->sd_lockstruct.ls_ops->lm_cancel(gl);
-		spin_lock(&gl->gl_spin);
+		spin_lock(&gl->gl_lockref.lock);
 	}
 	return;
 
@@ -967,7 +967,7 @@ int gfs2_glock_nq(struct gfs2_holder *gh)
 	if (test_bit(GLF_LRU, &gl->gl_flags))
 		gfs2_glock_remove_from_lru(gl);
 
-	spin_lock(&gl->gl_spin);
+	spin_lock(&gl->gl_lockref.lock);
 	add_to_queue(gh);
 	if (unlikely((LM_FLAG_NOEXP & gh->gh_flags) &&
 		     test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))) {
@@ -977,7 +977,7 @@ int gfs2_glock_nq(struct gfs2_holder *gh)
 			gl->gl_lockref.count--;
 	}
 	run_queue(gl, 1);
-	spin_unlock(&gl->gl_spin);
+	spin_unlock(&gl->gl_lockref.lock);
 
 	if (!(gh->gh_flags & GL_ASYNC))
 		error = gfs2_glock_wait(gh);
@@ -1010,7 +1010,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
 	unsigned delay = 0;
 	int fast_path = 0;
 
-	spin_lock(&gl->gl_spin);
+	spin_lock(&gl->gl_lockref.lock);
 	if (gh->gh_flags & GL_NOCACHE)
 		handle_callback(gl, LM_ST_UNLOCKED, 0, false);
 
@@ -1018,9 +1018,9 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
 	if (find_first_holder(gl) == NULL) {
 		if (glops->go_unlock) {
 			GLOCK_BUG_ON(gl, test_and_set_bit(GLF_LOCK, &gl->gl_flags));
-			spin_unlock(&gl->gl_spin);
+			spin_unlock(&gl->gl_lockref.lock);
 			glops->go_unlock(gh);
-			spin_lock(&gl->gl_spin);
+			spin_lock(&gl->gl_lockref.lock);
 			clear_bit(GLF_LOCK, &gl->gl_flags);
 		}
 		if (list_empty(&gl->gl_holders) &&
@@ -1033,7 +1033,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
 		gfs2_glock_add_to_lru(gl);
 
 	trace_gfs2_glock_queue(gh, 0);
-	spin_unlock(&gl->gl_spin);
+	spin_unlock(&gl->gl_lockref.lock);
 	if (likely(fast_path))
 		return;
 
@@ -1217,9 +1217,9 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
 			delay = gl->gl_hold_time;
 	}
 
-	spin_lock(&gl->gl_spin);
+	spin_lock(&gl->gl_lockref.lock);
 	handle_callback(gl, state, delay, true);
-	spin_unlock(&gl->gl_spin);
+	spin_unlock(&gl->gl_lockref.lock);
 	if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
 		gfs2_glock_put(gl);
 }
@@ -1259,7 +1259,7 @@ static int gfs2_should_freeze(const struct gfs2_glock *gl)
  * @gl: Pointer to the glock
  * @ret: The return value from the dlm
  *
- * The gl_reply field is under the gl_spin lock so that it is ok
+ * The gl_reply field is under the gl_lockref.lock lock so that it is ok
  * to use a bitfield shared with other glock state fields.
  */
 
@@ -1267,20 +1267,20 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
 {
 	struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct;
 
-	spin_lock(&gl->gl_spin);
+	spin_lock(&gl->gl_lockref.lock);
 	gl->gl_reply = ret;
 
 	if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags))) {
 		if (gfs2_should_freeze(gl)) {
 			set_bit(GLF_FROZEN, &gl->gl_flags);
-			spin_unlock(&gl->gl_spin);
+			spin_unlock(&gl->gl_lockref.lock);
 			return;
 		}
 	}
 
 	gl->gl_lockref.count++;
 	set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
-	spin_unlock(&gl->gl_spin);
+	spin_unlock(&gl->gl_lockref.lock);
 
 	if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
 		gfs2_glock_put(gl);
@@ -1326,14 +1326,14 @@ __acquires(&lru_lock)
 	while(!list_empty(list)) {
 		gl = list_entry(list->next, struct gfs2_glock, gl_lru);
 		list_del_init(&gl->gl_lru);
-		if (!spin_trylock(&gl->gl_spin)) {
+		if (!spin_trylock(&gl->gl_lockref.lock)) {
 add_back_to_lru:
 			list_add(&gl->gl_lru, &lru_list);
 			atomic_inc(&lru_count);
 			continue;
 		}
 		if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
-			spin_unlock(&gl->gl_spin);
+			spin_unlock(&gl->gl_lockref.lock);
 			goto add_back_to_lru;
 		}
 		clear_bit(GLF_LRU, &gl->gl_flags);
@@ -1343,7 +1343,7 @@ add_back_to_lru:
 		WARN_ON(!test_and_clear_bit(GLF_LOCK, &gl->gl_flags));
 		if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
 			gl->gl_lockref.count--;
-		spin_unlock(&gl->gl_spin);
+		spin_unlock(&gl->gl_lockref.lock);
 		cond_resched_lock(&lru_lock);
 	}
 }
@@ -1461,10 +1461,10 @@ static void clear_glock(struct gfs2_glock *gl)
 {
 	gfs2_glock_remove_from_lru(gl);
 
-	spin_lock(&gl->gl_spin);
+	spin_lock(&gl->gl_lockref.lock);
 	if (gl->gl_state != LM_ST_UNLOCKED)
 		handle_callback(gl, LM_ST_UNLOCKED, 0, false);
-	spin_unlock(&gl->gl_spin);
+	spin_unlock(&gl->gl_lockref.lock);
 	if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
 		gfs2_glock_put(gl);
 }
@@ -1482,9 +1482,9 @@ void gfs2_glock_thaw(struct gfs2_sbd *sdp)
 
 static void dump_glock(struct seq_file *seq, struct gfs2_glock *gl)
 {
-	spin_lock(&gl->gl_spin);
+	spin_lock(&gl->gl_lockref.lock);
 	gfs2_dump_glock(seq, gl);
-	spin_unlock(&gl->gl_spin);
+	spin_unlock(&gl->gl_lockref.lock);
 }
 
 static void dump_glock_func(struct gfs2_glock *gl)
@@ -1518,10 +1518,10 @@ void gfs2_glock_finish_truncate(struct gfs2_inode *ip)
 	ret = gfs2_truncatei_resume(ip);
 	gfs2_assert_withdraw(gl->gl_name.ln_sbd, ret == 0);
 
-	spin_lock(&gl->gl_spin);
+	spin_lock(&gl->gl_lockref.lock);
 	clear_bit(GLF_LOCK, &gl->gl_flags);
 	run_queue(gl, 1);
-	spin_unlock(&gl->gl_spin);
+	spin_unlock(&gl->gl_lockref.lock);
 }
 
 static const char *state2str(unsigned state)
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 32572f7..f7cdaa8 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -141,7 +141,7 @@ static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *
 	struct pid *pid;
 
 	/* Look in glock's list of holders for one with current task as owner */
-	spin_lock(&gl->gl_spin);
+	spin_lock(&gl->gl_lockref.lock);
 	pid = task_pid(current);
 	list_for_each_entry(gh, &gl->gl_holders, gh_list) {
 		if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
@@ -151,7 +151,7 @@ static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *
 	}
 	gh = NULL;
 out:
-	spin_unlock(&gl->gl_spin);
+	spin_unlock(&gl->gl_lockref.lock);
 
 	return gh;
 }
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 1f6c9c3..f348cfb 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -146,11 +146,11 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
 	struct gfs2_rgrpd *rgd;
 	int error;
 
-	spin_lock(&gl->gl_spin);
+	spin_lock(&gl->gl_lockref.lock);
 	rgd = gl->gl_object;
 	if (rgd)
 		gfs2_rgrp_brelse(rgd);
-	spin_unlock(&gl->gl_spin);
+	spin_unlock(&gl->gl_lockref.lock);
 
 	if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
 		return;
@@ -162,11 +162,11 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
 	mapping_set_error(mapping, error);
 	gfs2_ail_empty_gl(gl);
 
-	spin_lock(&gl->gl_spin);
+	spin_lock(&gl->gl_lockref.lock);
 	rgd = gl->gl_object;
 	if (rgd)
 		gfs2_free_clones(rgd);
-	spin_unlock(&gl->gl_spin);
+	spin_unlock(&gl->gl_lockref.lock);
 }
 
 /**
@@ -542,7 +542,7 @@ static int freeze_go_demote_ok(const struct gfs2_glock *gl)
  * iopen_go_callback - schedule the dcache entry for the inode to be deleted
  * @gl: the glock
  *
- * gl_spin lock is held while calling this
+ * gl_lockref.lock lock is held while calling this
  */
 static void iopen_go_callback(struct gfs2_glock *gl, bool remote)
 {
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 121ed08..de7b4f9 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -334,9 +334,8 @@ struct gfs2_glock {
 	struct lm_lockname gl_name;
 
 	struct lockref gl_lockref;
-#define gl_spin gl_lockref.lock
 
-	/* State fields protected by gl_spin */
+	/* State fields protected by gl_lockref.lock */
 	unsigned int gl_state:2,	/* Current state */
 		     gl_target:2,	/* Target state */
 		     gl_demote_state:2,	/* State requested by remote node */
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 241a399..fb2b42c 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -50,7 +50,7 @@ static void gfs2_init_glock_once(void *foo)
 	struct gfs2_glock *gl = foo;
 
 	INIT_HLIST_BL_NODE(&gl->gl_list);
-	spin_lock_init(&gl->gl_spin);
+	spin_lock_init(&gl->gl_lockref.lock);
 	INIT_LIST_HEAD(&gl->gl_holders);
 	INIT_LIST_HEAD(&gl->gl_lru);
 	INIT_LIST_HEAD(&gl->gl_ail_list);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 475985d..d29dd0c 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -729,9 +729,9 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
 		rb_erase(n, &sdp->sd_rindex_tree);
 
 		if (gl) {
-			spin_lock(&gl->gl_spin);
+			spin_lock(&gl->gl_lockref.lock);
 			gl->gl_object = NULL;
-			spin_unlock(&gl->gl_spin);
+			spin_unlock(&gl->gl_lockref.lock);
 			gfs2_glock_add_to_lru(gl);
 			gfs2_glock_put(gl);
 		}
-- 
cgit v1.1


From be69e1c19f0efb091f29521715c380842fa4cd7e Mon Sep 17 00:00:00 2001
From: Yaowei Bai <bywxiaobai@163.com>
Date: Thu, 29 Oct 2015 14:18:13 -0400
Subject: fs/ext4: remove unnecessary new_valid_dev check

As new_valid_dev always returns 1, so !new_valid_dev check is not
needed, remove it.

Signed-off-by: Yaowei Bai <bywxiaobai@163.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/namei.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 52a79d4..19ce345 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2468,9 +2468,6 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry,
 	struct inode *inode;
 	int err, credits, retries = 0;
 
-	if (!new_valid_dev(rdev))
-		return -EINVAL;
-
 	err = dquot_initialize(dir);
 	if (err)
 		return err;
-- 
cgit v1.1


From f3f86e33dc3da437fa4f204588ce7c78ea756982 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 30 Oct 2015 16:53:57 -0700
Subject: vfs: Fix pathological performance case for __alloc_fd()

Al Viro points out that:
> >     * [Linux-specific aside] our __alloc_fd() can degrade quite badly
> > with some use patterns.  The cacheline pingpong in the bitmap is probably
> > inevitable, unless we accept considerably heavier memory footprint,
> > but we also have a case when alloc_fd() takes O(n) and it's _not_ hard
> > to trigger - close(3);open(...); will have the next open() after that
> > scanning the entire in-use bitmap.

And Eric Dumazet has a somewhat realistic multithreaded microbenchmark
that opens and closes a lot of sockets with minimal work per socket.

This patch largely fixes it.  We keep a 2nd-level bitmap of the open
file bitmaps, showing which words are already full.  So then we can
traverse that second-level bitmap to efficiently skip already allocated
file descriptors.

On his benchmark, this improves performance by up to an order of
magnitude, by avoiding the excessive open file bitmap scanning.

Tested-and-acked-by: Eric Dumazet <edumazet@google.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/file.c | 39 +++++++++++++++++++++++++++++++++++----
 1 file changed, 35 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/file.c b/fs/file.c
index 6c672ad..6f6eb2b 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -56,6 +56,9 @@ static void free_fdtable_rcu(struct rcu_head *rcu)
 	__free_fdtable(container_of(rcu, struct fdtable, rcu));
 }
 
+#define BITBIT_NR(nr)	BITS_TO_LONGS(BITS_TO_LONGS(nr))
+#define BITBIT_SIZE(nr)	(BITBIT_NR(nr) * sizeof(long))
+
 /*
  * Expand the fdset in the files_struct.  Called with the files spinlock
  * held for write.
@@ -77,6 +80,11 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
 	memset((char *)(nfdt->open_fds) + cpy, 0, set);
 	memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
 	memset((char *)(nfdt->close_on_exec) + cpy, 0, set);
+
+	cpy = BITBIT_SIZE(ofdt->max_fds);
+	set = BITBIT_SIZE(nfdt->max_fds) - cpy;
+	memcpy(nfdt->full_fds_bits, ofdt->full_fds_bits, cpy);
+	memset(cpy+(char *)nfdt->full_fds_bits, 0, set);
 }
 
 static struct fdtable * alloc_fdtable(unsigned int nr)
@@ -115,12 +123,14 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
 	fdt->fd = data;
 
 	data = alloc_fdmem(max_t(size_t,
-				 2 * nr / BITS_PER_BYTE, L1_CACHE_BYTES));
+				 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES));
 	if (!data)
 		goto out_arr;
 	fdt->open_fds = data;
 	data += nr / BITS_PER_BYTE;
 	fdt->close_on_exec = data;
+	data += nr / BITS_PER_BYTE;
+	fdt->full_fds_bits = data;
 
 	return fdt;
 
@@ -229,14 +239,18 @@ static inline void __clear_close_on_exec(int fd, struct fdtable *fdt)
 	__clear_bit(fd, fdt->close_on_exec);
 }
 
-static inline void __set_open_fd(int fd, struct fdtable *fdt)
+static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt)
 {
 	__set_bit(fd, fdt->open_fds);
+	fd /= BITS_PER_LONG;
+	if (!~fdt->open_fds[fd])
+		__set_bit(fd, fdt->full_fds_bits);
 }
 
-static inline void __clear_open_fd(int fd, struct fdtable *fdt)
+static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt)
 {
 	__clear_bit(fd, fdt->open_fds);
+	__clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits);
 }
 
 static int count_open_files(struct fdtable *fdt)
@@ -280,6 +294,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 	new_fdt->max_fds = NR_OPEN_DEFAULT;
 	new_fdt->close_on_exec = newf->close_on_exec_init;
 	new_fdt->open_fds = newf->open_fds_init;
+	new_fdt->full_fds_bits = newf->full_fds_bits_init;
 	new_fdt->fd = &newf->fd_array[0];
 
 	spin_lock(&oldf->file_lock);
@@ -323,6 +338,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 
 	memcpy(new_fdt->open_fds, old_fdt->open_fds, open_files / 8);
 	memcpy(new_fdt->close_on_exec, old_fdt->close_on_exec, open_files / 8);
+	memcpy(new_fdt->full_fds_bits, old_fdt->full_fds_bits, BITBIT_SIZE(open_files));
 
 	for (i = open_files; i != 0; i--) {
 		struct file *f = *old_fds++;
@@ -454,10 +470,25 @@ struct files_struct init_files = {
 		.fd		= &init_files.fd_array[0],
 		.close_on_exec	= init_files.close_on_exec_init,
 		.open_fds	= init_files.open_fds_init,
+		.full_fds_bits	= init_files.full_fds_bits_init,
 	},
 	.file_lock	= __SPIN_LOCK_UNLOCKED(init_files.file_lock),
 };
 
+static unsigned long find_next_fd(struct fdtable *fdt, unsigned long start)
+{
+	unsigned long maxfd = fdt->max_fds;
+	unsigned long maxbit = maxfd / BITS_PER_LONG;
+	unsigned long bitbit = start / BITS_PER_LONG;
+
+	bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG;
+	if (bitbit > maxfd)
+		return maxfd;
+	if (bitbit > start)
+		start = bitbit;
+	return find_next_zero_bit(fdt->open_fds, maxfd, start);
+}
+
 /*
  * allocate a file descriptor, mark it busy.
  */
@@ -476,7 +507,7 @@ repeat:
 		fd = files->next_fd;
 
 	if (fd < fdt->max_fds)
-		fd = find_next_zero_bit(fdt->open_fds, fdt->max_fds, fd);
+		fd = find_next_fd(fdt, fd);
 
 	/*
 	 * N.B. For clone tasks sharing a files structure, this test
-- 
cgit v1.1


From fc90888d07b8e17eec49c04bdb26344fdea96c3b Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sat, 31 Oct 2015 16:06:40 -0700
Subject: vfs: conditionally clear close-on-exec flag

We clear the close-on-exec flag when opening and closing files, and the
bit was almost always already clear before.  Avoid dirtying the
cacheline if the clearning isn't necessary.  That avoids unnecessary
cacheline dirtying and bouncing in multi-socket environments.

Eric Dumazet has a file descriptor benchmark that goes 4% faster from
this on his two-socket machine.  It's probably partly superlinear
improvement due to getting slightly less spinlock contention on the
file_lock spinlock due to less work in the critical section.

Tested-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/file.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/file.c b/fs/file.c
index 6f6eb2b..c6986dc 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -236,7 +236,8 @@ static inline void __set_close_on_exec(int fd, struct fdtable *fdt)
 
 static inline void __clear_close_on_exec(int fd, struct fdtable *fdt)
 {
-	__clear_bit(fd, fdt->close_on_exec);
+	if (test_bit(fd, fdt->close_on_exec))
+		__clear_bit(fd, fdt->close_on_exec);
 }
 
 static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt)
-- 
cgit v1.1


From ca9e7a1c85594f61d7ffb414071e6cae82eae23a Mon Sep 17 00:00:00 2001
From: Steve French <smfrench@gmail.com>
Date: Thu, 1 Oct 2015 21:40:10 -0500
Subject: Allow duplicate extents in SMB3 not just SMB3.1.1

Enable duplicate extents (cp --reflink) ioctl for SMB3.0 not just
SMB3.1.1 since have verified that this works to Windows 2016
(REFS) and additional testing done at recent plugfest with
SMB3.0 not just SMB3.1.1  This will also make it easier
for Samba.

Signed-off-by: Steve French <steve.french@primarydata.com>
Reviewed-by: David Disseldorp <ddiss@suse.de>
---
 fs/cifs/smb2ops.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 18da19f..2f169eb 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -810,7 +810,6 @@ smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon,
 			    cfile->fid.volatile_fid, cfile->pid, &eof, false);
 }
 
-#ifdef CONFIG_CIFS_SMB311
 static int
 smb2_duplicate_extents(const unsigned int xid,
 			struct cifsFileInfo *srcfile,
@@ -854,8 +853,6 @@ smb2_duplicate_extents(const unsigned int xid,
 duplicate_extents_out:
 	return rc;
 }
-#endif /* CONFIG_CIFS_SMB311 */
-
 
 static int
 smb2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
@@ -1703,6 +1700,7 @@ struct smb_version_operations smb30_operations = {
 	.create_lease_buf = smb3_create_lease_buf,
 	.parse_lease_buf = smb3_parse_lease_buf,
 	.clone_range = smb2_clone_range,
+	.duplicate_extents = smb2_duplicate_extents,
 	.validate_negotiate = smb3_validate_negotiate,
 	.wp_retry_size = smb2_wp_retry_size,
 	.dir_needs_close = smb2_dir_needs_close,
-- 
cgit v1.1


From a5ad88ce8c7fae7ddc72ee49a11a75aa837788e0 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 1 Nov 2015 17:09:15 -0800
Subject: mm: get rid of 'vmalloc_info' from /proc/meminfo

It turns out that at least some versions of glibc end up reading
/proc/meminfo at every single startup, because glibc wants to know the
amount of memory the machine has.  And while that's arguably insane,
it's just how things are.

And it turns out that it's not all that expensive most of the time, but
the vmalloc information statistics (amount of virtual memory used in the
vmalloc space, and the biggest remaining chunk) can be rather expensive
to compute.

The 'get_vmalloc_info()' function actually showed up on my profiles as
4% of the CPU usage of "make test" in the git source repository, because
the git tests are lots of very short-lived shell-scripts etc.

It turns out that apparently this same silly vmalloc info gathering
shows up on the facebook servers too, according to Dave Jones.  So it's
not just "make test" for git.

We had two patches to just cache the information (one by me, one by
Ingo) to mitigate this issue, but the whole vmalloc information of of
rather dubious value to begin with, and people who *actually* want to
know what the situation is wrt the vmalloc area should just look at the
much more complete /proc/vmallocinfo instead.

In fact, according to my testing - and perhaps more importantly,
according to that big search engine in the sky: Google - there is
nothing out there that actually cares about those two expensive fields:
VmallocUsed and VmallocChunk.

So let's try to just remove them entirely.  Actually, this just removes
the computation and reports the numbers as zero for now, just to try to
be minimally intrusive.

If this breaks anything, we'll obviously have to re-introduce the code
to compute this all and add the caching patches on top.  But if given
the option, I'd really prefer to just remove this bad idea entirely
rather than add even more code to work around our historical mistake
that likely nobody really cares about.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/meminfo.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index d3ebf2e..9155a5a 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -27,7 +27,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 {
 	struct sysinfo i;
 	unsigned long committed;
-	struct vmalloc_info vmi;
 	long cached;
 	long available;
 	unsigned long pagecache;
@@ -49,8 +48,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 	if (cached < 0)
 		cached = 0;
 
-	get_vmalloc_info(&vmi);
-
 	for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
 		pages[lru] = global_page_state(NR_LRU_BASE + lru);
 
@@ -191,8 +188,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		K(vm_commit_limit()),
 		K(committed),
 		(unsigned long)VMALLOC_TOTAL >> 10,
-		vmi.used >> 10,
-		vmi.largest_chunk >> 10
+		0ul, // used to be vmalloc 'used'
+		0ul  // used to be vmalloc 'largest_chunk'
 #ifdef CONFIG_MEMORY_FAILURE
 		, atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10)
 #endif
-- 
cgit v1.1


From 24ba16bb3d499c49974669cd8429c3e4138ab102 Mon Sep 17 00:00:00 2001
From: Jiri Kosina <jkosina@suse.cz>
Date: Mon, 2 Nov 2015 13:46:58 +1100
Subject: xfs: clear PF_NOFREEZE for xfsaild kthread

Since xfsaild has been converted to kthread in 0030807c, it calls
try_to_freeze() during every AIL push iteration. It however doesn't set
itself as freezable, and therefore this try_to_freeze() will never do
anything.

Before (hopefully eventually) kthread freezing gets converted to fileystem
freezing, we'd rather mark xfsaild freezable (as it can generate I/O
during suspend).

Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_trans_ail.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 1098cf4..06d1a29 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -497,6 +497,7 @@ xfsaild(
 	long		tout = 0;	/* milliseconds */
 
 	current->flags |= PF_MEMALLOC;
+	set_freezable();
 
 	while (!kthread_should_stop()) {
 		if (tout && tout <= 20)
-- 
cgit v1.1


From 135444126a1175912b43366f6109cb297018f034 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 2 Nov 2015 10:11:17 -0500
Subject: pNFS/flexfiles: When mirrored, retry failed reads by switching
 mirrors

If the pNFS/flexfiles file is mirrored, and a read to one mirror fails,
then we should bump the mirror index, so that we retry to a different
mirror. Once we've iterated through all mirrors and all failed, we can
return the layout and issue a new LAYOUTGET.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/flexfilelayout/flexfilelayout.c | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index fbc5a56..7fc14b9 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -741,17 +741,17 @@ ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg,
 }
 
 static struct nfs4_pnfs_ds *
-ff_layout_choose_best_ds_for_read(struct nfs_pageio_descriptor *pgio,
+ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
+				  int start_idx,
 				  int *best_idx)
 {
-	struct nfs4_ff_layout_segment *fls;
+	struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
 	struct nfs4_pnfs_ds *ds;
 	int idx;
 
-	fls = FF_LAYOUT_LSEG(pgio->pg_lseg);
 	/* mirrors are sorted by efficiency */
-	for (idx = 0; idx < fls->mirror_array_cnt; idx++) {
-		ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, idx, false);
+	for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) {
+		ds = nfs4_ff_layout_prepare_ds(lseg, idx, false);
 		if (ds) {
 			*best_idx = idx;
 			return ds;
@@ -782,7 +782,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
 	if (pgio->pg_lseg == NULL)
 		goto out_mds;
 
-	ds = ff_layout_choose_best_ds_for_read(pgio, &ds_idx);
+	ds = ff_layout_choose_best_ds_for_read(pgio->pg_lseg, 0, &ds_idx);
 	if (!ds)
 		goto out_mds;
 	mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx);
@@ -1171,6 +1171,10 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
 
 	switch (err) {
 	case -NFS4ERR_RESET_TO_PNFS:
+		if (ff_layout_choose_best_ds_for_read(hdr->lseg,
+					hdr->pgio_mirror_idx + 1,
+					&hdr->pgio_mirror_idx))
+			goto out_eagain;
 		set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
 			&hdr->lseg->pls_layout->plh_flags);
 		pnfs_read_resend_pnfs(hdr);
@@ -1179,11 +1183,13 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
 		ff_layout_reset_read(hdr);
 		return task->tk_status;
 	case -EAGAIN:
-		rpc_restart_call_prepare(task);
-		return -EAGAIN;
+		goto out_eagain;
 	}
 
 	return 0;
+out_eagain:
+	rpc_restart_call_prepare(task);
+	return -EAGAIN;
 }
 
 static bool
-- 
cgit v1.1


From 260074cd8413489903d4484058e61649d6e08580 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 2 Nov 2015 09:59:00 -0500
Subject: pNFS/flexfiles: Add support for FF_FLAGS_NO_IO_THRU_MDS

For loosely coupled pNFS/flexfiles systems, there is often no advantage
at all in going through the MDS for I/O, since the MDS is subject to
the same limitations as all other clients when talking to DSes. If a
DS is unresponsive, I/O through the MDS will fail.

For such systems, the only scalable solution is to have the pNFS clients
retry doing pNFS, and so the protocol now provides a flag that allows
the pNFS server to signal this.

If LAYOUTGET returns FF_FLAGS_NO_IO_THRU_MDS, then we should assume that
the MDS wants the client to retry using these devices, even if they were
previously marked as being unavailable. To do so, we add a helper,
ff_layout_mark_devices_valid() that will be called from layoutget.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/flexfilelayout/flexfilelayout.c | 18 ++++++++++++++++--
 fs/nfs/flexfilelayout/flexfilelayout.h |  7 +++++++
 2 files changed, 23 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 7fc14b9..03516c8 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -339,6 +339,19 @@ static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls)
 	}
 }
 
+static void ff_layout_mark_devices_valid(struct nfs4_ff_layout_segment *fls)
+{
+	struct nfs4_deviceid_node *node;
+	int i;
+
+	if (!(fls->flags & FF_FLAGS_NO_IO_THRU_MDS))
+		return;
+	for (i = 0; i < fls->mirror_array_cnt; i++) {
+		node = &fls->mirror_array[i]->mirror_ds->id_node;
+		clear_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags);
+	}
+}
+
 static struct pnfs_layout_segment *
 ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
 		     struct nfs4_layoutget_res *lgr,
@@ -499,6 +512,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
 	rc = ff_layout_check_layout(lgr);
 	if (rc)
 		goto out_err_free;
+	ff_layout_mark_devices_valid(fls);
 
 	ret = &fls->generic_hdr;
 	dprintk("<-- %s (success)\n", __func__);
@@ -1035,7 +1049,8 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
 		rpc_wake_up(&tbl->slot_tbl_waitq);
 		/* fall through */
 	default:
-		if (ff_layout_has_available_ds(lseg))
+		if (ff_layout_no_fallback_to_mds(lseg) ||
+		    ff_layout_has_available_ds(lseg))
 			return -NFS4ERR_RESET_TO_PNFS;
 reset:
 		dprintk("%s Retry through MDS. Error %d\n", __func__,
@@ -1153,7 +1168,6 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
 }
 
 /* NFS_PROTO call done callback routines */
-
 static int ff_layout_read_done_cb(struct rpc_task *task,
 				struct nfs_pgio_header *hdr)
 {
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
index 68cc0d9..2bb08bc 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.h
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -10,6 +10,7 @@
 #define FS_NFS_NFS4FLEXFILELAYOUT_H
 
 #define FF_FLAGS_NO_LAYOUTCOMMIT 1
+#define FF_FLAGS_NO_IO_THRU_MDS 2
 
 #include "../pnfs.h"
 
@@ -146,6 +147,12 @@ FF_LAYOUT_MIRROR_COUNT(struct pnfs_layout_segment *lseg)
 }
 
 static inline bool
+ff_layout_no_fallback_to_mds(struct pnfs_layout_segment *lseg)
+{
+	return FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_IO_THRU_MDS;
+}
+
+static inline bool
 ff_layout_test_devid_unavailable(struct nfs4_deviceid_node *node)
 {
 	return nfs4_test_deviceid_unavailable(node);
-- 
cgit v1.1


From 76566773a1f1c2295ed901b6f1241cfe10d99029 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Sat, 24 Oct 2015 17:28:32 -0400
Subject: NFS: Enable client side NFSv4.1 backchannel to use other transports

Forechannel transports get their own "bc_up" method to create an
endpoint for the backchannel service.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
[Anna Schumaker: Add forward declaration of struct net to xprt.h]
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 fs/nfs/callback.c | 40 ++++++++--------------------------------
 1 file changed, 8 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 75f7c0a..a7f2e6e 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -99,17 +99,6 @@ nfs4_callback_up(struct svc_serv *serv)
 }
 
 #if defined(CONFIG_NFS_V4_1)
-static int nfs41_callback_up_net(struct svc_serv *serv, struct net *net)
-{
-	/*
-	 * Create an svc_sock for the back channel service that shares the
-	 * fore channel connection.
-	 * Returns the input port (0) and sets the svc_serv bc_xprt on success
-	 */
-	return svc_create_xprt(serv, "tcp-bc", net, PF_INET, 0,
-			      SVC_SOCK_ANONYMOUS);
-}
-
 /*
  * The callback service for NFSv4.1 callbacks
  */
@@ -184,11 +173,6 @@ static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
 		xprt->bc_serv = serv;
 }
 #else
-static int nfs41_callback_up_net(struct svc_serv *serv, struct net *net)
-{
-	return 0;
-}
-
 static void nfs_minorversion_callback_svc_setup(struct svc_serv *serv,
 		struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp))
 {
@@ -259,7 +243,8 @@ static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struc
 	svc_shutdown_net(serv, net);
 }
 
-static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, struct net *net)
+static int nfs_callback_up_net(int minorversion, struct svc_serv *serv,
+			       struct net *net, struct rpc_xprt *xprt)
 {
 	struct nfs_net *nn = net_generic(net, nfs_net_id);
 	int ret;
@@ -275,20 +260,11 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, struct n
 		goto err_bind;
 	}
 
-	switch (minorversion) {
-		case 0:
-			ret = nfs4_callback_up_net(serv, net);
-			break;
-		case 1:
-		case 2:
-			ret = nfs41_callback_up_net(serv, net);
-			break;
-		default:
-			printk(KERN_ERR "NFS: unknown callback version: %d\n",
-					minorversion);
-			ret = -EINVAL;
-			break;
-	}
+	ret = -EPROTONOSUPPORT;
+	if (minorversion == 0)
+		ret = nfs4_callback_up_net(serv, net);
+	else if (xprt->ops->bc_up)
+		ret = xprt->ops->bc_up(serv, net);
 
 	if (ret < 0) {
 		printk(KERN_ERR "NFS: callback service start failed\n");
@@ -364,7 +340,7 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt)
 		goto err_create;
 	}
 
-	ret = nfs_callback_up_net(minorversion, serv, net);
+	ret = nfs_callback_up_net(minorversion, serv, net, xprt);
 	if (ret < 0)
 		goto err_net;
 
-- 
cgit v1.1


From 306e5c2a3cb45a0256ae6677501d1144e93efa2f Mon Sep 17 00:00:00 2001
From: Geliang Tang <geliangtang@163.com>
Date: Sat, 31 Oct 2015 23:23:15 +0800
Subject: pstore: fix code comment to match code

Fix code comment about kmsg_dump register so it matches the code.

Signed-off-by: Geliang Tang <geliangtang@163.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 fs/pstore/platform.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 0aab920..588461b 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -361,6 +361,9 @@ static struct kmsg_dumper pstore_dumper = {
 	.dump = pstore_dump,
 };
 
+/*
+ * Register with kmsg_dump to save last part of console log on panic.
+ */
 static void pstore_register_kmsg(void)
 {
 	kmsg_dump_register(&pstore_dumper);
@@ -434,8 +437,6 @@ static int pstore_write_compat(enum pstore_type_id type,
  * read function right away to populate the file system. If not
  * then the pstore mount code will call us later to fill out
  * the file system.
- *
- * Register with kmsg_dump to save last part of console log on panic.
  */
 int pstore_register(struct pstore_info *psi)
 {
-- 
cgit v1.1


From 1291fb950f12005600eb410c206fffd7231dee6f Mon Sep 17 00:00:00 2001
From: Geliang Tang <geliangtang@163.com>
Date: Wed, 30 Sep 2015 11:41:05 +0800
Subject: ceph: fix a comment typo

Signed-off-by: Geliang Tang <geliangtang@163.com>
Signed-off-by: Yan, Zheng <zyan@redhat.com>
---
 fs/ceph/cache.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 834f9f3..a4766de 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -88,7 +88,7 @@ static uint16_t ceph_fscache_inode_get_key(const void *cookie_netfs_data,
 	const struct ceph_inode_info* ci = cookie_netfs_data;
 	uint16_t klen;
 
-	/* use ceph virtual inode (id + snaphot) */
+	/* use ceph virtual inode (id + snapshot) */
 	klen = sizeof(ci->i_vino);
 	if (klen > maxbuf)
 		return 0;
-- 
cgit v1.1


From 777d738a5e58ba3b6f3932ab1543ce93703f4873 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 30 Sep 2015 15:04:42 +0200
Subject: ceph: fix message length computation

create_request_message() computes the maximum length of a message,
but uses the wrong type for the time stamp: sizeof(struct timespec)
may be 8 or 16 depending on the architecture, while sizeof(struct
ceph_timespec) is always 8, and that is what gets put into the
message.

Found while auditing the uses of timespec for y2038 problems.

Fixes: b8e69066d8af ("ceph: include time stamp in every MDS request")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Yan, Zheng <zyan@redhat.com>
---
 fs/ceph/mds_client.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 51cb02d..fe2c982 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1935,7 +1935,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
 
 	len = sizeof(*head) +
 		pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) +
-		sizeof(struct timespec);
+		sizeof(struct ceph_timespec);
 
 	/* calculate (max) length for cap releases */
 	len += sizeof(struct ceph_mds_request_release) *
-- 
cgit v1.1


From b5b98989dc7ed2093aeb76f2d0db79888582b0a2 Mon Sep 17 00:00:00 2001
From: "Zhu, Caifeng" <zhucaifeng@unissoft-nj.com>
Date: Thu, 8 Oct 2015 15:26:15 +0800
Subject: ceph: combine as many iovec as possile into one OSD request

Both ceph_sync_direct_write and ceph_sync_read iterate iovec elements
one by one, send one OSD request for each iovec. This is sub-optimal,
We can combine serveral iovec into one page vector, and send an OSD
request for the whole page vector.

Signed-off-by: Zhu, Caifeng <zhucaifeng@unissoft-nj.com>
Signed-off-by: Yan, Zheng <zyan@redhat.com>
---
 fs/ceph/file.c | 87 +++++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 77 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 0c62868..3c68e6a 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -34,6 +34,74 @@
  * need to wait for MDS acknowledgement.
  */
 
+/*
+ * Calculate the length sum of direct io vectors that can
+ * be combined into one page vector.
+ */
+static size_t dio_get_pagev_size(const struct iov_iter *it)
+{
+    const struct iovec *iov = it->iov;
+    const struct iovec *iovend = iov + it->nr_segs;
+    size_t size;
+
+    size = iov->iov_len - it->iov_offset;
+    /*
+     * An iov can be page vectored when both the current tail
+     * and the next base are page aligned.
+     */
+    while (PAGE_ALIGNED((iov->iov_base + iov->iov_len)) &&
+           (++iov < iovend && PAGE_ALIGNED((iov->iov_base)))) {
+        size += iov->iov_len;
+    }
+    dout("dio_get_pagevlen len = %zu\n", size);
+    return size;
+}
+
+/*
+ * Allocate a page vector based on (@it, @nbytes).
+ * The return value is the tuple describing a page vector,
+ * that is (@pages, @page_align, @num_pages).
+ */
+static struct page **
+dio_get_pages_alloc(const struct iov_iter *it, size_t nbytes,
+		    size_t *page_align, int *num_pages)
+{
+	struct iov_iter tmp_it = *it;
+	size_t align;
+	struct page **pages;
+	int ret = 0, idx, npages;
+
+	align = (unsigned long)(it->iov->iov_base + it->iov_offset) &
+		(PAGE_SIZE - 1);
+	npages = calc_pages_for(align, nbytes);
+	pages = kmalloc(sizeof(*pages) * npages, GFP_KERNEL);
+	if (!pages) {
+		pages = vmalloc(sizeof(*pages) * npages);
+		if (!pages)
+			return ERR_PTR(-ENOMEM);
+	}
+
+	for (idx = 0; idx < npages; ) {
+		size_t start;
+		ret = iov_iter_get_pages(&tmp_it, pages + idx, nbytes,
+					 npages - idx, &start);
+		if (ret < 0)
+			goto fail;
+
+		iov_iter_advance(&tmp_it, ret);
+		nbytes -= ret;
+		idx += (ret + start + PAGE_SIZE - 1) / PAGE_SIZE;
+	}
+
+	BUG_ON(nbytes != 0);
+	*num_pages = npages;
+	*page_align = align;
+	dout("dio_get_pages_alloc: got %d pages align %zu\n", npages, align);
+	return pages;
+fail:
+	ceph_put_page_vector(pages, idx, false);
+	return ERR_PTR(ret);
+}
 
 /*
  * Prepare an open request.  Preallocate ceph_cap to avoid an
@@ -458,11 +526,10 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
 			size_t start;
 			ssize_t n;
 
-			n = iov_iter_get_pages_alloc(i, &pages, INT_MAX, &start);
-			if (n < 0)
-				return n;
-
-			num_pages = (n + start + PAGE_SIZE - 1) / PAGE_SIZE;
+			n = dio_get_pagev_size(i);
+			pages = dio_get_pages_alloc(i, n, &start, &num_pages);
+			if (IS_ERR(pages))
+				return PTR_ERR(pages);
 
 			ret = striped_read(inode, off, n,
 					   pages, num_pages, checkeof,
@@ -592,7 +659,7 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
 		CEPH_OSD_FLAG_WRITE;
 
 	while (iov_iter_count(from) > 0) {
-		u64 len = iov_iter_single_seg_count(from);
+		u64 len = dio_get_pagev_size(from);
 		size_t start;
 		ssize_t n;
 
@@ -611,14 +678,14 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
 
 		osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
 
-		n = iov_iter_get_pages_alloc(from, &pages, len, &start);
-		if (unlikely(n < 0)) {
-			ret = n;
+		n = len;
+		pages = dio_get_pages_alloc(from, len, &start, &num_pages);
+		if (IS_ERR(pages)) {
 			ceph_osdc_put_request(req);
+			ret = PTR_ERR(pages);
 			break;
 		}
 
-		num_pages = (n + start + PAGE_SIZE - 1) / PAGE_SIZE;
 		/*
 		 * throw out any page cache pages in this range. this
 		 * may block.
-- 
cgit v1.1


From 5e804ac4824302efc3038e086cb21f2e93ab8900 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Mon, 26 Oct 2015 16:08:43 +0800
Subject: ceph: don't invalidate page cache when inode is no longer used

ceph_check_caps() invalidate page cache when inode is not used
by any open file. This behaviour is not friendly for workload
that repeatly read files.

Signed-off-by: Yan, Zheng <zyan@redhat.com>
---
 fs/ceph/caps.c       | 5 ++---
 fs/ceph/mds_client.c | 9 ++++++++-
 2 files changed, 10 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 27b5668..3493153 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1655,9 +1655,8 @@ retry_locked:
 	    !S_ISDIR(inode->i_mode) &&		/* ignore readdir cache */
 	    ci->i_wrbuffer_ref == 0 &&		/* no dirty pages... */
 	    inode->i_data.nrpages &&		/* have cached pages */
-	    (file_wanted == 0 ||		/* no open files */
-	     (revoking & (CEPH_CAP_FILE_CACHE|
-			  CEPH_CAP_FILE_LAZYIO))) && /*  or revoking cache */
+	    (revoking & (CEPH_CAP_FILE_CACHE|
+			 CEPH_CAP_FILE_LAZYIO)) && /*  or revoking cache */
 	    !tried_invalidate) {
 		dout("check_caps trying to invalidate on %p\n", inode);
 		if (try_nonblocking_invalidate(inode) < 0) {
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index fe2c982..a7a967a 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1430,6 +1430,13 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
 		if ((used | wanted) & CEPH_CAP_ANY_WR)
 			goto out;
 	}
+	/* The inode has cached pages, but it's no longer used.
+	 * we can safely drop it */
+	if (wanted == 0 && used == CEPH_CAP_FILE_CACHE &&
+	    !(oissued & CEPH_CAP_FILE_CACHE)) {
+	  used = 0;
+	  oissued = 0;
+	}
 	if ((used | wanted) & ~oissued & mine)
 		goto out;   /* we need these caps */
 
@@ -1438,7 +1445,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
 		/* we aren't the only cap.. just remove us */
 		__ceph_remove_cap(cap, true);
 	} else {
-		/* try to drop referring dentries */
+		/* try dropping referring dentries */
 		spin_unlock(&ci->i_ceph_lock);
 		d_prune_aliases(inode);
 		dout("trim_caps_cb %p cap %p  pruned, count now %d\n",
-- 
cgit v1.1


From 4c06ace81a60636dec358c288ef6aaf3aa6dc599 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Tue, 27 Oct 2015 17:18:00 +0800
Subject: ceph: add request to i_unsafe_dirops when getting unsafe reply

Previously we add request to i_unsafe_dirops when registering
request. So ceph_fsync() also waits for imcomplete requests.
This is unnecessary, ceph_fsync() only needs to wait unsafe
requests.

Signed-off-by: Yan, Zheng <zyan@redhat.com>
---
 fs/ceph/mds_client.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index a7a967a..1e47a3d 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -633,13 +633,8 @@ static void __register_request(struct ceph_mds_client *mdsc,
 		mdsc->oldest_tid = req->r_tid;
 
 	if (dir) {
-		struct ceph_inode_info *ci = ceph_inode(dir);
-
 		ihold(dir);
-		spin_lock(&ci->i_unsafe_lock);
 		req->r_unsafe_dir = dir;
-		list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
-		spin_unlock(&ci->i_unsafe_lock);
 	}
 }
 
@@ -665,13 +660,14 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
 	rb_erase(&req->r_node, &mdsc->request_tree);
 	RB_CLEAR_NODE(&req->r_node);
 
-	if (req->r_unsafe_dir) {
+	if (req->r_unsafe_dir && req->r_got_unsafe) {
 		struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
-
 		spin_lock(&ci->i_unsafe_lock);
 		list_del_init(&req->r_unsafe_dir_item);
 		spin_unlock(&ci->i_unsafe_lock);
+	}
 
+	if (req->r_unsafe_dir) {
 		iput(req->r_unsafe_dir);
 		req->r_unsafe_dir = NULL;
 	}
@@ -2484,6 +2480,14 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 	} else {
 		req->r_got_unsafe = true;
 		list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
+		if (req->r_unsafe_dir) {
+			struct ceph_inode_info *ci =
+					ceph_inode(req->r_unsafe_dir);
+			spin_lock(&ci->i_unsafe_lock);
+			list_add_tail(&req->r_unsafe_dir_item,
+				      &ci->i_unsafe_dirops);
+			spin_unlock(&ci->i_unsafe_lock);
+		}
 	}
 
 	dout("handle_reply tid %lld result %d\n", tid, result);
-- 
cgit v1.1


From 68cd5b4b7612c2956d8553dfb39490b29f32566d Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Tue, 27 Oct 2015 18:36:06 +0800
Subject: ceph: make fsync() wait unsafe requests that created/modified inode

If we get a unsafe reply for request that created/modified inode,
add the unsafe request to a list in the newly created/modified
inode. So we can make fsync() wait these unsafe requests.

Signed-off-by: Yan, Zheng <zyan@redhat.com>
---
 fs/ceph/caps.c       | 71 +++++++++++++++++++++++++---------------------------
 fs/ceph/inode.c      |  1 +
 fs/ceph/mds_client.c | 14 +++++++++++
 fs/ceph/mds_client.h |  3 +++
 fs/ceph/super.h      |  1 +
 5 files changed, 53 insertions(+), 37 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 3493153..c69e125 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1970,49 +1970,46 @@ out:
 }
 
 /*
- * wait for any uncommitted directory operations to commit.
+ * wait for any unsafe requests to complete.
  */
-static int unsafe_dirop_wait(struct inode *inode)
+static int unsafe_request_wait(struct inode *inode)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct list_head *head = &ci->i_unsafe_dirops;
-	struct ceph_mds_request *req;
-	u64 last_tid;
-	int ret = 0;
-
-	if (!S_ISDIR(inode->i_mode))
-		return 0;
+	struct ceph_mds_request *req1 = NULL, *req2 = NULL;
+	int ret, err = 0;
 
 	spin_lock(&ci->i_unsafe_lock);
-	if (list_empty(head))
-		goto out;
-
-	req = list_last_entry(head, struct ceph_mds_request,
-			      r_unsafe_dir_item);
-	last_tid = req->r_tid;
-
-	do {
-		ceph_mdsc_get_request(req);
-		spin_unlock(&ci->i_unsafe_lock);
+	if (S_ISDIR(inode->i_mode) && !list_empty(&ci->i_unsafe_dirops)) {
+		req1 = list_last_entry(&ci->i_unsafe_dirops,
+					struct ceph_mds_request,
+					r_unsafe_dir_item);
+		ceph_mdsc_get_request(req1);
+	}
+	if (!list_empty(&ci->i_unsafe_iops)) {
+		req2 = list_last_entry(&ci->i_unsafe_iops,
+					struct ceph_mds_request,
+					r_unsafe_target_item);
+		ceph_mdsc_get_request(req2);
+	}
+	spin_unlock(&ci->i_unsafe_lock);
 
-		dout("unsafe_dirop_wait %p wait on tid %llu (until %llu)\n",
-		     inode, req->r_tid, last_tid);
-		ret = !wait_for_completion_timeout(&req->r_safe_completion,
-					ceph_timeout_jiffies(req->r_timeout));
+	dout("unsafe_requeset_wait %p wait on tid %llu %llu\n",
+	     inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);
+	if (req1) {
+		ret = !wait_for_completion_timeout(&req1->r_safe_completion,
+					ceph_timeout_jiffies(req1->r_timeout));
 		if (ret)
-			ret = -EIO;  /* timed out */
-
-		ceph_mdsc_put_request(req);
-
-		spin_lock(&ci->i_unsafe_lock);
-		if (ret || list_empty(head))
-			break;
-		req = list_first_entry(head, struct ceph_mds_request,
-				       r_unsafe_dir_item);
-	} while (req->r_tid < last_tid);
-out:
-	spin_unlock(&ci->i_unsafe_lock);
-	return ret;
+			err = -EIO;
+		ceph_mdsc_put_request(req1);
+	}
+	if (req2) {
+		ret = !wait_for_completion_timeout(&req2->r_safe_completion,
+					ceph_timeout_jiffies(req2->r_timeout));
+		if (ret)
+			err = -EIO;
+		ceph_mdsc_put_request(req2);
+	}
+	return err;
 }
 
 int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
@@ -2038,7 +2035,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 	dirty = try_flush_caps(inode, &flush_tid);
 	dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
 
-	ret = unsafe_dirop_wait(inode);
+	ret = unsafe_request_wait(inode);
 
 	/*
 	 * only wait on non-file metadata writeback (the mds
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 96d2bd8..498dcfa 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -452,6 +452,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
 
 	INIT_LIST_HEAD(&ci->i_unsafe_writes);
 	INIT_LIST_HEAD(&ci->i_unsafe_dirops);
+	INIT_LIST_HEAD(&ci->i_unsafe_iops);
 	spin_lock_init(&ci->i_unsafe_lock);
 
 	ci->i_snap_realm = NULL;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 1e47a3d..89838a2 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -666,6 +666,12 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
 		list_del_init(&req->r_unsafe_dir_item);
 		spin_unlock(&ci->i_unsafe_lock);
 	}
+	if (req->r_target_inode && req->r_got_unsafe) {
+		struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
+		spin_lock(&ci->i_unsafe_lock);
+		list_del_init(&req->r_unsafe_target_item);
+		spin_unlock(&ci->i_unsafe_lock);
+	}
 
 	if (req->r_unsafe_dir) {
 		iput(req->r_unsafe_dir);
@@ -1707,6 +1713,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
 	req->r_started = jiffies;
 	req->r_resend_mds = -1;
 	INIT_LIST_HEAD(&req->r_unsafe_dir_item);
+	INIT_LIST_HEAD(&req->r_unsafe_target_item);
 	req->r_fmode = -1;
 	kref_init(&req->r_kref);
 	INIT_LIST_HEAD(&req->r_wait);
@@ -2529,6 +2536,13 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 	up_read(&mdsc->snap_rwsem);
 	if (realm)
 		ceph_put_snap_realm(mdsc, realm);
+
+	if (err == 0 && req->r_got_unsafe && req->r_target_inode) {
+		struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
+		spin_lock(&ci->i_unsafe_lock);
+		list_add_tail(&req->r_unsafe_target_item, &ci->i_unsafe_iops);
+		spin_unlock(&ci->i_unsafe_lock);
+	}
 out_err:
 	mutex_lock(&mdsc->mutex);
 	if (!req->r_aborted) {
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index f575eaf..ccf11ef 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -236,6 +236,9 @@ struct ceph_mds_request {
 	struct inode	*r_unsafe_dir;
 	struct list_head r_unsafe_dir_item;
 
+	/* unsafe requests that modify the target inode */
+	struct list_head r_unsafe_target_item;
+
 	struct ceph_mds_session *r_session;
 
 	int               r_attempts;   /* resend attempts */
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 2f2460d..75b7d12 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -342,6 +342,7 @@ struct ceph_inode_info {
 
 	struct list_head i_unsafe_writes; /* uncommitted sync writes */
 	struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */
+	struct list_head i_unsafe_iops;   /* uncommitted mds inode ops */
 	spinlock_t i_unsafe_lock;
 
 	struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
-- 
cgit v1.1


From 79dbd1baa651cece408e68a1b445f3628c4b5bdc Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 26 Oct 2015 22:23:56 +0100
Subject: libceph: msg signing callouts don't need con argument

We can use msg->con instead - at the point we sign an outgoing message
or check the signature on the incoming one, msg->con is always set.  We
wouldn't know how to sign a message without an associated session (i.e.
msg->con == NULL) and being able to sign a message using an explicitly
provided authorizer is of no use.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/mds_client.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 89838a2..e7b130a 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -3942,17 +3942,19 @@ static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con,
 	return msg;
 }
 
-static int sign_message(struct ceph_connection *con, struct ceph_msg *msg)
+static int mds_sign_message(struct ceph_msg *msg)
 {
-       struct ceph_mds_session *s = con->private;
+       struct ceph_mds_session *s = msg->con->private;
        struct ceph_auth_handshake *auth = &s->s_auth;
+
        return ceph_auth_sign_message(auth, msg);
 }
 
-static int check_message_signature(struct ceph_connection *con, struct ceph_msg *msg)
+static int mds_check_message_signature(struct ceph_msg *msg)
 {
-       struct ceph_mds_session *s = con->private;
+       struct ceph_mds_session *s = msg->con->private;
        struct ceph_auth_handshake *auth = &s->s_auth;
+
        return ceph_auth_check_message_signature(auth, msg);
 }
 
@@ -3965,8 +3967,8 @@ static const struct ceph_connection_operations mds_con_ops = {
 	.invalidate_authorizer = invalidate_authorizer,
 	.peer_reset = peer_reset,
 	.alloc_msg = mds_alloc_msg,
-	.sign_message = sign_message,
-	.check_message_signature = check_message_signature,
+	.sign_message = mds_sign_message,
+	.check_message_signature = mds_check_message_signature,
 };
 
 /* eof */
-- 
cgit v1.1


From 3e12dbbdbd8809f0455920e42fdbf9eddc002651 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 3 Nov 2015 12:27:22 +1100
Subject: xfs: fix inode size update overflow in xfs_map_direct()

Both direct IO and DAX pass an offset and count into get_blocks that
will overflow a s64 variable when an IO goes into the last supported
block in a file (i.e. at offset 2^63 - 1FSB bytes). This can be seen
from the tracing:

xfs_get_blocks_alloc: [...] offset 0x7ffffffffffff000 count 4096
xfs_gbmap_direct:     [...] offset 0x7ffffffffffff000 count 4096
xfs_gbmap_direct_none:[...] offset 0x7ffffffffffff000 count 4096

0x7ffffffffffff000 + 4096 = 0x8000000000000000, and hence that
overflows the s64 offset and we fail to detect the need for a
filesize update and an ioend is not allocated.

This is *mostly* avoided for direct IO because such extending IOs
occur with full block allocation, and so the "IS_UNWRITTEN()" check
still evaluates as true and we get an ioend that way. However, doing
single sector extending IOs to this last block will expose the fact
that file size updates will not occur after the first allocating
direct IO as the overflow will then be exposed.

There is one further complexity: the DAX page fault path also
exposes the same issue in block allocation. However, page faults
cannot extend the file size, so in this case we want to allocate the
block but do not want to allocate an ioend to enable file size
update at IO completion. Hence we now need to distinguish between
the direct IO patch allocation and dax fault path allocation to
avoid leaking ioend structures.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_aops.c | 50 ++++++++++++++++++++++++++++++++++++++++++++------
 fs/xfs/xfs_aops.h |  2 ++
 fs/xfs/xfs_file.c |  6 +++---
 3 files changed, 49 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 50ab287..e747d6a 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1250,13 +1250,28 @@ xfs_vm_releasepage(
  * the DIO. There is only going to be one reference to the ioend and its life
  * cycle is constrained by the DIO completion code. hence we don't need
  * reference counting here.
+ *
+ * Note that for DIO, an IO to the highest supported file block offset (i.e.
+ * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64
+ * bit variable. Hence if we see this overflow, we have to assume that the IO is
+ * extending the file size. We won't know for sure until IO completion is run
+ * and the actual max write offset is communicated to the IO completion
+ * routine.
+ *
+ * For DAX page faults, we are preparing to never see unwritten extents here,
+ * nor should we ever extend the inode size. Hence we will soon have nothing to
+ * do here for this case, ensuring we don't have to provide an IO completion
+ * callback to free an ioend that we don't actually need for a fault into the
+ * page at offset (2^63 - 1FSB) bytes.
  */
+
 static void
 xfs_map_direct(
 	struct inode		*inode,
 	struct buffer_head	*bh_result,
 	struct xfs_bmbt_irec	*imap,
-	xfs_off_t		offset)
+	xfs_off_t		offset,
+	bool			dax_fault)
 {
 	struct xfs_ioend	*ioend;
 	xfs_off_t		size = bh_result->b_size;
@@ -1269,6 +1284,16 @@ xfs_map_direct(
 
 	trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap);
 
+	/* XXX: preparation for removing unwritten extents in DAX */
+#if 0
+	if (dax_fault) {
+		ASSERT(type == XFS_IO_OVERWRITE);
+		trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
+					    imap);
+		return;
+	}
+#endif
+
 	if (bh_result->b_private) {
 		ioend = bh_result->b_private;
 		ASSERT(ioend->io_size > 0);
@@ -1283,7 +1308,8 @@ xfs_map_direct(
 					      ioend->io_size, ioend->io_type,
 					      imap);
 	} else if (type == XFS_IO_UNWRITTEN ||
-		   offset + size > i_size_read(inode)) {
+		   offset + size > i_size_read(inode) ||
+		   offset + size < 0) {
 		ioend = xfs_alloc_ioend(inode, type);
 		ioend->io_offset = offset;
 		ioend->io_size = size;
@@ -1345,7 +1371,8 @@ __xfs_get_blocks(
 	sector_t		iblock,
 	struct buffer_head	*bh_result,
 	int			create,
-	bool			direct)
+	bool			direct,
+	bool			dax_fault)
 {
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
@@ -1458,7 +1485,8 @@ __xfs_get_blocks(
 			set_buffer_unwritten(bh_result);
 		/* direct IO needs special help */
 		if (create && direct)
-			xfs_map_direct(inode, bh_result, &imap, offset);
+			xfs_map_direct(inode, bh_result, &imap, offset,
+				       dax_fault);
 	}
 
 	/*
@@ -1505,7 +1533,7 @@ xfs_get_blocks(
 	struct buffer_head	*bh_result,
 	int			create)
 {
-	return __xfs_get_blocks(inode, iblock, bh_result, create, false);
+	return __xfs_get_blocks(inode, iblock, bh_result, create, false, false);
 }
 
 int
@@ -1515,7 +1543,17 @@ xfs_get_blocks_direct(
 	struct buffer_head	*bh_result,
 	int			create)
 {
-	return __xfs_get_blocks(inode, iblock, bh_result, create, true);
+	return __xfs_get_blocks(inode, iblock, bh_result, create, true, false);
+}
+
+int
+xfs_get_blocks_dax_fault(
+	struct inode		*inode,
+	sector_t		iblock,
+	struct buffer_head	*bh_result,
+	int			create)
+{
+	return __xfs_get_blocks(inode, iblock, bh_result, create, true, true);
 }
 
 static void
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 86afd1a..d39ba25 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -58,6 +58,8 @@ int	xfs_get_blocks(struct inode *inode, sector_t offset,
 		       struct buffer_head *map_bh, int create);
 int	xfs_get_blocks_direct(struct inode *inode, sector_t offset,
 			      struct buffer_head *map_bh, int create);
+int	xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset,
+			         struct buffer_head *map_bh, int create);
 void	xfs_end_io_dax_write(struct buffer_head *bh, int uptodate);
 
 extern void xfs_count_page_state(struct page *, int *, int *);
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index e78feb4..27abe1c 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1503,7 +1503,7 @@ xfs_filemap_page_mkwrite(
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
 	if (IS_DAX(inode)) {
-		ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct,
+		ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault,
 				    xfs_end_io_dax_write);
 	} else {
 		ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks);
@@ -1538,7 +1538,7 @@ xfs_filemap_fault(
 		 * changes to xfs_get_blocks_direct() to map unwritten extent
 		 * ioend for conversion on read-only mappings.
 		 */
-		ret = __dax_fault(vma, vmf, xfs_get_blocks_direct, NULL);
+		ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault, NULL);
 	} else
 		ret = filemap_fault(vma, vmf);
 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
@@ -1565,7 +1565,7 @@ xfs_filemap_pmd_fault(
 	sb_start_pagefault(inode->i_sb);
 	file_update_time(vma->vm_file);
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-	ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_direct,
+	ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault,
 				    xfs_end_io_dax_write);
 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 	sb_end_pagefault(inode->i_sb);
-- 
cgit v1.1


From 3fbbbea34bac049c0b5938dc065f7d8ee1ef7e67 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 3 Nov 2015 12:27:22 +1100
Subject: xfs: introduce BMAPI_ZERO for allocating zeroed extents

To enable DAX to do atomic allocation of zeroed extents, we need to
drive the block zeroing deep into the allocator. Because
xfs_bmapi_write() can return merged extents on allocation that were
only partially allocated (i.e. requested range spans allocated and
hole regions, allocation into the hole was contiguous), we cannot
zero the extent returned from xfs_bmapi_write() as that can
overwrite existing data with zeros.

Hence we have to drive the extent zeroing into the allocation code,
prior to where we merge the extents into the BMBT and return the
resultant map. This means we need to propagate this need down to
the xfs_alloc_vextent() and issue the block zeroing at this point.

While this functionality is being introduced for DAX, there is no
reason why it is specific to DAX - we can per-zero blocks during the
allocation transaction on any type of device. It's just slow (and
usually slower than unwritten allocation and conversion) on
traditional block devices so doesn't tend to get used. We can,
however, hook hardware zeroing optimisations via sb_issue_zeroout()
to this operation, so it may be useful in future and hence the
"allocate zeroed blocks" API needs to be implementation neutral.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/libxfs/xfs_alloc.c | 10 +++++++++-
 fs/xfs/libxfs/xfs_alloc.h |  8 +++++---
 fs/xfs/libxfs/xfs_bmap.c  | 35 +++++++++++++++++++++++++++++++++--
 fs/xfs/libxfs/xfs_bmap.h  | 13 +++++++++++--
 fs/xfs/xfs_bmap_util.c    | 36 ++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_mount.h        |  3 +++
 6 files changed, 97 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index ffad7f2..4cffc17 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2503,7 +2503,7 @@ xfs_alloc_vextent(
 		 * Try near allocation first, then anywhere-in-ag after
 		 * the first a.g. fails.
 		 */
-		if ((args->userdata  == XFS_ALLOC_INITIAL_USER_DATA) &&
+		if ((args->userdata & XFS_ALLOC_INITIAL_USER_DATA) &&
 		    (mp->m_flags & XFS_MOUNT_32BITINODES)) {
 			args->fsbno = XFS_AGB_TO_FSB(mp,
 					((mp->m_agfrotor / rotorstep) %
@@ -2634,6 +2634,14 @@ xfs_alloc_vextent(
 		XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno),
 			args->len);
 #endif
+
+		/* Zero the extent if we were asked to do so */
+		if (args->userdata & XFS_ALLOC_USERDATA_ZERO) {
+			error = xfs_zero_extent(args->ip, args->fsbno, args->len);
+			if (error)
+				goto error0;
+		}
+
 	}
 	xfs_perag_put(args->pag);
 	return 0;
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index ca1c816..0ecde4d 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -101,6 +101,7 @@ typedef struct xfs_alloc_arg {
 	struct xfs_mount *mp;		/* file system mount point */
 	struct xfs_buf	*agbp;		/* buffer for a.g. freelist header */
 	struct xfs_perag *pag;		/* per-ag struct for this agno */
+	struct xfs_inode *ip;		/* for userdata zeroing method */
 	xfs_fsblock_t	fsbno;		/* file system block number */
 	xfs_agnumber_t	agno;		/* allocation group number */
 	xfs_agblock_t	agbno;		/* allocation group-relative block # */
@@ -120,15 +121,16 @@ typedef struct xfs_alloc_arg {
 	char		wasdel;		/* set if allocation was prev delayed */
 	char		wasfromfl;	/* set if allocation is from freelist */
 	char		isfl;		/* set if is freelist blocks - !acctg */
-	char		userdata;	/* set if this is user data */
+	char		userdata;	/* mask defining userdata treatment */
 	xfs_fsblock_t	firstblock;	/* io first block allocated */
 } xfs_alloc_arg_t;
 
 /*
  * Defines for userdata
  */
-#define XFS_ALLOC_USERDATA		1	/* allocation is for user data*/
-#define XFS_ALLOC_INITIAL_USER_DATA	2	/* special case start of file */
+#define XFS_ALLOC_USERDATA		(1 << 0)/* allocation is for user data*/
+#define XFS_ALLOC_INITIAL_USER_DATA	(1 << 1)/* special case start of file */
+#define XFS_ALLOC_USERDATA_ZERO		(1 << 2)/* zero extent on allocation */
 
 xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp,
 		struct xfs_perag *pag, xfs_extlen_t need);
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 8e2010d..9390b7c 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -3800,8 +3800,13 @@ xfs_bmap_btalloc(
 	args.wasdel = ap->wasdel;
 	args.isfl = 0;
 	args.userdata = ap->userdata;
-	if ((error = xfs_alloc_vextent(&args)))
+	if (ap->userdata & XFS_ALLOC_USERDATA_ZERO)
+		args.ip = ap->ip;
+
+	error = xfs_alloc_vextent(&args);
+	if (error)
 		return error;
+
 	if (tryagain && args.fsbno == NULLFSBLOCK) {
 		/*
 		 * Exact allocation failed. Now try with alignment
@@ -4300,11 +4305,14 @@ xfs_bmapi_allocate(
 
 	/*
 	 * Indicate if this is the first user data in the file, or just any
-	 * user data.
+	 * user data. And if it is userdata, indicate whether it needs to
+	 * be initialised to zero during allocation.
 	 */
 	if (!(bma->flags & XFS_BMAPI_METADATA)) {
 		bma->userdata = (bma->offset == 0) ?
 			XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA;
+		if (bma->flags & XFS_BMAPI_ZERO)
+			bma->userdata |= XFS_ALLOC_USERDATA_ZERO;
 	}
 
 	bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1;
@@ -4419,6 +4427,17 @@ xfs_bmapi_convert_unwritten(
 	mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
 				? XFS_EXT_NORM : XFS_EXT_UNWRITTEN;
 
+	/*
+	 * Before insertion into the bmbt, zero the range being converted
+	 * if required.
+	 */
+	if (flags & XFS_BMAPI_ZERO) {
+		error = xfs_zero_extent(bma->ip, mval->br_startblock,
+					mval->br_blockcount);
+		if (error)
+			return error;
+	}
+
 	error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx,
 			&bma->cur, mval, bma->firstblock, bma->flist,
 			&tmp_logflags);
@@ -4512,6 +4531,18 @@ xfs_bmapi_write(
 	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 
+	/* zeroing is for currently only for data extents, not metadata */
+	ASSERT((flags & (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO)) !=
+			(XFS_BMAPI_METADATA | XFS_BMAPI_ZERO));
+	/*
+	 * we can allocate unwritten extents or pre-zero allocated blocks,
+	 * but it makes no sense to do both at once. This would result in
+	 * zeroing the unwritten extent twice, but it still being an
+	 * unwritten extent....
+	 */
+	ASSERT((flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO)) !=
+			(XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO));
+
 	if (unlikely(XFS_TEST_ERROR(
 	    (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
 	     XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 6aaa0c1..a160f8a 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -52,9 +52,9 @@ struct xfs_bmalloca {
 	xfs_extlen_t		minleft; /* amount must be left after alloc */
 	bool			eof;	/* set if allocating past last extent */
 	bool			wasdel;	/* replacing a delayed allocation */
-	bool			userdata;/* set if is user data */
 	bool			aeof;	/* allocated space at eof */
 	bool			conv;	/* overwriting unwritten extents */
+	char			userdata;/* userdata mask */
 	int			flags;
 };
 
@@ -109,6 +109,14 @@ typedef	struct xfs_bmap_free
  */
 #define XFS_BMAPI_CONVERT	0x040
 
+/*
+ * allocate zeroed extents - this requires all newly allocated user data extents
+ * to be initialised to zero. It will be ignored if XFS_BMAPI_METADATA is set.
+ * Use in conjunction with XFS_BMAPI_CONVERT to convert unwritten extents found
+ * during the allocation range to zeroed written extents.
+ */
+#define XFS_BMAPI_ZERO		0x080
+
 #define XFS_BMAPI_FLAGS \
 	{ XFS_BMAPI_ENTIRE,	"ENTIRE" }, \
 	{ XFS_BMAPI_METADATA,	"METADATA" }, \
@@ -116,7 +124,8 @@ typedef	struct xfs_bmap_free
 	{ XFS_BMAPI_PREALLOC,	"PREALLOC" }, \
 	{ XFS_BMAPI_IGSTATE,	"IGSTATE" }, \
 	{ XFS_BMAPI_CONTIG,	"CONTIG" }, \
-	{ XFS_BMAPI_CONVERT,	"CONVERT" }
+	{ XFS_BMAPI_CONVERT,	"CONVERT" }, \
+	{ XFS_BMAPI_ZERO,	"ZERO" }
 
 
 static inline int xfs_bmapi_aflag(int w)
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 3bf4ad0..3f59698 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -57,6 +57,35 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
 }
 
 /*
+ * Routine to zero an extent on disk allocated to the specific inode.
+ *
+ * The VFS functions take a linearised filesystem block offset, so we have to
+ * convert the sparse xfs fsb to the right format first.
+ * VFS types are real funky, too.
+ */
+int
+xfs_zero_extent(
+	struct xfs_inode *ip,
+	xfs_fsblock_t	start_fsb,
+	xfs_off_t	count_fsb)
+{
+	struct xfs_mount *mp = ip->i_mount;
+	xfs_daddr_t	sector = xfs_fsb_to_db(ip, start_fsb);
+	sector_t	block = XFS_BB_TO_FSBT(mp, sector);
+	ssize_t		size = XFS_FSB_TO_B(mp, count_fsb);
+
+	if (IS_DAX(VFS_I(ip)))
+		return dax_clear_blocks(VFS_I(ip), block, size);
+
+	/*
+	 * let the block layer decide on the fastest method of
+	 * implementing the zeroing.
+	 */
+	return sb_issue_zeroout(mp->m_super, block, count_fsb, GFP_NOFS);
+
+}
+
+/*
  * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
  * caller.  Frees all the extents that need freeing, which must be done
  * last due to locking considerations.  We never free any extents in
@@ -229,6 +258,13 @@ xfs_bmap_rtalloc(
 		xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
 			ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
 					XFS_TRANS_DQ_RTBCOUNT, (long) ralen);
+
+		/* Zero the extent if we were asked to do so */
+		if (ap->userdata & XFS_ALLOC_USERDATA_ZERO) {
+			error = xfs_zero_extent(ap->ip, ap->blkno, ap->length);
+			if (error)
+				return error;
+		}
 	} else {
 		ap->length = 0;
 	}
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 7999e91..404bfa5 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -336,4 +336,7 @@ extern int	xfs_dev_is_read_only(struct xfs_mount *, char *);
 
 extern void	xfs_set_low_space_thresholds(struct xfs_mount *);
 
+int	xfs_zero_extent(struct xfs_inode *ip, xfs_fsblock_t start_fsb,
+			xfs_off_t count_fsb);
+
 #endif	/* __XFS_MOUNT_H__ */
-- 
cgit v1.1


From 1ca191576fc862b4766f58e41aa362b28a7c1866 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 3 Nov 2015 12:37:00 +1100
Subject: xfs: Don't use unwritten extents for DAX

DAX has a page fault serialisation problem with block allocation.
Because it allows concurrent page faults and does not have a page
lock to serialise faults to the same page, it can get two concurrent
faults to the page that race.

When two read faults race, this isn't a huge problem as the data
underlying the page is not changing and so "detect and drop" works
just fine. The issues are to do with write faults.

When two write faults occur, we serialise block allocation in
get_blocks() so only one faul will allocate the extent. It will,
however, be marked as an unwritten extent, and that is where the
problem lies - the DAX fault code cannot differentiate between a
block that was just allocated and a block that was preallocated and
needs zeroing. The result is that both write faults end up zeroing
the block and attempting to convert it back to written.

The problem is that the first fault can zero and convert before the
second fault starts zeroing, resulting in the zeroing for the second
fault overwriting the data that the first fault wrote with zeros.
The second fault then attempts to convert the unwritten extent,
which is then a no-op because it's already written. Data loss occurs
as a result of this race.

Because there is no sane locking construct in the page fault code
that we can use for serialisation across the page faults, we need to
ensure block allocation and zeroing occurs atomically in the
filesystem. This means we can still take concurrent page faults and
the only time they will serialise is in the filesystem
mapping/allocation callback. The page fault code will always see
written, initialised extents, so we will be able to remove the
unwritten extent handling from the DAX code when all filesystems are
converted.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/dax.c           |  5 +++++
 fs/xfs/xfs_aops.c  | 13 +++++++++----
 fs/xfs/xfs_iomap.c | 23 +++++++++++++++++++++--
 3 files changed, 35 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/dax.c b/fs/dax.c
index 7ae6df7..74033ad 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -29,6 +29,11 @@
 #include <linux/uio.h>
 #include <linux/vmstat.h>
 
+/*
+ * dax_clear_blocks() is called from within transaction context from XFS,
+ * and hence this means the stack from this point must follow GFP_NOFS
+ * semantics for all operations.
+ */
 int dax_clear_blocks(struct inode *inode, sector_t block, long size)
 {
 	struct block_device *bdev = inode->i_sb->s_bdev;
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index e747d6a..df3dabd 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1284,15 +1284,12 @@ xfs_map_direct(
 
 	trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap);
 
-	/* XXX: preparation for removing unwritten extents in DAX */
-#if 0
 	if (dax_fault) {
 		ASSERT(type == XFS_IO_OVERWRITE);
 		trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
 					    imap);
 		return;
 	}
-#endif
 
 	if (bh_result->b_private) {
 		ioend = bh_result->b_private;
@@ -1420,10 +1417,12 @@ __xfs_get_blocks(
 	if (error)
 		goto out_unlock;
 
+	/* for DAX, we convert unwritten extents directly */
 	if (create &&
 	    (!nimaps ||
 	     (imap.br_startblock == HOLESTARTBLOCK ||
-	      imap.br_startblock == DELAYSTARTBLOCK))) {
+	      imap.br_startblock == DELAYSTARTBLOCK) ||
+	     (IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
 		if (direct || xfs_get_extsz_hint(ip)) {
 			/*
 			 * Drop the ilock in preparation for starting the block
@@ -1468,6 +1467,12 @@ __xfs_get_blocks(
 		goto out_unlock;
 	}
 
+	if (IS_DAX(inode) && create) {
+		ASSERT(!ISUNWRITTEN(&imap));
+		/* zeroing is not needed at a higher layer */
+		new = 0;
+	}
+
 	/* trim mapping down to size requested */
 	if (direct || size > (1 << inode->i_blkbits))
 		xfs_map_trim_size(inode, iblock, bh_result,
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 1f86033..b48c6b52 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -131,6 +131,7 @@ xfs_iomap_write_direct(
 	uint		qblocks, resblks, resrtextents;
 	int		committed;
 	int		error;
+	int		bmapi_flags = XFS_BMAPI_PREALLOC;
 
 	error = xfs_qm_dqattach(ip, 0);
 	if (error)
@@ -177,6 +178,23 @@ xfs_iomap_write_direct(
 	 * Allocate and setup the transaction
 	 */
 	tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+
+	/*
+	 * For DAX, we do not allocate unwritten extents, but instead we zero
+	 * the block before we commit the transaction.  Ideally we'd like to do
+	 * this outside the transaction context, but if we commit and then crash
+	 * we may not have zeroed the blocks and this will be exposed on
+	 * recovery of the allocation. Hence we must zero before commit.
+	 * Further, if we are mapping unwritten extents here, we need to zero
+	 * and convert them to written so that we don't need an unwritten extent
+	 * callback for DAX. This also means that we need to be able to dip into
+	 * the reserve block pool if there is no space left but we need to do
+	 * unwritten extent conversion.
+	 */
+	if (IS_DAX(VFS_I(ip))) {
+		bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO;
+		tp->t_flags |= XFS_TRANS_RESERVE;
+	}
 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
 				  resblks, resrtextents);
 	/*
@@ -202,8 +220,8 @@ xfs_iomap_write_direct(
 	xfs_bmap_init(&free_list, &firstfsb);
 	nimaps = 1;
 	error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
-				XFS_BMAPI_PREALLOC, &firstfsb, 0,
-				imap, &nimaps, &free_list);
+				bmapi_flags, &firstfsb, 0, imap,
+				&nimaps, &free_list);
 	if (error)
 		goto out_bmap_cancel;
 
@@ -213,6 +231,7 @@ xfs_iomap_write_direct(
 	error = xfs_bmap_finish(&tp, &free_list, &committed);
 	if (error)
 		goto out_bmap_cancel;
+
 	error = xfs_trans_commit(tp);
 	if (error)
 		goto out_unlock;
-- 
cgit v1.1


From 01a155e6cf7db1a8ff2aa73162d7d9ec05ad298f Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 3 Nov 2015 12:37:02 +1100
Subject: xfs: DAX does not use IO completion callbacks

For DAX, we are now doing block zeroing during allocation. This
means we no longer need a special DAX fault IO completion callback
to do unwritten extent conversion. Because mmap never extends the
file size (it SEGVs the process) we don't need a callback to update
the file size, either. Hence we can remove the completion callbacks
from the __dax_fault and __dax_mkwrite calls.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_aops.c | 39 ---------------------------------------
 fs/xfs/xfs_aops.h |  1 -
 fs/xfs/xfs_file.c |  5 ++---
 3 files changed, 2 insertions(+), 43 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index df3dabd..69c2dbc 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1657,45 +1657,6 @@ xfs_end_io_direct_write(
 	__xfs_end_io_direct_write(inode, ioend, offset, size);
 }
 
-/*
- * For DAX we need a mapping buffer callback for unwritten extent conversion
- * when page faults allocate blocks and then zero them. Note that in this
- * case the mapping indicated by the ioend may extend beyond EOF. We most
- * definitely do not want to extend EOF here, so we trim back the ioend size to
- * EOF.
- */
-#ifdef CONFIG_FS_DAX
-void
-xfs_end_io_dax_write(
-	struct buffer_head	*bh,
-	int			uptodate)
-{
-	struct xfs_ioend	*ioend = bh->b_private;
-	struct inode		*inode = ioend->io_inode;
-	ssize_t			size = ioend->io_size;
-
-	ASSERT(IS_DAX(ioend->io_inode));
-
-	/* if there was an error zeroing, then don't convert it */
-	if (!uptodate)
-		ioend->io_error = -EIO;
-
-	/*
-	 * Trim update to EOF, so we don't extend EOF during unwritten extent
-	 * conversion of partial EOF blocks.
-	 */
-	spin_lock(&XFS_I(inode)->i_flags_lock);
-	if (ioend->io_offset + size > i_size_read(inode))
-		size = i_size_read(inode) - ioend->io_offset;
-	spin_unlock(&XFS_I(inode)->i_flags_lock);
-
-	__xfs_end_io_direct_write(inode, ioend, ioend->io_offset, size);
-
-}
-#else
-void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate) { }
-#endif
-
 static inline ssize_t
 xfs_vm_do_dio(
 	struct inode		*inode,
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index d39ba25..f6ffc9a 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -60,7 +60,6 @@ int	xfs_get_blocks_direct(struct inode *inode, sector_t offset,
 			      struct buffer_head *map_bh, int create);
 int	xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset,
 			         struct buffer_head *map_bh, int create);
-void	xfs_end_io_dax_write(struct buffer_head *bh, int uptodate);
 
 extern void xfs_count_page_state(struct page *, int *, int *);
 
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 27abe1c..9c8eef7 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1503,8 +1503,7 @@ xfs_filemap_page_mkwrite(
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
 	if (IS_DAX(inode)) {
-		ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault,
-				    xfs_end_io_dax_write);
+		ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault, NULL);
 	} else {
 		ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks);
 		ret = block_page_mkwrite_return(ret);
@@ -1566,7 +1565,7 @@ xfs_filemap_pmd_fault(
 	file_update_time(vma->vm_file);
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 	ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault,
-				    xfs_end_io_dax_write);
+			      NULL);
 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 	sb_end_pagefault(inode->i_sb);
 
-- 
cgit v1.1


From 3af49285854df66260a263198cc15abb07b95287 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 3 Nov 2015 12:37:02 +1100
Subject: xfs: add ->pfn_mkwrite support for DAX

->pfn_mkwrite support is needed so that when a page with allocated
backing store takes a write fault we can check that the fault has
not raced with a truncate and is pointing to a region beyond the
current end of file.

This also allows us to update the timestamp on the inode, too, which
fixes a generic/080 failure.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_file.c  | 35 +++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_trace.h |  1 +
 2 files changed, 36 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 9c8eef7..f429662 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1572,11 +1572,46 @@ xfs_filemap_pmd_fault(
 	return ret;
 }
 
+/*
+ * pfn_mkwrite was originally inteneded to ensure we capture time stamp
+ * updates on write faults. In reality, it's need to serialise against
+ * truncate similar to page_mkwrite. Hence we open-code dax_pfn_mkwrite()
+ * here and cycle the XFS_MMAPLOCK_SHARED to ensure we serialise the fault
+ * barrier in place.
+ */
+static int
+xfs_filemap_pfn_mkwrite(
+	struct vm_area_struct	*vma,
+	struct vm_fault		*vmf)
+{
+
+	struct inode		*inode = file_inode(vma->vm_file);
+	struct xfs_inode	*ip = XFS_I(inode);
+	int			ret = VM_FAULT_NOPAGE;
+	loff_t			size;
+
+	trace_xfs_filemap_pfn_mkwrite(ip);
+
+	sb_start_pagefault(inode->i_sb);
+	file_update_time(vma->vm_file);
+
+	/* check if the faulting page hasn't raced with truncate */
+	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	if (vmf->pgoff >= size)
+		ret = VM_FAULT_SIGBUS;
+	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+	sb_end_pagefault(inode->i_sb);
+	return ret;
+
+}
+
 static const struct vm_operations_struct xfs_file_vm_ops = {
 	.fault		= xfs_filemap_fault,
 	.pmd_fault	= xfs_filemap_pmd_fault,
 	.map_pages	= filemap_map_pages,
 	.page_mkwrite	= xfs_filemap_page_mkwrite,
+	.pfn_mkwrite	= xfs_filemap_pfn_mkwrite,
 };
 
 STATIC int
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 5ed36b1..c53beda 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -689,6 +689,7 @@ DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid);
 DEFINE_INODE_EVENT(xfs_filemap_fault);
 DEFINE_INODE_EVENT(xfs_filemap_pmd_fault);
 DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite);
+DEFINE_INODE_EVENT(xfs_filemap_pfn_mkwrite);
 
 DECLARE_EVENT_CLASS(xfs_iref_class,
 	TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip),
-- 
cgit v1.1


From 13ad4fe3e087ab66a140f1e00d98f28aa4e3bb28 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 3 Nov 2015 12:37:02 +1100
Subject: xfs: xfs_filemap_pmd_fault treats read faults as write faults

The code initially committed didn't have the same checks for write
faults as the dax_pmd_fault code and hence treats all faults as
write faults. We can get read faults through this path because they
is no pmd_mkwrite path for write faults similar to the normal page
fault path. Hence we need to ensure that we only do c/mtime updates
on write faults, and freeze protection is unnecessary for read
faults.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_file.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index f429662..ce208e3 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1477,7 +1477,7 @@ xfs_file_llseek(
  *
  * mmap_sem (MM)
  *   sb_start_pagefault(vfs, freeze)
- *     i_mmap_lock (XFS - truncate serialisation)
+ *     i_mmaplock (XFS - truncate serialisation)
  *       page_lock (MM)
  *         i_lock (XFS - extent map serialisation)
  */
@@ -1545,6 +1545,13 @@ xfs_filemap_fault(
 	return ret;
 }
 
+/*
+ * Similar to xfs_filemap_fault(), the DAX fault path can call into here on
+ * both read and write faults. Hence we need to handle both cases. There is no
+ * ->pmd_mkwrite callout for huge pages, so we have a single function here to
+ * handle both cases here. @flags carries the information on the type of fault
+ * occuring.
+ */
 STATIC int
 xfs_filemap_pmd_fault(
 	struct vm_area_struct	*vma,
@@ -1561,13 +1568,18 @@ xfs_filemap_pmd_fault(
 
 	trace_xfs_filemap_pmd_fault(ip);
 
-	sb_start_pagefault(inode->i_sb);
-	file_update_time(vma->vm_file);
+	if (flags & FAULT_FLAG_WRITE) {
+		sb_start_pagefault(inode->i_sb);
+		file_update_time(vma->vm_file);
+	}
+
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 	ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault,
 			      NULL);
 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-	sb_end_pagefault(inode->i_sb);
+
+	if (flags & FAULT_FLAG_WRITE)
+		sb_end_pagefault(inode->i_sb);
 
 	return ret;
 }
-- 
cgit v1.1


From 67d8e04e345eafcb2940066f435815032eec467d Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Tue, 3 Nov 2015 12:40:59 +1100
Subject: xfs: invalidate cached acl if set directly via xattr

ACLs are stored as extended attributes of the inode to which they apply.
XFS converts the standard "system.posix_acl_[access|default]" attribute
names used to control ACLs to "trusted.SGI_ACL_[FILE|DEFAULT]" as stored
on-disk. These xattrs are directly exposed in on-disk format via
getxattr/setxattr, without any ACL aware code in the path to perform
validation, etc. This is partly historical and supports backup/restore
applications such as xfsdump to back up and restore the binary blob that
represents ACLs as-is.

Andreas reports that the ACLs observed via the getfacl interface is not
consistent when ACLs are set directly via the setxattr path. This occurs
because the ACLs are cached in-core against the inode and the xattr path
has no knowledge that the operation relates to ACLs.

Update the xattr set codepath to trap writes of the special XFS ACL
attributes and invalidate the associated cached ACL when this occurs.
This ensures that the correct ACLs are used on a subsequent operation
through the actual ACL interface.

Note that this does not update or add support for setting the ACL xattrs
directly beyond the restore use case that requires a correctly formatted
binary blob and to restore a consistent i_mode at the same time. It is
still possible for a root user to set an invalid or inconsistent (with
i_mode) ACL blob on-disk and potentially cause corruption.

[ With fixes from Andreas Gruenbacher. ]

Reported-by: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_xattr.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index c0368151..1e08d3e 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -57,7 +57,8 @@ static int
 xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
 		size_t size, int flags, int xflags)
 {
-	struct xfs_inode *ip = XFS_I(d_inode(dentry));
+	struct xfs_inode	*ip = XFS_I(d_inode(dentry));
+	int			error;
 
 	if (strcmp(name, "") == 0)
 		return -EINVAL;
@@ -70,8 +71,24 @@ xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
 
 	if (!value)
 		return xfs_attr_remove(ip, (unsigned char *)name, xflags);
-	return xfs_attr_set(ip, (unsigned char *)name,
+	error = xfs_attr_set(ip, (unsigned char *)name,
 				(void *)value, size, xflags);
+	/*
+	 * Invalidate any cached ACLs if the user has bypassed the ACL
+	 * interface. We don't validate the content whatsoever so it is caller
+	 * responsibility to provide data in valid format and ensure i_mode is
+	 * consistent.
+	 */
+#ifdef CONFIG_XFS_POSIX_ACL
+	if (!error && (xflags & ATTR_ROOT)) {
+		if (!strcmp(name, SGI_ACL_FILE))
+			forget_cached_acl(VFS_I(ip), ACL_TYPE_ACCESS);
+		else if (!strcmp(name, SGI_ACL_DEFAULT))
+			forget_cached_acl(VFS_I(ip), ACL_TYPE_DEFAULT);
+	}
+#endif
+
+	return error;
 }
 
 static const struct xattr_handler xfs_xattr_user_handler = {
-- 
cgit v1.1


From 86a21c79745ca97676cbd47f8608839382cc0448 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Tue, 3 Nov 2015 12:41:59 +1100
Subject: xfs: Validate the length of on-disk ACLs

In xfs_acl_from_disk, instead of trusting that xfs_acl.acl_cnt is correct,
make sure that the length of the attributes is correct as well.  Also, turn
the aclp parameter into a const pointer.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/libxfs/xfs_format.h |  8 ++++++--
 fs/xfs/xfs_acl.c           | 13 ++++++++-----
 2 files changed, 14 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 9590a06..0e62682 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -1487,9 +1487,13 @@ struct xfs_acl {
 						sizeof(struct xfs_acl_entry) \
 		: 25)
 
-#define XFS_ACL_MAX_SIZE(mp) \
+#define XFS_ACL_SIZE(cnt) \
 	(sizeof(struct xfs_acl) + \
-		sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp)))
+		sizeof(struct xfs_acl_entry) * cnt)
+
+#define XFS_ACL_MAX_SIZE(mp) \
+	XFS_ACL_SIZE(XFS_ACL_MAX_ENTRIES((mp)))
+
 
 /* On-disk XFS extended attribute names */
 #define SGI_ACL_FILE		"SGI_ACL_FILE"
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 4b64167..763e365 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -37,16 +37,19 @@
 
 STATIC struct posix_acl *
 xfs_acl_from_disk(
-	struct xfs_acl	*aclp,
-	int		max_entries)
+	const struct xfs_acl	*aclp,
+	int			len,
+	int			max_entries)
 {
 	struct posix_acl_entry *acl_e;
 	struct posix_acl *acl;
-	struct xfs_acl_entry *ace;
+	const struct xfs_acl_entry *ace;
 	unsigned int count, i;
 
+	if (len < sizeof(*aclp))
+		return ERR_PTR(-EFSCORRUPTED);
 	count = be32_to_cpu(aclp->acl_cnt);
-	if (count > max_entries)
+	if (count > max_entries || XFS_ACL_SIZE(count) != len)
 		return ERR_PTR(-EFSCORRUPTED);
 
 	acl = posix_acl_alloc(count, GFP_KERNEL);
@@ -163,7 +166,7 @@ xfs_get_acl(struct inode *inode, int type)
 		goto out;
 	}
 
-	acl = xfs_acl_from_disk(xfs_acl, XFS_ACL_MAX_ENTRIES(ip->i_mount));
+	acl = xfs_acl_from_disk(xfs_acl, len, XFS_ACL_MAX_ENTRIES(ip->i_mount));
 	if (IS_ERR(acl))
 		goto out;
 
-- 
cgit v1.1


From 09cb22d2a57b51d7d052dfe508f260abc67b69b6 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Tue, 3 Nov 2015 12:53:54 +1100
Subject: xfs: Plug memory leak in xfs_attrmulti_attr_set

When setting attributes via XFS_IOC_ATTRMULTI_BY_HANDLE, the user-space
buffer is copied into a new kernel-space buffer via memdup_user; that
buffer then isn't freed.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_ioctl.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index ea7d85a..e939c20 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -482,6 +482,7 @@ xfs_attrmulti_attr_set(
 	__uint32_t		flags)
 {
 	unsigned char		*kbuf;
+	int			error;
 
 	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
 		return -EPERM;
@@ -492,7 +493,9 @@ xfs_attrmulti_attr_set(
 	if (IS_ERR(kbuf))
 		return PTR_ERR(kbuf);
 
-	return xfs_attr_set(XFS_I(inode), name, kbuf, len, flags);
+	error = xfs_attr_set(XFS_I(inode), name, kbuf, len, flags);
+	kfree(kbuf);
+	return error;
 }
 
 int
-- 
cgit v1.1


From 47e1bf640558237b79d3009fb7dfe157f12f4f7a Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Tue, 3 Nov 2015 12:56:17 +1100
Subject: xfs: invalidate cached acl if set via ioctl

Setting or removing the "SGI_ACL_[FILE|DEFAULT]" attributes via the
XFS_IOC_ATTRMULTI_BY_HANDLE ioctl completely bypasses the POSIX ACL
infrastructure, like setting the "trusted.SGI_ACL_[FILE|DEFAULT]" xattrs
did until commit 6caa1056.  Similar to that commit, invalidate cached
acls when setting/removing them via the ioctl as well.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_acl.h   |  3 +++
 fs/xfs/xfs_ioctl.c | 10 +++++++++-
 fs/xfs/xfs_xattr.c | 38 ++++++++++++++++++++++++--------------
 3 files changed, 36 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 3841b07..75af0a4 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -36,4 +36,7 @@ static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type)
 # define posix_acl_access_exists(inode)			0
 # define posix_acl_default_exists(inode)		0
 #endif /* CONFIG_XFS_POSIX_ACL */
+
+extern void xfs_forget_acl(struct inode *inode, const char *name, int xflags);
+
 #endif	/* __XFS_ACL_H__ */
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index e939c20..67bb298 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -40,6 +40,7 @@
 #include "xfs_symlink.h"
 #include "xfs_trans.h"
 #include "xfs_pnfs.h"
+#include "xfs_acl.h"
 
 #include <linux/capability.h>
 #include <linux/dcache.h>
@@ -494,6 +495,8 @@ xfs_attrmulti_attr_set(
 		return PTR_ERR(kbuf);
 
 	error = xfs_attr_set(XFS_I(inode), name, kbuf, len, flags);
+	if (!error)
+		xfs_forget_acl(inode, name, flags);
 	kfree(kbuf);
 	return error;
 }
@@ -504,9 +507,14 @@ xfs_attrmulti_attr_remove(
 	unsigned char		*name,
 	__uint32_t		flags)
 {
+	int			error;
+
 	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
 		return -EPERM;
-	return xfs_attr_remove(XFS_I(inode), name, flags);
+	error = xfs_attr_remove(XFS_I(inode), name, flags);
+	if (!error)
+		xfs_forget_acl(inode, name, flags);
+	return error;
 }
 
 STATIC int
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 1e08d3e..8294f86 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -53,6 +53,28 @@ xfs_xattr_get(struct dentry *dentry, const char *name,
 	return asize;
 }
 
+void
+xfs_forget_acl(
+	struct inode		*inode,
+	const char		*name,
+	int			xflags)
+{
+	/*
+	 * Invalidate any cached ACLs if the user has bypassed the ACL
+	 * interface. We don't validate the content whatsoever so it is caller
+	 * responsibility to provide data in valid format and ensure i_mode is
+	 * consistent.
+	 */
+	if (xflags & ATTR_ROOT) {
+#ifdef CONFIG_XFS_POSIX_ACL
+		if (!strcmp(name, SGI_ACL_FILE))
+			forget_cached_acl(inode, ACL_TYPE_ACCESS);
+		else if (!strcmp(name, SGI_ACL_DEFAULT))
+			forget_cached_acl(inode, ACL_TYPE_DEFAULT);
+#endif
+	}
+}
+
 static int
 xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
 		size_t size, int flags, int xflags)
@@ -73,20 +95,8 @@ xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
 		return xfs_attr_remove(ip, (unsigned char *)name, xflags);
 	error = xfs_attr_set(ip, (unsigned char *)name,
 				(void *)value, size, xflags);
-	/*
-	 * Invalidate any cached ACLs if the user has bypassed the ACL
-	 * interface. We don't validate the content whatsoever so it is caller
-	 * responsibility to provide data in valid format and ensure i_mode is
-	 * consistent.
-	 */
-#ifdef CONFIG_XFS_POSIX_ACL
-	if (!error && (xflags & ATTR_ROOT)) {
-		if (!strcmp(name, SGI_ACL_FILE))
-			forget_cached_acl(VFS_I(ip), ACL_TYPE_ACCESS);
-		else if (!strcmp(name, SGI_ACL_DEFAULT))
-			forget_cached_acl(VFS_I(ip), ACL_TYPE_DEFAULT);
-	}
-#endif
+	if (!error)
+		xfs_forget_acl(d_inode(dentry), name, xflags);
 
 	return error;
 }
-- 
cgit v1.1


From af3b63822e73b66f3ca9927b46df8b873ab8c6ec Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Tue, 3 Nov 2015 13:06:34 +1100
Subject: xfs: don't leak uuid table on rmmod

Don't leak the UUID table when the module is unloaded.
(Found with kmemleak.)

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_mount.c | 10 ++++++++++
 fs/xfs/xfs_mount.h |  1 +
 fs/xfs/xfs_super.c |  1 +
 3 files changed, 12 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index bf92e0c..fe6fe8a 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -47,6 +47,16 @@ static DEFINE_MUTEX(xfs_uuid_table_mutex);
 static int xfs_uuid_table_size;
 static uuid_t *xfs_uuid_table;
 
+void
+xfs_uuid_table_free(void)
+{
+	if (xfs_uuid_table_size == 0)
+		return;
+	kmem_free(xfs_uuid_table);
+	xfs_uuid_table = NULL;
+	xfs_uuid_table_size = 0;
+}
+
 /*
  * See if the UUID is unique among mounted XFS filesystems.
  * Mount fails if UUID is nil or a FS with the same UUID is already mounted.
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 7999e91..d6cbdcb 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -312,6 +312,7 @@ typedef struct xfs_perag {
 	int		pagb_count;	/* pagb slots in use */
 } xfs_perag_t;
 
+extern void	xfs_uuid_table_free(void);
 extern int	xfs_log_sbcount(xfs_mount_t *);
 extern __uint64_t xfs_default_resblks(xfs_mount_t *mp);
 extern int	xfs_mountfs(xfs_mount_t *mp);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 904f637c..29531ec 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1896,6 +1896,7 @@ exit_xfs_fs(void)
 	xfs_mru_cache_uninit();
 	xfs_destroy_workqueues();
 	xfs_destroy_zones();
+	xfs_uuid_table_free();
 }
 
 module_init(init_xfs_fs);
-- 
cgit v1.1


From fc0561cefc04e7803c0f6501ca4f310a502f65b8 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 3 Nov 2015 13:14:59 +1100
Subject: xfs: optimise away log forces on timestamp updates for fdatasync

xfs: timestamp updates cause excessive fdatasync log traffic

Sage Weil reported that a ceph test workload was writing to the
log on every fdatasync during an overwrite workload. Event tracing
showed that the only metadata modification being made was the
timestamp updates during the write(2) syscall, but fdatasync(2)
is supposed to ignore them. The key observation was that the
transactions in the log all looked like this:

INODE: #regs: 4   ino: 0x8b  flags: 0x45   dsize: 32

And contained a flags field of 0x45 or 0x85, and had data and
attribute forks following the inode core. This means that the
timestamp updates were triggering dirty relogging of previously
logged parts of the inode that hadn't yet been flushed back to
disk.

There are two parts to this problem. The first is that XFS relogs
dirty regions in subsequent transactions, so it carries around the
fields that have been dirtied since the last time the inode was
written back to disk, not since the last time the inode was forced
into the log.

The second part is that on v5 filesystems, the inode change count
update during inode dirtying also sets the XFS_ILOG_CORE flag, so
on v5 filesystems this makes a timestamp update dirty the entire
inode.

As a result when fdatasync is run, it looks at the dirty fields in
the inode, and sees more than just the timestamp flag, even though
the only metadata change since the last fdatasync was just the
timestamps. Hence we force the log on every subsequent fdatasync
even though it is not needed.

To fix this, add a new field to the inode log item that tracks
changes since the last time fsync/fdatasync forced the log to flush
the changes to the journal. This flag is updated when we dirty the
inode, but we do it before updating the change count so it does not
carry the "core dirty" flag from timestamp updates. The fields are
zeroed when the inode is marked clean (due to writeback/freeing) or
when an fsync/datasync forces the log. Hence if we only dirty the
timestamps on the inode between fsync/fdatasync calls, the fdatasync
will not trigger another log force.

Over 100 runs of the test program:

Ext4 baseline:
	runtime: 1.63s +/- 0.24s
	avg lat: 1.59ms +/- 0.24ms
	iops: ~2000

XFS, vanilla kernel:
        runtime: 2.45s +/- 0.18s
	avg lat: 2.39ms +/- 0.18ms
	log forces: ~400/s
	iops: ~1000

XFS, patched kernel:
        runtime: 1.49s +/- 0.26s
	avg lat: 1.46ms +/- 0.25ms
	log forces: ~30/s
	iops: ~1500

Reported-by: Sage Weil <sage@redhat.com>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_file.c        | 21 ++++++++++++++++-----
 fs/xfs/xfs_inode.c       |  2 ++
 fs/xfs/xfs_inode_item.c  |  1 +
 fs/xfs/xfs_inode_item.h  |  1 +
 fs/xfs/xfs_trans_inode.c |  9 +++++++++
 5 files changed, 29 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index e78feb4..c94699c 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -242,19 +242,30 @@ xfs_file_fsync(
 	}
 
 	/*
-	 * All metadata updates are logged, which means that we just have
-	 * to flush the log up to the latest LSN that touched the inode.
+	 * All metadata updates are logged, which means that we just have to
+	 * flush the log up to the latest LSN that touched the inode. If we have
+	 * concurrent fsync/fdatasync() calls, we need them to all block on the
+	 * log force before we clear the ili_fsync_fields field. This ensures
+	 * that we don't get a racing sync operation that does not wait for the
+	 * metadata to hit the journal before returning. If we race with
+	 * clearing the ili_fsync_fields, then all that will happen is the log
+	 * force will do nothing as the lsn will already be on disk. We can't
+	 * race with setting ili_fsync_fields because that is done under
+	 * XFS_ILOCK_EXCL, and that can't happen because we hold the lock shared
+	 * until after the ili_fsync_fields is cleared.
 	 */
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
 	if (xfs_ipincount(ip)) {
 		if (!datasync ||
-		    (ip->i_itemp->ili_fields & ~XFS_ILOG_TIMESTAMP))
+		    (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
 			lsn = ip->i_itemp->ili_last_lsn;
 	}
-	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
-	if (lsn)
+	if (lsn) {
 		error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
+		ip->i_itemp->ili_fsync_fields = 0;
+	}
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
 	/*
 	 * If we only have a single device, and the log force about was
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index dc40a6d..ff629d5 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2365,6 +2365,7 @@ retry:
 
 			iip->ili_last_fields = iip->ili_fields;
 			iip->ili_fields = 0;
+			iip->ili_fsync_fields = 0;
 			iip->ili_logged = 1;
 			xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
 						&iip->ili_item.li_lsn);
@@ -3560,6 +3561,7 @@ xfs_iflush_int(
 	 */
 	iip->ili_last_fields = iip->ili_fields;
 	iip->ili_fields = 0;
+	iip->ili_fsync_fields = 0;
 	iip->ili_logged = 1;
 
 	xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 62bd80f..d14b12b 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -719,6 +719,7 @@ xfs_iflush_abort(
 		 * attempted.
 		 */
 		iip->ili_fields = 0;
+		iip->ili_fsync_fields = 0;
 	}
 	/*
 	 * Release the inode's flush lock since we're done with it.
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 488d812..4c7722e 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -34,6 +34,7 @@ typedef struct xfs_inode_log_item {
 	unsigned short		ili_logged;	   /* flushed logged data */
 	unsigned int		ili_last_fields;   /* fields when flushed */
 	unsigned int		ili_fields;	   /* fields to be logged */
+	unsigned int		ili_fsync_fields;  /* logged since last fsync */
 } xfs_inode_log_item_t;
 
 static inline int xfs_inode_clean(xfs_inode_t *ip)
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 17280cd..b97f1df 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -108,6 +108,15 @@ xfs_trans_log_inode(
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 
 	/*
+	 * Record the specific change for fdatasync optimisation. This
+	 * allows fdatasync to skip log forces for inodes that are only
+	 * timestamp dirty. We do this before the change count so that
+	 * the core being logged in this case does not impact on fdatasync
+	 * behaviour.
+	 */
+	ip->i_itemp->ili_fsync_fields |= flags;
+
+	/*
 	 * First time we log the inode in a transaction, bump the inode change
 	 * counter if it is configured for this to occur. We don't use
 	 * inode_inc_version() because there is no need for extra locking around
-- 
cgit v1.1


From b2a3077414fd6ff1de8972ea55e91f27bcabd913 Mon Sep 17 00:00:00 2001
From: Steve French <smfrench@gmail.com>
Date: Tue, 29 Sep 2015 21:49:28 -0500
Subject: [SMB3] Add parsing for new mount option controlling persistent
 handles

"nopersistenthandles" and "persistenthandles" mount options added.
The former will not request persistent handles on open even when
SMB3 negotiated and Continuous Availability share.  The latter
will request persistent handles (as long as server notes the
capability in protocol negotiation) even if share is not Continuous
Availability share.

Signed-off-by: Steve French <steve.french@primarydata.com>
Reviewed-by: Pavel Shilovsky <pshilovsky@samba.org>
---
 fs/cifs/cifsglob.h |  4 +++-
 fs/cifs/connect.c  | 19 +++++++++++++++++++
 2 files changed, 22 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index b406a32..1849e29 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -493,7 +493,9 @@ struct smb_vol {
 	bool mfsymlinks:1; /* use Minshall+French Symlinks */
 	bool multiuser:1;
 	bool rwpidforward:1; /* pid forward for read/write operations */
-	bool nosharesock;
+	bool nosharesock:1;
+	bool persistent:1;
+	bool nopersistent:1;
 	unsigned int rsize;
 	unsigned int wsize;
 	bool sockopt_tcp_nodelay:1;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 773f4dc..9a9a543 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -87,6 +87,7 @@ enum {
 	Opt_sign, Opt_seal, Opt_noac,
 	Opt_fsc, Opt_mfsymlinks,
 	Opt_multiuser, Opt_sloppy, Opt_nosharesock,
+	Opt_persistent, Opt_nopersistent,
 
 	/* Mount options which take numeric value */
 	Opt_backupuid, Opt_backupgid, Opt_uid,
@@ -169,6 +170,8 @@ static const match_table_t cifs_mount_option_tokens = {
 	{ Opt_multiuser, "multiuser" },
 	{ Opt_sloppy, "sloppy" },
 	{ Opt_nosharesock, "nosharesock" },
+	{ Opt_persistent, "persistenthandles"},
+	{ Opt_nopersistent, "nopersistenthandles"},
 
 	{ Opt_backupuid, "backupuid=%s" },
 	{ Opt_backupgid, "backupgid=%s" },
@@ -1497,6 +1500,22 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 		case Opt_nosharesock:
 			vol->nosharesock = true;
 			break;
+		case Opt_nopersistent:
+			vol->nopersistent = true;
+			if (vol->persistent) {
+				cifs_dbg(VFS,
+				  "persistenthandles mount options conflict\n");
+				goto cifs_parse_mount_err;
+			}
+			break;
+		case Opt_persistent:
+			vol->persistent = true;
+			if (vol->nopersistent) {
+				cifs_dbg(VFS,
+				  "persistenthandles mount options conflict\n");
+				goto cifs_parse_mount_err;
+			}
+			break;
 
 		/* Numeric Values */
 		case Opt_backupuid:
-- 
cgit v1.1


From b618f001a20e44f691dd0e2ffea651a40a651871 Mon Sep 17 00:00:00 2001
From: Steve French <smfrench@gmail.com>
Date: Tue, 3 Nov 2015 09:15:03 -0600
Subject: [SMB3] Enable checking for continuous availability and persistent
 handle support

Validate "persistenthandles" and "nopersistenthandles" mount options against
the support the server claims in negotiate and tree connect SMB3 responses.

Signed-off-by: Steve French <steve.french@primarydata.com>
Reviewed-by: Pavel Shilovsky <pshilovsky@samba.org>
---
 fs/cifs/cifsglob.h |  1 +
 fs/cifs/connect.c  | 30 ++++++++++++++++++++++++++++++
 fs/cifs/smb2ops.c  |  6 +++---
 3 files changed, 34 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 1849e29..28bd477 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -897,6 +897,7 @@ struct cifs_tcon {
 	bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */
 	bool broken_sparse_sup; /* if server or share does not support sparse */
 	bool need_reconnect:1; /* connection reset, tid now invalid */
+	bool use_persistent:1; /* use persistent instead of durable handles */
 #ifdef CONFIG_CIFS_SMB2
 	bool print:1;		/* set if connection to printer share */
 	bool bad_network_name:1; /* set if ret status STATUS_BAD_NETWORK_NAME */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 9a9a543..bb35ae7 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2673,6 +2673,30 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
 		cifs_dbg(FYI, "DFS disabled (%d)\n", tcon->Flags);
 	}
 	tcon->seal = volume_info->seal;
+	tcon->use_persistent = false;
+	/* check if SMB2 or later, CIFS does not support persistent handles */
+	if (volume_info->persistent) {
+		if (ses->server->vals->protocol_id == 0) {
+			cifs_dbg(VFS,
+			     "SMB3 or later required for persistent handles\n");
+			rc = -EOPNOTSUPP;
+			goto out_fail;
+		} else if (ses->server->capabilities &
+			   SMB2_GLOBAL_CAP_PERSISTENT_HANDLES)
+			tcon->use_persistent = true;
+		else /* persistent handles requested but not supported */ {
+			cifs_dbg(VFS,
+				"Persistent handles not supported on share\n");
+			rc = -EOPNOTSUPP;
+			goto out_fail;
+		}
+	} else if ((tcon->capabilities & SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY)
+	     && (ses->server->capabilities & SMB2_GLOBAL_CAP_PERSISTENT_HANDLES)
+	     && (volume_info->nopersistent == false)) {
+		cifs_dbg(FYI, "enabling persistent handles\n");
+		tcon->use_persistent = true;
+	}
+
 	/*
 	 * We can have only one retry value for a connection to a share so for
 	 * resources mounted more than once to the same server share the last
@@ -3521,6 +3545,12 @@ try_mount_again:
 		goto mount_fail_check;
 	}
 
+	if ((volume_info->persistent == true) && ((ses->server->capabilities &
+		SMB2_GLOBAL_CAP_PERSISTENT_HANDLES) == 0)) {
+		cifs_dbg(VFS, "persistent handles not supported by server\n");
+		rc = -EOPNOTSUPP;
+		goto mount_fail_check;
+	}
 	/* search for existing tcon to this server share */
 	tcon = cifs_get_tcon(ses, volume_info);
 	if (IS_ERR(tcon)) {
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 2f169eb..53ccdde 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -1838,7 +1838,7 @@ struct smb_version_values smb21_values = {
 struct smb_version_values smb30_values = {
 	.version_string = SMB30_VERSION_STRING,
 	.protocol_id = SMB30_PROT_ID,
-	.req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU,
+	.req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES,
 	.large_lock_type = 0,
 	.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
 	.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
@@ -1858,7 +1858,7 @@ struct smb_version_values smb30_values = {
 struct smb_version_values smb302_values = {
 	.version_string = SMB302_VERSION_STRING,
 	.protocol_id = SMB302_PROT_ID,
-	.req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU,
+	.req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES,
 	.large_lock_type = 0,
 	.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
 	.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
@@ -1879,7 +1879,7 @@ struct smb_version_values smb302_values = {
 struct smb_version_values smb311_values = {
 	.version_string = SMB311_VERSION_STRING,
 	.protocol_id = SMB311_PROT_ID,
-	.req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU,
+	.req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES,
 	.large_lock_type = 0,
 	.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
 	.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
-- 
cgit v1.1


From f16dfa7cd1b588e5d7ef4b5a19ee579f11b7a41f Mon Sep 17 00:00:00 2001
From: Steve French <smfrench@gmail.com>
Date: Wed, 30 Sep 2015 21:07:59 -0500
Subject: [SMB3] Display persistenthandles in /proc/mounts for SMB3 shares if
 enabled

Signed-off-by: Steve French <steve.french@primarydata.com>
Reviewed-by: Pavel Shilovsky <pshilovsky@samba.org>
---
 fs/cifs/cifsfs.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index e739950..13302e9 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -454,6 +454,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
 		seq_puts(s, ",nocase");
 	if (tcon->retry)
 		seq_puts(s, ",hard");
+	if (tcon->use_persistent)
+		seq_puts(s, ",persistenthandles");
 	if (tcon->unix_ext)
 		seq_puts(s, ",unix");
 	else
-- 
cgit v1.1


From b56eae4df9ef922d5785ec4a15d54d9527cedb13 Mon Sep 17 00:00:00 2001
From: Steve French <smfrench@gmail.com>
Date: Tue, 3 Nov 2015 09:26:27 -0600
Subject: [SMB3] Send durable handle v2 contexts when use of persistent handles
 required

Version 2 of the patch. Thanks to Dan Carpenter and the smatch
tool for finding a problem in the first version of this patch.

CC: Dan Carpenter <dan.carpenter@oracle.com>
Reviewed-by: Pavel Shilovsky <pshilovsky@samba.org>
Signed-off-by: Steve French <steve.french@primarydata.com>
---
 fs/cifs/cifsglob.h |   1 +
 fs/cifs/smb2pdu.c  | 123 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/cifs/smb2pdu.h  |  45 ++++++++++++++++++++
 fs/cifs/smbfsctl.h |   2 +-
 4 files changed, 168 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 28bd477..81273fe 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -1018,6 +1018,7 @@ struct cifs_fid {
 	__u64 persistent_fid;	/* persist file id for smb2 */
 	__u64 volatile_fid;	/* volatile file id for smb2 */
 	__u8 lease_key[SMB2_LEASE_KEY_SIZE];	/* lease key for smb2 */
+	__u8 create_guid[16];
 #endif
 	struct cifs_pending_open *pending_open;
 	unsigned int epoch;
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 597a417..2520ed5 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -1151,13 +1151,130 @@ add_lease_context(struct TCP_Server_Info *server, struct kvec *iov,
 	return 0;
 }
 
+static struct create_durable_v2 *
+create_durable_v2_buf(struct cifs_fid *pfid)
+{
+	struct create_durable_v2 *buf;
+
+	buf = kzalloc(sizeof(struct create_durable_v2), GFP_KERNEL);
+	if (!buf)
+		return NULL;
+
+	buf->ccontext.DataOffset = cpu_to_le16(offsetof
+					(struct create_durable_v2, dcontext));
+	buf->ccontext.DataLength = cpu_to_le32(sizeof(struct durable_context_v2));
+	buf->ccontext.NameOffset = cpu_to_le16(offsetof
+				(struct create_durable_v2, Name));
+	buf->ccontext.NameLength = cpu_to_le16(4);
+
+	buf->dcontext.Timeout = 0; /* Should this be configurable by workload */
+	buf->dcontext.Flags = cpu_to_le32(SMB2_DHANDLE_FLAG_PERSISTENT);
+	get_random_bytes(buf->dcontext.CreateGuid, 16);
+	memcpy(pfid->create_guid, buf->dcontext.CreateGuid, 16);
+
+	/* SMB2_CREATE_DURABLE_HANDLE_REQUEST is "DH2Q" */
+	buf->Name[0] = 'D';
+	buf->Name[1] = 'H';
+	buf->Name[2] = '2';
+	buf->Name[3] = 'Q';
+	return buf;
+}
+
+static struct create_durable_handle_reconnect_v2 *
+create_reconnect_durable_v2_buf(struct cifs_fid *fid)
+{
+	struct create_durable_handle_reconnect_v2 *buf;
+
+	buf = kzalloc(sizeof(struct create_durable_handle_reconnect_v2),
+			GFP_KERNEL);
+	if (!buf)
+		return NULL;
+
+	buf->ccontext.DataOffset =
+		cpu_to_le16(offsetof(struct create_durable_handle_reconnect_v2,
+				     dcontext));
+	buf->ccontext.DataLength =
+		cpu_to_le32(sizeof(struct durable_reconnect_context_v2));
+	buf->ccontext.NameOffset =
+		cpu_to_le16(offsetof(struct create_durable_handle_reconnect_v2,
+			    Name));
+	buf->ccontext.NameLength = cpu_to_le16(4);
+
+	buf->dcontext.Fid.PersistentFileId = fid->persistent_fid;
+	buf->dcontext.Fid.VolatileFileId = fid->volatile_fid;
+	buf->dcontext.Flags = cpu_to_le32(SMB2_DHANDLE_FLAG_PERSISTENT);
+	memcpy(buf->dcontext.CreateGuid, fid->create_guid, 16);
+
+	/* SMB2_CREATE_DURABLE_HANDLE_RECONNECT_V2 is "DH2C" */
+	buf->Name[0] = 'D';
+	buf->Name[1] = 'H';
+	buf->Name[2] = '2';
+	buf->Name[3] = 'C';
+	return buf;
+}
+
 static int
-add_durable_context(struct kvec *iov, unsigned int *num_iovec,
+add_durable_v2_context(struct kvec *iov, unsigned int *num_iovec,
 		    struct cifs_open_parms *oparms)
 {
 	struct smb2_create_req *req = iov[0].iov_base;
 	unsigned int num = *num_iovec;
 
+	iov[num].iov_base = create_durable_v2_buf(oparms->fid);
+	if (iov[num].iov_base == NULL)
+		return -ENOMEM;
+	iov[num].iov_len = sizeof(struct create_durable_v2);
+	if (!req->CreateContextsOffset)
+		req->CreateContextsOffset =
+			cpu_to_le32(sizeof(struct smb2_create_req) - 4 +
+								iov[1].iov_len);
+	le32_add_cpu(&req->CreateContextsLength, sizeof(struct create_durable_v2));
+	inc_rfc1001_len(&req->hdr, sizeof(struct create_durable_v2));
+	*num_iovec = num + 1;
+	return 0;
+}
+
+static int
+add_durable_reconnect_v2_context(struct kvec *iov, unsigned int *num_iovec,
+		    struct cifs_open_parms *oparms)
+{
+	struct smb2_create_req *req = iov[0].iov_base;
+	unsigned int num = *num_iovec;
+
+	/* indicate that we don't need to relock the file */
+	oparms->reconnect = false;
+
+	iov[num].iov_base = create_reconnect_durable_v2_buf(oparms->fid);
+	if (iov[num].iov_base == NULL)
+		return -ENOMEM;
+	iov[num].iov_len = sizeof(struct create_durable_handle_reconnect_v2);
+	if (!req->CreateContextsOffset)
+		req->CreateContextsOffset =
+			cpu_to_le32(sizeof(struct smb2_create_req) - 4 +
+								iov[1].iov_len);
+	le32_add_cpu(&req->CreateContextsLength,
+			sizeof(struct create_durable_handle_reconnect_v2));
+	inc_rfc1001_len(&req->hdr,
+			sizeof(struct create_durable_handle_reconnect_v2));
+	*num_iovec = num + 1;
+	return 0;
+}
+
+static int
+add_durable_context(struct kvec *iov, unsigned int *num_iovec,
+		    struct cifs_open_parms *oparms, bool use_persistent)
+{
+	struct smb2_create_req *req = iov[0].iov_base;
+	unsigned int num = *num_iovec;
+
+	if (use_persistent) {
+		if (oparms->reconnect)
+			return add_durable_reconnect_v2_context(iov, num_iovec,
+								oparms);
+		else
+			return add_durable_v2_context(iov, num_iovec, oparms);
+	}
+
 	if (oparms->reconnect) {
 		iov[num].iov_base = create_reconnect_durable_buf(oparms->fid);
 		/* indicate that we don't need to relock the file */
@@ -1275,7 +1392,9 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
 			ccontext->Next =
 				cpu_to_le32(server->vals->create_lease_size);
 		}
-		rc = add_durable_context(iov, &num_iovecs, oparms);
+
+		rc = add_durable_context(iov, &num_iovecs, oparms,
+					tcon->use_persistent);
 		if (rc) {
 			cifs_small_buf_release(req);
 			kfree(copy_path);
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 4511082..4af5278 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -590,6 +590,44 @@ struct create_durable {
 	} Data;
 } __packed;
 
+/* See MS-SMB2 2.2.13.2.11 */
+/* Flags */
+#define SMB2_DHANDLE_FLAG_PERSISTENT	0x00000002
+struct durable_context_v2 {
+	__le32 Timeout;
+	__le32 Flags;
+	__u64 Reserved;
+	__u8 CreateGuid[16];
+} __packed;
+
+struct create_durable_v2 {
+	struct create_context ccontext;
+	__u8   Name[8];
+	struct durable_context_v2 dcontext;
+} __packed;
+
+/* See MS-SMB2 2.2.13.2.12 */
+struct durable_reconnect_context_v2 {
+	struct {
+		__u64 PersistentFileId;
+		__u64 VolatileFileId;
+	} Fid;
+	__u8 CreateGuid[16];
+	__le32 Flags; /* see above DHANDLE_FLAG_PERSISTENT */
+} __packed;
+
+/* See MS-SMB2 2.2.14.2.12 */
+struct durable_reconnect_context_v2_rsp {
+	__le32 Timeout;
+	__le32 Flags; /* see above DHANDLE_FLAG_PERSISTENT */
+} __packed;
+
+struct create_durable_handle_reconnect_v2 {
+	struct create_context ccontext;
+	__u8   Name[8];
+	struct durable_reconnect_context_v2 dcontext;
+} __packed;
+
 #define COPY_CHUNK_RES_KEY_SIZE	24
 struct resume_key_req {
 	char ResumeKey[COPY_CHUNK_RES_KEY_SIZE];
@@ -643,6 +681,13 @@ struct fsctl_get_integrity_information_rsp {
 /* Integrity flags for above */
 #define FSCTL_INTEGRITY_FLAG_CHECKSUM_ENFORCEMENT_OFF	0x00000001
 
+/* See MS-SMB2 2.2.31.3 */
+struct network_resiliency_req {
+	__le32 Timeout;
+	__le32 Reserved;
+} __packed;
+/* There is no buffer for the response ie no struct network_resiliency_rsp */
+
 
 struct validate_negotiate_info_req {
 	__le32 Capabilities;
diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h
index a639d0d..f996dae 100644
--- a/fs/cifs/smbfsctl.h
+++ b/fs/cifs/smbfsctl.h
@@ -90,7 +90,7 @@
 #define FSCTL_SRV_ENUMERATE_SNAPSHOTS 0x00144064
 /* Retrieve an opaque file reference for server-side data movement ie copy */
 #define FSCTL_SRV_REQUEST_RESUME_KEY 0x00140078
-#define FSCTL_LMR_REQUEST_RESILIENCY 0x001401D4 /* BB add struct */
+#define FSCTL_LMR_REQUEST_RESILIENCY 0x001401D4
 #define FSCTL_LMR_GET_LINK_TRACK_INF 0x001400E8 /* BB add struct */
 #define FSCTL_LMR_SET_LINK_TRACK_INF 0x001400EC /* BB add struct */
 #define FSCTL_VALIDATE_NEGOTIATE_INFO 0x00140204
-- 
cgit v1.1


From 485290a734f14279fa9376b3d6021a2dc1f82356 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <quwenruo@cn.fujitsu.com>
Date: Thu, 29 Oct 2015 17:28:46 +0800
Subject: btrfs: Fix a data space underflow warning

Even with quota disabled, generic/127 will trigger a kernel warning by
underflow data space info.

The bug is caused by buffered write, which in case of short copy, the
start parameter for btrfs_delalloc_release_space() is wrong, and
round_up/down() in btrfs_delalloc_release() extents the range to page
aligned, decreasing one more page than expected.

This patch will fix it by passing correct start.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/file.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 381be79..e6df6f1 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1604,12 +1604,17 @@ again:
 				BTRFS_I(inode)->outstanding_extents++;
 				spin_unlock(&BTRFS_I(inode)->lock);
 			}
-			if (only_release_metadata)
+			if (only_release_metadata) {
 				btrfs_delalloc_release_metadata(inode,
 								release_bytes);
-			else
-				btrfs_delalloc_release_space(inode, pos,
+			} else {
+				u64 __pos;
+
+				__pos = round_down(pos, root->sectorsize) +
+					(dirty_pages << PAGE_CACHE_SHIFT);
+				btrfs_delalloc_release_space(inode, __pos,
 							     release_bytes);
+			}
 		}
 
 		release_bytes = dirty_pages << PAGE_CACHE_SHIFT;
-- 
cgit v1.1


From 13a0db5a53f56d1f77e8902dd23258c99c2154b8 Mon Sep 17 00:00:00 2001
From: chandan <chandan@linux.vnet.ibm.com>
Date: Mon, 2 Nov 2015 13:59:46 +0530
Subject: Btrfs: find_free_extent: Do not erroneously skip LOOP_CACHING_WAIT
 state

When executing generic/001 in a loop on a ppc64 machine (with both sectorsize
and nodesize set to 64k), the following call trace is observed,

WARNING: at /root/repos/linux/fs/btrfs/locking.c:253
Modules linked in:
CPU: 2 PID: 8353 Comm: umount Not tainted 4.3.0-rc5-13676-ga5e681d #54
task: c0000000f2b1f560 ti: c0000000f6008000 task.ti: c0000000f6008000
NIP: c000000000520c88 LR: c0000000004a3b34 CTR: 0000000000000000
REGS: c0000000f600a820 TRAP: 0700   Not tainted  (4.3.0-rc5-13676-ga5e681d)
MSR: 8000000102029032 <SF,VEC,EE,ME,IR,DR,RI>  CR: 24444884  XER: 00000000
CFAR: c0000000004a3b30 SOFTE: 1
GPR00: c0000000004a3b34 c0000000f600aaa0 c00000000108ac00 c0000000f5a808c0
GPR04: 0000000000000000 c0000000f600ae60 0000000000000000 0000000000000005
GPR08: 00000000000020a1 0000000000000001 c0000000f2b1f560 0000000000000030
GPR12: 0000000084842882 c00000000fdc0900 c0000000f600ae60 c0000000f070b800
GPR16: 0000000000000000 c0000000f3c8a000 0000000000000000 0000000000000049
GPR20: 0000000000000001 0000000000000001 c0000000f5aa01f8 0000000000000000
GPR24: 0f83e0f83e0f83e1 c0000000f5a808c0 c0000000f3c8d000 c000000000000000
GPR28: c0000000f600ae74 0000000000000001 c0000000f3c8d000 c0000000f5a808c0
NIP [c000000000520c88] .btrfs_tree_lock+0x48/0x2a0
LR [c0000000004a3b34] .btrfs_lock_root_node+0x44/0x80
Call Trace:
[c0000000f600aaa0] [c0000000f600ab80] 0xc0000000f600ab80 (unreliable)
[c0000000f600ab80] [c0000000004a3b34] .btrfs_lock_root_node+0x44/0x80
[c0000000f600ac00] [c0000000004a99dc] .btrfs_search_slot+0xa8c/0xc00
[c0000000f600ad40] [c0000000004ab878] .btrfs_insert_empty_items+0x98/0x120
[c0000000f600adf0] [c00000000050da44] .btrfs_finish_chunk_alloc+0x1d4/0x620
[c0000000f600af20] [c0000000004be854] .btrfs_create_pending_block_groups+0x1d4/0x2c0
[c0000000f600b020] [c0000000004bf188] .do_chunk_alloc+0x3c8/0x420
[c0000000f600b100] [c0000000004c27cc] .find_free_extent+0xbfc/0x1030
[c0000000f600b260] [c0000000004c2ce8] .btrfs_reserve_extent+0xe8/0x250
[c0000000f600b330] [c0000000004c2f90] .btrfs_alloc_tree_block+0x140/0x590
[c0000000f600b440] [c0000000004a47b4] .__btrfs_cow_block+0x124/0x780
[c0000000f600b530] [c0000000004a4fc0] .btrfs_cow_block+0xf0/0x250
[c0000000f600b5e0] [c0000000004a917c] .btrfs_search_slot+0x22c/0xc00
[c0000000f600b720] [c00000000050aa40] .btrfs_remove_chunk+0x1b0/0x9f0
[c0000000f600b850] [c0000000004c4e04] .btrfs_delete_unused_bgs+0x434/0x570
[c0000000f600b950] [c0000000004d3cb8] .close_ctree+0x2e8/0x3b0
[c0000000f600ba20] [c00000000049d178] .btrfs_put_super+0x18/0x30
[c0000000f600ba90] [c000000000243cd4] .generic_shutdown_super+0xa4/0x1a0
[c0000000f600bb10] [c0000000002441d8] .kill_anon_super+0x18/0x30
[c0000000f600bb90] [c00000000049c898] .btrfs_kill_super+0x18/0xc0
[c0000000f600bc10] [c0000000002444f8] .deactivate_locked_super+0x98/0xe0
[c0000000f600bc90] [c000000000269f94] .cleanup_mnt+0x54/0xa0
[c0000000f600bd10] [c0000000000bd744] .task_work_run+0xc4/0x100
[c0000000f600bdb0] [c000000000016334] .do_notify_resume+0x74/0x80
[c0000000f600be30] [c0000000000098b8] .ret_from_except_lite+0x64/0x68
Instruction dump:
fba1ffe8 fbc1fff0 fbe1fff8 7c791b78 f8010010 f821ff21 e94d0290 81030040
812a04e8 7d094a78 7d290034 5529d97e <0b090000> 3b400000 3be30050 3bc3004c

The above call trace is seen even on x86_64; albeit very rarely and that too
with nodesize set to 64k and with nospace_cache mount option being used.

The reason for the above call trace is,
btrfs_remove_chunk
  check_system_chunk
    Allocate chunk if required
  For each physical stripe on underlying device,
    btrfs_free_dev_extent
      ...
      Take lock on Device tree's root node
      btrfs_cow_block("dev tree's root node");
        btrfs_reserve_extent
          find_free_extent
	    index = BTRFS_RAID_DUP;
	    have_caching_bg = false;

            When in LOOP_CACHING_NOWAIT state, Assume we find a block group
	    which is being cached; Hence have_caching_bg is set to true

            When repeating the search for the next RAID index, we set
	    have_caching_bg to false.

Hence right after completing the LOOP_CACHING_NOWAIT state, we incorrectly
skip LOOP_CACHING_WAIT state and move to LOOP_ALLOC_CHUNK state where we
allocate a chunk and try to add entries corresponding to the chunk's physical
stripe into the device tree. When doing so the task deadlocks itself waiting
for the blocking lock on the root node of the device tree.

This commit fixes the issue by introducing a new local variable to help
indicate as to whether a block group of any RAID type is being cached.

Signed-off-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent-tree.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f50c7c2..99a8e57 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -7029,6 +7029,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
 	bool failed_alloc = false;
 	bool use_cluster = true;
 	bool have_caching_bg = false;
+	bool orig_have_caching_bg = false;
 	bool full_search = false;
 
 	WARN_ON(num_bytes < root->sectorsize);
@@ -7378,6 +7379,10 @@ loop:
 	}
 	up_read(&space_info->groups_sem);
 
+	if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg
+		&& !orig_have_caching_bg)
+		orig_have_caching_bg = true;
+
 	if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
 		goto search;
 
@@ -7400,7 +7405,7 @@ loop:
 			 * don't have any unached bgs and we've alrelady done a
 			 * full search through.
 			 */
-			if (have_caching_bg || !full_search)
+			if (orig_have_caching_bg || !full_search)
 				loop = LOOP_CACHING_WAIT;
 			else
 				loop = LOOP_ALLOC_CHUNK;
-- 
cgit v1.1


From 2959a32a858a2c44bbbce83d19c158d54cc5998a Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 2 Nov 2015 12:32:44 +0000
Subject: Btrfs: fix hole punching when using the no-holes feature

When we are using the no-holes feature, if we punch a hole into a file
range that already contains a hole which overlaps the range we are passing
to fallocate(), we end up removing the extent map that represents the
existing hole without adding a new one. This happens because with the
no-holes feature we do not have explicit extent items to represent holes
and therefore the call to __btrfs_drop_extents(), made from
btrfs_punch_hole(), returns an end offset to the variable drop_end that
is smaller than the end of the range passed to fallocate(), while it
drops all existing extent maps in that range.
Normally having a missing extent map is not a problem, for example for
a readpages() operation we just end up building the extent map by
looking at the fs/subvol tree for a matching extent item (or a lack of
one for implicit holes). However for an fsync that uses the fast path,
which needs to look at the list of modified extent maps, this means
the fsync will not record information about the complete hole we had
before the fallocate() call into the log tree, resulting in a file with
content/layout that does not match what we had neither before nor after
the hole punch operation.

The following test case for fstests reproduces the issue. It fails without
this change because we get a file with a different digest after the fsync
log replay and also with a different extent/hole layout.

  seq=`basename $0`
  seqres=$RESULT_DIR/$seq
  echo "QA output created by $seq"
  tmp=/tmp/$$
  status=1	# failure is the default!
  trap "_cleanup; exit \$status" 0 1 2 3 15

  _cleanup()
  {
     _cleanup_flakey
     rm -f $tmp.*
  }

  # get standard environment, filters and checks
  . ./common/rc
  . ./common/filter
  . ./common/punch
  . ./common/dmflakey

  # real QA test starts here
  _need_to_be_root
  _supported_fs generic
  _supported_os Linux
  _require_scratch
  _require_xfs_io_command "fpunch"
  _require_xfs_io_command "fiemap"
  _require_dm_target flakey
  _require_metadata_journaling $SCRATCH_DEV

  # This test was motivated by an issue found in btrfs when the btrfs
  # no-holes feature is enabled (introduced in kernel 3.14). So enable
  # the feature if the fs being tested is btrfs.
  if [ $FSTYP == "btrfs" ]; then
      _require_btrfs_fs_feature "no_holes"
      _require_btrfs_mkfs_feature "no-holes"
      MKFS_OPTIONS="$MKFS_OPTIONS -O no-holes"
  fi

  rm -f $seqres.full

  _scratch_mkfs >>$seqres.full 2>&1
  _init_flakey
  _mount_flakey

  # Create out test file with some data and then fsync it.
  # We do the fsync only to make sure the last fsync we do in this test
  # triggers the fast code path of btrfs' fsync implementation, a
  # condition necessary to trigger the bug btrfs had.
  $XFS_IO_PROG -f -c "pwrite -S 0xaa 0K 128K" \
                  -c "fsync"                  \
                  $SCRATCH_MNT/foobar | _filter_xfs_io

  # Now punch a hole against the range [96K, 128K[.
  $XFS_IO_PROG -c "fpunch 96K 32K" $SCRATCH_MNT/foobar

  # Punch another hole against a range that overlaps the previous range
  # and ends beyond eof.
  $XFS_IO_PROG -c "fpunch 64K 128K" $SCRATCH_MNT/foobar

  # Punch another hole against a range that overlaps the first range
  # ([96K, 128K[) and ends at eof.
  $XFS_IO_PROG -c "fpunch 32K 96K" $SCRATCH_MNT/foobar

  # Fsync our file. We want to verify that, after a power failure and
  # mounting the filesystem again, the file content reflects all the hole
  # punch operations.
  $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foobar

  echo "File digest before power failure:"
  md5sum $SCRATCH_MNT/foobar | _filter_scratch

  echo "Fiemap before power failure:"
  $XFS_IO_PROG -c "fiemap -v" $SCRATCH_MNT/foobar | _filter_fiemap

  # Silently drop all writes and umount to simulate a crash/power failure.
  _load_flakey_table $FLAKEY_DROP_WRITES
  _unmount_flakey

  # Allow writes again, mount to trigger log replay and validate file
  # contents.
  _load_flakey_table $FLAKEY_ALLOW_WRITES
  _mount_flakey

  echo "File digest after log replay:"
  # Must match the same digest we got before the power failure.
  md5sum $SCRATCH_MNT/foobar | _filter_scratch

  echo "Fiemap after log replay:"
  # Must match the same extent listing we got before the power failure.
  $XFS_IO_PROG -c "fiemap -v" $SCRATCH_MNT/foobar | _filter_fiemap

  _unmount_flakey

  status=0
  exit

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/file.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e6df6f1..cfa243c 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2494,6 +2494,19 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 
 	trans->block_rsv = &root->fs_info->trans_block_rsv;
 	/*
+	 * If we are using the NO_HOLES feature we might have had already an
+	 * hole that overlaps a part of the region [lockstart, lockend] and
+	 * ends at (or beyond) lockend. Since we have no file extent items to
+	 * represent holes, drop_end can be less than lockend and so we must
+	 * make sure we have an extent map representing the existing hole (the
+	 * call to __btrfs_drop_extents() might have dropped the existing extent
+	 * map representing the existing hole), otherwise the fast fsync path
+	 * will not record the existence of the hole region
+	 * [existing_hole_start, lockend].
+	 */
+	if (drop_end <= lockend)
+		drop_end = lockend + 1;
+	/*
 	 * Don't insert file hole extent item if it's for a range beyond eof
 	 * (because it's useless) or if it represents a 0 bytes range (when
 	 * cur_offset == drop_end).
-- 
cgit v1.1


From 592fafe644bf3a48b9e00e182a67d301493634fc Mon Sep 17 00:00:00 2001
From: Steve French <smfrench@gmail.com>
Date: Tue, 3 Nov 2015 10:08:53 -0600
Subject: Add resilienthandles mount parm

Since many servers (Windows clients, and non-clustered servers) do not
support persistent handles but do support resilient handles, allow
the user to specify a mount option "resilienthandles" in order
to get more reliable connections and less chance of data loss
(at least when SMB2.1 or later).  Default resilient handle
timeout (120 seconds to recent Windows server) is used.

Reviewed-by: Pavel Shilovsky <pshilovsky@samba.org>
Signed-off-by: Steve French <steve.french@primarydata.com>
---
 fs/cifs/cifsfs.c   |  2 ++
 fs/cifs/cifsglob.h |  2 ++
 fs/cifs/connect.c  | 31 ++++++++++++++++++++++++++++++-
 fs/cifs/smb2file.c | 19 +++++++++++++++++++
 4 files changed, 53 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 13302e9..f578ef9 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -456,6 +456,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
 		seq_puts(s, ",hard");
 	if (tcon->use_persistent)
 		seq_puts(s, ",persistenthandles");
+	else if (tcon->use_resilient)
+		seq_puts(s, ",resilienthandles");
 	if (tcon->unix_ext)
 		seq_puts(s, ",unix");
 	else
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 81273fe..2b510c5 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -496,6 +496,7 @@ struct smb_vol {
 	bool nosharesock:1;
 	bool persistent:1;
 	bool nopersistent:1;
+	bool resilient:1; /* noresilient not required since not fored for CA */
 	unsigned int rsize;
 	unsigned int wsize;
 	bool sockopt_tcp_nodelay:1;
@@ -897,6 +898,7 @@ struct cifs_tcon {
 	bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */
 	bool broken_sparse_sup; /* if server or share does not support sparse */
 	bool need_reconnect:1; /* connection reset, tid now invalid */
+	bool use_resilient:1; /* use resilient instead of durable handles */
 	bool use_persistent:1; /* use persistent instead of durable handles */
 #ifdef CONFIG_CIFS_SMB2
 	bool print:1;		/* set if connection to printer share */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index bb35ae7..4042996 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -88,6 +88,7 @@ enum {
 	Opt_fsc, Opt_mfsymlinks,
 	Opt_multiuser, Opt_sloppy, Opt_nosharesock,
 	Opt_persistent, Opt_nopersistent,
+	Opt_resilient, Opt_noresilient,
 
 	/* Mount options which take numeric value */
 	Opt_backupuid, Opt_backupgid, Opt_uid,
@@ -172,6 +173,8 @@ static const match_table_t cifs_mount_option_tokens = {
 	{ Opt_nosharesock, "nosharesock" },
 	{ Opt_persistent, "persistenthandles"},
 	{ Opt_nopersistent, "nopersistenthandles"},
+	{ Opt_resilient, "resilienthandles"},
+	{ Opt_noresilient, "noresilienthandles"},
 
 	{ Opt_backupuid, "backupuid=%s" },
 	{ Opt_backupgid, "backupgid=%s" },
@@ -1510,12 +1513,23 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 			break;
 		case Opt_persistent:
 			vol->persistent = true;
-			if (vol->nopersistent) {
+			if ((vol->nopersistent) || (vol->resilient)) {
 				cifs_dbg(VFS,
 				  "persistenthandles mount options conflict\n");
 				goto cifs_parse_mount_err;
 			}
 			break;
+		case Opt_resilient:
+			vol->resilient = true;
+			if (vol->persistent) {
+				cifs_dbg(VFS,
+				  "persistenthandles mount options conflict\n");
+				goto cifs_parse_mount_err;
+			}
+			break;
+		case Opt_noresilient:
+			vol->resilient = false; /* already the default */
+			break;
 
 		/* Numeric Values */
 		case Opt_backupuid:
@@ -2681,6 +2695,7 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
 			     "SMB3 or later required for persistent handles\n");
 			rc = -EOPNOTSUPP;
 			goto out_fail;
+#ifdef CONFIG_CIFS_SMB2
 		} else if (ses->server->capabilities &
 			   SMB2_GLOBAL_CAP_PERSISTENT_HANDLES)
 			tcon->use_persistent = true;
@@ -2689,12 +2704,23 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
 				"Persistent handles not supported on share\n");
 			rc = -EOPNOTSUPP;
 			goto out_fail;
+#endif /* CONFIG_CIFS_SMB2 */
 		}
+#ifdef CONFIG_CIFS_SMB2
 	} else if ((tcon->capabilities & SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY)
 	     && (ses->server->capabilities & SMB2_GLOBAL_CAP_PERSISTENT_HANDLES)
 	     && (volume_info->nopersistent == false)) {
 		cifs_dbg(FYI, "enabling persistent handles\n");
 		tcon->use_persistent = true;
+#endif /* CONFIG_CIFS_SMB2 */
+	} else if (volume_info->resilient) {
+		if (ses->server->vals->protocol_id == 0) {
+			cifs_dbg(VFS,
+			     "SMB2.1 or later required for resilient handles\n");
+			rc = -EOPNOTSUPP;
+			goto out_fail;
+		}
+		tcon->use_resilient = true;
 	}
 
 	/*
@@ -3545,12 +3571,15 @@ try_mount_again:
 		goto mount_fail_check;
 	}
 
+#ifdef CONFIG_CIFS_SMB2
 	if ((volume_info->persistent == true) && ((ses->server->capabilities &
 		SMB2_GLOBAL_CAP_PERSISTENT_HANDLES) == 0)) {
 		cifs_dbg(VFS, "persistent handles not supported by server\n");
 		rc = -EOPNOTSUPP;
 		goto mount_fail_check;
 	}
+#endif /* CONFIG_CIFS_SMB2*/
+
 	/* search for existing tcon to this server share */
 	tcon = cifs_get_tcon(ses, volume_info);
 	if (IS_ERR(tcon)) {
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index 2ab297d..f9e766f 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -43,6 +43,7 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
 	struct smb2_file_all_info *smb2_data = NULL;
 	__u8 smb2_oplock[17];
 	struct cifs_fid *fid = oparms->fid;
+	struct network_resiliency_req nr_ioctl_req;
 
 	smb2_path = cifs_convert_path_to_utf16(oparms->path, oparms->cifs_sb);
 	if (smb2_path == NULL) {
@@ -67,6 +68,24 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
 	if (rc)
 		goto out;
 
+
+	 if (oparms->tcon->use_resilient) {
+		nr_ioctl_req.Timeout = 0; /* use server default (120 seconds) */
+		nr_ioctl_req.Reserved = 0;
+		rc = SMB2_ioctl(xid, oparms->tcon, fid->persistent_fid,
+			fid->volatile_fid, FSCTL_LMR_REQUEST_RESILIENCY, true,
+			(char *)&nr_ioctl_req, sizeof(nr_ioctl_req),
+			NULL, NULL /* no return info */);
+		if (rc == -EOPNOTSUPP) {
+			cifs_dbg(VFS,
+			     "resiliency not supported by server, disabling\n");
+			oparms->tcon->use_resilient = false;
+		} else if (rc)
+			cifs_dbg(FYI, "error %d setting resiliency\n", rc);
+
+		rc = 0;
+	}
+
 	if (buf) {
 		/* open response does not have IndexNumber field - get it */
 		rc = SMB2_get_srv_num(xid, oparms->tcon, fid->persistent_fid,
-- 
cgit v1.1


From a6b1533e9a57d76cd3d9b7649d29ac604b1874b8 Mon Sep 17 00:00:00 2001
From: Eric Ren <zren@suse.com>
Date: Wed, 14 Oct 2015 23:28:26 +0800
Subject: dlm: make posix locks interruptible

Replace wait_event_killable with wait_event_interruptible
so that a program waiting for a posix lock can be
interrupted by a signal.  With the killable version,
a program was not interruptible by a signal if it
had a signal handler set for it, overriding the default
action of terminating the process.

Signed-off-by: Eric Ren <zren@suse.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/plock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index 5532f09..88f1036 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -145,7 +145,7 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
 	send_op(op);
 
 	if (xop->callback == NULL) {
-		rv = wait_event_killable(recv_wq, (op->done != 0));
+		rv = wait_event_interruptible(recv_wq, (op->done != 0));
 		if (rv == -ERESTARTSYS) {
 			log_debug(ls, "dlm_posix_lock: wait killed %llx",
 				  (unsigned long long)number);
-- 
cgit v1.1


From 118c9163562426face9ff0903f1cbd3e1a959ca9 Mon Sep 17 00:00:00 2001
From: Yaowei Bai <bywxiaobai@163.com>
Date: Mon, 28 Sep 2015 22:30:32 +0800
Subject: fs/nfs: remove unnecessary new_valid_dev check

As new_valid_dev always returns 1, so !new_valid_dev check is not
needed, remove it.

Signed-off-by: Yaowei Bai <bywxiaobai@163.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/dir.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 3d8e4ff..ce5a218 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1714,9 +1714,6 @@ nfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)
 	dfprintk(VFS, "NFS: mknod(%s/%lu), %pd\n",
 			dir->i_sb->s_id, dir->i_ino, dentry);
 
-	if (!new_valid_dev(rdev))
-		return -EINVAL;
-
 	attr.ia_mode = mode;
 	attr.ia_valid = ATTR_MODE;
 
-- 
cgit v1.1


From 8fbcf237439f841e7e9c4675790e08ea1c295bd3 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Tue, 3 Nov 2015 18:25:34 +0100
Subject: nfs: Remove unused xdr page offsets in getacl/setacl arguments

The arguments passed around for getacl and setacl xdr encoding, struct
nfs_setaclargs and struct nfs_getaclargs, both contain an array of
pages, an offset into the first page, and the length of the page data.
The offset is unused as it is always zero; remove it.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c | 5 ++---
 fs/nfs/nfs4xdr.c  | 4 ++--
 2 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 87a081c..7ed8f2c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4603,7 +4603,7 @@ static inline int nfs4_server_supports_acls(struct nfs_server *server)
 #define NFS4ACL_MAXPAGES DIV_ROUND_UP(XATTR_SIZE_MAX, PAGE_SIZE)
 
 static int buf_to_pages_noslab(const void *buf, size_t buflen,
-		struct page **pages, unsigned int *pgbase)
+		struct page **pages)
 {
 	struct page *newpage, **spages;
 	int rc = 0;
@@ -4747,7 +4747,6 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu
 		goto out_free;
 
 	args.acl_len = npages * PAGE_SIZE;
-	args.acl_pgbase = 0;
 
 	dprintk("%s  buf %p buflen %zu npages %d args.acl_len %zu\n",
 		__func__, buf, buflen, npages, args.acl_len);
@@ -4839,7 +4838,7 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
 		return -EOPNOTSUPP;
 	if (npages > ARRAY_SIZE(pages))
 		return -ERANGE;
-	i = buf_to_pages_noslab(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
+	i = buf_to_pages_noslab(buf, buflen, arg.acl_pages);
 	if (i < 0)
 		return i;
 	nfs4_inode_return_delegation(inode);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 9f65679..22a1ddd 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1659,7 +1659,7 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun
 	*p = cpu_to_be32(FATTR4_WORD0_ACL);
 	p = reserve_space(xdr, 4);
 	*p = cpu_to_be32(arg->acl_len);
-	xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len);
+	xdr_write_pages(xdr, arg->acl_pages, 0, arg->acl_len);
 }
 
 static void
@@ -2491,7 +2491,7 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr,
 	encode_getattr_two(xdr, FATTR4_WORD0_ACL, 0, &hdr);
 
 	xdr_inline_pages(&req->rq_rcv_buf, replen << 2,
-		args->acl_pages, args->acl_pgbase, args->acl_len);
+		args->acl_pages, 0, args->acl_len);
 
 	encode_nops(&hdr);
 }
-- 
cgit v1.1


From 1ca843a2d28dec89e58e7227c27a9d55f21f59e1 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Tue, 3 Nov 2015 18:25:33 +0100
Subject: nfs: Fix GETATTR bitmap verification

When decoding GETATTR replies, the client checks the attribute bitmap
for which attributes the server has sent.  It misses bits at the word
boundaries, though; fix that.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4xdr.c | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 22a1ddd..dfed4f5 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4375,6 +4375,11 @@ static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat)
 		goto xdr_error;
 	if ((status = decode_attr_files_total(xdr, bitmap, &fsstat->tfiles)) != 0)
 		goto xdr_error;
+
+	status = -EIO;
+	if (unlikely(bitmap[0]))
+		goto xdr_error;
+
 	if ((status = decode_attr_space_avail(xdr, bitmap, &fsstat->abytes)) != 0)
 		goto xdr_error;
 	if ((status = decode_attr_space_free(xdr, bitmap, &fsstat->fbytes)) != 0)
@@ -4574,6 +4579,10 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
 		goto xdr_error;
 	fattr->valid |= status;
 
+	status = -EIO;
+	if (unlikely(bitmap[0]))
+		goto xdr_error;
+
 	status = decode_attr_mode(xdr, bitmap, &fmode);
 	if (status < 0)
 		goto xdr_error;
@@ -4627,6 +4636,10 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
 		goto xdr_error;
 	fattr->valid |= status;
 
+	status = -EIO;
+	if (unlikely(bitmap[1]))
+		goto xdr_error;
+
 	status = decode_attr_mdsthreshold(xdr, bitmap, fattr->mdsthreshold);
 	if (status < 0)
 		goto xdr_error;
@@ -4811,12 +4824,22 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
 	if ((status = decode_attr_maxwrite(xdr, bitmap, &fsinfo->wtmax)) != 0)
 		goto xdr_error;
 	fsinfo->wtpref = fsinfo->wtmax;
+
+	status = -EIO;
+	if (unlikely(bitmap[0]))
+		goto xdr_error;
+
 	status = decode_attr_time_delta(xdr, bitmap, &fsinfo->time_delta);
 	if (status != 0)
 		goto xdr_error;
 	status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype);
 	if (status != 0)
 		goto xdr_error;
+
+	status = -EIO;
+	if (unlikely(bitmap[1]))
+		goto xdr_error;
+
 	status = decode_attr_layout_blksize(xdr, bitmap, &fsinfo->blksize);
 	if (status)
 		goto xdr_error;
-- 
cgit v1.1


From c36b97e9430defab4b52ee370c18745343d92f6d Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Thu, 29 Oct 2015 10:03:41 -0500
Subject: GFS2: Protect freeing directory hash table with i_lock spin_lock

This patch changes function gfs2_dir_hash_inval so it uses the
i_lock spin_lock to protect the in-core hash table, i_hash_cache.
This will prevent double-frees due to a race between gfs2_evict_inode
and inode invalidation.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
---
 fs/gfs2/dir.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 487527b..ad8a5b7 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -388,8 +388,13 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip)
  */
 void gfs2_dir_hash_inval(struct gfs2_inode *ip)
 {
-	__be64 *hc = ip->i_hash_cache;
+	__be64 *hc;
+
+	spin_lock(&ip->i_inode.i_lock);
+	hc = ip->i_hash_cache;
 	ip->i_hash_cache = NULL;
+	spin_unlock(&ip->i_inode.i_lock);
+
 	kvfree(hc);
 }
 
-- 
cgit v1.1


From d227c3ae4e94e5eb11dd780a811f59e1a7b74ccd Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 4 Nov 2015 23:33:17 +0100
Subject: tracefs: Fix refcount imbalance in start_creating()

In tracefs' start_creating(), we pin the file system to safely access
its root. When we failed to create a file, we unpin the file system via
failed_creating() to release the mount count and eventually the reference
of the singleton vfsmount.

However, when we run into an error during lookup_one_len() when still
in start_creating(), we only release the parent's mutex but not so the
reference on the mount.

F.e., in securityfs_create_file(), after doing simple_pin_fs() when
lookup_one_len() fails there, we infact do simple_release_fs(). This
seems necessary here as well.

Same issue seen in debugfs due to 190afd81e4a5 ("debugfs: split the
beginning and the end of __create_file() off"), which seemed to got
carried over into tracefs, too. Noticed during code review.

Link: http://lkml.kernel.org/r/68efa86101b778cf7517ed7c6ad573bd69f60ec6.1446672850.git.daniel@iogearbox.net

Fixes: 4282d60689d4 ("tracefs: Add new tracefs file system")
Cc: stable@vger.kernel.org # 4.1+
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 fs/tracefs/inode.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index cbc8d5d..c66f242 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -340,8 +340,12 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
 		dput(dentry);
 		dentry = ERR_PTR(-EEXIST);
 	}
-	if (IS_ERR(dentry))
+
+	if (IS_ERR(dentry)) {
 		mutex_unlock(&parent->d_inode->i_mutex);
+		simple_release_fs(&tracefs_mount, &tracefs_mount_count);
+	}
+
 	return dentry;
 }
 
-- 
cgit v1.1


From 9c9464cc92668984ebed79e22b5063877a8d97db Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Wed, 4 Nov 2015 09:52:04 +0000
Subject: Btrfs: fix extent accounting for partial direct IO writes

When doing a write using direct IO we can end up not doing the whole write
operation using the direct IO path, in that case we fallback to a buffered
write to do the remaining IO. This happens for example if the range we are
writing to contains a compressed extent.
When we do a partial write and fallback to buffered IO, due to the
existence of a compressed extent for example, we end up not adjusting the
outstanding extents counter of our inode which ends up getting decremented
twice, once by the DIO ordered extent for the partial write and once again
by btrfs_direct_IO(), resulting in an arithmetic underflow at
extent-tree.c:drop_outstanding_extent(). For example if we have:

  extents        [ prealloc extent ] [ compressed extent ]
  offsets        A        B          C       D           E

and at the moment our inode's outstanding extents counter is 0, if we do a
direct IO write against the range [B, D[ (which has a length smaller than
128Mb), we end up bumping our inode's outstanding extents counter to 1, we
create a DIO ordered extent for the range [B, C[ and then fallback to a
buffered write for the range [C, D[. The direct IO handler
(inode.c:btrfs_direct_IO()) decrements the outstanding extents counter by
1, leaving it with a value of 0, through a call to
btrfs_delalloc_release_space() and then shortly after the DIO ordered
extent finishes and calls btrfs_delalloc_release_metadata() which ends
up to attempt to decrement the inode's outstanding extents counter by 1,
resulting in an assertion failure at drop_outstanding_extent() because
the operation would result in an arithmetic underflow (0 - 1). This
produces the following trace:

  [125471.336838] BTRFS: assertion failed: BTRFS_I(inode)->outstanding_extents >= num_extents, file: fs/btrfs/extent-tree.c, line: 5526
  [125471.338844] ------------[ cut here ]------------
  [125471.340745] kernel BUG at fs/btrfs/ctree.h:4173!
  [125471.340745] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
  [125471.340745] Modules linked in: btrfs f2fs xfs libcrc32c dm_flakey dm_mod crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse parport_pc acpi_cpufreq psmouse i2c_piix4 parport pcspkr serio_raw microcode processor evdev i2c_core button ext4 crc16 jbd2 mbcache sd_mod sg sr_mod cdrom ata_generic virtio_scsi ata_piix virtio_pci virtio_ring floppy libata virtio e1000 scsi_mod [last unloaded: btrfs]
  [125471.340745] CPU: 10 PID: 23649 Comm: kworker/u32:1 Tainted: G        W       4.3.0-rc5-btrfs-next-17+ #1
  [125471.340745] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014
  [125471.340745] Workqueue: btrfs-endio-write btrfs_endio_write_helper [btrfs]
  [125471.340745] task: ffff8804244fcf80 ti: ffff88040a118000 task.ti: ffff88040a118000
  [125471.340745] RIP: 0010:[<ffffffffa0550da1>]  [<ffffffffa0550da1>] assfail.constprop.46+0x1e/0x20 [btrfs]
  [125471.340745] RSP: 0018:ffff88040a11bc78  EFLAGS: 00010296
  [125471.340745] RAX: 0000000000000075 RBX: 0000000000005000 RCX: 0000000000000000
  [125471.340745] RDX: ffffffff81098f93 RSI: ffffffff8147c619 RDI: 00000000ffffffff
  [125471.340745] RBP: ffff88040a11bc78 R08: 0000000000000001 R09: 0000000000000000
  [125471.340745] R10: ffff88040a11bc08 R11: ffffffff81651000 R12: ffff8803efb4a000
  [125471.340745] R13: ffff8803efb4a000 R14: 0000000000000000 R15: ffff8802f8e33c88
  [125471.340745] FS:  0000000000000000(0000) GS:ffff88043dd40000(0000) knlGS:0000000000000000
  [125471.340745] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
  [125471.340745] CR2: 00007fae7ca86095 CR3: 0000000001a0b000 CR4: 00000000000006e0
  [125471.340745] Stack:
  [125471.340745]  ffff88040a11bc88 ffffffffa04ca0cd ffff88040a11bcc8 ffffffffa04ceeb1
  [125471.340745]  ffff8802f8e33940 ffff8802c93eadb0 ffff8802f8e0bf50 ffff8803efb4a000
  [125471.340745]  0000000000000000 ffff8802f8e33c88 ffff88040a11bd38 ffffffffa04eccfa
  [125471.340745] Call Trace:
  [125471.340745]  [<ffffffffa04ca0cd>] drop_outstanding_extent+0x3d/0x6d [btrfs]
  [125471.340745]  [<ffffffffa04ceeb1>] btrfs_delalloc_release_metadata+0x51/0xdd [btrfs]
  [125471.340745]  [<ffffffffa04eccfa>] btrfs_finish_ordered_io+0x420/0x4eb [btrfs]
  [125471.340745]  [<ffffffffa04ecdda>] finish_ordered_fn+0x15/0x17 [btrfs]
  [125471.340745]  [<ffffffffa050e6e8>] normal_work_helper+0x14c/0x32a [btrfs]
  [125471.340745]  [<ffffffffa050e9c8>] btrfs_endio_write_helper+0x12/0x14 [btrfs]
  [125471.340745]  [<ffffffff81063b23>] process_one_work+0x24a/0x4ac
  [125471.340745]  [<ffffffff81064285>] worker_thread+0x206/0x2c2
  [125471.340745]  [<ffffffff8106407f>] ? rescuer_thread+0x2cb/0x2cb
  [125471.340745]  [<ffffffff8106407f>] ? rescuer_thread+0x2cb/0x2cb
  [125471.340745]  [<ffffffff8106904d>] kthread+0xef/0xf7
  [125471.340745]  [<ffffffff81068f5e>] ? kthread_parkme+0x24/0x24
  [125471.340745]  [<ffffffff8147d10f>] ret_from_fork+0x3f/0x70
  [125471.340745]  [<ffffffff81068f5e>] ? kthread_parkme+0x24/0x24
  [125471.340745] Code: a5 55 a0 48 89 e5 e8 42 50 bc e0 0f 0b 55 89 f1 48 c7 c2 f0 a8 55 a0 48 89 fe 31 c0 48 c7 c7 14 aa 55 a0 48 89 e5 e8 22 50 bc e0 <0f> 0b 0f 1f 44 00 00 55 31 c9 ba 18 00 00 00 48 89 e5 41 56 41
  [125471.340745] RIP  [<ffffffffa0550da1>] assfail.constprop.46+0x1e/0x20 [btrfs]
  [125471.340745]  RSP <ffff88040a11bc78>
  [125471.539620] ---[ end trace 144259f7838b4aa4 ]---

So fix this by ensuring we adjust the outstanding extents counter when we
do the fallback just like we do for the case where the whole write can be
done through the direct IO path.

We were also adjusting the outstanding extents counter by a constant value
of 1, which is incorrect because we were ignorning that we account extents
in BTRFS_MAX_EXTENT_SIZE units, o fix that as well.

The following test case for fstests reproduces this issue:

  seq=`basename $0`
  seqres=$RESULT_DIR/$seq
  echo "QA output created by $seq"
  tmp=/tmp/$$
  status=1	# failure is the default!
  trap "_cleanup; exit \$status" 0 1 2 3 15

  _cleanup()
  {
      rm -f $tmp.*
  }

  # get standard environment, filters and checks
  . ./common/rc
  . ./common/filter

  # real QA test starts here
  _need_to_be_root
  _supported_fs btrfs
  _supported_os Linux
  _require_scratch
  _require_xfs_io_command "falloc"

  rm -f $seqres.full

  _scratch_mkfs >>$seqres.full 2>&1
  _scratch_mount "-o compress"

  # Create a compressed extent covering the range [700K, 800K[.
  $XFS_IO_PROG -f -s -c "pwrite -S 0xaa -b 100K 700K 100K" \
      $SCRATCH_MNT/foo | _filter_xfs_io

  # Create prealloc extent covering the range [600K, 700K[.
  $XFS_IO_PROG -c "falloc 600K 100K" $SCRATCH_MNT/foo

  # Write 80K of data to the range [640K, 720K[ using direct IO. This
  # range covers both the prealloc extent and the compressed extent.
  # Because there's a compressed extent in the range we are writing to,
  # the DIO write code path ends up only writing the first 60k of data,
  # which goes to the prealloc extent, and then falls back to buffered IO
  # for writing the remaining 20K of data - because that remaining data
  # maps to a file range containing a compressed extent.
  # When falling back to buffered IO, we used to trigger an assertion when
  # releasing reserved space due to bad accounting of the inode's
  # outstanding extents counter, which was set to 1 but we ended up
  # decrementing it by 1 twice, once through the ordered extent for the
  # 60K of data we wrote using direct IO, and once through the main direct
  # IO handler (inode.cbtrfs_direct_IO()) because the direct IO write
  # wrote less than 80K of data (60K).
  $XFS_IO_PROG -d -c "pwrite -S 0xbb -b 80K 640K 80K" \
      $SCRATCH_MNT/foo | _filter_xfs_io

  # Now similar test as above but for very large write operations. This
  # triggers special cases for an inode's outstanding extents accounting,
  # as internally btrfs logically splits extents into 128Mb units.
  $XFS_IO_PROG -f -s \
      -c "pwrite -S 0xaa -b 128M 258M 128M" \
      -c "falloc 0 258M" \
      $SCRATCH_MNT/bar | _filter_xfs_io
  $XFS_IO_PROG -d -c "pwrite -S 0xbb -b 256M 3M 256M" $SCRATCH_MNT/bar \
      | _filter_xfs_io

  # Now verify the file contents are correct and that they are the same
  # even after unmounting and mounting the fs again (or evicting the page
  # cache).
  #
  # For file foo, all bytes in the range [0, 640K[ must have a value of
  # 0x00, all bytes in the range [640K, 720K[ must have a value of 0xbb
  # and all bytes in the range [720K, 800K[ must have a value of 0xaa.
  #
  # For file bar, all bytes in the range [0, 3M[ must havea value of 0x00,
  # all bytes in the range [3M, 259M[ must have a value of 0xbb and all
  # bytes in the range [259M, 386M[ must have a value of 0xaa.
  #
  echo "File digests before remounting the file system:"
  md5sum $SCRATCH_MNT/foo | _filter_scratch
  md5sum $SCRATCH_MNT/bar | _filter_scratch
  _scratch_remount
  echo "File digests after remounting the file system:"
  md5sum $SCRATCH_MNT/foo | _filter_scratch
  md5sum $SCRATCH_MNT/bar | _filter_scratch

  status=0
  exit

Fixes: e1cbbfa5f5aa ("Btrfs: fix outstanding_extents accounting in DIO")
Fixes: 3e05bde8c3c2 ("Btrfs: only adjust outstanding_extents when we do a short write")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
---
 fs/btrfs/inode.c | 52 +++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 37 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4439fbb..6138eea 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7506,6 +7506,28 @@ struct btrfs_dio_data {
 	u64 reserve;
 };
 
+static void adjust_dio_outstanding_extents(struct inode *inode,
+					   struct btrfs_dio_data *dio_data,
+					   const u64 len)
+{
+	unsigned num_extents;
+
+	num_extents = (unsigned) div64_u64(len + BTRFS_MAX_EXTENT_SIZE - 1,
+					   BTRFS_MAX_EXTENT_SIZE);
+	/*
+	 * If we have an outstanding_extents count still set then we're
+	 * within our reservation, otherwise we need to adjust our inode
+	 * counter appropriately.
+	 */
+	if (dio_data->outstanding_extents) {
+		dio_data->outstanding_extents -= num_extents;
+	} else {
+		spin_lock(&BTRFS_I(inode)->lock);
+		BTRFS_I(inode)->outstanding_extents += num_extents;
+		spin_unlock(&BTRFS_I(inode)->lock);
+	}
+}
+
 static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 				   struct buffer_head *bh_result, int create)
 {
@@ -7541,8 +7563,11 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 	 * If this errors out it's because we couldn't invalidate pagecache for
 	 * this range and we need to fallback to buffered.
 	 */
-	if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create))
-		return -ENOTBLK;
+	if (lock_extent_direct(inode, lockstart, lockend, &cached_state,
+			       create)) {
+		ret = -ENOTBLK;
+		goto err;
+	}
 
 	em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
 	if (IS_ERR(em)) {
@@ -7660,19 +7685,7 @@ unlock:
 		if (start + len > i_size_read(inode))
 			i_size_write(inode, start + len);
 
-		/*
-		 * If we have an outstanding_extents count still set then we're
-		 * within our reservation, otherwise we need to adjust our inode
-		 * counter appropriately.
-		 */
-		if (dio_data->outstanding_extents) {
-			(dio_data->outstanding_extents)--;
-		} else {
-			spin_lock(&BTRFS_I(inode)->lock);
-			BTRFS_I(inode)->outstanding_extents++;
-			spin_unlock(&BTRFS_I(inode)->lock);
-		}
-
+		adjust_dio_outstanding_extents(inode, dio_data, len);
 		btrfs_free_reserved_data_space(inode, start, len);
 		WARN_ON(dio_data->reserve < len);
 		dio_data->reserve -= len;
@@ -7699,8 +7712,17 @@ unlock:
 unlock_err:
 	clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
 			 unlock_bits, 1, 0, &cached_state, GFP_NOFS);
+err:
 	if (dio_data)
 		current->journal_info = dio_data;
+	/*
+	 * Compensate the delalloc release we do in btrfs_direct_IO() when we
+	 * write less data then expected, so that we don't underflow our inode's
+	 * outstanding extents counter.
+	 */
+	if (create && dio_data)
+		adjust_dio_outstanding_extents(inode, dio_data, len);
+
 	return ret;
 }
 
-- 
cgit v1.1


From 7343dd61fd1b57c40cc06a5b5b5386df7f73c3ac Mon Sep 17 00:00:00 2001
From: Justin Maggard <jmaggard10@gmail.com>
Date: Wed, 4 Nov 2015 15:56:16 -0800
Subject: btrfs: qgroup: exit the rescan worker during umount

I was hitting a consistent NULL pointer dereference during shutdown that
showed the trace running through end_workqueue_bio().  I traced it back to
the endio_meta_workers workqueue being poked after it had already been
destroyed.

Eventually I found that the root cause was a qgroup rescan that was still
in progress while we were stopping all the btrfs workers.

Currently we explicitly pause balance and scrub operations in
close_ctree(), but we do nothing to stop the qgroup rescan.  We should
probably be doing the same for qgroup rescan, but that's a much larger
change.  This small change is good enough to allow me to unmount without
crashing.

Signed-off-by: Justin Maggard <jmaggard@netgear.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
---
 fs/btrfs/disk-io.c | 3 +++
 fs/btrfs/qgroup.c  | 9 ++++++---
 2 files changed, 9 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2d46675..1eb0839 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3780,6 +3780,9 @@ void close_ctree(struct btrfs_root *root)
 	fs_info->closing = 1;
 	smp_mb();
 
+	/* wait for the qgroup rescan worker to stop */
+	btrfs_qgroup_wait_for_completion(fs_info);
+
 	/* wait for the uuid_scan task to finish */
 	down(&fs_info->uuid_tree_rescan_sem);
 	/* avoid complains from lockdep et al., set sem back to initial state */
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 46476c2..75c0249 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2286,7 +2286,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
 		goto out;
 
 	err = 0;
-	while (!err) {
+	while (!err && !btrfs_fs_closing(fs_info)) {
 		trans = btrfs_start_transaction(fs_info->fs_root, 0);
 		if (IS_ERR(trans)) {
 			err = PTR_ERR(trans);
@@ -2307,7 +2307,8 @@ out:
 	btrfs_free_path(path);
 
 	mutex_lock(&fs_info->qgroup_rescan_lock);
-	fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+	if (!btrfs_fs_closing(fs_info))
+		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
 
 	if (err > 0 &&
 	    fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
@@ -2336,7 +2337,9 @@ out:
 	}
 	btrfs_end_transaction(trans, fs_info->quota_root);
 
-	if (err >= 0) {
+	if (btrfs_fs_closing(fs_info)) {
+		btrfs_info(fs_info, "qgroup scan paused");
+	} else if (err >= 0) {
 		btrfs_info(fs_info, "qgroup scan completed%s",
 			err > 0 ? " (inconsistency flag cleared)" : "");
 	} else {
-- 
cgit v1.1


From 190631f1c8cb7e4d5c27ff87e22ed53817e78759 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 5 Nov 2015 10:06:23 +0000
Subject: Btrfs: fix race waiting for qgroup rescan worker

We were initializing the completion (fs_info->qgroup_rescan_completion)
object after releasing the qgroup rescan lock, which gives a small time
window for a rescan waiter to not actually wait for the rescan worker
to finish. Example:

         CPU 1                                                     CPU 2

 fs_info->qgroup_rescan_completion->done is 0

 btrfs_qgroup_rescan_worker()
   complete_all(&fs_info->qgroup_rescan_completion)
     sets fs_info->qgroup_rescan_completion->done
     to UINT_MAX / 2

 ... do some other stuff ....

 qgroup_rescan_init()
   mutex_lock(&fs_info->qgroup_rescan_lock)
   set flag BTRFS_QGROUP_STATUS_FLAG_RESCAN
     in fs_info->qgroup_flags
   mutex_unlock(&fs_info->qgroup_rescan_lock)

                                                       btrfs_qgroup_wait_for_completion()
                                                         mutex_lock(&fs_info->qgroup_rescan_lock)
                                                         sees flag BTRFS_QGROUP_STATUS_FLAG_RESCAN
                                                           in fs_info->qgroup_flags
                                                         mutex_unlock(&fs_info->qgroup_rescan_lock)

                                                         wait_for_completion_interruptible(
                                                           &fs_info->qgroup_rescan_completion)

                                                           fs_info->qgroup_rescan_completion->done
                                                           is > 0 so it returns immediately

  init_completion(&fs_info->qgroup_rescan_completion)
    sets fs_info->qgroup_rescan_completion->done to 0

So fix this by initializing the completion object while holding the mutex
fs_info->qgroup_rescan_lock.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
---
 fs/btrfs/qgroup.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 75c0249..75bb4af9 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2387,12 +2387,11 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
 	memset(&fs_info->qgroup_rescan_progress, 0,
 		sizeof(fs_info->qgroup_rescan_progress));
 	fs_info->qgroup_rescan_progress.objectid = progress_objectid;
+	init_completion(&fs_info->qgroup_rescan_completion);
 
 	spin_unlock(&fs_info->qgroup_lock);
 	mutex_unlock(&fs_info->qgroup_rescan_lock);
 
-	init_completion(&fs_info->qgroup_rescan_completion);
-
 	memset(&fs_info->qgroup_rescan_work, 0,
 	       sizeof(fs_info->qgroup_rescan_work));
 	btrfs_init_work(&fs_info->qgroup_rescan_work,
-- 
cgit v1.1


From 3b2ba7b31d56c3d8f57cd5d32b8fb5101ab446e4 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 5 Nov 2015 10:51:37 +0000
Subject: Btrfs: fix sleeping inside atomic context in qgroup rescan worker

We are holding a btree path with spinning locks and then we attempt to
clone an extent buffer, which calls kmem_cache_alloc() and this function
can sleep, causing the following trace to be reported on a debug kernel:

[107118.218536] BUG: sleeping function called from invalid context at mm/slab.c:2871
[107118.224110] in_atomic(): 1, irqs_disabled(): 0, pid: 19148, name: kworker/u32:3
[107118.226120] INFO: lockdep is turned off.
[107118.226843] Preemption disabled at:[<ffffffffa05ffa22>] btrfs_clear_lock_blocking_rw+0x96/0xea [btrfs]

[107118.229175] CPU: 3 PID: 19148 Comm: kworker/u32:3 Tainted: G        W       4.3.0-rc5-btrfs-next-17+ #1
[107118.231326] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014
[107118.233687] Workqueue: btrfs-qgroup-rescan btrfs_qgroup_rescan_helper [btrfs]
[107118.236835]  0000000000000000 ffff880424bf3b78 ffffffff812566f4 0000000000000000
[107118.238369]  ffff880424bf3ba0 ffffffff81070664 ffffffff817f1cd5 0000000000000b37
[107118.239769]  0000000000000000 ffff880424bf3bc8 ffffffff8107070a 0000000000008850
[107118.241244] Call Trace:
[107118.241729]  [<ffffffff812566f4>] dump_stack+0x4e/0x79
[107118.242602]  [<ffffffff81070664>] ___might_sleep+0x23a/0x241
[107118.243586]  [<ffffffff8107070a>] __might_sleep+0x9f/0xa6
[107118.244532]  [<ffffffff8115af70>] cache_alloc_debugcheck_before+0x25/0x36
[107118.245939]  [<ffffffff8115d52b>] kmem_cache_alloc+0x50/0x215
[107118.246930]  [<ffffffffa05e627e>] __alloc_extent_buffer+0x2a/0x11f [btrfs]
[107118.248121]  [<ffffffffa05ecb1a>] btrfs_clone_extent_buffer+0x3d/0xdd [btrfs]
[107118.249451]  [<ffffffffa06239ea>] btrfs_qgroup_rescan_worker+0x16d/0x434 [btrfs]
[107118.250755]  [<ffffffff81087481>] ? arch_local_irq_save+0x9/0xc
[107118.251754]  [<ffffffffa05f7952>] normal_work_helper+0x14c/0x32a [btrfs]
[107118.252899]  [<ffffffffa05f7952>] ? normal_work_helper+0x14c/0x32a [btrfs]
[107118.254195]  [<ffffffffa05f7c82>] btrfs_qgroup_rescan_helper+0x12/0x14 [btrfs]
[107118.255436]  [<ffffffff81063b23>] process_one_work+0x24a/0x4ac
[107118.263690]  [<ffffffff81064285>] worker_thread+0x206/0x2c2
[107118.264888]  [<ffffffff8106407f>] ? rescuer_thread+0x2cb/0x2cb
[107118.267413]  [<ffffffff8106904d>] kthread+0xef/0xf7
[107118.268417]  [<ffffffff81068f5e>] ? kthread_parkme+0x24/0x24
[107118.269505]  [<ffffffff8147d10f>] ret_from_fork+0x3f/0x70
[107118.270491]  [<ffffffff81068f5e>] ? kthread_parkme+0x24/0x24

So just use blocking locks for our path to solve this.
This fixes the patch titled:
  "btrfs: qgroup: Don't copy extent buffer to do qgroup rescan"

Signed-off-by: Filipe Manana <fdmanana@suse.com>
---
 fs/btrfs/qgroup.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 75bb4af9..93e12c1 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2198,7 +2198,6 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
 	int slot;
 	int ret;
 
-	path->leave_spinning = 1;
 	mutex_lock(&fs_info->qgroup_rescan_lock);
 	ret = btrfs_search_slot_for_read(fs_info->extent_root,
 					 &fs_info->qgroup_rescan_progress,
-- 
cgit v1.1


From 6933599697c96c3213c95f5f1fc7cb6abfd08c54 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Thu, 5 Nov 2015 18:43:43 -0800
Subject: inotify: hide internal kernel bits from fdinfo

There was a report that my patch:

    inotify: actually check for invalid bits in sys_inotify_add_watch()

broke CRIU.

The reason is that CRIU looks up raw flags in /proc/$pid/fdinfo/* to
figure out how to rebuild inotify watches and then passes those flags
directly back in to the inotify API.  One of those flags
(FS_EVENT_ON_CHILD) is set in mark->mask, but is not part of the inotify
API.  It is used inside the kernel to _implement_ inotify but it is not
and has never been part of the API.

My patch above ensured that we only allow bits which are part of the API
(IN_ALL_EVENTS).  This broke CRIU.

FS_EVENT_ON_CHILD is really internal to the kernel.  It is set _anyway_ on
all inotify marks.  So, CRIU was really just trying to set a bit that was
already set.

This patch hides that bit from fdinfo.  CRIU will not see the bit, not try
to set it, and should work as before.  We should not have been exposing
this bit in the first place, so this is a good patch independent of the
CRIU problem.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reported-by: Andrey Wagin <avagin@gmail.com>
Acked-by: Andrey Vagin <avagin@openvz.org>
Acked-by: Cyrill Gorcunov <gorcunov@openvz.org>
Acked-by: Eric Paris <eparis@redhat.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: John McCutchan <john@johnmccutchan.com>
Cc: Robert Love <rlove@rlove.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/notify/fdinfo.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index 6b6f0d47..fd98e51 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -83,9 +83,16 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
 	inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark);
 	inode = igrab(mark->inode);
 	if (inode) {
+		/*
+		 * IN_ALL_EVENTS represents all of the mask bits
+		 * that we expose to userspace.  There is at
+		 * least one bit (FS_EVENT_ON_CHILD) which is
+		 * used only internally to the kernel.
+		 */
+		u32 mask = mark->mask & IN_ALL_EVENTS;
 		seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:%x ",
 			   inode_mark->wd, inode->i_ino, inode->i_sb->s_dev,
-			   mark->mask, mark->ignored_mask);
+			   mask, mark->ignored_mask);
 		show_mark_fhandle(m, inode);
 		seq_putc(m, '\n');
 		iput(inode);
-- 
cgit v1.1


From d30e2c05a1a231452c273e74851d6b70d516f7f2 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Thu, 5 Nov 2015 18:43:46 -0800
Subject: inotify: actually check for invalid bits in sys_inotify_add_watch()

The comment here says that it is checking for invalid bits.  But, the mask
is *actually* checking to ensure that _any_ valid bit is set, which is
quite different.

Without this check, an unexpected bit could get set on an inotify object.
Since these bits are also interpreted by the fsnotify/dnotify code, there
is the potential for an object to be mishandled inside the kernel.  For
instance, can we be sure that setting the dnotify flag FS_DN_RENAME on an
inotify watch is harmless?

Add the actual check which was intended.  Retain the existing inotify bits
are being added to the watch.  Plus, this is existing behavior which would
be nice to preserve.

I did a quick sniff test that inotify functions and that my
'inotify-tools' package passes 'make check'.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: John McCutchan <john@johnmccutchan.com>
Cc: Robert Love <rlove@rlove.org>
Cc: Eric Paris <eparis@parisplace.org>
Cc: Josh Boyer <jwboyer@fedoraproject.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/notify/inotify/inotify_user.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 5b1e2a4..b8d08d0 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -706,7 +706,19 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
 	int ret;
 	unsigned flags = 0;
 
-	/* don't allow invalid bits: we don't want flags set */
+	/*
+	 * We share a lot of code with fs/dnotify.  We also share
+	 * the bit layout between inotify's IN_* and the fsnotify
+	 * FS_*.  This check ensures that only the inotify IN_*
+	 * bits get passed in and set in watches/events.
+	 */
+	if (unlikely(mask & ~ALL_INOTIFY_BITS))
+		return -EINVAL;
+	/*
+	 * Require at least one valid bit set in the mask.
+	 * Without _something_ set, we would have no events to
+	 * watch for.
+	 */
 	if (unlikely(!(mask & ALL_INOTIFY_BITS)))
 		return -EINVAL;
 
-- 
cgit v1.1


From ce4f2fd7eacd84d78530fb97621621ae50d167f1 Mon Sep 17 00:00:00 2001
From: Sudip Mukherjee <sudipm.mukherjee@gmail.com>
Date: Thu, 5 Nov 2015 18:43:49 -0800
Subject: logfs: fix build warning

fs/logfs/dev_bdev.c: In function '__bdev_writeseg':
include/linux/kernel.h:601:17: warning: comparison of distinct pointer types lacks a cast [enabled by default]
  (void) (&_min1 == &_min2);  \
fs/logfs/dev_bdev.c:84:14: note: in  expansion of macro 'min'
  max_pages = min(nr_pages, BIO_MAX_PAGES);

fs/logfs/dev_bdev.c: In function 'do_erase':
include/linux/kernel.h:601:17: warning: comparison of distinct pointer types lacks a cast [enabled by default]
 (void) (&_min1 == &_min2);  \
fs/logfs/dev_bdev.c:174:14: note: in expansion of macro 'min'
 max_pages = min(nr_pages, BIO_MAX_PAGES);

Lets use min_t and mention the type.

Signed-off-by: Sudip Mukherjee <sudip@vectorindia.org>
Cc: Joern Engel <joern@logfs.org>
Cc: Prasad Joshi <prasadjoshi.linux@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/logfs/dev_bdev.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index a7fdbd8..a709d80 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -81,7 +81,7 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
 	unsigned int max_pages;
 	int i;
 
-	max_pages = min(nr_pages, BIO_MAX_PAGES);
+	max_pages = min_t(size_t, nr_pages, BIO_MAX_PAGES);
 
 	bio = bio_alloc(GFP_NOFS, max_pages);
 	BUG_ON(!bio);
@@ -171,7 +171,7 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
 	unsigned int max_pages;
 	int i;
 
-	max_pages = min(nr_pages, BIO_MAX_PAGES);
+	max_pages = min_t(size_t, nr_pages, BIO_MAX_PAGES);
 
 	bio = bio_alloc(GFP_NOFS, max_pages);
 	BUG_ON(!bio);
-- 
cgit v1.1


From d162eaad7726e5f10a2b5813bdfac9d55c8eba69 Mon Sep 17 00:00:00 2001
From: "Norton.Zhu" <norton.zhu@huawei.com>
Date: Thu, 5 Nov 2015 18:43:52 -0800
Subject: ocfs2_direct_IO_write() misses ocfs2_is_overwrite() error code

If ocfs2_is_overwrite failed, ocfs2_direct_IO_write mays till return
success to the caller.

Signed-off-by: Norton.Zhu <norton.zhu@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/aops.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 64b11d9..f04914c 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -864,6 +864,7 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
 		is_overwrite = ocfs2_is_overwrite(osb, inode, offset);
 		if (is_overwrite < 0) {
 			mlog_errno(is_overwrite);
+			ret = is_overwrite;
 			ocfs2_inode_unlock(inode, 1);
 			goto clean_orphan;
 		}
-- 
cgit v1.1


From 4e357b932a665e62eb0642633057b8d7156ed8db Mon Sep 17 00:00:00 2001
From: jiangyiwen <jiangyiwen@huawei.com>
Date: Thu, 5 Nov 2015 18:43:55 -0800
Subject: ocfs2: fill in the unused portion of the block with zeros by
 dio_zero_block()

A simplified test case is (this case from Ryan):
1) dd if=/dev/zero of=/mnt/hello bs=512 count=1 oflag=direct;
2) truncate /mnt/hello -s 2097152
file 'hello' is not exist before test. After this command,
file 'hello' should be all zero. But 512~4096 is some random data.

Setting bh state to new when get a new block, if so,
direct_io_worker()->dio_zero_block() will fill-in the unused portion
of the block with zero.

Signed-off-by: Yiwen Jiang <jiangyiwen@huawei.com>
Reviewed-by: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/aops.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index f04914c..7f60472 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -589,6 +589,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
 			ret = -EIO;
 			goto bail;
 		}
+		set_buffer_new(bh_result);
 		up_write(&OCFS2_I(inode)->ip_alloc_sem);
 	}
 
-- 
cgit v1.1


From 1d1aff8cf367d2216a678c722161784e207965c4 Mon Sep 17 00:00:00 2001
From: Joseph Qi <joseph.qi@huawei.com>
Date: Thu, 5 Nov 2015 18:43:58 -0800
Subject: ocfs2: improve performance for localalloc

Currently cluster allocation is always trying to find a victim chain (a
chian has most space), and this may lead to poor performance because of
discontiguous allocation in some scenarios.

Our test case is block size 4k, cluster size 1M and mount option with
localalloc=2048 (2G), since a gd is 32256M (about 31.5G) and a localalloc
window is only 2G, creating 50G file will result in 2G from gd0, 2G from
gd1, ...

One way to improve performance is enlarge localalloc window size (max
31104M), but this will make end user feel that about 30G is suddenly
"missing", and localalloc currently do not support steal, which means one
node cannot use another node's localalloc even it is not used in fact.  So
using the last gd to record the allocation and continues with the gd if it
has enough space for a localalloc window can make the allocation as more
contiguous as possible.

Our test result is below (evaluated in IOPS), which is using iometer
running in VM, dynamic vhd virtual disk stored in ocfs2.

IO model                Original   After   Improved(%)
16K60%Write100%Random     703       876     24.59%
8K90%Write100%Random      735       827     12.59%
4K100%Write100%Random     859       915      6.52%
4K100%Read100%Random     2092      2600     24.30%

Signed-off-by: Joseph Qi <joseph.qi@huawei.com>
Tested-by: Norton Zhu <norton.zhu@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/suballoc.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index d83d260..fc6d25f 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1920,7 +1920,10 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
 	status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
 				    res, &bits_left);
 	if (!status) {
-		hint = ocfs2_group_from_res(res);
+		if (ocfs2_is_cluster_bitmap(ac->ac_inode))
+			hint = res->sr_bg_blkno;
+		else
+			hint = ocfs2_group_from_res(res);
 		goto set_hint;
 	}
 	if (status < 0 && status != -ENOSPC) {
-- 
cgit v1.1


From 30edc43c7ff0760f6896c37c06a84533546588fa Mon Sep 17 00:00:00 2001
From: Joseph Qi <joseph.qi@huawei.com>
Date: Thu, 5 Nov 2015 18:44:01 -0800
Subject: ocfs2: do not include dio entry in case of orphan scan

dio entry will only do truncate in case of ORPHAN_NEED_TRUNCATE. So do
not include it when doing normal orphan scan to reduce contention.

Signed-off-by: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/journal.c | 15 ++++++++++++---
 fs/ocfs2/namei.c   |  2 --
 fs/ocfs2/namei.h   |  3 +++
 3 files changed, 15 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index ff82b28..4bac600 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -2021,6 +2021,7 @@ struct ocfs2_orphan_filldir_priv {
 	struct dir_context	ctx;
 	struct inode		*head;
 	struct ocfs2_super	*osb;
+	enum ocfs2_orphan_reco_type orphan_reco_type;
 };
 
 static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
@@ -2036,6 +2037,12 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
 	if (name_len == 2 && !strncmp("..", name, 2))
 		return 0;
 
+	/* do not include dio entry in case of orphan scan */
+	if ((p->orphan_reco_type == ORPHAN_NO_NEED_TRUNCATE) &&
+			(!strncmp(name, OCFS2_DIO_ORPHAN_PREFIX,
+			OCFS2_DIO_ORPHAN_PREFIX_LEN)))
+		return 0;
+
 	/* Skip bad inodes so that recovery can continue */
 	iter = ocfs2_iget(p->osb, ino,
 			  OCFS2_FI_FLAG_ORPHAN_RECOVERY, 0);
@@ -2060,14 +2067,16 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
 
 static int ocfs2_queue_orphans(struct ocfs2_super *osb,
 			       int slot,
-			       struct inode **head)
+			       struct inode **head,
+			       enum ocfs2_orphan_reco_type orphan_reco_type)
 {
 	int status;
 	struct inode *orphan_dir_inode = NULL;
 	struct ocfs2_orphan_filldir_priv priv = {
 		.ctx.actor = ocfs2_orphan_filldir,
 		.osb = osb,
-		.head = *head
+		.head = *head,
+		.orphan_reco_type = orphan_reco_type
 	};
 
 	orphan_dir_inode = ocfs2_get_system_file_inode(osb,
@@ -2170,7 +2179,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
 	trace_ocfs2_recover_orphans(slot);
 
 	ocfs2_mark_recovering_orphan_dir(osb, slot);
-	ret = ocfs2_queue_orphans(osb, slot, &inode);
+	ret = ocfs2_queue_orphans(osb, slot, &inode, orphan_reco_type);
 	ocfs2_clear_recovering_orphan_dir(osb, slot);
 
 	/* Error here should be noted, but we want to continue with as
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index b7dfac2..1206392 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -106,8 +106,6 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
 static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2);
 /* An orphan dir name is an 8 byte value, printed as a hex string */
 #define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64)))
-#define OCFS2_DIO_ORPHAN_PREFIX "dio-"
-#define OCFS2_DIO_ORPHAN_PREFIX_LEN 4
 
 static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
 				   unsigned int flags)
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
index e173329..1155918 100644
--- a/fs/ocfs2/namei.h
+++ b/fs/ocfs2/namei.h
@@ -26,6 +26,9 @@
 #ifndef OCFS2_NAMEI_H
 #define OCFS2_NAMEI_H
 
+#define OCFS2_DIO_ORPHAN_PREFIX "dio-"
+#define OCFS2_DIO_ORPHAN_PREFIX_LEN 4
+
 extern const struct inode_operations ocfs2_dir_iops;
 
 struct dentry *ocfs2_get_parent(struct dentry *child);
-- 
cgit v1.1


From 93d911fcce259a3f950ee20592beee31b855cd96 Mon Sep 17 00:00:00 2001
From: Joseph Qi <joseph.qi@huawei.com>
Date: Thu, 5 Nov 2015 18:44:04 -0800
Subject: ocfs2: only take lock if dio entry when recover orphans

We have no need to take inode mutex, rw and inode lock if it is not dio
entry when recover orphans.  Optimize it by adding a flag
OCFS2_INODE_DIO_ORPHAN_ENTRY to ocfs2_inode_info to reduce contention.

Signed-off-by: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/inode.h   |  2 ++
 fs/ocfs2/journal.c | 86 +++++++++++++++++++++++++++++-------------------------
 2 files changed, 49 insertions(+), 39 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index ca3431e..aac8b86 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -112,6 +112,8 @@ struct ocfs2_inode_info
 #define OCFS2_INODE_OPEN_DIRECT		0x00000020
 /* Tell the inode wipe code it's not in orphan dir */
 #define OCFS2_INODE_SKIP_ORPHAN_DIR     0x00000040
+/* Entry in orphan dir with 'dio-' prefix */
+#define OCFS2_INODE_DIO_ORPHAN_ENTRY	0x00000080
 
 static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
 {
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 4bac600..04d6303 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -2049,6 +2049,10 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
 	if (IS_ERR(iter))
 		return 0;
 
+	if (!strncmp(name, OCFS2_DIO_ORPHAN_PREFIX,
+			OCFS2_DIO_ORPHAN_PREFIX_LEN))
+		OCFS2_I(iter)->ip_flags |= OCFS2_INODE_DIO_ORPHAN_ENTRY;
+
 	/* Skip inodes which are already added to recover list, since dio may
 	 * happen concurrently with unlink/rename */
 	if (OCFS2_I(iter)->ip_next_orphan) {
@@ -2195,25 +2199,51 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
 		iter = oi->ip_next_orphan;
 		oi->ip_next_orphan = NULL;
 
-		mutex_lock(&inode->i_mutex);
-		ret = ocfs2_rw_lock(inode, 1);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto next;
-		}
-		/*
-		 * We need to take and drop the inode lock to
-		 * force read inode from disk.
-		 */
-		ret = ocfs2_inode_lock(inode, &di_bh, 1);
-		if (ret) {
-			mlog_errno(ret);
-			goto unlock_rw;
-		}
+		if (oi->ip_flags & OCFS2_INODE_DIO_ORPHAN_ENTRY) {
+			mutex_lock(&inode->i_mutex);
+			ret = ocfs2_rw_lock(inode, 1);
+			if (ret < 0) {
+				mlog_errno(ret);
+				goto unlock_mutex;
+			}
+			/*
+			 * We need to take and drop the inode lock to
+			 * force read inode from disk.
+			 */
+			ret = ocfs2_inode_lock(inode, &di_bh, 1);
+			if (ret) {
+				mlog_errno(ret);
+				goto unlock_rw;
+			}
 
-		di = (struct ocfs2_dinode *)di_bh->b_data;
+			di = (struct ocfs2_dinode *)di_bh->b_data;
 
-		if (inode->i_nlink == 0) {
+			if (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL)) {
+				ret = ocfs2_truncate_file(inode, di_bh,
+						i_size_read(inode));
+				if (ret < 0) {
+					if (ret != -ENOSPC)
+						mlog_errno(ret);
+					goto unlock_inode;
+				}
+
+				ret = ocfs2_del_inode_from_orphan(osb, inode,
+						di_bh, 0, 0);
+				if (ret)
+					mlog_errno(ret);
+			}
+unlock_inode:
+			ocfs2_inode_unlock(inode, 1);
+			brelse(di_bh);
+			di_bh = NULL;
+unlock_rw:
+			ocfs2_rw_unlock(inode, 1);
+unlock_mutex:
+			mutex_unlock(&inode->i_mutex);
+
+			/* clear dio flag in ocfs2_inode_info */
+			oi->ip_flags &= ~OCFS2_INODE_DIO_ORPHAN_ENTRY;
+		} else {
 			spin_lock(&oi->ip_lock);
 			/* Set the proper information to get us going into
 			 * ocfs2_delete_inode. */
@@ -2221,28 +2251,6 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
 			spin_unlock(&oi->ip_lock);
 		}
 
-		if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) &&
-				(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
-			ret = ocfs2_truncate_file(inode, di_bh,
-					i_size_read(inode));
-			if (ret < 0) {
-				if (ret != -ENOSPC)
-					mlog_errno(ret);
-				goto unlock_inode;
-			}
-
-			ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0);
-			if (ret)
-				mlog_errno(ret);
-		} /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */
-unlock_inode:
-		ocfs2_inode_unlock(inode, 1);
-		brelse(di_bh);
-		di_bh = NULL;
-unlock_rw:
-		ocfs2_rw_unlock(inode, 1);
-next:
-		mutex_unlock(&inode->i_mutex);
 		iput(inode);
 		inode = iter;
 	}
-- 
cgit v1.1


From 0986fe9b50f425ec81f25a1a85aaf3574b31d801 Mon Sep 17 00:00:00 2001
From: Joseph Qi <joseph.qi@huawei.com>
Date: Thu, 5 Nov 2015 18:44:07 -0800
Subject: ocfs2: fix race between mount and delete node/cluster

There is a race case between mount and delete node/cluster, which will
lead o2hb_thread to malfunctioning dead loop.

    o2hb_thread
    {
        o2nm_depend_this_node();
        <<<<<< race window, node may have already been deleted, and then
               enter the loop, o2hb thread will be malfunctioning
               because of no configured nodes found.
        while (!kthread_should_stop() &&
               !reg->hr_unclean_stop && !reg->hr_aborted_start) {
    }

So check the return value of o2nm_depend_this_node() is needed.  If node
has been deleted, do not enter the loop and let mount fail.

Signed-off-by: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/cluster/heartbeat.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index fa15deb..ddddef0 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -219,7 +219,8 @@ struct o2hb_region {
 	unsigned		hr_unclean_stop:1,
 				hr_aborted_start:1,
 				hr_item_pinned:1,
-				hr_item_dropped:1;
+				hr_item_dropped:1,
+				hr_node_deleted:1;
 
 	/* protected by the hr_callback_sem */
 	struct task_struct 	*hr_task;
@@ -1078,7 +1079,13 @@ static int o2hb_thread(void *data)
 	set_user_nice(current, MIN_NICE);
 
 	/* Pin node */
-	o2nm_depend_this_node();
+	ret = o2nm_depend_this_node();
+	if (ret) {
+		mlog(ML_ERROR, "Node has been deleted, ret = %d\n", ret);
+		reg->hr_node_deleted = 1;
+		wake_up(&o2hb_steady_queue);
+		return 0;
+	}
 
 	while (!kthread_should_stop() &&
 	       !reg->hr_unclean_stop && !reg->hr_aborted_start) {
@@ -1787,7 +1794,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
 	spin_unlock(&o2hb_live_lock);
 
 	ret = wait_event_interruptible(o2hb_steady_queue,
-				atomic_read(&reg->hr_steady_iterations) == 0);
+				atomic_read(&reg->hr_steady_iterations) == 0 ||
+				reg->hr_node_deleted);
 	if (ret) {
 		atomic_set(&reg->hr_steady_iterations, 0);
 		reg->hr_aborted_start = 1;
@@ -1798,6 +1806,11 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
 		goto out3;
 	}
 
+	if (reg->hr_node_deleted) {
+		ret = -EINVAL;
+		goto out3;
+	}
+
 	/* Ok, we were woken.  Make sure it wasn't by drop_item() */
 	spin_lock(&o2hb_live_lock);
 	hb_task = reg->hr_task;
-- 
cgit v1.1


From b1529a41f777a48f95d4af29668b70ffe3360e1b Mon Sep 17 00:00:00 2001
From: alex chen <alex.chen@huawei.com>
Date: Thu, 5 Nov 2015 18:44:10 -0800
Subject: ocfs2: should reclaim the inode if '__ocfs2_mknod_locked' returns an
 error

In ocfs2_mknod_locked if '__ocfs2_mknod_locke		d' returns an error, we
should reclaim the inode successfully claimed above, otherwise, the
inode never be reused. The case is described below:

ocfs2_mknod
    ocfs2_mknod_locked
        ocfs2_claim_new_inode
                Successfully claim the inode
        __ocfs2_mknod_locked
            ocfs2_journal_access_di
            Failed because of -ENOMEM or other reasons, the inode
                        lockres has not been initialized yet.

    iput(inode)
        ocfs2_evict_inode
            ocfs2_delete_inode
                ocfs2_inode_lock
                    ocfs2_inode_lock_full_nested
                        __ocfs2_cluster_lock
                                Return -EINVAL because of the inode
                                lockres has not been initialized.

                So the following operations are not performed
                ocfs2_wipe_inode
                        ocfs2_remove_inode
                                ocfs2_free_dinode
                                        ocfs2_free_suballoc_bits

Signed-off-by: Alex Chen <alex.chen@huawei.com>
Reviewed-by: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/namei.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 1206392..3b48ac2 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -655,9 +655,18 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 		return status;
 	}
 
-	return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh,
+	status = __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh,
 				    parent_fe_bh, handle, inode_ac,
 				    fe_blkno, suballoc_loc, suballoc_bit);
+	if (status < 0) {
+		u64 bg_blkno = ocfs2_which_suballoc_group(fe_blkno, suballoc_bit);
+		int tmp = ocfs2_free_suballoc_bits(handle, inode_ac->ac_inode,
+				inode_ac->ac_bh, suballoc_bit, bg_blkno, 1);
+		if (tmp)
+			mlog_errno(tmp);
+	}
+
+	return status;
 }
 
 static int ocfs2_mkdir(struct inode *dir,
-- 
cgit v1.1


From 5afc44e2e9678c0808211f1662732b368cc25f76 Mon Sep 17 00:00:00 2001
From: Joseph Qi <joseph.qi@huawei.com>
Date: Thu, 5 Nov 2015 18:44:13 -0800
Subject: ocfs2: add uuid to ocfs2 thread name for problem analysis

A node can mount multiple ocfs2 volumes.  And if thread names are same for
each volume/domain, it will bring inconvenience when analyzing problems
because we have to identify which volume/domain the messages belong to.

Since thread name will be printed to messages, so add volume uuid or dlm
name to thread name can benefit problem analysis.

Signed-off-by: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Gang He <ghe@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/dlm/dlmdomain.c   | 4 +++-
 fs/ocfs2/dlm/dlmrecovery.c | 2 +-
 fs/ocfs2/dlm/dlmthread.c   | 3 ++-
 fs/ocfs2/dlmglue.c         | 3 ++-
 fs/ocfs2/journal.c         | 4 ++--
 5 files changed, 10 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 6918f30..2ee7fe7 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1866,6 +1866,7 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
 	int status;
 	unsigned int backoff;
 	unsigned int total_backoff = 0;
+	char wq_name[O2NM_MAX_NAME_LEN];
 
 	BUG_ON(!dlm);
 
@@ -1895,7 +1896,8 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
 		goto bail;
 	}
 
-	dlm->dlm_worker = create_singlethread_workqueue("dlm_wq");
+	snprintf(wq_name, O2NM_MAX_NAME_LEN, "dlm_wq-%s", dlm->name);
+	dlm->dlm_worker = create_singlethread_workqueue(wq_name);
 	if (!dlm->dlm_worker) {
 		status = -ENOMEM;
 		mlog_errno(status);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 58eaa5c..9e4f862 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -205,7 +205,7 @@ int dlm_launch_recovery_thread(struct dlm_ctxt *dlm)
 	mlog(0, "starting dlm recovery thread...\n");
 
 	dlm->dlm_reco_thread_task = kthread_run(dlm_recovery_thread, dlm,
-						"dlm_reco_thread");
+			"dlm_reco-%s", dlm->name);
 	if (IS_ERR(dlm->dlm_reco_thread_task)) {
 		mlog_errno(PTR_ERR(dlm->dlm_reco_thread_task));
 		dlm->dlm_reco_thread_task = NULL;
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 2e5e6d5..c5f6c24 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -493,7 +493,8 @@ int dlm_launch_thread(struct dlm_ctxt *dlm)
 {
 	mlog(0, "Starting dlm_thread...\n");
 
-	dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread");
+	dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm-%s",
+			dlm->name);
 	if (IS_ERR(dlm->dlm_thread_task)) {
 		mlog_errno(PTR_ERR(dlm->dlm_thread_task));
 		dlm->dlm_thread_task = NULL;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 1c91103..20276e3 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2998,7 +2998,8 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
 	}
 
 	/* launch downconvert thread */
-	osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc");
+	osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc-%s",
+			osb->uuid_str);
 	if (IS_ERR(osb->dc_task)) {
 		status = PTR_ERR(osb->dc_task);
 		osb->dc_task = NULL;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 04d6303..13534f4 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1090,7 +1090,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed)
 	/* Launch the commit thread */
 	if (!local) {
 		osb->commit_task = kthread_run(ocfs2_commit_thread, osb,
-					       "ocfs2cmt");
+				"ocfs2cmt-%s", osb->uuid_str);
 		if (IS_ERR(osb->commit_task)) {
 			status = PTR_ERR(osb->commit_task);
 			osb->commit_task = NULL;
@@ -1507,7 +1507,7 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
 		goto out;
 
 	osb->recovery_thread_task =  kthread_run(__ocfs2_recovery_thread, osb,
-						 "ocfs2rec");
+			"ocfs2rec-%s", osb->uuid_str);
 	if (IS_ERR(osb->recovery_thread_task)) {
 		mlog_errno((int)PTR_ERR(osb->recovery_thread_task));
 		osb->recovery_thread_task = NULL;
-- 
cgit v1.1


From 262d8a8779e24596ec3452ebb7d0a989de91089a Mon Sep 17 00:00:00 2001
From: Joseph Qi <joseph.qi@huawei.com>
Date: Thu, 5 Nov 2015 18:44:16 -0800
Subject: ocfs2: clean up unused variable in ocfs2_duplicate_clusters_by_page()

readahead_pages in ocfs2_duplicate_clusters_by_page is defined but not
used, so clean it up.

Signed-off-by: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/refcounttree.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index e5d57cd..2521198 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2920,16 +2920,13 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle,
 	u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
 	struct page *page;
 	pgoff_t page_index;
-	unsigned int from, to, readahead_pages;
+	unsigned int from, to;
 	loff_t offset, end, map_end;
 	struct address_space *mapping = inode->i_mapping;
 
 	trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
 					       new_cluster, new_len);
 
-	readahead_pages =
-		(ocfs2_cow_contig_clusters(sb) <<
-		 OCFS2_SB(sb)->s_clustersize_bits) >> PAGE_CACHE_SHIFT;
 	offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
 	end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
 	/*
-- 
cgit v1.1


From b64787401fd85b66403dd05159a749e333059c0a Mon Sep 17 00:00:00 2001
From: Dominique Martinet <dominique.martinet@cea.fr>
Date: Thu, 5 Nov 2015 18:44:21 -0800
Subject: 9p: do not overwrite return code when locking fails

If the remote locking fail, we run a local vfs unlock that should work and
return success to userland when we didn't actually lock at all.  We need
to tell the application that tried to lock that it didn't get it, not that
all went well.

Signed-off-by: Dominique Martinet <dominique.martinet@cea.fr>
Cc: Eric Van Hensbergen <ericvh@gmail.com>
Cc: Ron Minnich <rminnich@sandia.gov>
Cc: Latchesar Ionkov <lucho@ionkov.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/9p/vfs_file.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 3abc447..6b74739 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -231,7 +231,8 @@ out_unlock:
 	if (res < 0 && fl->fl_type != F_UNLCK) {
 		fl_type = fl->fl_type;
 		fl->fl_type = F_UNLCK;
-		res = posix_lock_file_wait(filp, fl);
+		/* Even if this fails we want to return the remote error */
+		posix_lock_file_wait(filp, fl);
 		fl->fl_type = fl_type;
 	}
 out:
-- 
cgit v1.1


From 25ee01a2fca02dfb5a3ce316e77910c468108199 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Date: Thu, 5 Nov 2015 18:47:11 -0800
Subject: mm: hugetlb: proc: add hugetlb-related fields to /proc/PID/smaps

Currently /proc/PID/smaps provides no usage info for vma(VM_HUGETLB),
which is inconvenient when we want to know per-task or per-vma base
hugetlb usage.  To solve this, this patch adds new fields for hugetlb
usage like below:

  Size:              20480 kB
  Rss:                   0 kB
  Pss:                   0 kB
  Shared_Clean:          0 kB
  Shared_Dirty:          0 kB
  Private_Clean:         0 kB
  Private_Dirty:         0 kB
  Referenced:            0 kB
  Anonymous:             0 kB
  AnonHugePages:         0 kB
  Shared_Hugetlb:    18432 kB
  Private_Hugetlb:    2048 kB
  Swap:                  0 kB
  KernelPageSize:     2048 kB
  MMUPageSize:        2048 kB
  Locked:                0 kB
  VmFlags: rd wr mr mw me de ht

[hughd@google.com: fix Private_Hugetlb alignment ]
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Acked-by: Joern Engel <joern@logfs.org>
Acked-by: David Rientjes <rientjes@google.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

(limited to 'fs')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index b029d42..5988b83 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -446,6 +446,8 @@ struct mem_size_stats {
 	unsigned long anonymous;
 	unsigned long anonymous_thp;
 	unsigned long swap;
+	unsigned long shared_hugetlb;
+	unsigned long private_hugetlb;
 	u64 pss;
 	u64 swap_pss;
 };
@@ -625,12 +627,44 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
 	seq_putc(m, '\n');
 }
 
+#ifdef CONFIG_HUGETLB_PAGE
+static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
+				 unsigned long addr, unsigned long end,
+				 struct mm_walk *walk)
+{
+	struct mem_size_stats *mss = walk->private;
+	struct vm_area_struct *vma = walk->vma;
+	struct page *page = NULL;
+
+	if (pte_present(*pte)) {
+		page = vm_normal_page(vma, addr, *pte);
+	} else if (is_swap_pte(*pte)) {
+		swp_entry_t swpent = pte_to_swp_entry(*pte);
+
+		if (is_migration_entry(swpent))
+			page = migration_entry_to_page(swpent);
+	}
+	if (page) {
+		int mapcount = page_mapcount(page);
+
+		if (mapcount >= 2)
+			mss->shared_hugetlb += huge_page_size(hstate_vma(vma));
+		else
+			mss->private_hugetlb += huge_page_size(hstate_vma(vma));
+	}
+	return 0;
+}
+#endif /* HUGETLB_PAGE */
+
 static int show_smap(struct seq_file *m, void *v, int is_pid)
 {
 	struct vm_area_struct *vma = v;
 	struct mem_size_stats mss;
 	struct mm_walk smaps_walk = {
 		.pmd_entry = smaps_pte_range,
+#ifdef CONFIG_HUGETLB_PAGE
+		.hugetlb_entry = smaps_hugetlb_range,
+#endif
 		.mm = vma->vm_mm,
 		.private = &mss,
 	};
@@ -652,6 +686,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
 		   "Referenced:     %8lu kB\n"
 		   "Anonymous:      %8lu kB\n"
 		   "AnonHugePages:  %8lu kB\n"
+		   "Shared_Hugetlb: %8lu kB\n"
+		   "Private_Hugetlb: %7lu kB\n"
 		   "Swap:           %8lu kB\n"
 		   "SwapPss:        %8lu kB\n"
 		   "KernelPageSize: %8lu kB\n"
@@ -667,6 +703,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
 		   mss.referenced >> 10,
 		   mss.anonymous >> 10,
 		   mss.anonymous_thp >> 10,
+		   mss.shared_hugetlb >> 10,
+		   mss.private_hugetlb >> 10,
 		   mss.swap >> 10,
 		   (unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)),
 		   vma_kernel_pagesize(vma) >> 10,
-- 
cgit v1.1


From 5d317b2b6536592a9b51fe65faed43d65ca9158e Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Date: Thu, 5 Nov 2015 18:47:14 -0800
Subject: mm: hugetlb: proc: add HugetlbPages field to /proc/PID/status

Currently there's no easy way to get per-process usage of hugetlb pages,
which is inconvenient because userspace applications which use hugetlb
typically want to control their processes on the basis of how much memory
(including hugetlb) they use.  So this patch simply provides easy access
to the info via /proc/PID/status.

Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Acked-by: Joern Engel <joern@logfs.org>
Acked-by: David Rientjes <rientjes@google.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 5988b83..288185e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -70,6 +70,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 		ptes >> 10,
 		pmds >> 10,
 		swap << (PAGE_SHIFT-10));
+	hugetlb_report_usage(m, mm);
 }
 
 unsigned long task_vsize(struct mm_struct *mm)
-- 
cgit v1.1


From aa750fd71c242dba02ee2034e15fbd7d0cdb2461 Mon Sep 17 00:00:00 2001
From: Junichi Nomura <j-nomura@ce.jp.nec.com>
Date: Thu, 5 Nov 2015 18:47:23 -0800
Subject: mm/filemap.c: make global sync not clear error status of individual
 inodes

filemap_fdatawait() is a function to wait for on-going writeback to
complete but also consume and clear error status of the mapping set during
writeback.

The latter functionality is critical for applications to detect writeback
error with system calls like fsync(2)/fdatasync(2).

However filemap_fdatawait() is also used by sync(2) or FIFREEZE ioctl,
which don't check error status of individual mappings.

As a result, fsync() may not be able to detect writeback error if events
happen in the following order:

   Application                    System admin
   ----------------------------------------------------------
   write data on page cache
                                  Run sync command
                                  writeback completes with error
                                  filemap_fdatawait() clears error
   fsync returns success
   (but the data is not on disk)

This patch adds filemap_fdatawait_keep_errors() for call sites where
writeback error is not handled so that they don't clear error status.

Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Acked-by: Andi Kleen <ak@linux.intel.com>
Reviewed-by: Tejun Heo <tj@kernel.org>
Cc: Fengguang Wu <fengguang.wu@gmail.com>
Cc: Dave Chinner <david@fromorbit.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fs-writeback.c | 7 ++++++-
 fs/sync.c         | 7 ++++++-
 2 files changed, 12 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 7378169..206a68b 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -2149,7 +2149,12 @@ static void wait_sb_inodes(struct super_block *sb)
 		iput(old_inode);
 		old_inode = inode;
 
-		filemap_fdatawait(mapping);
+		/*
+		 * We keep the error status of individual mapping so that
+		 * applications can catch the writeback error using fsync(2).
+		 * See filemap_fdatawait_keep_errors() for details.
+		 */
+		filemap_fdatawait_keep_errors(mapping);
 
 		cond_resched();
 
diff --git a/fs/sync.c b/fs/sync.c
index fbc98ee..4ec430a 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -86,7 +86,12 @@ static void fdatawrite_one_bdev(struct block_device *bdev, void *arg)
 
 static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
 {
-	filemap_fdatawait(bdev->bd_inode->i_mapping);
+	/*
+	 * We keep the error status of individual mapping so that
+	 * applications can catch the writeback error using fsync(2).
+	 * See filemap_fdatawait_keep_errors() for details.
+	 */
+	filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping);
 }
 
 /*
-- 
cgit v1.1


From 326c2597a3b93b8a89e0c3a2b41fd9971d095a97 Mon Sep 17 00:00:00 2001
From: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Date: Thu, 5 Nov 2015 18:49:21 -0800
Subject: mm: clear pte in clear_soft_dirty()

As mentioned in the commit 56eecdb912b5 ("mm: Use ptep/pmdp_set_numa()
for updating _PAGE_NUMA bit"), architectures like ppc64 don't do tlb
flush in set_pte/pmd functions.

So when dealing with existing pte in clear_soft_dirty, the pte must be
cleared before being modified.

Signed-off-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 288185e..ef44bb4 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -792,19 +792,20 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
 	pte_t ptent = *pte;
 
 	if (pte_present(ptent)) {
+		ptent = ptep_modify_prot_start(vma->vm_mm, addr, pte);
 		ptent = pte_wrprotect(ptent);
 		ptent = pte_clear_soft_dirty(ptent);
+		ptep_modify_prot_commit(vma->vm_mm, addr, pte, ptent);
 	} else if (is_swap_pte(ptent)) {
 		ptent = pte_swp_clear_soft_dirty(ptent);
+		set_pte_at(vma->vm_mm, addr, pte, ptent);
 	}
-
-	set_pte_at(vma->vm_mm, addr, pte, ptent);
 }
 
 static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
 		unsigned long addr, pmd_t *pmdp)
 {
-	pmd_t pmd = *pmdp;
+	pmd_t pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp);
 
 	pmd = pmd_wrprotect(pmd);
 	pmd = pmd_clear_soft_dirty(pmd);
-- 
cgit v1.1


From 5d3875a01eeafef7ef0b73a99462cb9bd55e8485 Mon Sep 17 00:00:00 2001
From: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Date: Thu, 5 Nov 2015 18:49:24 -0800
Subject: mm: clear_soft_dirty_pmd() requires THP

Don't build clear_soft_dirty_pmd() if transparent huge pages are not
enabled.

Signed-off-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index ef44bb4..187b3b5 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -801,7 +801,14 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
 		set_pte_at(vma->vm_mm, addr, pte, ptent);
 	}
 }
+#else
+static inline void clear_soft_dirty(struct vm_area_struct *vma,
+		unsigned long addr, pte_t *pte)
+{
+}
+#endif
 
+#if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
 static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
 		unsigned long addr, pmd_t *pmdp)
 {
@@ -815,14 +822,7 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
 
 	set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
 }
-
 #else
-
-static inline void clear_soft_dirty(struct vm_area_struct *vma,
-		unsigned long addr, pte_t *pte)
-{
-}
-
 static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
 		unsigned long addr, pmd_t *pmdp)
 {
-- 
cgit v1.1


From b72bdfa73603f2f81fbac651b9ae36807e877752 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Thu, 5 Nov 2015 18:50:32 -0800
Subject: mm, oom: add comment for why oom_adj exists

/proc/pid/oom_adj exists solely to avoid breaking existing userspace
binaries that write to the tunable.

Add a comment in the only possible location within the kernel tree to
describe the situation and motivation for keeping it around.

Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 29595af..bd3e9e6 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1032,6 +1032,16 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
 	return simple_read_from_buffer(buf, count, ppos, buffer, len);
 }
 
+/*
+ * /proc/pid/oom_adj exists solely for backwards compatibility with previous
+ * kernels.  The effective policy is defined by oom_score_adj, which has a
+ * different scale: oom_adj grew exponentially and oom_score_adj grows linearly.
+ * Values written to oom_adj are simply mapped linearly to oom_score_adj.
+ * Processes that become oom disabled via oom_adj will still be oom disabled
+ * with this implementation.
+ *
+ * oom_adj cannot be removed since existing userspace binaries use it.
+ */
 static ssize_t oom_adj_write(struct file *file, const char __user *buf,
 			     size_t count, loff_t *ppos)
 {
-- 
cgit v1.1


From ea5c58e70c3a148ada0d3061a8f529589bb766ba Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers3@gmail.com>
Date: Fri, 6 Nov 2015 00:32:04 -0600
Subject: vfs: clear remainder of 'full_fds_bits' in dup_fd()

This fixes a bug from commit f3f86e33dc3d ("vfs: Fix pathological
performance case for __alloc_fd()").

v2: refactor to share fd bitmap copying code
Signed-off-by: Eric Biggers <ebiggers3@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/file.c | 64 +++++++++++++++++++++++++++++++--------------------------------
 1 file changed, 32 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/file.c b/fs/file.c
index c6986dc..39f8f15 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -60,8 +60,31 @@ static void free_fdtable_rcu(struct rcu_head *rcu)
 #define BITBIT_SIZE(nr)	(BITBIT_NR(nr) * sizeof(long))
 
 /*
- * Expand the fdset in the files_struct.  Called with the files spinlock
- * held for write.
+ * Copy 'count' fd bits from the old table to the new table and clear the extra
+ * space if any.  This does not copy the file pointers.  Called with the files
+ * spinlock held for write.
+ */
+static void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt,
+			    unsigned int count)
+{
+	unsigned int cpy, set;
+
+	cpy = count / BITS_PER_BYTE;
+	set = (nfdt->max_fds - count) / BITS_PER_BYTE;
+	memcpy(nfdt->open_fds, ofdt->open_fds, cpy);
+	memset((char *)nfdt->open_fds + cpy, 0, set);
+	memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
+	memset((char *)nfdt->close_on_exec + cpy, 0, set);
+
+	cpy = BITBIT_SIZE(count);
+	set = BITBIT_SIZE(nfdt->max_fds) - cpy;
+	memcpy(nfdt->full_fds_bits, ofdt->full_fds_bits, cpy);
+	memset((char *)nfdt->full_fds_bits + cpy, 0, set);
+}
+
+/*
+ * Copy all file descriptors from the old table to the new, expanded table and
+ * clear the extra space.  Called with the files spinlock held for write.
  */
 static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
 {
@@ -72,19 +95,9 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
 	cpy = ofdt->max_fds * sizeof(struct file *);
 	set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *);
 	memcpy(nfdt->fd, ofdt->fd, cpy);
-	memset((char *)(nfdt->fd) + cpy, 0, set);
+	memset((char *)nfdt->fd + cpy, 0, set);
 
-	cpy = ofdt->max_fds / BITS_PER_BYTE;
-	set = (nfdt->max_fds - ofdt->max_fds) / BITS_PER_BYTE;
-	memcpy(nfdt->open_fds, ofdt->open_fds, cpy);
-	memset((char *)(nfdt->open_fds) + cpy, 0, set);
-	memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
-	memset((char *)(nfdt->close_on_exec) + cpy, 0, set);
-
-	cpy = BITBIT_SIZE(ofdt->max_fds);
-	set = BITBIT_SIZE(nfdt->max_fds) - cpy;
-	memcpy(nfdt->full_fds_bits, ofdt->full_fds_bits, cpy);
-	memset(cpy+(char *)nfdt->full_fds_bits, 0, set);
+	copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds);
 }
 
 static struct fdtable * alloc_fdtable(unsigned int nr)
@@ -277,7 +290,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 {
 	struct files_struct *newf;
 	struct file **old_fds, **new_fds;
-	int open_files, size, i;
+	int open_files, i;
 	struct fdtable *old_fdt, *new_fdt;
 
 	*errorp = -ENOMEM;
@@ -334,13 +347,11 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 		open_files = count_open_files(old_fdt);
 	}
 
+	copy_fd_bitmaps(new_fdt, old_fdt, open_files);
+
 	old_fds = old_fdt->fd;
 	new_fds = new_fdt->fd;
 
-	memcpy(new_fdt->open_fds, old_fdt->open_fds, open_files / 8);
-	memcpy(new_fdt->close_on_exec, old_fdt->close_on_exec, open_files / 8);
-	memcpy(new_fdt->full_fds_bits, old_fdt->full_fds_bits, BITBIT_SIZE(open_files));
-
 	for (i = open_files; i != 0; i--) {
 		struct file *f = *old_fds++;
 		if (f) {
@@ -358,19 +369,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 	}
 	spin_unlock(&oldf->file_lock);
 
-	/* compute the remainder to be cleared */
-	size = (new_fdt->max_fds - open_files) * sizeof(struct file *);
-
-	/* This is long word aligned thus could use a optimized version */
-	memset(new_fds, 0, size);
-
-	if (new_fdt->max_fds > open_files) {
-		int left = (new_fdt->max_fds - open_files) / 8;
-		int start = open_files / BITS_PER_LONG;
-
-		memset(&new_fdt->open_fds[start], 0, left);
-		memset(&new_fdt->close_on_exec[start], 0, left);
-	}
+	/* clear the remainder */
+	memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *));
 
 	rcu_assign_pointer(newf->fdt, new_fdt);
 
-- 
cgit v1.1


From 86ba9ed928f33cf6afbac433ff03dd8c6a0ceac4 Mon Sep 17 00:00:00 2001
From: Yaowei Bai <bywxiaobai@163.com>
Date: Tue, 29 Sep 2015 21:57:33 +0800
Subject: fs/ubifs: remove unnecessary new_valid_dev check

As currently new_valid_dev always returns 1, so new_valid_dev check is not
needed, remove it.

Signed-off-by: Yaowei Bai <bywxiaobai@163.com>
Reviewed-by: Richard Weinberger <richard@nod.at>
Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 fs/ubifs/dir.c  | 3 ---
 fs/ubifs/misc.h | 9 ++-------
 2 files changed, 2 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 5c27c66..1dd9267 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -787,9 +787,6 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry,
 
 	dbg_gen("dent '%pd' in dir ino %lu", dentry, dir->i_ino);
 
-	if (!new_valid_dev(rdev))
-		return -EINVAL;
-
 	if (S_ISBLK(mode) || S_ISCHR(mode)) {
 		dev = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS);
 		if (!dev)
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h
index ee7cb5e..8ece6ca 100644
--- a/fs/ubifs/misc.h
+++ b/fs/ubifs/misc.h
@@ -155,13 +155,8 @@ static inline int ubifs_wbuf_sync(struct ubifs_wbuf *wbuf)
  */
 static inline int ubifs_encode_dev(union ubifs_dev_desc *dev, dev_t rdev)
 {
-	if (new_valid_dev(rdev)) {
-		dev->new = cpu_to_le32(new_encode_dev(rdev));
-		return sizeof(dev->new);
-	} else {
-		dev->huge = cpu_to_le64(huge_encode_dev(rdev));
-		return sizeof(dev->huge);
-	}
+	dev->new = cpu_to_le32(new_encode_dev(rdev));
+	return sizeof(dev->new);
 }
 
 /**
-- 
cgit v1.1


From aeeb14f763917ccf639a602cfbeee6957fd944a2 Mon Sep 17 00:00:00 2001
From: Richard Weinberger <richard@nod.at>
Date: Mon, 12 Oct 2015 23:35:36 +0200
Subject: UBIFS: Fix possible memory leak in ubifs_readdir()

If ubifs_tnc_next_ent() returns something else than -ENOENT
we leak file->private_data.

Signed-off-by: Richard Weinberger <richard@nod.at>
Reviewed-by: David Gstir <david@sigma-star.at>
---
 fs/ubifs/dir.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 1dd9267..a2f9d97 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -449,13 +449,14 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx)
 	}
 
 out:
+	kfree(file->private_data);
+	file->private_data = NULL;
+
 	if (err != -ENOENT) {
 		ubifs_err(c, "cannot find next direntry, error %d", err);
 		return err;
 	}
 
-	kfree(file->private_data);
-	file->private_data = NULL;
 	/* 2 is a special value indicating that there are no more direntries */
 	ctx->pos = 2;
 	return 0;
-- 
cgit v1.1


From 54bcfdf19ec7c2f2788fa76426c91abdc4dab973 Mon Sep 17 00:00:00 2001
From: Markus Elfring <elfring@users.sourceforge.net>
Date: Wed, 4 Nov 2015 20:00:42 +0100
Subject: UBIFS: Delete unnecessary checks before the function call "iput"

The iput() function tests whether its argument is NULL and then
returns immediately. Thus the test around the call is not needed.

This issue was detected by using the Coccinelle software.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 fs/ubifs/recovery.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 2a1f0bc..586d593 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -1331,8 +1331,7 @@ void ubifs_destroy_size_tree(struct ubifs_info *c)
 	struct size_entry *e, *n;
 
 	rbtree_postorder_for_each_entry_safe(e, n, &c->size_tree, rb) {
-		if (e->inode)
-			iput(e->inode);
+		iput(e->inode);
 		kfree(e);
 	}
 
@@ -1533,8 +1532,7 @@ int ubifs_recover_size(struct ubifs_info *c)
 				err = fix_size_in_place(c, e);
 				if (err)
 					return err;
-				if (e->inode)
-					iput(e->inode);
+				iput(e->inode);
 			}
 		}
 
-- 
cgit v1.1


From d0164adc89f6bb374d304ffcc375c6d2652fe67d Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Fri, 6 Nov 2015 16:28:21 -0800
Subject: mm, page_alloc: distinguish between being unable to sleep, unwilling
 to sleep and avoiding waking kswapd

__GFP_WAIT has been used to identify atomic context in callers that hold
spinlocks or are in interrupts.  They are expected to be high priority and
have access one of two watermarks lower than "min" which can be referred
to as the "atomic reserve".  __GFP_HIGH users get access to the first
lower watermark and can be called the "high priority reserve".

Over time, callers had a requirement to not block when fallback options
were available.  Some have abused __GFP_WAIT leading to a situation where
an optimisitic allocation with a fallback option can access atomic
reserves.

This patch uses __GFP_ATOMIC to identify callers that are truely atomic,
cannot sleep and have no alternative.  High priority users continue to use
__GFP_HIGH.  __GFP_DIRECT_RECLAIM identifies callers that can sleep and
are willing to enter direct reclaim.  __GFP_KSWAPD_RECLAIM to identify
callers that want to wake kswapd for background reclaim.  __GFP_WAIT is
redefined as a caller that is willing to enter direct reclaim and wake
kswapd for background reclaim.

This patch then converts a number of sites

o __GFP_ATOMIC is used by callers that are high priority and have memory
  pools for those requests. GFP_ATOMIC uses this flag.

o Callers that have a limited mempool to guarantee forward progress clear
  __GFP_DIRECT_RECLAIM but keep __GFP_KSWAPD_RECLAIM. bio allocations fall
  into this category where kswapd will still be woken but atomic reserves
  are not used as there is a one-entry mempool to guarantee progress.

o Callers that are checking if they are non-blocking should use the
  helper gfpflags_allow_blocking() where possible. This is because
  checking for __GFP_WAIT as was done historically now can trigger false
  positives. Some exceptions like dm-crypt.c exist where the code intent
  is clearer if __GFP_DIRECT_RECLAIM is used instead of the helper due to
  flag manipulations.

o Callers that built their own GFP flags instead of starting with GFP_KERNEL
  and friends now also need to specify __GFP_KSWAPD_RECLAIM.

The first key hazard to watch out for is callers that removed __GFP_WAIT
and was depending on access to atomic reserves for inconspicuous reasons.
In some cases it may be appropriate for them to use __GFP_HIGH.

The second key hazard is callers that assembled their own combination of
GFP flags instead of starting with something like GFP_KERNEL.  They may
now wish to specify __GFP_KSWAPD_RECLAIM.  It's almost certainly harmless
if it's missed in most cases as other activity will wake kswapd.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Vitaly Wool <vitalywool@gmail.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/btrfs/disk-io.c    |  2 +-
 fs/btrfs/extent_io.c  | 14 +++++++-------
 fs/btrfs/volumes.c    |  4 ++--
 fs/ext4/super.c       |  2 +-
 fs/fscache/cookie.c   |  2 +-
 fs/fscache/page.c     |  6 +++---
 fs/jbd2/transaction.c |  4 ++--
 fs/nfs/file.c         |  6 +++---
 fs/xfs/xfs_qm.c       |  2 +-
 9 files changed, 21 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1e60d00..c339d56 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2572,7 +2572,7 @@ int open_ctree(struct super_block *sb,
 	fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
 	fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
 	/* readahead state */
-	INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
+	INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
 	spin_lock_init(&fs_info->reada_lock);
 
 	fs_info->thread_pool_size = min_t(unsigned long,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 3915c94..032abfb 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -594,7 +594,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 	if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
 		clear = 1;
 again:
-	if (!prealloc && (mask & __GFP_WAIT)) {
+	if (!prealloc && gfpflags_allow_blocking(mask)) {
 		/*
 		 * Don't care for allocation failure here because we might end
 		 * up not needing the pre-allocated extent state at all, which
@@ -718,7 +718,7 @@ search_again:
 	if (start > end)
 		goto out;
 	spin_unlock(&tree->lock);
-	if (mask & __GFP_WAIT)
+	if (gfpflags_allow_blocking(mask))
 		cond_resched();
 	goto again;
 }
@@ -850,7 +850,7 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 
 	bits |= EXTENT_FIRST_DELALLOC;
 again:
-	if (!prealloc && (mask & __GFP_WAIT)) {
+	if (!prealloc && gfpflags_allow_blocking(mask)) {
 		prealloc = alloc_extent_state(mask);
 		BUG_ON(!prealloc);
 	}
@@ -1028,7 +1028,7 @@ search_again:
 	if (start > end)
 		goto out;
 	spin_unlock(&tree->lock);
-	if (mask & __GFP_WAIT)
+	if (gfpflags_allow_blocking(mask))
 		cond_resched();
 	goto again;
 }
@@ -1076,7 +1076,7 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 	btrfs_debug_check_extent_io_range(tree, start, end);
 
 again:
-	if (!prealloc && (mask & __GFP_WAIT)) {
+	if (!prealloc && gfpflags_allow_blocking(mask)) {
 		/*
 		 * Best effort, don't worry if extent state allocation fails
 		 * here for the first iteration. We might have a cached state
@@ -1253,7 +1253,7 @@ search_again:
 	if (start > end)
 		goto out;
 	spin_unlock(&tree->lock);
-	if (mask & __GFP_WAIT)
+	if (gfpflags_allow_blocking(mask))
 		cond_resched();
 	first_iteration = false;
 	goto again;
@@ -4319,7 +4319,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
 	u64 start = page_offset(page);
 	u64 end = start + PAGE_CACHE_SIZE - 1;
 
-	if ((mask & __GFP_WAIT) &&
+	if (gfpflags_allow_blocking(mask) &&
 	    page->mapping->host->i_size > 16 * 1024 * 1024) {
 		u64 len;
 		while (start <= end) {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 6fc73586..e023919 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -156,8 +156,8 @@ static struct btrfs_device *__alloc_device(void)
 	spin_lock_init(&dev->reada_lock);
 	atomic_set(&dev->reada_in_flight, 0);
 	atomic_set(&dev->dev_stats_ccnt, 0);
-	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT);
-	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT);
+	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
+	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
 
 	return dev;
 }
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a63c7b0..49f6c78 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1058,7 +1058,7 @@ static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
 		return 0;
 	if (journal)
 		return jbd2_journal_try_to_free_buffers(journal, page,
-							wait & ~__GFP_WAIT);
+						wait & ~__GFP_DIRECT_RECLAIM);
 	return try_to_free_buffers(page);
 }
 
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index d403c69..43040721 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -111,7 +111,7 @@ struct fscache_cookie *__fscache_acquire_cookie(
 
 	/* radix tree insertion won't use the preallocation pool unless it's
 	 * told it may not wait */
-	INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_WAIT);
+	INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
 
 	switch (cookie->def->type) {
 	case FSCACHE_COOKIE_TYPE_INDEX:
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 483bbc6..79483b3 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -58,7 +58,7 @@ bool release_page_wait_timeout(struct fscache_cookie *cookie, struct page *page)
 
 /*
  * decide whether a page can be released, possibly by cancelling a store to it
- * - we're allowed to sleep if __GFP_WAIT is flagged
+ * - we're allowed to sleep if __GFP_DIRECT_RECLAIM is flagged
  */
 bool __fscache_maybe_release_page(struct fscache_cookie *cookie,
 				  struct page *page,
@@ -122,7 +122,7 @@ page_busy:
 	 * allocator as the work threads writing to the cache may all end up
 	 * sleeping on memory allocation, so we may need to impose a timeout
 	 * too. */
-	if (!(gfp & __GFP_WAIT) || !(gfp & __GFP_FS)) {
+	if (!(gfp & __GFP_DIRECT_RECLAIM) || !(gfp & __GFP_FS)) {
 		fscache_stat(&fscache_n_store_vmscan_busy);
 		return false;
 	}
@@ -132,7 +132,7 @@ page_busy:
 		_debug("fscache writeout timeout page: %p{%lx}",
 			page, page->index);
 
-	gfp &= ~__GFP_WAIT;
+	gfp &= ~__GFP_DIRECT_RECLAIM;
 	goto try_again;
 }
 EXPORT_SYMBOL(__fscache_maybe_release_page);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 6b8338e..89463ee 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1937,8 +1937,8 @@ out:
  * @journal: journal for operation
  * @page: to try and free
  * @gfp_mask: we use the mask to detect how hard should we try to release
- * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to
- * release the buffers.
+ * buffers. If __GFP_DIRECT_RECLAIM and __GFP_FS is set, we wait for commit
+ * code to release the buffers.
  *
  *
  * For all the buffers on this page,
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 37f639d..93e2364 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -473,8 +473,8 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
 	dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
 
 	/* Always try to initiate a 'commit' if relevant, but only
-	 * wait for it if __GFP_WAIT is set.  Even then, only wait 1
-	 * second and only if the 'bdi' is not congested.
+	 * wait for it if the caller allows blocking.  Even then,
+	 * only wait 1 second and only if the 'bdi' is not congested.
 	 * Waiting indefinitely can cause deadlocks when the NFS
 	 * server is on this machine, when a new TCP connection is
 	 * needed and in other rare cases.  There is no particular
@@ -484,7 +484,7 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
 	if (mapping) {
 		struct nfs_server *nfss = NFS_SERVER(mapping->host);
 		nfs_commit_inode(mapping->host, 0);
-		if ((gfp & __GFP_WAIT) &&
+		if (gfpflags_allow_blocking(gfp) &&
 		    !bdi_write_congested(&nfss->backing_dev_info)) {
 			wait_on_page_bit_killable_timeout(page, PG_private,
 							  HZ);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index eac9549..587174f 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -525,7 +525,7 @@ xfs_qm_shrink_scan(
 	unsigned long		freed;
 	int			error;
 
-	if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
+	if ((sc->gfp_mask & (__GFP_FS|__GFP_DIRECT_RECLAIM)) != (__GFP_FS|__GFP_DIRECT_RECLAIM))
 		return 0;
 
 	INIT_LIST_HEAD(&isol.buffers);
-- 
cgit v1.1


From 71baba4b92dc1fa1bc461742c6ab1942ec6034e9 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Fri, 6 Nov 2015 16:28:28 -0800
Subject: mm, page_alloc: rename __GFP_WAIT to __GFP_RECLAIM

__GFP_WAIT was used to signal that the caller was in atomic context and
could not sleep.  Now it is possible to distinguish between true atomic
context and callers that are not willing to sleep.  The latter should
clear __GFP_DIRECT_RECLAIM so kswapd will still wake.  As clearing
__GFP_WAIT behaves differently, there is a risk that people will clear the
wrong flags.  This patch renames __GFP_WAIT to __GFP_RECLAIM to clearly
indicate what it does -- setting it allows all reclaim activity, clearing
them prevents it.

[akpm@linux-foundation.org: fix build]
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Christoph Lameter <cl@linux.com>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Vitaly Wool <vitalywool@gmail.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/cachefiles/internal.h | 2 +-
 fs/direct-io.c           | 2 +-
 fs/nilfs2/mdt.h          | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index aecd085..9c4b737 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -30,7 +30,7 @@ extern unsigned cachefiles_debug;
 #define CACHEFILES_DEBUG_KLEAVE	2
 #define CACHEFILES_DEBUG_KDEBUG	4
 
-#define cachefiles_gfp (__GFP_WAIT | __GFP_NORETRY | __GFP_NOMEMALLOC)
+#define cachefiles_gfp (__GFP_RECLAIM | __GFP_NORETRY | __GFP_NOMEMALLOC)
 
 /*
  * node records
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 3ae0e04..18e7554 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -361,7 +361,7 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
 
 	/*
 	 * bio_alloc() is guaranteed to return a bio when called with
-	 * __GFP_WAIT and we request a valid number of vectors.
+	 * __GFP_RECLAIM and we request a valid number of vectors.
 	 */
 	bio = bio_alloc(GFP_KERNEL, nr_vecs);
 
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index fe529a8..03246ca 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -72,7 +72,7 @@ static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode)
 }
 
 /* Default GFP flags using highmem */
-#define NILFS_MDT_GFP      (__GFP_WAIT | __GFP_IO | __GFP_HIGHMEM)
+#define NILFS_MDT_GFP      (__GFP_RECLAIM | __GFP_IO | __GFP_HIGHMEM)
 
 int nilfs_mdt_get_block(struct inode *, unsigned long, int,
 			void (*init_block)(struct inode *,
-- 
cgit v1.1


From c62d25556be6c965dc14288e796a576e8e39a7e9 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Fri, 6 Nov 2015 16:28:49 -0800
Subject: mm, fs: introduce mapping_gfp_constraint()

There are many places which use mapping_gfp_mask to restrict a more
generic gfp mask which would be used for allocations which are not
directly related to the page cache but they are performed in the same
context.

Let's introduce a helper function which makes the restriction explicit and
easier to track.  This patch doesn't introduce any functional changes.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Michal Hocko <mhocko@suse.com>
Suggested-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/btrfs/compression.c      | 7 +++----
 fs/btrfs/ctree.h            | 2 +-
 fs/btrfs/free-space-cache.c | 4 ++--
 fs/buffer.c                 | 2 +-
 fs/ceph/addr.c              | 7 ++++---
 fs/cifs/file.c              | 2 +-
 fs/ext4/inode.c             | 2 +-
 fs/ext4/readpage.c          | 2 +-
 fs/logfs/segment.c          | 2 +-
 fs/mpage.c                  | 4 ++--
 fs/namei.c                  | 2 +-
 fs/nilfs2/inode.c           | 4 ++--
 fs/ntfs/file.c              | 4 ++--
 fs/splice.c                 | 2 +-
 14 files changed, 23 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 57ee8ca..36dfeff 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -482,13 +482,12 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 			goto next;
 		}
 
-		page = __page_cache_alloc(mapping_gfp_mask(mapping) &
-								~__GFP_FS);
+		page = __page_cache_alloc(mapping_gfp_constraint(mapping,
+								 ~__GFP_FS));
 		if (!page)
 			break;
 
-		if (add_to_page_cache_lru(page, mapping, pg_index,
-								GFP_NOFS)) {
+		if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) {
 			page_cache_release(page);
 			goto next;
 		}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 938efe3..eb90f0f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3316,7 +3316,7 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
 
 static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
 {
-	return mapping_gfp_mask(mapping) & ~__GFP_FS;
+	return mapping_gfp_constraint(mapping, ~__GFP_FS);
 }
 
 /* extent-tree.c */
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index abe3a66..ed05da1 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -85,8 +85,8 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
 	}
 
 	mapping_set_gfp_mask(inode->i_mapping,
-			mapping_gfp_mask(inode->i_mapping) &
-			~(__GFP_FS | __GFP_HIGHMEM));
+			mapping_gfp_constraint(inode->i_mapping,
+			~(__GFP_FS | __GFP_HIGHMEM)));
 
 	return inode;
 }
diff --git a/fs/buffer.c b/fs/buffer.c
index 82283ab..51aff02 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -999,7 +999,7 @@ grow_dev_page(struct block_device *bdev, sector_t block,
 	int ret = 0;		/* Will call free_more_memory() */
 	gfp_t gfp_mask;
 
-	gfp_mask = (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS) | gfp;
+	gfp_mask = mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS) | gfp;
 
 	/*
 	 * XXX: __getblk_slow() can not really deal with failure and
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 9d23e78..b7d218a 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1283,8 +1283,8 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		int ret1;
 		struct address_space *mapping = inode->i_mapping;
 		struct page *page = find_or_create_page(mapping, 0,
-						mapping_gfp_mask(mapping) &
-						~__GFP_FS);
+						mapping_gfp_constraint(mapping,
+						~__GFP_FS));
 		if (!page) {
 			ret = VM_FAULT_OOM;
 			goto out;
@@ -1428,7 +1428,8 @@ void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
 		if (i_size_read(inode) == 0)
 			return;
 		page = find_or_create_page(mapping, 0,
-					   mapping_gfp_mask(mapping) & ~__GFP_FS);
+					   mapping_gfp_constraint(mapping,
+					   ~__GFP_FS));
 		if (!page)
 			return;
 		if (PageUptodate(page)) {
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 47c5c97..0068e82 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3380,7 +3380,7 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
 	struct page *page, *tpage;
 	unsigned int expected_index;
 	int rc;
-	gfp_t gfp = GFP_KERNEL & mapping_gfp_mask(mapping);
+	gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
 
 	INIT_LIST_HEAD(tmplist);
 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 612fbcf..60aaecd 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3344,7 +3344,7 @@ static int __ext4_block_zero_page_range(handle_t *handle,
 	int err = 0;
 
 	page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
-				   mapping_gfp_mask(mapping) & ~__GFP_FS);
+				   mapping_gfp_constraint(mapping, ~__GFP_FS));
 	if (!page)
 		return -ENOMEM;
 
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index 560af04..1061611 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -166,7 +166,7 @@ int ext4_mpage_readpages(struct address_space *mapping,
 			page = list_entry(pages->prev, struct page, lru);
 			list_del(&page->lru);
 			if (add_to_page_cache_lru(page, mapping, page->index,
-					GFP_KERNEL & mapping_gfp_mask(mapping)))
+				  mapping_gfp_constraint(mapping, GFP_KERNEL)))
 				goto next_page;
 		}
 
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index 7f9b096..6de0fbf 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -57,7 +57,7 @@ static struct page *get_mapping_page(struct super_block *sb, pgoff_t index,
 	filler_t *filler = super->s_devops->readpage;
 	struct page *page;
 
-	BUG_ON(mapping_gfp_mask(mapping) & __GFP_FS);
+	BUG_ON(mapping_gfp_constraint(mapping, __GFP_FS));
 	if (use_filler)
 		page = read_cache_page(mapping, index, filler, sb);
 	else {
diff --git a/fs/mpage.c b/fs/mpage.c
index 09abba7..1480d3a 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -361,7 +361,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
 	sector_t last_block_in_bio = 0;
 	struct buffer_head map_bh;
 	unsigned long first_logical_block = 0;
-	gfp_t gfp = GFP_KERNEL & mapping_gfp_mask(mapping);
+	gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
 
 	map_bh.b_state = 0;
 	map_bh.b_size = 0;
@@ -397,7 +397,7 @@ int mpage_readpage(struct page *page, get_block_t get_block)
 	sector_t last_block_in_bio = 0;
 	struct buffer_head map_bh;
 	unsigned long first_logical_block = 0;
-	gfp_t gfp = GFP_KERNEL & mapping_gfp_mask(page->mapping);
+	gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
 
 	map_bh.b_state = 0;
 	map_bh.b_size = 0;
diff --git a/fs/namei.c b/fs/namei.c
index 0d3340b..3c18970 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -4604,7 +4604,7 @@ EXPORT_SYMBOL(__page_symlink);
 int page_symlink(struct inode *inode, const char *symname, int len)
 {
 	return __page_symlink(inode, symname, len,
-			!(mapping_gfp_mask(inode->i_mapping) & __GFP_FS));
+			!mapping_gfp_constraint(inode->i_mapping, __GFP_FS));
 }
 EXPORT_SYMBOL(page_symlink);
 
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 4a73d6d..ac2f649 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -356,7 +356,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
 		goto failed;
 
 	mapping_set_gfp_mask(inode->i_mapping,
-			     mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
+			   mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS));
 
 	root = NILFS_I(dir)->i_root;
 	ii = NILFS_I(inode);
@@ -522,7 +522,7 @@ static int __nilfs_read_inode(struct super_block *sb,
 	up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
 	nilfs_set_inode_flags(inode);
 	mapping_set_gfp_mask(inode->i_mapping,
-			     mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
+			   mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS));
 	return 0;
 
  failed_unmap:
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 262561f..9d383e5 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -525,8 +525,8 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
 				}
 			}
 			err = add_to_page_cache_lru(*cached_page, mapping,
-					index,
-					GFP_KERNEL & mapping_gfp_mask(mapping));
+				   index,
+				   mapping_gfp_constraint(mapping, GFP_KERNEL));
 			if (unlikely(err)) {
 				if (err == -EEXIST)
 					continue;
diff --git a/fs/splice.c b/fs/splice.c
index 5fc1e50..801c21c 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -360,7 +360,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 				break;
 
 			error = add_to_page_cache_lru(page, mapping, index,
-					GFP_KERNEL & mapping_gfp_mask(mapping));
+				   mapping_gfp_constraint(mapping, GFP_KERNEL));
 			if (unlikely(error)) {
 				page_cache_release(page);
 				if (error == -EEXIST)
-- 
cgit v1.1


From 23d0127096cb91cb6d354bdc71bd88a7bae3a1d5 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.com>
Date: Fri, 6 Nov 2015 16:28:55 -0800
Subject: fs/sync.c: make sync_file_range(2) use WB_SYNC_NONE writeback

sync_file_range(2) is documented to issue writeback only for pages that
are not currently being written.  After all the system call has been
created for userspace to be able to issue background writeout and so
waiting for in-flight IO is undesirable there.  However commit
ee53a891f474 ("mm: do_sync_mapping_range integrity fix") switched
do_sync_mapping_range() and thus sync_file_range() to issue writeback in
WB_SYNC_ALL mode since do_sync_mapping_range() was used by other code
relying on WB_SYNC_ALL semantics.

These days do_sync_mapping_range() went away and we can switch
sync_file_range(2) back to issuing WB_SYNC_NONE writeback.  That should
help PostgreSQL avoid large latency spikes when flushing data in the
background.

Andres measured a 20% increase in transactions per second on an SSD disk.

Signed-off-by: Jan Kara <jack@suse.com>
Reported-by: Andres Freund <andres@anarazel.de>
Tested-By: Andres Freund <andres@anarazel.de>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/sync.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/sync.c b/fs/sync.c
index 4ec430a..dd5d171 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -348,7 +348,8 @@ SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
 	}
 
 	if (flags & SYNC_FILE_RANGE_WRITE) {
-		ret = filemap_fdatawrite_range(mapping, offset, endbyte);
+		ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
+						 WB_SYNC_NONE);
 		if (ret < 0)
 			goto out_put;
 	}
-- 
cgit v1.1


From 3a49f3d2a10dfb27411d321900197a3d6c52405b Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 6 Nov 2015 16:30:03 -0800
Subject: fs/proc/array.c: set overflow flag in case of error

For now in task_name() we ignore the return code of string_escape_str()
call.  This is not good if buffer suddenly becomes not big enough.  Do the
proper error handling there.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/array.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index eed2050..d73291f 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -91,18 +91,18 @@
 static inline void task_name(struct seq_file *m, struct task_struct *p)
 {
 	char *buf;
+	size_t size;
 	char tcomm[sizeof(p->comm)];
+	int ret;
 
 	get_task_comm(tcomm, p);
 
 	seq_puts(m, "Name:\t");
-	buf = m->buf + m->count;
 
-	/* Ignore error for now */
-	buf += string_escape_str(tcomm, buf, m->size - m->count,
-				 ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
+	size = seq_get_buf(m, &buf);
+	ret = string_escape_str(tcomm, buf, size, ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
+	seq_commit(m, ret < size ? ret : -1);
 
-	m->count = buf - m->buf;
 	seq_putc(m, '\n');
 }
 
-- 
cgit v1.1


From 54708d2858e79a2bdda10bf8a20c80eb96c20613 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 6 Nov 2015 16:30:06 -0800
Subject: proc: actually make proc_fd_permission() thread-friendly

The commit 96d0df79f264 ("proc: make proc_fd_permission() thread-friendly")
fixed the access to /proc/self/fd from sub-threads, but introduced another
problem: a sub-thread can't access /proc/<tid>/fd/ or /proc/thread-self/fd
if generic_permission() fails.

Change proc_fd_permission() to check same_thread_group(pid_task(), current).

Fixes: 96d0df79f264 ("proc: make proc_fd_permission() thread-friendly")
Reported-by: "Jin, Yihua" <yihua.jin@intel.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/fd.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 6e5fcd0..3c2a915 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -291,11 +291,19 @@ static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry,
  */
 int proc_fd_permission(struct inode *inode, int mask)
 {
-	int rv = generic_permission(inode, mask);
+	struct task_struct *p;
+	int rv;
+
+	rv = generic_permission(inode, mask);
 	if (rv == 0)
-		return 0;
-	if (task_tgid(current) == proc_pid(inode))
+		return rv;
+
+	rcu_read_lock();
+	p = pid_task(proc_pid(inode), PIDTYPE_PID);
+	if (p && same_thread_group(p, current))
 		rv = 0;
+	rcu_read_unlock();
+
 	return rv;
 }
 
-- 
cgit v1.1


From eac44a5e07be41a153e52c35c4d7dc0fec23adb3 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Fri, 6 Nov 2015 16:30:12 -0800
Subject: fs/jffs2/wbuf.c: remove stray semicolon

Reported-by: Wu Fengguang <fengguang.wu@intel.com>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/jffs2/wbuf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 09ed551..955da62 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -1264,7 +1264,7 @@ int jffs2_dataflash_setup(struct jffs2_sb_info *c) {
 	if ((c->flash_size % c->sector_size) != 0) {
 		c->flash_size = (c->flash_size / c->sector_size) * c->sector_size;
 		pr_warn("flash size adjusted to %dKiB\n", c->flash_size);
-	};
+	}
 
 	c->wbuf_ofs = 0xFFFFFFFF;
 	c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
-- 
cgit v1.1


From da80a39fc962ceca085ddfb7d63e00309b305f17 Mon Sep 17 00:00:00 2001
From: Julia Lawall <Julia.Lawall@lip6.fr>
Date: Fri, 6 Nov 2015 16:31:40 -0800
Subject: nilfs2: drop null test before destroy functions

Remove unneeded NULL test.

The semantic patch that makes this change is as follows:
(http://coccinelle.lip6.fr/)

// <smpl>
@@ expression x; @@
-if (x != NULL)
  \(kmem_cache_destroy\|mempool_destroy\|dma_pool_destroy\)(x);
// </smpl>

Signed-off-by: Julia Lawall <Julia.Lawall@lip6.fr>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nilfs2/super.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index f47585b..c69455a 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1405,14 +1405,10 @@ static void nilfs_destroy_cachep(void)
 	 */
 	rcu_barrier();
 
-	if (nilfs_inode_cachep)
-		kmem_cache_destroy(nilfs_inode_cachep);
-	if (nilfs_transaction_cachep)
-		kmem_cache_destroy(nilfs_transaction_cachep);
-	if (nilfs_segbuf_cachep)
-		kmem_cache_destroy(nilfs_segbuf_cachep);
-	if (nilfs_btree_path_cache)
-		kmem_cache_destroy(nilfs_btree_path_cache);
+	kmem_cache_destroy(nilfs_inode_cachep);
+	kmem_cache_destroy(nilfs_transaction_cachep);
+	kmem_cache_destroy(nilfs_segbuf_cachep);
+	kmem_cache_destroy(nilfs_btree_path_cache);
 }
 
 static int __init nilfs_init_cachep(void)
-- 
cgit v1.1


From b7bed712d090c340b97d455c5cb62d151e004503 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Fri, 6 Nov 2015 16:31:43 -0800
Subject: nilfs2: use nilfs_warning() in allocator implementation

This uses nilfs_warning() to replace "printk(KERN_WARNING ...);" in the
bitmap based allocator implementation of nilfs2.  The warning messages are
modified to include the device name and the inode number in each message.
This makes it clear which metadata file of which device has output
warnings such as "entry number xxxx already freed".

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nilfs2/alloc.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 8df0f3b..afe98364 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -583,8 +583,10 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
 
 	if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
 				    group_offset, bitmap))
-		printk(KERN_WARNING "%s: entry number %llu already freed\n",
-		       __func__, (unsigned long long)req->pr_entry_nr);
+		nilfs_warning(inode->i_sb, __func__,
+			      "entry number %llu already freed: ino=%lu\n",
+			      (unsigned long long)req->pr_entry_nr,
+			      (unsigned long)inode->i_ino);
 	else
 		nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
 
@@ -620,8 +622,10 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
 	bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
 	if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
 				    group_offset, bitmap))
-		printk(KERN_WARNING "%s: entry number %llu already freed\n",
-		       __func__, (unsigned long long)req->pr_entry_nr);
+		nilfs_warning(inode->i_sb, __func__,
+			      "entry number %llu already freed: ino=%lu\n",
+			      (unsigned long long)req->pr_entry_nr,
+			      (unsigned long)inode->i_ino);
 	else
 		nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
 
@@ -734,10 +738,10 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
 			if (!nilfs_clear_bit_atomic(
 				    nilfs_mdt_bgl_lock(inode, group),
 				    group_offset, bitmap)) {
-				printk(KERN_WARNING
-				       "%s: entry number %llu already freed\n",
-				       __func__,
-				       (unsigned long long)entry_nrs[j]);
+				nilfs_warning(inode->i_sb, __func__,
+					      "entry number %llu already freed: ino=%lu\n",
+					      (unsigned long long)entry_nrs[j],
+					      (unsigned long)inode->i_ino);
 			} else {
 				n++;
 			}
-- 
cgit v1.1


From 4e9e63a671fbe13f448fb2e69dfdbb6c2a008368 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Fri, 6 Nov 2015 16:31:45 -0800
Subject: nilfs2: do not call nilfs_mdt_bgl_lock() needlessly

In the bitmap based allocator implementation, nilfs_mdt_bgl_lock() helper
is frequently used to get a spinlock protecting a target block group.
This reduces its usage and simplifies arguments of some related functions
by directly passing a pointer to the spinlock.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nilfs2/alloc.c | 84 ++++++++++++++++++++++++++-----------------------------
 1 file changed, 40 insertions(+), 44 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index afe98364..ff0d62c 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -133,38 +133,34 @@ nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group)
 
 /**
  * nilfs_palloc_group_desc_nfrees - get the number of free entries in a group
- * @inode: inode of metadata file using this allocator
- * @group: group number
  * @desc: pointer to descriptor structure for the group
+ * @lock: spin lock protecting @desc
  */
 static unsigned long
-nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group,
-			       const struct nilfs_palloc_group_desc *desc)
+nilfs_palloc_group_desc_nfrees(const struct nilfs_palloc_group_desc *desc,
+			       spinlock_t *lock)
 {
 	unsigned long nfree;
 
-	spin_lock(nilfs_mdt_bgl_lock(inode, group));
+	spin_lock(lock);
 	nfree = le32_to_cpu(desc->pg_nfrees);
-	spin_unlock(nilfs_mdt_bgl_lock(inode, group));
+	spin_unlock(lock);
 	return nfree;
 }
 
 /**
  * nilfs_palloc_group_desc_add_entries - adjust count of free entries
- * @inode: inode of metadata file using this allocator
- * @group: group number
  * @desc: pointer to descriptor structure for the group
+ * @lock: spin lock protecting @desc
  * @n: delta to be added
  */
 static void
-nilfs_palloc_group_desc_add_entries(struct inode *inode,
-				    unsigned long group,
-				    struct nilfs_palloc_group_desc *desc,
-				    u32 n)
+nilfs_palloc_group_desc_add_entries(struct nilfs_palloc_group_desc *desc,
+				    spinlock_t *lock, u32 n)
 {
-	spin_lock(nilfs_mdt_bgl_lock(inode, group));
+	spin_lock(lock);
 	le32_add_cpu(&desc->pg_nfrees, n);
-	spin_unlock(nilfs_mdt_bgl_lock(inode, group));
+	spin_unlock(lock);
 }
 
 /**
@@ -332,17 +328,15 @@ void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
 
 /**
  * nilfs_palloc_find_available_slot - find available slot in a group
- * @inode: inode of metadata file using this allocator
- * @group: group number
- * @target: offset number of an entry in the group (start point)
  * @bitmap: bitmap of the group
+ * @target: offset number of an entry in the group (start point)
  * @bsize: size in bits
+ * @lock: spin lock protecting @bitmap
  */
-static int nilfs_palloc_find_available_slot(struct inode *inode,
-					    unsigned long group,
+static int nilfs_palloc_find_available_slot(unsigned char *bitmap,
 					    unsigned long target,
-					    unsigned char *bitmap,
-					    int bsize)
+					    int bsize,
+					    spinlock_t *lock)
 {
 	int curr, pos, end, i;
 
@@ -351,12 +345,11 @@ static int nilfs_palloc_find_available_slot(struct inode *inode,
 		if (end > bsize)
 			end = bsize;
 		pos = nilfs_find_next_zero_bit(bitmap, end, target);
-		if (pos < end &&
-		    !nilfs_set_bit_atomic(
-			    nilfs_mdt_bgl_lock(inode, group), pos, bitmap))
+		if (pos < end && !nilfs_set_bit_atomic(lock, pos, bitmap))
 			return pos;
-	} else
+	} else {
 		end = 0;
+	}
 
 	for (i = 0, curr = end;
 	     i < bsize;
@@ -370,10 +363,8 @@ static int nilfs_palloc_find_available_slot(struct inode *inode,
 			if (end > bsize)
 				end = bsize;
 			pos = nilfs_find_next_zero_bit(bitmap, end, curr);
-			if ((pos < end) &&
-			    !nilfs_set_bit_atomic(
-				    nilfs_mdt_bgl_lock(inode, group), pos,
-				    bitmap))
+			if (pos < end &&
+			    !nilfs_set_bit_atomic(lock, pos, bitmap))
 				return pos;
 		}
 	}
@@ -477,6 +468,7 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
 	unsigned long group_offset, maxgroup_offset;
 	unsigned long n, entries_per_group, groups_per_desc_block;
 	unsigned long i, j;
+	spinlock_t *lock;
 	int pos, ret;
 
 	ngroups = nilfs_palloc_groups_count(inode);
@@ -501,8 +493,8 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
 		n = nilfs_palloc_rest_groups_in_desc_block(inode, group,
 							   maxgroup);
 		for (j = 0; j < n; j++, desc++, group++) {
-			if (nilfs_palloc_group_desc_nfrees(inode, group, desc)
-			    > 0) {
+			lock = nilfs_mdt_bgl_lock(inode, group);
+			if (nilfs_palloc_group_desc_nfrees(desc, lock) > 0) {
 				ret = nilfs_palloc_get_bitmap_block(
 					inode, group, 1, &bitmap_bh);
 				if (ret < 0)
@@ -510,12 +502,12 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
 				bitmap_kaddr = kmap(bitmap_bh->b_page);
 				bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
 				pos = nilfs_palloc_find_available_slot(
-					inode, group, group_offset, bitmap,
-					entries_per_group);
+					bitmap, group_offset,
+					entries_per_group, lock);
 				if (pos >= 0) {
 					/* found a free entry */
 					nilfs_palloc_group_desc_add_entries(
-						inode, group, desc, -1);
+						desc, lock, -1);
 					req->pr_entry_nr =
 						entries_per_group * group + pos;
 					kunmap(desc_bh->b_page);
@@ -573,6 +565,7 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
 	unsigned long group, group_offset;
 	unsigned char *bitmap;
 	void *desc_kaddr, *bitmap_kaddr;
+	spinlock_t *lock;
 
 	group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
 	desc_kaddr = kmap(req->pr_desc_bh->b_page);
@@ -580,15 +573,15 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
 						 req->pr_desc_bh, desc_kaddr);
 	bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
 	bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
+	lock = nilfs_mdt_bgl_lock(inode, group);
 
-	if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
-				    group_offset, bitmap))
+	if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap))
 		nilfs_warning(inode->i_sb, __func__,
 			      "entry number %llu already freed: ino=%lu\n",
 			      (unsigned long long)req->pr_entry_nr,
 			      (unsigned long)inode->i_ino);
 	else
-		nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
+		nilfs_palloc_group_desc_add_entries(desc, lock, 1);
 
 	kunmap(req->pr_bitmap_bh->b_page);
 	kunmap(req->pr_desc_bh->b_page);
@@ -613,6 +606,7 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
 	void *desc_kaddr, *bitmap_kaddr;
 	unsigned char *bitmap;
 	unsigned long group, group_offset;
+	spinlock_t *lock;
 
 	group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
 	desc_kaddr = kmap(req->pr_desc_bh->b_page);
@@ -620,14 +614,15 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
 						 req->pr_desc_bh, desc_kaddr);
 	bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
 	bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
-	if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
-				    group_offset, bitmap))
+	lock = nilfs_mdt_bgl_lock(inode, group);
+
+	if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap))
 		nilfs_warning(inode->i_sb, __func__,
 			      "entry number %llu already freed: ino=%lu\n",
 			      (unsigned long long)req->pr_entry_nr,
 			      (unsigned long)inode->i_ino);
 	else
-		nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
+		nilfs_palloc_group_desc_add_entries(desc, lock, 1);
 
 	kunmap(req->pr_bitmap_bh->b_page);
 	kunmap(req->pr_desc_bh->b_page);
@@ -712,6 +707,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
 	unsigned char *bitmap;
 	void *desc_kaddr, *bitmap_kaddr;
 	unsigned long group, group_offset;
+	spinlock_t *lock;
 	int i, j, n, ret;
 
 	for (i = 0; i < nitems; i = j) {
@@ -730,14 +726,14 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
 			inode, group, desc_bh, desc_kaddr);
 		bitmap_kaddr = kmap(bitmap_bh->b_page);
 		bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
+		lock = nilfs_mdt_bgl_lock(inode, group);
 		for (j = i, n = 0;
 		     (j < nitems) && nilfs_palloc_group_is_in(inode, group,
 							      entry_nrs[j]);
 		     j++) {
 			nilfs_palloc_group(inode, entry_nrs[j], &group_offset);
-			if (!nilfs_clear_bit_atomic(
-				    nilfs_mdt_bgl_lock(inode, group),
-				    group_offset, bitmap)) {
+			if (!nilfs_clear_bit_atomic(lock, group_offset,
+						    bitmap)) {
 				nilfs_warning(inode->i_sb, __func__,
 					      "entry number %llu already freed: ino=%lu\n",
 					      (unsigned long long)entry_nrs[j],
@@ -746,7 +742,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
 				n++;
 			}
 		}
-		nilfs_palloc_group_desc_add_entries(inode, group, desc, n);
+		nilfs_palloc_group_desc_add_entries(desc, lock, n);
 
 		kunmap(bitmap_bh->b_page);
 		kunmap(desc_bh->b_page);
-- 
cgit v1.1


From 18c41b37f0f16a0d6e5b1a73563d0c1333e7ef70 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Fri, 6 Nov 2015 16:31:48 -0800
Subject: nilfs2: refactor nilfs_palloc_find_available_slot()

The current implementation of nilfs_palloc_find_available_slot() function
is overkill.  The underlying bit search routine is well optimized, so this
uses it more simply in nilfs_palloc_find_available_slot().

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nilfs2/alloc.c | 48 +++++++++++++++++++++---------------------------
 1 file changed, 21 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index ff0d62c..b15daf8 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -335,39 +335,33 @@ void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
  */
 static int nilfs_palloc_find_available_slot(unsigned char *bitmap,
 					    unsigned long target,
-					    int bsize,
+					    unsigned bsize,
 					    spinlock_t *lock)
 {
-	int curr, pos, end, i;
+	int pos, end = bsize;
 
-	if (target > 0) {
-		end = (target + BITS_PER_LONG - 1) & ~(BITS_PER_LONG - 1);
-		if (end > bsize)
-			end = bsize;
-		pos = nilfs_find_next_zero_bit(bitmap, end, target);
-		if (pos < end && !nilfs_set_bit_atomic(lock, pos, bitmap))
-			return pos;
-	} else {
-		end = 0;
+	if (likely(target < bsize)) {
+		pos = target;
+		do {
+			pos = nilfs_find_next_zero_bit(bitmap, end, pos);
+			if (pos >= end)
+				break;
+			if (!nilfs_set_bit_atomic(lock, pos, bitmap))
+				return pos;
+		} while (++pos < end);
+
+		end = target;
 	}
 
-	for (i = 0, curr = end;
-	     i < bsize;
-	     i += BITS_PER_LONG, curr += BITS_PER_LONG) {
-		/* wrap around */
-		if (curr >= bsize)
-			curr = 0;
-		while (*((unsigned long *)bitmap + curr / BITS_PER_LONG)
-		       != ~0UL) {
-			end = curr + BITS_PER_LONG;
-			if (end > bsize)
-				end = bsize;
-			pos = nilfs_find_next_zero_bit(bitmap, end, curr);
-			if (pos < end &&
-			    !nilfs_set_bit_atomic(lock, pos, bitmap))
-				return pos;
-		}
+	/* wrap around */
+	for (pos = 0; pos < end; pos++) {
+		pos = nilfs_find_next_zero_bit(bitmap, end, pos);
+		if (pos >= end)
+			break;
+		if (!nilfs_set_bit_atomic(lock, pos, bitmap))
+			return pos;
 	}
+
 	return -ENOSPC;
 }
 
-- 
cgit v1.1


From b22580948c39d71fb150c1d53148a381011dd109 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Fri, 6 Nov 2015 16:31:51 -0800
Subject: nilfs2: get rid of nilfs_palloc_group_is_in()

This unfolds nilfs_palloc_group_is_in() helper function into
nilfs_palloc_freev() function to simplify a range check and an index
calculation repeatedy performed in a loop of the function.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nilfs2/alloc.c | 28 +++++++++-------------------
 1 file changed, 9 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index b15daf8..5b7ee36 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -673,22 +673,6 @@ void nilfs_palloc_abort_free_entry(struct inode *inode,
 }
 
 /**
- * nilfs_palloc_group_is_in - judge if an entry is in a group
- * @inode: inode of metadata file using this allocator
- * @group: group number
- * @nr: serial number of the entry (e.g. inode number)
- */
-static int
-nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr)
-{
-	__u64 first, last;
-
-	first = group * nilfs_palloc_entries_per_group(inode);
-	last = first + nilfs_palloc_entries_per_group(inode) - 1;
-	return (nr >= first) && (nr <= last);
-}
-
-/**
  * nilfs_palloc_freev - deallocate a set of persistent objects
  * @inode: inode of metadata file using this allocator
  * @entry_nrs: array of entry numbers to be deallocated
@@ -701,6 +685,8 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
 	unsigned char *bitmap;
 	void *desc_kaddr, *bitmap_kaddr;
 	unsigned long group, group_offset;
+	__u64 group_min_nr;
+	const unsigned long epg = nilfs_palloc_entries_per_group(inode);
 	spinlock_t *lock;
 	int i, j, n, ret;
 
@@ -715,6 +701,10 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
 			brelse(desc_bh);
 			return ret;
 		}
+
+		/* Get the first entry number of the group */
+		group_min_nr = (__u64)group * epg;
+
 		desc_kaddr = kmap(desc_bh->b_page);
 		desc = nilfs_palloc_block_get_group_desc(
 			inode, group, desc_bh, desc_kaddr);
@@ -722,10 +712,10 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
 		bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
 		lock = nilfs_mdt_bgl_lock(inode, group);
 		for (j = i, n = 0;
-		     (j < nitems) && nilfs_palloc_group_is_in(inode, group,
-							      entry_nrs[j]);
+		     j < nitems && entry_nrs[j] >= group_min_nr &&
+			     entry_nrs[j] < group_min_nr + epg;
 		     j++) {
-			nilfs_palloc_group(inode, entry_nrs[j], &group_offset);
+			group_offset = entry_nrs[j] - group_min_nr;
 			if (!nilfs_clear_bit_atomic(lock, group_offset,
 						    bitmap)) {
 				nilfs_warning(inode->i_sb, __func__,
-- 
cgit v1.1


From da019954dd821682d6b2a8330c9c90acb943c456 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Fri, 6 Nov 2015 16:31:54 -0800
Subject: nilfs2: add helper functions to delete blocks from dat file

This adds delete functions for data blocks of metadata files using bitmap
based allocator.  nilfs_palloc_delete_entry_block() deletes an entry block
(e.g.  block storing dat entries), and nilfs_palloc_delete_bitmap_block()
deletes a bitmap block, respectively.

These helpers are intended to be used in the successive change on
deallocator of block addresses ("nilfs2: free unused dat file blocks
during garbage collection").

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nilfs2/alloc.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

(limited to 'fs')

diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 5b7ee36..225b797 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -236,6 +236,26 @@ static int nilfs_palloc_get_block(struct inode *inode, unsigned long blkoff,
 }
 
 /**
+ * nilfs_palloc_delete_block - delete a block on the persistent allocator file
+ * @inode: inode of metadata file using this allocator
+ * @blkoff: block offset
+ * @prev: nilfs_bh_assoc struct of the last used buffer
+ * @lock: spin lock protecting @prev
+ */
+static int nilfs_palloc_delete_block(struct inode *inode, unsigned long blkoff,
+				     struct nilfs_bh_assoc *prev,
+				     spinlock_t *lock)
+{
+	spin_lock(lock);
+	if (prev->bh && blkoff == prev->blkoff) {
+		brelse(prev->bh);
+		prev->bh = NULL;
+	}
+	spin_unlock(lock);
+	return nilfs_mdt_delete_block(inode, blkoff);
+}
+
+/**
  * nilfs_palloc_get_desc_block - get buffer head of a group descriptor block
  * @inode: inode of metadata file using this allocator
  * @group: group number
@@ -274,6 +294,22 @@ static int nilfs_palloc_get_bitmap_block(struct inode *inode,
 }
 
 /**
+ * nilfs_palloc_delete_bitmap_block - delete a bitmap block
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ */
+static int nilfs_palloc_delete_bitmap_block(struct inode *inode,
+					    unsigned long group)
+{
+	struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
+
+	return nilfs_palloc_delete_block(inode,
+					 nilfs_palloc_bitmap_blkoff(inode,
+								    group),
+					 &cache->prev_bitmap, &cache->lock);
+}
+
+/**
  * nilfs_palloc_get_entry_block - get buffer head of an entry block
  * @inode: inode of metadata file using this allocator
  * @nr: serial number of the entry (e.g. inode number)
@@ -292,6 +328,20 @@ int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr,
 }
 
 /**
+ * nilfs_palloc_delete_entry_block - delete an entry block
+ * @inode: inode of metadata file using this allocator
+ * @nr: serial number of the entry
+ */
+static int nilfs_palloc_delete_entry_block(struct inode *inode, __u64 nr)
+{
+	struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
+
+	return nilfs_palloc_delete_block(inode,
+					 nilfs_palloc_entry_blkoff(inode, nr),
+					 &cache->prev_entry, &cache->lock);
+}
+
+/**
  * nilfs_palloc_block_get_group_desc - get kernel address of a group descriptor
  * @inode: inode of metadata file using this allocator
  * @group: group number
-- 
cgit v1.1


From d0c14a9ee79467cd6a04b281577e1e6f74806ab2 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Fri, 6 Nov 2015 16:31:56 -0800
Subject: nilfs2: free unused dat file blocks during garbage collection

As a nilfs2 volume ages, the amount of available disk space decreases
little by little due to bloat of DAT (disk address translation) metadata
file.  Even if we delete all files in a file system and free their block
addresses from the DAT file through a garbage collection, empty DAT blocks
are not freed.

This fixes the issue by extending the deallocator of block addresses so
that empty data blocks and empty bitmap blocks of DAT are deleted.

The following comparison shows the effect of this patch.  Each shows disk
amount information of a nilfs2 volume that we cleaned out by deleting all
files and running gc after having filled 90% of its capacity.

Before:
Filesystem     1K-blocks     Used Available Use% Mounted on
/dev/sda1      500105212  3022844 472072192   1% /test

After:
Filesystem     1K-blocks     Used Available Use% Mounted on
/dev/sda1      500105212    16380 475078656   1% /test

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nilfs2/alloc.c | 91 ++++++++++++++++++++++++++++++++++++++++++++-----------
 fs/nilfs2/alloc.h |  1 +
 2 files changed, 75 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 225b797..b335a32 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -154,13 +154,17 @@ nilfs_palloc_group_desc_nfrees(const struct nilfs_palloc_group_desc *desc,
  * @lock: spin lock protecting @desc
  * @n: delta to be added
  */
-static void
+static u32
 nilfs_palloc_group_desc_add_entries(struct nilfs_palloc_group_desc *desc,
 				    spinlock_t *lock, u32 n)
 {
+	u32 nfree;
+
 	spin_lock(lock);
 	le32_add_cpu(&desc->pg_nfrees, n);
+	nfree = le32_to_cpu(desc->pg_nfrees);
 	spin_unlock(lock);
+	return nfree;
 }
 
 /**
@@ -735,12 +739,18 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
 	unsigned char *bitmap;
 	void *desc_kaddr, *bitmap_kaddr;
 	unsigned long group, group_offset;
-	__u64 group_min_nr;
+	__u64 group_min_nr, last_nrs[8];
 	const unsigned long epg = nilfs_palloc_entries_per_group(inode);
+	const unsigned epb = NILFS_MDT(inode)->mi_entries_per_block;
+	unsigned entry_start, end, pos;
 	spinlock_t *lock;
-	int i, j, n, ret;
+	int i, j, k, ret;
+	u32 nfree;
 
 	for (i = 0; i < nitems; i = j) {
+		int change_group = false;
+		int nempties = 0, n = 0;
+
 		group = nilfs_palloc_group(inode, entry_nrs[i], &group_offset);
 		ret = nilfs_palloc_get_desc_block(inode, group, 0, &desc_bh);
 		if (ret < 0)
@@ -755,17 +765,13 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
 		/* Get the first entry number of the group */
 		group_min_nr = (__u64)group * epg;
 
-		desc_kaddr = kmap(desc_bh->b_page);
-		desc = nilfs_palloc_block_get_group_desc(
-			inode, group, desc_bh, desc_kaddr);
 		bitmap_kaddr = kmap(bitmap_bh->b_page);
 		bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
 		lock = nilfs_mdt_bgl_lock(inode, group);
-		for (j = i, n = 0;
-		     j < nitems && entry_nrs[j] >= group_min_nr &&
-			     entry_nrs[j] < group_min_nr + epg;
-		     j++) {
-			group_offset = entry_nrs[j] - group_min_nr;
+
+		j = i;
+		entry_start = rounddown(group_offset, epb);
+		do {
 			if (!nilfs_clear_bit_atomic(lock, group_offset,
 						    bitmap)) {
 				nilfs_warning(inode->i_sb, __func__,
@@ -775,18 +781,69 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
 			} else {
 				n++;
 			}
-		}
-		nilfs_palloc_group_desc_add_entries(desc, lock, n);
+
+			j++;
+			if (j >= nitems || entry_nrs[j] < group_min_nr ||
+			    entry_nrs[j] >= group_min_nr + epg) {
+				change_group = true;
+			} else {
+				group_offset = entry_nrs[j] - group_min_nr;
+				if (group_offset >= entry_start &&
+				    group_offset < entry_start + epb) {
+					/* This entry is in the same block */
+					continue;
+				}
+			}
+
+			/* Test if the entry block is empty or not */
+			end = entry_start + epb;
+			pos = nilfs_find_next_bit(bitmap, end, entry_start);
+			if (pos >= end) {
+				last_nrs[nempties++] = entry_nrs[j - 1];
+				if (nempties >= ARRAY_SIZE(last_nrs))
+					break;
+			}
+
+			if (change_group)
+				break;
+
+			/* Go on to the next entry block */
+			entry_start = rounddown(group_offset, epb);
+		} while (true);
 
 		kunmap(bitmap_bh->b_page);
-		kunmap(desc_bh->b_page);
+		mark_buffer_dirty(bitmap_bh);
+		brelse(bitmap_bh);
 
+		for (k = 0; k < nempties; k++) {
+			ret = nilfs_palloc_delete_entry_block(inode,
+							      last_nrs[k]);
+			if (ret && ret != -ENOENT) {
+				nilfs_warning(inode->i_sb, __func__,
+					      "failed to delete block of entry %llu: ino=%lu, err=%d\n",
+					      (unsigned long long)last_nrs[k],
+					      (unsigned long)inode->i_ino, ret);
+			}
+		}
+
+		desc_kaddr = kmap_atomic(desc_bh->b_page);
+		desc = nilfs_palloc_block_get_group_desc(
+			inode, group, desc_bh, desc_kaddr);
+		nfree = nilfs_palloc_group_desc_add_entries(desc, lock, n);
+		kunmap_atomic(desc_kaddr);
 		mark_buffer_dirty(desc_bh);
-		mark_buffer_dirty(bitmap_bh);
 		nilfs_mdt_mark_dirty(inode);
-
-		brelse(bitmap_bh);
 		brelse(desc_bh);
+
+		if (nfree == nilfs_palloc_entries_per_group(inode)) {
+			ret = nilfs_palloc_delete_bitmap_block(inode, group);
+			if (ret && ret != -ENOENT) {
+				nilfs_warning(inode->i_sb, __func__,
+					      "failed to delete bitmap block of group %lu: ino=%lu, err=%d\n",
+					      group,
+					      (unsigned long)inode->i_ino, ret);
+			}
+		}
 	}
 	return 0;
 }
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index 4bd6451..6e6f49a 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -77,6 +77,7 @@ int nilfs_palloc_freev(struct inode *, __u64 *, size_t);
 #define nilfs_set_bit_atomic		ext2_set_bit_atomic
 #define nilfs_clear_bit_atomic		ext2_clear_bit_atomic
 #define nilfs_find_next_zero_bit	find_next_zero_bit_le
+#define nilfs_find_next_bit		find_next_bit_le
 
 /**
  * struct nilfs_bh_assoc - block offset and buffer head association
-- 
cgit v1.1


From 58497703837048ac501ce56056eb74b4361108fc Mon Sep 17 00:00:00 2001
From: Hitoshi Mitake <mitake.hitoshi@lab.ntt.co.jp>
Date: Fri, 6 Nov 2015 16:31:59 -0800
Subject: nilfs2: add a tracepoint for tracking stage transition of segment
 construction

This patch adds a tracepoint for tracking stage transition of block
collection in segment construction.  With the tracepoint, we can analysis
the behavior of segment construction in depth.  It would be useful for
bottleneck detection and debugging, etc.

The tracepoint is created with the standard trace API of linux (like ext3,
ext4, f2fs and btrfs).  So we can analysis with existing tools easily.  Of
course, more detailed analysis will be possible if we can create nilfs
specific analysis tools.

Below is an example of event dump with Brendan Gregg's perf-tools
(https://github.com/brendangregg/perf-tools).  Time consumption between
each stage can be obtained.

$ sudo bin/tpoint nilfs2:nilfs2_collection_stage_transition
Tracing nilfs2:nilfs2_collection_stage_transition. Ctrl-C to end.
        segctord-14875 [003] ...1 28311.067794: nilfs2_collection_stage_transition: sci = ffff8800ce6de000 stage = ST_INIT
        segctord-14875 [003] ...1 28311.068139: nilfs2_collection_stage_transition: sci = ffff8800ce6de000 stage = ST_GC
        segctord-14875 [003] ...1 28311.068139: nilfs2_collection_stage_transition: sci = ffff8800ce6de000 stage = ST_FILE
        segctord-14875 [003] ...1 28311.068486: nilfs2_collection_stage_transition: sci = ffff8800ce6de000 stage = ST_IFILE
        segctord-14875 [003] ...1 28311.068540: nilfs2_collection_stage_transition: sci = ffff8800ce6de000 stage = ST_CPFILE
        segctord-14875 [003] ...1 28311.068561: nilfs2_collection_stage_transition: sci = ffff8800ce6de000 stage = ST_SUFILE
        segctord-14875 [003] ...1 28311.068565: nilfs2_collection_stage_transition: sci = ffff8800ce6de000 stage = ST_DAT
        segctord-14875 [003] ...1 28311.068573: nilfs2_collection_stage_transition: sci = ffff8800ce6de000 stage = ST_SR
        segctord-14875 [003] ...1 28311.068574: nilfs2_collection_stage_transition: sci = ffff8800ce6de000 stage = ST_DONE

For capturing transition correctly, this patch adds wrappers for the
member scnt of nilfs_cstage.  With this change, every transition of the
stage can produce trace event in a correct manner.

Signed-off-by: Hitoshi Mitake <mitake.hitoshi@lab.ntt.co.jp>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nilfs2/segment.c | 71 ++++++++++++++++++++++++++++++++++++++---------------
 fs/nilfs2/segment.h |  3 ++-
 2 files changed, 53 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index c6abbad9..ef35404 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -77,6 +77,36 @@ enum {
 	NILFS_ST_DONE,
 };
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/nilfs2.h>
+
+/*
+ * nilfs_sc_cstage_inc(), nilfs_sc_cstage_set(), nilfs_sc_cstage_get() are
+ * wrapper functions of stage count (nilfs_sc_info->sc_stage.scnt). Users of
+ * the variable must use them because transition of stage count must involve
+ * trace events (trace_nilfs2_collection_stage_transition).
+ *
+ * nilfs_sc_cstage_get() isn't required for the above purpose because it doesn't
+ * produce tracepoint events. It is provided just for making the intention
+ * clear.
+ */
+static inline void nilfs_sc_cstage_inc(struct nilfs_sc_info *sci)
+{
+	sci->sc_stage.scnt++;
+	trace_nilfs2_collection_stage_transition(sci);
+}
+
+static inline void nilfs_sc_cstage_set(struct nilfs_sc_info *sci, int next_scnt)
+{
+	sci->sc_stage.scnt = next_scnt;
+	trace_nilfs2_collection_stage_transition(sci);
+}
+
+static inline int nilfs_sc_cstage_get(struct nilfs_sc_info *sci)
+{
+	return sci->sc_stage.scnt;
+}
+
 /* State flags of collection */
 #define NILFS_CF_NODE		0x0001	/* Collecting node blocks */
 #define NILFS_CF_IFILE_STARTED	0x0002	/* IFILE stage has started */
@@ -1062,7 +1092,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
 	size_t ndone;
 	int err = 0;
 
-	switch (sci->sc_stage.scnt) {
+	switch (nilfs_sc_cstage_get(sci)) {
 	case NILFS_ST_INIT:
 		/* Pre-processes */
 		sci->sc_stage.flags = 0;
@@ -1071,7 +1101,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
 			sci->sc_nblk_inc = 0;
 			sci->sc_curseg->sb_sum.flags = NILFS_SS_LOGBGN;
 			if (mode == SC_LSEG_DSYNC) {
-				sci->sc_stage.scnt = NILFS_ST_DSYNC;
+				nilfs_sc_cstage_set(sci, NILFS_ST_DSYNC);
 				goto dsync_mode;
 			}
 		}
@@ -1079,10 +1109,10 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
 		sci->sc_stage.dirty_file_ptr = NULL;
 		sci->sc_stage.gc_inode_ptr = NULL;
 		if (mode == SC_FLUSH_DAT) {
-			sci->sc_stage.scnt = NILFS_ST_DAT;
+			nilfs_sc_cstage_set(sci, NILFS_ST_DAT);
 			goto dat_stage;
 		}
-		sci->sc_stage.scnt++;  /* Fall through */
+		nilfs_sc_cstage_inc(sci);  /* Fall through */
 	case NILFS_ST_GC:
 		if (nilfs_doing_gc()) {
 			head = &sci->sc_gc_inodes;
@@ -1103,7 +1133,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
 			}
 			sci->sc_stage.gc_inode_ptr = NULL;
 		}
-		sci->sc_stage.scnt++;  /* Fall through */
+		nilfs_sc_cstage_inc(sci);  /* Fall through */
 	case NILFS_ST_FILE:
 		head = &sci->sc_dirty_files;
 		ii = list_prepare_entry(sci->sc_stage.dirty_file_ptr, head,
@@ -1125,10 +1155,10 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
 		}
 		sci->sc_stage.dirty_file_ptr = NULL;
 		if (mode == SC_FLUSH_FILE) {
-			sci->sc_stage.scnt = NILFS_ST_DONE;
+			nilfs_sc_cstage_set(sci, NILFS_ST_DONE);
 			return 0;
 		}
-		sci->sc_stage.scnt++;
+		nilfs_sc_cstage_inc(sci);
 		sci->sc_stage.flags |= NILFS_CF_IFILE_STARTED;
 		/* Fall through */
 	case NILFS_ST_IFILE:
@@ -1136,7 +1166,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
 					      &nilfs_sc_file_ops);
 		if (unlikely(err))
 			break;
-		sci->sc_stage.scnt++;
+		nilfs_sc_cstage_inc(sci);
 		/* Creating a checkpoint */
 		err = nilfs_segctor_create_checkpoint(sci);
 		if (unlikely(err))
@@ -1147,7 +1177,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
 					      &nilfs_sc_file_ops);
 		if (unlikely(err))
 			break;
-		sci->sc_stage.scnt++;  /* Fall through */
+		nilfs_sc_cstage_inc(sci);  /* Fall through */
 	case NILFS_ST_SUFILE:
 		err = nilfs_sufile_freev(nilfs->ns_sufile, sci->sc_freesegs,
 					 sci->sc_nfreesegs, &ndone);
@@ -1163,7 +1193,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
 					      &nilfs_sc_file_ops);
 		if (unlikely(err))
 			break;
-		sci->sc_stage.scnt++;  /* Fall through */
+		nilfs_sc_cstage_inc(sci);  /* Fall through */
 	case NILFS_ST_DAT:
  dat_stage:
 		err = nilfs_segctor_scan_file(sci, nilfs->ns_dat,
@@ -1171,10 +1201,10 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
 		if (unlikely(err))
 			break;
 		if (mode == SC_FLUSH_DAT) {
-			sci->sc_stage.scnt = NILFS_ST_DONE;
+			nilfs_sc_cstage_set(sci, NILFS_ST_DONE);
 			return 0;
 		}
-		sci->sc_stage.scnt++;  /* Fall through */
+		nilfs_sc_cstage_inc(sci);  /* Fall through */
 	case NILFS_ST_SR:
 		if (mode == SC_LSEG_SR) {
 			/* Appending a super root */
@@ -1184,7 +1214,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
 		}
 		/* End of a logical segment */
 		sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND;
-		sci->sc_stage.scnt = NILFS_ST_DONE;
+		nilfs_sc_cstage_set(sci, NILFS_ST_DONE);
 		return 0;
 	case NILFS_ST_DSYNC:
  dsync_mode:
@@ -1197,7 +1227,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
 		if (unlikely(err))
 			break;
 		sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND;
-		sci->sc_stage.scnt = NILFS_ST_DONE;
+		nilfs_sc_cstage_set(sci, NILFS_ST_DONE);
 		return 0;
 	case NILFS_ST_DONE:
 		return 0;
@@ -1442,7 +1472,8 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
 			goto failed;
 
 		/* The current segment is filled up */
-		if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE)
+		if (mode != SC_LSEG_SR ||
+		    nilfs_sc_cstage_get(sci) < NILFS_ST_CPFILE)
 			break;
 
 		nilfs_clear_logs(&sci->sc_segbufs);
@@ -1946,7 +1977,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 	struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
 	int err;
 
-	sci->sc_stage.scnt = NILFS_ST_INIT;
+	nilfs_sc_cstage_set(sci, NILFS_ST_INIT);
 	sci->sc_cno = nilfs->ns_cno;
 
 	err = nilfs_segctor_collect_dirty_files(sci, nilfs);
@@ -1974,7 +2005,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 			goto failed;
 
 		/* Avoid empty segment */
-		if (sci->sc_stage.scnt == NILFS_ST_DONE &&
+		if (nilfs_sc_cstage_get(sci) == NILFS_ST_DONE &&
 		    nilfs_segbuf_empty(sci->sc_curseg)) {
 			nilfs_segctor_abort_construction(sci, nilfs, 1);
 			goto out;
@@ -1988,7 +2019,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 			nilfs_segctor_fill_in_file_bmap(sci);
 
 		if (mode == SC_LSEG_SR &&
-		    sci->sc_stage.scnt >= NILFS_ST_CPFILE) {
+		    nilfs_sc_cstage_get(sci) >= NILFS_ST_CPFILE) {
 			err = nilfs_segctor_fill_in_checkpoint(sci);
 			if (unlikely(err))
 				goto failed_to_write;
@@ -2007,7 +2038,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 		if (unlikely(err))
 			goto failed_to_write;
 
-		if (sci->sc_stage.scnt == NILFS_ST_DONE ||
+		if (nilfs_sc_cstage_get(sci) == NILFS_ST_DONE ||
 		    nilfs->ns_blocksize_bits != PAGE_CACHE_SHIFT) {
 			/*
 			 * At this point, we avoid double buffering
@@ -2020,7 +2051,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 			if (err)
 				goto failed_to_write;
 		}
-	} while (sci->sc_stage.scnt != NILFS_ST_DONE);
+	} while (nilfs_sc_cstage_get(sci) != NILFS_ST_DONE);
 
  out:
 	nilfs_segctor_drop_written_files(sci, nilfs);
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index a48d6de..0408b9b 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -67,7 +67,8 @@ struct nilfs_recovery_info {
 
 /**
  * struct nilfs_cstage - Context of collection stage
- * @scnt: Stage count
+ * @scnt: Stage count, must be accessed via wrappers:
+ *        nilfs_sc_cstage_inc(), nilfs_sc_cstage_set(), nilfs_sc_cstage_get()
  * @flags: State flags
  * @dirty_file_ptr: Pointer on dirty_files list, or inode of a target file
  * @gc_inode_ptr: Pointer on the list of gc-inodes
-- 
cgit v1.1


From 44fda114601fa5edebeacecb265f09d802670bc0 Mon Sep 17 00:00:00 2001
From: Hitoshi Mitake <mitake.hitoshi@lab.ntt.co.jp>
Date: Fri, 6 Nov 2015 16:32:02 -0800
Subject: nilfs2: add a tracepoint for transaction events

This patch adds a tracepoint for transaction events of nilfs.  With the
tracepoint, these events can be tracked: begin, abort, commit, trylock,
lock, and unlock.  Basically, these events have corresponding functions
e.g.  begin event corresponds nilfs_transaction_begin().  The unlock event
is an exception.  It corresponds to the iteration in
nilfs_transaction_lock().

Only one tracepoint is introcued: nilfs2_transaction_transition.  The
above events are distinguished with newly introduced enum.  With this
tracepoint, we can analyse a critical section of segment constructoin.

Sample output by tpoint of perf-tools:
              cp-4457  [000] ...1    63.266220: nilfs2_transaction_transition: sb = ffff8802112b8800 ti = ffff8800bf5ccc58 count = 1 flags = 9 state = BEGIN
              cp-4457  [000] ...1    63.266221: nilfs2_transaction_transition: sb = ffff8802112b8800 ti = ffff8800bf5ccc58 count = 0 flags = 9 state = COMMIT
              cp-4457  [000] ...1    63.266221: nilfs2_transaction_transition: sb = ffff8802112b8800 ti = ffff8800bf5ccc58 count = 0 flags = 9 state = COMMIT
        segctord-4371  [001] ...1    68.261196: nilfs2_transaction_transition: sb = ffff8802112b8800 ti = ffff8800b889bdf8 count = 0 flags = 10 state = TRYLOCK
        segctord-4371  [001] ...1    68.261280: nilfs2_transaction_transition: sb = ffff8802112b8800 ti = ffff8800b889bdf8 count = 0 flags = 10 state = LOCK
        segctord-4371  [001] ...1    68.261877: nilfs2_transaction_transition: sb = ffff8802112b8800 ti = ffff8800b889bdf8 count = 1 flags = 10 state = BEGIN
        segctord-4371  [001] ...1    68.262116: nilfs2_transaction_transition: sb = ffff8802112b8800 ti = ffff8800b889bdf8 count = 0 flags = 18 state = COMMIT
        segctord-4371  [001] ...1    68.265032: nilfs2_transaction_transition: sb = ffff8802112b8800 ti = ffff8800b889bdf8 count = 0 flags = 18 state = UNLOCK
        segctord-4371  [001] ...1   132.376847: nilfs2_transaction_transition: sb = ffff8802112b8800 ti = ffff8800b889bdf8 count = 0 flags = 10 state = TRYLOCK

This patch also does trivial cleaning of comma usage in collection stage
transition event for consistent coding style.

Signed-off-by: Hitoshi Mitake <mitake.hitoshi@lab.ntt.co.jp>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nilfs2/segment.c | 33 ++++++++++++++++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index ef35404..3fc4732 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -214,11 +214,18 @@ int nilfs_transaction_begin(struct super_block *sb,
 {
 	struct the_nilfs *nilfs;
 	int ret = nilfs_prepare_segment_lock(ti);
+	struct nilfs_transaction_info *trace_ti;
 
 	if (unlikely(ret < 0))
 		return ret;
-	if (ret > 0)
+	if (ret > 0) {
+		trace_ti = current->journal_info;
+
+		trace_nilfs2_transaction_transition(sb, trace_ti,
+				    trace_ti->ti_count, trace_ti->ti_flags,
+				    TRACE_NILFS2_TRANSACTION_BEGIN);
 		return 0;
+	}
 
 	sb_start_intwrite(sb);
 
@@ -229,6 +236,11 @@ int nilfs_transaction_begin(struct super_block *sb,
 		ret = -ENOSPC;
 		goto failed;
 	}
+
+	trace_ti = current->journal_info;
+	trace_nilfs2_transaction_transition(sb, trace_ti, trace_ti->ti_count,
+					    trace_ti->ti_flags,
+					    TRACE_NILFS2_TRANSACTION_BEGIN);
 	return 0;
 
  failed:
@@ -261,6 +273,8 @@ int nilfs_transaction_commit(struct super_block *sb)
 	ti->ti_flags |= NILFS_TI_COMMIT;
 	if (ti->ti_count > 0) {
 		ti->ti_count--;
+		trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+			    ti->ti_flags, TRACE_NILFS2_TRANSACTION_COMMIT);
 		return 0;
 	}
 	if (nilfs->ns_writer) {
@@ -272,6 +286,9 @@ int nilfs_transaction_commit(struct super_block *sb)
 			nilfs_segctor_do_flush(sci, 0);
 	}
 	up_read(&nilfs->ns_segctor_sem);
+	trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+			    ti->ti_flags, TRACE_NILFS2_TRANSACTION_COMMIT);
+
 	current->journal_info = ti->ti_save;
 
 	if (ti->ti_flags & NILFS_TI_SYNC)
@@ -290,10 +307,15 @@ void nilfs_transaction_abort(struct super_block *sb)
 	BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
 	if (ti->ti_count > 0) {
 		ti->ti_count--;
+		trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+			    ti->ti_flags, TRACE_NILFS2_TRANSACTION_ABORT);
 		return;
 	}
 	up_read(&nilfs->ns_segctor_sem);
 
+	trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+		    ti->ti_flags, TRACE_NILFS2_TRANSACTION_ABORT);
+
 	current->journal_info = ti->ti_save;
 	if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
 		kmem_cache_free(nilfs_transaction_cachep, ti);
@@ -339,6 +361,9 @@ static void nilfs_transaction_lock(struct super_block *sb,
 	current->journal_info = ti;
 
 	for (;;) {
+		trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+			    ti->ti_flags, TRACE_NILFS2_TRANSACTION_TRYLOCK);
+
 		down_write(&nilfs->ns_segctor_sem);
 		if (!test_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags))
 			break;
@@ -350,6 +375,9 @@ static void nilfs_transaction_lock(struct super_block *sb,
 	}
 	if (gcflag)
 		ti->ti_flags |= NILFS_TI_GC;
+
+	trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+			    ti->ti_flags, TRACE_NILFS2_TRANSACTION_LOCK);
 }
 
 static void nilfs_transaction_unlock(struct super_block *sb)
@@ -362,6 +390,9 @@ static void nilfs_transaction_unlock(struct super_block *sb)
 
 	up_write(&nilfs->ns_segctor_sem);
 	current->journal_info = ti->ti_save;
+
+	trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+			    ti->ti_flags, TRACE_NILFS2_TRANSACTION_UNLOCK);
 }
 
 static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci,
-- 
cgit v1.1


From 83eec5e6dd10f0b1ab83ee660c8be883b3da7ba8 Mon Sep 17 00:00:00 2001
From: Hitoshi Mitake <mitake.hitoshi@lab.ntt.co.jp>
Date: Fri, 6 Nov 2015 16:32:05 -0800
Subject: nilfs2: add tracepoints for analyzing sufile manipulation

This patch adds tracepoints which would be useful for analyzing segment
usage from a perspective of high level sufile manipulation (check, alloc,
free).  sufile is an important in-place updated metadata file, so
analyzing the behavior would be useful for performance turning.

example of usage (a case of allocation):

$ sudo bin/tpoint nilfs2:nilfs2_segment_usage_allocated
Tracing nilfs2:nilfs2_segment_usage_allocated. Ctrl-C to end.
        segctord-17800 [002] ...1 10671.867294: nilfs2_segment_usage_allocated: sufile = ffff880054f908a8 segnum = 2
        segctord-17800 [002] ...1 10675.073477: nilfs2_segment_usage_allocated: sufile = ffff880054f908a8 segnum = 3

Signed-off-by: Hitoshi Mitake <mitake.hitoshi@lab.ntt.co.jp>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Benixon Dhas <benixon.dhas@wdc.com>
Cc: TK Kato <TK.Kato@wdc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nilfs2/sufile.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'fs')

diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 2a869c3..7ff8f15 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -30,6 +30,8 @@
 #include "mdt.h"
 #include "sufile.h"
 
+#include <trace/events/nilfs2.h>
+
 /**
  * struct nilfs_sufile_info - on-memory private data of sufile
  * @mi: on-memory private data of metadata file
@@ -358,6 +360,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
 				break; /* never happens */
 			}
 		}
+		trace_nilfs2_segment_usage_check(sufile, segnum, cnt);
 		ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1,
 							   &su_bh);
 		if (ret < 0)
@@ -388,6 +391,9 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
 			nilfs_mdt_mark_dirty(sufile);
 			brelse(su_bh);
 			*segnump = segnum;
+
+			trace_nilfs2_segment_usage_allocated(sufile, segnum);
+
 			goto out_header;
 		}
 
@@ -490,6 +496,8 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum,
 	NILFS_SUI(sufile)->ncleansegs++;
 
 	nilfs_mdt_mark_dirty(sufile);
+
+	trace_nilfs2_segment_usage_freed(sufile, segnum);
 }
 
 /**
-- 
cgit v1.1


From a9cd207c23ca4fa5bd5f1092e867e87542e349a3 Mon Sep 17 00:00:00 2001
From: Hitoshi Mitake <mitake.hitoshi@lab.ntt.co.jp>
Date: Fri, 6 Nov 2015 16:32:08 -0800
Subject: nilfs2: add tracepoints for analyzing reading and writing metadata
 files

This patch adds tracepoints for analyzing requests of reading and writing
metadata files.  The tracepoints cover every in-place mdt files (cpfile,
sufile, and datfile).

Example of tracing mdt_insert_new_block():
              cp-14635 [000] ...1 30598.199309: nilfs2_mdt_insert_new_block: inode = ffff88022a8d0178 ino = 3 block = 155
              cp-14635 [000] ...1 30598.199520: nilfs2_mdt_insert_new_block: inode = ffff88022a8d0178 ino = 3 block = 5
              cp-14635 [000] ...1 30598.200828: nilfs2_mdt_insert_new_block: inode = ffff88022a8d0178 ino = 3 block = 253

Signed-off-by: Hitoshi Mitake <mitake.hitoshi@lab.ntt.co.jp>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: TK Kato <TK.Kato@wdc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nilfs2/mdt.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index dee34d9..1125f40 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -33,6 +33,7 @@
 #include "page.h"
 #include "mdt.h"
 
+#include <trace/events/nilfs2.h>
 
 #define NILFS_MDT_MAX_RA_BLOCKS		(16 - 1)
 
@@ -68,6 +69,9 @@ nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block,
 	set_buffer_uptodate(bh);
 	mark_buffer_dirty(bh);
 	nilfs_mdt_mark_dirty(inode);
+
+	trace_nilfs2_mdt_insert_new_block(inode, inode->i_ino, block);
+
 	return 0;
 }
 
@@ -158,6 +162,8 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
 	get_bh(bh);
 	submit_bh(mode, bh);
 	ret = 0;
+
+	trace_nilfs2_mdt_submit_block(inode, inode->i_ino, blkoff, mode);
  out:
 	get_bh(bh);
 	*out_bh = bh;
-- 
cgit v1.1


From 09ef29e0f6ac9f08ba4cc501ab4a3c33be526343 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Fri, 6 Nov 2015 16:32:14 -0800
Subject: nilfs2: fix gcc unused-but-set-variable warnings

Fix the following build warnings:

 $ make W=1
 [...]
   CC [M]  fs/nilfs2/btree.o
 fs/nilfs2/btree.c: In function 'nilfs_btree_split':
 fs/nilfs2/btree.c:923:8: warning: variable 'newptr' set but not used [-Wunused-but-set-variable]
   __u64 newptr;
         ^
 fs/nilfs2/btree.c:922:8: warning: variable 'newkey' set but not used [-Wunused-but-set-variable]
   __u64 newkey;
         ^
   CC [M]  fs/nilfs2/dat.o
 fs/nilfs2/dat.c: In function 'nilfs_dat_prepare_end':
 fs/nilfs2/dat.c:158:8: warning: variable 'start' set but not used [-Wunused-but-set-variable]
   __u64 start;
         ^
   CC [M]  fs/nilfs2/segment.o
 fs/nilfs2/segment.c: In function 'nilfs_segctor_do_immediate_flush':
 fs/nilfs2/segment.c:2433:6: warning: variable 'err' set but not used [-Wunused-but-set-variable]
   int err;
       ^
   CC [M]  fs/nilfs2/sufile.o
 fs/nilfs2/sufile.c: In function 'nilfs_sufile_alloc':
 fs/nilfs2/sufile.c:320:27: warning: variable 'ncleansegs' set but not used [-Wunused-but-set-variable]
   unsigned long nsegments, ncleansegs, nsus, cnt;
                            ^
   CC [M]  fs/nilfs2/alloc.o
 fs/nilfs2/alloc.c: In function 'nilfs_palloc_prepare_alloc_entry':
 fs/nilfs2/alloc.c:478:38: warning: variable 'groups_per_desc_block' set but not used [-Wunused-but-set-variable]
   unsigned long n, entries_per_group, groups_per_desc_block;
                                       ^

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nilfs2/alloc.c   | 3 +--
 fs/nilfs2/btree.c   | 5 -----
 fs/nilfs2/dat.c     | 2 --
 fs/nilfs2/segment.c | 3 +--
 fs/nilfs2/sufile.c  | 3 +--
 5 files changed, 3 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index b335a32..2ccbf55 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -514,7 +514,7 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
 	void *desc_kaddr, *bitmap_kaddr;
 	unsigned long group, maxgroup, ngroups;
 	unsigned long group_offset, maxgroup_offset;
-	unsigned long n, entries_per_group, groups_per_desc_block;
+	unsigned long n, entries_per_group;
 	unsigned long i, j;
 	spinlock_t *lock;
 	int pos, ret;
@@ -523,7 +523,6 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
 	maxgroup = ngroups - 1;
 	group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
 	entries_per_group = nilfs_palloc_entries_per_group(inode);
-	groups_per_desc_block = nilfs_palloc_groups_per_desc_block(inode);
 
 	for (i = 0; i < ngroups; i += n) {
 		if (group >= ngroups) {
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 919fd5b..f609a85 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -919,8 +919,6 @@ static void nilfs_btree_split(struct nilfs_bmap *btree,
 			      int level, __u64 *keyp, __u64 *ptrp)
 {
 	struct nilfs_btree_node *node, *right;
-	__u64 newkey;
-	__u64 newptr;
 	int nchildren, n, move, ncblk;
 
 	node = nilfs_btree_get_nonroot_node(path, level);
@@ -942,9 +940,6 @@ static void nilfs_btree_split(struct nilfs_bmap *btree,
 	if (!buffer_dirty(path[level].bp_sib_bh))
 		mark_buffer_dirty(path[level].bp_sib_bh);
 
-	newkey = nilfs_btree_node_get_key(right, 0);
-	newptr = path[level].bp_newreq.bpr_ptr;
-
 	if (move) {
 		path[level].bp_index -= nilfs_btree_node_get_nchildren(node);
 		nilfs_btree_node_insert(right, path[level].bp_index,
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 0d5fada..7dc23f1 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -155,7 +155,6 @@ void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req,
 int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req)
 {
 	struct nilfs_dat_entry *entry;
-	__u64 start;
 	sector_t blocknr;
 	void *kaddr;
 	int ret;
@@ -169,7 +168,6 @@ int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req)
 	kaddr = kmap_atomic(req->pr_entry_bh->b_page);
 	entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
 					     req->pr_entry_bh, kaddr);
-	start = le64_to_cpu(entry->de_start);
 	blocknr = le64_to_cpu(entry->de_blocknr);
 	kunmap_atomic(kaddr);
 
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 3fc4732..3b65ada 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -2492,7 +2492,6 @@ static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode)
 static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci)
 {
 	int mode = 0;
-	int err;
 
 	spin_lock(&sci->sc_state_lock);
 	mode = (sci->sc_flush_request & FLUSH_DAT_BIT) ?
@@ -2500,7 +2499,7 @@ static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci)
 	spin_unlock(&sci->sc_state_lock);
 
 	if (mode) {
-		err = nilfs_segctor_do_construct(sci, mode);
+		nilfs_segctor_do_construct(sci, mode);
 
 		spin_lock(&sci->sc_state_lock);
 		sci->sc_flush_request &= (mode == SC_FLUSH_FILE) ?
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 7ff8f15..52821ff 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -319,7 +319,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
 	size_t susz = NILFS_MDT(sufile)->mi_entry_size;
 	__u64 segnum, maxsegnum, last_alloc;
 	void *kaddr;
-	unsigned long nsegments, ncleansegs, nsus, cnt;
+	unsigned long nsegments, nsus, cnt;
 	int ret, j;
 
 	down_write(&NILFS_MDT(sufile)->mi_sem);
@@ -329,7 +329,6 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
 		goto out_sem;
 	kaddr = kmap_atomic(header_bh->b_page);
 	header = kaddr + bh_offset(header_bh);
-	ncleansegs = le64_to_cpu(header->sh_ncleansegs);
 	last_alloc = le64_to_cpu(header->sh_last_alloc);
 	kunmap_atomic(kaddr);
 
-- 
cgit v1.1


From 4f05028f8d1af782cfd03d09e0a052e9745dc5ad Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Fri, 6 Nov 2015 16:32:16 -0800
Subject: nilfs2: fix gcc uninitialized-variable warnings in powerpc build

Some false positive warnings are reported for powerpc build.

The following warnings are reported in
 http://kisskb.ellerman.id.au/kisskb/buildresult/12519703/

   CC      fs/nilfs2/super.o
 fs/nilfs2/super.c: In function 'nilfs_resize_fs':
 fs/nilfs2/super.c:376:2: warning: 'blocknr' may be used uninitialized in this function [-Wuninitialized]
 fs/nilfs2/super.c:362:11: note: 'blocknr' was declared here
   CC      fs/nilfs2/recovery.o
 fs/nilfs2/recovery.c: In function 'nilfs_salvage_orphan_logs':
 fs/nilfs2/recovery.c:631:21: warning: 'sum' may be used uninitialized in this function [-Wuninitialized]
 fs/nilfs2/recovery.c:585:32: note: 'sum' was declared here
 fs/nilfs2/recovery.c: In function 'nilfs_search_super_root':
 fs/nilfs2/recovery.c:873:11: warning: 'sum' may be used uninitialized in this function [-Wuninitialized]

Another similar warning is reported in
 http://kisskb.ellerman.id.au/kisskb/buildresult/12520079/

   CC      fs/nilfs2/btree.o
 fs/nilfs2/btree.c: In function 'nilfs_btree_convert_and_insert':
 include/asm-generic/bitops/non-atomic.h:105:20: warning: 'bh' may be used uninitialized in this function [-Wuninitialized]
 fs/nilfs2/btree.c:1859:22: note: 'bh' was declared here

This cleans out these warnings by forcing the variables to be initialized.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Reported-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nilfs2/btree.c    | 2 +-
 fs/nilfs2/recovery.c | 4 ++--
 fs/nilfs2/super.c    | 5 ++++-
 3 files changed, 7 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index f609a85..3a3821b 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -1851,7 +1851,7 @@ int nilfs_btree_convert_and_insert(struct nilfs_bmap *btree,
 				   __u64 key, __u64 ptr,
 				   const __u64 *keys, const __u64 *ptrs, int n)
 {
-	struct buffer_head *bh;
+	struct buffer_head *bh = NULL;
 	union nilfs_bmap_ptr_req dreq, nreq, *di, *ni;
 	struct nilfs_bmap_stats stats;
 	int ret;
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index ff00a0b..9b4f205 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -582,7 +582,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
 				 struct nilfs_recovery_info *ri)
 {
 	struct buffer_head *bh_sum = NULL;
-	struct nilfs_segment_summary *sum;
+	struct nilfs_segment_summary *sum = NULL;
 	sector_t pseg_start;
 	sector_t seg_start, seg_end;  /* Starting/ending DBN of full segment */
 	unsigned long nsalvaged_blocks = 0;
@@ -814,7 +814,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs,
 			    struct nilfs_recovery_info *ri)
 {
 	struct buffer_head *bh_sum = NULL;
-	struct nilfs_segment_summary *sum;
+	struct nilfs_segment_summary *sum = NULL;
 	sector_t pseg_start, pseg_end, sr_pseg_start = 0;
 	sector_t seg_start, seg_end; /* range of full segment (block number) */
 	sector_t b, end;
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index c69455a..354013e 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -361,7 +361,7 @@ static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off)
 	struct nilfs_super_block *nsbp;
 	sector_t blocknr, newblocknr;
 	unsigned long offset;
-	int sb2i = -1;  /* array index of the secondary superblock */
+	int sb2i;  /* array index of the secondary superblock */
 	int ret = 0;
 
 	/* nilfs->ns_sem must be locked by the caller. */
@@ -372,6 +372,9 @@ static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off)
 	} else if (nilfs->ns_sbh[0]->b_blocknr > nilfs->ns_first_data_block) {
 		sb2i = 0;
 		blocknr = nilfs->ns_sbh[0]->b_blocknr;
+	} else {
+		sb2i = -1;
+		blocknr = 0;
 	}
 	if (sb2i >= 0 && (u64)blocknr << nilfs->ns_blocksize_bits == sb2off)
 		goto out;  /* super block location is unchanged */
-- 
cgit v1.1


From be0e6f290f78b84a3b21b8c8c46819c4514fe632 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 6 Nov 2015 16:32:22 -0800
Subject: signal: turn dequeue_signal_lock() into kernel_dequeue_signal()

1. Rename dequeue_signal_lock() to kernel_dequeue_signal(). This
   matches another "for kthreads only" kernel_sigaction() helper.

2. Remove the "tsk" and "mask" arguments, they are always current
   and current->blocked. And it is simply wrong if tsk != current.

3. We could also remove the 3rd "siginfo_t *info" arg but it looks
   potentially useful. However we can simplify the callers if we
   change kernel_dequeue_signal() to accept info => NULL.

4. Remove _irqsave, it is never called from atomic context.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Tejun Heo <tj@kernel.org>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Felipe Balbi <balbi@ti.com>
Cc: Markus Pargmann <mpa@pengutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/jffs2/background.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index bb9cebc..f3145fd 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -121,13 +121,12 @@ static int jffs2_garbage_collect_thread(void *_c)
 		/* Put_super will send a SIGKILL and then wait on the sem.
 		 */
 		while (signal_pending(current) || freezing(current)) {
-			siginfo_t info;
 			unsigned long signr;
 
 			if (try_to_freeze())
 				goto again;
 
-			signr = dequeue_signal_lock(current, &current->blocked, &info);
+			signr = kernel_dequeue_signal(NULL);
 
 			switch(signr) {
 			case SIGSTOP:
-- 
cgit v1.1


From 9a13049e83f346cb1cbd60c64e520a73c396af16 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 6 Nov 2015 16:32:25 -0800
Subject: signal: introduce kernel_signal_stop() to fix
 jffs2_garbage_collect_thread()

jffs2_garbage_collect_thread() can race with SIGCONT and sleep in
TASK_STOPPED state after it was already sent. Add the new helper,
kernel_signal_stop(), which does this correctly.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Tejun Heo <tj@kernel.org>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Felipe Balbi <balbi@ti.com>
Cc: Markus Pargmann <mpa@pengutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/jffs2/background.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index f3145fd..53cc735 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -132,8 +132,7 @@ static int jffs2_garbage_collect_thread(void *_c)
 			case SIGSTOP:
 				jffs2_dbg(1, "%s(): SIGSTOP received\n",
 					  __func__);
-				set_current_state(TASK_STOPPED);
-				schedule();
+				kernel_signal_stop();
 				break;
 
 			case SIGKILL:
-- 
cgit v1.1


From 9317bb9696566e6759203ffcaa80481b725785b3 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 6 Nov 2015 16:32:28 -0800
Subject: signal: remove jffs2_garbage_collect_thread()->allow_signal(SIGCONT)

jffs2_garbage_collect_thread() does allow_signal(SIGCONT) for no reason,
SIGCONT will wake a stopped task up even if it is ignored.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Tejun Heo <tj@kernel.org>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Felipe Balbi <balbi@ti.com>
Cc: Markus Pargmann <mpa@pengutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/jffs2/background.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index 53cc735..e5c1783 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -80,7 +80,6 @@ static int jffs2_garbage_collect_thread(void *_c)
 	siginitset(&hupmask, sigmask(SIGHUP));
 	allow_signal(SIGKILL);
 	allow_signal(SIGSTOP);
-	allow_signal(SIGCONT);
 	allow_signal(SIGHUP);
 
 	c->gc_task = current;
-- 
cgit v1.1


From 5fa534c987784c4811757a34c425aff3ce3b5037 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 6 Nov 2015 16:32:31 -0800
Subject: coredump: ensure all coredumping tasks have SIGNAL_GROUP_COREDUMP

task_will_free_mem() is wrong in many ways, and in particular the
SIGNAL_GROUP_COREDUMP check is not reliable: a task can participate in the
coredumping without SIGNAL_GROUP_COREDUMP bit set.

change zap_threads() paths to always set SIGNAL_GROUP_COREDUMP even if
other CLONE_VM processes can't react to SIGKILL.  Fortunately, at least
oom-kill case if fine; it kills all tasks sharing the same mm, so it
should also kill the process which actually dumps the core.

The change in prepare_signal() is not strictly necessary, it just ensures
that the patch does not bring another subtle behavioural change.  But it
reminds us that this SIGNAL_GROUP_EXIT/COREDUMP case needs more changes.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Kyle Walker <kwalker@redhat.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Stanislav Kozina <skozina@redhat.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/coredump.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/coredump.c b/fs/coredump.c
index a8f7564..c66bb05 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -280,11 +280,13 @@ out:
 	return ispipe;
 }
 
-static int zap_process(struct task_struct *start, int exit_code)
+static int zap_process(struct task_struct *start, int exit_code, int flags)
 {
 	struct task_struct *t;
 	int nr = 0;
 
+	/* ignore all signals except SIGKILL, see prepare_signal() */
+	start->signal->flags = SIGNAL_GROUP_COREDUMP | flags;
 	start->signal->group_exit_code = exit_code;
 	start->signal->group_stop_count = 0;
 
@@ -311,10 +313,8 @@ static int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
 	spin_lock_irq(&tsk->sighand->siglock);
 	if (!signal_group_exit(tsk->signal)) {
 		mm->core_state = core_state;
-		nr = zap_process(tsk, exit_code);
 		tsk->signal->group_exit_task = tsk;
-		/* ignore all signals except SIGKILL, see prepare_signal() */
-		tsk->signal->flags = SIGNAL_GROUP_COREDUMP;
+		nr = zap_process(tsk, exit_code, 0);
 		clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
 	}
 	spin_unlock_irq(&tsk->sighand->siglock);
@@ -365,8 +365,8 @@ static int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
 			if (p->mm) {
 				if (unlikely(p->mm == mm)) {
 					lock_task_sighand(p, &flags);
-					nr += zap_process(p, exit_code);
-					p->signal->flags = SIGNAL_GROUP_EXIT;
+					nr += zap_process(p, exit_code,
+							  SIGNAL_GROUP_EXIT);
 					unlock_task_sighand(p, &flags);
 				}
 				break;
-- 
cgit v1.1


From d61ba58953fcf708c850bfd90b2858f2ddfbc34c Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 6 Nov 2015 16:32:34 -0800
Subject: coredump: change zap_threads() and zap_process() to use
 for_each_thread()

Change zap_threads() paths to use for_each_thread() rather than
while_each_thread().

While at it, change zap_threads() to avoid the nested if's to make the
code more readable and lessen the indentation.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Kyle Walker <kwalker@redhat.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Stanislav Kozina <skozina@redhat.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/coredump.c | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/coredump.c b/fs/coredump.c
index c66bb05..1777331 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -290,15 +290,14 @@ static int zap_process(struct task_struct *start, int exit_code, int flags)
 	start->signal->group_exit_code = exit_code;
 	start->signal->group_stop_count = 0;
 
-	t = start;
-	do {
+	for_each_thread(start, t) {
 		task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
 		if (t != current && t->mm) {
 			sigaddset(&t->pending.signal, SIGKILL);
 			signal_wake_up(t, 1);
 			nr++;
 		}
-	} while_each_thread(start, t);
+	}
 
 	return nr;
 }
@@ -360,18 +359,18 @@ static int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
 			continue;
 		if (g->flags & PF_KTHREAD)
 			continue;
-		p = g;
-		do {
-			if (p->mm) {
-				if (unlikely(p->mm == mm)) {
-					lock_task_sighand(p, &flags);
-					nr += zap_process(p, exit_code,
-							  SIGNAL_GROUP_EXIT);
-					unlock_task_sighand(p, &flags);
-				}
-				break;
+
+		for_each_thread(g, p) {
+			if (unlikely(!p->mm))
+				continue;
+			if (unlikely(p->mm == mm)) {
+				lock_task_sighand(p, &flags);
+				nr += zap_process(p, exit_code,
+							SIGNAL_GROUP_EXIT);
+				unlock_task_sighand(p, &flags);
 			}
-		} while_each_thread(g, p);
+			break;
+		}
 	}
 	rcu_read_unlock();
 done:
-- 
cgit v1.1


From 8b91a318e422a0201a7bd7ac2626eecec6b27ae1 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 6 Nov 2015 16:32:37 -0800
Subject: fs/seq_file: use seq_* helpers in seq_hex_dump()

This improves code readability.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/seq_file.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/seq_file.c b/fs/seq_file.c
index 225586e..ad034fb 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -773,6 +773,8 @@ void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type,
 {
 	const u8 *ptr = buf;
 	int i, linelen, remaining = len;
+	char *buffer;
+	size_t size;
 	int ret;
 
 	if (rowsize != 16 && rowsize != 32)
@@ -794,15 +796,12 @@ void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type,
 			break;
 		}
 
+		size = seq_get_buf(m, &buffer);
 		ret = hex_dump_to_buffer(ptr + i, linelen, rowsize, groupsize,
-					 m->buf + m->count, m->size - m->count,
-					 ascii);
-		if (ret >= m->size - m->count) {
-			seq_set_overflow(m);
-		} else {
-			m->count += ret;
-			seq_putc(m, '\n');
-		}
+					 buffer, size, ascii);
+		seq_commit(m, ret < size ? ret : -1);
+
+		seq_putc(m, '\n');
 	}
 }
 EXPORT_SYMBOL(seq_hex_dump);
-- 
cgit v1.1


From 25c6bb76eafe37c8963ae58a6a1bcf4069caeedb Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 6 Nov 2015 16:32:40 -0800
Subject: seq_file: reuse string_escape_str()

strint_escape_str() escapes input string by given criteria.  In case of
seq_escape() the criteria is to convert some characters to their octal
representation.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/seq_file.c | 25 ++++++-------------------
 1 file changed, 6 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/seq_file.c b/fs/seq_file.c
index ad034fb..00bbe2b 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -13,6 +13,7 @@
 #include <linux/cred.h>
 #include <linux/mm.h>
 #include <linux/printk.h>
+#include <linux/string_helpers.h>
 
 #include <asm/uaccess.h>
 #include <asm/page.h>
@@ -377,26 +378,12 @@ EXPORT_SYMBOL(seq_release);
  */
 void seq_escape(struct seq_file *m, const char *s, const char *esc)
 {
-	char *end = m->buf + m->size;
-	char *p;
-	char c;
+	char *buf;
+	size_t size = seq_get_buf(m, &buf);
+	int ret;
 
-	for (p = m->buf + m->count; (c = *s) != '\0' && p < end; s++) {
-		if (!strchr(esc, c)) {
-			*p++ = c;
-			continue;
-		}
-		if (p + 3 < end) {
-			*p++ = '\\';
-			*p++ = '0' + ((c & 0300) >> 6);
-			*p++ = '0' + ((c & 070) >> 3);
-			*p++ = '0' + (c & 07);
-			continue;
-		}
-		seq_set_overflow(m);
-		return;
-	}
-	m->count = p - m->buf;
+	ret = string_escape_str(s, buf, size, ESCAPE_OCTAL, esc);
+	seq_commit(m, ret < size ? ret : -1);
 }
 EXPORT_SYMBOL(seq_escape);
 
-- 
cgit v1.1


From 0f930902eb8806cff8dcaef9ff9faf3cfa5fd748 Mon Sep 17 00:00:00 2001
From: Greg Thelen <gthelen@google.com>
Date: Fri, 6 Nov 2015 16:32:42 -0800
Subject: fs, seqfile: always allow oom killer

Since 5cec38ac866b ("fs, seq_file: fallback to vmalloc instead of oom kill
processes") seq_buf_alloc() avoids calling the oom killer for PAGE_SIZE or
smaller allocations; but larger allocations can use the oom killer via
vmalloc().  Thus reads of small files can return ENOMEM, but larger files
use the oom killer to avoid ENOMEM.

The effect of this bug is that reads from /proc and other virtual
filesystems can return ENOMEM instead of the preferred behavior - oom
killing something (possibly the calling process).  I don't know of anyone
except Google who has noticed the issue.

I suspect the fix is more needed in smaller systems where there isn't any
reclaimable memory.  But these seem like the kinds of systems which
probably don't use the oom killer for production situations.

Memory overcommit requires use of the oom killer to select a victim
regardless of file size.

Enable oom killer for small seq_buf_alloc() allocations.

Fixes: 5cec38ac866b ("fs, seq_file: fallback to vmalloc instead of oom kill processes")
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Greg Thelen <gthelen@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/seq_file.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/seq_file.c b/fs/seq_file.c
index 00bbe2b..e85664b 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -26,12 +26,17 @@ static void seq_set_overflow(struct seq_file *m)
 static void *seq_buf_alloc(unsigned long size)
 {
 	void *buf;
+	gfp_t gfp = GFP_KERNEL;
 
 	/*
-	 * __GFP_NORETRY to avoid oom-killings with high-order allocations -
-	 * it's better to fall back to vmalloc() than to kill things.
+	 * For high order allocations, use __GFP_NORETRY to avoid oom-killing -
+	 * it's better to fall back to vmalloc() than to kill things.  For small
+	 * allocations, just use GFP_KERNEL which will oom kill, thus no need
+	 * for vmalloc fallback.
 	 */
-	buf = kmalloc(size, GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
+	if (size > PAGE_SIZE)
+		gfp |= __GFP_NORETRY | __GFP_NOWARN;
+	buf = kmalloc(size, gfp);
 	if (!buf && size > PAGE_SIZE)
 		buf = vmalloc(size);
 	return buf;
-- 
cgit v1.1


From ab92a20bce3b4c21927cc1ae514c482f50ad3487 Mon Sep 17 00:00:00 2001
From: Dongsheng Yang <yangds.fnst@cn.fujitsu.com>
Date: Tue, 18 Aug 2015 12:38:36 +0800
Subject: ubifs: make ubifs_[get|set]xattr atomic

This commit make the ubifs_[get|set]xattr protected by ui_mutex.

Originally, there is a possibility that ubifs_getxattr to get
a wrong value.

  P1                                  P2
----------                  	----------
ubifs_getxattr                      ubifs_setxattr
					- kfree()
- memcpy()
					- kmemdup()

Then ubifs_getxattr() would get a non-sense data. To solve this
problem, this commit make the xattr of ubifs_inode updated in
atomic.

Signed-off-by: Dongsheng Yang <yangds.fnst@cn.fujitsu.com>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 fs/ubifs/xattr.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 96f3448..99364ae 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -200,6 +200,7 @@ static int change_xattr(struct ubifs_info *c, struct inode *host,
 	int err;
 	struct ubifs_inode *host_ui = ubifs_inode(host);
 	struct ubifs_inode *ui = ubifs_inode(inode);
+	void *buf = NULL;
 	struct ubifs_budget_req req = { .dirtied_ino = 2,
 		.dirtied_ino_d = ALIGN(size, 8) + ALIGN(host_ui->data_len, 8) };
 
@@ -208,14 +209,17 @@ static int change_xattr(struct ubifs_info *c, struct inode *host,
 	if (err)
 		return err;
 
-	kfree(ui->data);
-	ui->data = kmemdup(value, size, GFP_NOFS);
-	if (!ui->data) {
+	buf = kmemdup(value, size, GFP_NOFS);
+	if (!buf) {
 		err = -ENOMEM;
 		goto out_free;
 	}
+	mutex_lock(&ui->ui_mutex);
+	kfree(ui->data);
+	ui->data = buf;
 	inode->i_size = ui->ui_size = size;
 	ui->data_len = size;
+	mutex_unlock(&ui->ui_mutex);
 
 	mutex_lock(&host_ui->ui_mutex);
 	host->i_ctime = ubifs_current_time(host);
@@ -409,6 +413,7 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
 	ubifs_assert(inode->i_size == ui->data_len);
 	ubifs_assert(ubifs_inode(host)->xattr_size > ui->data_len);
 
+	mutex_lock(&ui->ui_mutex);
 	if (buf) {
 		/* If @buf is %NULL we are supposed to return the length */
 		if (ui->data_len > size) {
@@ -423,6 +428,7 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
 	err = ui->data_len;
 
 out_iput:
+	mutex_unlock(&ui->ui_mutex);
 	iput(inode);
 out_unlock:
 	kfree(xent);
-- 
cgit v1.1


From 8c1c5f263833ec2dc8fd716cf4281265c485d7ad Mon Sep 17 00:00:00 2001
From: Dongsheng Yang <yangds.fnst@cn.fujitsu.com>
Date: Sat, 7 Nov 2015 12:46:11 +0800
Subject: ubifs: introduce UBIFS_ATIME_SUPPORT to ubifs

To make ubifs support atime flexily, this commit introduces
a Kconfig option named as UBIFS_ATIME_SUPPORT.

With UBIFS_ATIME_SUPPORT=n:
	ubifs keeps the full compatibility to no_atime from
the start of ubifs.

=================UBIFS_ATIME_SUPPORT=n=======================
-o - no atime
-o atime - no atime
-o noatime - no atime
-o relatime - no atime
-o strictatime - no atime
-o lazyatime - no atime

With UBIFS_ATIME_SUPPORT=y:
	ubifs supports the atime same with other main stream
file systems.
=================UBIFS_ATIME_SUPPORT=y=======================
-o - default behavior (relatime currently)
-o atime - atime support
-o noatime - no atime support
-o relatime - relative atime support
-o strictatime - strict atime support
-o lazyatime - lazy atime support

Signed-off-by: Dongsheng Yang <yangds.fnst@cn.fujitsu.com>
Reviewed-by: Richard Weinberger <richard@nod.at>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 fs/ubifs/Kconfig | 15 +++++++++++++++
 fs/ubifs/dir.c   |  3 +++
 fs/ubifs/file.c  | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ubifs/super.c | 12 ++++++++++--
 fs/ubifs/ubifs.h |  3 +++
 5 files changed, 81 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
index ba66d508..7ff7712 100644
--- a/fs/ubifs/Kconfig
+++ b/fs/ubifs/Kconfig
@@ -35,3 +35,18 @@ config UBIFS_FS_ZLIB
 	default y
 	help
 	  Zlib compresses better than LZO but it is slower. Say 'Y' if unsure.
+
+config UBIFS_ATIME_SUPPORT
+	bool "Access time support" if UBIFS_FS
+	depends on UBIFS_FS
+	default n
+	help
+	  Originally UBIFS did not support atime, because it looked like a bad idea due
+	  increased flash wear. This option adds atime support and it is disabled by default
+	  to preserve the old behavior. If you enable this option, UBIFS starts updating atime,
+	  which means that file-system read operations will cause writes (inode atime
+	  updates). This may affect file-system performance and increase flash device wear,
+	  so be careful. How often atime is updated depends on the selected strategy:
+	  strictatime is the "heavy", relatime is "lighter", etc.
+
+	  If unsure, say 'N'
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index a2f9d97..e49bd28 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -1186,6 +1186,9 @@ const struct inode_operations ubifs_dir_inode_operations = {
 	.getxattr    = ubifs_getxattr,
 	.listxattr   = ubifs_listxattr,
 	.removexattr = ubifs_removexattr,
+#ifdef CONFIG_UBIFS_ATIME_SUPPORT
+	.update_time = ubifs_update_time,
+#endif
 };
 
 const struct file_operations ubifs_dir_operations = {
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index a3dfe2a..0edc128 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1354,6 +1354,47 @@ static inline int mctime_update_needed(const struct inode *inode,
 	return 0;
 }
 
+#ifdef CONFIG_UBIFS_ATIME_SUPPORT
+/**
+ * ubifs_update_time - update time of inode.
+ * @inode: inode to update
+ *
+ * This function updates time of the inode.
+ */
+int ubifs_update_time(struct inode *inode, struct timespec *time,
+			     int flags)
+{
+	struct ubifs_inode *ui = ubifs_inode(inode);
+	struct ubifs_info *c = inode->i_sb->s_fs_info;
+	struct ubifs_budget_req req = { .dirtied_ino = 1,
+			.dirtied_ino_d = ALIGN(ui->data_len, 8) };
+	int iflags = I_DIRTY_TIME;
+	int err, release;
+
+	err = ubifs_budget_space(c, &req);
+	if (err)
+		return err;
+
+	mutex_lock(&ui->ui_mutex);
+	if (flags & S_ATIME)
+		inode->i_atime = *time;
+	if (flags & S_CTIME)
+		inode->i_ctime = *time;
+	if (flags & S_MTIME)
+		inode->i_mtime = *time;
+
+	if (!(inode->i_sb->s_flags & MS_LAZYTIME))
+		iflags |= I_DIRTY_SYNC;
+
+	release = ui->dirty;
+	__mark_inode_dirty(inode, iflags);
+	mutex_unlock(&ui->ui_mutex);
+	if (release)
+		ubifs_release_budget(c, &req);
+	return 0;
+}
+#endif
+
 /**
  * update_ctime - update mtime and ctime of an inode.
  * @inode: inode to update
@@ -1537,6 +1578,9 @@ static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	if (err)
 		return err;
 	vma->vm_ops = &ubifs_file_vm_ops;
+#ifdef CONFIG_UBIFS_ATIME_SUPPORT
+	file_accessed(file);
+#endif
 	return 0;
 }
 
@@ -1557,6 +1601,9 @@ const struct inode_operations ubifs_file_inode_operations = {
 	.getxattr    = ubifs_getxattr,
 	.listxattr   = ubifs_listxattr,
 	.removexattr = ubifs_removexattr,
+#ifdef CONFIG_UBIFS_ATIME_SUPPORT
+	.update_time = ubifs_update_time,
+#endif
 };
 
 const struct inode_operations ubifs_symlink_inode_operations = {
@@ -1568,6 +1615,9 @@ const struct inode_operations ubifs_symlink_inode_operations = {
 	.getxattr    = ubifs_getxattr,
 	.listxattr   = ubifs_listxattr,
 	.removexattr = ubifs_removexattr,
+#ifdef CONFIG_UBIFS_ATIME_SUPPORT
+	.update_time = ubifs_update_time,
+#endif
 };
 
 const struct file_operations ubifs_file_operations = {
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 9547a278..8ee3133 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -128,7 +128,10 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum)
 	if (err)
 		goto out_ino;
 
-	inode->i_flags |= (S_NOCMTIME | S_NOATIME);
+	inode->i_flags |= S_NOCMTIME;
+#ifndef CONFIG_UBIFS_ATIME_SUPPORT
+	inode->i_flags |= S_NOATIME;
+#endif
 	set_nlink(inode, le32_to_cpu(ino->nlink));
 	i_uid_write(inode, le32_to_cpu(ino->uid));
 	i_gid_write(inode, le32_to_cpu(ino->gid));
@@ -2139,7 +2142,12 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags,
 		if (err)
 			goto out_deact;
 		/* We do not support atime */
-		sb->s_flags |= MS_ACTIVE | MS_NOATIME;
+		sb->s_flags |= MS_ACTIVE;
+#ifndef CONFIG_UBIFS_ATIME_SUPPORT
+		sb->s_flags |= MS_NOATIME;
+#else
+		ubifs_msg(c, "full atime support is enabled.");
+#endif
 	}
 
 	/* 'fill_super()' opens ubi again so we must close it here */
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 6ef2415..01142e1 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1746,6 +1746,9 @@ int ubifs_calc_dark(const struct ubifs_info *c, int spc);
 /* file.c */
 int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync);
 int ubifs_setattr(struct dentry *dentry, struct iattr *attr);
+#ifdef CONFIG_UBIFS_ATIME_SUPPORT
+int ubifs_update_time(struct inode *inode, struct timespec *time, int flags);
+#endif
 
 /* dir.c */
 struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
-- 
cgit v1.1


From 15c4f638f3d41bae52105ca4c0c8760afbcbeaab Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Tue, 27 Oct 2015 14:09:51 +0900
Subject: directio: add block polling support

This adds support for sync O_DIRECT read/write poll support.

Signed-off-by: Jens Axboe <axboe@fb.com>
[hch: split from a larger patch, minor updates]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Keith Busch <keith.busch@intel.com>
---
 fs/direct-io.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 3ae0e04..7025029 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -109,6 +109,8 @@ struct dio_submit {
 struct dio {
 	int flags;			/* doesn't change */
 	int rw;
+	blk_qc_t bio_cookie;
+	struct block_device *bio_bdev;
 	struct inode *inode;
 	loff_t i_size;			/* i_size when submitted */
 	dio_iodone_t *end_io;		/* IO completion function */
@@ -397,11 +399,14 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
 	if (dio->is_async && dio->rw == READ && dio->should_dirty)
 		bio_set_pages_dirty(bio);
 
-	if (sdio->submit_io)
+	if (sdio->submit_io) {
 		sdio->submit_io(dio->rw, bio, dio->inode,
 			       sdio->logical_offset_in_bio);
-	else
-		submit_bio(dio->rw, bio);
+		dio->bio_cookie = BLK_QC_T_NONE;
+	} else {
+		dio->bio_cookie = submit_bio(dio->rw, bio);
+		dio->bio_bdev = bio->bi_bdev;
+	}
 
 	sdio->bio = NULL;
 	sdio->boundary = 0;
@@ -440,7 +445,8 @@ static struct bio *dio_await_one(struct dio *dio)
 		__set_current_state(TASK_UNINTERRUPTIBLE);
 		dio->waiter = current;
 		spin_unlock_irqrestore(&dio->bio_lock, flags);
-		io_schedule();
+		if (!blk_poll(bdev_get_queue(dio->bio_bdev), dio->bio_cookie))
+			io_schedule();
 		/* wake up sets us TASK_RUNNING */
 		spin_lock_irqsave(&dio->bio_lock, flags);
 		dio->waiter = NULL;
-- 
cgit v1.1


From aeafbf8486c9e2bd53f5cc3c10c0b7fd7149d69c Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Fri, 6 Nov 2015 13:33:33 +0000
Subject: Btrfs: fix race leading to incorrect item deletion when dropping
 extents

While running a stress test I got the following warning triggered:

  [191627.672810] ------------[ cut here ]------------
  [191627.673949] WARNING: CPU: 8 PID: 8447 at fs/btrfs/file.c:779 __btrfs_drop_extents+0x391/0xa50 [btrfs]()
  (...)
  [191627.701485] Call Trace:
  [191627.702037]  [<ffffffff8145f077>] dump_stack+0x4f/0x7b
  [191627.702992]  [<ffffffff81095de5>] ? console_unlock+0x356/0x3a2
  [191627.704091]  [<ffffffff8104b3b0>] warn_slowpath_common+0xa1/0xbb
  [191627.705380]  [<ffffffffa0664499>] ? __btrfs_drop_extents+0x391/0xa50 [btrfs]
  [191627.706637]  [<ffffffff8104b46d>] warn_slowpath_null+0x1a/0x1c
  [191627.707789]  [<ffffffffa0664499>] __btrfs_drop_extents+0x391/0xa50 [btrfs]
  [191627.709155]  [<ffffffff8115663c>] ? cache_alloc_debugcheck_after.isra.32+0x171/0x1d0
  [191627.712444]  [<ffffffff81155007>] ? kmemleak_alloc_recursive.constprop.40+0x16/0x18
  [191627.714162]  [<ffffffffa06570c9>] insert_reserved_file_extent.constprop.40+0x83/0x24e [btrfs]
  [191627.715887]  [<ffffffffa065422b>] ? start_transaction+0x3bb/0x610 [btrfs]
  [191627.717287]  [<ffffffffa065b604>] btrfs_finish_ordered_io+0x273/0x4e2 [btrfs]
  [191627.728865]  [<ffffffffa065b888>] finish_ordered_fn+0x15/0x17 [btrfs]
  [191627.730045]  [<ffffffffa067d688>] normal_work_helper+0x14c/0x32c [btrfs]
  [191627.731256]  [<ffffffffa067d96a>] btrfs_endio_write_helper+0x12/0x14 [btrfs]
  [191627.732661]  [<ffffffff81061119>] process_one_work+0x24c/0x4ae
  [191627.733822]  [<ffffffff810615b0>] worker_thread+0x206/0x2c2
  [191627.734857]  [<ffffffff810613aa>] ? process_scheduled_works+0x2f/0x2f
  [191627.736052]  [<ffffffff810613aa>] ? process_scheduled_works+0x2f/0x2f
  [191627.737349]  [<ffffffff810669a6>] kthread+0xef/0xf7
  [191627.738267]  [<ffffffff810f3b3a>] ? time_hardirqs_on+0x15/0x28
  [191627.739330]  [<ffffffff810668b7>] ? __kthread_parkme+0xad/0xad
  [191627.741976]  [<ffffffff81465592>] ret_from_fork+0x42/0x70
  [191627.743080]  [<ffffffff810668b7>] ? __kthread_parkme+0xad/0xad
  [191627.744206] ---[ end trace bbfddacb7aaada8d ]---

  $ cat -n fs/btrfs/file.c
  691  int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
  (...)
  758                  btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
  759                  if (key.objectid > ino ||
  760                      key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
  761                          break;
  762
  763                  fi = btrfs_item_ptr(leaf, path->slots[0],
  764                                      struct btrfs_file_extent_item);
  765                  extent_type = btrfs_file_extent_type(leaf, fi);
  766
  767                  if (extent_type == BTRFS_FILE_EXTENT_REG ||
  768                      extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
  (...)
  774                  } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
  (...)
  778                  } else {
  779                          WARN_ON(1);
  780                          extent_end = search_start;
  781                  }
  (...)

This happened because the item we were processing did not match a file
extent item (its key type != BTRFS_EXTENT_DATA_KEY), and even on this
case we cast the item to a struct btrfs_file_extent_item pointer and
then find a type field value that does not match any of the expected
values (BTRFS_FILE_EXTENT_[REG|PREALLOC|INLINE]). This scenario happens
due to a tiny time window where a race can happen as exemplified below.
For example, consider the following scenario where we're using the
NO_HOLES feature and we have the following two neighbour leafs:

               Leaf X (has N items)                    Leaf Y

[ ... (257 INODE_ITEM 0) (257 INODE_REF 256) ]  [ (257 EXTENT_DATA 8192), ... ]
          slot N - 2         slot N - 1              slot 0

Our inode 257 has an implicit hole in the range [0, 8K[ (implicit rather
than explicit because NO_HOLES is enabled). Now if our inode has an
ordered extent for the range [4K, 8K[ that is finishing, the following
can happen:

          CPU 1                                       CPU 2

  btrfs_finish_ordered_io()
    insert_reserved_file_extent()
      __btrfs_drop_extents()
         Searches for the key
          (257 EXTENT_DATA 4096) through
          btrfs_lookup_file_extent()

         Key not found and we get a path where
         path->nodes[0] == leaf X and
         path->slots[0] == N

         Because path->slots[0] is >=
         btrfs_header_nritems(leaf X), we call
         btrfs_next_leaf()

         btrfs_next_leaf() releases the path

                                                  inserts key
                                                  (257 INODE_REF 4096)
                                                  at the end of leaf X,
                                                  leaf X now has N + 1 keys,
                                                  and the new key is at
                                                  slot N

         btrfs_next_leaf() searches for
         key (257 INODE_REF 256), with
         path->keep_locks set to 1,
         because it was the last key it
         saw in leaf X

           finds it in leaf X again and
           notices it's no longer the last
           key of the leaf, so it returns 0
           with path->nodes[0] == leaf X and
           path->slots[0] == N (which is now
           < btrfs_header_nritems(leaf X)),
           pointing to the new key
           (257 INODE_REF 4096)

         __btrfs_drop_extents() casts the
         item at path->nodes[0], slot
         path->slots[0], to a struct
         btrfs_file_extent_item - it does
         not skip keys for the target
         inode with a type less than
         BTRFS_EXTENT_DATA_KEY
         (BTRFS_INODE_REF_KEY < BTRFS_EXTENT_DATA_KEY)

         sees a bogus value for the type
         field triggering the WARN_ON in
         the trace shown above, and sets
         extent_end = search_start (4096)

         does the if-then-else logic to
         fixup 0 length extent items created
         by a past bug from hole punching:

           if (extent_end == key.offset &&
               extent_end >= search_start)
               goto delete_extent_item;

         that evaluates to true and it ends
         up deleting the key pointed to by
         path->slots[0], (257 INODE_REF 4096),
         from leaf X

The same could happen for example for a xattr that ends up having a key
with an offset value that matches search_start (very unlikely but not
impossible).

So fix this by ensuring that keys smaller than BTRFS_EXTENT_DATA_KEY are
skipped, never casted to struct btrfs_file_extent_item and never deleted
by accident. Also protect against the unexpected case of getting a key
for a lower inode number by skipping that key and issuing a warning.

Cc: stable@vger.kernel.org
Signed-off-by: Filipe Manana <fdmanana@suse.com>
---
 fs/btrfs/file.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index cfa243c..3009d45 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -756,8 +756,16 @@ next_slot:
 		}
 
 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-		if (key.objectid > ino ||
-		    key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
+
+		if (key.objectid > ino)
+			break;
+		if (WARN_ON_ONCE(key.objectid < ino) ||
+		    key.type < BTRFS_EXTENT_DATA_KEY) {
+			ASSERT(del_nr == 0);
+			path->slots[0]++;
+			goto next_slot;
+		}
+		if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
 			break;
 
 		fi = btrfs_item_ptr(leaf, path->slots[0],
@@ -776,8 +784,8 @@ next_slot:
 				btrfs_file_extent_inline_len(leaf,
 						     path->slots[0], fi);
 		} else {
-			WARN_ON(1);
-			extent_end = search_start;
+			/* can't happen */
+			BUG();
 		}
 
 		/*
-- 
cgit v1.1


From 1d512cb77bdbda80f0dd0620a3b260d697fd581d Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 9 Nov 2015 00:33:58 +0000
Subject: Btrfs: fix race leading to BUG_ON when running delalloc for nodatacow

If we are using the NO_HOLES feature, we have a tiny time window when
running delalloc for a nodatacow inode where we can race with a concurrent
link or xattr add operation leading to a BUG_ON.

This happens because at run_delalloc_nocow() we end up casting a leaf item
of type BTRFS_INODE_[REF|EXTREF]_KEY or of type BTRFS_XATTR_ITEM_KEY to a
file extent item (struct btrfs_file_extent_item) and then analyse its
extent type field, which won't match any of the expected extent types
(values BTRFS_FILE_EXTENT_[REG|PREALLOC|INLINE]) and therefore trigger an
explicit BUG_ON(1).

The following sequence diagram shows how the race happens when running a
no-cow dellaloc range [4K, 8K[ for inode 257 and we have the following
neighbour leafs:

             Leaf X (has N items)                    Leaf Y

 [ ... (257 INODE_ITEM 0) (257 INODE_REF 256) ]  [ (257 EXTENT_DATA 8192), ... ]
              slot N - 2         slot N - 1              slot 0

 (Note the implicit hole for inode 257 regarding the [0, 8K[ range)

       CPU 1                                         CPU 2

 run_dealloc_nocow()
   btrfs_lookup_file_extent()
     --> searches for a key with value
         (257 EXTENT_DATA 4096) in the
         fs/subvol tree
     --> returns us a path with
         path->nodes[0] == leaf X and
         path->slots[0] == N

   because path->slots[0] is >=
   btrfs_header_nritems(leaf X), it
   calls btrfs_next_leaf()

   btrfs_next_leaf()
     --> releases the path

                                              hard link added to our inode,
                                              with key (257 INODE_REF 500)
                                              added to the end of leaf X,
                                              so leaf X now has N + 1 keys

     --> searches for the key
         (257 INODE_REF 256), because
         it was the last key in leaf X
         before it released the path,
         with path->keep_locks set to 1

     --> ends up at leaf X again and
         it verifies that the key
         (257 INODE_REF 256) is no longer
         the last key in the leaf, so it
         returns with path->nodes[0] ==
         leaf X and path->slots[0] == N,
         pointing to the new item with
         key (257 INODE_REF 500)

   the loop iteration of run_dealloc_nocow()
   does not break out the loop and continues
   because the key referenced in the path
   at path->nodes[0] and path->slots[0] is
   for inode 257, its type is < BTRFS_EXTENT_DATA_KEY
   and its offset (500) is less then our delalloc
   range's end (8192)

   the item pointed by the path, an inode reference item,
   is (incorrectly) interpreted as a file extent item and
   we get an invalid extent type, leading to the BUG_ON(1):

   if (extent_type == BTRFS_FILE_EXTENT_REG ||
      extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
       (...)
   } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
       (...)
   } else {
       BUG_ON(1)
   }

The same can happen if a xattr is added concurrently and ends up having
a key with an offset smaller then the delalloc's range end.

So fix this by skipping keys with a type smaller than
BTRFS_EXTENT_DATA_KEY.

Cc: stable@vger.kernel.org
Signed-off-by: Filipe Manana <fdmanana@suse.com>
---
 fs/btrfs/inode.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6138eea..6e93349 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1304,8 +1304,14 @@ next_slot:
 		num_bytes = 0;
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 
-		if (found_key.objectid > ino ||
-		    found_key.type > BTRFS_EXTENT_DATA_KEY ||
+		if (found_key.objectid > ino)
+			break;
+		if (WARN_ON_ONCE(found_key.objectid < ino) ||
+		    found_key.type < BTRFS_EXTENT_DATA_KEY) {
+			path->slots[0]++;
+			goto next_slot;
+		}
+		if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
 		    found_key.offset > end)
 			break;
 
-- 
cgit v1.1


From 7b52e2793a58af61b5d349c2c080437a437a4edb Mon Sep 17 00:00:00 2001
From: Steve French <smfrench@gmail.com>
Date: Mon, 9 Nov 2015 08:59:45 -0600
Subject: Allow copy offload (CopyChunk) across shares

FSCTL_SRV_COPYCHUNK_WRITE only requires that the source and target
be on the same server (not the same volume or same share),
so relax the existing check (which required them to be on
the same share). Note that this works to Windows (and presumably
most other NAS) but Samba requires that the source
and target be on the same share.  Moving a file across
shares is a common use case and can be very heplful (100x faster).

Signed-off-by: Steve French <steve.french@primarydata.com>
Reviewed-by: David Disseldorp <ddiss@samba.org>
---
 fs/cifs/cifsfs.c | 12 ------------
 fs/cifs/ioctl.c  | 11 ++++++++---
 2 files changed, 8 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index f578ef9..cbc0f4b 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -925,9 +925,7 @@ const struct file_operations cifs_file_ops = {
 	.mmap  = cifs_file_mmap,
 	.splice_read = generic_file_splice_read,
 	.llseek = cifs_llseek,
-#ifdef CONFIG_CIFS_POSIX
 	.unlocked_ioctl	= cifs_ioctl,
-#endif /* CONFIG_CIFS_POSIX */
 	.setlease = cifs_setlease,
 	.fallocate = cifs_fallocate,
 };
@@ -943,9 +941,7 @@ const struct file_operations cifs_file_strict_ops = {
 	.mmap = cifs_file_strict_mmap,
 	.splice_read = generic_file_splice_read,
 	.llseek = cifs_llseek,
-#ifdef CONFIG_CIFS_POSIX
 	.unlocked_ioctl	= cifs_ioctl,
-#endif /* CONFIG_CIFS_POSIX */
 	.setlease = cifs_setlease,
 	.fallocate = cifs_fallocate,
 };
@@ -961,9 +957,7 @@ const struct file_operations cifs_file_direct_ops = {
 	.flush = cifs_flush,
 	.mmap = cifs_file_mmap,
 	.splice_read = generic_file_splice_read,
-#ifdef CONFIG_CIFS_POSIX
 	.unlocked_ioctl  = cifs_ioctl,
-#endif /* CONFIG_CIFS_POSIX */
 	.llseek = cifs_llseek,
 	.setlease = cifs_setlease,
 	.fallocate = cifs_fallocate,
@@ -979,9 +973,7 @@ const struct file_operations cifs_file_nobrl_ops = {
 	.mmap  = cifs_file_mmap,
 	.splice_read = generic_file_splice_read,
 	.llseek = cifs_llseek,
-#ifdef CONFIG_CIFS_POSIX
 	.unlocked_ioctl	= cifs_ioctl,
-#endif /* CONFIG_CIFS_POSIX */
 	.setlease = cifs_setlease,
 	.fallocate = cifs_fallocate,
 };
@@ -996,9 +988,7 @@ const struct file_operations cifs_file_strict_nobrl_ops = {
 	.mmap = cifs_file_strict_mmap,
 	.splice_read = generic_file_splice_read,
 	.llseek = cifs_llseek,
-#ifdef CONFIG_CIFS_POSIX
 	.unlocked_ioctl	= cifs_ioctl,
-#endif /* CONFIG_CIFS_POSIX */
 	.setlease = cifs_setlease,
 	.fallocate = cifs_fallocate,
 };
@@ -1013,9 +1003,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
 	.flush = cifs_flush,
 	.mmap = cifs_file_mmap,
 	.splice_read = generic_file_splice_read,
-#ifdef CONFIG_CIFS_POSIX
 	.unlocked_ioctl  = cifs_ioctl,
-#endif /* CONFIG_CIFS_POSIX */
 	.llseek = cifs_llseek,
 	.setlease = cifs_setlease,
 	.fallocate = cifs_fallocate,
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 28a77bf..35cf990 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -85,9 +85,14 @@ static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file,
 	src_tcon = tlink_tcon(smb_file_src->tlink);
 	target_tcon = tlink_tcon(smb_file_target->tlink);
 
-	/* check if source and target are on same tree connection */
-	if (src_tcon != target_tcon) {
-		cifs_dbg(VFS, "file copy src and target on different volume\n");
+	/* check source and target on same server (or volume if dup_extents) */
+	if (dup_extents && (src_tcon != target_tcon)) {
+		cifs_dbg(VFS, "source and target of copy not on same share\n");
+		goto out_fput;
+	}
+
+	if (!dup_extents && (src_tcon->ses != target_tcon->ses)) {
+		cifs_dbg(VFS, "source and target of copy not on same server\n");
 		goto out_fput;
 	}
 
-- 
cgit v1.1


From 31dddd9eb9ebae9a2a9b502750e9e481d752180a Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Wed, 28 Oct 2015 09:05:31 -0500
Subject: GFS2: Fix rgrp end rounding problem for bsize < page size

This patch fixes a bug introduced by commit 7005c3e. That patch
tries to map a vm range for resource groups, but the calculation
breaks down when the block size is less than the page size.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
---
 fs/gfs2/rgrp.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index d29dd0c..c134c04 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -933,8 +933,9 @@ static int read_rindex_entry(struct gfs2_inode *ip)
 		goto fail;
 
 	rgd->rd_gl->gl_object = rgd;
-	rgd->rd_gl->gl_vm.start = rgd->rd_addr * bsize;
-	rgd->rd_gl->gl_vm.end = rgd->rd_gl->gl_vm.start + (rgd->rd_length * bsize) - 1;
+	rgd->rd_gl->gl_vm.start = (rgd->rd_addr * bsize) & PAGE_CACHE_MASK;
+	rgd->rd_gl->gl_vm.end = PAGE_CACHE_ALIGN((rgd->rd_addr +
+						  rgd->rd_length) * bsize) - 1;
 	rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
 	rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED);
 	if (rgd->rd_data > sdp->sd_max_rg_data)
-- 
cgit v1.1


From 5037835c1f3eabf4f22163fc0278dd87165f8957 Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Mon, 5 Oct 2015 16:33:36 -0600
Subject: coredump: add DAX filtering for ELF coredumps

Add two new flags to the existing coredump mechanism for ELF files to
allow us to explicitly filter DAX mappings.  This is desirable because
DAX mappings, like hugetlb mappings, have the potential to be very
large.

Update the coredump_filter documentation in
Documentation/filesystems/proc.txt so that it addresses the new DAX
coredump flags.  Also update the documented default value of
coredump_filter to be consistent with the core(5) man page.  The
documentation being updated talks about bit 4, Dump ELF headers, which
is enabled if CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is turned on in the
kernel config.  This kernel config option defaults to "y" if both ELF
binaries and coredump are enabled.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Acked-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/binfmt_elf.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'fs')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 6b65996..5f399ea 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -35,6 +35,7 @@
 #include <linux/utsname.h>
 #include <linux/coredump.h>
 #include <linux/sched.h>
+#include <linux/dax.h>
 #include <asm/uaccess.h>
 #include <asm/param.h>
 #include <asm/page.h>
@@ -1236,6 +1237,15 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
 	if (vma->vm_flags & VM_DONTDUMP)
 		return 0;
 
+	/* support for DAX */
+	if (vma_is_dax(vma)) {
+		if ((vma->vm_flags & VM_SHARED) && FILTER(DAX_SHARED))
+			goto whole;
+		if (!(vma->vm_flags & VM_SHARED) && FILTER(DAX_PRIVATE))
+			goto whole;
+		return 0;
+	}
+
 	/* Hugetlb memory check */
 	if (vma->vm_flags & VM_HUGETLB) {
 		if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED))
-- 
cgit v1.1


From ab27a8d04b32b6ee8c30c14c4afd1058e8addc82 Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Mon, 5 Oct 2015 16:33:37 -0600
Subject: coredump: add DAX filtering for FDPIC ELF coredumps

Add explicit filtering for DAX mappings to FDPIC ELF coredump.  This is
useful because DAX mappings have the potential to be very large.

This patch has only been compile tested.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Acked-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/binfmt_elf_fdpic.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'fs')

diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index d3634bf..8632501 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -35,6 +35,7 @@
 #include <linux/elf-fdpic.h>
 #include <linux/elfcore.h>
 #include <linux/coredump.h>
+#include <linux/dax.h>
 
 #include <asm/uaccess.h>
 #include <asm/param.h>
@@ -1206,6 +1207,20 @@ static int maydump(struct vm_area_struct *vma, unsigned long mm_flags)
 		return 0;
 	}
 
+	/* support for DAX */
+	if (vma_is_dax(vma)) {
+		if (vma->vm_flags & VM_SHARED) {
+			dump_ok = test_bit(MMF_DUMP_DAX_SHARED, &mm_flags);
+			kdcore("%08lx: %08lx: %s (DAX shared)", vma->vm_start,
+			       vma->vm_flags, dump_ok ? "yes" : "no");
+		} else {
+			dump_ok = test_bit(MMF_DUMP_DAX_PRIVATE, &mm_flags);
+			kdcore("%08lx: %08lx: %s (DAX private)", vma->vm_start,
+			       vma->vm_flags, dump_ok ? "yes" : "no");
+		}
+		return dump_ok;
+	}
+
 	/* By default, dump shared memory if mapped from an anonymous file. */
 	if (vma->vm_flags & VM_SHARED) {
 		if (file_inode(vma->vm_file)->i_nlink == 0) {
-- 
cgit v1.1


From f1cd1f0b7d1b5d4aaa5711e8f4e4898b0045cb6d Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 9 Nov 2015 18:06:38 +0000
Subject: Btrfs: fix race when listing an inode's xattrs

When listing a inode's xattrs we have a time window where we race against
a concurrent operation for adding a new hard link for our inode that makes
us not return any xattr to user space. In order for this to happen, the
first xattr of our inode needs to be at slot 0 of a leaf and the previous
leaf must still have room for an inode ref (or extref) item, and this can
happen because an inode's listxattrs callback does not lock the inode's
i_mutex (nor does the VFS does it for us), but adding a hard link to an
inode makes the VFS lock the inode's i_mutex before calling the inode's
link callback.

If we have the following leafs:

               Leaf X (has N items)                    Leaf Y

 [ ... (257 INODE_ITEM 0) (257 INODE_REF 256) ]  [ (257 XATTR_ITEM 12345), ... ]
           slot N - 2         slot N - 1              slot 0

The race illustrated by the following sequence diagram is possible:

       CPU 1                                               CPU 2

  btrfs_listxattr()

    searches for key (257 XATTR_ITEM 0)

    gets path with path->nodes[0] == leaf X
    and path->slots[0] == N

    because path->slots[0] is >=
    btrfs_header_nritems(leaf X), it calls
    btrfs_next_leaf()

    btrfs_next_leaf()
      releases the path

                                                   adds key (257 INODE_REF 666)
                                                   to the end of leaf X (slot N),
                                                   and leaf X now has N + 1 items

      searches for the key (257 INODE_REF 256),
      with path->keep_locks == 1, because that
      is the last key it saw in leaf X before
      releasing the path

      ends up at leaf X again and it verifies
      that the key (257 INODE_REF 256) is no
      longer the last key in leaf X, so it
      returns with path->nodes[0] == leaf X
      and path->slots[0] == N, pointing to
      the new item with key (257 INODE_REF 666)

    btrfs_listxattr's loop iteration sees that
    the type of the key pointed by the path is
    different from the type BTRFS_XATTR_ITEM_KEY
    and so it breaks the loop and stops looking
    for more xattr items
      --> the application doesn't get any xattr
          listed for our inode

So fix this by breaking the loop only if the key's type is greater than
BTRFS_XATTR_ITEM_KEY and skip the current key if its type is smaller.

Cc: stable@vger.kernel.org
Signed-off-by: Filipe Manana <fdmanana@suse.com>
---
 fs/btrfs/xattr.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 6f518c9..1fcd7b6 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -313,8 +313,10 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
 		/* check to make sure this item is what we want */
 		if (found_key.objectid != key.objectid)
 			break;
-		if (found_key.type != BTRFS_XATTR_ITEM_KEY)
+		if (found_key.type > BTRFS_XATTR_ITEM_KEY)
 			break;
+		if (found_key.type < BTRFS_XATTR_ITEM_KEY)
+			goto next;
 
 		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
 		if (verify_dir_item(root, leaf, di))
-- 
cgit v1.1


From edfb8ebce225a0638cf62591d4ccb502f052ffd4 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Tue, 10 Nov 2015 10:09:45 +1100
Subject: xfs: Fix error path in xfs_get_acl

Error codes from xfs_attr_get other than -ENOATTR were not properly
reported.  Fix that.

In addition, the declaration of struct xfs_inode in xfs_acl.h isn't needed.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_acl.c | 1 +
 fs/xfs/xfs_acl.h | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 763e365..6bb470f 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -163,6 +163,7 @@ xfs_get_acl(struct inode *inode, int type)
 		 */
 		if (error == -ENOATTR)
 			goto out_update_cache;
+		acl = ERR_PTR(error);
 		goto out;
 	}
 
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 75af0a4..52f8255 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -20,7 +20,6 @@
 
 struct inode;
 struct posix_acl;
-struct xfs_inode;
 
 #ifdef CONFIG_XFS_POSIX_ACL
 extern struct posix_acl *xfs_get_acl(struct inode *inode, int type);
-- 
cgit v1.1


From 848ccfc8fe0e8ae572ed0d8a9a2c3a0cda3bce3b Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Tue, 10 Nov 2015 10:10:33 +1100
Subject: xfs: fix log recovery op header validation assert

Commit 89cebc84 ("xfs: validate transaction header length on log
recovery") added additional validation of the on-disk op header length
to protect from buffer overflow during log recovery. It accounts for the
fact that the transaction header can be split across multiple op
headers. It added an assert for when this occurs that verifies the
length of the second part of a split transaction header is less than a
full transaction header. In other words, it expects that the first op
header of a split transaction header includes at least some portion of
the transaction header.

This expectation is not always valid as a zero-length op header can
exist for the first op header of a split transaction header (see
xlog_recover_add_to_trans() for details). This means that the second op
header can have a valid, full length transaction header and thus the
full header is copied in xlog_recover_add_to_cont_trans(). Fix the
assert in xlog_recover_add_to_cont_trans() to handle this case correctly
and require that the op header length is less than or equal to a full
transaction header.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_log_recover.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 512a094..0d92b6f 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3431,7 +3431,7 @@ xlog_recover_add_to_cont_trans(
 	 * previous record. Copy the rest of the header.
 	 */
 	if (list_empty(&trans->r_itemq)) {
-		ASSERT(len < sizeof(struct xfs_trans_header));
+		ASSERT(len <= sizeof(struct xfs_trans_header));
 		if (len > sizeof(struct xfs_trans_header)) {
 			xfs_warn(log->l_mp, "%s: bad header length", __func__);
 			return -EIO;
-- 
cgit v1.1


From 7a29ac474a47eb8cf212b45917683ae89d6fa13b Mon Sep 17 00:00:00 2001
From: Chris Mason <clm@fb.com>
Date: Tue, 10 Nov 2015 10:10:34 +1100
Subject: xfs: give all workqueues rescuer threads

We're consistently hitting deadlocks here with XFS on recent kernels.
After some digging through the crash files, it looks like everyone in
the system is waiting for XFS to reclaim memory.

Something like this:

PID: 2733434  TASK: ffff8808cd242800  CPU: 19  COMMAND: "java"
 #0 [ffff880019c53588] __schedule at ffffffff818c4df2
 #1 [ffff880019c535d8] schedule at ffffffff818c5517
 #2 [ffff880019c535f8] _xfs_log_force_lsn at ffffffff81316348
 #3 [ffff880019c53688] xfs_log_force_lsn at ffffffff813164fb
 #4 [ffff880019c536b8] xfs_iunpin_wait at ffffffff8130835e
 #5 [ffff880019c53728] xfs_reclaim_inode at ffffffff812fd453
 #6 [ffff880019c53778] xfs_reclaim_inodes_ag at ffffffff812fd8c7
 #7 [ffff880019c53928] xfs_reclaim_inodes_nr at ffffffff812fe433
 #8 [ffff880019c53958] xfs_fs_free_cached_objects at ffffffff8130d3b9
 #9 [ffff880019c53968] super_cache_scan at ffffffff811a6f73
#10 [ffff880019c539c8] shrink_slab at ffffffff811460e6
#11 [ffff880019c53aa8] shrink_zone at ffffffff8114a53f
#12 [ffff880019c53b48] do_try_to_free_pages at ffffffff8114a8ba
#13 [ffff880019c53be8] try_to_free_pages at ffffffff8114ad5a
#14 [ffff880019c53c78] __alloc_pages_nodemask at ffffffff8113e1b8
#15 [ffff880019c53d88] alloc_kmem_pages_node at ffffffff8113e671
#16 [ffff880019c53dd8] copy_process at ffffffff8104f781
#17 [ffff880019c53ec8] do_fork at ffffffff8105129c
#18 [ffff880019c53f38] sys_clone at ffffffff810515b6
#19 [ffff880019c53f48] stub_clone at ffffffff818c8e4d

xfs_log_force_lsn is waiting for logs to get cleaned, which is waiting
for IO, which is waiting for workers to complete the IO which is waiting
for worker threads that don't exist yet:

PID: 2752451  TASK: ffff880bd6bdda00  CPU: 37  COMMAND: "kworker/37:1"
 #0 [ffff8808d20abbb0] __schedule at ffffffff818c4df2
 #1 [ffff8808d20abc00] schedule at ffffffff818c5517
 #2 [ffff8808d20abc20] schedule_timeout at ffffffff818c7c6c
 #3 [ffff8808d20abcc0] wait_for_completion_killable at ffffffff818c6495
 #4 [ffff8808d20abd30] kthread_create_on_node at ffffffff8106ec82
 #5 [ffff8808d20abdf0] create_worker at ffffffff8106752f
 #6 [ffff8808d20abe40] worker_thread at ffffffff810699be
 #7 [ffff8808d20abec0] kthread at ffffffff8106ef59
 #8 [ffff8808d20abf50] ret_from_fork at ffffffff818c8ac8

I think we should be using WQ_MEM_RECLAIM to make sure this thread
pool makes progress when we're not able to allocate new workers.

[dchinner: make all workqueues WQ_MEM_RECLAIM]

Signed-off-by: Chris Mason <clm@fb.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_super.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 29531ec..65fbfb7 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -838,17 +838,18 @@ xfs_init_mount_workqueues(
 		goto out_destroy_unwritten;
 
 	mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
-			WQ_FREEZABLE, 0, mp->m_fsname);
+			WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
 	if (!mp->m_reclaim_workqueue)
 		goto out_destroy_cil;
 
 	mp->m_log_workqueue = alloc_workqueue("xfs-log/%s",
-			WQ_FREEZABLE|WQ_HIGHPRI, 0, mp->m_fsname);
+			WQ_MEM_RECLAIM|WQ_FREEZABLE|WQ_HIGHPRI, 0,
+			mp->m_fsname);
 	if (!mp->m_log_workqueue)
 		goto out_destroy_reclaim;
 
 	mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s",
-			WQ_FREEZABLE, 0, mp->m_fsname);
+			WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
 	if (!mp->m_eofblocks_workqueue)
 		goto out_destroy_log;
 
-- 
cgit v1.1


From 30fdc8ee0ef0d9dffaeaccb202f79d69e1291cbd Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Mon, 9 Nov 2015 14:57:58 -0800
Subject: fs/inode.c: fix kernel-doc warning

Fix kernel-doc warning in fs/inode.c:

  ../fs/inode.c:1606: warning: No description found for parameter 'inode'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/inode.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/inode.c b/fs/inode.c
index 78a17b8..1be5f90 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1597,6 +1597,7 @@ static int update_time(struct inode *inode, struct timespec *time, int flags)
 /**
  *	touch_atime	-	update the access time
  *	@path: the &struct path to update
+ *	@inode: inode to update
  *
  *	Update the accessed time on an inode and mark it for writeback.
  *	This function automatically handles read only file systems and media,
-- 
cgit v1.1


From dbce03b9e3e61e122451a7aa4e6900d5f0bb5993 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Mon, 9 Nov 2015 14:58:00 -0800
Subject: fs/writeback.c: fix kernel-doc warnings

Fix kernel-doc warnings in fs/fs-writeback.c by moving a #define macro to
after the function's opening brace.  Also #undef this macro at the end of
the function.

  ../fs/fs-writeback.c:1984: warning: Excess function parameter 'inode' description in 'I_DIRTY_INODE'
  ../fs/fs-writeback.c:1984: warning: Excess function parameter 'flags' description in 'I_DIRTY_INODE'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fs-writeback.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 206a68b..023f6a1 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1981,9 +1981,9 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
  * page->mapping->host, so the page-dirtying time is recorded in the internal
  * blockdev inode.
  */
-#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
 void __mark_inode_dirty(struct inode *inode, int flags)
 {
+#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
 	struct super_block *sb = inode->i_sb;
 	int dirtytime;
 
@@ -2093,6 +2093,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 out_unlock_inode:
 	spin_unlock(&inode->i_lock);
 
+#undef I_DIRTY_INODE
 }
 EXPORT_SYMBOL(__mark_inode_dirty);
 
-- 
cgit v1.1


From 79211c8ed19c055ca105502c8733800d442a0ae6 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Mon, 9 Nov 2015 14:58:13 -0800
Subject: remove abs64()

Switch everything to the new and more capable implementation of abs().
Mainly to give the new abs() a bit of a workout.

Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext4/mballoc.c  | 4 ++--
 fs/gfs2/lock_dlm.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b4b3c1f..61eaf74 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3333,8 +3333,8 @@ ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
 		atomic_inc(&pa->pa_count);
 		return pa;
 	}
-	cur_distance = abs64(goal_block - cpa->pa_pstart);
-	new_distance = abs64(goal_block - pa->pa_pstart);
+	cur_distance = abs(goal_block - cpa->pa_pstart);
+	new_distance = abs(goal_block - pa->pa_pstart);
 
 	if (cur_distance <= new_distance)
 		return cpa;
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 284c154..8b907c5 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -50,7 +50,7 @@ static inline void gfs2_update_stats(struct gfs2_lkstats *s, unsigned index,
 	s64 delta = sample - s->stats[index];
 	s->stats[index] += (delta >> 3);
 	index++;
-	s->stats[index] += ((abs64(delta) - s->stats[index]) >> 2);
+	s->stats[index] += ((abs(delta) - s->stats[index]) >> 2);
 }
 
 /**
-- 
cgit v1.1


From 349c7037b15927eed3cc8a5dbc08639dea654ea5 Mon Sep 17 00:00:00 2001
From: Yaowei Bai <bywxiaobai@163.com>
Date: Mon, 9 Nov 2015 14:58:31 -0800
Subject: fs/9p: remove unnecessary new_valid_dev() checks

new_valid_dev() always returns 1, so the !new_valid_dev() check is not
needed.  Remove it.

Signed-off-by: Yaowei Bai <bywxiaobai@163.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Eric Van Hensbergen <ericvh@gmail.com>
Cc: Ron Minnich <rminnich@sandia.gov>
Cc: Latchesar Ionkov <lucho@ionkov.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/9p/vfs_inode.c      | 3 ---
 fs/9p/vfs_inode_dotl.c | 3 ---
 2 files changed, 6 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index b1dc518..699941e 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1368,9 +1368,6 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rde
 		 dir->i_ino, dentry, mode,
 		 MAJOR(rdev), MINOR(rdev));
 
-	if (!new_valid_dev(rdev))
-		return -EINVAL;
-
 	/* build extension */
 	if (S_ISBLK(mode))
 		sprintf(name, "b %u %u", MAJOR(rdev), MINOR(rdev));
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index e8aa57d..cb899af 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -829,9 +829,6 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
 		 dir->i_ino, dentry, omode,
 		 MAJOR(rdev), MINOR(rdev));
 
-	if (!new_valid_dev(rdev))
-		return -EINVAL;
-
 	v9ses = v9fs_inode2v9ses(dir);
 	dir_dentry = dentry->d_parent;
 	dfid = v9fs_fid_lookup(dir_dentry);
-- 
cgit v1.1


From 7cac0a85992229a631d4e01934d2e1c042566189 Mon Sep 17 00:00:00 2001
From: Yaowei Bai <bywxiaobai@163.com>
Date: Mon, 9 Nov 2015 14:58:34 -0800
Subject: fs/btrfs/inode.c: remove unnecessary new_valid_dev() check

new_valid_dev() always returns 1, so the !new_valid_dev() check is not
needed.  Remove it.

Signed-off-by: Yaowei Bai <bywxiaobai@163.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Chris Mason <clm@fb.com>
Cc: Josef Bacik <jbacik@fb.com>
Acked-by: David Sterba <dsterba@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/btrfs/inode.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4439fbb..0e4f2bf 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6360,9 +6360,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 	u64 objectid;
 	u64 index = 0;
 
-	if (!new_valid_dev(rdev))
-		return -EINVAL;
-
 	/*
 	 * 2 for inode item and ref
 	 * 2 for dir items
-- 
cgit v1.1


From 5738939289d58829fd8b73f8c26b34ab08539fcf Mon Sep 17 00:00:00 2001
From: Yaowei Bai <bywxiaobai@163.com>
Date: Mon, 9 Nov 2015 14:58:37 -0800
Subject: fs/exofs/namei.c: remove unnecessary new_valid_dev() check

new_valid_dev() always returns 1, so the !new_valid_dev() check is not
needed.  Remove it.

Signed-off-by: Yaowei Bai <bywxiaobai@163.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Acked-by: Boaz Harrosh <ooo@electrozaur.com>
Cc: Benny Halevy <bhalevy@primarydata.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exofs/namei.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
index 09a6bb1..994e078 100644
--- a/fs/exofs/namei.c
+++ b/fs/exofs/namei.c
@@ -80,9 +80,6 @@ static int exofs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
 	struct inode *inode;
 	int err;
 
-	if (!new_valid_dev(rdev))
-		return -EINVAL;
-
 	inode = exofs_new_inode(dir, mode);
 	err = PTR_ERR(inode);
 	if (!IS_ERR(inode)) {
-- 
cgit v1.1


From d7df00072e64f972e0f7eb4fea78eed2da9c0e59 Mon Sep 17 00:00:00 2001
From: Yaowei Bai <bywxiaobai@163.com>
Date: Mon, 9 Nov 2015 14:58:39 -0800
Subject: fs/ext2/namei.c: remove unnecessary new_valid_dev() check

new_valid_dev() always returns 1, so the !new_valid_dev() check is not
needed.  Remove it.

Signed-off-by: Yaowei Bai <bywxiaobai@163.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Jan Kara <jack@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext2/namei.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index b4841e3..3267a80 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -143,9 +143,6 @@ static int ext2_mknod (struct inode * dir, struct dentry *dentry, umode_t mode,
 	struct inode * inode;
 	int err;
 
-	if (!new_valid_dev(rdev))
-		return -EINVAL;
-
 	err = dquot_initialize(dir);
 	if (err)
 		return err;
-- 
cgit v1.1


From a8415e4b130e0cd919c7c6584bdf27660f3ff295 Mon Sep 17 00:00:00 2001
From: Yaowei Bai <bywxiaobai@163.com>
Date: Mon, 9 Nov 2015 14:58:42 -0800
Subject: fs/f2fs/namei.c: remove unnecessary new_valid_dev() check

new_valid_dev() always returns 1, so the !new_valid_dev() check is not
needed.  Remove it.

Signed-off-by: Yaowei Bai <bywxiaobai@163.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Changman Lee <cm224.lee@samsung.com>
Cc: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/f2fs/namei.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index e48b80c..2c32110 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -481,9 +481,6 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
 	struct inode *inode;
 	int err = 0;
 
-	if (!new_valid_dev(rdev))
-		return -EINVAL;
-
 	f2fs_balance_fs(sbi);
 
 	inode = f2fs_new_inode(dir, mode);
-- 
cgit v1.1


From fdca5e6a6d0a6600ff24d6d9c843e5aee637f29f Mon Sep 17 00:00:00 2001
From: Yaowei Bai <bywxiaobai@163.com>
Date: Mon, 9 Nov 2015 14:58:44 -0800
Subject: fs/hpfs/namei.c: remove unnecessary new_valid_dev() check

new_valid_dev() always returns 1, so the !new_valid_dev() check is not
needed.  Remove it.

Signed-off-by: Yaowei Bai <bywxiaobai@163.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Mikulas Patocka <mikulas@artax.karlin.mff.cuni.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hpfs/namei.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 9e92c9c..ae4d5a1 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -227,8 +227,6 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, de
 	int err;
 	if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err;
 	if (hpfs_sb(dir->i_sb)->sb_eas < 2) return -EPERM;
-	if (!new_valid_dev(rdev))
-		return -EINVAL;
 	hpfs_lock(dir->i_sb);
 	err = -ENOSPC;
 	fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh);
-- 
cgit v1.1


From a2a17044094d0816d56a9e151bbf146d89a23010 Mon Sep 17 00:00:00 2001
From: Yaowei Bai <bywxiaobai@163.com>
Date: Mon, 9 Nov 2015 14:58:47 -0800
Subject: fs/jfs: remove unnecessary new_valid_dev() checks

new_valid_dev() always returns 1, so the !new_valid_dev() checks are not
needed.  Remove them.

Signed-off-by: Yaowei Bai <bywxiaobai@163.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Acked-by: Dave Kleikamp <dave.kleikamp@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/jfs/namei.c | 3 ---
 fs/jfs/super.c | 3 ---
 2 files changed, 6 deletions(-)

(limited to 'fs')

diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 35976bd..9d7551f 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1372,9 +1372,6 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
 	tid_t tid;
 	struct tblock *tblk;
 
-	if (!new_valid_dev(rdev))
-		return -EINVAL;
-
 	jfs_info("jfs_mknod: %pd", dentry);
 
 	rc = dquot_initialize(dir);
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 4cd9798f..8f9176c 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -496,9 +496,6 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
 
 	jfs_info("In jfs_read_super: s_flags=0x%lx", sb->s_flags);
 
-	if (!new_valid_dev(sb->s_bdev->bd_dev))
-		return -EOVERFLOW;
-
 	sbi = kzalloc(sizeof(struct jfs_sb_info), GFP_KERNEL);
 	if (!sbi)
 		return -ENOMEM;
-- 
cgit v1.1


From 4467e29f0c48d0fc3e164c39c1eccd1ce230fbe4 Mon Sep 17 00:00:00 2001
From: Yaowei Bai <bywxiaobai@163.com>
Date: Mon, 9 Nov 2015 14:58:50 -0800
Subject: fs/ncpfs/dir.c: remove unnecessary new_valid_dev() check

new_valid_dev() always returns 1, so the !new_valid_dev() check is not
needed.  Remove it.

Signed-off-by: Yaowei Bai <bywxiaobai@163.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Petr Vandrovec <petr@vandrovec.name>
Cc: Jan Kara <jack@suse.cz>
Cc: Jens Axboe <axboe@fb.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Fabian Frederick <fabf@skynet.be>
Cc: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ncpfs/dir.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 356816e..f0e3e9e 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -1165,8 +1165,6 @@ out:
 static int ncp_mknod(struct inode * dir, struct dentry *dentry,
 		     umode_t mode, dev_t rdev)
 {
-	if (!new_valid_dev(rdev))
-		return -EINVAL;
 	if (ncp_is_nfs_extras(NCP_SERVER(dir), NCP_FINFO(dir)->volNumber)) {
 		ncp_dbg(1, "mode = 0%ho\n", mode);
 		return ncp_create_new(dir, dentry, mode, rdev, 0);
-- 
cgit v1.1


From 3348a172befd1b78025e30037bc383514be8d32f Mon Sep 17 00:00:00 2001
From: Yaowei Bai <bywxiaobai@163.com>
Date: Mon, 9 Nov 2015 14:58:53 -0800
Subject: fs/nilfs2/namei.c: remove unnecessary new_valid_dev() check

new_valid_dev() always returns 1, so the !new_valid_dev() check is not
needed.  Remove it.

Signed-off-by: Yaowei Bai <bywxiaobai@163.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Acked-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nilfs2/namei.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 37dd6b0..c9a1a49 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -120,9 +120,6 @@ nilfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)
 	struct nilfs_transaction_info ti;
 	int err;
 
-	if (!new_valid_dev(rdev))
-		return -EINVAL;
-
 	err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
 	if (err)
 		return err;
-- 
cgit v1.1


From 3cc5d9a905826e835d4d4dcd0faf922ab7b9eaf9 Mon Sep 17 00:00:00 2001
From: Yaowei Bai <bywxiaobai@163.com>
Date: Mon, 9 Nov 2015 14:58:55 -0800
Subject: fs/reiserfs/namei.c: remove unnecessary new_valid_dev() check

new_valid_dev() always returns 1, so the !new_valid_dev() check is not
needed.  Remove it.

Signed-off-by: Yaowei Bai <bywxiaobai@163.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/reiserfs/namei.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 5f1c9c2..47f9698 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -712,9 +712,6 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode
 	    2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
 		 REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
 
-	if (!new_valid_dev(rdev))
-		return -EINVAL;
-
 	retval = dquot_initialize(dir);
 	if (retval)
 		return retval;
-- 
cgit v1.1


From 28f65708a57395799781f5c44863b50f99facbf2 Mon Sep 17 00:00:00 2001
From: Yaowei Bai <bywxiaobai@163.com>
Date: Mon, 9 Nov 2015 14:58:58 -0800
Subject: fs/stat.c: remove unnecessary new_valid_dev() check

new_valid_dev() always returns 1, so the !new_valid_dev() check is not
needed.  Remove it.

Signed-off-by: Yaowei Bai <bywxiaobai@163.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/stat.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/stat.c b/fs/stat.c
index cccc1aa..d4a61d8 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -367,8 +367,6 @@ static long cp_new_stat64(struct kstat *stat, struct stat64 __user *statbuf)
 	INIT_STRUCT_STAT64_PADDING(tmp);
 #ifdef CONFIG_MIPS
 	/* mips has weird padding, so we don't get 64 bits there */
-	if (!new_valid_dev(stat->dev) || !new_valid_dev(stat->rdev))
-		return -EOVERFLOW;
 	tmp.st_dev = new_encode_dev(stat->dev);
 	tmp.st_rdev = new_encode_dev(stat->rdev);
 #else
-- 
cgit v1.1


From 1bde925d235478cd5e70a7d9f94f3d8eb7c6362b Mon Sep 17 00:00:00 2001
From: Rich Felker <dalias@libc.org>
Date: Mon, 9 Nov 2015 14:59:01 -0800
Subject: fs/binfmt_elf_fdpic.c: provide NOMMU loader for regular ELF binaries

The ELF binary loader in binfmt_elf.c requires an MMU, making it
impossible to use regular ELF binaries on NOMMU archs.  However, the FDPIC
ELF loader in binfmt_elf_fdpic.c is fully capable as a loader for plain
ELF, which requires constant displacements between LOAD segments, since it
already supports FDPIC ELF files flagged as needing constant displacement.

This patch adjusts the FDPIC ELF loader to accept non-FDPIC ELF files on
NOMMU archs.  They are treated identically to FDPIC ELF files with the
constant-displacement flag bit set, except for personality, which must
match the ABI of the program being loaded; the PER_LINUX_FDPIC personality
controls how the kernel interprets function pointers passed to sigaction.

Files that do not set a stack size requirement explicitly are given a
default stack size (matching the amount of committed stack the normal ELF
loader for MMU archs would give them) rather than being rejected; this is
necessary because plain ELF files generally do not declare stack
requirements in theit program headers.

Only ET_DYN (PIE) format ELF files are supported, since loading at a fixed
virtual address is not possible on NOMMU.

This patch was developed and tested on J2 (SH2-compatible) but should
be usable immediately on all archs where binfmt_elf_fdpic is
available. Moreover, by providing dummy definitions of the
elf_check_fdpic() and elf_check_const_displacement() macros for archs
which lack an FDPIC ABI, it should be possible to enable building of
binfmt_elf_fdpic on all other NOMMU archs and thereby give them ELF
binary support, but I have not yet tested this.

The motivation for using binfmt_elf_fdpic.c rather than adapting
binfmt_elf.c to NOMMU is that the former already has all the necessary
code to work properly on NOMMU and has already received widespread
real-world use and testing. I hope this is not controversial.

I'm not really happy with having to unset the FDPIC_FUNCPTRS
personality bit when loading non-FDPIC ELF. This bit should really
reset automatically on execve, since otherwise, executing non-ELF
binaries (e.g. bFLT) from an FDPIC process will leave the personality
in the wrong state and severely break signal handling. But that's a
separate, existing bug and I don't know the right place to fix it.

Signed-off-by: Rich Felker <dalias@libc.org>
Acked-by: Greg Ungerer <gerg@uclinux.org>
Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
Cc: Matt Mackall <mpm@selenic.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: David Howells <dhowells@redhat.com>
Cc: Oleg Endo <oleg.endo@t-online.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/binfmt_elf_fdpic.c | 46 ++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 38 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index d3634bf..2046206 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -103,19 +103,36 @@ static void __exit exit_elf_fdpic_binfmt(void)
 core_initcall(init_elf_fdpic_binfmt);
 module_exit(exit_elf_fdpic_binfmt);
 
-static int is_elf_fdpic(struct elfhdr *hdr, struct file *file)
+static int is_elf(struct elfhdr *hdr, struct file *file)
 {
 	if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0)
 		return 0;
 	if (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN)
 		return 0;
-	if (!elf_check_arch(hdr) || !elf_check_fdpic(hdr))
+	if (!elf_check_arch(hdr))
 		return 0;
 	if (!file->f_op->mmap)
 		return 0;
 	return 1;
 }
 
+#ifndef elf_check_fdpic
+#define elf_check_fdpic(x) 0
+#endif
+
+#ifndef elf_check_const_displacement
+#define elf_check_const_displacement(x) 0
+#endif
+
+static int is_constdisp(struct elfhdr *hdr)
+{
+	if (!elf_check_fdpic(hdr))
+		return 1;
+	if (elf_check_const_displacement(hdr))
+		return 1;
+	return 0;
+}
+
 /*****************************************************************************/
 /*
  * read the program headers table into memory
@@ -191,8 +208,18 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
 
 	/* check that this is a binary we know how to deal with */
 	retval = -ENOEXEC;
-	if (!is_elf_fdpic(&exec_params.hdr, bprm->file))
+	if (!is_elf(&exec_params.hdr, bprm->file))
 		goto error;
+	if (!elf_check_fdpic(&exec_params.hdr)) {
+#ifdef CONFIG_MMU
+		/* binfmt_elf handles non-fdpic elf except on nommu */
+		goto error;
+#else
+		/* nommu can only load ET_DYN (PIE) ELF */
+		if (exec_params.hdr.e_type != ET_DYN)
+			goto error;
+#endif
+	}
 
 	/* read the program header table */
 	retval = elf_fdpic_fetch_phdrs(&exec_params, bprm->file);
@@ -269,13 +296,13 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
 
 	}
 
-	if (elf_check_const_displacement(&exec_params.hdr))
+	if (is_constdisp(&exec_params.hdr))
 		exec_params.flags |= ELF_FDPIC_FLAG_CONSTDISP;
 
 	/* perform insanity checks on the interpreter */
 	if (interpreter_name) {
 		retval = -ELIBBAD;
-		if (!is_elf_fdpic(&interp_params.hdr, interpreter))
+		if (!is_elf(&interp_params.hdr, interpreter))
 			goto error;
 
 		interp_params.flags = ELF_FDPIC_FLAG_PRESENT;
@@ -306,9 +333,9 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
 
 	retval = -ENOEXEC;
 	if (stack_size == 0)
-		goto error;
+		stack_size = 131072UL; /* same as exec.c's default commit */
 
-	if (elf_check_const_displacement(&interp_params.hdr))
+	if (is_constdisp(&interp_params.hdr))
 		interp_params.flags |= ELF_FDPIC_FLAG_CONSTDISP;
 
 	/* flush all traces of the currently running executable */
@@ -319,7 +346,10 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
 	/* there's now no turning back... the old userspace image is dead,
 	 * defunct, deceased, etc.
 	 */
-	set_personality(PER_LINUX_FDPIC);
+	if (elf_check_fdpic(&exec_params.hdr))
+		set_personality(PER_LINUX_FDPIC);
+	else
+		set_personality(PER_LINUX);
 	if (elf_read_implies_exec(&exec_params.hdr, executable_stack))
 		current->personality |= READ_IMPLIES_EXEC;
 
-- 
cgit v1.1


From 3e80dbcda7f3e1e349a779d7a14c0e08677c39fa Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@poochiereds.net>
Date: Wed, 4 Nov 2015 11:02:29 -0500
Subject: nfsd: remove recurring workqueue job to clean DRC

We have a shrinker, we clean out the cache when nfsd is shut down, and
prune the chains on each request. A recurring workqueue job seems like
unnecessary overhead. Just remove it.

Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfscache.c | 26 --------------------------
 1 file changed, 26 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 116940c..54cde9a 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -63,7 +63,6 @@ static unsigned int		longest_chain;
 static unsigned int		longest_chain_cachesize;
 
 static int	nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec);
-static void	cache_cleaner_func(struct work_struct *unused);
 static unsigned long nfsd_reply_cache_count(struct shrinker *shrink,
 					    struct shrink_control *sc);
 static unsigned long nfsd_reply_cache_scan(struct shrinker *shrink,
@@ -76,13 +75,6 @@ static struct shrinker nfsd_reply_cache_shrinker = {
 };
 
 /*
- * locking for the reply cache:
- * A cache entry is "single use" if c_state == RC_INPROG
- * Otherwise, it when accessing _prev or _next, the lock must be held.
- */
-static DECLARE_DELAYED_WORK(cache_cleaner, cache_cleaner_func);
-
-/*
  * Put a cap on the size of the DRC based on the amount of available
  * low memory in the machine.
  *
@@ -203,7 +195,6 @@ void nfsd_reply_cache_shutdown(void)
 	unsigned int i;
 
 	unregister_shrinker(&nfsd_reply_cache_shrinker);
-	cancel_delayed_work_sync(&cache_cleaner);
 
 	for (i = 0; i < drc_hashsize; i++) {
 		struct list_head *head = &drc_hashtbl[i].lru_head;
@@ -230,7 +221,6 @@ lru_put_end(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
 {
 	rp->c_timestamp = jiffies;
 	list_move_tail(&rp->c_lru, &b->lru_head);
-	schedule_delayed_work(&cache_cleaner, RC_EXPIRE);
 }
 
 static long
@@ -264,7 +254,6 @@ prune_cache_entries(void)
 {
 	unsigned int i;
 	long freed = 0;
-	bool cancel = true;
 
 	for (i = 0; i < drc_hashsize; i++) {
 		struct nfsd_drc_bucket *b = &drc_hashtbl[i];
@@ -273,26 +262,11 @@ prune_cache_entries(void)
 			continue;
 		spin_lock(&b->cache_lock);
 		freed += prune_bucket(b);
-		if (!list_empty(&b->lru_head))
-			cancel = false;
 		spin_unlock(&b->cache_lock);
 	}
-
-	/*
-	 * Conditionally rearm the job to run in RC_EXPIRE since we just
-	 * ran the pruner.
-	 */
-	if (!cancel)
-		mod_delayed_work(system_wq, &cache_cleaner, RC_EXPIRE);
 	return freed;
 }
 
-static void
-cache_cleaner_func(struct work_struct *unused)
-{
-	prune_cache_entries();
-}
-
 static unsigned long
 nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc)
 {
-- 
cgit v1.1


From 34ed9872e745fa56f10e9bef2cf3d2336c6c8816 Mon Sep 17 00:00:00 2001
From: Andrew Elble <aweits@rit.edu>
Date: Thu, 15 Oct 2015 12:07:28 -0400
Subject: nfsd: eliminate sending duplicate and repeated delegations

We've observed the nfsd server in a state where there are
multiple delegations on the same nfs4_file for the same client.
The nfs client does attempt to DELEGRETURN these when they are presented to
it - but apparently under some (unknown) circumstances the client does not
manage to return all of them. This leads to the eventual
attempt to CB_RECALL more than one delegation with the same nfs
filehandle to the same client. The first recall will succeed, but the
next recall will fail with NFS4ERR_BADHANDLE. This leads to the server
having delegations on cl_revoked that the client has no way to FREE
or DELEGRETURN, with resulting inability to recover. The state manager
on the server will continually assert SEQ4_STATUS_RECALLABLE_STATE_REVOKED,
and the state manager on the client will be looping unable to satisfy
the server.

List discussion also reports a race between OPEN and DELEGRETURN that
will be avoided by only sending the delegation once to the
client. This is also logically in accordance with RFC5561 9.1.1 and 10.2.

So, let's:

1.) Not hand out duplicate delegations.
2.) Only send them to the client once.

RFC 5561:

9.1.1:
"Delegations and layouts, on the other hand, are not associated with a
specific owner but are associated with the client as a whole
(identified by a client ID)."

10.2:
"...the stateid for a delegation is associated with a client ID and may be
used on behalf of all the open-owners for the given client.  A
delegation is made to the client as a whole and not to any specific
process or thread of control within it."

Reported-by: Eric Meddaugh <etmsys@rit.edu>
Cc: Trond Myklebust <trond.myklebust@primarydata.com>
Cc: Olga Kornievskaia <aglo@umich.edu>
Signed-off-by: Andrew Elble <aweits@rit.edu>
Cc: stable@vger.kernel.org
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 94 +++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 84 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 0a69715..6411c34 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -778,16 +778,68 @@ void nfs4_unhash_stid(struct nfs4_stid *s)
 	s->sc_type = 0;
 }
 
-static void
+/**
+ * nfs4_get_existing_delegation - Discover if this delegation already exists
+ * @clp:     a pointer to the nfs4_client we're granting a delegation to
+ * @fp:      a pointer to the nfs4_file we're granting a delegation on
+ *
+ * Return:
+ *      On success: NULL if an existing delegation was not found.
+ *
+ *      On error: -EAGAIN if one was previously granted to this nfs4_client
+ *                 for this nfs4_file.
+ *
+ */
+
+static int
+nfs4_get_existing_delegation(struct nfs4_client *clp, struct nfs4_file *fp)
+{
+	struct nfs4_delegation *searchdp = NULL;
+	struct nfs4_client *searchclp = NULL;
+
+	lockdep_assert_held(&state_lock);
+	lockdep_assert_held(&fp->fi_lock);
+
+	list_for_each_entry(searchdp, &fp->fi_delegations, dl_perfile) {
+		searchclp = searchdp->dl_stid.sc_client;
+		if (clp == searchclp) {
+			return -EAGAIN;
+		}
+	}
+	return 0;
+}
+
+/**
+ * hash_delegation_locked - Add a delegation to the appropriate lists
+ * @dp:     a pointer to the nfs4_delegation we are adding.
+ * @fp:     a pointer to the nfs4_file we're granting a delegation on
+ *
+ * Return:
+ *      On success: NULL if the delegation was successfully hashed.
+ *
+ *      On error: -EAGAIN if one was previously granted to this
+ *                 nfs4_client for this nfs4_file. Delegation is not hashed.
+ *
+ */
+
+static int
 hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp)
 {
+	int status;
+	struct nfs4_client *clp = dp->dl_stid.sc_client;
+
 	lockdep_assert_held(&state_lock);
 	lockdep_assert_held(&fp->fi_lock);
 
+	status = nfs4_get_existing_delegation(clp, fp);
+	if (status)
+		return status;
+	++fp->fi_delegees;
 	atomic_inc(&dp->dl_stid.sc_count);
 	dp->dl_stid.sc_type = NFS4_DELEG_STID;
 	list_add(&dp->dl_perfile, &fp->fi_delegations);
-	list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations);
+	list_add(&dp->dl_perclnt, &clp->cl_delegations);
+	return 0;
 }
 
 static bool
@@ -3969,6 +4021,18 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_file *fp, int flag)
 	return fl;
 }
 
+/**
+ * nfs4_setlease - Obtain a delegation by requesting lease from vfs layer
+ * @dp:   a pointer to the nfs4_delegation we're adding.
+ *
+ * Return:
+ *      On success: Return code will be 0 on success.
+ *
+ *      On error: -EAGAIN if there was an existing delegation.
+ *                 nonzero if there is an error in other cases.
+ *
+ */
+
 static int nfs4_setlease(struct nfs4_delegation *dp)
 {
 	struct nfs4_file *fp = dp->dl_stid.sc_file;
@@ -4000,16 +4064,19 @@ static int nfs4_setlease(struct nfs4_delegation *dp)
 		goto out_unlock;
 	/* Race breaker */
 	if (fp->fi_deleg_file) {
-		status = 0;
-		++fp->fi_delegees;
-		hash_delegation_locked(dp, fp);
+		status = hash_delegation_locked(dp, fp);
 		goto out_unlock;
 	}
 	fp->fi_deleg_file = filp;
-	fp->fi_delegees = 1;
-	hash_delegation_locked(dp, fp);
+	fp->fi_delegees = 0;
+	status = hash_delegation_locked(dp, fp);
 	spin_unlock(&fp->fi_lock);
 	spin_unlock(&state_lock);
+	if (status) {
+		/* Should never happen, this is a new fi_deleg_file  */
+		WARN_ON_ONCE(1);
+		goto out_fput;
+	}
 	return 0;
 out_unlock:
 	spin_unlock(&fp->fi_lock);
@@ -4029,6 +4096,15 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
 	if (fp->fi_had_conflict)
 		return ERR_PTR(-EAGAIN);
 
+	spin_lock(&state_lock);
+	spin_lock(&fp->fi_lock);
+	status = nfs4_get_existing_delegation(clp, fp);
+	spin_unlock(&fp->fi_lock);
+	spin_unlock(&state_lock);
+
+	if (status)
+		return ERR_PTR(status);
+
 	dp = alloc_init_deleg(clp, fh, odstate);
 	if (!dp)
 		return ERR_PTR(-ENOMEM);
@@ -4047,9 +4123,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
 		status = -EAGAIN;
 		goto out_unlock;
 	}
-	++fp->fi_delegees;
-	hash_delegation_locked(dp, fp);
-	status = 0;
+	status = hash_delegation_locked(dp, fp);
 out_unlock:
 	spin_unlock(&fp->fi_lock);
 	spin_unlock(&state_lock);
-- 
cgit v1.1


From 7fc0564e3a8d16df096f48c9c6425ba84d945c6e Mon Sep 17 00:00:00 2001
From: Andrew Elble <aweits@rit.edu>
Date: Thu, 5 Nov 2015 20:42:43 -0500
Subject: nfsd: fix race with open / open upgrade stateids

We observed multiple open stateids on the server for files that
seemingly should have been closed.

nfsd4_process_open2() tests for the existence of a preexisting
stateid. If one is not found, the locks are dropped and a new
one is created. The problem is that init_open_stateid(), which
is also responsible for hashing the newly initialized stateid,
doesn't check to see if another open has raced in and created
a matching stateid. This fix is to enable init_open_stateid() to
return the matching stateid and have nfsd4_process_open2()
swap to that stateid and switch to the open upgrade path.
In testing this patch, coverage to the newly created
path indicates that the race was indeed happening.

Signed-off-by: Andrew Elble <aweits@rit.edu>
Reviewed-by: Jeff Layton <jlayton@poochiereds.net>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 78 ++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 53 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 6411c34..6b800b5b 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -3392,6 +3392,27 @@ static const struct nfs4_stateowner_operations openowner_ops = {
 	.so_free =	nfs4_free_openowner,
 };
 
+static struct nfs4_ol_stateid *
+nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open)
+{
+	struct nfs4_ol_stateid *local, *ret = NULL;
+	struct nfs4_openowner *oo = open->op_openowner;
+
+	lockdep_assert_held(&fp->fi_lock);
+
+	list_for_each_entry(local, &fp->fi_stateids, st_perfile) {
+		/* ignore lock owners */
+		if (local->st_stateowner->so_is_open_owner == 0)
+			continue;
+		if (local->st_stateowner == &oo->oo_owner) {
+			ret = local;
+			atomic_inc(&ret->st_stid.sc_count);
+			break;
+		}
+	}
+	return ret;
+}
+
 static struct nfs4_openowner *
 alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open,
 			   struct nfsd4_compound_state *cstate)
@@ -3423,9 +3444,20 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open,
 	return ret;
 }
 
-static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) {
+static struct nfs4_ol_stateid *
+init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp,
+		struct nfsd4_open *open)
+{
+
 	struct nfs4_openowner *oo = open->op_openowner;
+	struct nfs4_ol_stateid *retstp = NULL;
 
+	spin_lock(&oo->oo_owner.so_client->cl_lock);
+	spin_lock(&fp->fi_lock);
+
+	retstp = nfsd4_find_existing_open(fp, open);
+	if (retstp)
+		goto out_unlock;
 	atomic_inc(&stp->st_stid.sc_count);
 	stp->st_stid.sc_type = NFS4_OPEN_STID;
 	INIT_LIST_HEAD(&stp->st_locks);
@@ -3436,12 +3468,13 @@ static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp,
 	stp->st_deny_bmap = 0;
 	stp->st_openstp = NULL;
 	init_rwsem(&stp->st_rwsem);
-	spin_lock(&oo->oo_owner.so_client->cl_lock);
 	list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids);
-	spin_lock(&fp->fi_lock);
 	list_add(&stp->st_perfile, &fp->fi_stateids);
+
+out_unlock:
 	spin_unlock(&fp->fi_lock);
 	spin_unlock(&oo->oo_owner.so_client->cl_lock);
+	return retstp;
 }
 
 /*
@@ -3852,27 +3885,6 @@ out:
 	return nfs_ok;
 }
 
-static struct nfs4_ol_stateid *
-nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open)
-{
-	struct nfs4_ol_stateid *local, *ret = NULL;
-	struct nfs4_openowner *oo = open->op_openowner;
-
-	spin_lock(&fp->fi_lock);
-	list_for_each_entry(local, &fp->fi_stateids, st_perfile) {
-		/* ignore lock owners */
-		if (local->st_stateowner->so_is_open_owner == 0)
-			continue;
-		if (local->st_stateowner == &oo->oo_owner) {
-			ret = local;
-			atomic_inc(&ret->st_stid.sc_count);
-			break;
-		}
-	}
-	spin_unlock(&fp->fi_lock);
-	return ret;
-}
-
 static inline int nfs4_access_to_access(u32 nfs4_access)
 {
 	int flags = 0;
@@ -4258,6 +4270,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 	struct nfs4_client *cl = open->op_openowner->oo_owner.so_client;
 	struct nfs4_file *fp = NULL;
 	struct nfs4_ol_stateid *stp = NULL;
+	struct nfs4_ol_stateid *swapstp = NULL;
 	struct nfs4_delegation *dp = NULL;
 	__be32 status;
 
@@ -4271,7 +4284,9 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 		status = nfs4_check_deleg(cl, open, &dp);
 		if (status)
 			goto out;
+		spin_lock(&fp->fi_lock);
 		stp = nfsd4_find_existing_open(fp, open);
+		spin_unlock(&fp->fi_lock);
 	} else {
 		open->op_file = NULL;
 		status = nfserr_bad_stateid;
@@ -4294,7 +4309,19 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 	} else {
 		stp = open->op_stp;
 		open->op_stp = NULL;
-		init_open_stateid(stp, fp, open);
+		swapstp = init_open_stateid(stp, fp, open);
+		if (swapstp) {
+			nfs4_put_stid(&stp->st_stid);
+			stp = swapstp;
+			down_read(&stp->st_rwsem);
+			status = nfs4_upgrade_open(rqstp, fp, current_fh,
+						stp, open);
+			if (status) {
+				up_read(&stp->st_rwsem);
+				goto out;
+			}
+			goto upgrade_out;
+		}
 		down_read(&stp->st_rwsem);
 		status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open);
 		if (status) {
@@ -4308,6 +4335,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 		if (stp->st_clnt_odstate == open->op_odstate)
 			open->op_odstate = NULL;
 	}
+upgrade_out:
 	nfs4_inc_and_copy_stateid(&open->op_stateid, &stp->st_stid);
 	up_read(&stp->st_rwsem);
 
-- 
cgit v1.1


From c1c534609fe8a859f9c8108a5591e6e8a97e34d1 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Tue, 10 Nov 2015 10:14:38 -0700
Subject: direct-io: be sure to assign dio->bio_bdev for both paths

btrfs sets ->submit_io(), and we failed to set the block dev for
that path. That resulted in a potential NULL dereference when
we later wait for IO in dio_await_one().

Reported-by: kernel test robot <ying.huang@linux.intel.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/direct-io.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 7025029..15381c4 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -399,14 +399,14 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
 	if (dio->is_async && dio->rw == READ && dio->should_dirty)
 		bio_set_pages_dirty(bio);
 
+	dio->bio_bdev = bio->bi_bdev;
+
 	if (sdio->submit_io) {
 		sdio->submit_io(dio->rw, bio, dio->inode,
 			       sdio->logical_offset_in_bio);
 		dio->bio_cookie = BLK_QC_T_NONE;
-	} else {
+	} else
 		dio->bio_cookie = submit_bio(dio->rw, bio);
-		dio->bio_bdev = bio->bi_bdev;
-	}
 
 	sdio->bio = NULL;
 	sdio->boundary = 0;
-- 
cgit v1.1


From 9799d2c32bef6fba098fbef763002bc8d4851a2c Mon Sep 17 00:00:00 2001
From: Zhao Lei <zhaolei@cn.fujitsu.com>
Date: Tue, 25 Aug 2015 21:31:40 +0800
Subject: btrfs: scrub: set error stats when tree block spanning stripes

It is better to show error stats to user when we found tree block
spanning stripes.

On a btrfs created by old version of btrfs-convert:
Before patch:
  # btrfs scrub start -B /dev/vdh
  scrub done for 8b342d35-2904-41ab-b3cb-2f929709cf47
          scrub started at Tue Aug 25 21:19:09 2015 and finished after 00:00:00
          total bytes scrubbed: 53.54MiB with 0 errors
  # dmesg
  ...
  [  128.711434] BTRFS error (device vdh): scrub: tree block 27054080 spanning stripes, ignored. logical=27000832
  [  128.712744] BTRFS error (device vdh): scrub: tree block 27054080 spanning stripes, ignored. logical=27066368
  ...

After patch:
  # btrfs scrub start -B /dev/vdh
  scrub done for ff7f844b-7a4e-4b1a-88a9-8252ab25be1b
          scrub started at Tue Aug 25 21:42:29 2015 and finished after 00:00:00
          total bytes scrubbed: 53.60MiB with 2 errors
          error details:
          corrected errors: 0, uncorrectable errors: 2, unverified errors: 0
  ERROR: There are uncorrectable errors.
  # dmesg
  ...omit...
  #

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/scrub.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 550de89..3e2755e 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3012,6 +3012,9 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
 			     logic_start + map->stripe_len)) {
 				btrfs_err(fs_info, "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
 					  key.objectid, logic_start);
+				spin_lock(&sctx->stat_lock);
+				sctx->stat.uncorrectable_errors++;
+				spin_unlock(&sctx->stat_lock);
 				goto next;
 			}
 again:
@@ -3361,6 +3364,9 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 					   "scrub: tree block %llu spanning "
 					   "stripes, ignored. logical=%llu",
 				       key.objectid, logical);
+				spin_lock(&sctx->stat_lock);
+				sctx->stat.uncorrectable_errors++;
+				spin_unlock(&sctx->stat_lock);
 				goto next;
 			}
 
-- 
cgit v1.1


From 4734b7ed79e1230602927a38bc4d04dbe9dd09c1 Mon Sep 17 00:00:00 2001
From: Zhao Lei <zhaolei@cn.fujitsu.com>
Date: Wed, 19 Aug 2015 22:39:18 +0800
Subject: btrfs: scrub: setup all fields for sblock_to_check

scrub_setup_recheck_block() isn't setup all necessary fields for
sblock_to_check because history reason.

So current code need more arguments in severial functions,
and more local variables, just to passing these lacked values to
necessary place.

This patch setup above fields to sblock_to_check in
scrub_setup_recheck_block(), for:
1: more cleanup for function arg, local variable
2: to make sblock_to_check complete, then we can use sblock_to_check
   without concern about some uninitialized member.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/scrub.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 3e2755e..12ed8a7 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1318,6 +1318,9 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
 	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
 	u64 length = original_sblock->page_count * PAGE_SIZE;
 	u64 logical = original_sblock->pagev[0]->logical;
+	u64 generation = original_sblock->pagev[0]->generation;
+	u64 flags = original_sblock->pagev[0]->flags;
+	u64 have_csum = original_sblock->pagev[0]->have_csum;
 	struct scrub_recover *recover;
 	struct btrfs_bio *bbio;
 	u64 sublen;
@@ -1372,6 +1375,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
 
 			sblock = sblocks_for_recheck + mirror_index;
 			sblock->sctx = sctx;
+
 			page = kzalloc(sizeof(*page), GFP_NOFS);
 			if (!page) {
 leave_nomem:
@@ -1383,7 +1387,15 @@ leave_nomem:
 			}
 			scrub_page_get(page);
 			sblock->pagev[page_index] = page;
+			page->sblock = sblock;
+			page->flags = flags;
+			page->generation = generation;
 			page->logical = logical;
+			page->have_csum = have_csum;
+			if (have_csum)
+				memcpy(page->csum,
+				       original_sblock->pagev[0]->csum,
+				       sctx->csum_size);
 
 			scrub_stripe_index_and_offset(logical,
 						      bbio->map_type,
-- 
cgit v1.1


From 772d233f5dec86e38a96e58cc995e7b04ab4ec68 Mon Sep 17 00:00:00 2001
From: Zhao Lei <zhaolei@cn.fujitsu.com>
Date: Mon, 24 Aug 2015 21:10:18 +0800
Subject: btrfs: Reset sblock->xxx_error stats before calling
 scrub_recheck_block_checksum

We should reset sblock->xxx_error stats before calling
scrub_recheck_block_checksum().

Current code run correctly because all sblock are allocated by
k[cz]alloc(), and the error stats are not got changed.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/scrub.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 12ed8a7..95dc60a 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1495,6 +1495,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 	sblock->no_io_error_seen = 1;
 	sblock->header_error = 0;
 	sblock->checksum_error = 0;
+	sblock->generation_error = 0;
 
 	for (page_num = 0; page_num < sblock->page_count; page_num++) {
 		struct bio *bio;
@@ -2203,6 +2204,9 @@ static void scrub_missing_raid56_worker(struct btrfs_work *work)
 	logical = sblock->pagev[0]->logical;
 	dev = sblock->pagev[0]->dev;
 
+	sblock->header_error = 0;
+	sblock->checksum_error = 0;
+	sblock->generation_error = 0;
 	if (sblock->no_io_error_seen) {
 		scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
 					     have_csum, csum, generation,
-- 
cgit v1.1


From ba7cf9882bd25bed77aad7dd8865e80c7ae0070a Mon Sep 17 00:00:00 2001
From: Zhao Lei <zhaolei@cn.fujitsu.com>
Date: Mon, 24 Aug 2015 21:18:02 +0800
Subject: btrfs: Use scrub_checksum_data and scrub_checksum_tree_block for
 scrub_recheck_block_checksum

We can use existing scrub_checksum_data() and scrub_checksum_tree_block()
for scrub_recheck_block_checksum(), instead of write duplicated code.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/scrub.c | 130 +++++++++++++++----------------------------------------
 1 file changed, 36 insertions(+), 94 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 95dc60a..3e4db4b 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -251,11 +251,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 				struct scrub_block *sblock, int is_metadata,
 				int have_csum, u8 *csum, u64 generation,
 				u16 csum_size, int retry_failed_mirror);
-static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
-					 struct scrub_block *sblock,
-					 int is_metadata, int have_csum,
-					 const u8 *csum, u64 generation,
-					 u16 csum_size);
+static void scrub_recheck_block_checksum(struct scrub_block *sblock);
 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
 					     struct scrub_block *sblock_good);
 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
@@ -1493,9 +1489,6 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 	int page_num;
 
 	sblock->no_io_error_seen = 1;
-	sblock->header_error = 0;
-	sblock->checksum_error = 0;
-	sblock->generation_error = 0;
 
 	for (page_num = 0; page_num < sblock->page_count; page_num++) {
 		struct bio *bio;
@@ -1531,9 +1524,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 	}
 
 	if (sblock->no_io_error_seen)
-		scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
-					     have_csum, csum, generation,
-					     csum_size);
+		scrub_recheck_block_checksum(sblock);
 
 	return;
 }
@@ -1548,61 +1539,16 @@ static inline int scrub_check_fsid(u8 fsid[],
 	return !ret;
 }
 
-static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
-					 struct scrub_block *sblock,
-					 int is_metadata, int have_csum,
-					 const u8 *csum, u64 generation,
-					 u16 csum_size)
+static void scrub_recheck_block_checksum(struct scrub_block *sblock)
 {
-	int page_num;
-	u8 calculated_csum[BTRFS_CSUM_SIZE];
-	u32 crc = ~(u32)0;
-	void *mapped_buffer;
-
-	WARN_ON(!sblock->pagev[0]->page);
-	if (is_metadata) {
-		struct btrfs_header *h;
-
-		mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
-		h = (struct btrfs_header *)mapped_buffer;
-
-		if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) ||
-		    !scrub_check_fsid(h->fsid, sblock->pagev[0]) ||
-		    memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
-			   BTRFS_UUID_SIZE)) {
-			sblock->header_error = 1;
-		} else if (generation != btrfs_stack_header_generation(h)) {
-			sblock->header_error = 1;
-			sblock->generation_error = 1;
-		}
-		csum = h->csum;
-	} else {
-		if (!have_csum)
-			return;
-
-		mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
-	}
-
-	for (page_num = 0;;) {
-		if (page_num == 0 && is_metadata)
-			crc = btrfs_csum_data(
-				((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
-				crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
-		else
-			crc = btrfs_csum_data(mapped_buffer, crc, PAGE_SIZE);
-
-		kunmap_atomic(mapped_buffer);
-		page_num++;
-		if (page_num >= sblock->page_count)
-			break;
-		WARN_ON(!sblock->pagev[page_num]->page);
-
-		mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
-	}
+	sblock->header_error = 0;
+	sblock->checksum_error = 0;
+	sblock->generation_error = 0;
 
-	btrfs_csum_final(crc, calculated_csum);
-	if (memcmp(calculated_csum, csum, csum_size))
-		sblock->checksum_error = 1;
+	if (sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA)
+		scrub_checksum_data(sblock);
+	else
+		scrub_checksum_tree_block(sblock);
 }
 
 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
@@ -1846,6 +1792,18 @@ static int scrub_checksum(struct scrub_block *sblock)
 	u64 flags;
 	int ret;
 
+	/*
+	 * No need to initialize these stats currently,
+	 * because this function only use return value
+	 * instead of these stats value.
+	 *
+	 * Todo:
+	 * always use stats
+	 */
+	sblock->header_error = 0;
+	sblock->generation_error = 0;
+	sblock->checksum_error = 0;
+
 	WARN_ON(sblock->page_count < 1);
 	flags = sblock->pagev[0]->flags;
 	ret = 0;
@@ -1871,7 +1829,6 @@ static int scrub_checksum_data(struct scrub_block *sblock)
 	struct page *page;
 	void *buffer;
 	u32 crc = ~(u32)0;
-	int fail = 0;
 	u64 len;
 	int index;
 
@@ -1902,9 +1859,9 @@ static int scrub_checksum_data(struct scrub_block *sblock)
 
 	btrfs_csum_final(crc, csum);
 	if (memcmp(csum, on_disk_csum, sctx->csum_size))
-		fail = 1;
+		sblock->checksum_error = 1;
 
-	return fail;
+	return sblock->checksum_error;
 }
 
 static int scrub_checksum_tree_block(struct scrub_block *sblock)
@@ -1920,8 +1877,6 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
 	u64 mapped_size;
 	void *p;
 	u32 crc = ~(u32)0;
-	int fail = 0;
-	int crc_fail = 0;
 	u64 len;
 	int index;
 
@@ -1936,19 +1891,20 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
 	 * a) don't have an extent buffer and
 	 * b) the page is already kmapped
 	 */
-
 	if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
-		++fail;
+		sblock->header_error = 1;
 
-	if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h))
-		++fail;
+	if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h)) {
+		sblock->header_error = 1;
+		sblock->generation_error = 1;
+	}
 
 	if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
-		++fail;
+		sblock->header_error = 1;
 
 	if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
 		   BTRFS_UUID_SIZE))
-		++fail;
+		sblock->header_error = 1;
 
 	len = sctx->nodesize - BTRFS_CSUM_SIZE;
 	mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
@@ -1973,9 +1929,9 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
 
 	btrfs_csum_final(crc, calculated_csum);
 	if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
-		++crc_fail;
+		sblock->checksum_error = 1;
 
-	return fail || crc_fail;
+	return sblock->header_error || sblock->checksum_error;
 }
 
 static int scrub_checksum_super(struct scrub_block *sblock)
@@ -2189,42 +2145,28 @@ static void scrub_missing_raid56_worker(struct btrfs_work *work)
 {
 	struct scrub_block *sblock = container_of(work, struct scrub_block, work);
 	struct scrub_ctx *sctx = sblock->sctx;
-	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
-	unsigned int is_metadata;
-	unsigned int have_csum;
-	u8 *csum;
-	u64 generation;
 	u64 logical;
 	struct btrfs_device *dev;
 
-	is_metadata = !(sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA);
-	have_csum = sblock->pagev[0]->have_csum;
-	csum = sblock->pagev[0]->csum;
-	generation = sblock->pagev[0]->generation;
 	logical = sblock->pagev[0]->logical;
 	dev = sblock->pagev[0]->dev;
 
-	sblock->header_error = 0;
-	sblock->checksum_error = 0;
-	sblock->generation_error = 0;
 	if (sblock->no_io_error_seen) {
-		scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
-					     have_csum, csum, generation,
-					     sctx->csum_size);
+		scrub_recheck_block_checksum(sblock);
 	}
 
 	if (!sblock->no_io_error_seen) {
 		spin_lock(&sctx->stat_lock);
 		sctx->stat.read_errors++;
 		spin_unlock(&sctx->stat_lock);
-		btrfs_err_rl_in_rcu(fs_info,
+		btrfs_err_rl_in_rcu(sctx->dev_root->fs_info,
 			"IO error rebuilding logical %llu for dev %s",
 			logical, rcu_str_deref(dev->name));
 	} else if (sblock->header_error || sblock->checksum_error) {
 		spin_lock(&sctx->stat_lock);
 		sctx->stat.uncorrectable_errors++;
 		spin_unlock(&sctx->stat_lock);
-		btrfs_err_rl_in_rcu(fs_info,
+		btrfs_err_rl_in_rcu(sctx->dev_root->fs_info,
 			"failed to rebuild valid logical %llu for dev %s",
 			logical, rcu_str_deref(dev->name));
 	} else {
-- 
cgit v1.1


From affe4a5ae18baa2ab67af42eb21d7d734b851819 Mon Sep 17 00:00:00 2001
From: Zhao Lei <zhaolei@cn.fujitsu.com>
Date: Mon, 24 Aug 2015 21:32:06 +0800
Subject: btrfs: Reduce unnecessary arguments in scrub_recheck_block

We don't need pass so many arguments for recheck sblock now,
this patch cleans them.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/scrub.c | 28 ++++++++--------------------
 1 file changed, 8 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 3e4db4b..f8a4ca2 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -248,9 +248,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
 				     struct scrub_block *sblocks_for_recheck);
 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
-				struct scrub_block *sblock, int is_metadata,
-				int have_csum, u8 *csum, u64 generation,
-				u16 csum_size, int retry_failed_mirror);
+				struct scrub_block *sblock,
+				int retry_failed_mirror);
 static void scrub_recheck_block_checksum(struct scrub_block *sblock);
 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
 					     struct scrub_block *sblock_good);
@@ -885,11 +884,9 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 	struct btrfs_fs_info *fs_info;
 	u64 length;
 	u64 logical;
-	u64 generation;
 	unsigned int failed_mirror_index;
 	unsigned int is_metadata;
 	unsigned int have_csum;
-	u8 *csum;
 	struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
 	struct scrub_block *sblock_bad;
 	int ret;
@@ -914,13 +911,11 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 	}
 	length = sblock_to_check->page_count * PAGE_SIZE;
 	logical = sblock_to_check->pagev[0]->logical;
-	generation = sblock_to_check->pagev[0]->generation;
 	BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
 	failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
 	is_metadata = !(sblock_to_check->pagev[0]->flags &
 			BTRFS_EXTENT_FLAG_DATA);
 	have_csum = sblock_to_check->pagev[0]->have_csum;
-	csum = sblock_to_check->pagev[0]->csum;
 	dev = sblock_to_check->pagev[0]->dev;
 
 	if (sctx->is_dev_replace && !is_metadata && !have_csum) {
@@ -983,8 +978,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 	sblock_bad = sblocks_for_recheck + failed_mirror_index;
 
 	/* build and submit the bios for the failed mirror, check checksums */
-	scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
-			    csum, generation, sctx->csum_size, 1);
+	scrub_recheck_block(fs_info, sblock_bad, 1);
 
 	if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
 	    sblock_bad->no_io_error_seen) {
@@ -1097,9 +1091,7 @@ nodatasum_case:
 		sblock_other = sblocks_for_recheck + mirror_index;
 
 		/* build and submit the bios, check checksums */
-		scrub_recheck_block(fs_info, sblock_other, is_metadata,
-				    have_csum, csum, generation,
-				    sctx->csum_size, 0);
+		scrub_recheck_block(fs_info, sblock_other, 0);
 
 		if (!sblock_other->header_error &&
 		    !sblock_other->checksum_error &&
@@ -1211,9 +1203,7 @@ nodatasum_case:
 			 * is verified, but most likely the data comes out
 			 * of the page cache.
 			 */
-			scrub_recheck_block(fs_info, sblock_bad,
-					    is_metadata, have_csum, csum,
-					    generation, sctx->csum_size, 1);
+			scrub_recheck_block(fs_info, sblock_bad, 1);
 			if (!sblock_bad->header_error &&
 			    !sblock_bad->checksum_error &&
 			    sblock_bad->no_io_error_seen)
@@ -1482,9 +1472,8 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
  * the pages that are errored in the just handled mirror can be repaired.
  */
 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
-				struct scrub_block *sblock, int is_metadata,
-				int have_csum, u8 *csum, u64 generation,
-				u16 csum_size, int retry_failed_mirror)
+				struct scrub_block *sblock,
+				int retry_failed_mirror)
 {
 	int page_num;
 
@@ -2151,9 +2140,8 @@ static void scrub_missing_raid56_worker(struct btrfs_work *work)
 	logical = sblock->pagev[0]->logical;
 	dev = sblock->pagev[0]->dev;
 
-	if (sblock->no_io_error_seen) {
+	if (sblock->no_io_error_seen)
 		scrub_recheck_block_checksum(sblock);
-	}
 
 	if (!sblock->no_io_error_seen) {
 		spin_lock(&sctx->stat_lock);
-- 
cgit v1.1


From 3b5753ec23597b05fd41d3f19ef582d6af1a6477 Mon Sep 17 00:00:00 2001
From: Zhao Lei <zhaolei@cn.fujitsu.com>
Date: Mon, 24 Aug 2015 22:03:02 +0800
Subject: btrfs: Remove len argument from scrub_find_csum

It is useless.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/scrub.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index f8a4ca2..2907a77 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2446,8 +2446,7 @@ static void scrub_block_complete(struct scrub_block *sblock)
 	}
 }
 
-static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
-			   u8 *csum)
+static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
 {
 	struct btrfs_ordered_sum *sum = NULL;
 	unsigned long index;
@@ -2511,7 +2510,7 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
 
 		if (flags & BTRFS_EXTENT_FLAG_DATA) {
 			/* push csums to sbio */
-			have_csum = scrub_find_csum(sctx, logical, l, csum);
+			have_csum = scrub_find_csum(sctx, logical, csum);
 			if (have_csum == 0)
 				++sctx->stat.no_csum;
 			if (sctx->is_dev_replace && !have_csum) {
@@ -2649,7 +2648,7 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity,
 
 		if (flags & BTRFS_EXTENT_FLAG_DATA) {
 			/* push csums to sbio */
-			have_csum = scrub_find_csum(sctx, logical, l, csum);
+			have_csum = scrub_find_csum(sctx, logical, csum);
 			if (have_csum == 0)
 				goto skip;
 		}
-- 
cgit v1.1


From aefbe9a633b50a6124dbeb33a5d4efcdc6de6c30 Mon Sep 17 00:00:00 2001
From: Zhao Lei <zhaolei@cn.fujitsu.com>
Date: Tue, 29 Sep 2015 21:03:54 +0800
Subject: btrfs: Fix lost-data-profile caused by auto removing bg

Reproduce:
 (In integration-4.3 branch)

 TEST_DEV=(/dev/vdg /dev/vdh)
 TEST_DIR=/mnt/tmp

 umount "$TEST_DEV" >/dev/null
 mkfs.btrfs -f -d raid1 "${TEST_DEV[@]}"

 mount -o nospace_cache "$TEST_DEV" "$TEST_DIR"
 umount "$TEST_DEV"

 mount -o nospace_cache "$TEST_DEV" "$TEST_DIR"
 btrfs filesystem usage $TEST_DIR

We can see the data chunk changed from raid1 to single:
 # btrfs filesystem usage $TEST_DIR
 Data,single: Size:8.00MiB, Used:0.00B
    /dev/vdg        8.00MiB
 #

Reason:
 When a empty filesystem mount with -o nospace_cache, the last
 data blockgroup will be auto-removed in umount.

 Then if we mount it again, there is no data chunk in the
 filesystem, so the only available data profile is 0x0, result
 is all new chunks are created as single type.

Fix:
 Don't auto-delete last blockgroup for a raid type.

Test:
 Test by above script, and confirmed the logic by debug output.

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent-tree.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 99a8e57..b77ffbc 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -10279,8 +10279,10 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 		block_group = list_first_entry(&fs_info->unused_bgs,
 					       struct btrfs_block_group_cache,
 					       bg_list);
-		space_info = block_group->space_info;
 		list_del_init(&block_group->bg_list);
+
+		space_info = block_group->space_info;
+
 		if (ret || btrfs_mixed_space_info(space_info)) {
 			btrfs_put_block_group(block_group);
 			continue;
@@ -10294,7 +10296,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 		spin_lock(&block_group->lock);
 		if (block_group->reserved ||
 		    btrfs_block_group_used(&block_group->item) ||
-		    block_group->ro) {
+		    block_group->ro ||
+		    list_is_singular(&block_group->list)) {
 			/*
 			 * We want to bail if we made new allocations or have
 			 * outstanding allocations in this block group.  We do
-- 
cgit v1.1


From 2c9fe835525896077e7e6d8e416b97f2f868edef Mon Sep 17 00:00:00 2001
From: Zhao Lei <zhaolei@cn.fujitsu.com>
Date: Mon, 9 Nov 2015 11:51:32 +0800
Subject: btrfs: Fix lost-data-profile caused by balance bg

Reproduce:
 (In integration-4.3 branch)

 TEST_DEV=(/dev/vdg /dev/vdh)
 TEST_DIR=/mnt/tmp

 umount "$TEST_DEV" >/dev/null
 mkfs.btrfs -f -d raid1 "${TEST_DEV[@]}"

 mount -o nospace_cache "$TEST_DEV" "$TEST_DIR"
 btrfs balance start -dusage=0 $TEST_DIR
 btrfs filesystem usage $TEST_DIR

 dd if=/dev/zero of="$TEST_DIR"/file count=100
 btrfs filesystem usage $TEST_DIR

Result:
 We can see "no data chunk" in first "btrfs filesystem usage":
 # btrfs filesystem usage $TEST_DIR
 Overall:
    ...
 Metadata,single: Size:8.00MiB, Used:0.00B
    /dev/vdg        8.00MiB
 Metadata,RAID1: Size:122.88MiB, Used:112.00KiB
    /dev/vdg      122.88MiB
    /dev/vdh      122.88MiB
 System,single: Size:4.00MiB, Used:0.00B
    /dev/vdg        4.00MiB
 System,RAID1: Size:8.00MiB, Used:16.00KiB
    /dev/vdg        8.00MiB
    /dev/vdh        8.00MiB
 Unallocated:
    /dev/vdg        1.06GiB
    /dev/vdh        1.07GiB

 And "data chunks changed from raid1 to single" in second
 "btrfs filesystem usage":
 # btrfs filesystem usage $TEST_DIR
 Overall:
    ...
 Data,single: Size:256.00MiB, Used:0.00B
    /dev/vdh      256.00MiB
 Metadata,single: Size:8.00MiB, Used:0.00B
    /dev/vdg        8.00MiB
 Metadata,RAID1: Size:122.88MiB, Used:112.00KiB
    /dev/vdg      122.88MiB
    /dev/vdh      122.88MiB
 System,single: Size:4.00MiB, Used:0.00B
    /dev/vdg        4.00MiB
 System,RAID1: Size:8.00MiB, Used:16.00KiB
    /dev/vdg        8.00MiB
    /dev/vdh        8.00MiB
 Unallocated:
    /dev/vdg        1.06GiB
    /dev/vdh      841.92MiB

Reason:
 btrfs balance delete last data chunk in case of no data in
 the filesystem, then we can see "no data chunk" by "fi usage"
 command.

 And when we do write operation to fs, the only available data
 profile is 0x0, result is all new chunks are allocated single type.

Fix:
 Allocate a data chunk explicitly to ensure we don't lose the
 raid profile for data.

Test:
 Test by above script, and confirmed the logic by debug output.

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/volumes.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 17ed76d..d198dd3 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3400,6 +3400,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
 	u32 count_data = 0;
 	u32 count_meta = 0;
 	u32 count_sys = 0;
+	int chunk_reserved = 0;
 
 	/* step one make some room on all the devices */
 	devices = &fs_info->fs_devices->devices;
@@ -3501,6 +3502,7 @@ again:
 
 		ret = should_balance_chunk(chunk_root, leaf, chunk,
 					   found_key.offset);
+
 		btrfs_release_path(path);
 		if (!ret) {
 			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
@@ -3537,6 +3539,25 @@ again:
 			goto loop;
 		}
 
+		if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) && !chunk_reserved) {
+			trans = btrfs_start_transaction(chunk_root, 0);
+			if (IS_ERR(trans)) {
+				mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+				ret = PTR_ERR(trans);
+				goto error;
+			}
+
+			ret = btrfs_force_chunk_alloc(trans, chunk_root,
+						      BTRFS_BLOCK_GROUP_DATA);
+			if (ret < 0) {
+				mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+				goto error;
+			}
+
+			btrfs_end_transaction(trans, chunk_root);
+			chunk_reserved = 1;
+		}
+
 		ret = btrfs_relocate_chunk(chunk_root,
 					   found_key.offset);
 		mutex_unlock(&fs_info->delete_unused_bgs_mutex);
-- 
cgit v1.1


From d5f2e33b92b10b8007be50b570a27e9bacfb4c3a Mon Sep 17 00:00:00 2001
From: Zhao Lei <zhaolei@cn.fujitsu.com>
Date: Thu, 8 Oct 2015 18:46:44 +0800
Subject: btrfs: Use fs_info directly in btrfs_delete_unused_bgs

No need to use root->fs_info in btrfs_delete_unused_bgs(),
use fs_info directly instead.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent-tree.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b77ffbc..acf3ed1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -10289,7 +10289,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 		}
 		spin_unlock(&fs_info->unused_bgs_lock);
 
-		mutex_lock(&root->fs_info->delete_unused_bgs_mutex);
+		mutex_lock(&fs_info->delete_unused_bgs_mutex);
 
 		/* Don't want to race with allocators so take the groups_sem */
 		down_write(&space_info->groups_sem);
@@ -10413,7 +10413,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 end_trans:
 		btrfs_end_transaction(trans, root);
 next:
-		mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+		mutex_unlock(&fs_info->delete_unused_bgs_mutex);
 		btrfs_put_block_group(block_group);
 		spin_lock(&fs_info->unused_bgs_lock);
 	}
-- 
cgit v1.1


From 0ee9608c89e81a1ccee52ecb58a7ff040e2522d9 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 5 Nov 2015 00:01:51 +0100
Subject: debugfs: fix refcount imbalance in start_creating

In debugfs' start_creating(), we pin the file system to safely access
its root. When we failed to create a file, we unpin the file system via
failed_creating() to release the mount count and eventually the reference
of the vfsmount.

However, when we run into an error during lookup_one_len() when still
in start_creating(), we only release the parent's mutex but not so the
reference on the mount. Looks like it was done in the past, but after
splitting portions of __create_file() into start_creating() and
end_creating() via 190afd81e4a5 ("debugfs: split the beginning and the
end of __create_file() off"), this seemed missed. Noticed during code
review.

Fixes: 190afd81e4a5 ("debugfs: split the beginning and the end of __create_file() off")
Cc: stable@vger.kernel.org # v4.0+
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/debugfs/inode.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 5d8f35f..b7fcc0d 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -271,8 +271,12 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
 		dput(dentry);
 		dentry = ERR_PTR(-EEXIST);
 	}
-	if (IS_ERR(dentry))
+
+	if (IS_ERR(dentry)) {
 		mutex_unlock(&d_inode(parent)->i_mutex);
+		simple_release_fs(&debugfs_mount, &debugfs_mount_count);
+	}
+
 	return dentry;
 }
 
-- 
cgit v1.1


From 86108c2e34a26e4bec3c6ddb23390bf8cedcf391 Mon Sep 17 00:00:00 2001
From: Kinglong Mee <kinglongmee@gmail.com>
Date: Wed, 4 Nov 2015 15:20:15 +0000
Subject: FS-Cache: Increase reference of parent after registering, netfs
 success

If netfs exist, fscache should not increase the reference of parent's
usage and n_children, otherwise, never be decreased.

v2: thanks David's suggest,
 move increasing reference of parent if success
 use kmem_cache_free() freeing primary_index directly

v3: don't move "netfs->primary_index->parent = &fscache_fsdef_index;"

Cc: stable@vger.kernel.org # v2.6.30+
Signed-off-by: Kinglong Mee <kinglongmee@gmail.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/fscache/netfs.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c
index 6d941f5..458cc96 100644
--- a/fs/fscache/netfs.c
+++ b/fs/fscache/netfs.c
@@ -47,9 +47,6 @@ int __fscache_register_netfs(struct fscache_netfs *netfs)
 	netfs->primary_index->netfs_data	= netfs;
 	netfs->primary_index->flags		= 1 << FSCACHE_COOKIE_ENABLED;
 
-	atomic_inc(&netfs->primary_index->parent->usage);
-	atomic_inc(&netfs->primary_index->parent->n_children);
-
 	spin_lock_init(&netfs->primary_index->lock);
 	INIT_HLIST_HEAD(&netfs->primary_index->backing_objects);
 
@@ -62,6 +59,9 @@ int __fscache_register_netfs(struct fscache_netfs *netfs)
 			goto already_registered;
 	}
 
+	atomic_inc(&netfs->primary_index->parent->usage);
+	atomic_inc(&netfs->primary_index->parent->n_children);
+
 	list_add(&netfs->link, &fscache_netfs_list);
 	ret = 0;
 
@@ -71,8 +71,7 @@ already_registered:
 	up_write(&fscache_addremove_sem);
 
 	if (ret < 0) {
-		netfs->primary_index->parent = NULL;
-		__fscache_cookie_put(netfs->primary_index);
+		kmem_cache_free(fscache_cookie_jar, netfs->primary_index);
 		netfs->primary_index = NULL;
 	}
 
-- 
cgit v1.1


From b130ed5998e62879a66bad08931a2b5e832da95c Mon Sep 17 00:00:00 2001
From: Kinglong Mee <kinglongmee@gmail.com>
Date: Wed, 4 Nov 2015 15:20:24 +0000
Subject: FS-Cache: Don't override netfs's primary_index if registering failed

Only override netfs->primary_index when registering success.

Cc: stable@vger.kernel.org # v2.6.30+
Signed-off-by: Kinglong Mee <kinglongmee@gmail.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/fscache/netfs.c | 35 +++++++++++++++++------------------
 1 file changed, 17 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c
index 458cc96..9b28649 100644
--- a/fs/fscache/netfs.c
+++ b/fs/fscache/netfs.c
@@ -22,6 +22,7 @@ static LIST_HEAD(fscache_netfs_list);
 int __fscache_register_netfs(struct fscache_netfs *netfs)
 {
 	struct fscache_netfs *ptr;
+	struct fscache_cookie *cookie;
 	int ret;
 
 	_enter("{%s}", netfs->name);
@@ -29,26 +30,25 @@ int __fscache_register_netfs(struct fscache_netfs *netfs)
 	INIT_LIST_HEAD(&netfs->link);
 
 	/* allocate a cookie for the primary index */
-	netfs->primary_index =
-		kmem_cache_zalloc(fscache_cookie_jar, GFP_KERNEL);
+	cookie = kmem_cache_zalloc(fscache_cookie_jar, GFP_KERNEL);
 
-	if (!netfs->primary_index) {
+	if (!cookie) {
 		_leave(" = -ENOMEM");
 		return -ENOMEM;
 	}
 
 	/* initialise the primary index cookie */
-	atomic_set(&netfs->primary_index->usage, 1);
-	atomic_set(&netfs->primary_index->n_children, 0);
-	atomic_set(&netfs->primary_index->n_active, 1);
+	atomic_set(&cookie->usage, 1);
+	atomic_set(&cookie->n_children, 0);
+	atomic_set(&cookie->n_active, 1);
 
-	netfs->primary_index->def		= &fscache_fsdef_netfs_def;
-	netfs->primary_index->parent		= &fscache_fsdef_index;
-	netfs->primary_index->netfs_data	= netfs;
-	netfs->primary_index->flags		= 1 << FSCACHE_COOKIE_ENABLED;
+	cookie->def		= &fscache_fsdef_netfs_def;
+	cookie->parent		= &fscache_fsdef_index;
+	cookie->netfs_data	= netfs;
+	cookie->flags		= 1 << FSCACHE_COOKIE_ENABLED;
 
-	spin_lock_init(&netfs->primary_index->lock);
-	INIT_HLIST_HEAD(&netfs->primary_index->backing_objects);
+	spin_lock_init(&cookie->lock);
+	INIT_HLIST_HEAD(&cookie->backing_objects);
 
 	/* check the netfs type is not already present */
 	down_write(&fscache_addremove_sem);
@@ -59,9 +59,10 @@ int __fscache_register_netfs(struct fscache_netfs *netfs)
 			goto already_registered;
 	}
 
-	atomic_inc(&netfs->primary_index->parent->usage);
-	atomic_inc(&netfs->primary_index->parent->n_children);
+	atomic_inc(&cookie->parent->usage);
+	atomic_inc(&cookie->parent->n_children);
 
+	netfs->primary_index = cookie;
 	list_add(&netfs->link, &fscache_netfs_list);
 	ret = 0;
 
@@ -70,10 +71,8 @@ int __fscache_register_netfs(struct fscache_netfs *netfs)
 already_registered:
 	up_write(&fscache_addremove_sem);
 
-	if (ret < 0) {
-		kmem_cache_free(fscache_cookie_jar, netfs->primary_index);
-		netfs->primary_index = NULL;
-	}
+	if (ret < 0)
+		kmem_cache_free(fscache_cookie_jar, cookie);
 
 	_leave(" = %d", ret);
 	return ret;
-- 
cgit v1.1


From 95201a40604791bc4a2e8d066429be89fb82b46d Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 4 Nov 2015 15:20:34 +0000
Subject: cachefiles: perform test on s_blocksize when opening cache file.

cachefiles requires that s_blocksize in the cache is not greater than
PAGE_SIZE, and performs the check every time a block is accessed.

Move the test to the place where the file is "opened", where other
file-validity tests are performed.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/cachefiles/namei.c | 2 ++
 fs/cachefiles/rdwr.c  | 6 ------
 2 files changed, 2 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index fc1056f..c4b8934 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -655,6 +655,8 @@ lookup_again:
 			aops = d_backing_inode(object->dentry)->i_mapping->a_ops;
 			if (!aops->bmap)
 				goto check_error;
+			if (object->dentry->d_sb->s_blocksize > PAGE_SIZE)
+				goto check_error;
 
 			object->backer = object->dentry;
 		} else {
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 3cbb0e8..e76c245 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -414,9 +414,6 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
 	ASSERT(inode->i_mapping->a_ops->readpages);
 
 	/* calculate the shift required to use bmap */
-	if (inode->i_sb->s_blocksize > PAGE_SIZE)
-		goto enobufs;
-
 	shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
 
 	op->op.flags &= FSCACHE_OP_KEEP_FLAGS;
@@ -711,9 +708,6 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
 	ASSERT(inode->i_mapping->a_ops->readpages);
 
 	/* calculate the shift required to use bmap */
-	if (inode->i_sb->s_blocksize > PAGE_SIZE)
-		goto all_enobufs;
-
 	shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
 
 	pagevec_init(&pagevec, 0);
-- 
cgit v1.1


From 102f4d900c9c8f5ed89ae4746d493fe3ebd7ba64 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 4 Nov 2015 15:20:42 +0000
Subject: FS-Cache: Handle a write to the page immediately beyond the EOF
 marker

Handle a write being requested to the page immediately beyond the EOF
marker on a cache object.  Currently this gets an assertion failure in
CacheFiles because the EOF marker is used there to encode information about
a partial page at the EOF - which could lead to an unknown blank spot in
the file if we extend the file over it.

The problem is actually in fscache where we check the index of the page
being written against store_limit.  store_limit is set to the number of
pages that we're allowed to store by fscache_set_store_limit() - which
means it's one more than the index of the last page we're allowed to store.
The problem is that we permit writing to a page with an index _equal_ to
the store limit - when we should reject that case.

Whilst we're at it, change the triggered assertion in CacheFiles to just
return -ENOBUFS instead.

The assertion failure looks something like this:

CacheFiles: Assertion failed
1000 < 7b1 is false
------------[ cut here ]------------
kernel BUG at fs/cachefiles/rdwr.c:962!
...
RIP: 0010:[<ffffffffa02c9e83>]  [<ffffffffa02c9e83>] cachefiles_write_page+0x273/0x2d0 [cachefiles]

Cc: stable@vger.kernel.org # v2.6.31+; earlier - that + backport of a17754f (at least)
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/cachefiles/rdwr.c | 67 +++++++++++++++++++++++++++++-----------------------
 fs/fscache/page.c    |  2 +-
 2 files changed, 38 insertions(+), 31 deletions(-)

(limited to 'fs')

diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index e76c245..7a6b02f 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -899,6 +899,15 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page)
 	cache = container_of(object->fscache.cache,
 			     struct cachefiles_cache, cache);
 
+	pos = (loff_t)page->index << PAGE_SHIFT;
+
+	/* We mustn't write more data than we have, so we have to beware of a
+	 * partial page at EOF.
+	 */
+	eof = object->fscache.store_limit_l;
+	if (pos >= eof)
+		goto error;
+
 	/* write the page to the backing filesystem and let it store it in its
 	 * own time */
 	path.mnt = cache->mnt;
@@ -906,40 +915,38 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page)
 	file = dentry_open(&path, O_RDWR | O_LARGEFILE, cache->cache_cred);
 	if (IS_ERR(file)) {
 		ret = PTR_ERR(file);
-	} else {
-		pos = (loff_t) page->index << PAGE_SHIFT;
-
-		/* we mustn't write more data than we have, so we have
-		 * to beware of a partial page at EOF */
-		eof = object->fscache.store_limit_l;
-		len = PAGE_SIZE;
-		if (eof & ~PAGE_MASK) {
-			ASSERTCMP(pos, <, eof);
-			if (eof - pos < PAGE_SIZE) {
-				_debug("cut short %llx to %llx",
-				       pos, eof);
-				len = eof - pos;
-				ASSERTCMP(pos + len, ==, eof);
-			}
-		}
-
-		data = kmap(page);
-		ret = __kernel_write(file, data, len, &pos);
-		kunmap(page);
-		if (ret != len)
-			ret = -EIO;
-		fput(file);
+		goto error_2;
 	}
 
-	if (ret < 0) {
-		if (ret == -EIO)
-			cachefiles_io_error_obj(
-				object, "Write page to backing file failed");
-		ret = -ENOBUFS;
+	len = PAGE_SIZE;
+	if (eof & ~PAGE_MASK) {
+		if (eof - pos < PAGE_SIZE) {
+			_debug("cut short %llx to %llx",
+			       pos, eof);
+			len = eof - pos;
+			ASSERTCMP(pos + len, ==, eof);
+		}
 	}
 
-	_leave(" = %d", ret);
-	return ret;
+	data = kmap(page);
+	ret = __kernel_write(file, data, len, &pos);
+	kunmap(page);
+	fput(file);
+	if (ret != len)
+		goto error_eio;
+
+	_leave(" = 0");
+	return 0;
+
+error_eio:
+	ret = -EIO;
+error_2:
+	if (ret == -EIO)
+		cachefiles_io_error_obj(object,
+					"Write page to backing file failed");
+error:
+	_leave(" = -ENOBUFS [%d]", ret);
+	return -ENOBUFS;
 }
 
 /*
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 79483b3..6b35fc4 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -816,7 +816,7 @@ static void fscache_write_op(struct fscache_operation *_op)
 		goto superseded;
 	page = results[0];
 	_debug("gang %d [%lx]", n, page->index);
-	if (page->index > op->store_limit) {
+	if (page->index >= op->store_limit) {
 		fscache_stat(&fscache_n_store_pages_over_limit);
 		goto superseded;
 	}
-- 
cgit v1.1


From b582ef5c53040c5feef4c96a8f9585b6831e2441 Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@imgtec.com>
Date: Mon, 26 Oct 2015 15:48:19 +0000
Subject: binfmt_elf: Don't clobber passed executable's file header

Do not clobber the buffer space passed from `search_binary_handler' and
originally preloaded by `prepare_binprm' with the executable's file
header by overwriting it with its interpreter's file header.  Instead
keep the buffer space intact and directly use the data structure locally
allocated for the interpreter's file header, fixing a bug introduced in
2.1.14 with loadable module support (linux-mips.org commit beb11695
[Import of Linux/MIPS 2.1.14], predating kernel.org repo's history).
Adjust the amount of data read from the interpreter's file accordingly.

This was not an issue before loadable module support, because back then
`load_elf_binary' was executed only once for a given ELF executable,
whether the function succeeded or failed.

With loadable module support supported and enabled, upon a failure of
`load_elf_binary' -- which may for example be caused by architecture
code rejecting an executable due to a missing hardware feature requested
in the file header -- a module load is attempted and then the function
reexecuted by `search_binary_handler'.  With the executable's file
header replaced with its interpreter's file header the executable can
then be erroneously accepted in this subsequent attempt.

Cc: stable@vger.kernel.org # all the way back
Signed-off-by: Maciej W. Rozycki <macro@imgtec.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/binfmt_elf.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 6b65996..e6572a6 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -759,16 +759,16 @@ static int load_elf_binary(struct linux_binprm *bprm)
 			 */
 			would_dump(bprm, interpreter);
 
-			retval = kernel_read(interpreter, 0, bprm->buf,
-					     BINPRM_BUF_SIZE);
-			if (retval != BINPRM_BUF_SIZE) {
+			/* Get the exec headers */
+			retval = kernel_read(interpreter, 0,
+					     (void *)&loc->interp_elf_ex,
+					     sizeof(loc->interp_elf_ex));
+			if (retval != sizeof(loc->interp_elf_ex)) {
 				if (retval >= 0)
 					retval = -EIO;
 				goto out_free_dentry;
 			}
 
-			/* Get the exec headers */
-			loc->interp_elf_ex = *((struct elfhdr *)bprm->buf);
 			break;
 		}
 		elf_ppnt++;
-- 
cgit v1.1


From e9bb1f9b12e33fc0b2169b3937dc50c44ad87dd5 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers3@gmail.com>
Date: Sat, 17 Oct 2015 16:26:08 -0500
Subject: fs/pipe.c: preserve alloc_file() error code

If sys_pipe() was unable to allocate a 'struct file', it always failed
with ENFILE, which means "The number of simultaneously open files in the
system would exceed a system-imposed limit." However, alloc_file()
actually returns an ERR_PTR value and might fail with other error codes.
Currently, in addition to ENFILE, it can fail with ENOMEM, potentially
when there are few open files in the system.  Update sys_pipe() to
preserve this error code.

In a prior submission of a similar patch (1) some concern was raised
about introducing a new error code for sys_pipe().  However, for most
system calls, programs cannot assume that new error codes will never be
introduced.  In addition, ENOMEM was, in fact, already a possible error
code for sys_pipe(), in the case where the file descriptor table could
not be expanded due to insufficient memory.

	(1) http://comments.gmane.org/gmane.linux.kernel/1357942

Signed-off-by: Eric Biggers <ebiggers3@gmail.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/pipe.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/pipe.c b/fs/pipe.c
index 8865f79..997de34 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -693,17 +693,20 @@ int create_pipe_files(struct file **res, int flags)
 
 	d_instantiate(path.dentry, inode);
 
-	err = -ENFILE;
 	f = alloc_file(&path, FMODE_WRITE, &pipefifo_fops);
-	if (IS_ERR(f))
+	if (IS_ERR(f)) {
+		err = PTR_ERR(f);
 		goto err_dentry;
+	}
 
 	f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT));
 	f->private_data = inode->i_pipe;
 
 	res[0] = alloc_file(&path, FMODE_READ, &pipefifo_fops);
-	if (IS_ERR(res[0]))
+	if (IS_ERR(res[0])) {
+		err = PTR_ERR(res[0]);
 		goto err_file;
+	}
 
 	path_get(&path);
 	res[0]->private_data = inode->i_pipe;
-- 
cgit v1.1


From 6ae08069939f17422835448acae76bda8d96b16a Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers3@gmail.com>
Date: Sat, 17 Oct 2015 16:26:09 -0500
Subject: fs/pipe.c: return error code rather than 0 in pipe_write()

pipe_write() would return 0 if it failed to merge the beginning of the
data to write with the last, partially filled pipe buffer.  It should
return an error code instead.  Userspace programs could be confused by
write() returning 0 when called with a nonzero 'count'.

The EFAULT error case was a regression from f0d1bec9d5 ("new helper:
copy_page_from_iter()"), while the ops->confirm() error case was a much
older bug.

Test program:

	#include <assert.h>
	#include <errno.h>
	#include <unistd.h>

	int main(void)
	{
		int fd[2];
		char data[1] = {0};

		assert(0 == pipe(fd));
		assert(1 == write(fd[1], data, 1));

		/* prior to this patch, write() returned 0 here  */
		assert(-1 == write(fd[1], NULL, 1));
		assert(errno == EFAULT);
	}

Cc: stable@vger.kernel.org # at least v3.15+
Signed-off-by: Eric Biggers <ebiggers3@gmail.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/pipe.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/pipe.c b/fs/pipe.c
index 997de34..42cf8dd 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -366,18 +366,17 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
 		int offset = buf->offset + buf->len;
 
 		if (ops->can_merge && offset + chars <= PAGE_SIZE) {
-			int error = ops->confirm(pipe, buf);
-			if (error)
+			ret = ops->confirm(pipe, buf);
+			if (ret)
 				goto out;
 
 			ret = copy_page_from_iter(buf->page, offset, chars, from);
 			if (unlikely(ret < chars)) {
-				error = -EFAULT;
+				ret = -EFAULT;
 				goto out;
 			}
 			do_wakeup = 1;
-			buf->len += chars;
-			ret = chars;
+			buf->len += ret;
 			if (!iov_iter_count(from))
 				goto out;
 		}
-- 
cgit v1.1


From 034ae4bac93afb483aaf8531e3a6de2ee7f1be6e Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Thu, 5 Nov 2015 16:53:47 -0800
Subject: fs: fix inode.c kernel-doc warning

Fix kernel-doc warning in fs/inode.c:

..//fs/inode.c:1606: warning: No description found for parameter 'inode'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inode.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/inode.c b/fs/inode.c
index 78a17b8..1be5f90 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1597,6 +1597,7 @@ static int update_time(struct inode *inode, struct timespec *time, int flags)
 /**
  *	touch_atime	-	update the access time
  *	@path: the &struct path to update
+ *	@inode: inode to update
  *
  *	Update the accessed time on an inode and mark it for writeback.
  *	This function automatically handles read only file systems and media,
-- 
cgit v1.1


From 88a578d823c1d8acb298cf317f76a3220d307d2e Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Thu, 5 Nov 2015 16:54:04 -0800
Subject: fs: fix writeback.c kernel-doc warnings

Fix kernel-doc warnings in fs/fs-writeback.c by moving a #define macro
to after the function's opening brace. Also #undef this macro at the
end of the function.

..//fs/fs-writeback.c:1984: warning: Excess function parameter 'inode' description in 'I_DIRTY_INODE'
..//fs/fs-writeback.c:1984: warning: Excess function parameter 'flags' description in 'I_DIRTY_INODE'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/fs-writeback.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 206a68b..023f6a1 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1981,9 +1981,9 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
  * page->mapping->host, so the page-dirtying time is recorded in the internal
  * blockdev inode.
  */
-#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
 void __mark_inode_dirty(struct inode *inode, int flags)
 {
+#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
 	struct super_block *sb = inode->i_sb;
 	int dirtytime;
 
@@ -2093,6 +2093,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 out_unlock_inode:
 	spin_unlock(&inode->i_lock);
 
+#undef I_DIRTY_INODE
 }
 EXPORT_SYMBOL(__mark_inode_dirty);
 
-- 
cgit v1.1


From 54d15714f7497094a0839537f0d47946edb45761 Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@imgtec.com>
Date: Mon, 26 Oct 2015 15:47:57 +0000
Subject: binfmt_elf: Correct `arch_check_elf's description

Correct `arch_check_elf's description, mistakenly copied and pasted from
`arch_elf_pt_proc'.

Signed-off-by: Maciej W. Rozycki <macro@imgtec.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/binfmt_elf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index e6572a6..b0b9a77 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -487,7 +487,7 @@ static inline int arch_elf_pt_proc(struct elfhdr *ehdr,
 }
 
 /**
- * arch_check_elf() - check a PT_LOPROC..PT_HIPROC ELF program header
+ * arch_check_elf() - check an ELF executable
  * @ehdr:	The main ELF header
  * @has_interp:	True if the ELF has an interpreter, else false.
  * @state:	Architecture-specific state preserved throughout the process
-- 
cgit v1.1


From 5c50002963369c7c622b18ff751719eadbe225c5 Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Tue, 13 Oct 2015 16:51:02 -0600
Subject: vfs: remove unused wrapper block_page_mkwrite()

The function currently called "__block_page_mkwrite()" used to be called
"block_page_mkwrite()" until a wrapper for this function was added by:

commit 24da4fab5a61 ("vfs: Create __block_page_mkwrite() helper passing
	error values back")

This wrapper, the current "block_page_mkwrite()", is currently unused.
__block_page_mkwrite() is used directly by ext4, nilfs2 and xfs.

Remove the unused wrapper, rename __block_page_mkwrite() back to
block_page_mkwrite() and update the comment above block_page_mkwrite().

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Jan Kara <jack@suse.com>
Cc: Jan Kara <jack@suse.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/buffer.c       | 24 ++----------------------
 fs/ext4/inode.c   |  4 ++--
 fs/nilfs2/file.c  |  2 +-
 fs/xfs/xfs_file.c |  2 +-
 4 files changed, 6 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index 51aff02..4f4cd95 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2420,9 +2420,9 @@ EXPORT_SYMBOL(block_commit_write);
  * unlock the page.
  *
  * Direct callers of this function should protect against filesystem freezing
- * using sb_start_write() - sb_end_write() functions.
+ * using sb_start_pagefault() - sb_end_pagefault() functions.
  */
-int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
 			 get_block_t get_block)
 {
 	struct page *page = vmf->page;
@@ -2459,26 +2459,6 @@ out_unlock:
 	unlock_page(page);
 	return ret;
 }
-EXPORT_SYMBOL(__block_page_mkwrite);
-
-int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
-		   get_block_t get_block)
-{
-	int ret;
-	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
-
-	sb_start_pagefault(sb);
-
-	/*
-	 * Update file times before taking page lock. We may end up failing the
-	 * fault so this update may be superfluous but who really cares...
-	 */
-	file_update_time(vma->vm_file);
-
-	ret = __block_page_mkwrite(vma, vmf, get_block);
-	sb_end_pagefault(sb);
-	return block_page_mkwrite_return(ret);
-}
 EXPORT_SYMBOL(block_page_mkwrite);
 
 /*
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 7d1aad1..ea433a7 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5283,7 +5283,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	    !ext4_should_journal_data(inode) &&
 	    !ext4_nonda_switch(inode->i_sb)) {
 		do {
-			ret = __block_page_mkwrite(vma, vmf,
+			ret = block_page_mkwrite(vma, vmf,
 						   ext4_da_get_block_prep);
 		} while (ret == -ENOSPC &&
 		       ext4_should_retry_alloc(inode->i_sb, &retries));
@@ -5330,7 +5330,7 @@ retry_alloc:
 		ret = VM_FAULT_SIGBUS;
 		goto out;
 	}
-	ret = __block_page_mkwrite(vma, vmf, get_block);
+	ret = block_page_mkwrite(vma, vmf, get_block);
 	if (!ret && ext4_should_journal_data(inode)) {
 		if (ext4_walk_page_buffers(handle, page_buffers(page), 0,
 			  PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) {
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 54575e3..088ba00 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -109,7 +109,7 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 		goto out;
 
 	file_update_time(vma->vm_file);
-	ret = __block_page_mkwrite(vma, vmf, nilfs_get_block);
+	ret = block_page_mkwrite(vma, vmf, nilfs_get_block);
 	if (ret) {
 		nilfs_transaction_abort(inode->i_sb);
 		goto out;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index e78feb4..f80e90f 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1506,7 +1506,7 @@ xfs_filemap_page_mkwrite(
 		ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct,
 				    xfs_end_io_dax_write);
 	} else {
-		ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks);
+		ret = block_page_mkwrite(vma, vmf, xfs_get_blocks);
 		ret = block_page_mkwrite_return(ret);
 	}
 
-- 
cgit v1.1


From 7c7afc440c8ec311ae135df892cc489436a1ff00 Mon Sep 17 00:00:00 2001
From: Tzvetelin Katchov <katchov@gmail.com>
Date: Wed, 7 Oct 2015 20:15:21 -0400
Subject: fs: 9p: cache.h: Add #define of include guard

The include file was intended to have an include guard, but the #define
part is missing.

Signed-off-by: Tzvetelin Katchov <katchov@gmail.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/9p/cache.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/9p/cache.h b/fs/9p/cache.h
index 2f96754..247e47e 100644
--- a/fs/9p/cache.h
+++ b/fs/9p/cache.h
@@ -21,6 +21,7 @@
  */
 
 #ifndef _9P_CACHE_H
+#define _9P_CACHE_H
 #ifdef CONFIG_9P_FSCACHE
 #include <linux/fscache.h>
 #include <linux/spinlock.h>
-- 
cgit v1.1


From dbd3ca50753e70e09cad747dce23b1a7683a3342 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Mon, 9 Nov 2015 09:23:40 -0700
Subject: fs/block_dev.c: Remove WARN_ON() when inode writeback fails

If a block device is hot removed and later last reference to device
is put, we try to writeback the dirty inode. But device is gone and
that writeback fails.

Currently we do a WARN_ON() which does not seem to be the right thing.
Convert it to a ratelimited kernel warning.

Reported-by: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
[jmoyer@redhat.com: get rid of unnecessary name initialization, 80 cols]
Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/block_dev.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 0a793c7..bb0dfb1 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -50,12 +50,21 @@ struct block_device *I_BDEV(struct inode *inode)
 }
 EXPORT_SYMBOL(I_BDEV);
 
-static void bdev_write_inode(struct inode *inode)
+static void bdev_write_inode(struct block_device *bdev)
 {
+	struct inode *inode = bdev->bd_inode;
+	int ret;
+
 	spin_lock(&inode->i_lock);
 	while (inode->i_state & I_DIRTY) {
 		spin_unlock(&inode->i_lock);
-		WARN_ON_ONCE(write_inode_now(inode, true));
+		ret = write_inode_now(inode, true);
+		if (ret) {
+			char name[BDEVNAME_SIZE];
+			pr_warn_ratelimited("VFS: Dirty inode writeback failed "
+					    "for block device %s (err=%d).\n",
+					    bdevname(bdev, name), ret);
+		}
 		spin_lock(&inode->i_lock);
 	}
 	spin_unlock(&inode->i_lock);
@@ -1504,7 +1513,7 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 		 * ->release can cause the queue to disappear, so flush all
 		 * dirty data before.
 		 */
-		bdev_write_inode(bdev->bd_inode);
+		bdev_write_inode(bdev);
 	}
 	if (bdev->bd_contains == bdev) {
 		if (disk->fops->release)
-- 
cgit v1.1


From cadfbb6ec2e55171479191046142c927a8b12d87 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Tue, 10 Nov 2015 19:42:49 -0700
Subject: dax_io(): don't let non-error value escape via retval instead of
 EFAULT

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Reported-by: Sasha Levin <sasha.levin@oracle.com>
Cc: stable@vger.kernel.org # 4.0+
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/dax.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/dax.c b/fs/dax.c
index a86d3cc..7b653e9 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -169,8 +169,10 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
 		else
 			len = iov_iter_zero(max - pos, iter);
 
-		if (!len)
+		if (!len) {
+			retval = -EFAULT;
 			break;
+		}
 
 		pos += len;
 		addr += len;
-- 
cgit v1.1


From 152d7bd80dca5ce77ec2d7313149a2ab990e808e Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 12 Nov 2015 18:33:54 -0800
Subject: dax: fix __dax_pmd_fault crash

Since 4.3 introduced devm_memremap_pages() the pfns handled by DAX may
optionally have a struct page backing.  When a mapped pfn reaches
vmf_insert_pfn_pmd() it fails with a crash signature like the following:

 kernel BUG at mm/huge_memory.c:905!
 [..]
 Call Trace:
  [<ffffffff812a73ba>] __dax_pmd_fault+0x2ea/0x5b0
  [<ffffffffa01a4182>] xfs_filemap_pmd_fault+0x92/0x150 [xfs]
  [<ffffffff811fbe02>] handle_mm_fault+0x312/0x1b50

Fix this by falling back to 4K mappings in the pfn_valid() case.  Longer
term, vmf_insert_pfn_pmd() needs to grow support for architectures that
can provide a 'pmd_special' capability.

Cc: <stable@vger.kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Reported-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/dax.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/dax.c b/fs/dax.c
index 131fd35a..bff20cc 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -627,6 +627,13 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 		if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR))
 			goto fallback;
 
+		/*
+		 * TODO: teach vmf_insert_pfn_pmd() to support
+		 * 'pte_special' for pmds
+		 */
+		if (pfn_valid(pfn))
+			goto fallback;
+
 		if (buffer_unwritten(&bh) || buffer_new(&bh)) {
 			int i;
 			for (i = 0; i < PTRS_PER_PMD; i++)
-- 
cgit v1.1


From c361016adeb5acb17a67000689602b7515930f13 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Tue, 3 Nov 2015 15:25:07 +0100
Subject: vfs: Check attribute names in posix acl xattr handers

The get and set operations of the POSIX ACL xattr handlers failed to check
the attribute names, so all names with "system.posix_acl_access" or
"system.posix_acl_default" as a prefix were accepted.  Reject invalid names
from now on.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/posix_acl.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 4fb17de..b1a66e8 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -768,6 +768,8 @@ posix_acl_xattr_get(struct dentry *dentry, const char *name,
 	struct posix_acl *acl;
 	int error;
 
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
 	if (!IS_POSIXACL(d_backing_inode(dentry)))
 		return -EOPNOTSUPP;
 	if (d_is_symlink(dentry))
@@ -793,6 +795,8 @@ posix_acl_xattr_set(struct dentry *dentry, const char *name,
 	struct posix_acl *acl = NULL;
 	int ret;
 
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
 	if (!IS_POSIXACL(inode))
 		return -EOPNOTSUPP;
 	if (!inode->i_op->set_acl)
-- 
cgit v1.1


From dae5f57a72a6c913926cf3569edb0556a409a3df Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Tue, 3 Nov 2015 15:25:08 +0100
Subject: vfs: Fix the posix_acl_xattr_list return value

When a filesystem that contains POSIX ACLs is mounted without ACL support
(-o noacl), the appropriate behavior is not to list any existing POSIX ACL
xattrs.  The return value for list xattr handlers in this case is 0, not an
error code: several filesystems that use the POSIX ACL xattr handlers do
not expect the list operation to fail.

Symlinks cannot have ACLs, so posix_acl_xattr_list will never be called for
symlinks in the first place.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/posix_acl.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index b1a66e8..2fdca61 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -833,9 +833,7 @@ posix_acl_xattr_list(struct dentry *dentry, char *list, size_t list_size,
 	size_t size;
 
 	if (!IS_POSIXACL(d_backing_inode(dentry)))
-		return -EOPNOTSUPP;
-	if (d_is_symlink(dentry))
-		return -EOPNOTSUPP;
+		return 0;
 
 	if (type == ACL_TYPE_ACCESS)
 		xname = POSIX_ACL_XATTR_ACCESS;
-- 
cgit v1.1


From 13d3408f100186d2755d9ac1ad8b5de9e1975a92 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Sun, 4 Oct 2015 19:18:48 +0200
Subject: ubifs: Remove unused security xattr handler

Ubifs installs a security xattr handler in sb->s_xattr but doesn't use the
generic_{get,set,list,remove}xattr inode operations needed for processing
this list of attribute handlers; the handler is never called.  Instead,
ubifs uses its own xattr handlers which also process security xattrs.

Remove the dead code.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Reviewed-by: Richard Weinberger <richard@nod.at>
Cc: Artem Bityutskiy <dedekind1@gmail.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: linux-mtd@lists.infradead.org
Cc: Subodh Nijsure <snijsure@grid-net.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ubifs/super.c |  1 -
 fs/ubifs/ubifs.h |  1 -
 fs/ubifs/xattr.c | 40 ----------------------------------------
 3 files changed, 42 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 9547a278..c71edca 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -2037,7 +2037,6 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
 	if (c->max_inode_sz > MAX_LFS_FILESIZE)
 		sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE;
 	sb->s_op = &ubifs_super_operations;
-	sb->s_xattr = ubifs_xattr_handlers;
 
 	mutex_lock(&c->umount_mutex);
 	err = mount_ubifs(c);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index de75902..33b6ee7 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1470,7 +1470,6 @@ extern spinlock_t ubifs_infos_lock;
 extern atomic_long_t ubifs_clean_zn_cnt;
 extern struct kmem_cache *ubifs_inode_slab;
 extern const struct super_operations ubifs_super_operations;
-extern const struct xattr_handler *ubifs_xattr_handlers[];
 extern const struct address_space_operations ubifs_file_address_operations;
 extern const struct file_operations ubifs_file_operations;
 extern const struct inode_operations ubifs_file_inode_operations;
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index fd65b3f..513815c 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -582,46 +582,6 @@ out_free:
 	return err;
 }
 
-static size_t security_listxattr(struct dentry *d, char *list, size_t list_size,
-				 const char *name, size_t name_len, int flags)
-{
-	const int prefix_len = XATTR_SECURITY_PREFIX_LEN;
-	const size_t total_len = prefix_len + name_len + 1;
-
-	if (list && total_len <= list_size) {
-		memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
-		memcpy(list + prefix_len, name, name_len);
-		list[prefix_len + name_len] = '\0';
-	}
-
-	return total_len;
-}
-
-static int security_getxattr(struct dentry *d, const char *name, void *buffer,
-		      size_t size, int flags)
-{
-	return ubifs_getxattr(d, name, buffer, size);
-}
-
-static int security_setxattr(struct dentry *d, const char *name,
-			     const void *value, size_t size, int flags,
-			     int handler_flags)
-{
-	return ubifs_setxattr(d, name, value, size, flags);
-}
-
-static const struct xattr_handler ubifs_xattr_security_handler = {
-	.prefix = XATTR_SECURITY_PREFIX,
-	.list   = security_listxattr,
-	.get    = security_getxattr,
-	.set    = security_setxattr,
-};
-
-const struct xattr_handler *ubifs_xattr_handlers[] = {
-	&ubifs_xattr_security_handler,
-	NULL,
-};
-
 static int init_xattrs(struct inode *inode, const struct xattr *xattr_array,
 		      void *fs_info)
 {
-- 
cgit v1.1


From e282fb7f3bce68089ae2b40c227b015c23764a38 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Sun, 4 Oct 2015 19:18:49 +0200
Subject: hfsplus: Remove unused xattr handler list operations

The list operations can never be called; they are even documented to be
unused.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/hfsplus/xattr.c          | 11 -----------
 fs/hfsplus/xattr_security.c | 11 -----------
 fs/hfsplus/xattr_trusted.c  | 11 -----------
 fs/hfsplus/xattr_user.c     | 11 -----------
 4 files changed, 44 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index 416b1db..e898499 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -893,19 +893,8 @@ static int hfsplus_osx_setxattr(struct dentry *dentry, const char *name,
 	return __hfsplus_setxattr(d_inode(dentry), name, buffer, size, flags);
 }
 
-static size_t hfsplus_osx_listxattr(struct dentry *dentry, char *list,
-		size_t list_size, const char *name, size_t name_len, int type)
-{
-	/*
-	 * This method is not used.
-	 * It is used hfsplus_listxattr() instead of generic_listxattr().
-	 */
-	return -EOPNOTSUPP;
-}
-
 const struct xattr_handler hfsplus_xattr_osx_handler = {
 	.prefix	= XATTR_MAC_OSX_PREFIX,
-	.list	= hfsplus_osx_listxattr,
 	.get	= hfsplus_osx_getxattr,
 	.set	= hfsplus_osx_setxattr,
 };
diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c
index aacff00..024e61c 100644
--- a/fs/hfsplus/xattr_security.c
+++ b/fs/hfsplus/xattr_security.c
@@ -29,16 +29,6 @@ static int hfsplus_security_setxattr(struct dentry *dentry, const char *name,
 				XATTR_SECURITY_PREFIX_LEN);
 }
 
-static size_t hfsplus_security_listxattr(struct dentry *dentry, char *list,
-		size_t list_size, const char *name, size_t name_len, int type)
-{
-	/*
-	 * This method is not used.
-	 * It is used hfsplus_listxattr() instead of generic_listxattr().
-	 */
-	return -EOPNOTSUPP;
-}
-
 static int hfsplus_initxattrs(struct inode *inode,
 				const struct xattr *xattr_array,
 				void *fs_info)
@@ -92,7 +82,6 @@ int hfsplus_init_inode_security(struct inode *inode,
 
 const struct xattr_handler hfsplus_xattr_security_handler = {
 	.prefix	= XATTR_SECURITY_PREFIX,
-	.list	= hfsplus_security_listxattr,
 	.get	= hfsplus_security_getxattr,
 	.set	= hfsplus_security_setxattr,
 };
diff --git a/fs/hfsplus/xattr_trusted.c b/fs/hfsplus/xattr_trusted.c
index bcf65089..6186157 100644
--- a/fs/hfsplus/xattr_trusted.c
+++ b/fs/hfsplus/xattr_trusted.c
@@ -26,19 +26,8 @@ static int hfsplus_trusted_setxattr(struct dentry *dentry, const char *name,
 				XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
 }
 
-static size_t hfsplus_trusted_listxattr(struct dentry *dentry, char *list,
-		size_t list_size, const char *name, size_t name_len, int type)
-{
-	/*
-	 * This method is not used.
-	 * It is used hfsplus_listxattr() instead of generic_listxattr().
-	 */
-	return -EOPNOTSUPP;
-}
-
 const struct xattr_handler hfsplus_xattr_trusted_handler = {
 	.prefix	= XATTR_TRUSTED_PREFIX,
-	.list	= hfsplus_trusted_listxattr,
 	.get	= hfsplus_trusted_getxattr,
 	.set	= hfsplus_trusted_setxattr,
 };
diff --git a/fs/hfsplus/xattr_user.c b/fs/hfsplus/xattr_user.c
index 5aa0e6d..3b4caba 100644
--- a/fs/hfsplus/xattr_user.c
+++ b/fs/hfsplus/xattr_user.c
@@ -26,19 +26,8 @@ static int hfsplus_user_setxattr(struct dentry *dentry, const char *name,
 				XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
 }
 
-static size_t hfsplus_user_listxattr(struct dentry *dentry, char *list,
-		size_t list_size, const char *name, size_t name_len, int type)
-{
-	/*
-	 * This method is not used.
-	 * It is used hfsplus_listxattr() instead of generic_listxattr().
-	 */
-	return -EOPNOTSUPP;
-}
-
 const struct xattr_handler hfsplus_xattr_user_handler = {
 	.prefix	= XATTR_USER_PREFIX,
-	.list	= hfsplus_user_listxattr,
 	.get	= hfsplus_user_getxattr,
 	.set	= hfsplus_user_setxattr,
 };
-- 
cgit v1.1


From bf781714b3e1421a0ebcd0137d081e6566a89f15 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Sun, 4 Oct 2015 19:18:50 +0200
Subject: jffs2: Add missing capability check for listing trusted xattrs

The vfs checks if a task has the appropriate access for get and set
operations, but it cannot do that for the list operation; the file system
must check for that itself.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: linux-mtd@lists.infradead.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/jffs2/xattr_trusted.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c
index ceaf9c6..bbd20c1 100644
--- a/fs/jffs2/xattr_trusted.c
+++ b/fs/jffs2/xattr_trusted.c
@@ -39,6 +39,9 @@ static size_t jffs2_trusted_listxattr(struct dentry *dentry, char *list,
 {
 	size_t retlen = XATTR_TRUSTED_PREFIX_LEN + name_len + 1;
 
+	if (!capable(CAP_SYS_ADMIN))
+		return 0;
+
 	if (list && retlen<=list_size) {
 		strcpy(list, XATTR_TRUSTED_PREFIX);
 		strcpy(list + XATTR_TRUSTED_PREFIX_LEN, name);
-- 
cgit v1.1


From d9a82a04033f87bbd06efb29f78c0170a38154a8 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Sun, 4 Oct 2015 19:18:51 +0200
Subject: xattr handlers: Pass handler to operations instead of flags

The xattr_handler operations are currently all passed a file system
specific flags value which the operations can use to disambiguate between
different handlers; some file systems use that to distinguish the xattr
namespace, for example.  In some oprations, it would be useful to also have
access to the handler prefix.  To allow that, pass a pointer to the handler
to operations instead of the flags value alone.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/9p/acl.c                  | 18 +++++++++--------
 fs/9p/xattr_security.c       | 10 ++++++----
 fs/9p/xattr_trusted.c        | 10 ++++++----
 fs/9p/xattr_user.c           | 10 ++++++----
 fs/ext2/xattr.c              |  7 +++----
 fs/ext2/xattr_security.c     | 15 ++++++++------
 fs/ext2/xattr_trusted.c      | 15 ++++++++------
 fs/ext2/xattr_user.c         | 15 ++++++++------
 fs/ext4/xattr.c              |  7 +++----
 fs/ext4/xattr_security.c     | 15 ++++++++------
 fs/ext4/xattr_trusted.c      | 15 ++++++++------
 fs/ext4/xattr_user.c         | 15 ++++++++------
 fs/f2fs/xattr.c              | 47 +++++++++++++++++++++++++-------------------
 fs/gfs2/xattr.c              | 13 +++++++-----
 fs/hfsplus/xattr.c           | 10 ++++++----
 fs/hfsplus/xattr_security.c  | 10 ++++++----
 fs/hfsplus/xattr_trusted.c   | 10 ++++++----
 fs/hfsplus/xattr_user.c      | 10 ++++++----
 fs/jffs2/security.c          | 16 +++++++++------
 fs/jffs2/xattr.c             |  9 +++++----
 fs/jffs2/xattr_trusted.c     | 16 +++++++++------
 fs/jffs2/xattr_user.c        | 16 +++++++++------
 fs/nfs/nfs4proc.c            | 34 +++++++++++++++++++-------------
 fs/ocfs2/xattr.c             | 45 +++++++++++++++++++++++++-----------------
 fs/posix_acl.c               | 28 ++++++++++++--------------
 fs/reiserfs/xattr.c          | 16 +++++++--------
 fs/reiserfs/xattr_security.c | 13 ++++++------
 fs/reiserfs/xattr_trusted.c  | 13 ++++++------
 fs/reiserfs/xattr_user.c     | 13 ++++++------
 fs/squashfs/xattr.c          | 36 ++++++++++++++++++++-------------
 fs/xattr.c                   | 15 +++++++-------
 fs/xfs/xfs_xattr.c           | 10 ++++++----
 32 files changed, 306 insertions(+), 226 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 31c0103..e6fe824 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -230,11 +230,13 @@ static int v9fs_remote_get_acl(struct dentry *dentry, const char *name,
 	return v9fs_xattr_get(dentry, full_name, buffer, size);
 }
 
-static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name,
-			      void *buffer, size_t size, int type)
+static int v9fs_xattr_get_acl(const struct xattr_handler *handler,
+			      struct dentry *dentry, const char *name,
+			      void *buffer, size_t size)
 {
 	struct v9fs_session_info *v9ses;
 	struct posix_acl *acl;
+	int type = handler->flags;
 	int error;
 
 	if (strcmp(name, "") != 0)
@@ -278,9 +280,9 @@ static int v9fs_remote_set_acl(struct dentry *dentry, const char *name,
 }
 
 
-static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
-			      const void *value, size_t size,
-			      int flags, int type)
+static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
+			      struct dentry *dentry, const char *name,
+			      const void *value, size_t size, int flags)
 {
 	int retval;
 	struct posix_acl *acl;
@@ -297,7 +299,7 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
 	 */
 	if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
 		return v9fs_remote_set_acl(dentry, name,
-					   value, size, flags, type);
+					   value, size, flags, handler->flags);
 
 	if (S_ISLNK(inode->i_mode))
 		return -EOPNOTSUPP;
@@ -316,7 +318,7 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
 	} else
 		acl = NULL;
 
-	switch (type) {
+	switch (handler->flags) {
 	case ACL_TYPE_ACCESS:
 		name = POSIX_ACL_XATTR_ACCESS;
 		if (acl) {
@@ -360,7 +362,7 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
 	}
 	retval = v9fs_xattr_set(dentry, name, value, size, flags);
 	if (!retval)
-		set_cached_acl(inode, type, acl);
+		set_cached_acl(inode, handler->flags, acl);
 err_out:
 	posix_acl_release(acl);
 	return retval;
diff --git a/fs/9p/xattr_security.c b/fs/9p/xattr_security.c
index cb247a1..c0a470a 100644
--- a/fs/9p/xattr_security.c
+++ b/fs/9p/xattr_security.c
@@ -19,8 +19,9 @@
 #include <linux/slab.h>
 #include "xattr.h"
 
-static int v9fs_xattr_security_get(struct dentry *dentry, const char *name,
-			void *buffer, size_t size, int type)
+static int v9fs_xattr_security_get(const struct xattr_handler *handler,
+			struct dentry *dentry, const char *name,
+			void *buffer, size_t size)
 {
 	int retval;
 	char *full_name;
@@ -46,8 +47,9 @@ static int v9fs_xattr_security_get(struct dentry *dentry, const char *name,
 	return retval;
 }
 
-static int v9fs_xattr_security_set(struct dentry *dentry, const char *name,
-			const void *value, size_t size, int flags, int type)
+static int v9fs_xattr_security_set(const struct xattr_handler *handler,
+			struct dentry *dentry, const char *name,
+			const void *value, size_t size, int flags)
 {
 	int retval;
 	char *full_name;
diff --git a/fs/9p/xattr_trusted.c b/fs/9p/xattr_trusted.c
index e30d33b..b888a4e 100644
--- a/fs/9p/xattr_trusted.c
+++ b/fs/9p/xattr_trusted.c
@@ -19,8 +19,9 @@
 #include <linux/slab.h>
 #include "xattr.h"
 
-static int v9fs_xattr_trusted_get(struct dentry *dentry, const char *name,
-			void *buffer, size_t size, int type)
+static int v9fs_xattr_trusted_get(const struct xattr_handler *handler,
+			struct dentry *dentry, const char *name,
+			void *buffer, size_t size)
 {
 	int retval;
 	char *full_name;
@@ -46,8 +47,9 @@ static int v9fs_xattr_trusted_get(struct dentry *dentry, const char *name,
 	return retval;
 }
 
-static int v9fs_xattr_trusted_set(struct dentry *dentry, const char *name,
-			const void *value, size_t size, int flags, int type)
+static int v9fs_xattr_trusted_set(const struct xattr_handler *handler,
+			struct dentry *dentry, const char *name,
+			const void *value, size_t size, int flags)
 {
 	int retval;
 	char *full_name;
diff --git a/fs/9p/xattr_user.c b/fs/9p/xattr_user.c
index d0b701b..06f136c 100644
--- a/fs/9p/xattr_user.c
+++ b/fs/9p/xattr_user.c
@@ -19,8 +19,9 @@
 #include <linux/slab.h>
 #include "xattr.h"
 
-static int v9fs_xattr_user_get(struct dentry *dentry, const char *name,
-			void *buffer, size_t size, int type)
+static int v9fs_xattr_user_get(const struct xattr_handler *handler,
+			struct dentry *dentry, const char *name,
+			void *buffer, size_t size)
 {
 	int retval;
 	char *full_name;
@@ -46,8 +47,9 @@ static int v9fs_xattr_user_get(struct dentry *dentry, const char *name,
 	return retval;
 }
 
-static int v9fs_xattr_user_set(struct dentry *dentry, const char *name,
-			const void *value, size_t size, int flags, int type)
+static int v9fs_xattr_user_set(const struct xattr_handler *handler,
+			struct dentry *dentry, const char *name,
+			const void *value, size_t size, int flags)
 {
 	int retval;
 	char *full_name;
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 0b6bfd3..fa70848 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -293,10 +293,9 @@ bad_block:	ext2_error(inode->i_sb, "ext2_xattr_list",
 			ext2_xattr_handler(entry->e_name_index);
 
 		if (handler) {
-			size_t size = handler->list(dentry, buffer, rest,
-						    entry->e_name,
-						    entry->e_name_len,
-						    handler->flags);
+			size_t size = handler->list(handler, dentry, buffer,
+						    rest, entry->e_name,
+						    entry->e_name_len);
 			if (buffer) {
 				if (size > rest) {
 					error = -ERANGE;
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index 702fc68..dfb0875 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -8,8 +8,9 @@
 #include "xattr.h"
 
 static size_t
-ext2_xattr_security_list(struct dentry *dentry, char *list, size_t list_size,
-			 const char *name, size_t name_len, int type)
+ext2_xattr_security_list(const struct xattr_handler *handler,
+			 struct dentry *dentry, char *list, size_t list_size,
+			 const char *name, size_t name_len)
 {
 	const int prefix_len = XATTR_SECURITY_PREFIX_LEN;
 	const size_t total_len = prefix_len + name_len + 1;
@@ -23,8 +24,9 @@ ext2_xattr_security_list(struct dentry *dentry, char *list, size_t list_size,
 }
 
 static int
-ext2_xattr_security_get(struct dentry *dentry, const char *name,
-		       void *buffer, size_t size, int type)
+ext2_xattr_security_get(const struct xattr_handler *handler,
+			struct dentry *dentry, const char *name,
+			void *buffer, size_t size)
 {
 	if (strcmp(name, "") == 0)
 		return -EINVAL;
@@ -33,8 +35,9 @@ ext2_xattr_security_get(struct dentry *dentry, const char *name,
 }
 
 static int
-ext2_xattr_security_set(struct dentry *dentry, const char *name,
-		const void *value, size_t size, int flags, int type)
+ext2_xattr_security_set(const struct xattr_handler *handler,
+			struct dentry *dentry, const char *name,
+			const void *value, size_t size, int flags)
 {
 	if (strcmp(name, "") == 0)
 		return -EINVAL;
diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c
index 42b6e98..3150dd3 100644
--- a/fs/ext2/xattr_trusted.c
+++ b/fs/ext2/xattr_trusted.c
@@ -9,8 +9,9 @@
 #include "xattr.h"
 
 static size_t
-ext2_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size,
-		const char *name, size_t name_len, int type)
+ext2_xattr_trusted_list(const struct xattr_handler *handler,
+			struct dentry *dentry, char *list, size_t list_size,
+			const char *name, size_t name_len)
 {
 	const int prefix_len = XATTR_TRUSTED_PREFIX_LEN;
 	const size_t total_len = prefix_len + name_len + 1;
@@ -27,8 +28,9 @@ ext2_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size,
 }
 
 static int
-ext2_xattr_trusted_get(struct dentry *dentry, const char *name,
-		void *buffer, size_t size, int type)
+ext2_xattr_trusted_get(const struct xattr_handler *handler,
+		       struct dentry *dentry, const char *name,
+		       void *buffer, size_t size)
 {
 	if (strcmp(name, "") == 0)
 		return -EINVAL;
@@ -37,8 +39,9 @@ ext2_xattr_trusted_get(struct dentry *dentry, const char *name,
 }
 
 static int
-ext2_xattr_trusted_set(struct dentry *dentry, const char *name,
-		const void *value, size_t size, int flags, int type)
+ext2_xattr_trusted_set(const struct xattr_handler *handler,
+		       struct dentry *dentry, const char *name,
+		       const void *value, size_t size, int flags)
 {
 	if (strcmp(name, "") == 0)
 		return -EINVAL;
diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c
index ecdc460..339a49b 100644
--- a/fs/ext2/xattr_user.c
+++ b/fs/ext2/xattr_user.c
@@ -11,8 +11,9 @@
 #include "xattr.h"
 
 static size_t
-ext2_xattr_user_list(struct dentry *dentry, char *list, size_t list_size,
-		const char *name, size_t name_len, int type)
+ext2_xattr_user_list(const struct xattr_handler *handler,
+		     struct dentry *dentry, char *list, size_t list_size,
+		     const char *name, size_t name_len)
 {
 	const size_t prefix_len = XATTR_USER_PREFIX_LEN;
 	const size_t total_len = prefix_len + name_len + 1;
@@ -29,8 +30,9 @@ ext2_xattr_user_list(struct dentry *dentry, char *list, size_t list_size,
 }
 
 static int
-ext2_xattr_user_get(struct dentry *dentry, const char *name,
-		void *buffer, size_t size, int type)
+ext2_xattr_user_get(const struct xattr_handler *handler,
+		    struct dentry *dentry, const char *name,
+		    void *buffer, size_t size)
 {
 	if (strcmp(name, "") == 0)
 		return -EINVAL;
@@ -41,8 +43,9 @@ ext2_xattr_user_get(struct dentry *dentry, const char *name,
 }
 
 static int
-ext2_xattr_user_set(struct dentry *dentry, const char *name,
-		const void *value, size_t size, int flags, int type)
+ext2_xattr_user_set(const struct xattr_handler *handler,
+		    struct dentry *dentry, const char *name,
+		    const void *value, size_t size, int flags)
 {
 	if (strcmp(name, "") == 0)
 		return -EINVAL;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 984448c..6b6b3e7 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -405,10 +405,9 @@ ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry,
 			ext4_xattr_handler(entry->e_name_index);
 
 		if (handler) {
-			size_t size = handler->list(dentry, buffer, rest,
-						    entry->e_name,
-						    entry->e_name_len,
-						    handler->flags);
+			size_t size = handler->list(handler, dentry, buffer,
+						    rest, entry->e_name,
+						    entry->e_name_len);
 			if (buffer) {
 				if (size > rest)
 					return -ERANGE;
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 95d90e0..36f4c1a 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -12,8 +12,9 @@
 #include "xattr.h"
 
 static size_t
-ext4_xattr_security_list(struct dentry *dentry, char *list, size_t list_size,
-		const char *name, size_t name_len, int type)
+ext4_xattr_security_list(const struct xattr_handler *handler,
+			 struct dentry *dentry, char *list, size_t list_size,
+			 const char *name, size_t name_len)
 {
 	const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1;
 	const size_t total_len = prefix_len + name_len + 1;
@@ -28,8 +29,9 @@ ext4_xattr_security_list(struct dentry *dentry, char *list, size_t list_size,
 }
 
 static int
-ext4_xattr_security_get(struct dentry *dentry, const char *name,
-		       void *buffer, size_t size, int type)
+ext4_xattr_security_get(const struct xattr_handler *handler,
+			struct dentry *dentry, const char *name,
+			void *buffer, size_t size)
 {
 	if (strcmp(name, "") == 0)
 		return -EINVAL;
@@ -38,8 +40,9 @@ ext4_xattr_security_get(struct dentry *dentry, const char *name,
 }
 
 static int
-ext4_xattr_security_set(struct dentry *dentry, const char *name,
-		const void *value, size_t size, int flags, int type)
+ext4_xattr_security_set(const struct xattr_handler *handler,
+			struct dentry *dentry, const char *name,
+			const void *value, size_t size, int flags)
 {
 	if (strcmp(name, "") == 0)
 		return -EINVAL;
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index 891ee2d..4880890 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -13,8 +13,9 @@
 #include "xattr.h"
 
 static size_t
-ext4_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size,
-		const char *name, size_t name_len, int type)
+ext4_xattr_trusted_list(const struct xattr_handler *handler,
+			struct dentry *dentry, char *list, size_t list_size,
+			const char *name, size_t name_len)
 {
 	const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
 	const size_t total_len = prefix_len + name_len + 1;
@@ -31,8 +32,9 @@ ext4_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size,
 }
 
 static int
-ext4_xattr_trusted_get(struct dentry *dentry, const char *name, void *buffer,
-		size_t size, int type)
+ext4_xattr_trusted_get(const struct xattr_handler *handler,
+		       struct dentry *dentry, const char *name, void *buffer,
+		       size_t size)
 {
 	if (strcmp(name, "") == 0)
 		return -EINVAL;
@@ -41,8 +43,9 @@ ext4_xattr_trusted_get(struct dentry *dentry, const char *name, void *buffer,
 }
 
 static int
-ext4_xattr_trusted_set(struct dentry *dentry, const char *name,
-		const void *value, size_t size, int flags, int type)
+ext4_xattr_trusted_set(const struct xattr_handler *handler,
+		       struct dentry *dentry, const char *name,
+		       const void *value, size_t size, int flags)
 {
 	if (strcmp(name, "") == 0)
 		return -EINVAL;
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index 6ed932b..d2dec33 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -12,8 +12,9 @@
 #include "xattr.h"
 
 static size_t
-ext4_xattr_user_list(struct dentry *dentry, char *list, size_t list_size,
-		     const char *name, size_t name_len, int type)
+ext4_xattr_user_list(const struct xattr_handler *handler,
+		     struct dentry *dentry, char *list, size_t list_size,
+		     const char *name, size_t name_len)
 {
 	const size_t prefix_len = XATTR_USER_PREFIX_LEN;
 	const size_t total_len = prefix_len + name_len + 1;
@@ -30,8 +31,9 @@ ext4_xattr_user_list(struct dentry *dentry, char *list, size_t list_size,
 }
 
 static int
-ext4_xattr_user_get(struct dentry *dentry, const char *name,
-		    void *buffer, size_t size, int type)
+ext4_xattr_user_get(const struct xattr_handler *handler,
+		    struct dentry *dentry, const char *name,
+		    void *buffer, size_t size)
 {
 	if (strcmp(name, "") == 0)
 		return -EINVAL;
@@ -42,8 +44,9 @@ ext4_xattr_user_get(struct dentry *dentry, const char *name,
 }
 
 static int
-ext4_xattr_user_set(struct dentry *dentry, const char *name,
-		    const void *value, size_t size, int flags, int type)
+ext4_xattr_user_set(const struct xattr_handler *handler,
+		    struct dentry *dentry, const char *name,
+		    const void *value, size_t size, int flags)
 {
 	if (strcmp(name, "") == 0)
 		return -EINVAL;
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 4de2286..e643173 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -25,14 +25,15 @@
 #include "f2fs.h"
 #include "xattr.h"
 
-static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list,
-		size_t list_size, const char *name, size_t len, int type)
+static size_t f2fs_xattr_generic_list(const struct xattr_handler *handler,
+		struct dentry *dentry, char *list, size_t list_size,
+		const char *name, size_t len)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
 	int total_len, prefix_len = 0;
 	const char *prefix = NULL;
 
-	switch (type) {
+	switch (handler->flags) {
 	case F2FS_XATTR_INDEX_USER:
 		if (!test_opt(sbi, XATTR_USER))
 			return -EOPNOTSUPP;
@@ -62,12 +63,13 @@ static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list,
 	return total_len;
 }
 
-static int f2fs_xattr_generic_get(struct dentry *dentry, const char *name,
-		void *buffer, size_t size, int type)
+static int f2fs_xattr_generic_get(const struct xattr_handler *handler,
+		struct dentry *dentry, const char *name, void *buffer,
+		size_t size)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
 
-	switch (type) {
+	switch (handler->flags) {
 	case F2FS_XATTR_INDEX_USER:
 		if (!test_opt(sbi, XATTR_USER))
 			return -EOPNOTSUPP;
@@ -83,15 +85,17 @@ static int f2fs_xattr_generic_get(struct dentry *dentry, const char *name,
 	}
 	if (strcmp(name, "") == 0)
 		return -EINVAL;
-	return f2fs_getxattr(d_inode(dentry), type, name, buffer, size, NULL);
+	return f2fs_getxattr(d_inode(dentry), handler->flags, name,
+			     buffer, size, NULL);
 }
 
-static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name,
-		const void *value, size_t size, int flags, int type)
+static int f2fs_xattr_generic_set(const struct xattr_handler *handler,
+		struct dentry *dentry, const char *name, const void *value,
+		size_t size, int flags)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
 
-	switch (type) {
+	switch (handler->flags) {
 	case F2FS_XATTR_INDEX_USER:
 		if (!test_opt(sbi, XATTR_USER))
 			return -EOPNOTSUPP;
@@ -108,17 +112,18 @@ static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name,
 	if (strcmp(name, "") == 0)
 		return -EINVAL;
 
-	return f2fs_setxattr(d_inode(dentry), type, name,
+	return f2fs_setxattr(d_inode(dentry), handler->flags, name,
 					value, size, NULL, flags);
 }
 
-static size_t f2fs_xattr_advise_list(struct dentry *dentry, char *list,
-		size_t list_size, const char *name, size_t len, int type)
+static size_t f2fs_xattr_advise_list(const struct xattr_handler *handler,
+		struct dentry *dentry, char *list, size_t list_size,
+		const char *name, size_t len)
 {
 	const char *xname = F2FS_SYSTEM_ADVISE_PREFIX;
 	size_t size;
 
-	if (type != F2FS_XATTR_INDEX_ADVISE)
+	if (handler->flags != F2FS_XATTR_INDEX_ADVISE)
 		return 0;
 
 	size = strlen(xname) + 1;
@@ -127,8 +132,9 @@ static size_t f2fs_xattr_advise_list(struct dentry *dentry, char *list,
 	return size;
 }
 
-static int f2fs_xattr_advise_get(struct dentry *dentry, const char *name,
-		void *buffer, size_t size, int type)
+static int f2fs_xattr_advise_get(const struct xattr_handler *handler,
+		struct dentry *dentry, const char *name, void *buffer,
+		size_t size)
 {
 	struct inode *inode = d_inode(dentry);
 
@@ -140,8 +146,9 @@ static int f2fs_xattr_advise_get(struct dentry *dentry, const char *name,
 	return sizeof(char);
 }
 
-static int f2fs_xattr_advise_set(struct dentry *dentry, const char *name,
-		const void *value, size_t size, int flags, int type)
+static int f2fs_xattr_advise_set(const struct xattr_handler *handler,
+		struct dentry *dentry, const char *name, const void *value,
+		size_t size, int flags)
 {
 	struct inode *inode = d_inode(dentry);
 
@@ -462,8 +469,8 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
 		if (!handler)
 			continue;
 
-		size = handler->list(dentry, buffer, rest, entry->e_name,
-				entry->e_name_len, handler->flags);
+		size = handler->list(handler, dentry, buffer, rest,
+				     entry->e_name, entry->e_name_len);
 		if (buffer && size > rest) {
 			error = -ERANGE;
 			goto cleanup;
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 4c096fa..53ce76a 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -583,11 +583,13 @@ out:
  *
  * Returns: actual size of data on success, -errno on error
  */
-static int gfs2_xattr_get(struct dentry *dentry, const char *name,
-		void *buffer, size_t size, int type)
+static int gfs2_xattr_get(const struct xattr_handler *handler,
+			  struct dentry *dentry, const char *name,
+			  void *buffer, size_t size)
 {
 	struct gfs2_inode *ip = GFS2_I(d_inode(dentry));
 	struct gfs2_ea_location el;
+	int type = handler->flags;
 	int error;
 
 	if (!ip->i_eattr)
@@ -1227,11 +1229,12 @@ int __gfs2_xattr_set(struct inode *inode, const char *name,
 	return error;
 }
 
-static int gfs2_xattr_set(struct dentry *dentry, const char *name,
-		const void *value, size_t size, int flags, int type)
+static int gfs2_xattr_set(const struct xattr_handler *handler,
+			  struct dentry *dentry, const char *name,
+			  const void *value, size_t size, int flags)
 {
 	return __gfs2_xattr_set(d_inode(dentry), name, value,
-				size, flags, type);
+				size, flags, handler->flags);
 }
 
 
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index e898499..e41a010 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -849,8 +849,9 @@ end_removexattr:
 	return err;
 }
 
-static int hfsplus_osx_getxattr(struct dentry *dentry, const char *name,
-					void *buffer, size_t size, int type)
+static int hfsplus_osx_getxattr(const struct xattr_handler *handler,
+				struct dentry *dentry, const char *name,
+				void *buffer, size_t size)
 {
 	if (!strcmp(name, ""))
 		return -EINVAL;
@@ -871,8 +872,9 @@ static int hfsplus_osx_getxattr(struct dentry *dentry, const char *name,
 	return __hfsplus_getxattr(d_inode(dentry), name, buffer, size);
 }
 
-static int hfsplus_osx_setxattr(struct dentry *dentry, const char *name,
-		const void *buffer, size_t size, int flags, int type)
+static int hfsplus_osx_setxattr(const struct xattr_handler *handler,
+				struct dentry *dentry, const char *name,
+				const void *buffer, size_t size, int flags)
 {
 	if (!strcmp(name, ""))
 		return -EINVAL;
diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c
index 024e61c..72a68a3 100644
--- a/fs/hfsplus/xattr_security.c
+++ b/fs/hfsplus/xattr_security.c
@@ -13,16 +13,18 @@
 #include "xattr.h"
 #include "acl.h"
 
-static int hfsplus_security_getxattr(struct dentry *dentry, const char *name,
-					void *buffer, size_t size, int type)
+static int hfsplus_security_getxattr(const struct xattr_handler *handler,
+				     struct dentry *dentry, const char *name,
+				     void *buffer, size_t size)
 {
 	return hfsplus_getxattr(dentry, name, buffer, size,
 				XATTR_SECURITY_PREFIX,
 				XATTR_SECURITY_PREFIX_LEN);
 }
 
-static int hfsplus_security_setxattr(struct dentry *dentry, const char *name,
-		const void *buffer, size_t size, int flags, int type)
+static int hfsplus_security_setxattr(const struct xattr_handler *handler,
+				     struct dentry *dentry, const char *name,
+				     const void *buffer, size_t size, int flags)
 {
 	return hfsplus_setxattr(dentry, name, buffer, size, flags,
 				XATTR_SECURITY_PREFIX,
diff --git a/fs/hfsplus/xattr_trusted.c b/fs/hfsplus/xattr_trusted.c
index 6186157..95a7704 100644
--- a/fs/hfsplus/xattr_trusted.c
+++ b/fs/hfsplus/xattr_trusted.c
@@ -11,16 +11,18 @@
 #include "hfsplus_fs.h"
 #include "xattr.h"
 
-static int hfsplus_trusted_getxattr(struct dentry *dentry, const char *name,
-					void *buffer, size_t size, int type)
+static int hfsplus_trusted_getxattr(const struct xattr_handler *handler,
+				    struct dentry *dentry, const char *name,
+				    void *buffer, size_t size)
 {
 	return hfsplus_getxattr(dentry, name, buffer, size,
 				XATTR_TRUSTED_PREFIX,
 				XATTR_TRUSTED_PREFIX_LEN);
 }
 
-static int hfsplus_trusted_setxattr(struct dentry *dentry, const char *name,
-		const void *buffer, size_t size, int flags, int type)
+static int hfsplus_trusted_setxattr(const struct xattr_handler *handler,
+				    struct dentry *dentry, const char *name,
+				    const void *buffer, size_t size, int flags)
 {
 	return hfsplus_setxattr(dentry, name, buffer, size, flags,
 				XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
diff --git a/fs/hfsplus/xattr_user.c b/fs/hfsplus/xattr_user.c
index 3b4caba..6fc269b 100644
--- a/fs/hfsplus/xattr_user.c
+++ b/fs/hfsplus/xattr_user.c
@@ -11,16 +11,18 @@
 #include "hfsplus_fs.h"
 #include "xattr.h"
 
-static int hfsplus_user_getxattr(struct dentry *dentry, const char *name,
-					void *buffer, size_t size, int type)
+static int hfsplus_user_getxattr(const struct xattr_handler *handler,
+				 struct dentry *dentry, const char *name,
+				 void *buffer, size_t size)
 {
 
 	return hfsplus_getxattr(dentry, name, buffer, size,
 				XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
 }
 
-static int hfsplus_user_setxattr(struct dentry *dentry, const char *name,
-		const void *buffer, size_t size, int flags, int type)
+static int hfsplus_user_setxattr(const struct xattr_handler *handler,
+				 struct dentry *dentry, const char *name,
+				 const void *buffer, size_t size, int flags)
 {
 	return hfsplus_setxattr(dentry, name, buffer, size, flags,
 				XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
index d4b43fb..bf12fe5 100644
--- a/fs/jffs2/security.c
+++ b/fs/jffs2/security.c
@@ -48,8 +48,9 @@ int jffs2_init_security(struct inode *inode, struct inode *dir,
 }
 
 /* ---- XATTR Handler for "security.*" ----------------- */
-static int jffs2_security_getxattr(struct dentry *dentry, const char *name,
-				   void *buffer, size_t size, int type)
+static int jffs2_security_getxattr(const struct xattr_handler *handler,
+				   struct dentry *dentry, const char *name,
+				   void *buffer, size_t size)
 {
 	if (!strcmp(name, ""))
 		return -EINVAL;
@@ -58,8 +59,9 @@ static int jffs2_security_getxattr(struct dentry *dentry, const char *name,
 				 name, buffer, size);
 }
 
-static int jffs2_security_setxattr(struct dentry *dentry, const char *name,
-		const void *buffer, size_t size, int flags, int type)
+static int jffs2_security_setxattr(const struct xattr_handler *handler,
+				   struct dentry *dentry, const char *name,
+				   const void *buffer, size_t size, int flags)
 {
 	if (!strcmp(name, ""))
 		return -EINVAL;
@@ -68,8 +70,10 @@ static int jffs2_security_setxattr(struct dentry *dentry, const char *name,
 				 name, buffer, size, flags);
 }
 
-static size_t jffs2_security_listxattr(struct dentry *dentry, char *list,
-		size_t list_size, const char *name, size_t name_len, int type)
+static size_t jffs2_security_listxattr(const struct xattr_handler *handler,
+				       struct dentry *dentry, char *list,
+				       size_t list_size, const char *name,
+				       size_t name_len)
 {
 	size_t retlen = XATTR_SECURITY_PREFIX_LEN + name_len + 1;
 
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index f092fee..4c2c036 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -1001,11 +1001,12 @@ ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
 		if (!xhandle)
 			continue;
 		if (buffer) {
-			rc = xhandle->list(dentry, buffer+len, size-len,
-					   xd->xname, xd->name_len, xd->flags);
+			rc = xhandle->list(xhandle, dentry, buffer + len,
+					   size - len, xd->xname,
+					   xd->name_len);
 		} else {
-			rc = xhandle->list(dentry, NULL, 0, xd->xname,
-					   xd->name_len, xd->flags);
+			rc = xhandle->list(xhandle, dentry, NULL, 0,
+					   xd->xname, xd->name_len);
 		}
 		if (rc < 0)
 			goto out;
diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c
index bbd20c1..a562da0 100644
--- a/fs/jffs2/xattr_trusted.c
+++ b/fs/jffs2/xattr_trusted.c
@@ -16,8 +16,9 @@
 #include <linux/mtd/mtd.h>
 #include "nodelist.h"
 
-static int jffs2_trusted_getxattr(struct dentry *dentry, const char *name,
-		void *buffer, size_t size, int type)
+static int jffs2_trusted_getxattr(const struct xattr_handler *handler,
+				  struct dentry *dentry, const char *name,
+				  void *buffer, size_t size)
 {
 	if (!strcmp(name, ""))
 		return -EINVAL;
@@ -25,8 +26,9 @@ static int jffs2_trusted_getxattr(struct dentry *dentry, const char *name,
 				 name, buffer, size);
 }
 
-static int jffs2_trusted_setxattr(struct dentry *dentry, const char *name,
-		const void *buffer, size_t size, int flags, int type)
+static int jffs2_trusted_setxattr(const struct xattr_handler *handler,
+				  struct dentry *dentry, const char *name,
+				  const void *buffer, size_t size, int flags)
 {
 	if (!strcmp(name, ""))
 		return -EINVAL;
@@ -34,8 +36,10 @@ static int jffs2_trusted_setxattr(struct dentry *dentry, const char *name,
 				 name, buffer, size, flags);
 }
 
-static size_t jffs2_trusted_listxattr(struct dentry *dentry, char *list,
-		size_t list_size, const char *name, size_t name_len, int type)
+static size_t jffs2_trusted_listxattr(const struct xattr_handler *handler,
+				      struct dentry *dentry, char *list,
+				      size_t list_size, const char *name,
+				      size_t name_len)
 {
 	size_t retlen = XATTR_TRUSTED_PREFIX_LEN + name_len + 1;
 
diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c
index a71391e..cbc0472 100644
--- a/fs/jffs2/xattr_user.c
+++ b/fs/jffs2/xattr_user.c
@@ -16,8 +16,9 @@
 #include <linux/mtd/mtd.h>
 #include "nodelist.h"
 
-static int jffs2_user_getxattr(struct dentry *dentry, const char *name,
-			       void *buffer, size_t size, int type)
+static int jffs2_user_getxattr(const struct xattr_handler *handler,
+			       struct dentry *dentry, const char *name,
+			       void *buffer, size_t size)
 {
 	if (!strcmp(name, ""))
 		return -EINVAL;
@@ -25,8 +26,9 @@ static int jffs2_user_getxattr(struct dentry *dentry, const char *name,
 				 name, buffer, size);
 }
 
-static int jffs2_user_setxattr(struct dentry *dentry, const char *name,
-		const void *buffer, size_t size, int flags, int type)
+static int jffs2_user_setxattr(const struct xattr_handler *handler,
+			       struct dentry *dentry, const char *name,
+			       const void *buffer, size_t size, int flags)
 {
 	if (!strcmp(name, ""))
 		return -EINVAL;
@@ -34,8 +36,10 @@ static int jffs2_user_setxattr(struct dentry *dentry, const char *name,
 				 name, buffer, size, flags);
 }
 
-static size_t jffs2_user_listxattr(struct dentry *dentry, char *list,
-		size_t list_size, const char *name, size_t name_len, int type)
+static size_t jffs2_user_listxattr(const struct xattr_handler *handler,
+				   struct dentry *dentry, char *list,
+				   size_t list_size, const char *name,
+				   size_t name_len)
 {
 	size_t retlen = XATTR_USER_PREFIX_LEN + name_len + 1;
 
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 0e5ff69..ab84c4d 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -6249,9 +6249,10 @@ nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp)
 
 #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
 
-static int nfs4_xattr_set_nfs4_acl(struct dentry *dentry, const char *key,
+static int nfs4_xattr_set_nfs4_acl(const struct xattr_handler *handler,
+				   struct dentry *dentry, const char *key,
 				   const void *buf, size_t buflen,
-				   int flags, int type)
+				   int flags)
 {
 	if (strcmp(key, "") != 0)
 		return -EINVAL;
@@ -6259,8 +6260,9 @@ static int nfs4_xattr_set_nfs4_acl(struct dentry *dentry, const char *key,
 	return nfs4_proc_set_acl(d_inode(dentry), buf, buflen);
 }
 
-static int nfs4_xattr_get_nfs4_acl(struct dentry *dentry, const char *key,
-				   void *buf, size_t buflen, int type)
+static int nfs4_xattr_get_nfs4_acl(const struct xattr_handler *handler,
+				   struct dentry *dentry, const char *key,
+				   void *buf, size_t buflen)
 {
 	if (strcmp(key, "") != 0)
 		return -EINVAL;
@@ -6268,9 +6270,10 @@ static int nfs4_xattr_get_nfs4_acl(struct dentry *dentry, const char *key,
 	return nfs4_proc_get_acl(d_inode(dentry), buf, buflen);
 }
 
-static size_t nfs4_xattr_list_nfs4_acl(struct dentry *dentry, char *list,
+static size_t nfs4_xattr_list_nfs4_acl(const struct xattr_handler *handler,
+				       struct dentry *dentry, char *list,
 				       size_t list_len, const char *name,
-				       size_t name_len, int type)
+				       size_t name_len)
 {
 	size_t len = sizeof(XATTR_NAME_NFSV4_ACL);
 
@@ -6288,9 +6291,10 @@ static inline int nfs4_server_supports_labels(struct nfs_server *server)
 	return server->caps & NFS_CAP_SECURITY_LABEL;
 }
 
-static int nfs4_xattr_set_nfs4_label(struct dentry *dentry, const char *key,
-				   const void *buf, size_t buflen,
-				   int flags, int type)
+static int nfs4_xattr_set_nfs4_label(const struct xattr_handler *handler,
+				     struct dentry *dentry, const char *key,
+				     const void *buf, size_t buflen,
+				     int flags)
 {
 	if (security_ismaclabel(key))
 		return nfs4_set_security_label(dentry, buf, buflen);
@@ -6298,17 +6302,19 @@ static int nfs4_xattr_set_nfs4_label(struct dentry *dentry, const char *key,
 	return -EOPNOTSUPP;
 }
 
-static int nfs4_xattr_get_nfs4_label(struct dentry *dentry, const char *key,
-				   void *buf, size_t buflen, int type)
+static int nfs4_xattr_get_nfs4_label(const struct xattr_handler *handler,
+				     struct dentry *dentry, const char *key,
+				     void *buf, size_t buflen)
 {
 	if (security_ismaclabel(key))
 		return nfs4_get_security_label(d_inode(dentry), buf, buflen);
 	return -EOPNOTSUPP;
 }
 
-static size_t nfs4_xattr_list_nfs4_label(struct dentry *dentry, char *list,
-				       size_t list_len, const char *name,
-				       size_t name_len, int type)
+static size_t nfs4_xattr_list_nfs4_label(const struct xattr_handler *handler,
+					 struct dentry *dentry, char *list,
+					 size_t list_len, const char *name,
+					 size_t name_len)
 {
 	size_t len = 0;
 
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index ebfdea7..e9164f0 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -7229,9 +7229,10 @@ leave:
 /*
  * 'security' attributes support
  */
-static size_t ocfs2_xattr_security_list(struct dentry *dentry, char *list,
+static size_t ocfs2_xattr_security_list(const struct xattr_handler *handler,
+					struct dentry *dentry, char *list,
 					size_t list_size, const char *name,
-					size_t name_len, int type)
+					size_t name_len)
 {
 	const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
 	const size_t total_len = prefix_len + name_len + 1;
@@ -7244,8 +7245,9 @@ static size_t ocfs2_xattr_security_list(struct dentry *dentry, char *list,
 	return total_len;
 }
 
-static int ocfs2_xattr_security_get(struct dentry *dentry, const char *name,
-				    void *buffer, size_t size, int type)
+static int ocfs2_xattr_security_get(const struct xattr_handler *handler,
+				    struct dentry *dentry, const char *name,
+				    void *buffer, size_t size)
 {
 	if (strcmp(name, "") == 0)
 		return -EINVAL;
@@ -7253,8 +7255,9 @@ static int ocfs2_xattr_security_get(struct dentry *dentry, const char *name,
 			       name, buffer, size);
 }
 
-static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name,
-		const void *value, size_t size, int flags, int type)
+static int ocfs2_xattr_security_set(const struct xattr_handler *handler,
+				    struct dentry *dentry, const char *name,
+				    const void *value, size_t size, int flags)
 {
 	if (strcmp(name, "") == 0)
 		return -EINVAL;
@@ -7319,9 +7322,10 @@ const struct xattr_handler ocfs2_xattr_security_handler = {
 /*
  * 'trusted' attributes support
  */
-static size_t ocfs2_xattr_trusted_list(struct dentry *dentry, char *list,
+static size_t ocfs2_xattr_trusted_list(const struct xattr_handler *handler,
+				       struct dentry *dentry, char *list,
 				       size_t list_size, const char *name,
-				       size_t name_len, int type)
+				       size_t name_len)
 {
 	const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
 	const size_t total_len = prefix_len + name_len + 1;
@@ -7337,8 +7341,9 @@ static size_t ocfs2_xattr_trusted_list(struct dentry *dentry, char *list,
 	return total_len;
 }
 
-static int ocfs2_xattr_trusted_get(struct dentry *dentry, const char *name,
-		void *buffer, size_t size, int type)
+static int ocfs2_xattr_trusted_get(const struct xattr_handler *handler,
+				   struct dentry *dentry, const char *name,
+				   void *buffer, size_t size)
 {
 	if (strcmp(name, "") == 0)
 		return -EINVAL;
@@ -7346,8 +7351,9 @@ static int ocfs2_xattr_trusted_get(struct dentry *dentry, const char *name,
 			       name, buffer, size);
 }
 
-static int ocfs2_xattr_trusted_set(struct dentry *dentry, const char *name,
-		const void *value, size_t size, int flags, int type)
+static int ocfs2_xattr_trusted_set(const struct xattr_handler *handler,
+				   struct dentry *dentry, const char *name,
+				   const void *value, size_t size, int flags)
 {
 	if (strcmp(name, "") == 0)
 		return -EINVAL;
@@ -7366,9 +7372,10 @@ const struct xattr_handler ocfs2_xattr_trusted_handler = {
 /*
  * 'user' attributes support
  */
-static size_t ocfs2_xattr_user_list(struct dentry *dentry, char *list,
+static size_t ocfs2_xattr_user_list(const struct xattr_handler *handler,
+				    struct dentry *dentry, char *list,
 				    size_t list_size, const char *name,
-				    size_t name_len, int type)
+				    size_t name_len)
 {
 	const size_t prefix_len = XATTR_USER_PREFIX_LEN;
 	const size_t total_len = prefix_len + name_len + 1;
@@ -7385,8 +7392,9 @@ static size_t ocfs2_xattr_user_list(struct dentry *dentry, char *list,
 	return total_len;
 }
 
-static int ocfs2_xattr_user_get(struct dentry *dentry, const char *name,
-		void *buffer, size_t size, int type)
+static int ocfs2_xattr_user_get(const struct xattr_handler *handler,
+				struct dentry *dentry, const char *name,
+				void *buffer, size_t size)
 {
 	struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
 
@@ -7398,8 +7406,9 @@ static int ocfs2_xattr_user_get(struct dentry *dentry, const char *name,
 			       buffer, size);
 }
 
-static int ocfs2_xattr_user_set(struct dentry *dentry, const char *name,
-		const void *value, size_t size, int flags, int type)
+static int ocfs2_xattr_user_set(const struct xattr_handler *handler,
+				struct dentry *dentry, const char *name,
+				const void *value, size_t size, int flags)
 {
 	struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
 
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 2fdca61..4adde1e 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -762,8 +762,9 @@ posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl,
 EXPORT_SYMBOL (posix_acl_to_xattr);
 
 static int
-posix_acl_xattr_get(struct dentry *dentry, const char *name,
-		void *value, size_t size, int type)
+posix_acl_xattr_get(const struct xattr_handler *handler,
+		    struct dentry *dentry, const char *name,
+		    void *value, size_t size)
 {
 	struct posix_acl *acl;
 	int error;
@@ -775,7 +776,7 @@ posix_acl_xattr_get(struct dentry *dentry, const char *name,
 	if (d_is_symlink(dentry))
 		return -EOPNOTSUPP;
 
-	acl = get_acl(d_backing_inode(dentry), type);
+	acl = get_acl(d_backing_inode(dentry), handler->flags);
 	if (IS_ERR(acl))
 		return PTR_ERR(acl);
 	if (acl == NULL)
@@ -788,8 +789,9 @@ posix_acl_xattr_get(struct dentry *dentry, const char *name,
 }
 
 static int
-posix_acl_xattr_set(struct dentry *dentry, const char *name,
-		const void *value, size_t size, int flags, int type)
+posix_acl_xattr_set(const struct xattr_handler *handler,
+		    struct dentry *dentry, const char *name,
+		    const void *value, size_t size, int flags)
 {
 	struct inode *inode = d_backing_inode(dentry);
 	struct posix_acl *acl = NULL;
@@ -802,7 +804,7 @@ posix_acl_xattr_set(struct dentry *dentry, const char *name,
 	if (!inode->i_op->set_acl)
 		return -EOPNOTSUPP;
 
-	if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
+	if (handler->flags == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
 		return value ? -EACCES : 0;
 	if (!inode_owner_or_capable(inode))
 		return -EPERM;
@@ -819,27 +821,23 @@ posix_acl_xattr_set(struct dentry *dentry, const char *name,
 		}
 	}
 
-	ret = inode->i_op->set_acl(inode, acl, type);
+	ret = inode->i_op->set_acl(inode, acl, handler->flags);
 out:
 	posix_acl_release(acl);
 	return ret;
 }
 
 static size_t
-posix_acl_xattr_list(struct dentry *dentry, char *list, size_t list_size,
-		const char *name, size_t name_len, int type)
+posix_acl_xattr_list(const struct xattr_handler *handler,
+		     struct dentry *dentry, char *list, size_t list_size,
+		     const char *name, size_t name_len)
 {
-	const char *xname;
+	const char *xname = handler->prefix;
 	size_t size;
 
 	if (!IS_POSIXACL(d_backing_inode(dentry)))
 		return 0;
 
-	if (type == ACL_TYPE_ACCESS)
-		xname = POSIX_ACL_XATTR_ACCESS;
-	else
-		xname = POSIX_ACL_XATTR_DEFAULT;
-
 	size = strlen(xname) + 1;
 	if (list && size <= list_size)
 		memcpy(list, xname, size);
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index e87f9b5..66b26fd 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -778,7 +778,7 @@ reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer,
 	if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1)
 		return -EOPNOTSUPP;
 
-	return handler->get(dentry, name, buffer, size, handler->flags);
+	return handler->get(handler, dentry, name, buffer, size);
 }
 
 /*
@@ -797,7 +797,7 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 	if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1)
 		return -EOPNOTSUPP;
 
-	return handler->set(dentry, name, value, size, flags, handler->flags);
+	return handler->set(handler, dentry, name, value, size, flags);
 }
 
 /*
@@ -814,7 +814,7 @@ int reiserfs_removexattr(struct dentry *dentry, const char *name)
 	if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1)
 		return -EOPNOTSUPP;
 
-	return handler->set(dentry, name, NULL, 0, XATTR_REPLACE, handler->flags);
+	return handler->set(handler, dentry, name, NULL, 0, XATTR_REPLACE);
 }
 
 struct listxattr_buf {
@@ -842,14 +842,14 @@ static int listxattr_filler(struct dir_context *ctx, const char *name,
 		if (!handler)	/* Unsupported xattr name */
 			return 0;
 		if (b->buf) {
-			size = handler->list(b->dentry, b->buf + b->pos,
-					 b->size, name, namelen,
-					 handler->flags);
+			size = handler->list(handler, b->dentry,
+					     b->buf + b->pos, b->size, name,
+					     namelen);
 			if (size > b->size)
 				return -ERANGE;
 		} else {
-			size = handler->list(b->dentry, NULL, 0, name,
-					     namelen, handler->flags);
+			size = handler->list(handler, b->dentry,
+					     NULL, 0, name, namelen);
 		}
 
 		b->pos += size;
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index 9a3b061..ac659af 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -9,8 +9,8 @@
 #include <linux/uaccess.h>
 
 static int
-security_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
-		int handler_flags)
+security_get(const struct xattr_handler *handler, struct dentry *dentry,
+	     const char *name, void *buffer, size_t size)
 {
 	if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX))
 		return -EINVAL;
@@ -22,8 +22,8 @@ security_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
 }
 
 static int
-security_set(struct dentry *dentry, const char *name, const void *buffer,
-	     size_t size, int flags, int handler_flags)
+security_set(const struct xattr_handler *handler, struct dentry *dentry,
+	     const char *name, const void *buffer, size_t size, int flags)
 {
 	if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX))
 		return -EINVAL;
@@ -34,8 +34,9 @@ security_set(struct dentry *dentry, const char *name, const void *buffer,
 	return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags);
 }
 
-static size_t security_list(struct dentry *dentry, char *list, size_t list_len,
-			    const char *name, size_t namelen, int handler_flags)
+static size_t security_list(const struct xattr_handler *handler,
+			    struct dentry *dentry, char *list, size_t list_len,
+			    const char *name, size_t namelen)
 {
 	const size_t len = namelen + 1;
 
diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c
index e4f1343..a338adf 100644
--- a/fs/reiserfs/xattr_trusted.c
+++ b/fs/reiserfs/xattr_trusted.c
@@ -8,8 +8,8 @@
 #include <linux/uaccess.h>
 
 static int
-trusted_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
-	    int handler_flags)
+trusted_get(const struct xattr_handler *handler, struct dentry *dentry,
+	    const char *name, void *buffer, size_t size)
 {
 	if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX))
 		return -EINVAL;
@@ -21,8 +21,8 @@ trusted_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
 }
 
 static int
-trusted_set(struct dentry *dentry, const char *name, const void *buffer,
-	    size_t size, int flags, int handler_flags)
+trusted_set(const struct xattr_handler *handler, struct dentry *dentry,
+	    const char *name, const void *buffer, size_t size, int flags)
 {
 	if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX))
 		return -EINVAL;
@@ -33,8 +33,9 @@ trusted_set(struct dentry *dentry, const char *name, const void *buffer,
 	return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags);
 }
 
-static size_t trusted_list(struct dentry *dentry, char *list, size_t list_size,
-			   const char *name, size_t name_len, int handler_flags)
+static size_t trusted_list(const struct xattr_handler *handler,
+			   struct dentry *dentry, char *list, size_t list_size,
+			   const char *name, size_t name_len)
 {
 	const size_t len = name_len + 1;
 
diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c
index d0b08d3..39c9667 100644
--- a/fs/reiserfs/xattr_user.c
+++ b/fs/reiserfs/xattr_user.c
@@ -7,8 +7,8 @@
 #include <linux/uaccess.h>
 
 static int
-user_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
-	 int handler_flags)
+user_get(const struct xattr_handler *handler, struct dentry *dentry,
+	 const char *name, void *buffer, size_t size)
 {
 
 	if (strlen(name) < sizeof(XATTR_USER_PREFIX))
@@ -19,8 +19,8 @@ user_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
 }
 
 static int
-user_set(struct dentry *dentry, const char *name, const void *buffer,
-	 size_t size, int flags, int handler_flags)
+user_set(const struct xattr_handler *handler, struct dentry *dentry,
+	 const char *name, const void *buffer, size_t size, int flags)
 {
 	if (strlen(name) < sizeof(XATTR_USER_PREFIX))
 		return -EINVAL;
@@ -30,8 +30,9 @@ user_set(struct dentry *dentry, const char *name, const void *buffer,
 	return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags);
 }
 
-static size_t user_list(struct dentry *dentry, char *list, size_t list_size,
-			const char *name, size_t name_len, int handler_flags)
+static size_t user_list(const struct xattr_handler *handler,
+			struct dentry *dentry, char *list, size_t list_size,
+			const char *name, size_t name_len)
 {
 	const size_t len = name_len + 1;
 
diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c
index e5e0ddf..4ae1e4f 100644
--- a/fs/squashfs/xattr.c
+++ b/fs/squashfs/xattr.c
@@ -68,8 +68,8 @@ ssize_t squashfs_listxattr(struct dentry *d, char *buffer,
 		name_size = le16_to_cpu(entry.size);
 		handler = squashfs_xattr_handler(le16_to_cpu(entry.type));
 		if (handler)
-			prefix_size = handler->list(d, buffer, rest, NULL,
-				name_size, handler->flags);
+			prefix_size = handler->list(handler, d, buffer, rest,
+						    NULL, name_size);
 		if (prefix_size) {
 			if (buffer) {
 				if (prefix_size + name_size + 1 > rest) {
@@ -215,16 +215,18 @@ failed:
 /*
  * User namespace support
  */
-static size_t squashfs_user_list(struct dentry *d, char *list, size_t list_size,
-	const char *name, size_t name_len, int type)
+static size_t squashfs_user_list(const struct xattr_handler *handler,
+				 struct dentry *d, char *list, size_t list_size,
+				 const char *name, size_t name_len)
 {
 	if (list && XATTR_USER_PREFIX_LEN <= list_size)
 		memcpy(list, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
 	return XATTR_USER_PREFIX_LEN;
 }
 
-static int squashfs_user_get(struct dentry *d, const char *name, void *buffer,
-	size_t size, int type)
+static int squashfs_user_get(const struct xattr_handler *handler,
+			     struct dentry *d, const char *name, void *buffer,
+			     size_t size)
 {
 	if (name[0] == '\0')
 		return  -EINVAL;
@@ -242,8 +244,10 @@ static const struct xattr_handler squashfs_xattr_user_handler = {
 /*
  * Trusted namespace support
  */
-static size_t squashfs_trusted_list(struct dentry *d, char *list,
-	size_t list_size, const char *name, size_t name_len, int type)
+static size_t squashfs_trusted_list(const struct xattr_handler *handler,
+				    struct dentry *d, char *list,
+				    size_t list_size, const char *name,
+				    size_t name_len)
 {
 	if (!capable(CAP_SYS_ADMIN))
 		return 0;
@@ -253,8 +257,9 @@ static size_t squashfs_trusted_list(struct dentry *d, char *list,
 	return XATTR_TRUSTED_PREFIX_LEN;
 }
 
-static int squashfs_trusted_get(struct dentry *d, const char *name,
-	void *buffer, size_t size, int type)
+static int squashfs_trusted_get(const struct xattr_handler *handler,
+				struct dentry *d, const char *name,
+				void *buffer, size_t size)
 {
 	if (name[0] == '\0')
 		return  -EINVAL;
@@ -272,16 +277,19 @@ static const struct xattr_handler squashfs_xattr_trusted_handler = {
 /*
  * Security namespace support
  */
-static size_t squashfs_security_list(struct dentry *d, char *list,
-	size_t list_size, const char *name, size_t name_len, int type)
+static size_t squashfs_security_list(const struct xattr_handler *handler,
+				     struct dentry *d, char *list,
+				     size_t list_size, const char *name,
+				     size_t name_len)
 {
 	if (list && XATTR_SECURITY_PREFIX_LEN <= list_size)
 		memcpy(list, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN);
 	return XATTR_SECURITY_PREFIX_LEN;
 }
 
-static int squashfs_security_get(struct dentry *d, const char *name,
-	void *buffer, size_t size, int type)
+static int squashfs_security_get(const struct xattr_handler *handler,
+				 struct dentry *d, const char *name,
+				 void *buffer, size_t size)
 {
 	if (name[0] == '\0')
 		return  -EINVAL;
diff --git a/fs/xattr.c b/fs/xattr.c
index 072fee1..44377b6 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -720,7 +720,7 @@ generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t s
 	handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
 	if (!handler)
 		return -EOPNOTSUPP;
-	return handler->get(dentry, name, buffer, size, handler->flags);
+	return handler->get(handler, dentry, name, buffer, size);
 }
 
 /*
@@ -735,15 +735,15 @@ generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
 
 	if (!buffer) {
 		for_each_xattr_handler(handlers, handler) {
-			size += handler->list(dentry, NULL, 0, NULL, 0,
-					      handler->flags);
+			size += handler->list(handler, dentry, NULL, 0,
+					      NULL, 0);
 		}
 	} else {
 		char *buf = buffer;
 
 		for_each_xattr_handler(handlers, handler) {
-			size = handler->list(dentry, buf, buffer_size,
-					     NULL, 0, handler->flags);
+			size = handler->list(handler, dentry, buf, buffer_size,
+					     NULL, 0);
 			if (size > buffer_size)
 				return -ERANGE;
 			buf += size;
@@ -767,7 +767,7 @@ generic_setxattr(struct dentry *dentry, const char *name, const void *value, siz
 	handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
 	if (!handler)
 		return -EOPNOTSUPP;
-	return handler->set(dentry, name, value, size, flags, handler->flags);
+	return handler->set(handler, dentry, name, value, size, flags);
 }
 
 /*
@@ -782,8 +782,7 @@ generic_removexattr(struct dentry *dentry, const char *name)
 	handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
 	if (!handler)
 		return -EOPNOTSUPP;
-	return handler->set(dentry, name, NULL, 0,
-			    XATTR_REPLACE, handler->flags);
+	return handler->set(handler, dentry, name, NULL, 0, XATTR_REPLACE);
 }
 
 EXPORT_SYMBOL(generic_getxattr);
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index c0368151..b1850e1 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -32,9 +32,10 @@
 
 
 static int
-xfs_xattr_get(struct dentry *dentry, const char *name,
-		void *value, size_t size, int xflags)
+xfs_xattr_get(const struct xattr_handler *handler, struct dentry *dentry,
+		const char *name, void *value, size_t size)
 {
+	int xflags = handler->flags;
 	struct xfs_inode *ip = XFS_I(d_inode(dentry));
 	int error, asize = size;
 
@@ -54,9 +55,10 @@ xfs_xattr_get(struct dentry *dentry, const char *name,
 }
 
 static int
-xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
-		size_t size, int flags, int xflags)
+xfs_xattr_set(const struct xattr_handler *handler, struct dentry *dentry,
+		const char *name, const void *value, size_t size, int flags)
 {
+	int xflags = handler->flags;
 	struct xfs_inode *ip = XFS_I(d_inode(dentry));
 
 	if (strcmp(name, "") == 0)
-- 
cgit v1.1


From e409de992e3ea3674393465f07cc71c948edd87a Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Sun, 4 Oct 2015 19:18:52 +0200
Subject: 9p: xattr simplifications

Now that the xattr handler is passed to the xattr handler operations, we
can use the same get and set operations for the user, trusted, and security
xattr namespaces.  In those namespaces, we can access the full attribute
name by "reattaching" the name prefix the vfs has skipped for us.  Add a
xattr_full_name helper to make this obvious in the code.

For the "system.posix_acl_access" and "system.posix_acl_default"
attributes, handler->prefix is the full attribute name; the suffix is the
empty string.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Eric Van Hensbergen <ericvh@gmail.com>
Cc: Ron Minnich <rminnich@sandia.gov>
Cc: Latchesar Ionkov <lucho@ionkov.net>
Cc: v9fs-developer@lists.sourceforge.net
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/9p/Makefile         |  5 +--
 fs/9p/acl.c            | 51 +++----------------------------
 fs/9p/xattr.c          | 42 ++++++++++++++++++++++++++
 fs/9p/xattr.h          |  3 --
 fs/9p/xattr_security.c | 82 --------------------------------------------------
 fs/9p/xattr_trusted.c  | 82 --------------------------------------------------
 fs/9p/xattr_user.c     | 82 --------------------------------------------------
 fs/xattr.c             | 24 +++++++++++++++
 8 files changed, 72 insertions(+), 299 deletions(-)
 delete mode 100644 fs/9p/xattr_security.c
 delete mode 100644 fs/9p/xattr_trusted.c
 delete mode 100644 fs/9p/xattr_user.c

(limited to 'fs')

diff --git a/fs/9p/Makefile b/fs/9p/Makefile
index ff7be98..9619cca 100644
--- a/fs/9p/Makefile
+++ b/fs/9p/Makefile
@@ -10,10 +10,7 @@ obj-$(CONFIG_9P_FS) := 9p.o
 	vfs_dentry.o \
 	v9fs.o \
 	fid.o  \
-	xattr.o \
-	xattr_user.o \
-	xattr_trusted.o
+	xattr.o
 
 9p-$(CONFIG_9P_FSCACHE) += cache.o
 9p-$(CONFIG_9P_FS_POSIX_ACL) += acl.o
-9p-$(CONFIG_9P_FS_SECURITY) += xattr_security.o
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index e6fe824..a7e2889 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -212,31 +212,12 @@ int v9fs_acl_mode(struct inode *dir, umode_t *modep,
 	return 0;
 }
 
-static int v9fs_remote_get_acl(struct dentry *dentry, const char *name,
-			       void *buffer, size_t size, int type)
-{
-	char *full_name;
-
-	switch (type) {
-	case ACL_TYPE_ACCESS:
-		full_name =  POSIX_ACL_XATTR_ACCESS;
-		break;
-	case ACL_TYPE_DEFAULT:
-		full_name = POSIX_ACL_XATTR_DEFAULT;
-		break;
-	default:
-		BUG();
-	}
-	return v9fs_xattr_get(dentry, full_name, buffer, size);
-}
-
 static int v9fs_xattr_get_acl(const struct xattr_handler *handler,
 			      struct dentry *dentry, const char *name,
 			      void *buffer, size_t size)
 {
 	struct v9fs_session_info *v9ses;
 	struct posix_acl *acl;
-	int type = handler->flags;
 	int error;
 
 	if (strcmp(name, "") != 0)
@@ -247,9 +228,9 @@ static int v9fs_xattr_get_acl(const struct xattr_handler *handler,
 	 * We allow set/get/list of acl when access=client is not specified
 	 */
 	if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
-		return v9fs_remote_get_acl(dentry, name, buffer, size, type);
+		return v9fs_xattr_get(dentry, handler->prefix, buffer, size);
 
-	acl = v9fs_get_cached_acl(d_inode(dentry), type);
+	acl = v9fs_get_cached_acl(d_inode(dentry), handler->flags);
 	if (IS_ERR(acl))
 		return PTR_ERR(acl);
 	if (acl == NULL)
@@ -260,26 +241,6 @@ static int v9fs_xattr_get_acl(const struct xattr_handler *handler,
 	return error;
 }
 
-static int v9fs_remote_set_acl(struct dentry *dentry, const char *name,
-			      const void *value, size_t size,
-			      int flags, int type)
-{
-	char *full_name;
-
-	switch (type) {
-	case ACL_TYPE_ACCESS:
-		full_name =  POSIX_ACL_XATTR_ACCESS;
-		break;
-	case ACL_TYPE_DEFAULT:
-		full_name = POSIX_ACL_XATTR_DEFAULT;
-		break;
-	default:
-		BUG();
-	}
-	return v9fs_xattr_set(dentry, full_name, value, size, flags);
-}
-
-
 static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
 			      struct dentry *dentry, const char *name,
 			      const void *value, size_t size, int flags)
@@ -298,8 +259,8 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
 	 * xattr value. We leave it to the server to validate
 	 */
 	if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
-		return v9fs_remote_set_acl(dentry, name,
-					   value, size, flags, handler->flags);
+		return v9fs_xattr_set(dentry, handler->prefix, value, size,
+				      flags);
 
 	if (S_ISLNK(inode->i_mode))
 		return -EOPNOTSUPP;
@@ -320,7 +281,6 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
 
 	switch (handler->flags) {
 	case ACL_TYPE_ACCESS:
-		name = POSIX_ACL_XATTR_ACCESS;
 		if (acl) {
 			umode_t mode = inode->i_mode;
 			retval = posix_acl_equiv_mode(acl, &mode);
@@ -351,7 +311,6 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
 		}
 		break;
 	case ACL_TYPE_DEFAULT:
-		name = POSIX_ACL_XATTR_DEFAULT;
 		if (!S_ISDIR(inode->i_mode)) {
 			retval = acl ? -EINVAL : 0;
 			goto err_out;
@@ -360,7 +319,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
 	default:
 		BUG();
 	}
-	retval = v9fs_xattr_set(dentry, name, value, size, flags);
+	retval = v9fs_xattr_set(dentry, handler->prefix, value, size, flags);
 	if (!retval)
 		set_cached_acl(inode, handler->flags, acl);
 err_out:
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index 0cf44b6..e3d026a 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -137,6 +137,48 @@ ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
 	return v9fs_xattr_get(dentry, NULL, buffer, buffer_size);
 }
 
+static int v9fs_xattr_handler_get(const struct xattr_handler *handler,
+				  struct dentry *dentry, const char *name,
+				  void *buffer, size_t size)
+{
+	const char *full_name = xattr_full_name(handler, name);
+
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	return v9fs_xattr_get(dentry, full_name, buffer, size);
+}
+
+static int v9fs_xattr_handler_set(const struct xattr_handler *handler,
+				  struct dentry *dentry, const char *name,
+				  const void *value, size_t size, int flags)
+{
+	const char *full_name = xattr_full_name(handler, name);
+
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	return v9fs_xattr_set(dentry, full_name, value, size, flags);
+}
+
+static struct xattr_handler v9fs_xattr_user_handler = {
+	.prefix	= XATTR_USER_PREFIX,
+	.get	= v9fs_xattr_handler_get,
+	.set	= v9fs_xattr_handler_set,
+};
+
+static struct xattr_handler v9fs_xattr_trusted_handler = {
+	.prefix	= XATTR_TRUSTED_PREFIX,
+	.get	= v9fs_xattr_handler_get,
+	.set	= v9fs_xattr_handler_set,
+};
+
+#ifdef CONFIG_9P_FS_SECURITY
+static struct xattr_handler v9fs_xattr_security_handler = {
+	.prefix	= XATTR_SECURITY_PREFIX,
+	.get	= v9fs_xattr_handler_get,
+	.set	= v9fs_xattr_handler_set,
+};
+#endif
+
 const struct xattr_handler *v9fs_xattr_handlers[] = {
 	&v9fs_xattr_user_handler,
 	&v9fs_xattr_trusted_handler,
diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h
index d3e2ea3..c63c3be 100644
--- a/fs/9p/xattr.h
+++ b/fs/9p/xattr.h
@@ -19,9 +19,6 @@
 #include <net/9p/client.h>
 
 extern const struct xattr_handler *v9fs_xattr_handlers[];
-extern struct xattr_handler v9fs_xattr_user_handler;
-extern struct xattr_handler v9fs_xattr_trusted_handler;
-extern struct xattr_handler v9fs_xattr_security_handler;
 extern const struct xattr_handler v9fs_xattr_acl_access_handler;
 extern const struct xattr_handler v9fs_xattr_acl_default_handler;
 
diff --git a/fs/9p/xattr_security.c b/fs/9p/xattr_security.c
deleted file mode 100644
index c0a470a..0000000
--- a/fs/9p/xattr_security.c
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright IBM Corporation, 2010
- * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2.1 of the GNU Lesser General Public License
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- */
-
-
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/fs.h>
-#include <linux/slab.h>
-#include "xattr.h"
-
-static int v9fs_xattr_security_get(const struct xattr_handler *handler,
-			struct dentry *dentry, const char *name,
-			void *buffer, size_t size)
-{
-	int retval;
-	char *full_name;
-	size_t name_len;
-	size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
-
-	if (name == NULL)
-		return -EINVAL;
-
-	if (strcmp(name, "") == 0)
-		return -EINVAL;
-
-	name_len = strlen(name);
-	full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
-	if (!full_name)
-		return -ENOMEM;
-	memcpy(full_name, XATTR_SECURITY_PREFIX, prefix_len);
-	memcpy(full_name+prefix_len, name, name_len);
-	full_name[prefix_len + name_len] = '\0';
-
-	retval = v9fs_xattr_get(dentry, full_name, buffer, size);
-	kfree(full_name);
-	return retval;
-}
-
-static int v9fs_xattr_security_set(const struct xattr_handler *handler,
-			struct dentry *dentry, const char *name,
-			const void *value, size_t size, int flags)
-{
-	int retval;
-	char *full_name;
-	size_t name_len;
-	size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
-
-	if (name == NULL)
-		return -EINVAL;
-
-	if (strcmp(name, "") == 0)
-		return -EINVAL;
-
-	name_len = strlen(name);
-	full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
-	if (!full_name)
-		return -ENOMEM;
-	memcpy(full_name, XATTR_SECURITY_PREFIX, prefix_len);
-	memcpy(full_name + prefix_len, name, name_len);
-	full_name[prefix_len + name_len] = '\0';
-
-	retval = v9fs_xattr_set(dentry, full_name, value, size, flags);
-	kfree(full_name);
-	return retval;
-}
-
-struct xattr_handler v9fs_xattr_security_handler = {
-	.prefix	= XATTR_SECURITY_PREFIX,
-	.get	= v9fs_xattr_security_get,
-	.set	= v9fs_xattr_security_set,
-};
diff --git a/fs/9p/xattr_trusted.c b/fs/9p/xattr_trusted.c
deleted file mode 100644
index b888a4e..0000000
--- a/fs/9p/xattr_trusted.c
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright IBM Corporation, 2010
- * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2.1 of the GNU Lesser General Public License
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- */
-
-
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/fs.h>
-#include <linux/slab.h>
-#include "xattr.h"
-
-static int v9fs_xattr_trusted_get(const struct xattr_handler *handler,
-			struct dentry *dentry, const char *name,
-			void *buffer, size_t size)
-{
-	int retval;
-	char *full_name;
-	size_t name_len;
-	size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
-
-	if (name == NULL)
-		return -EINVAL;
-
-	if (strcmp(name, "") == 0)
-		return -EINVAL;
-
-	name_len = strlen(name);
-	full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
-	if (!full_name)
-		return -ENOMEM;
-	memcpy(full_name, XATTR_TRUSTED_PREFIX, prefix_len);
-	memcpy(full_name+prefix_len, name, name_len);
-	full_name[prefix_len + name_len] = '\0';
-
-	retval = v9fs_xattr_get(dentry, full_name, buffer, size);
-	kfree(full_name);
-	return retval;
-}
-
-static int v9fs_xattr_trusted_set(const struct xattr_handler *handler,
-			struct dentry *dentry, const char *name,
-			const void *value, size_t size, int flags)
-{
-	int retval;
-	char *full_name;
-	size_t name_len;
-	size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
-
-	if (name == NULL)
-		return -EINVAL;
-
-	if (strcmp(name, "") == 0)
-		return -EINVAL;
-
-	name_len = strlen(name);
-	full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
-	if (!full_name)
-		return -ENOMEM;
-	memcpy(full_name, XATTR_TRUSTED_PREFIX, prefix_len);
-	memcpy(full_name + prefix_len, name, name_len);
-	full_name[prefix_len + name_len] = '\0';
-
-	retval = v9fs_xattr_set(dentry, full_name, value, size, flags);
-	kfree(full_name);
-	return retval;
-}
-
-struct xattr_handler v9fs_xattr_trusted_handler = {
-	.prefix	= XATTR_TRUSTED_PREFIX,
-	.get	= v9fs_xattr_trusted_get,
-	.set	= v9fs_xattr_trusted_set,
-};
diff --git a/fs/9p/xattr_user.c b/fs/9p/xattr_user.c
deleted file mode 100644
index 06f136c..0000000
--- a/fs/9p/xattr_user.c
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright IBM Corporation, 2010
- * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2.1 of the GNU Lesser General Public License
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- */
-
-
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/fs.h>
-#include <linux/slab.h>
-#include "xattr.h"
-
-static int v9fs_xattr_user_get(const struct xattr_handler *handler,
-			struct dentry *dentry, const char *name,
-			void *buffer, size_t size)
-{
-	int retval;
-	char *full_name;
-	size_t name_len;
-	size_t prefix_len = XATTR_USER_PREFIX_LEN;
-
-	if (name == NULL)
-		return -EINVAL;
-
-	if (strcmp(name, "") == 0)
-		return -EINVAL;
-
-	name_len = strlen(name);
-	full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
-	if (!full_name)
-		return -ENOMEM;
-	memcpy(full_name, XATTR_USER_PREFIX, prefix_len);
-	memcpy(full_name+prefix_len, name, name_len);
-	full_name[prefix_len + name_len] = '\0';
-
-	retval = v9fs_xattr_get(dentry, full_name, buffer, size);
-	kfree(full_name);
-	return retval;
-}
-
-static int v9fs_xattr_user_set(const struct xattr_handler *handler,
-			struct dentry *dentry, const char *name,
-			const void *value, size_t size, int flags)
-{
-	int retval;
-	char *full_name;
-	size_t name_len;
-	size_t prefix_len = XATTR_USER_PREFIX_LEN;
-
-	if (name == NULL)
-		return -EINVAL;
-
-	if (strcmp(name, "") == 0)
-		return -EINVAL;
-
-	name_len = strlen(name);
-	full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
-	if (!full_name)
-		return -ENOMEM;
-	memcpy(full_name, XATTR_USER_PREFIX, prefix_len);
-	memcpy(full_name + prefix_len, name, name_len);
-	full_name[prefix_len + name_len] = '\0';
-
-	retval = v9fs_xattr_set(dentry, full_name, value, size, flags);
-	kfree(full_name);
-	return retval;
-}
-
-struct xattr_handler v9fs_xattr_user_handler = {
-	.prefix	= XATTR_USER_PREFIX,
-	.get	= v9fs_xattr_user_get,
-	.set	= v9fs_xattr_user_set,
-};
diff --git a/fs/xattr.c b/fs/xattr.c
index 44377b6..9b932b9 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -790,6 +790,30 @@ EXPORT_SYMBOL(generic_listxattr);
 EXPORT_SYMBOL(generic_setxattr);
 EXPORT_SYMBOL(generic_removexattr);
 
+/**
+ * xattr_full_name  -  Compute full attribute name from suffix
+ *
+ * @handler:	handler of the xattr_handler operation
+ * @name:	name passed to the xattr_handler operation
+ *
+ * The get and set xattr handler operations are called with the remainder of
+ * the attribute name after skipping the handler's prefix: for example, "foo"
+ * is passed to the get operation of a handler with prefix "user." to get
+ * attribute "user.foo".  The full name is still "there" in the name though.
+ *
+ * Note: the list xattr handler operation when called from the vfs is passed a
+ * NULL name; some file systems use this operation internally, with varying
+ * semantics.
+ */
+const char *xattr_full_name(const struct xattr_handler *handler,
+			    const char *name)
+{
+	size_t prefix_len = strlen(handler->prefix);
+
+	return name - prefix_len;
+}
+EXPORT_SYMBOL(xattr_full_name);
+
 /*
  * Allocate new xattr and copy in the value; but leave the name to callers.
  */
-- 
cgit v1.1


From 0ddaf72c1da8b84bc2896e1997ef3f1a22765194 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Sun, 4 Oct 2015 19:18:53 +0200
Subject: squashfs: xattr simplifications

Now that the xattr handler is passed to the xattr handler operations, we
have access to the attribute name prefix, so simplify the squashfs xattr
handlers a bit.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Phillip Lougher <phillip@squashfs.org.uk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/squashfs/xattr.c | 90 ++++++++++++++++++-----------------------------------
 1 file changed, 31 insertions(+), 59 deletions(-)

(limited to 'fs')

diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c
index 4ae1e4f..6a4cc34 100644
--- a/fs/squashfs/xattr.c
+++ b/fs/squashfs/xattr.c
@@ -212,96 +212,68 @@ failed:
 }
 
 
-/*
- * User namespace support
- */
-static size_t squashfs_user_list(const struct xattr_handler *handler,
-				 struct dentry *d, char *list, size_t list_size,
-				 const char *name, size_t name_len)
+static size_t squashfs_xattr_handler_list(const struct xattr_handler *handler,
+					  struct dentry *d, char *list,
+					  size_t list_size, const char *name,
+					  size_t name_len)
 {
-	if (list && XATTR_USER_PREFIX_LEN <= list_size)
-		memcpy(list, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
-	return XATTR_USER_PREFIX_LEN;
+	int len = strlen(handler->prefix);
+
+	if (list && len <= list_size)
+		memcpy(list, handler->prefix, len);
+	return len;
 }
 
-static int squashfs_user_get(const struct xattr_handler *handler,
-			     struct dentry *d, const char *name, void *buffer,
-			     size_t size)
+static int squashfs_xattr_handler_get(const struct xattr_handler *handler,
+				      struct dentry *d, const char *name,
+				      void *buffer, size_t size)
 {
 	if (name[0] == '\0')
 		return  -EINVAL;
 
-	return squashfs_xattr_get(d_inode(d), SQUASHFS_XATTR_USER, name,
+	return squashfs_xattr_get(d_inode(d), handler->flags, name,
 		buffer, size);
 }
 
+/*
+ * User namespace support
+ */
 static const struct xattr_handler squashfs_xattr_user_handler = {
 	.prefix	= XATTR_USER_PREFIX,
-	.list	= squashfs_user_list,
-	.get	= squashfs_user_get
+	.flags	= SQUASHFS_XATTR_USER,
+	.list	= squashfs_xattr_handler_list,
+	.get	= squashfs_xattr_handler_get
 };
 
 /*
  * Trusted namespace support
  */
-static size_t squashfs_trusted_list(const struct xattr_handler *handler,
-				    struct dentry *d, char *list,
-				    size_t list_size, const char *name,
-				    size_t name_len)
+static size_t squashfs_trusted_xattr_handler_list(const struct xattr_handler *handler,
+						  struct dentry *d, char *list,
+						  size_t list_size, const char *name,
+						  size_t name_len)
 {
 	if (!capable(CAP_SYS_ADMIN))
 		return 0;
-
-	if (list && XATTR_TRUSTED_PREFIX_LEN <= list_size)
-		memcpy(list, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
-	return XATTR_TRUSTED_PREFIX_LEN;
-}
-
-static int squashfs_trusted_get(const struct xattr_handler *handler,
-				struct dentry *d, const char *name,
-				void *buffer, size_t size)
-{
-	if (name[0] == '\0')
-		return  -EINVAL;
-
-	return squashfs_xattr_get(d_inode(d), SQUASHFS_XATTR_TRUSTED, name,
-		buffer, size);
+	return squashfs_xattr_handler_list(handler, d, list, list_size, name,
+					   name_len);
 }
 
 static const struct xattr_handler squashfs_xattr_trusted_handler = {
 	.prefix	= XATTR_TRUSTED_PREFIX,
-	.list	= squashfs_trusted_list,
-	.get	= squashfs_trusted_get
+	.flags	= SQUASHFS_XATTR_TRUSTED,
+	.list	= squashfs_trusted_xattr_handler_list,
+	.get	= squashfs_xattr_handler_get
 };
 
 /*
  * Security namespace support
  */
-static size_t squashfs_security_list(const struct xattr_handler *handler,
-				     struct dentry *d, char *list,
-				     size_t list_size, const char *name,
-				     size_t name_len)
-{
-	if (list && XATTR_SECURITY_PREFIX_LEN <= list_size)
-		memcpy(list, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN);
-	return XATTR_SECURITY_PREFIX_LEN;
-}
-
-static int squashfs_security_get(const struct xattr_handler *handler,
-				 struct dentry *d, const char *name,
-				 void *buffer, size_t size)
-{
-	if (name[0] == '\0')
-		return  -EINVAL;
-
-	return squashfs_xattr_get(d_inode(d), SQUASHFS_XATTR_SECURITY, name,
-		buffer, size);
-}
-
 static const struct xattr_handler squashfs_xattr_security_handler = {
 	.prefix	= XATTR_SECURITY_PREFIX,
-	.list	= squashfs_security_list,
-	.get	= squashfs_security_get
+	.flags	= SQUASHFS_XATTR_SECURITY,
+	.list	= squashfs_xattr_handler_list,
+	.get	= squashfs_xattr_handler_get
 };
 
 static const struct xattr_handler *squashfs_xattr_handler(int type)
-- 
cgit v1.1


From 29608d208b3619b3b508a6871622db789611d8a3 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Sun, 4 Oct 2015 19:18:54 +0200
Subject: f2fs: xattr simplifications

Now that the xattr handler is passed to the xattr handler operations, we
have access to the attribute name prefix, so simplify
f2fs_xattr_generic_list.

Also, f2fs_xattr_advise_list is only ever called for
f2fs_xattr_advise_handler; there is no need to double check for that.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Changman Lee <cm224.lee@samsung.com>
Cc: Chao Yu <chao2.yu@samsung.com>
Cc: linux-f2fs-devel@lists.sourceforge.net
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/f2fs/xattr.c | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index e643173..862368a 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -30,33 +30,27 @@ static size_t f2fs_xattr_generic_list(const struct xattr_handler *handler,
 		const char *name, size_t len)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
-	int total_len, prefix_len = 0;
-	const char *prefix = NULL;
+	int total_len, prefix_len;
 
 	switch (handler->flags) {
 	case F2FS_XATTR_INDEX_USER:
 		if (!test_opt(sbi, XATTR_USER))
 			return -EOPNOTSUPP;
-		prefix = XATTR_USER_PREFIX;
-		prefix_len = XATTR_USER_PREFIX_LEN;
 		break;
 	case F2FS_XATTR_INDEX_TRUSTED:
 		if (!capable(CAP_SYS_ADMIN))
 			return -EPERM;
-		prefix = XATTR_TRUSTED_PREFIX;
-		prefix_len = XATTR_TRUSTED_PREFIX_LEN;
 		break;
 	case F2FS_XATTR_INDEX_SECURITY:
-		prefix = XATTR_SECURITY_PREFIX;
-		prefix_len = XATTR_SECURITY_PREFIX_LEN;
 		break;
 	default:
 		return -EINVAL;
 	}
 
+	prefix_len = strlen(handler->prefix);
 	total_len = prefix_len + len + 1;
 	if (list && total_len <= list_size) {
-		memcpy(list, prefix, prefix_len);
+		memcpy(list, handler->prefix, prefix_len);
 		memcpy(list + prefix_len, name, len);
 		list[prefix_len + len] = '\0';
 	}
@@ -123,9 +117,6 @@ static size_t f2fs_xattr_advise_list(const struct xattr_handler *handler,
 	const char *xname = F2FS_SYSTEM_ADVISE_PREFIX;
 	size_t size;
 
-	if (handler->flags != F2FS_XATTR_INDEX_ADVISE)
-		return 0;
-
 	size = strlen(xname) + 1;
 	if (list && size <= list_size)
 		memcpy(list, xname, size);
-- 
cgit v1.1