From 537d8f93805ace30ce097736d3aac041931274b1 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 29 Aug 2014 20:49:51 -0400
Subject: ext4: convert ext4_dx_find_entry() to use the ERR_PTR convention

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/namei.c | 46 ++++++++++++++++++++--------------------------
 1 file changed, 20 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 90a3cdc..1421ec1 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -270,8 +270,7 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
 				 __u32 *start_hash);
 static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
 		const struct qstr *d_name,
-		struct ext4_dir_entry_2 **res_dir,
-		int *err);
+		struct ext4_dir_entry_2 **res_dir);
 static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 			     struct inode *inode);
 
@@ -1258,17 +1257,13 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
 		goto restart;
 	}
 	if (is_dx(dir)) {
-		bh = ext4_dx_find_entry(dir, d_name, res_dir, &err);
+		bh = ext4_dx_find_entry(dir, d_name, res_dir);
 		/*
 		 * On success, or if the error was file not found,
 		 * return.  Otherwise, fall back to doing a search the
 		 * old fashioned way.
 		 */
-		if (err == -ENOENT)
-			return NULL;
-		if (err && err != ERR_BAD_DX_DIR)
-			return ERR_PTR(err);
-		if (bh)
+		if (!IS_ERR(bh) || PTR_ERR(bh) != ERR_BAD_DX_DIR)
 			return bh;
 		dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
 			       "falling back\n"));
@@ -1366,34 +1361,32 @@ cleanup_and_exit:
 }
 
 static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
-		       struct ext4_dir_entry_2 **res_dir, int *err)
+		       struct ext4_dir_entry_2 **res_dir)
 {
 	struct super_block * sb = dir->i_sb;
 	struct dx_hash_info	hinfo;
 	struct dx_frame frames[2], *frame;
 	struct buffer_head *bh;
 	ext4_lblk_t block;
-	int retval;
+	int err = 0, retval;
 
-	if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
-		return NULL;
+	frame = dx_probe(d_name, dir, &hinfo, frames, &err);
+	if (err)
+		return ERR_PTR(err);
 	do {
 		block = dx_get_block(frame->at);
 		bh = ext4_read_dirblock(dir, block, DIRENT);
-		if (IS_ERR(bh)) {
-			*err = PTR_ERR(bh);
+		if (IS_ERR(bh))
 			goto errout;
-		}
+
 		retval = search_dirblock(bh, dir, d_name,
 					 block << EXT4_BLOCK_SIZE_BITS(sb),
 					 res_dir);
-		if (retval == 1) { 	/* Success! */
-			dx_release(frames);
-			return bh;
-		}
+		if (retval == 1)
+			goto success;
 		brelse(bh);
 		if (retval == -1) {
-			*err = ERR_BAD_DX_DIR;
+			bh = ERR_PTR(ERR_BAD_DX_DIR);
 			goto errout;
 		}
 
@@ -1402,18 +1395,19 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
 					       frames, NULL);
 		if (retval < 0) {
 			ext4_warning(sb,
-			     "error reading index page in directory #%lu",
-			     dir->i_ino);
-			*err = retval;
+			     "error %d reading index page in directory #%lu",
+			     retval, dir->i_ino);
+			bh = ERR_PTR(retval);
 			goto errout;
 		}
 	} while (retval == 1);
 
-	*err = -ENOENT;
+	bh = NULL;
 errout:
 	dxtrace(printk(KERN_DEBUG "%s not found\n", d_name->name));
-	dx_release (frames);
-	return NULL;
+success:
+	dx_release(frames);
+	return bh;
 }
 
 static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
-- 
cgit v1.1


From 1056008226769fe982236c26038a095aeb47714b Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 29 Aug 2014 20:51:32 -0400
Subject: ext4: convert ext4_getblk() to use the ERR_PTR convention

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4.h  |  3 +--
 fs/ext4/inode.c | 51 +++++++++++++++++++++++++--------------------------
 fs/ext4/namei.c |  9 ++++-----
 3 files changed, 30 insertions(+), 33 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b0c225c..8009077 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2086,8 +2086,7 @@ extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
 extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
 
 /* inode.c */
-struct buffer_head *ext4_getblk(handle_t *, struct inode *,
-						ext4_lblk_t, int, int *);
+struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
 struct buffer_head *ext4_bread(handle_t *, struct inode *,
 						ext4_lblk_t, int, int *);
 int ext4_get_block_write(struct inode *inode, sector_t iblock,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3aa26e9..0dfc1cd 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -734,11 +734,11 @@ int ext4_get_block(struct inode *inode, sector_t iblock,
  * `handle' can be NULL if create is zero
  */
 struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
-				ext4_lblk_t block, int create, int *errp)
+				ext4_lblk_t block, int create)
 {
 	struct ext4_map_blocks map;
 	struct buffer_head *bh;
-	int fatal = 0, err;
+	int err;
 
 	J_ASSERT(handle != NULL || create == 0);
 
@@ -747,21 +747,14 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
 	err = ext4_map_blocks(handle, inode, &map,
 			      create ? EXT4_GET_BLOCKS_CREATE : 0);
 
-	/* ensure we send some value back into *errp */
-	*errp = 0;
-
-	if (create && err == 0)
-		err = -ENOSPC;	/* should never happen */
+	if (err == 0)
+		return create ? ERR_PTR(-ENOSPC) : NULL;
 	if (err < 0)
-		*errp = err;
-	if (err <= 0)
-		return NULL;
+		return ERR_PTR(err);
 
 	bh = sb_getblk(inode->i_sb, map.m_pblk);
-	if (unlikely(!bh)) {
-		*errp = -ENOMEM;
-		return NULL;
-	}
+	if (unlikely(!bh))
+		return ERR_PTR(-ENOMEM);
 	if (map.m_flags & EXT4_MAP_NEW) {
 		J_ASSERT(create != 0);
 		J_ASSERT(handle != NULL);
@@ -775,25 +768,26 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
 		 */
 		lock_buffer(bh);
 		BUFFER_TRACE(bh, "call get_create_access");
-		fatal = ext4_journal_get_create_access(handle, bh);
-		if (!fatal && !buffer_uptodate(bh)) {
+		err = ext4_journal_get_create_access(handle, bh);
+		if (unlikely(err)) {
+			unlock_buffer(bh);
+			goto errout;
+		}
+		if (!buffer_uptodate(bh)) {
 			memset(bh->b_data, 0, inode->i_sb->s_blocksize);
 			set_buffer_uptodate(bh);
 		}
 		unlock_buffer(bh);
 		BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
 		err = ext4_handle_dirty_metadata(handle, inode, bh);
-		if (!fatal)
-			fatal = err;
-	} else {
+		if (unlikely(err))
+			goto errout;
+	} else
 		BUFFER_TRACE(bh, "not a new buffer");
-	}
-	if (fatal) {
-		*errp = fatal;
-		brelse(bh);
-		bh = NULL;
-	}
 	return bh;
+errout:
+	brelse(bh);
+	return ERR_PTR(err);
 }
 
 struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
@@ -801,7 +795,12 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
 {
 	struct buffer_head *bh;
 
-	bh = ext4_getblk(handle, inode, block, create, err);
+	*err = 0;
+	bh = ext4_getblk(handle, inode, block, create);
+	if (IS_ERR(bh)) {
+		*err = PTR_ERR(bh);
+		return NULL;
+	}
 	if (!bh)
 		return bh;
 	if (buffer_uptodate(bh))
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 1421ec1..26f114b 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1226,8 +1226,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
 				   buffer */
 	int num = 0;
 	ext4_lblk_t  nblocks;
-	int i, err = 0;
-	int namelen;
+	int i, namelen;
 
 	*res_dir = NULL;
 	sb = dir->i_sb;
@@ -1293,10 +1292,10 @@ restart:
 					break;
 				}
 				num++;
-				bh = ext4_getblk(NULL, dir, b++, 0, &err);
-				if (unlikely(err)) {
+				bh = ext4_getblk(NULL, dir, b++, 0);
+				if (unlikely(IS_ERR(bh))) {
 					if (ra_max == 0)
-						return ERR_PTR(err);
+						return bh;
 					break;
 				}
 				bh_use[ra_max] = bh;
-- 
cgit v1.1


From 1c2150283cae895526d0db3953d13d139f4e7a03 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 29 Aug 2014 20:52:15 -0400
Subject: ext4: convert ext4_bread() to use the ERR_PTR convention

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/dir.c   |  8 +++-----
 fs/ext4/ext4.h  |  3 +--
 fs/ext4/inode.c | 14 ++++----------
 fs/ext4/namei.c | 34 ++++++++++++++++++----------------
 fs/ext4/super.c | 18 ++++++++----------
 5 files changed, 34 insertions(+), 43 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 0bb3f9e..c24143e 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -151,13 +151,11 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
 					&file->f_ra, file,
 					index, 1);
 			file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
-			bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err);
+			bh = ext4_bread(NULL, inode, map.m_lblk, 0);
+			if (IS_ERR(bh))
+				return PTR_ERR(bh);
 		}
 
-		/*
-		 * We ignore I/O errors on directories so users have a chance
-		 * of recovering data when there's a bad sector
-		 */
 		if (!bh) {
 			if (!dir_has_error) {
 				EXT4_ERROR_FILE(file, 0,
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8009077..ca53bce 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2087,8 +2087,7 @@ extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
 
 /* inode.c */
 struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
-struct buffer_head *ext4_bread(handle_t *, struct inode *,
-						ext4_lblk_t, int, int *);
+struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
 int ext4_get_block_write(struct inode *inode, sector_t iblock,
 			 struct buffer_head *bh_result, int create);
 int ext4_get_block(struct inode *inode, sector_t iblock,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0dfc1cd..8aa241a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -791,27 +791,21 @@ errout:
 }
 
 struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
-			       ext4_lblk_t block, int create, int *err)
+			       ext4_lblk_t block, int create)
 {
 	struct buffer_head *bh;
 
-	*err = 0;
 	bh = ext4_getblk(handle, inode, block, create);
-	if (IS_ERR(bh)) {
-		*err = PTR_ERR(bh);
-		return NULL;
-	}
-	if (!bh)
+	if (IS_ERR(bh))
 		return bh;
-	if (buffer_uptodate(bh))
+	if (!bh || buffer_uptodate(bh))
 		return bh;
 	ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh);
 	wait_on_buffer(bh);
 	if (buffer_uptodate(bh))
 		return bh;
 	put_bh(bh);
-	*err = -EIO;
-	return NULL;
+	return ERR_PTR(-EIO);
 }
 
 int ext4_walk_page_buffers(handle_t *handle,
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 26f114b..af13c90 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -53,7 +53,7 @@ static struct buffer_head *ext4_append(handle_t *handle,
 					ext4_lblk_t *block)
 {
 	struct buffer_head *bh;
-	int err = 0;
+	int err;
 
 	if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb &&
 		     ((inode->i_size >> 10) >=
@@ -62,9 +62,9 @@ static struct buffer_head *ext4_append(handle_t *handle,
 
 	*block = inode->i_size >> inode->i_sb->s_blocksize_bits;
 
-	bh = ext4_bread(handle, inode, *block, 1, &err);
-	if (!bh)
-		return ERR_PTR(err);
+	bh = ext4_bread(handle, inode, *block, 1);
+	if (IS_ERR(bh))
+		return bh;
 	inode->i_size += inode->i_sb->s_blocksize;
 	EXT4_I(inode)->i_disksize = inode->i_size;
 	BUFFER_TRACE(bh, "get_write_access");
@@ -94,20 +94,20 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
 {
 	struct buffer_head *bh;
 	struct ext4_dir_entry *dirent;
-	int err = 0, is_dx_block = 0;
+	int is_dx_block = 0;
 
-	bh = ext4_bread(NULL, inode, block, 0, &err);
-	if (!bh) {
-		if (err == 0) {
-			ext4_error_inode(inode, __func__, line, block,
-					       "Directory hole found");
-			return ERR_PTR(-EIO);
-		}
+	bh = ext4_bread(NULL, inode, block, 0);
+	if (IS_ERR(bh)) {
 		__ext4_warning(inode->i_sb, __func__, line,
-			       "error reading directory block "
-			       "(ino %lu, block %lu)", inode->i_ino,
+			       "error %ld reading directory block "
+			       "(ino %lu, block %lu)", PTR_ERR(bh), inode->i_ino,
 			       (unsigned long) block);
-		return ERR_PTR(err);
+
+		return bh;
+	}
+	if (!bh) {
+		ext4_error_inode(inode, __func__, line, block, "Directory hole found");
+		return ERR_PTR(-EIO);
 	}
 	dirent = (struct ext4_dir_entry *) bh->b_data;
 	/* Determine whether or not we have an index block */
@@ -640,7 +640,9 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
 		u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash;
 		struct stats stats;
 		printk("%s%3u:%03u hash %8x/%8x ",levels?"":"   ", i, block, hash, range);
-		if (!(bh = ext4_bread (NULL,dir, block, 0,&err))) continue;
+		bh = ext4_bread(NULL,dir, block, 0);
+		if (!bh || IS_ERR(bh))
+			continue;
 		stats = levels?
 		   dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1):
 		   dx_show_leaf(hinfo, (struct ext4_dir_entry_2 *) bh->b_data, blocksize, 0);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 0b28b36..896e452 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -5305,7 +5305,6 @@ static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
 {
 	struct inode *inode = sb_dqopt(sb)->files[type];
 	ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
-	int err = 0;
 	int offset = off & (sb->s_blocksize - 1);
 	int tocopy;
 	size_t toread;
@@ -5320,9 +5319,9 @@ static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
 	while (toread > 0) {
 		tocopy = sb->s_blocksize - offset < toread ?
 				sb->s_blocksize - offset : toread;
-		bh = ext4_bread(NULL, inode, blk, 0, &err);
-		if (err)
-			return err;
+		bh = ext4_bread(NULL, inode, blk, 0);
+		if (IS_ERR(bh))
+			return PTR_ERR(bh);
 		if (!bh)	/* A hole? */
 			memset(data, 0, tocopy);
 		else
@@ -5343,8 +5342,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
 {
 	struct inode *inode = sb_dqopt(sb)->files[type];
 	ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
-	int err = 0;
-	int offset = off & (sb->s_blocksize - 1);
+	int err, offset = off & (sb->s_blocksize - 1);
 	struct buffer_head *bh;
 	handle_t *handle = journal_current_handle();
 
@@ -5365,14 +5363,16 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
 		return -EIO;
 	}
 
-	bh = ext4_bread(handle, inode, blk, 1, &err);
+	bh = ext4_bread(handle, inode, blk, 1);
+	if (IS_ERR(bh))
+		return PTR_ERR(bh);
 	if (!bh)
 		goto out;
 	BUFFER_TRACE(bh, "get write access");
 	err = ext4_journal_get_write_access(handle, bh);
 	if (err) {
 		brelse(bh);
-		goto out;
+		return err;
 	}
 	lock_buffer(bh);
 	memcpy(bh->b_data+offset, data, len);
@@ -5381,8 +5381,6 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
 	err = ext4_handle_dirty_metadata(handle, NULL, bh);
 	brelse(bh);
 out:
-	if (err)
-		return err;
 	if (inode->i_size < off + len) {
 		i_size_write(inode, off + len);
 		EXT4_I(inode)->i_disksize = inode->i_size;
-- 
cgit v1.1


From dd73b5d5cb675e2aa3b1d4952e208af1546f91c1 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 29 Aug 2014 20:52:17 -0400
Subject: ext4: convert dx_probe() to use the ERR_PTR convention

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/namei.c | 89 +++++++++++++++++++++++----------------------------------
 1 file changed, 35 insertions(+), 54 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index af13c90..e6d5165 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -253,8 +253,7 @@ static unsigned dx_node_limit(struct inode *dir);
 static struct dx_frame *dx_probe(const struct qstr *d_name,
 				 struct inode *dir,
 				 struct dx_hash_info *hinfo,
-				 struct dx_frame *frame,
-				 int *err);
+				 struct dx_frame *frame);
 static void dx_release(struct dx_frame *frames);
 static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
 		       struct dx_hash_info *hinfo, struct dx_map_entry map[]);
@@ -670,29 +669,25 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
  */
 static struct dx_frame *
 dx_probe(const struct qstr *d_name, struct inode *dir,
-	 struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
+	 struct dx_hash_info *hinfo, struct dx_frame *frame_in)
 {
 	unsigned count, indirect;
 	struct dx_entry *at, *entries, *p, *q, *m;
 	struct dx_root *root;
-	struct buffer_head *bh;
 	struct dx_frame *frame = frame_in;
+	struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR);
 	u32 hash;
 
-	frame->bh = NULL;
-	bh = ext4_read_dirblock(dir, 0, INDEX);
-	if (IS_ERR(bh)) {
-		*err = PTR_ERR(bh);
-		goto fail;
-	}
-	root = (struct dx_root *) bh->b_data;
+	frame->bh = ext4_read_dirblock(dir, 0, INDEX);
+	if (IS_ERR(frame->bh))
+		return (struct dx_frame *) frame->bh;
+
+	root = (struct dx_root *) frame->bh->b_data;
 	if (root->info.hash_version != DX_HASH_TEA &&
 	    root->info.hash_version != DX_HASH_HALF_MD4 &&
 	    root->info.hash_version != DX_HASH_LEGACY) {
 		ext4_warning(dir->i_sb, "Unrecognised inode hash code %d",
 			     root->info.hash_version);
-		brelse(bh);
-		*err = ERR_BAD_DX_DIR;
 		goto fail;
 	}
 	hinfo->hash_version = root->info.hash_version;
@@ -706,16 +701,12 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
 	if (root->info.unused_flags & 1) {
 		ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x",
 			     root->info.unused_flags);
-		brelse(bh);
-		*err = ERR_BAD_DX_DIR;
 		goto fail;
 	}
 
 	if ((indirect = root->info.indirect_levels) > 1) {
 		ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x",
 			     root->info.indirect_levels);
-		brelse(bh);
-		*err = ERR_BAD_DX_DIR;
 		goto fail;
 	}
 
@@ -725,27 +716,21 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
 	if (dx_get_limit(entries) != dx_root_limit(dir,
 						   root->info.info_length)) {
 		ext4_warning(dir->i_sb, "dx entry: limit != root limit");
-		brelse(bh);
-		*err = ERR_BAD_DX_DIR;
 		goto fail;
 	}
 
 	dxtrace(printk("Look up %x", hash));
-	while (1)
-	{
+	while (1) {
 		count = dx_get_count(entries);
 		if (!count || count > dx_get_limit(entries)) {
 			ext4_warning(dir->i_sb,
 				     "dx entry: no count or count > limit");
-			brelse(bh);
-			*err = ERR_BAD_DX_DIR;
-			goto fail2;
+			goto fail;
 		}
 
 		p = entries + 1;
 		q = entries + count - 1;
-		while (p <= q)
-		{
+		while (p <= q) {
 			m = p + (q - p)/2;
 			dxtrace(printk("."));
 			if (dx_get_hash(m) > hash)
@@ -754,8 +739,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
 				p = m + 1;
 		}
 
-		if (0) // linear search cross check
-		{
+		if (0) { // linear search cross check
 			unsigned n = count - 1;
 			at = entries;
 			while (n--)
@@ -772,38 +756,35 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
 
 		at = p - 1;
 		dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
-		frame->bh = bh;
 		frame->entries = entries;
 		frame->at = at;
-		if (!indirect--) return frame;
-		bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX);
-		if (IS_ERR(bh)) {
-			*err = PTR_ERR(bh);
-			goto fail2;
+		if (!indirect--)
+			return frame;
+		frame++;
+		frame->bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX);
+		if (IS_ERR(frame->bh)) {
+			ret_err = (struct dx_frame *) frame->bh;
+			frame->bh = NULL;
+			goto fail;
 		}
-		entries = ((struct dx_node *) bh->b_data)->entries;
+		entries = ((struct dx_node *) frame->bh->b_data)->entries;
 
 		if (dx_get_limit(entries) != dx_node_limit (dir)) {
 			ext4_warning(dir->i_sb,
 				     "dx entry: limit != node limit");
-			brelse(bh);
-			*err = ERR_BAD_DX_DIR;
-			goto fail2;
+			goto fail;
 		}
-		frame++;
-		frame->bh = NULL;
 	}
-fail2:
+fail:
 	while (frame >= frame_in) {
 		brelse(frame->bh);
 		frame--;
 	}
-fail:
-	if (*err == ERR_BAD_DX_DIR)
+	if (ret_err == ERR_PTR(ERR_BAD_DX_DIR))
 		ext4_warning(dir->i_sb,
 			     "Corrupt dir inode %lu, running e2fsck is "
 			     "recommended.", dir->i_ino);
-	return NULL;
+	return ret_err;
 }
 
 static void dx_release (struct dx_frame *frames)
@@ -989,9 +970,9 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
 	}
 	hinfo.hash = start_hash;
 	hinfo.minor_hash = 0;
-	frame = dx_probe(NULL, dir, &hinfo, frames, &err);
-	if (!frame)
-		return err;
+	frame = dx_probe(NULL, dir, &hinfo, frames);
+	if (IS_ERR(frame))
+		return PTR_ERR(frame);
 
 	/* Add '.' and '..' from the htree header */
 	if (!start_hash && !start_minor_hash) {
@@ -1369,11 +1350,11 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
 	struct dx_frame frames[2], *frame;
 	struct buffer_head *bh;
 	ext4_lblk_t block;
-	int err = 0, retval;
+	int retval;
 
-	frame = dx_probe(d_name, dir, &hinfo, frames, &err);
-	if (err)
-		return ERR_PTR(err);
+	frame = dx_probe(d_name, dir, &hinfo, frames);
+	if (IS_ERR(frame))
+		return (struct buffer_head *) frame;
 	do {
 		block = dx_get_block(frame->at);
 		bh = ext4_read_dirblock(dir, block, DIRENT);
@@ -1977,9 +1958,9 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 	struct ext4_dir_entry_2 *de;
 	int err;
 
-	frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
-	if (!frame)
-		return err;
+	frame = dx_probe(&dentry->d_name, dir, &hinfo, frames);
+	if (IS_ERR(frame))
+		return PTR_ERR(frame);
 	entries = frame->entries;
 	at = frame->at;
 	bh = ext4_read_dirblock(dir, dx_get_block(frame->at), DIRENT);
-- 
cgit v1.1


From f8b3b59d4d561368cf8c92d50218fc0d5be7cb46 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 29 Aug 2014 20:52:18 -0400
Subject: ext4: convert do_split() to use the ERR_PTR convention

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/namei.c | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index e6d5165..dec92b6 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1509,7 +1509,7 @@ static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize)
  */
 static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
 			struct buffer_head **bh,struct dx_frame *frame,
-			struct dx_hash_info *hinfo, int *error)
+			struct dx_hash_info *hinfo)
 {
 	unsigned blocksize = dir->i_sb->s_blocksize;
 	unsigned count, continued;
@@ -1532,8 +1532,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
 	if (IS_ERR(bh2)) {
 		brelse(*bh);
 		*bh = NULL;
-		*error = PTR_ERR(bh2);
-		return NULL;
+		return (struct ext4_dir_entry_2 *) bh2;
 	}
 
 	BUFFER_TRACE(*bh, "get_write_access");
@@ -1593,8 +1592,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
 	dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));
 
 	/* Which block gets the new entry? */
-	if (hinfo->hash >= hash2)
-	{
+	if (hinfo->hash >= hash2) {
 		swap(*bh, bh2);
 		de = de2;
 	}
@@ -1614,8 +1612,7 @@ journal_error:
 	brelse(bh2);
 	*bh = NULL;
 	ext4_std_error(dir->i_sb, err);
-	*error = err;
-	return NULL;
+	return ERR_PTR(err);
 }
 
 int ext4_find_dest_de(struct inode *dir, struct inode *inode,
@@ -1838,8 +1835,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 	ext4_handle_dirty_dx_node(handle, dir, frame->bh);
 	ext4_handle_dirty_dirent_node(handle, dir, bh);
 
-	de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
-	if (!de) {
+	de = do_split(handle,dir, &bh, frame, &hinfo);
+	if (IS_ERR(de)) {
 		/*
 		 * Even if the block split failed, we have to properly write
 		 * out all the changes we did so far. Otherwise we can end up
@@ -1847,7 +1844,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 		 */
 		ext4_mark_inode_dirty(handle, dir);
 		dx_release(frames);
-		return retval;
+		return PTR_ERR(de);
 	}
 	dx_release(frames);
 
@@ -2071,9 +2068,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 			goto cleanup;
 		}
 	}
-	de = do_split(handle, dir, &bh, frame, &hinfo, &err);
-	if (!de)
+	de = do_split(handle, dir, &bh, frame, &hinfo);
+	if (IS_ERR(de)) {
+		err = PTR_ERR(de);
 		goto cleanup;
+	}
 	err = add_dirent_to_buf(handle, dentry, inode, de, bh);
 	goto cleanup;
 
-- 
cgit v1.1


From 52c826db6d4b638677683c79e6c465b99074be74 Mon Sep 17 00:00:00 2001
From: Wang Shilong <wshilong@ddn.com>
Date: Fri, 29 Aug 2014 23:20:44 -0400
Subject: ext4: remove a duplicate call in ext4_init_new_dir()

ext4_journal_get_write_access() has just been called in ext4_append()
calling it again here is duplicated.

Signed-off-by: Wang Shilong <wshilong@ddn.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/namei.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index dec92b6..51705f8 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2378,10 +2378,6 @@ static int ext4_init_new_dir(handle_t *handle, struct inode *dir,
 	dir_block = ext4_append(handle, inode, &block);
 	if (IS_ERR(dir_block))
 		return PTR_ERR(dir_block);
-	BUFFER_TRACE(dir_block, "get_write_access");
-	err = ext4_journal_get_write_access(handle, dir_block);
-	if (err)
-		goto out;
 	de = (struct ext4_dir_entry_2 *)dir_block->b_data;
 	ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0);
 	set_nlink(inode, 2);
-- 
cgit v1.1


From ee124d2746250786b306952bb8955d3171fa8e69 Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Sat, 30 Aug 2014 23:34:06 -0400
Subject: ext4: use ext4_update_i_disksize instead of opencoded ones

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/inode.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 8aa241a..cc95dca 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2651,10 +2651,7 @@ static int ext4_da_write_end(struct file *file,
 	if (copied && new_i_size > EXT4_I(inode)->i_disksize) {
 		if (ext4_has_inline_data(inode) ||
 		    ext4_da_should_update_i_disksize(page, end)) {
-			down_write(&EXT4_I(inode)->i_data_sem);
-			if (new_i_size > EXT4_I(inode)->i_disksize)
-				EXT4_I(inode)->i_disksize = new_i_size;
-			up_write(&EXT4_I(inode)->i_data_sem);
+			ext4_update_i_disksize(inode, new_i_size);
 			/* We need to mark inode dirty even if
 			 * new_i_size is less that inode->i_size
 			 * bu greater than i_disksize.(hint delalloc)
-- 
cgit v1.1


From f8fb4f415034baeed983ca2fb0f51bd74d7370b0 Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Sat, 30 Aug 2014 23:50:56 -0400
Subject: ext4: use ext4_ext_next_allocated_block instead of mext_next_extent

This allows us to make mext_next_extent static and potentially get rid
of it.

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4.h        |  2 --
 fs/ext4/extents.c     | 16 +++++++---------
 fs/ext4/move_extent.c |  2 +-
 3 files changed, 8 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index ca53bce..420c9be 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2753,8 +2753,6 @@ extern void ext4_double_up_write_data_sem(struct inode *orig_inode,
 extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
 			     __u64 start_orig, __u64 start_donor,
 			     __u64 len, __u64 *moved_len);
-extern int mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
-			    struct ext4_extent **extent);
 
 /* page-io.c */
 extern int __init ext4_init_pageio(void);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 74292a7..1b76834 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -5304,7 +5304,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
 	struct ext4_ext_path *path;
 	int ret = 0, depth;
 	struct ext4_extent *extent;
-	ext4_lblk_t stop_block, current_block;
+	ext4_lblk_t stop_block;
 	ext4_lblk_t ex_start, ex_end;
 
 	/* Let path point to the last extent */
@@ -5365,17 +5365,15 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
 					 (unsigned long) start);
 			return -EIO;
 		}
-
-		current_block = le32_to_cpu(extent->ee_block);
-		if (start > current_block) {
+		if (start > le32_to_cpu(extent->ee_block)) {
 			/* Hole, move to the next extent */
-			ret = mext_next_extent(inode, path, &extent);
-			if (ret != 0) {
+			if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) {
+				path[depth].p_ext++;
+			} else {
+				start = ext4_ext_next_allocated_block(path);
 				ext4_ext_drop_refs(path);
 				kfree(path);
-				if (ret == 1)
-					ret = 0;
-				break;
+				continue;
 			}
 		}
 		ret = ext4_ext_shift_path_extents(path, shift, inode,
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 671a74b..123a51b 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -76,7 +76,7 @@ copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest)
  * ext4_ext_path structure refers to the last extent, or a negative error
  * value on failure.
  */
-int
+static int
 mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
 		      struct ext4_extent **extent)
 {
-- 
cgit v1.1


From fcf6b1b729bcd23f2b49a84fb33ffbb44712ee6a Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Sat, 30 Aug 2014 23:52:19 -0400
Subject: ext4: refactor ext4_move_extents code base

ext4_move_extents is too complex for review. It has duplicate almost
each function available in the rest of other codebase. It has useless
artificial restriction orig_offset == donor_offset. But in fact logic
of ext4_move_extents is very simple:

Iterate extents one by one (similar to ext4_fill_fiemap_extents)
   ->Iterate each page covered extent (similar to generic_perform_write)
     ->swap extents for covered by page (can be shared with IOC_MOVE_DATA)

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4.h        |   5 +
 fs/ext4/extents.c     | 234 +++++++++++-
 fs/ext4/move_extent.c | 990 ++++++--------------------------------------------
 3 files changed, 338 insertions(+), 891 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 420c9be..cf3ad75 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2740,10 +2740,15 @@ extern int ext4_find_delalloc_range(struct inode *inode,
 				    ext4_lblk_t lblk_start,
 				    ext4_lblk_t lblk_end);
 extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
+extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path);
 extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 			__u64 start, __u64 len);
 extern int ext4_ext_precache(struct inode *inode);
 extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
+extern int ext4_swap_extents(handle_t *handle, struct inode *inode1,
+				struct inode *inode2, ext4_lblk_t lblk1,
+			     ext4_lblk_t lblk2,  ext4_lblk_t count,
+			     int mark_unwritten,int *err);
 
 /* move_extent.c */
 extern void ext4_double_down_write_data_sem(struct inode *first,
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 1b76834..73d9ae9 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -291,6 +291,19 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
 	return size;
 }
 
+static inline int
+ext4_force_split_extent_at(handle_t *handle, struct inode *inode,
+			   struct ext4_ext_path *path, ext4_lblk_t lblk,
+			   int nofail)
+{
+	int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext);
+
+	return ext4_split_extent_at(handle, inode, path, lblk, unwritten ?
+			EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0,
+			EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO |
+			(nofail ? EXT4_GET_BLOCKS_METADATA_NOFAIL:0));
+}
+
 /*
  * Calculate the number of metadata blocks needed
  * to allocate @blocks
@@ -1559,7 +1572,7 @@ found_extent:
  * allocated block. Thus, index entries have to be consistent
  * with leaves.
  */
-static ext4_lblk_t
+ext4_lblk_t
 ext4_ext_next_allocated_block(struct ext4_ext_path *path)
 {
 	int depth;
@@ -2854,24 +2867,14 @@ again:
 		 */
 		if (end >= ee_block &&
 		    end < ee_block + ext4_ext_get_actual_len(ex) - 1) {
-			int split_flag = 0;
-
-			if (ext4_ext_is_unwritten(ex))
-				split_flag = EXT4_EXT_MARK_UNWRIT1 |
-					     EXT4_EXT_MARK_UNWRIT2;
-
 			/*
 			 * Split the extent in two so that 'end' is the last
 			 * block in the first new extent. Also we should not
 			 * fail removing space due to ENOSPC so try to use
 			 * reserved block if that happens.
 			 */
-			err = ext4_split_extent_at(handle, inode, path,
-					end + 1, split_flag,
-					EXT4_EX_NOCACHE |
-					EXT4_GET_BLOCKS_PRE_IO |
-					EXT4_GET_BLOCKS_METADATA_NOFAIL);
-
+			err = ext4_force_split_extent_at(handle, inode, path,
+							 end + 1, 1);
 			if (err < 0)
 				goto out;
 		}
@@ -5506,3 +5509,208 @@ out_mutex:
 	mutex_unlock(&inode->i_mutex);
 	return ret;
 }
+
+/**
+ * ext4_swap_extents - Swap extents between two inodes
+ *
+ * @inode1:	First inode
+ * @inode2:	Second inode
+ * @lblk1:	Start block for first inode
+ * @lblk2:	Start block for second inode
+ * @count:	Number of blocks to swap
+ * @mark_unwritten: Mark second inode's extents as unwritten after swap
+ * @erp:	Pointer to save error value
+ *
+ * This helper routine does exactly what is promise "swap extents". All other
+ * stuff such as page-cache locking consistency, bh mapping consistency or
+ * extent's data copying must be performed by caller.
+ * Locking:
+ * 		i_mutex is held for both inodes
+ * 		i_data_sem is locked for write for both inodes
+ * Assumptions:
+ *		All pages from requested range are locked for both inodes
+ */
+int
+ext4_swap_extents(handle_t *handle, struct inode *inode1,
+		     struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2,
+		  ext4_lblk_t count, int unwritten, int *erp)
+{
+	struct ext4_ext_path *path1 = NULL;
+	struct ext4_ext_path *path2 = NULL;
+	int replaced_count = 0;
+
+	BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem));
+	BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem));
+	BUG_ON(!mutex_is_locked(&inode1->i_mutex));
+	BUG_ON(!mutex_is_locked(&inode1->i_mutex));
+
+	*erp = ext4_es_remove_extent(inode1, lblk1, count);
+	if (*erp)
+		return 0;
+	*erp = ext4_es_remove_extent(inode2, lblk2, count);
+	if (*erp)
+		return 0;
+
+	while (count) {
+		struct ext4_extent *ex1, *ex2, tmp_ex;
+		ext4_lblk_t e1_blk, e2_blk;
+		int e1_len, e2_len, len;
+		int split = 0;
+
+		path1 = ext4_ext_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE);
+		if (IS_ERR(path1)) {
+			*erp = PTR_ERR(path1);
+			break;
+		}
+		path2 = ext4_ext_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE);
+		if (IS_ERR(path2)) {
+			*erp = PTR_ERR(path2);
+			break;
+		}
+		ex1 = path1[path1->p_depth].p_ext;
+		ex2 = path2[path2->p_depth].p_ext;
+		/* Do we have somthing to swap ? */
+		if (unlikely(!ex2 || !ex1))
+			break;
+
+		e1_blk = le32_to_cpu(ex1->ee_block);
+		e2_blk = le32_to_cpu(ex2->ee_block);
+		e1_len = ext4_ext_get_actual_len(ex1);
+		e2_len = ext4_ext_get_actual_len(ex2);
+
+		/* Hole handling */
+		if (!in_range(lblk1, e1_blk, e1_len) ||
+		    !in_range(lblk2, e2_blk, e2_len)) {
+			ext4_lblk_t next1, next2;
+
+			/* if hole after extent, then go to next extent */
+			next1 = ext4_ext_next_allocated_block(path1);
+			next2 = ext4_ext_next_allocated_block(path2);
+			/* If hole before extent, then shift to that extent */
+			if (e1_blk > lblk1)
+				next1 = e1_blk;
+			if (e2_blk > lblk2)
+				next2 = e1_blk;
+			/* Do we have something to swap */
+			if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS)
+				break;
+			/* Move to the rightest boundary */
+			len = next1 - lblk1;
+			if (len < next2 - lblk2)
+				len = next2 - lblk2;
+			if (len > count)
+				len = count;
+			lblk1 += len;
+			lblk2 += len;
+			count -= len;
+			goto repeat;
+		}
+
+		/* Prepare left boundary */
+		if (e1_blk < lblk1) {
+			split = 1;
+			*erp = ext4_force_split_extent_at(handle, inode1,
+						path1, lblk1, 0);
+			if (*erp)
+				break;
+		}
+		if (e2_blk < lblk2) {
+			split = 1;
+			*erp = ext4_force_split_extent_at(handle, inode2,
+						path2,  lblk2, 0);
+			if (*erp)
+				break;
+		}
+		/* ext4_split_extent_at() may retult in leaf extent split,
+		 * path must to be revalidated. */
+		if (split)
+			goto repeat;
+
+		/* Prepare right boundary */
+		len = count;
+		if (len > e1_blk + e1_len - lblk1)
+			len = e1_blk + e1_len - lblk1;
+		if (len > e2_blk + e2_len - lblk2)
+			len = e2_blk + e2_len - lblk2;
+
+		if (len != e1_len) {
+			split = 1;
+			*erp = ext4_force_split_extent_at(handle, inode1,
+						path1, lblk1 + len, 0);
+			if (*erp)
+				break;
+		}
+		if (len != e2_len) {
+			split = 1;
+			*erp = ext4_force_split_extent_at(handle, inode2,
+						path2, lblk2 + len, 0);
+			if (*erp)
+				break;
+		}
+		/* ext4_split_extent_at() may retult in leaf extent split,
+		 * path must to be revalidated. */
+		if (split)
+			goto repeat;
+
+		BUG_ON(e2_len != e1_len);
+		*erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth);
+		if (*erp)
+			break;
+		*erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth);
+		if (*erp)
+			break;
+
+		/* Both extents are fully inside boundaries. Swap it now */
+		tmp_ex = *ex1;
+		ext4_ext_store_pblock(ex1, ext4_ext_pblock(ex2));
+		ext4_ext_store_pblock(ex2, ext4_ext_pblock(&tmp_ex));
+		ex1->ee_len = cpu_to_le16(e2_len);
+		ex2->ee_len = cpu_to_le16(e1_len);
+		if (unwritten)
+			ext4_ext_mark_unwritten(ex2);
+		if (ext4_ext_is_unwritten(&tmp_ex))
+			ext4_ext_mark_unwritten(ex1);
+
+		ext4_ext_try_to_merge(handle, inode2, path2, ex2);
+		ext4_ext_try_to_merge(handle, inode1, path1, ex1);
+		*erp = ext4_ext_dirty(handle, inode2, path2 +
+				      path2->p_depth);
+		if (*erp)
+			break;
+		*erp = ext4_ext_dirty(handle, inode1, path1 +
+				      path1->p_depth);
+		/*
+		 * Looks scarry ah..? second inode already points to new blocks,
+		 * and it was successfully dirtied. But luckily error may happen
+		 * only due to journal error, so full transaction will be
+		 * aborted anyway.
+		 */
+		if (*erp)
+			break;
+		lblk1 += len;
+		lblk2 += len;
+		replaced_count += len;
+		count -= len;
+
+	repeat:
+		if (path1) {
+			ext4_ext_drop_refs(path1);
+			kfree(path1);
+			path1 = NULL;
+		}
+		if (path2) {
+			ext4_ext_drop_refs(path2);
+			kfree(path2);
+			path2 = NULL;
+		}
+	}
+	if (path1) {
+		ext4_ext_drop_refs(path1);
+		kfree(path1);
+	}
+	if (path2) {
+		ext4_ext_drop_refs(path2);
+		kfree(path2);
+	}
+	return replaced_count;
+}
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 123a51b..c8f895b 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -49,101 +49,6 @@ get_ext_path(struct inode *inode, ext4_lblk_t lblock,
 }
 
 /**
- * copy_extent_status - Copy the extent's initialization status
- *
- * @src:	an extent for getting initialize status
- * @dest:	an extent to be set the status
- */
-static void
-copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest)
-{
-	if (ext4_ext_is_unwritten(src))
-		ext4_ext_mark_unwritten(dest);
-	else
-		dest->ee_len = cpu_to_le16(ext4_ext_get_actual_len(dest));
-}
-
-/**
- * mext_next_extent - Search for the next extent and set it to "extent"
- *
- * @inode:	inode which is searched
- * @path:	this will obtain data for the next extent
- * @extent:	pointer to the next extent we have just gotten
- *
- * Search the next extent in the array of ext4_ext_path structure (@path)
- * and set it to ext4_extent structure (@extent). In addition, the member of
- * @path (->p_ext) also points the next extent. Return 0 on success, 1 if
- * ext4_ext_path structure refers to the last extent, or a negative error
- * value on failure.
- */
-static int
-mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
-		      struct ext4_extent **extent)
-{
-	struct ext4_extent_header *eh;
-	int ppos, leaf_ppos = path->p_depth;
-
-	ppos = leaf_ppos;
-	if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) {
-		/* leaf block */
-		*extent = ++path[ppos].p_ext;
-		path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
-		return 0;
-	}
-
-	while (--ppos >= 0) {
-		if (EXT_LAST_INDEX(path[ppos].p_hdr) >
-		    path[ppos].p_idx) {
-			int cur_ppos = ppos;
-
-			/* index block */
-			path[ppos].p_idx++;
-			path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
-			if (path[ppos+1].p_bh)
-				brelse(path[ppos+1].p_bh);
-			path[ppos+1].p_bh =
-				sb_bread(inode->i_sb, path[ppos].p_block);
-			if (!path[ppos+1].p_bh)
-				return -EIO;
-			path[ppos+1].p_hdr =
-				ext_block_hdr(path[ppos+1].p_bh);
-
-			/* Halfway index block */
-			while (++cur_ppos < leaf_ppos) {
-				path[cur_ppos].p_idx =
-					EXT_FIRST_INDEX(path[cur_ppos].p_hdr);
-				path[cur_ppos].p_block =
-					ext4_idx_pblock(path[cur_ppos].p_idx);
-				if (path[cur_ppos+1].p_bh)
-					brelse(path[cur_ppos+1].p_bh);
-				path[cur_ppos+1].p_bh = sb_bread(inode->i_sb,
-					path[cur_ppos].p_block);
-				if (!path[cur_ppos+1].p_bh)
-					return -EIO;
-				path[cur_ppos+1].p_hdr =
-					ext_block_hdr(path[cur_ppos+1].p_bh);
-			}
-
-			path[leaf_ppos].p_ext = *extent = NULL;
-
-			eh = path[leaf_ppos].p_hdr;
-			if (le16_to_cpu(eh->eh_entries) == 0)
-				/* empty leaf is found */
-				return -ENODATA;
-
-			/* leaf block */
-			path[leaf_ppos].p_ext = *extent =
-				EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr);
-			path[leaf_ppos].p_block =
-					ext4_ext_pblock(path[leaf_ppos].p_ext);
-			return 0;
-		}
-	}
-	/* We found the last extent */
-	return 1;
-}
-
-/**
  * ext4_double_down_write_data_sem - Acquire two inodes' write lock
  *                                   of i_data_sem
  *
@@ -178,417 +83,6 @@ ext4_double_up_write_data_sem(struct inode *orig_inode,
 }
 
 /**
- * mext_insert_across_blocks - Insert extents across leaf block
- *
- * @handle:		journal handle
- * @orig_inode:		original inode
- * @o_start:		first original extent to be changed
- * @o_end:		last original extent to be changed
- * @start_ext:		first new extent to be inserted
- * @new_ext:		middle of new extent to be inserted
- * @end_ext:		last new extent to be inserted
- *
- * Allocate a new leaf block and insert extents into it. Return 0 on success,
- * or a negative error value on failure.
- */
-static int
-mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
-		struct ext4_extent *o_start, struct ext4_extent *o_end,
-		struct ext4_extent *start_ext, struct ext4_extent *new_ext,
-		struct ext4_extent *end_ext)
-{
-	struct ext4_ext_path *orig_path = NULL;
-	ext4_lblk_t eblock = 0;
-	int new_flag = 0;
-	int end_flag = 0;
-	int err = 0;
-
-	if (start_ext->ee_len && new_ext->ee_len && end_ext->ee_len) {
-		if (o_start == o_end) {
-
-			/*       start_ext   new_ext    end_ext
-			 * donor |---------|-----------|--------|
-			 * orig  |------------------------------|
-			 */
-			end_flag = 1;
-		} else {
-
-			/*       start_ext   new_ext   end_ext
-			 * donor |---------|----------|---------|
-			 * orig  |---------------|--------------|
-			 */
-			o_end->ee_block = end_ext->ee_block;
-			o_end->ee_len = end_ext->ee_len;
-			ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext));
-		}
-
-		o_start->ee_len = start_ext->ee_len;
-		eblock = le32_to_cpu(start_ext->ee_block);
-		new_flag = 1;
-
-	} else if (start_ext->ee_len && new_ext->ee_len &&
-		   !end_ext->ee_len && o_start == o_end) {
-
-		/*	 start_ext	new_ext
-		 * donor |--------------|---------------|
-		 * orig  |------------------------------|
-		 */
-		o_start->ee_len = start_ext->ee_len;
-		eblock = le32_to_cpu(start_ext->ee_block);
-		new_flag = 1;
-
-	} else if (!start_ext->ee_len && new_ext->ee_len &&
-		   end_ext->ee_len && o_start == o_end) {
-
-		/*	  new_ext	end_ext
-		 * donor |--------------|---------------|
-		 * orig  |------------------------------|
-		 */
-		o_end->ee_block = end_ext->ee_block;
-		o_end->ee_len = end_ext->ee_len;
-		ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext));
-
-		/*
-		 * Set 0 to the extent block if new_ext was
-		 * the first block.
-		 */
-		if (new_ext->ee_block)
-			eblock = le32_to_cpu(new_ext->ee_block);
-
-		new_flag = 1;
-	} else {
-		ext4_debug("ext4 move extent: Unexpected insert case\n");
-		return -EIO;
-	}
-
-	if (new_flag) {
-		err = get_ext_path(orig_inode, eblock, &orig_path);
-		if (err)
-			goto out;
-
-		if (ext4_ext_insert_extent(handle, orig_inode,
-					orig_path, new_ext, 0))
-			goto out;
-	}
-
-	if (end_flag) {
-		err = get_ext_path(orig_inode,
-				le32_to_cpu(end_ext->ee_block) - 1, &orig_path);
-		if (err)
-			goto out;
-
-		if (ext4_ext_insert_extent(handle, orig_inode,
-					   orig_path, end_ext, 0))
-			goto out;
-	}
-out:
-	if (orig_path) {
-		ext4_ext_drop_refs(orig_path);
-		kfree(orig_path);
-	}
-
-	return err;
-
-}
-
-/**
- * mext_insert_inside_block - Insert new extent to the extent block
- *
- * @o_start:		first original extent to be moved
- * @o_end:		last original extent to be moved
- * @start_ext:		first new extent to be inserted
- * @new_ext:		middle of new extent to be inserted
- * @end_ext:		last new extent to be inserted
- * @eh:			extent header of target leaf block
- * @range_to_move:	used to decide how to insert extent
- *
- * Insert extents into the leaf block. The extent (@o_start) is overwritten
- * by inserted extents.
- */
-static void
-mext_insert_inside_block(struct ext4_extent *o_start,
-			      struct ext4_extent *o_end,
-			      struct ext4_extent *start_ext,
-			      struct ext4_extent *new_ext,
-			      struct ext4_extent *end_ext,
-			      struct ext4_extent_header *eh,
-			      int range_to_move)
-{
-	int i = 0;
-	unsigned long len;
-
-	/* Move the existing extents */
-	if (range_to_move && o_end < EXT_LAST_EXTENT(eh)) {
-		len = (unsigned long)(EXT_LAST_EXTENT(eh) + 1) -
-			(unsigned long)(o_end + 1);
-		memmove(o_end + 1 + range_to_move, o_end + 1, len);
-	}
-
-	/* Insert start entry */
-	if (start_ext->ee_len)
-		o_start[i++].ee_len = start_ext->ee_len;
-
-	/* Insert new entry */
-	if (new_ext->ee_len) {
-		o_start[i] = *new_ext;
-		ext4_ext_store_pblock(&o_start[i++], ext4_ext_pblock(new_ext));
-	}
-
-	/* Insert end entry */
-	if (end_ext->ee_len)
-		o_start[i] = *end_ext;
-
-	/* Increment the total entries counter on the extent block */
-	le16_add_cpu(&eh->eh_entries, range_to_move);
-}
-
-/**
- * mext_insert_extents - Insert new extent
- *
- * @handle:	journal handle
- * @orig_inode:	original inode
- * @orig_path:	path indicates first extent to be changed
- * @o_start:	first original extent to be changed
- * @o_end:	last original extent to be changed
- * @start_ext:	first new extent to be inserted
- * @new_ext:	middle of new extent to be inserted
- * @end_ext:	last new extent to be inserted
- *
- * Call the function to insert extents. If we cannot add more extents into
- * the leaf block, we call mext_insert_across_blocks() to create a
- * new leaf block. Otherwise call mext_insert_inside_block(). Return 0
- * on success, or a negative error value on failure.
- */
-static int
-mext_insert_extents(handle_t *handle, struct inode *orig_inode,
-			 struct ext4_ext_path *orig_path,
-			 struct ext4_extent *o_start,
-			 struct ext4_extent *o_end,
-			 struct ext4_extent *start_ext,
-			 struct ext4_extent *new_ext,
-			 struct ext4_extent *end_ext)
-{
-	struct  ext4_extent_header *eh;
-	unsigned long need_slots, slots_range;
-	int	range_to_move, depth, ret;
-
-	/*
-	 * The extents need to be inserted
-	 * start_extent + new_extent + end_extent.
-	 */
-	need_slots = (start_ext->ee_len ? 1 : 0) + (end_ext->ee_len ? 1 : 0) +
-		(new_ext->ee_len ? 1 : 0);
-
-	/* The number of slots between start and end */
-	slots_range = ((unsigned long)(o_end + 1) - (unsigned long)o_start + 1)
-		/ sizeof(struct ext4_extent);
-
-	/* Range to move the end of extent */
-	range_to_move = need_slots - slots_range;
-	depth = orig_path->p_depth;
-	orig_path += depth;
-	eh = orig_path->p_hdr;
-
-	if (depth) {
-		/* Register to journal */
-		BUFFER_TRACE(orig_path->p_bh, "get_write_access");
-		ret = ext4_journal_get_write_access(handle, orig_path->p_bh);
-		if (ret)
-			return ret;
-	}
-
-	/* Expansion */
-	if (range_to_move > 0 &&
-		(range_to_move > le16_to_cpu(eh->eh_max)
-			- le16_to_cpu(eh->eh_entries))) {
-
-		ret = mext_insert_across_blocks(handle, orig_inode, o_start,
-					o_end, start_ext, new_ext, end_ext);
-		if (ret < 0)
-			return ret;
-	} else
-		mext_insert_inside_block(o_start, o_end, start_ext, new_ext,
-						end_ext, eh, range_to_move);
-
-	return ext4_ext_dirty(handle, orig_inode, orig_path);
-}
-
-/**
- * mext_leaf_block - Move one leaf extent block into the inode.
- *
- * @handle:		journal handle
- * @orig_inode:		original inode
- * @orig_path:		path indicates first extent to be changed
- * @dext:		donor extent
- * @from:		start offset on the target file
- *
- * In order to insert extents into the leaf block, we must divide the extent
- * in the leaf block into three extents. The one is located to be inserted
- * extents, and the others are located around it.
- *
- * Therefore, this function creates structures to save extents of the leaf
- * block, and inserts extents by calling mext_insert_extents() with
- * created extents. Return 0 on success, or a negative error value on failure.
- */
-static int
-mext_leaf_block(handle_t *handle, struct inode *orig_inode,
-		     struct ext4_ext_path *orig_path, struct ext4_extent *dext,
-		     ext4_lblk_t *from)
-{
-	struct ext4_extent *oext, *o_start, *o_end, *prev_ext;
-	struct ext4_extent new_ext, start_ext, end_ext;
-	ext4_lblk_t new_ext_end;
-	int oext_alen, new_ext_alen, end_ext_alen;
-	int depth = ext_depth(orig_inode);
-	int ret;
-
-	start_ext.ee_block = end_ext.ee_block = 0;
-	o_start = o_end = oext = orig_path[depth].p_ext;
-	oext_alen = ext4_ext_get_actual_len(oext);
-	start_ext.ee_len = end_ext.ee_len = 0;
-
-	new_ext.ee_block = cpu_to_le32(*from);
-	ext4_ext_store_pblock(&new_ext, ext4_ext_pblock(dext));
-	new_ext.ee_len = dext->ee_len;
-	new_ext_alen = ext4_ext_get_actual_len(&new_ext);
-	new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1;
-
-	/*
-	 * Case: original extent is first
-	 * oext      |--------|
-	 * new_ext      |--|
-	 * start_ext |--|
-	 */
-	if (le32_to_cpu(oext->ee_block) < le32_to_cpu(new_ext.ee_block) &&
-		le32_to_cpu(new_ext.ee_block) <
-		le32_to_cpu(oext->ee_block) + oext_alen) {
-		start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) -
-					       le32_to_cpu(oext->ee_block));
-		start_ext.ee_block = oext->ee_block;
-		copy_extent_status(oext, &start_ext);
-	} else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) {
-		prev_ext = oext - 1;
-		/*
-		 * We can merge new_ext into previous extent,
-		 * if these are contiguous and same extent type.
-		 */
-		if (ext4_can_extents_be_merged(orig_inode, prev_ext,
-					       &new_ext)) {
-			o_start = prev_ext;
-			start_ext.ee_len = cpu_to_le16(
-				ext4_ext_get_actual_len(prev_ext) +
-				new_ext_alen);
-			start_ext.ee_block = oext->ee_block;
-			copy_extent_status(prev_ext, &start_ext);
-			new_ext.ee_len = 0;
-		}
-	}
-
-	/*
-	 * Case: new_ext_end must be less than oext
-	 * oext      |-----------|
-	 * new_ext       |-------|
-	 */
-	if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
-		EXT4_ERROR_INODE(orig_inode,
-			"new_ext_end(%u) should be less than or equal to "
-			"oext->ee_block(%u) + oext_alen(%d) - 1",
-			new_ext_end, le32_to_cpu(oext->ee_block),
-			oext_alen);
-		ret = -EIO;
-		goto out;
-	}
-
-	/*
-	 * Case: new_ext is smaller than original extent
-	 * oext    |---------------|
-	 * new_ext |-----------|
-	 * end_ext             |---|
-	 */
-	if (le32_to_cpu(oext->ee_block) <= new_ext_end &&
-		new_ext_end < le32_to_cpu(oext->ee_block) + oext_alen - 1) {
-		end_ext.ee_len =
-			cpu_to_le16(le32_to_cpu(oext->ee_block) +
-			oext_alen - 1 - new_ext_end);
-		copy_extent_status(oext, &end_ext);
-		end_ext_alen = ext4_ext_get_actual_len(&end_ext);
-		ext4_ext_store_pblock(&end_ext,
-			(ext4_ext_pblock(o_end) + oext_alen - end_ext_alen));
-		end_ext.ee_block =
-			cpu_to_le32(le32_to_cpu(o_end->ee_block) +
-			oext_alen - end_ext_alen);
-	}
-
-	ret = mext_insert_extents(handle, orig_inode, orig_path, o_start,
-				o_end, &start_ext, &new_ext, &end_ext);
-out:
-	return ret;
-}
-
-/**
- * mext_calc_swap_extents - Calculate extents for extent swapping.
- *
- * @tmp_dext:		the extent that will belong to the original inode
- * @tmp_oext:		the extent that will belong to the donor inode
- * @orig_off:		block offset of original inode
- * @donor_off:		block offset of donor inode
- * @max_count:		the maximum length of extents
- *
- * Return 0 on success, or a negative error value on failure.
- */
-static int
-mext_calc_swap_extents(struct ext4_extent *tmp_dext,
-			      struct ext4_extent *tmp_oext,
-			      ext4_lblk_t orig_off, ext4_lblk_t donor_off,
-			      ext4_lblk_t max_count)
-{
-	ext4_lblk_t diff, orig_diff;
-	struct ext4_extent dext_old, oext_old;
-
-	BUG_ON(orig_off != donor_off);
-
-	/* original and donor extents have to cover the same block offset */
-	if (orig_off < le32_to_cpu(tmp_oext->ee_block) ||
-	    le32_to_cpu(tmp_oext->ee_block) +
-			ext4_ext_get_actual_len(tmp_oext) - 1 < orig_off)
-		return -ENODATA;
-
-	if (orig_off < le32_to_cpu(tmp_dext->ee_block) ||
-	    le32_to_cpu(tmp_dext->ee_block) +
-			ext4_ext_get_actual_len(tmp_dext) - 1 < orig_off)
-		return -ENODATA;
-
-	dext_old = *tmp_dext;
-	oext_old = *tmp_oext;
-
-	/* When tmp_dext is too large, pick up the target range. */
-	diff = donor_off - le32_to_cpu(tmp_dext->ee_block);
-
-	ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff);
-	le32_add_cpu(&tmp_dext->ee_block, diff);
-	le16_add_cpu(&tmp_dext->ee_len, -diff);
-
-	if (max_count < ext4_ext_get_actual_len(tmp_dext))
-		tmp_dext->ee_len = cpu_to_le16(max_count);
-
-	orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block);
-	ext4_ext_store_pblock(tmp_oext, ext4_ext_pblock(tmp_oext) + orig_diff);
-
-	/* Adjust extent length if donor extent is larger than orig */
-	if (ext4_ext_get_actual_len(tmp_dext) >
-	    ext4_ext_get_actual_len(tmp_oext) - orig_diff)
-		tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_oext->ee_len) -
-						orig_diff);
-
-	tmp_oext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(tmp_dext));
-
-	copy_extent_status(&oext_old, tmp_dext);
-	copy_extent_status(&dext_old, tmp_oext);
-
-	return 0;
-}
-
-/**
  * mext_check_coverage - Check that all extents in range has the same type
  *
  * @inode:		inode in question
@@ -647,129 +141,6 @@ out:
  *
  * Return replaced block count.
  */
-static int
-mext_replace_branches(handle_t *handle, struct inode *orig_inode,
-			   struct inode *donor_inode, ext4_lblk_t from,
-			   ext4_lblk_t count, int *err)
-{
-	struct ext4_ext_path *orig_path = NULL;
-	struct ext4_ext_path *donor_path = NULL;
-	struct ext4_extent *oext, *dext;
-	struct ext4_extent tmp_dext, tmp_oext;
-	ext4_lblk_t orig_off = from, donor_off = from;
-	int depth;
-	int replaced_count = 0;
-	int dext_alen;
-
-	*err = ext4_es_remove_extent(orig_inode, from, count);
-	if (*err)
-		goto out;
-
-	*err = ext4_es_remove_extent(donor_inode, from, count);
-	if (*err)
-		goto out;
-
-	/* Get the original extent for the block "orig_off" */
-	*err = get_ext_path(orig_inode, orig_off, &orig_path);
-	if (*err)
-		goto out;
-
-	/* Get the donor extent for the head */
-	*err = get_ext_path(donor_inode, donor_off, &donor_path);
-	if (*err)
-		goto out;
-	depth = ext_depth(orig_inode);
-	oext = orig_path[depth].p_ext;
-	tmp_oext = *oext;
-
-	depth = ext_depth(donor_inode);
-	dext = donor_path[depth].p_ext;
-	if (unlikely(!dext))
-		goto missing_donor_extent;
-	tmp_dext = *dext;
-
-	*err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
-				      donor_off, count);
-	if (*err)
-		goto out;
-
-	/* Loop for the donor extents */
-	while (1) {
-		/* The extent for donor must be found. */
-		if (unlikely(!dext)) {
-		missing_donor_extent:
-			EXT4_ERROR_INODE(donor_inode,
-				   "The extent for donor must be found");
-			*err = -EIO;
-			goto out;
-		} else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
-			EXT4_ERROR_INODE(donor_inode,
-				"Donor offset(%u) and the first block of donor "
-				"extent(%u) should be equal",
-				donor_off,
-				le32_to_cpu(tmp_dext.ee_block));
-			*err = -EIO;
-			goto out;
-		}
-
-		/* Set donor extent to orig extent */
-		*err = mext_leaf_block(handle, orig_inode,
-					   orig_path, &tmp_dext, &orig_off);
-		if (*err)
-			goto out;
-
-		/* Set orig extent to donor extent */
-		*err = mext_leaf_block(handle, donor_inode,
-					   donor_path, &tmp_oext, &donor_off);
-		if (*err)
-			goto out;
-
-		dext_alen = ext4_ext_get_actual_len(&tmp_dext);
-		replaced_count += dext_alen;
-		donor_off += dext_alen;
-		orig_off += dext_alen;
-
-		BUG_ON(replaced_count > count);
-		/* Already moved the expected blocks */
-		if (replaced_count >= count)
-			break;
-
-		if (orig_path)
-			ext4_ext_drop_refs(orig_path);
-		*err = get_ext_path(orig_inode, orig_off, &orig_path);
-		if (*err)
-			goto out;
-		depth = ext_depth(orig_inode);
-		oext = orig_path[depth].p_ext;
-		tmp_oext = *oext;
-
-		if (donor_path)
-			ext4_ext_drop_refs(donor_path);
-		*err = get_ext_path(donor_inode, donor_off, &donor_path);
-		if (*err)
-			goto out;
-		depth = ext_depth(donor_inode);
-		dext = donor_path[depth].p_ext;
-		tmp_dext = *dext;
-
-		*err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
-					   donor_off, count - replaced_count);
-		if (*err)
-			goto out;
-	}
-
-out:
-	if (orig_path) {
-		ext4_ext_drop_refs(orig_path);
-		kfree(orig_path);
-	}
-	if (donor_path) {
-		ext4_ext_drop_refs(donor_path);
-		kfree(donor_path);
-	}
-
-	return replaced_count;
-}
 
 /**
  * mext_page_double_lock - Grab and lock pages on both @inode1 and @inode2
@@ -783,7 +154,7 @@ out:
  */
 static int
 mext_page_double_lock(struct inode *inode1, struct inode *inode2,
-		      pgoff_t index, struct page *page[2])
+		      pgoff_t index1, pgoff_t index2, struct page *page[2])
 {
 	struct address_space *mapping[2];
 	unsigned fl = AOP_FLAG_NOFS;
@@ -793,15 +164,18 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2,
 		mapping[0] = inode1->i_mapping;
 		mapping[1] = inode2->i_mapping;
 	} else {
+		pgoff_t tmp = index1;
+		index1 = index2;
+		index2 = tmp;
 		mapping[0] = inode2->i_mapping;
 		mapping[1] = inode1->i_mapping;
 	}
 
-	page[0] = grab_cache_page_write_begin(mapping[0], index, fl);
+	page[0] = grab_cache_page_write_begin(mapping[0], index1, fl);
 	if (!page[0])
 		return -ENOMEM;
 
-	page[1] = grab_cache_page_write_begin(mapping[1], index, fl);
+	page[1] = grab_cache_page_write_begin(mapping[1], index2, fl);
 	if (!page[1]) {
 		unlock_page(page[0]);
 		page_cache_release(page[0]);
@@ -905,13 +279,14 @@ out:
  */
 static int
 move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
-		  pgoff_t orig_page_offset, int data_offset_in_page,
-		  int block_len_in_page, int unwritten, int *err)
+		     pgoff_t orig_page_offset, pgoff_t donor_page_offset,
+		     int data_offset_in_page,
+		     int block_len_in_page, int unwritten, int *err)
 {
 	struct inode *orig_inode = file_inode(o_filp);
 	struct page *pagep[2] = {NULL, NULL};
 	handle_t *handle;
-	ext4_lblk_t orig_blk_offset;
+	ext4_lblk_t orig_blk_offset, donor_blk_offset;
 	unsigned long blocksize = orig_inode->i_sb->s_blocksize;
 	unsigned int w_flags = 0;
 	unsigned int tmp_data_size, data_size, replaced_size;
@@ -939,6 +314,9 @@ again:
 	orig_blk_offset = orig_page_offset * blocks_per_page +
 		data_offset_in_page;
 
+	donor_blk_offset = donor_page_offset * blocks_per_page +
+		data_offset_in_page;
+
 	/* Calculate data_size */
 	if ((orig_blk_offset + block_len_in_page - 1) ==
 	    ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
@@ -959,7 +337,7 @@ again:
 	replaced_size = data_size;
 
 	*err = mext_page_double_lock(orig_inode, donor_inode, orig_page_offset,
-				     pagep);
+				     donor_page_offset, pagep);
 	if (unlikely(*err < 0))
 		goto stop_journal;
 	/*
@@ -978,7 +356,7 @@ again:
 		if (*err)
 			goto drop_data_sem;
 
-		unwritten &= mext_check_coverage(donor_inode, orig_blk_offset,
+		unwritten &= mext_check_coverage(donor_inode, donor_blk_offset,
 						 block_len_in_page, 1, err);
 		if (*err)
 			goto drop_data_sem;
@@ -994,9 +372,10 @@ again:
 			*err = -EBUSY;
 			goto drop_data_sem;
 		}
-		replaced_count = mext_replace_branches(handle, orig_inode,
-						donor_inode, orig_blk_offset,
-						block_len_in_page, err);
+		replaced_count = ext4_swap_extents(handle, orig_inode,
+						   donor_inode, orig_blk_offset,
+						   donor_blk_offset,
+						   block_len_in_page, 1, err);
 	drop_data_sem:
 		ext4_double_up_write_data_sem(orig_inode, donor_inode);
 		goto unlock_pages;
@@ -1014,9 +393,9 @@ data_copy:
 		goto unlock_pages;
 	}
 	ext4_double_down_write_data_sem(orig_inode, donor_inode);
-	replaced_count = mext_replace_branches(handle, orig_inode, donor_inode,
-					       orig_blk_offset,
-					       block_len_in_page, err);
+	replaced_count = ext4_swap_extents(handle, orig_inode, donor_inode,
+					       orig_blk_offset, donor_blk_offset,
+					   block_len_in_page, 1, err);
 	ext4_double_up_write_data_sem(orig_inode, donor_inode);
 	if (*err) {
 		if (replaced_count) {
@@ -1061,9 +440,9 @@ repair_branches:
 	 * Try to swap extents to it's original places
 	 */
 	ext4_double_down_write_data_sem(orig_inode, donor_inode);
-	replaced_count = mext_replace_branches(handle, donor_inode, orig_inode,
-					       orig_blk_offset,
-					       block_len_in_page, &err2);
+	replaced_count = ext4_swap_extents(handle, donor_inode, orig_inode,
+					       orig_blk_offset, donor_blk_offset,
+					   block_len_in_page, 0, &err2);
 	ext4_double_up_write_data_sem(orig_inode, donor_inode);
 	if (replaced_count != block_len_in_page) {
 		EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset),
@@ -1093,10 +472,14 @@ mext_check_arguments(struct inode *orig_inode,
 		     struct inode *donor_inode, __u64 orig_start,
 		     __u64 donor_start, __u64 *len)
 {
-	ext4_lblk_t orig_blocks, donor_blocks;
+	__u64 orig_eof, donor_eof;
 	unsigned int blkbits = orig_inode->i_blkbits;
 	unsigned int blocksize = 1 << blkbits;
 
+	orig_eof = (i_size_read(orig_inode) + blocksize - 1) >> blkbits;
+	donor_eof = (i_size_read(donor_inode) + blocksize - 1) >> blkbits;
+
+
 	if (donor_inode->i_mode & (S_ISUID|S_ISGID)) {
 		ext4_debug("ext4 move extent: suid or sgid is set"
 			   " to donor file [ino:orig %lu, donor %lu]\n",
@@ -1112,7 +495,7 @@ mext_check_arguments(struct inode *orig_inode,
 		ext4_debug("ext4 move extent: The argument files should "
 			"not be swapfile [ino:orig %lu, donor %lu]\n",
 			orig_inode->i_ino, donor_inode->i_ino);
-		return -EINVAL;
+		return -EBUSY;
 	}
 
 	/* Ext4 move extent supports only extent based file */
@@ -1132,67 +515,28 @@ mext_check_arguments(struct inode *orig_inode,
 	}
 
 	/* Start offset should be same */
-	if (orig_start != donor_start) {
+	if ((orig_start & ~(PAGE_MASK >> orig_inode->i_blkbits)) !=
+	    (donor_start & ~(PAGE_MASK >> orig_inode->i_blkbits))) {
 		ext4_debug("ext4 move extent: orig and donor's start "
-			"offset are not same [ino:orig %lu, donor %lu]\n",
+			"offset are not alligned [ino:orig %lu, donor %lu]\n",
 			orig_inode->i_ino, donor_inode->i_ino);
 		return -EINVAL;
 	}
 
 	if ((orig_start >= EXT_MAX_BLOCKS) ||
+	    (donor_start >= EXT_MAX_BLOCKS) ||
 	    (*len > EXT_MAX_BLOCKS) ||
+	    (donor_start + *len >= EXT_MAX_BLOCKS) ||
 	    (orig_start + *len >= EXT_MAX_BLOCKS))  {
 		ext4_debug("ext4 move extent: Can't handle over [%u] blocks "
 			"[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS,
 			orig_inode->i_ino, donor_inode->i_ino);
 		return -EINVAL;
 	}
-
-	if (orig_inode->i_size > donor_inode->i_size) {
-		donor_blocks = (donor_inode->i_size + blocksize - 1) >> blkbits;
-		/* TODO: eliminate this artificial restriction */
-		if (orig_start >= donor_blocks) {
-			ext4_debug("ext4 move extent: orig start offset "
-			"[%llu] should be less than donor file blocks "
-			"[%u] [ino:orig %lu, donor %lu]\n",
-			orig_start, donor_blocks,
-			orig_inode->i_ino, donor_inode->i_ino);
-			return -EINVAL;
-		}
-
-		/* TODO: eliminate this artificial restriction */
-		if (orig_start + *len > donor_blocks) {
-			ext4_debug("ext4 move extent: End offset [%llu] should "
-				"be less than donor file blocks [%u]."
-				"So adjust length from %llu to %llu "
-				"[ino:orig %lu, donor %lu]\n",
-				orig_start + *len, donor_blocks,
-				*len, donor_blocks - orig_start,
-				orig_inode->i_ino, donor_inode->i_ino);
-			*len = donor_blocks - orig_start;
-		}
-	} else {
-		orig_blocks = (orig_inode->i_size + blocksize - 1) >> blkbits;
-		if (orig_start >= orig_blocks) {
-			ext4_debug("ext4 move extent: start offset [%llu] "
-				"should be less than original file blocks "
-				"[%u] [ino:orig %lu, donor %lu]\n",
-				 orig_start, orig_blocks,
-				orig_inode->i_ino, donor_inode->i_ino);
-			return -EINVAL;
-		}
-
-		if (orig_start + *len > orig_blocks) {
-			ext4_debug("ext4 move extent: Adjust length "
-				"from %llu to %llu. Because it should be "
-				"less than original file blocks "
-				"[ino:orig %lu, donor %lu]\n",
-				*len, orig_blocks - orig_start,
-				orig_inode->i_ino, donor_inode->i_ino);
-			*len = orig_blocks - orig_start;
-		}
-	}
-
+	if (orig_eof < orig_start + *len - 1)
+		*len = orig_eof - orig_start;
+	if (donor_eof < donor_start + *len - 1)
+		*len = donor_eof - donor_start;
 	if (!*len) {
 		ext4_debug("ext4 move extent: len should not be 0 "
 			"[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
@@ -1245,23 +589,16 @@ mext_check_arguments(struct inode *orig_inode,
  * 7:Return 0 on success, or a negative error value on failure.
  */
 int
-ext4_move_extents(struct file *o_filp, struct file *d_filp,
-		 __u64 orig_start, __u64 donor_start, __u64 len,
-		 __u64 *moved_len)
+ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
+		  __u64 donor_blk, __u64 len, __u64 *moved_len)
 {
 	struct inode *orig_inode = file_inode(o_filp);
 	struct inode *donor_inode = file_inode(d_filp);
-	struct ext4_ext_path *orig_path = NULL, *holecheck_path = NULL;
-	struct ext4_extent *ext_prev, *ext_cur, *ext_dummy;
-	ext4_lblk_t block_start = orig_start;
-	ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
-	ext4_lblk_t rest_blocks;
-	pgoff_t orig_page_offset = 0, seq_end_page;
-	int ret, depth, last_extent = 0;
+	struct ext4_ext_path *path = NULL;
 	int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
-	int data_offset_in_page;
-	int block_len_in_page;
-	int unwritten;
+	ext4_lblk_t o_end, o_start = orig_blk;
+	ext4_lblk_t d_start = donor_blk;
+	int ret;
 
 	if (orig_inode->i_sb != donor_inode->i_sb) {
 		ext4_debug("ext4 move extent: The argument files "
@@ -1303,121 +640,58 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 	/* Protect extent tree against block allocations via delalloc */
 	ext4_double_down_write_data_sem(orig_inode, donor_inode);
 	/* Check the filesystem environment whether move_extent can be done */
-	ret = mext_check_arguments(orig_inode, donor_inode, orig_start,
-				    donor_start, &len);
+	ret = mext_check_arguments(orig_inode, donor_inode, orig_blk,
+				    donor_blk, &len);
 	if (ret)
 		goto out;
+	o_end = o_start + len;
 
-	file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits;
-	block_end = block_start + len - 1;
-	if (file_end < block_end)
-		len -= block_end - file_end;
-
-	ret = get_ext_path(orig_inode, block_start, &orig_path);
-	if (ret)
-		goto out;
-
-	/* Get path structure to check the hole */
-	ret = get_ext_path(orig_inode, block_start, &holecheck_path);
-	if (ret)
-		goto out;
+	while (o_start < o_end) {
+		struct ext4_extent *ex;
+		ext4_lblk_t cur_blk, next_blk;
+		pgoff_t orig_page_index, donor_page_index;
+		int offset_in_page;
+		int unwritten, cur_len;
 
-	depth = ext_depth(orig_inode);
-	ext_cur = holecheck_path[depth].p_ext;
-
-	/*
-	 * Get proper starting location of block replacement if block_start was
-	 * within the hole.
-	 */
-	if (le32_to_cpu(ext_cur->ee_block) +
-		ext4_ext_get_actual_len(ext_cur) - 1 < block_start) {
-		/*
-		 * The hole exists between extents or the tail of
-		 * original file.
-		 */
-		last_extent = mext_next_extent(orig_inode,
-					holecheck_path, &ext_cur);
-		if (last_extent < 0) {
-			ret = last_extent;
-			goto out;
-		}
-		last_extent = mext_next_extent(orig_inode, orig_path,
-							&ext_dummy);
-		if (last_extent < 0) {
-			ret = last_extent;
+		ret = get_ext_path(orig_inode, o_start, &path);
+		if (ret)
 			goto out;
+		ex = path[path->p_depth].p_ext;
+		next_blk = ext4_ext_next_allocated_block(path);
+		cur_blk = le32_to_cpu(ex->ee_block);
+		cur_len = ext4_ext_get_actual_len(ex);
+		/* Check hole before the start pos */
+		if (cur_blk + cur_len - 1 < o_start) {
+			if (next_blk == EXT_MAX_BLOCKS) {
+				o_start = o_end;
+				ret = -ENODATA;
+				goto out;
+			}
+			d_start += next_blk - o_start;
+			o_start = next_blk;
+			goto repeat;
+		/* Check hole after the start pos */
+		} else if (cur_blk > o_start) {
+			/* Skip hole */
+			d_start += cur_blk - o_start;
+			o_start = cur_blk;
+			/* Extent inside requested range ?*/
+			if (cur_blk >= o_end)
+				goto out;
+		} else { /* in_range(o_start, o_blk, o_len) */
+			cur_len += cur_blk - o_start;
 		}
-		seq_start = le32_to_cpu(ext_cur->ee_block);
-	} else if (le32_to_cpu(ext_cur->ee_block) > block_start)
-		/* The hole exists at the beginning of original file. */
-		seq_start = le32_to_cpu(ext_cur->ee_block);
-	else
-		seq_start = block_start;
-
-	/* No blocks within the specified range. */
-	if (le32_to_cpu(ext_cur->ee_block) > block_end) {
-		ext4_debug("ext4 move extent: The specified range of file "
-							"may be the hole\n");
-		ret = -EINVAL;
-		goto out;
-	}
-
-	/* Adjust start blocks */
-	add_blocks = min(le32_to_cpu(ext_cur->ee_block) +
-			 ext4_ext_get_actual_len(ext_cur), block_end + 1) -
-		     max(le32_to_cpu(ext_cur->ee_block), block_start);
-
-	while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) {
-		seq_blocks += add_blocks;
-
-		/* Adjust tail blocks */
-		if (seq_start + seq_blocks - 1 > block_end)
-			seq_blocks = block_end - seq_start + 1;
-
-		ext_prev = ext_cur;
-		last_extent = mext_next_extent(orig_inode, holecheck_path,
-						&ext_cur);
-		if (last_extent < 0) {
-			ret = last_extent;
-			break;
-		}
-		add_blocks = ext4_ext_get_actual_len(ext_cur);
-
-		/*
-		 * Extend the length of contiguous block (seq_blocks)
-		 * if extents are contiguous.
-		 */
-		if (ext4_can_extents_be_merged(orig_inode,
-					       ext_prev, ext_cur) &&
-		    block_end >= le32_to_cpu(ext_cur->ee_block) &&
-		    !last_extent)
-			continue;
-
-		/* Is original extent is unwritten */
-		unwritten = ext4_ext_is_unwritten(ext_prev);
-
-		data_offset_in_page = seq_start % blocks_per_page;
-
-		/*
-		 * Calculate data blocks count that should be swapped
-		 * at the first page.
-		 */
-		if (data_offset_in_page + seq_blocks > blocks_per_page) {
-			/* Swapped blocks are across pages */
-			block_len_in_page =
-					blocks_per_page - data_offset_in_page;
-		} else {
-			/* Swapped blocks are in a page */
-			block_len_in_page = seq_blocks;
-		}
-
-		orig_page_offset = seq_start >>
-				(PAGE_CACHE_SHIFT - orig_inode->i_blkbits);
-		seq_end_page = (seq_start + seq_blocks - 1) >>
-				(PAGE_CACHE_SHIFT - orig_inode->i_blkbits);
-		seq_start = le32_to_cpu(ext_cur->ee_block);
-		rest_blocks = seq_blocks;
-
+		unwritten = ext4_ext_is_unwritten(ex);
+		if (o_end - o_start < cur_len)
+			cur_len = o_end - o_start;
+
+		orig_page_index = o_start >> (PAGE_CACHE_SHIFT -
+					       orig_inode->i_blkbits);
+		donor_page_index = d_start >> (PAGE_CACHE_SHIFT -
+					       donor_inode->i_blkbits);
+		offset_in_page = o_start % blocks_per_page;
+		if (cur_len > blocks_per_page- offset_in_page)
+			cur_len = blocks_per_page - offset_in_page;
 		/*
 		 * Up semaphore to avoid following problems:
 		 * a. transaction deadlock among ext4_journal_start,
@@ -1426,76 +700,36 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 		 *    in move_extent_per_page
 		 */
 		ext4_double_up_write_data_sem(orig_inode, donor_inode);
-
-		while (orig_page_offset <= seq_end_page) {
-
-			/* Swap original branches with new branches */
-			block_len_in_page = move_extent_per_page(
-						o_filp, donor_inode,
-						orig_page_offset,
-						data_offset_in_page,
-						block_len_in_page,
-						unwritten, &ret);
-
-			/* Count how many blocks we have exchanged */
-			*moved_len += block_len_in_page;
-			if (ret < 0)
-				break;
-			if (*moved_len > len) {
-				EXT4_ERROR_INODE(orig_inode,
-					"We replaced blocks too much! "
-					"sum of replaced: %llu requested: %llu",
-					*moved_len, len);
-				ret = -EIO;
-				break;
-			}
-
-			orig_page_offset++;
-			data_offset_in_page = 0;
-			rest_blocks -= block_len_in_page;
-			if (rest_blocks > blocks_per_page)
-				block_len_in_page = blocks_per_page;
-			else
-				block_len_in_page = rest_blocks;
-		}
-
+		/* Swap original branches with new branches */
+		move_extent_per_page(o_filp, donor_inode,
+				     orig_page_index, donor_page_index,
+				     offset_in_page, cur_len,
+				     unwritten, &ret);
 		ext4_double_down_write_data_sem(orig_inode, donor_inode);
 		if (ret < 0)
 			break;
-
-		/* Decrease buffer counter */
-		if (holecheck_path)
-			ext4_ext_drop_refs(holecheck_path);
-		ret = get_ext_path(orig_inode, seq_start, &holecheck_path);
-		if (ret)
-			break;
-		depth = holecheck_path->p_depth;
-
-		/* Decrease buffer counter */
-		if (orig_path)
-			ext4_ext_drop_refs(orig_path);
-		ret = get_ext_path(orig_inode, seq_start, &orig_path);
-		if (ret)
-			break;
-
-		ext_cur = holecheck_path[depth].p_ext;
-		add_blocks = ext4_ext_get_actual_len(ext_cur);
-		seq_blocks = 0;
-
+		o_start += cur_len;
+		d_start += cur_len;
+	repeat:
+		if (path) {
+			ext4_ext_drop_refs(path);
+			kfree(path);
+			path = NULL;
+		}
 	}
+	*moved_len = o_start - orig_blk;
+	if (*moved_len > len)
+		*moved_len = len;
+
 out:
 	if (*moved_len) {
 		ext4_discard_preallocations(orig_inode);
 		ext4_discard_preallocations(donor_inode);
 	}
 
-	if (orig_path) {
-		ext4_ext_drop_refs(orig_path);
-		kfree(orig_path);
-	}
-	if (holecheck_path) {
-		ext4_ext_drop_refs(holecheck_path);
-		kfree(holecheck_path);
+	if (path) {
+		ext4_ext_drop_refs(path);
+		kfree(path);
 	}
 	ext4_double_up_write_data_sem(orig_inode, donor_inode);
 	ext4_inode_resume_unlocked_dio(orig_inode);
-- 
cgit v1.1


From 19008f6dfa16d23afcd09dceaa598bb6da8de4b1 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sun, 31 Aug 2014 15:03:14 -0400
Subject: ext4: fix ext4_swap_extents() error handling

If ext4_ext_find_extent() returns an error, we have to clear path1 or
path2 or else we would end up trying to free an ERR_PTR, which would
be bad.

Also eliminate some redundant code and mark the error paths as unlikely()

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/extents.c | 62 ++++++++++++++++++++++++++-----------------------------
 1 file changed, 29 insertions(+), 33 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 73d9ae9..d009373 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -869,7 +869,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 	if (!path) {
 		path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2),
 				GFP_NOFS);
-		if (!path)
+		if (unlikely(!path))
 			return ERR_PTR(-ENOMEM);
 		alloc = 1;
 	}
@@ -889,7 +889,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 
 		bh = read_extent_tree_block(inode, path[ppos].p_block, --i,
 					    flags);
-		if (IS_ERR(bh)) {
+		if (unlikely(IS_ERR(bh))) {
 			ret = PTR_ERR(bh);
 			goto err;
 		}
@@ -5545,10 +5545,10 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
 	BUG_ON(!mutex_is_locked(&inode1->i_mutex));
 
 	*erp = ext4_es_remove_extent(inode1, lblk1, count);
-	if (*erp)
+	if (unlikely(*erp))
 		return 0;
 	*erp = ext4_es_remove_extent(inode2, lblk2, count);
-	if (*erp)
+	if (unlikely(*erp))
 		return 0;
 
 	while (count) {
@@ -5558,20 +5558,24 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
 		int split = 0;
 
 		path1 = ext4_ext_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE);
-		if (IS_ERR(path1)) {
+		if (unlikely(IS_ERR(path1))) {
 			*erp = PTR_ERR(path1);
-			break;
+			path1 = NULL;
+		finish:
+			count = 0;
+			goto repeat;
 		}
 		path2 = ext4_ext_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE);
-		if (IS_ERR(path2)) {
+		if (unlikely(IS_ERR(path2))) {
 			*erp = PTR_ERR(path2);
-			break;
+			path2 = NULL;
+			goto finish;
 		}
 		ex1 = path1[path1->p_depth].p_ext;
 		ex2 = path2[path2->p_depth].p_ext;
 		/* Do we have somthing to swap ? */
 		if (unlikely(!ex2 || !ex1))
-			break;
+			goto finish;
 
 		e1_blk = le32_to_cpu(ex1->ee_block);
 		e2_blk = le32_to_cpu(ex2->ee_block);
@@ -5593,7 +5597,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
 				next2 = e1_blk;
 			/* Do we have something to swap */
 			if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS)
-				break;
+				goto finish;
 			/* Move to the rightest boundary */
 			len = next1 - lblk1;
 			if (len < next2 - lblk2)
@@ -5611,15 +5615,15 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
 			split = 1;
 			*erp = ext4_force_split_extent_at(handle, inode1,
 						path1, lblk1, 0);
-			if (*erp)
-				break;
+			if (unlikely(*erp))
+				goto finish;
 		}
 		if (e2_blk < lblk2) {
 			split = 1;
 			*erp = ext4_force_split_extent_at(handle, inode2,
 						path2,  lblk2, 0);
-			if (*erp)
-				break;
+			if (unlikely(*erp))
+				goto finish;
 		}
 		/* ext4_split_extent_at() may retult in leaf extent split,
 		 * path must to be revalidated. */
@@ -5637,15 +5641,15 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
 			split = 1;
 			*erp = ext4_force_split_extent_at(handle, inode1,
 						path1, lblk1 + len, 0);
-			if (*erp)
-				break;
+			if (unlikely(*erp))
+				goto finish;
 		}
 		if (len != e2_len) {
 			split = 1;
 			*erp = ext4_force_split_extent_at(handle, inode2,
 						path2, lblk2 + len, 0);
 			if (*erp)
-				break;
+				goto finish;
 		}
 		/* ext4_split_extent_at() may retult in leaf extent split,
 		 * path must to be revalidated. */
@@ -5654,11 +5658,11 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
 
 		BUG_ON(e2_len != e1_len);
 		*erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth);
-		if (*erp)
-			break;
+		if (unlikely(*erp))
+			goto finish;
 		*erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth);
-		if (*erp)
-			break;
+		if (unlikely(*erp))
+			goto finish;
 
 		/* Both extents are fully inside boundaries. Swap it now */
 		tmp_ex = *ex1;
@@ -5675,8 +5679,8 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
 		ext4_ext_try_to_merge(handle, inode1, path1, ex1);
 		*erp = ext4_ext_dirty(handle, inode2, path2 +
 				      path2->p_depth);
-		if (*erp)
-			break;
+		if (unlikely(*erp))
+			goto finish;
 		*erp = ext4_ext_dirty(handle, inode1, path1 +
 				      path1->p_depth);
 		/*
@@ -5685,8 +5689,8 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
 		 * only due to journal error, so full transaction will be
 		 * aborted anyway.
 		 */
-		if (*erp)
-			break;
+		if (unlikely(*erp))
+			goto finish;
 		lblk1 += len;
 		lblk2 += len;
 		replaced_count += len;
@@ -5704,13 +5708,5 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
 			path2 = NULL;
 		}
 	}
-	if (path1) {
-		ext4_ext_drop_refs(path1);
-		kfree(path1);
-	}
-	if (path2) {
-		ext4_ext_drop_refs(path2);
-		kfree(path2);
-	}
 	return replaced_count;
 }
-- 
cgit v1.1


From 713e8dde3e71e92db2d8cc8459d236ce1fb576ce Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 1 Sep 2014 14:32:09 -0400
Subject: ext4: fix ZERO_RANGE bug hidden by flag aliasing

We accidently aliased EXT4_EX_NOCACHE and EXT4_GET_CONVERT_UNWRITTEN
falgs, which apparently was hiding a bug that was unmasked when this
flag aliasing issue was addressed (see the subsequent commit).  The
reproduction case was:

   fsx -N 10000 -l 500000 -r 4096 -t 4096 -w 4096 -Z -R -W /vdb/junk

... which would cause fsx to report corruption in the data file.

The fix we have is a bit of an overkill, but I'd much rather be
conservative for now, and we can optimize ZERO_RANGE_FL handling
later.  The fact that we need to zap the extent_status cache for the
inode is unfortunate, but correctness is far more important than
performance.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: Namjae Jeon <namjae.jeon@samsung.com>
---
 fs/ext4/extents.c | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index d009373..bf205f7 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4802,7 +4802,8 @@ static long ext4_zero_range(struct file *file, loff_t offset,
 		max_blocks -= lblk;
 
 	flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT |
-		EXT4_GET_BLOCKS_CONVERT_UNWRITTEN;
+		EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
+		EXT4_EX_NOCACHE;
 	if (mode & FALLOC_FL_KEEP_SIZE)
 		flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
 
@@ -4840,15 +4841,21 @@ static long ext4_zero_range(struct file *file, loff_t offset,
 		ext4_inode_block_unlocked_dio(inode);
 		inode_dio_wait(inode);
 
+		ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
+					     flags, mode);
+		if (ret)
+			goto out_dio;
 		/*
 		 * Remove entire range from the extent status tree.
+		 *
+		 * ext4_es_remove_extent(inode, lblk, max_blocks) is
+		 * NOT sufficient.  I'm not sure why this is the case,
+		 * but let's be conservative and remove the extent
+		 * status tree for the entire inode.  There should be
+		 * no outstanding delalloc extents thanks to the
+		 * filemap_write_and_wait_range() call above.
 		 */
-		ret = ext4_es_remove_extent(inode, lblk, max_blocks);
-		if (ret)
-			goto out_dio;
-
-		ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
-					     flags, mode);
+		ret = ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
 		if (ret)
 			goto out_dio;
 	}
-- 
cgit v1.1


From bd30d702fc320085f178d22866b32fdc4736c991 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 1 Sep 2014 14:33:09 -0400
Subject: ext4: fix accidental flag aliasing in ext4_map_blocks flags

Commit b8a8684502a0f introduced an accidental flag aliasing between
EXT4_EX_NOCACHE and EXT4_GET_BLOCKS_CONVERT_UNWRITTEN.

Fortunately, this didn't introduce any untorward side effects --- we
got lucky.  Nevertheless, fix this and leave a warning to hopefully
avoid this from happening in the future.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index cf3ad75..550b4f9 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -569,6 +569,7 @@ enum {
 #define EXT4_GET_BLOCKS_NO_PUT_HOLE		0x0200
 	/* Convert written extents to unwritten */
 #define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN	0x0400
+/* DO NOT ASSIGN ADDITIONAL FLAG VALUES WITHOUT ADJUSTING THE FLAGS BELOW */
 
 /*
  * The bit position of these flags must not overlap with any of the
@@ -579,8 +580,8 @@ enum {
  * caching the extents when reading from the extent tree while a
  * truncate or punch hole operation is in progress.
  */
-#define EXT4_EX_NOCACHE				0x0400
-#define EXT4_EX_FORCE_CACHE			0x0800
+#define EXT4_EX_NOCACHE				0x0800
+#define EXT4_EX_FORCE_CACHE			0x1000
 
 /*
  * Flags used by ext4_free_blocks
-- 
cgit v1.1


From 705912ca95f4bbdbb3be753e46bf30d6be15a5e8 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 1 Sep 2014 14:34:09 -0400
Subject: ext4: teach ext4_ext_find_extent() to free path on error

Right now, there are a places where it is all to easy to leak memory
on an error path, via a usage like this:

	struct ext4_ext_path *path = NULL

	while (...) {
		...
		path = ext4_ext_find_extent(inode, block, path, 0);
		if (IS_ERR(path)) {
			/* oops, if path was non-NULL before the call to
			   ext4_ext_find_extent, we've leaked it!  :-(  */
			...
			return PTR_ERR(path);
		}
		...
	}

Unfortunately, there some code paths where we are doing the following
instead:

	path = ext4_ext_find_extent(inode, block, orig_path, 0);

and where it's important that we _not_ free orig_path in the case
where ext4_ext_find_extent() returns an error.

So change the function signature of ext4_ext_find_extent() so that it
takes a struct ext4_ext_path ** for its third argument, and by
default, on an error, it will free the struct ext4_ext_path, and then
zero out the struct ext4_ext_path * pointer.  In order to avoid
causing problems, we add a flag EXT4_EX_NOFREE_ON_ERR which causes
ext4_ext_find_extent() to use the original behavior of forcing the
caller to deal with freeing the original path pointer on the error
case.

The goal is to get rid of EXT4_EX_NOFREE_ON_ERR entirely, but this
allows for a gentle transition and makes the patches easier to verify.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4.h        |  3 ++-
 fs/ext4/extents.c     | 28 ++++++++++++++++++----------
 fs/ext4/move_extent.c |  2 +-
 3 files changed, 21 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 550b4f9..696e51a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -582,6 +582,7 @@ enum {
  */
 #define EXT4_EX_NOCACHE				0x0800
 #define EXT4_EX_FORCE_CACHE			0x1000
+#define EXT4_EX_NOFREE_ON_ERR			0x2000
 
 /*
  * Flags used by ext4_free_blocks
@@ -2733,7 +2734,7 @@ extern int ext4_ext_insert_extent(handle_t *, struct inode *,
 				  struct ext4_ext_path *,
 				  struct ext4_extent *, int);
 extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
-						  struct ext4_ext_path *,
+						  struct ext4_ext_path **,
 						  int flags);
 extern void ext4_ext_drop_refs(struct ext4_ext_path *);
 extern int ext4_ext_check_inode(struct inode *inode);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index bf205f7..0ced78c 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -855,11 +855,13 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
 
 struct ext4_ext_path *
 ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
-		     struct ext4_ext_path *path, int flags)
+		     struct ext4_ext_path **orig_path, int flags)
 {
 	struct ext4_extent_header *eh;
 	struct buffer_head *bh;
-	short int depth, i, ppos = 0, alloc = 0;
+	struct ext4_ext_path *path = orig_path ? *orig_path : NULL;
+	short int depth, i, ppos = 0;
+	short free_on_err = (flags & EXT4_EX_NOFREE_ON_ERR) == 0;
 	int ret;
 
 	eh = ext_inode_hdr(inode);
@@ -871,7 +873,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 				GFP_NOFS);
 		if (unlikely(!path))
 			return ERR_PTR(-ENOMEM);
-		alloc = 1;
+		free_on_err = 1;
 	}
 	path[0].p_hdr = eh;
 	path[0].p_bh = NULL;
@@ -923,8 +925,11 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 
 err:
 	ext4_ext_drop_refs(path);
-	if (alloc)
+	if (free_on_err) {
 		kfree(path);
+		if (orig_path)
+			*orig_path = NULL;
+	}
 	return ERR_PTR(ret);
 }
 
@@ -1356,7 +1361,7 @@ repeat:
 		ext4_ext_drop_refs(path);
 		path = ext4_ext_find_extent(inode,
 				    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
-				    path, gb_flags);
+				    &path, gb_flags | EXT4_EX_NOFREE_ON_ERR);
 		if (IS_ERR(path))
 			err = PTR_ERR(path);
 	} else {
@@ -1369,7 +1374,7 @@ repeat:
 		ext4_ext_drop_refs(path);
 		path = ext4_ext_find_extent(inode,
 				   (ext4_lblk_t)le32_to_cpu(newext->ee_block),
-				    path, gb_flags);
+				    &path, gb_flags | EXT4_EX_NOFREE_ON_ERR);
 		if (IS_ERR(path)) {
 			err = PTR_ERR(path);
 			goto out;
@@ -2152,7 +2157,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
 			path = NULL;
 		}
 
-		path = ext4_ext_find_extent(inode, block, path, 0);
+		path = ext4_ext_find_extent(inode, block, &path, 0);
 		if (IS_ERR(path)) {
 			up_read(&EXT4_I(inode)->i_data_sem);
 			err = PTR_ERR(path);
@@ -3313,7 +3318,8 @@ static int ext4_split_extent(handle_t *handle,
 	 * result in split of original leaf or extent zeroout.
 	 */
 	ext4_ext_drop_refs(path);
-	path = ext4_ext_find_extent(inode, map->m_lblk, path, 0);
+	path = ext4_ext_find_extent(inode, map->m_lblk, &path,
+				    EXT4_EX_NOFREE_ON_ERR);
 	if (IS_ERR(path))
 		return PTR_ERR(path);
 	depth = ext_depth(inode);
@@ -3697,7 +3703,8 @@ static int ext4_convert_initialized_extents(handle_t *handle,
 		if (err < 0)
 			goto out;
 		ext4_ext_drop_refs(path);
-		path = ext4_ext_find_extent(inode, map->m_lblk, path, 0);
+		path = ext4_ext_find_extent(inode, map->m_lblk, &path,
+					    EXT4_EX_NOFREE_ON_ERR);
 		if (IS_ERR(path)) {
 			err = PTR_ERR(path);
 			goto out;
@@ -3769,7 +3776,8 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
 		if (err < 0)
 			goto out;
 		ext4_ext_drop_refs(path);
-		path = ext4_ext_find_extent(inode, map->m_lblk, path, 0);
+		path = ext4_ext_find_extent(inode, map->m_lblk, &path,
+					    EXT4_EX_NOFREE_ON_ERR);
 		if (IS_ERR(path)) {
 			err = PTR_ERR(path);
 			goto out;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index c8f895b..5e2465a 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -37,7 +37,7 @@ get_ext_path(struct inode *inode, ext4_lblk_t lblock,
 	int ret = 0;
 	struct ext4_ext_path *path;
 
-	path = ext4_ext_find_extent(inode, lblock, *orig_path, EXT4_EX_NOCACHE);
+	path = ext4_ext_find_extent(inode, lblock, orig_path, EXT4_EX_NOCACHE);
 	if (IS_ERR(path))
 		ret = PTR_ERR(path);
 	else if (path[ext_depth(inode)].p_ext == NULL)
-- 
cgit v1.1


From e8b83d9303317fb068ad83d87991b610fe990ed5 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 1 Sep 2014 14:35:09 -0400
Subject: ext4: collapse ext4_convert_initialized_extents()

The function ext4_convert_initialized_extents() is only called by a
single function --- ext4_ext_convert_initalized_extents().  Inline the
code and get rid of the unnecessary bits in order to simplify the code.

Rename ext4_ext_convert_initalized_extents() to
convert_initalized_extents() since it's a static function that is
actually only used in a single caller, ext4_ext_map_blocks().

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/extents.c | 136 +++++++++++++++++++++++-------------------------------
 1 file changed, 59 insertions(+), 77 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 0ced78c..5fc5e2b 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3677,67 +3677,6 @@ static int ext4_split_convert_extents(handle_t *handle,
 	return ext4_split_extent(handle, inode, path, map, split_flag, flags);
 }
 
-static int ext4_convert_initialized_extents(handle_t *handle,
-					    struct inode *inode,
-					    struct ext4_map_blocks *map,
-					    struct ext4_ext_path *path)
-{
-	struct ext4_extent *ex;
-	ext4_lblk_t ee_block;
-	unsigned int ee_len;
-	int depth;
-	int err = 0;
-
-	depth = ext_depth(inode);
-	ex = path[depth].p_ext;
-	ee_block = le32_to_cpu(ex->ee_block);
-	ee_len = ext4_ext_get_actual_len(ex);
-
-	ext_debug("%s: inode %lu, logical"
-		"block %llu, max_blocks %u\n", __func__, inode->i_ino,
-		  (unsigned long long)ee_block, ee_len);
-
-	if (ee_block != map->m_lblk || ee_len > map->m_len) {
-		err = ext4_split_convert_extents(handle, inode, map, path,
-				EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
-		if (err < 0)
-			goto out;
-		ext4_ext_drop_refs(path);
-		path = ext4_ext_find_extent(inode, map->m_lblk, &path,
-					    EXT4_EX_NOFREE_ON_ERR);
-		if (IS_ERR(path)) {
-			err = PTR_ERR(path);
-			goto out;
-		}
-		depth = ext_depth(inode);
-		ex = path[depth].p_ext;
-		if (!ex) {
-			EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
-					 (unsigned long) map->m_lblk);
-			err = -EIO;
-			goto out;
-		}
-	}
-
-	err = ext4_ext_get_access(handle, inode, path + depth);
-	if (err)
-		goto out;
-	/* first mark the extent as unwritten */
-	ext4_ext_mark_unwritten(ex);
-
-	/* note: ext4_ext_correct_indexes() isn't needed here because
-	 * borders are not changed
-	 */
-	ext4_ext_try_to_merge(handle, inode, path, ex);
-
-	/* Mark modified extent as dirty */
-	err = ext4_ext_dirty(handle, inode, path + path->p_depth);
-out:
-	ext4_ext_show_leaf(inode, path);
-	return err;
-}
-
-
 static int ext4_convert_unwritten_extents_endio(handle_t *handle,
 						struct inode *inode,
 						struct ext4_map_blocks *map,
@@ -3974,12 +3913,15 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
 }
 
 static int
-ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode,
-			struct ext4_map_blocks *map,
-			struct ext4_ext_path *path, int flags,
-			unsigned int allocated, ext4_fsblk_t newblock)
+convert_initialized_extent(handle_t *handle, struct inode *inode,
+			   struct ext4_map_blocks *map,
+			   struct ext4_ext_path *path, int flags,
+			   unsigned int allocated, ext4_fsblk_t newblock)
 {
-	int ret = 0;
+	struct ext4_extent *ex;
+	ext4_lblk_t ee_block;
+	unsigned int ee_len;
+	int depth;
 	int err = 0;
 
 	/*
@@ -3989,20 +3931,60 @@ ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode,
 	if (map->m_len > EXT_UNWRITTEN_MAX_LEN)
 		map->m_len = EXT_UNWRITTEN_MAX_LEN / 2;
 
-	ret = ext4_convert_initialized_extents(handle, inode, map,
-						path);
-	if (ret >= 0) {
-		ext4_update_inode_fsync_trans(handle, inode, 1);
-		err = check_eofblocks_fl(handle, inode, map->m_lblk,
-					 path, map->m_len);
-	} else
-		err = ret;
+	depth = ext_depth(inode);
+	ex = path[depth].p_ext;
+	ee_block = le32_to_cpu(ex->ee_block);
+	ee_len = ext4_ext_get_actual_len(ex);
+
+	ext_debug("%s: inode %lu, logical"
+		"block %llu, max_blocks %u\n", __func__, inode->i_ino,
+		  (unsigned long long)ee_block, ee_len);
+
+	if (ee_block != map->m_lblk || ee_len > map->m_len) {
+		err = ext4_split_convert_extents(handle, inode, map, path,
+				EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
+		if (err < 0)
+			return err;
+		ext4_ext_drop_refs(path);
+		path = ext4_ext_find_extent(inode, map->m_lblk, &path,
+					    EXT4_EX_NOFREE_ON_ERR);
+		if (IS_ERR(path))
+			return PTR_ERR(path);
+		depth = ext_depth(inode);
+		ex = path[depth].p_ext;
+		if (!ex) {
+			EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
+					 (unsigned long) map->m_lblk);
+			return -EIO;
+		}
+	}
+
+	err = ext4_ext_get_access(handle, inode, path + depth);
+	if (err)
+		return err;
+	/* first mark the extent as unwritten */
+	ext4_ext_mark_unwritten(ex);
+
+	/* note: ext4_ext_correct_indexes() isn't needed here because
+	 * borders are not changed
+	 */
+	ext4_ext_try_to_merge(handle, inode, path, ex);
+
+	/* Mark modified extent as dirty */
+	err = ext4_ext_dirty(handle, inode, path + path->p_depth);
+	if (err)
+		return err;
+	ext4_ext_show_leaf(inode, path);
+
+	ext4_update_inode_fsync_trans(handle, inode, 1);
+	err = check_eofblocks_fl(handle, inode, map->m_lblk, path, map->m_len);
+	if (err)
+		return err;
 	map->m_flags |= EXT4_MAP_UNWRITTEN;
 	if (allocated > map->m_len)
 		allocated = map->m_len;
 	map->m_len = allocated;
-
-	return err ? err : allocated;
+	return allocated;
 }
 
 static int
@@ -4342,7 +4324,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 			 */
 			if ((!ext4_ext_is_unwritten(ex)) &&
 			    (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
-				allocated = ext4_ext_convert_initialized_extent(
+				allocated = convert_initialized_extent(
 						handle, inode, map, path, flags,
 						allocated, newblock);
 				goto out2;
-- 
cgit v1.1


From 4f224b8b7be6856a3ceaf7f9d9c1860d467174ae Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 1 Sep 2014 14:36:09 -0400
Subject: ext4: drop EXT4_EX_NOFREE_ON_ERR in convert_initialized_extent()

Transfer responsibility of freeing struct ext4_ext_path on error to
ext4_ext_find_extent().

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/extents.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 5fc5e2b..acb92ac 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3915,9 +3915,10 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
 static int
 convert_initialized_extent(handle_t *handle, struct inode *inode,
 			   struct ext4_map_blocks *map,
-			   struct ext4_ext_path *path, int flags,
+			   struct ext4_ext_path **ppath, int flags,
 			   unsigned int allocated, ext4_fsblk_t newblock)
 {
+	struct ext4_ext_path *path = *ppath;
 	struct ext4_extent *ex;
 	ext4_lblk_t ee_block;
 	unsigned int ee_len;
@@ -3946,8 +3947,7 @@ convert_initialized_extent(handle_t *handle, struct inode *inode,
 		if (err < 0)
 			return err;
 		ext4_ext_drop_refs(path);
-		path = ext4_ext_find_extent(inode, map->m_lblk, &path,
-					    EXT4_EX_NOFREE_ON_ERR);
+		path = ext4_ext_find_extent(inode, map->m_lblk, ppath, 0);
 		if (IS_ERR(path))
 			return PTR_ERR(path);
 		depth = ext_depth(inode);
@@ -4325,8 +4325,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 			if ((!ext4_ext_is_unwritten(ex)) &&
 			    (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
 				allocated = convert_initialized_extent(
-						handle, inode, map, path, flags,
-						allocated, newblock);
+						handle, inode, map, &path,
+						flags, allocated, newblock);
 				goto out2;
 			} else if (!ext4_ext_is_unwritten(ex))
 				goto out;
-- 
cgit v1.1


From dfe5080939ea4686b3414b5d970a9b26733c57a4 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 1 Sep 2014 14:37:09 -0400
Subject: ext4: drop EXT4_EX_NOFREE_ON_ERR from rest of extents handling code

Drop EXT4_EX_NOFREE_ON_ERR from ext4_ext_create_new_leaf(),
ext4_split_extent(), ext4_convert_unwritten_extents_endio().

This requires fixing all of their callers to potentially
ext4_ext_find_extent() to free the struct ext4_ext_path object in case
of an error, and there are interlocking dependencies all the way up to
ext4_ext_map_blocks(), ext4_swap_extents(), and
ext4_ext_remove_space().

Once this is done, we can drop the EXT4_EX_NOFREE_ON_ERR flag since it
is no longer necessary.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4.h    |   3 +-
 fs/ext4/extents.c | 112 +++++++++++++++++++++++++++---------------------------
 fs/ext4/migrate.c |   2 +-
 3 files changed, 59 insertions(+), 58 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 696e51a..4a5a6b9 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -582,7 +582,6 @@ enum {
  */
 #define EXT4_EX_NOCACHE				0x0800
 #define EXT4_EX_FORCE_CACHE			0x1000
-#define EXT4_EX_NOFREE_ON_ERR			0x2000
 
 /*
  * Flags used by ext4_free_blocks
@@ -2731,7 +2730,7 @@ extern int ext4_can_extents_be_merged(struct inode *inode,
 				      struct ext4_extent *ex1,
 				      struct ext4_extent *ex2);
 extern int ext4_ext_insert_extent(handle_t *, struct inode *,
-				  struct ext4_ext_path *,
+				  struct ext4_ext_path **,
 				  struct ext4_extent *, int);
 extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
 						  struct ext4_ext_path **,
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index acb92ac..ccdd2af 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -98,14 +98,14 @@ static void ext4_extent_block_csum_set(struct inode *inode,
 
 static int ext4_split_extent(handle_t *handle,
 				struct inode *inode,
-				struct ext4_ext_path *path,
+				struct ext4_ext_path **ppath,
 				struct ext4_map_blocks *map,
 				int split_flag,
 				int flags);
 
 static int ext4_split_extent_at(handle_t *handle,
 			     struct inode *inode,
-			     struct ext4_ext_path *path,
+			     struct ext4_ext_path **ppath,
 			     ext4_lblk_t split,
 			     int split_flag,
 			     int flags);
@@ -293,12 +293,13 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
 
 static inline int
 ext4_force_split_extent_at(handle_t *handle, struct inode *inode,
-			   struct ext4_ext_path *path, ext4_lblk_t lblk,
+			   struct ext4_ext_path **ppath, ext4_lblk_t lblk,
 			   int nofail)
 {
+	struct ext4_ext_path *path = *ppath;
 	int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext);
 
-	return ext4_split_extent_at(handle, inode, path, lblk, unwritten ?
+	return ext4_split_extent_at(handle, inode, ppath, lblk, unwritten ?
 			EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0,
 			EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO |
 			(nofail ? EXT4_GET_BLOCKS_METADATA_NOFAIL:0));
@@ -861,7 +862,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 	struct buffer_head *bh;
 	struct ext4_ext_path *path = orig_path ? *orig_path : NULL;
 	short int depth, i, ppos = 0;
-	short free_on_err = (flags & EXT4_EX_NOFREE_ON_ERR) == 0;
 	int ret;
 
 	eh = ext_inode_hdr(inode);
@@ -873,7 +873,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 				GFP_NOFS);
 		if (unlikely(!path))
 			return ERR_PTR(-ENOMEM);
-		free_on_err = 1;
 	}
 	path[0].p_hdr = eh;
 	path[0].p_bh = NULL;
@@ -925,11 +924,9 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 
 err:
 	ext4_ext_drop_refs(path);
-	if (free_on_err) {
-		kfree(path);
-		if (orig_path)
-			*orig_path = NULL;
-	}
+	kfree(path);
+	if (orig_path)
+		*orig_path = NULL;
 	return ERR_PTR(ret);
 }
 
@@ -1332,9 +1329,10 @@ out:
 static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
 				    unsigned int mb_flags,
 				    unsigned int gb_flags,
-				    struct ext4_ext_path *path,
+				    struct ext4_ext_path **ppath,
 				    struct ext4_extent *newext)
 {
+	struct ext4_ext_path *path = *ppath;
 	struct ext4_ext_path *curp;
 	int depth, i, err = 0;
 
@@ -1361,7 +1359,7 @@ repeat:
 		ext4_ext_drop_refs(path);
 		path = ext4_ext_find_extent(inode,
 				    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
-				    &path, gb_flags | EXT4_EX_NOFREE_ON_ERR);
+				    ppath, gb_flags);
 		if (IS_ERR(path))
 			err = PTR_ERR(path);
 	} else {
@@ -1374,7 +1372,7 @@ repeat:
 		ext4_ext_drop_refs(path);
 		path = ext4_ext_find_extent(inode,
 				   (ext4_lblk_t)le32_to_cpu(newext->ee_block),
-				    &path, gb_flags | EXT4_EX_NOFREE_ON_ERR);
+				    ppath, gb_flags);
 		if (IS_ERR(path)) {
 			err = PTR_ERR(path);
 			goto out;
@@ -1914,9 +1912,10 @@ out:
  * creating new leaf in the no-space case.
  */
 int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
-				struct ext4_ext_path *path,
+				struct ext4_ext_path **ppath,
 				struct ext4_extent *newext, int gb_flags)
 {
+	struct ext4_ext_path *path = *ppath;
 	struct ext4_extent_header *eh;
 	struct ext4_extent *ex, *fex;
 	struct ext4_extent *nearex; /* nearest extent */
@@ -2048,7 +2047,7 @@ prepend:
 	if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
 		mb_flags = EXT4_MB_USE_RESERVED;
 	err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags,
-				       path, newext);
+				       ppath, newext);
 	if (err)
 		goto cleanup;
 	depth = ext_depth(inode);
@@ -2878,7 +2877,7 @@ again:
 			 * fail removing space due to ENOSPC so try to use
 			 * reserved block if that happens.
 			 */
-			err = ext4_force_split_extent_at(handle, inode, path,
+			err = ext4_force_split_extent_at(handle, inode, &path,
 							 end + 1, 1);
 			if (err < 0)
 				goto out;
@@ -3019,12 +3018,13 @@ again:
 		}
 	}
 out:
-	ext4_ext_drop_refs(path);
-	kfree(path);
-	if (err == -EAGAIN) {
+	if (path) {
+		ext4_ext_drop_refs(path);
+		kfree(path);
 		path = NULL;
-		goto again;
 	}
+	if (err == -EAGAIN)
+		goto again;
 	ext4_journal_stop(handle);
 
 	return err;
@@ -3138,11 +3138,12 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
  */
 static int ext4_split_extent_at(handle_t *handle,
 			     struct inode *inode,
-			     struct ext4_ext_path *path,
+			     struct ext4_ext_path **ppath,
 			     ext4_lblk_t split,
 			     int split_flag,
 			     int flags)
 {
+	struct ext4_ext_path *path = *ppath;
 	ext4_fsblk_t newblock;
 	ext4_lblk_t ee_block;
 	struct ext4_extent *ex, newex, orig_ex, zero_ex;
@@ -3213,7 +3214,7 @@ static int ext4_split_extent_at(handle_t *handle,
 	if (split_flag & EXT4_EXT_MARK_UNWRIT2)
 		ext4_ext_mark_unwritten(ex2);
 
-	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
+	err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags);
 	if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
 		if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) {
 			if (split_flag & EXT4_EXT_DATA_VALID1) {
@@ -3279,11 +3280,12 @@ fix_extent_len:
  */
 static int ext4_split_extent(handle_t *handle,
 			      struct inode *inode,
-			      struct ext4_ext_path *path,
+			      struct ext4_ext_path **ppath,
 			      struct ext4_map_blocks *map,
 			      int split_flag,
 			      int flags)
 {
+	struct ext4_ext_path *path = *ppath;
 	ext4_lblk_t ee_block;
 	struct ext4_extent *ex;
 	unsigned int ee_len, depth;
@@ -3306,7 +3308,7 @@ static int ext4_split_extent(handle_t *handle,
 				       EXT4_EXT_MARK_UNWRIT2;
 		if (split_flag & EXT4_EXT_DATA_VALID2)
 			split_flag1 |= EXT4_EXT_DATA_VALID1;
-		err = ext4_split_extent_at(handle, inode, path,
+		err = ext4_split_extent_at(handle, inode, ppath,
 				map->m_lblk + map->m_len, split_flag1, flags1);
 		if (err)
 			goto out;
@@ -3318,8 +3320,7 @@ static int ext4_split_extent(handle_t *handle,
 	 * result in split of original leaf or extent zeroout.
 	 */
 	ext4_ext_drop_refs(path);
-	path = ext4_ext_find_extent(inode, map->m_lblk, &path,
-				    EXT4_EX_NOFREE_ON_ERR);
+	path = ext4_ext_find_extent(inode, map->m_lblk, ppath, 0);
 	if (IS_ERR(path))
 		return PTR_ERR(path);
 	depth = ext_depth(inode);
@@ -3339,7 +3340,7 @@ static int ext4_split_extent(handle_t *handle,
 			split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
 						     EXT4_EXT_MARK_UNWRIT2);
 		}
-		err = ext4_split_extent_at(handle, inode, path,
+		err = ext4_split_extent_at(handle, inode, ppath,
 				map->m_lblk, split_flag1, flags);
 		if (err)
 			goto out;
@@ -3373,9 +3374,10 @@ out:
 static int ext4_ext_convert_to_initialized(handle_t *handle,
 					   struct inode *inode,
 					   struct ext4_map_blocks *map,
-					   struct ext4_ext_path *path,
+					   struct ext4_ext_path **ppath,
 					   int flags)
 {
+	struct ext4_ext_path *path = *ppath;
 	struct ext4_sb_info *sbi;
 	struct ext4_extent_header *eh;
 	struct ext4_map_blocks split_map;
@@ -3599,7 +3601,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 		}
 	}
 
-	allocated = ext4_split_extent(handle, inode, path,
+	allocated = ext4_split_extent(handle, inode, ppath,
 				      &split_map, split_flag, flags);
 	if (allocated < 0)
 		err = allocated;
@@ -3638,9 +3640,10 @@ out:
 static int ext4_split_convert_extents(handle_t *handle,
 					struct inode *inode,
 					struct ext4_map_blocks *map,
-					struct ext4_ext_path *path,
+					struct ext4_ext_path **ppath,
 					int flags)
 {
+	struct ext4_ext_path *path = *ppath;
 	ext4_lblk_t eof_block;
 	ext4_lblk_t ee_block;
 	struct ext4_extent *ex;
@@ -3674,14 +3677,15 @@ static int ext4_split_convert_extents(handle_t *handle,
 		split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2);
 	}
 	flags |= EXT4_GET_BLOCKS_PRE_IO;
-	return ext4_split_extent(handle, inode, path, map, split_flag, flags);
+	return ext4_split_extent(handle, inode, ppath, map, split_flag, flags);
 }
 
 static int ext4_convert_unwritten_extents_endio(handle_t *handle,
 						struct inode *inode,
 						struct ext4_map_blocks *map,
-						struct ext4_ext_path *path)
+						struct ext4_ext_path **ppath)
 {
+	struct ext4_ext_path *path = *ppath;
 	struct ext4_extent *ex;
 	ext4_lblk_t ee_block;
 	unsigned int ee_len;
@@ -3710,17 +3714,14 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
 			     inode->i_ino, (unsigned long long)ee_block, ee_len,
 			     (unsigned long long)map->m_lblk, map->m_len);
 #endif
-		err = ext4_split_convert_extents(handle, inode, map, path,
+		err = ext4_split_convert_extents(handle, inode, map, ppath,
 						 EXT4_GET_BLOCKS_CONVERT);
 		if (err < 0)
-			goto out;
+			return err;
 		ext4_ext_drop_refs(path);
-		path = ext4_ext_find_extent(inode, map->m_lblk, &path,
-					    EXT4_EX_NOFREE_ON_ERR);
-		if (IS_ERR(path)) {
-			err = PTR_ERR(path);
-			goto out;
-		}
+		path = ext4_ext_find_extent(inode, map->m_lblk, ppath, 0);
+		if (IS_ERR(path))
+			return PTR_ERR(path);
 		depth = ext_depth(inode);
 		ex = path[depth].p_ext;
 	}
@@ -3942,7 +3943,7 @@ convert_initialized_extent(handle_t *handle, struct inode *inode,
 		  (unsigned long long)ee_block, ee_len);
 
 	if (ee_block != map->m_lblk || ee_len > map->m_len) {
-		err = ext4_split_convert_extents(handle, inode, map, path,
+		err = ext4_split_convert_extents(handle, inode, map, ppath,
 				EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
 		if (err < 0)
 			return err;
@@ -3990,9 +3991,10 @@ convert_initialized_extent(handle_t *handle, struct inode *inode,
 static int
 ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
 			struct ext4_map_blocks *map,
-			struct ext4_ext_path *path, int flags,
+			struct ext4_ext_path **ppath, int flags,
 			unsigned int allocated, ext4_fsblk_t newblock)
 {
+	struct ext4_ext_path *path = *ppath;
 	int ret = 0;
 	int err = 0;
 	ext4_io_end_t *io = ext4_inode_aio(inode);
@@ -4014,8 +4016,8 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
 
 	/* get_block() before submit the IO, split the extent */
 	if (flags & EXT4_GET_BLOCKS_PRE_IO) {
-		ret = ext4_split_convert_extents(handle, inode, map,
-					 path, flags | EXT4_GET_BLOCKS_CONVERT);
+		ret = ext4_split_convert_extents(handle, inode, map, ppath,
+					 flags | EXT4_GET_BLOCKS_CONVERT);
 		if (ret <= 0)
 			goto out;
 		/*
@@ -4033,7 +4035,7 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
 	/* IO end_io complete, convert the filled extent to written */
 	if (flags & EXT4_GET_BLOCKS_CONVERT) {
 		ret = ext4_convert_unwritten_extents_endio(handle, inode, map,
-							path);
+							   ppath);
 		if (ret >= 0) {
 			ext4_update_inode_fsync_trans(handle, inode, 1);
 			err = check_eofblocks_fl(handle, inode, map->m_lblk,
@@ -4071,7 +4073,7 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
 	}
 
 	/* buffered write, writepage time, convert*/
-	ret = ext4_ext_convert_to_initialized(handle, inode, map, path, flags);
+	ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags);
 	if (ret >= 0)
 		ext4_update_inode_fsync_trans(handle, inode, 1);
 out:
@@ -4332,7 +4334,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 				goto out;
 
 			ret = ext4_ext_handle_unwritten_extents(
-				handle, inode, map, path, flags,
+				handle, inode, map, &path, flags,
 				allocated, newblock);
 			if (ret < 0)
 				err = ret;
@@ -4479,7 +4481,7 @@ got_allocated_blocks:
 		err = check_eofblocks_fl(handle, inode, map->m_lblk,
 					 path, ar.len);
 	if (!err)
-		err = ext4_ext_insert_extent(handle, inode, path,
+		err = ext4_ext_insert_extent(handle, inode, &path,
 					     &newex, flags);
 
 	if (!err && set_unwritten) {
@@ -5611,18 +5613,18 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
 		if (e1_blk < lblk1) {
 			split = 1;
 			*erp = ext4_force_split_extent_at(handle, inode1,
-						path1, lblk1, 0);
+						&path1, lblk1, 0);
 			if (unlikely(*erp))
 				goto finish;
 		}
 		if (e2_blk < lblk2) {
 			split = 1;
 			*erp = ext4_force_split_extent_at(handle, inode2,
-						path2,  lblk2, 0);
+						&path2,  lblk2, 0);
 			if (unlikely(*erp))
 				goto finish;
 		}
-		/* ext4_split_extent_at() may retult in leaf extent split,
+		/* ext4_split_extent_at() may result in leaf extent split,
 		 * path must to be revalidated. */
 		if (split)
 			goto repeat;
@@ -5637,18 +5639,18 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
 		if (len != e1_len) {
 			split = 1;
 			*erp = ext4_force_split_extent_at(handle, inode1,
-						path1, lblk1 + len, 0);
+						&path1, lblk1 + len, 0);
 			if (unlikely(*erp))
 				goto finish;
 		}
 		if (len != e2_len) {
 			split = 1;
 			*erp = ext4_force_split_extent_at(handle, inode2,
-						path2, lblk2 + len, 0);
+						&path2, lblk2 + len, 0);
 			if (*erp)
 				goto finish;
 		}
-		/* ext4_split_extent_at() may retult in leaf extent split,
+		/* ext4_split_extent_at() may result in leaf extent split,
 		 * path must to be revalidated. */
 		if (split)
 			goto repeat;
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index d3567f2..aff7bdf 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -81,7 +81,7 @@ static int finish_range(handle_t *handle, struct inode *inode,
 				goto err_out;
 		}
 	}
-	retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0);
+	retval = ext4_ext_insert_extent(handle, inode, &path, &newext, 0);
 err_out:
 	up_write((&EXT4_I(inode)->i_data_sem));
 	if (path) {
-- 
cgit v1.1


From 523f431ccfffd3022e80e13befb9594f54b5607e Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 1 Sep 2014 14:38:09 -0400
Subject: ext4: call ext4_ext_drop_refs() from ext4_ext_find_extent()

In nearly all of the calls to ext4_ext_find_extent() where the caller
is trying to recycle the path object, ext4_ext_drop_refs() gets called
to release the buffer heads before the path object gets overwritten.
To simplify things for the callers, and to avoid the possibility of a
memory leak, make ext4_ext_find_extent() responsible for dropping the
buffers.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/extents.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index ccdd2af..4f4d523 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -867,8 +867,10 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 	eh = ext_inode_hdr(inode);
 	depth = ext_depth(inode);
 
-	/* account possible depth increase */
-	if (!path) {
+	if (path)
+		ext4_ext_drop_refs(path);
+	else {
+		/* account possible depth increase */
 		path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2),
 				GFP_NOFS);
 		if (unlikely(!path))
@@ -1356,7 +1358,6 @@ repeat:
 			goto out;
 
 		/* refill path */
-		ext4_ext_drop_refs(path);
 		path = ext4_ext_find_extent(inode,
 				    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
 				    ppath, gb_flags);
@@ -1369,7 +1370,6 @@ repeat:
 			goto out;
 
 		/* refill path */
-		ext4_ext_drop_refs(path);
 		path = ext4_ext_find_extent(inode,
 				   (ext4_lblk_t)le32_to_cpu(newext->ee_block),
 				    ppath, gb_flags);
@@ -3319,7 +3319,6 @@ static int ext4_split_extent(handle_t *handle,
 	 * Update path is required because previous ext4_split_extent_at() may
 	 * result in split of original leaf or extent zeroout.
 	 */
-	ext4_ext_drop_refs(path);
 	path = ext4_ext_find_extent(inode, map->m_lblk, ppath, 0);
 	if (IS_ERR(path))
 		return PTR_ERR(path);
@@ -3718,7 +3717,6 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
 						 EXT4_GET_BLOCKS_CONVERT);
 		if (err < 0)
 			return err;
-		ext4_ext_drop_refs(path);
 		path = ext4_ext_find_extent(inode, map->m_lblk, ppath, 0);
 		if (IS_ERR(path))
 			return PTR_ERR(path);
@@ -3947,7 +3945,6 @@ convert_initialized_extent(handle_t *handle, struct inode *inode,
 				EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
 		if (err < 0)
 			return err;
-		ext4_ext_drop_refs(path);
 		path = ext4_ext_find_extent(inode, map->m_lblk, ppath, 0);
 		if (IS_ERR(path))
 			return PTR_ERR(path);
-- 
cgit v1.1


From b7ea89ad0a6b855172158a999d3f5008403f4011 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 1 Sep 2014 14:39:09 -0400
Subject: ext4: allow a NULL argument to ext4_ext_drop_refs()

Teach ext4_ext_drop_refs() to accept a NULL argument, much like
kfree().  This allows us to drop a lot of checks to make sure path is
non-NULL before calling ext4_ext_drop_refs().

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/extents.c        | 48 ++++++++++++++++++------------------------------
 fs/ext4/extents_status.c |  6 ++----
 fs/ext4/migrate.c        |  6 ++----
 fs/ext4/move_extent.c    | 20 +++++++-------------
 4 files changed, 29 insertions(+), 51 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 4f4d523..538f9a4 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -709,9 +709,11 @@ static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
 
 void ext4_ext_drop_refs(struct ext4_ext_path *path)
 {
-	int depth = path->p_depth;
-	int i;
+	int depth, i;
 
+	if (!path)
+		return;
+	depth = path->p_depth;
 	for (i = 0; i <= depth; i++, path++)
 		if (path->p_bh) {
 			brelse(path->p_bh);
@@ -2125,10 +2127,8 @@ merge:
 	err = ext4_ext_dirty(handle, inode, path + path->p_depth);
 
 cleanup:
-	if (npath) {
-		ext4_ext_drop_refs(npath);
-		kfree(npath);
-	}
+	ext4_ext_drop_refs(npath);
+	kfree(npath);
 	return err;
 }
 
@@ -2283,11 +2283,8 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
 		block = es.es_lblk + es.es_len;
 	}
 
-	if (path) {
-		ext4_ext_drop_refs(path);
-		kfree(path);
-	}
-
+	ext4_ext_drop_refs(path);
+	kfree(path);
 	return err;
 }
 
@@ -3018,11 +3015,9 @@ again:
 		}
 	}
 out:
-	if (path) {
-		ext4_ext_drop_refs(path);
-		kfree(path);
-		path = NULL;
-	}
+	ext4_ext_drop_refs(path);
+	kfree(path);
+	path = NULL;
 	if (err == -EAGAIN)
 		goto again;
 	ext4_journal_stop(handle);
@@ -4611,10 +4606,8 @@ out:
 	map->m_pblk = newblock;
 	map->m_len = allocated;
 out2:
-	if (path) {
-		ext4_ext_drop_refs(path);
-		kfree(path);
-	}
+	ext4_ext_drop_refs(path);
+	kfree(path);
 
 	trace_ext4_ext_map_blocks_exit(inode, flags, map,
 				       err ? err : allocated);
@@ -5693,16 +5686,11 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
 		count -= len;
 
 	repeat:
-		if (path1) {
-			ext4_ext_drop_refs(path1);
-			kfree(path1);
-			path1 = NULL;
-		}
-		if (path2) {
-			ext4_ext_drop_refs(path2);
-			kfree(path2);
-			path2 = NULL;
-		}
+		ext4_ext_drop_refs(path1);
+		kfree(path1);
+		ext4_ext_drop_refs(path2);
+		kfree(path2);
+		path1 = path2 = NULL;
 	}
 	return replaced_count;
 }
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 0b7e28e..8ffff96 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -499,10 +499,8 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
 		}
 	}
 out:
-	if (path) {
-		ext4_ext_drop_refs(path);
-		kfree(path);
-	}
+	ext4_ext_drop_refs(path);
+	kfree(path);
 }
 
 static void ext4_es_insert_extent_ind_check(struct inode *inode,
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index aff7bdf..061c300 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -84,10 +84,8 @@ static int finish_range(handle_t *handle, struct inode *inode,
 	retval = ext4_ext_insert_extent(handle, inode, &path, &newext, 0);
 err_out:
 	up_write((&EXT4_I(inode)->i_data_sem));
-	if (path) {
-		ext4_ext_drop_refs(path);
-		kfree(path);
-	}
+	ext4_ext_drop_refs(path);
+	kfree(path);
 	lb->first_pblock = 0;
 	return retval;
 }
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 5e2465a..a34c076 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -113,10 +113,8 @@ mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count,
 	}
 	ret = 1;
 out:
-	if (path) {
-		ext4_ext_drop_refs(path);
-		kfree(path);
-	}
+	ext4_ext_drop_refs(path);
+	kfree(path);
 	return ret;
 }
 
@@ -711,11 +709,9 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
 		o_start += cur_len;
 		d_start += cur_len;
 	repeat:
-		if (path) {
-			ext4_ext_drop_refs(path);
-			kfree(path);
-			path = NULL;
-		}
+		ext4_ext_drop_refs(path);
+		kfree(path);
+		path = NULL;
 	}
 	*moved_len = o_start - orig_blk;
 	if (*moved_len > len)
@@ -727,10 +723,8 @@ out:
 		ext4_discard_preallocations(donor_inode);
 	}
 
-	if (path) {
-		ext4_ext_drop_refs(path);
-		kfree(path);
-	}
+	ext4_ext_drop_refs(path);
+	kfree(path);
 	ext4_double_up_write_data_sem(orig_inode, donor_inode);
 	ext4_inode_resume_unlocked_dio(orig_inode);
 	ext4_inode_resume_unlocked_dio(donor_inode);
-- 
cgit v1.1


From 10809df84a4d868db61af621bae3658494165279 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 1 Sep 2014 14:40:09 -0400
Subject: ext4: teach ext4_ext_find_extent() to realloc path if necessary

This adds additional safety in case for some reason we end reusing a
path structure which isn't big enough for current depth of the inode.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4_extents.h |  1 +
 fs/ext4/extents.c      | 20 ++++++++++----------
 2 files changed, 11 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index a867f5c..3c93815 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -123,6 +123,7 @@ find_ext4_extent_tail(struct ext4_extent_header *eh)
 struct ext4_ext_path {
 	ext4_fsblk_t			p_block;
 	__u16				p_depth;
+	__u16				p_maxdepth;
 	struct ext4_extent		*p_ext;
 	struct ext4_extent_idx		*p_idx;
 	struct ext4_extent_header	*p_hdr;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 538f9a4..c94c748 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -869,14 +869,20 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 	eh = ext_inode_hdr(inode);
 	depth = ext_depth(inode);
 
-	if (path)
+	if (path) {
 		ext4_ext_drop_refs(path);
-	else {
+		if (depth > path[0].p_maxdepth) {
+			kfree(path);
+			*orig_path = path = NULL;
+		}
+	}
+	if (!path) {
 		/* account possible depth increase */
 		path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2),
 				GFP_NOFS);
 		if (unlikely(!path))
 			return ERR_PTR(-ENOMEM);
+		path[0].p_maxdepth = depth + 1;
 	}
 	path[0].p_hdr = eh;
 	path[0].p_bh = NULL;
@@ -1820,6 +1826,7 @@ static void ext4_ext_try_to_merge_up(handle_t *handle,
 		sizeof(struct ext4_extent_idx);
 	s += sizeof(struct ext4_extent_header);
 
+	path[1].p_maxdepth = path[0].p_maxdepth;
 	memcpy(path[0].p_hdr, path[1].p_hdr, s);
 	path[0].p_depth = 0;
 	path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) +
@@ -2150,12 +2157,6 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
 		/* find extent for this block */
 		down_read(&EXT4_I(inode)->i_data_sem);
 
-		if (path && ext_depth(inode) != depth) {
-			/* depth was changed. we have to realloc path */
-			kfree(path);
-			path = NULL;
-		}
-
 		path = ext4_ext_find_extent(inode, block, &path, 0);
 		if (IS_ERR(path)) {
 			up_read(&EXT4_I(inode)->i_data_sem);
@@ -2173,7 +2174,6 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
 		}
 		ex = path[depth].p_ext;
 		next = ext4_ext_next_allocated_block(path);
-		ext4_ext_drop_refs(path);
 
 		flags = 0;
 		exists = 0;
@@ -2897,7 +2897,7 @@ again:
 			ext4_journal_stop(handle);
 			return -ENOMEM;
 		}
-		path[0].p_depth = depth;
+		path[0].p_maxdepth = path[0].p_depth = depth;
 		path[0].p_hdr = ext_inode_hdr(inode);
 		i = 0;
 
-- 
cgit v1.1


From ee4bd0d963b75cbad9bfb59b547146671c7a655a Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 1 Sep 2014 14:41:09 -0400
Subject: ext4: reuse path object in ext4_ext_shift_extents()

Now that the semantics of ext4_ext_find_extent() are much cleaner,
it's safe and more efficient to reuse the path object across the
multiple calls to ext4_ext_find_extent() in ext4_ext_shift_extents().

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/extents.c | 25 ++++++++-----------------
 1 file changed, 8 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index c94c748..22828e4 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -5306,26 +5306,21 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
 
 	depth = path->p_depth;
 	extent = path[depth].p_ext;
-	if (!extent) {
-		ext4_ext_drop_refs(path);
-		kfree(path);
-		return ret;
-	}
+	if (!extent)
+		goto out;
 
 	stop_block = le32_to_cpu(extent->ee_block) +
 			ext4_ext_get_actual_len(extent);
-	ext4_ext_drop_refs(path);
-	kfree(path);
 
 	/* Nothing to shift, if hole is at the end of file */
 	if (start >= stop_block)
-		return ret;
+		goto out;
 
 	/*
 	 * Don't start shifting extents until we make sure the hole is big
 	 * enough to accomodate the shift.
 	 */
-	path = ext4_ext_find_extent(inode, start - 1, NULL, 0);
+	path = ext4_ext_find_extent(inode, start - 1, &path, 0);
 	if (IS_ERR(path))
 		return PTR_ERR(path);
 	depth = path->p_depth;
@@ -5338,8 +5333,6 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
 		ex_start = 0;
 		ex_end = 0;
 	}
-	ext4_ext_drop_refs(path);
-	kfree(path);
 
 	if ((start == ex_start && shift > ex_start) ||
 	    (shift > start - ex_end))
@@ -5347,7 +5340,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
 
 	/* Its safe to start updating extents */
 	while (start < stop_block) {
-		path = ext4_ext_find_extent(inode, start, NULL, 0);
+		path = ext4_ext_find_extent(inode, start, &path, 0);
 		if (IS_ERR(path))
 			return PTR_ERR(path);
 		depth = path->p_depth;
@@ -5363,19 +5356,17 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
 				path[depth].p_ext++;
 			} else {
 				start = ext4_ext_next_allocated_block(path);
-				ext4_ext_drop_refs(path);
-				kfree(path);
 				continue;
 			}
 		}
 		ret = ext4_ext_shift_path_extents(path, shift, inode,
 				handle, &start);
-		ext4_ext_drop_refs(path);
-		kfree(path);
 		if (ret)
 			break;
 	}
-
+out:
+	ext4_ext_drop_refs(path);
+	kfree(path);
 	return ret;
 }
 
-- 
cgit v1.1


From 3bdf14b4d7a3a7416577e9f9f421dbf29b5b6747 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 1 Sep 2014 14:42:09 -0400
Subject: ext4: reuse path object in ext4_move_extents()

Reuse the path object in ext4_move_extents() so we don't unnecessarily
free and reallocate it.

Also clean up the get_ext_path() wrapper so that it has the same
semantics of freeing the path object on error as ext4_ext_find_extent().

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/move_extent.c | 27 ++++++++++++---------------
 1 file changed, 12 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index a34c076..7bf970d 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -32,20 +32,21 @@
  */
 static inline int
 get_ext_path(struct inode *inode, ext4_lblk_t lblock,
-		struct ext4_ext_path **orig_path)
+		struct ext4_ext_path **ppath)
 {
-	int ret = 0;
 	struct ext4_ext_path *path;
 
-	path = ext4_ext_find_extent(inode, lblock, orig_path, EXT4_EX_NOCACHE);
+	path = ext4_ext_find_extent(inode, lblock, ppath, EXT4_EX_NOCACHE);
 	if (IS_ERR(path))
-		ret = PTR_ERR(path);
-	else if (path[ext_depth(inode)].p_ext == NULL)
-		ret = -ENODATA;
-	else
-		*orig_path = path;
-
-	return ret;
+		return PTR_ERR(path);
+	if (path[ext_depth(inode)].p_ext == NULL) {
+		ext4_ext_drop_refs(path);
+		kfree(path);
+		*ppath = NULL;
+		return -ENODATA;
+	}
+	*ppath = path;
+	return 0;
 }
 
 /**
@@ -667,7 +668,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
 			}
 			d_start += next_blk - o_start;
 			o_start = next_blk;
-			goto repeat;
+			continue;
 		/* Check hole after the start pos */
 		} else if (cur_blk > o_start) {
 			/* Skip hole */
@@ -708,10 +709,6 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
 			break;
 		o_start += cur_len;
 		d_start += cur_len;
-	repeat:
-		ext4_ext_drop_refs(path);
-		kfree(path);
-		path = NULL;
 	}
 	*moved_len = o_start - orig_blk;
 	if (*moved_len > len)
-- 
cgit v1.1


From ed8a1a766af7371bfbe41857a3a11496b4165143 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 1 Sep 2014 14:43:09 -0400
Subject: ext4: rename ext4_ext_find_extent() to ext4_find_extent()

Make the function name less redundant.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4.h           |  8 ++++----
 fs/ext4/extents.c        | 38 +++++++++++++++++++-------------------
 fs/ext4/extents_status.c |  2 +-
 fs/ext4/migrate.c        |  3 +--
 fs/ext4/move_extent.c    |  4 ++--
 5 files changed, 27 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4a5a6b9..c07f43f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -573,7 +573,7 @@ enum {
 
 /*
  * The bit position of these flags must not overlap with any of the
- * EXT4_GET_BLOCKS_*.  They are used by ext4_ext_find_extent(),
+ * EXT4_GET_BLOCKS_*.  They are used by ext4_find_extent(),
  * read_extent_tree_block(), ext4_split_extent_at(),
  * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf().
  * EXT4_EX_NOCACHE is used to indicate that the we shouldn't be
@@ -2732,9 +2732,9 @@ extern int ext4_can_extents_be_merged(struct inode *inode,
 extern int ext4_ext_insert_extent(handle_t *, struct inode *,
 				  struct ext4_ext_path **,
 				  struct ext4_extent *, int);
-extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
-						  struct ext4_ext_path **,
-						  int flags);
+extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t,
+					      struct ext4_ext_path **,
+					      int flags);
 extern void ext4_ext_drop_refs(struct ext4_ext_path *);
 extern int ext4_ext_check_inode(struct inode *inode);
 extern int ext4_find_delalloc_range(struct inode *inode,
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 22828e4..3ac1686 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -857,8 +857,8 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
 }
 
 struct ext4_ext_path *
-ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
-		     struct ext4_ext_path **orig_path, int flags)
+ext4_find_extent(struct inode *inode, ext4_lblk_t block,
+		 struct ext4_ext_path **orig_path, int flags)
 {
 	struct ext4_extent_header *eh;
 	struct buffer_head *bh;
@@ -1366,7 +1366,7 @@ repeat:
 			goto out;
 
 		/* refill path */
-		path = ext4_ext_find_extent(inode,
+		path = ext4_find_extent(inode,
 				    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
 				    ppath, gb_flags);
 		if (IS_ERR(path))
@@ -1378,7 +1378,7 @@ repeat:
 			goto out;
 
 		/* refill path */
-		path = ext4_ext_find_extent(inode,
+		path = ext4_find_extent(inode,
 				   (ext4_lblk_t)le32_to_cpu(newext->ee_block),
 				    ppath, gb_flags);
 		if (IS_ERR(path)) {
@@ -1951,7 +1951,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 		/*
 		 * Try to see whether we should rather test the extent on
 		 * right from ex, or from the left of ex. This is because
-		 * ext4_ext_find_extent() can return either extent on the
+		 * ext4_find_extent() can return either extent on the
 		 * left, or on the right from the searched position. This
 		 * will make merging more effective.
 		 */
@@ -2034,7 +2034,7 @@ prepend:
 	if (next != EXT_MAX_BLOCKS) {
 		ext_debug("next leaf block - %u\n", next);
 		BUG_ON(npath != NULL);
-		npath = ext4_ext_find_extent(inode, next, NULL, 0);
+		npath = ext4_find_extent(inode, next, NULL, 0);
 		if (IS_ERR(npath))
 			return PTR_ERR(npath);
 		BUG_ON(npath->p_depth != path->p_depth);
@@ -2157,7 +2157,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
 		/* find extent for this block */
 		down_read(&EXT4_I(inode)->i_data_sem);
 
-		path = ext4_ext_find_extent(inode, block, &path, 0);
+		path = ext4_find_extent(inode, block, &path, 0);
 		if (IS_ERR(path)) {
 			up_read(&EXT4_I(inode)->i_data_sem);
 			err = PTR_ERR(path);
@@ -2840,7 +2840,7 @@ again:
 		ext4_lblk_t ee_block;
 
 		/* find extent for this block */
-		path = ext4_ext_find_extent(inode, end, NULL, EXT4_EX_NOCACHE);
+		path = ext4_find_extent(inode, end, NULL, EXT4_EX_NOCACHE);
 		if (IS_ERR(path)) {
 			ext4_journal_stop(handle);
 			return PTR_ERR(path);
@@ -3314,7 +3314,7 @@ static int ext4_split_extent(handle_t *handle,
 	 * Update path is required because previous ext4_split_extent_at() may
 	 * result in split of original leaf or extent zeroout.
 	 */
-	path = ext4_ext_find_extent(inode, map->m_lblk, ppath, 0);
+	path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
 	if (IS_ERR(path))
 		return PTR_ERR(path);
 	depth = ext_depth(inode);
@@ -3712,7 +3712,7 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
 						 EXT4_GET_BLOCKS_CONVERT);
 		if (err < 0)
 			return err;
-		path = ext4_ext_find_extent(inode, map->m_lblk, ppath, 0);
+		path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
 		if (IS_ERR(path))
 			return PTR_ERR(path);
 		depth = ext_depth(inode);
@@ -3940,7 +3940,7 @@ convert_initialized_extent(handle_t *handle, struct inode *inode,
 				EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
 		if (err < 0)
 			return err;
-		path = ext4_ext_find_extent(inode, map->m_lblk, ppath, 0);
+		path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
 		if (IS_ERR(path))
 			return PTR_ERR(path);
 		depth = ext_depth(inode);
@@ -4266,7 +4266,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
 
 	/* find extent for this block */
-	path = ext4_ext_find_extent(inode, map->m_lblk, NULL, 0);
+	path = ext4_find_extent(inode, map->m_lblk, NULL, 0);
 	if (IS_ERR(path)) {
 		err = PTR_ERR(path);
 		path = NULL;
@@ -4278,7 +4278,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	/*
 	 * consistent leaf must not be empty;
 	 * this situation is possible, though, _during_ tree modification;
-	 * this is why assert can't be put in ext4_ext_find_extent()
+	 * this is why assert can't be put in ext4_find_extent()
 	 */
 	if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
 		EXT4_ERROR_INODE(inode, "bad extent address "
@@ -4363,7 +4363,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 
 	/*
 	 * If we are doing bigalloc, check to see if the extent returned
-	 * by ext4_ext_find_extent() implies a cluster we can use.
+	 * by ext4_find_extent() implies a cluster we can use.
 	 */
 	if (cluster_offset && ex &&
 	    get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
@@ -5300,7 +5300,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
 	ext4_lblk_t ex_start, ex_end;
 
 	/* Let path point to the last extent */
-	path = ext4_ext_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0);
+	path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0);
 	if (IS_ERR(path))
 		return PTR_ERR(path);
 
@@ -5320,7 +5320,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
 	 * Don't start shifting extents until we make sure the hole is big
 	 * enough to accomodate the shift.
 	 */
-	path = ext4_ext_find_extent(inode, start - 1, &path, 0);
+	path = ext4_find_extent(inode, start - 1, &path, 0);
 	if (IS_ERR(path))
 		return PTR_ERR(path);
 	depth = path->p_depth;
@@ -5340,7 +5340,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
 
 	/* Its safe to start updating extents */
 	while (start < stop_block) {
-		path = ext4_ext_find_extent(inode, start, &path, 0);
+		path = ext4_find_extent(inode, start, &path, 0);
 		if (IS_ERR(path))
 			return PTR_ERR(path);
 		depth = path->p_depth;
@@ -5537,7 +5537,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
 		int e1_len, e2_len, len;
 		int split = 0;
 
-		path1 = ext4_ext_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE);
+		path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE);
 		if (unlikely(IS_ERR(path1))) {
 			*erp = PTR_ERR(path1);
 			path1 = NULL;
@@ -5545,7 +5545,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
 			count = 0;
 			goto repeat;
 		}
-		path2 = ext4_ext_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE);
+		path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE);
 		if (unlikely(IS_ERR(path2))) {
 			*erp = PTR_ERR(path2);
 			path2 = NULL;
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 8ffff96..bdd400c 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -426,7 +426,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
 	unsigned short ee_len;
 	int depth, ee_status, es_status;
 
-	path = ext4_ext_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE);
+	path = ext4_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE);
 	if (IS_ERR(path))
 		return;
 
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 061c300..a432634 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -41,8 +41,7 @@ static int finish_range(handle_t *handle, struct inode *inode,
 	ext4_ext_store_pblock(&newext, lb->first_pblock);
 	/* Locking only for convinience since we are operating on temp inode */
 	down_write(&EXT4_I(inode)->i_data_sem);
-	path = ext4_ext_find_extent(inode, lb->first_block, NULL, 0);
-
+	path = ext4_find_extent(inode, lb->first_block, NULL, 0);
 	if (IS_ERR(path)) {
 		retval = PTR_ERR(path);
 		path = NULL;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 7bf970d..5d78063 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -27,7 +27,7 @@
  * @lblock:	logical block number to find an extent path
  * @path:	pointer to an extent path pointer (for output)
  *
- * ext4_ext_find_extent wrapper. Return 0 on success, or a negative error value
+ * ext4_find_extent wrapper. Return 0 on success, or a negative error value
  * on failure.
  */
 static inline int
@@ -36,7 +36,7 @@ get_ext_path(struct inode *inode, ext4_lblk_t lblock,
 {
 	struct ext4_ext_path *path;
 
-	path = ext4_ext_find_extent(inode, lblock, ppath, EXT4_EX_NOCACHE);
+	path = ext4_find_extent(inode, lblock, ppath, EXT4_EX_NOCACHE);
 	if (IS_ERR(path))
 		return PTR_ERR(path);
 	if (path[ext_depth(inode)].p_ext == NULL) {
-- 
cgit v1.1


From be1158cc615fd723552f0d9912087423c7cadda5 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 1 Sep 2014 21:19:01 -0400
Subject: jbd2: fold __process_buffer() into jbd2_log_do_checkpoint()

__process_buffer() is only called by jbd2_log_do_checkpoint(), and it
had a very complex locking protocol where it would be called with the
j_list_lock, and sometimes exit with the lock held (if the return code
was 0), or release the lock.

This was confusing both to humans and to smatch (which erronously
complained that the lock was taken twice).

Folding __process_buffer() to the caller allows us to simplify the
control flow, making the resulting function easier to read and reason
about, and dropping the compiled size of fs/jbd2/checkpoint.c by 150
bytes (over 4% of the text size).

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 fs/jbd2/checkpoint.c | 195 ++++++++++++++++++++++-----------------------------
 1 file changed, 84 insertions(+), 111 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 7f34f47..f1507e5 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -255,81 +255,6 @@ __flush_batch(journal_t *journal, int *batch_count)
 }
 
 /*
- * Try to flush one buffer from the checkpoint list to disk.
- *
- * Return 1 if something happened which requires us to abort the current
- * scan of the checkpoint list.  Return <0 if the buffer has failed to
- * be written out.
- *
- * Called with j_list_lock held and drops it if 1 is returned
- */
-static int __process_buffer(journal_t *journal, struct journal_head *jh,
-			    int *batch_count, transaction_t *transaction)
-{
-	struct buffer_head *bh = jh2bh(jh);
-	int ret = 0;
-
-	if (buffer_locked(bh)) {
-		get_bh(bh);
-		spin_unlock(&journal->j_list_lock);
-		wait_on_buffer(bh);
-		/* the journal_head may have gone by now */
-		BUFFER_TRACE(bh, "brelse");
-		__brelse(bh);
-		ret = 1;
-	} else if (jh->b_transaction != NULL) {
-		transaction_t *t = jh->b_transaction;
-		tid_t tid = t->t_tid;
-
-		transaction->t_chp_stats.cs_forced_to_close++;
-		spin_unlock(&journal->j_list_lock);
-		if (unlikely(journal->j_flags & JBD2_UNMOUNT))
-			/*
-			 * The journal thread is dead; so starting and
-			 * waiting for a commit to finish will cause
-			 * us to wait for a _very_ long time.
-			 */
-			printk(KERN_ERR "JBD2: %s: "
-			       "Waiting for Godot: block %llu\n",
-			       journal->j_devname,
-			       (unsigned long long) bh->b_blocknr);
-		jbd2_log_start_commit(journal, tid);
-		jbd2_log_wait_commit(journal, tid);
-		ret = 1;
-	} else if (!buffer_dirty(bh)) {
-		ret = 1;
-		if (unlikely(buffer_write_io_error(bh)))
-			ret = -EIO;
-		get_bh(bh);
-		BUFFER_TRACE(bh, "remove from checkpoint");
-		__jbd2_journal_remove_checkpoint(jh);
-		spin_unlock(&journal->j_list_lock);
-		__brelse(bh);
-	} else {
-		/*
-		 * Important: we are about to write the buffer, and
-		 * possibly block, while still holding the journal lock.
-		 * We cannot afford to let the transaction logic start
-		 * messing around with this buffer before we write it to
-		 * disk, as that would break recoverability.
-		 */
-		BUFFER_TRACE(bh, "queue");
-		get_bh(bh);
-		J_ASSERT_BH(bh, !buffer_jwrite(bh));
-		journal->j_chkpt_bhs[*batch_count] = bh;
-		__buffer_relink_io(jh);
-		transaction->t_chp_stats.cs_written++;
-		(*batch_count)++;
-		if (*batch_count == JBD2_NR_BATCH) {
-			spin_unlock(&journal->j_list_lock);
-			__flush_batch(journal, batch_count);
-			ret = 1;
-		}
-	}
-	return ret;
-}
-
-/*
  * Perform an actual checkpoint. We take the first transaction on the
  * list of transactions to be checkpointed and send all its buffers
  * to disk. We submit larger chunks of data at once.
@@ -339,9 +264,11 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
  */
 int jbd2_log_do_checkpoint(journal_t *journal)
 {
-	transaction_t *transaction;
-	tid_t this_tid;
-	int result;
+	struct journal_head	*jh;
+	struct buffer_head	*bh;
+	transaction_t		*transaction;
+	tid_t			this_tid;
+	int			err, result, batch_count = 0;
 
 	jbd_debug(1, "Start checkpoint\n");
 
@@ -374,46 +301,92 @@ restart:
 	 * done (maybe it's a new transaction, but it fell at the same
 	 * address).
 	 */
-	if (journal->j_checkpoint_transactions == transaction &&
-			transaction->t_tid == this_tid) {
-		int batch_count = 0;
-		struct journal_head *jh;
-		int retry = 0, err;
-
-		while (!retry && transaction->t_checkpoint_list) {
-			jh = transaction->t_checkpoint_list;
-			retry = __process_buffer(journal, jh, &batch_count,
-						 transaction);
-			if (retry < 0 && !result)
-				result = retry;
-			if (!retry && (need_resched() ||
-				spin_needbreak(&journal->j_list_lock))) {
-				spin_unlock(&journal->j_list_lock);
-				retry = 1;
-				break;
-			}
-		}
+	if (journal->j_checkpoint_transactions != transaction ||
+	    transaction->t_tid != this_tid)
+		goto out;
 
-		if (batch_count) {
-			if (!retry) {
-				spin_unlock(&journal->j_list_lock);
-				retry = 1;
-			}
-			__flush_batch(journal, &batch_count);
+	/* checkpoint all of the transaction's buffers */
+	while (transaction->t_checkpoint_list) {
+		jh = transaction->t_checkpoint_list;
+		bh = jh2bh(jh);
+
+		if (buffer_locked(bh)) {
+			spin_unlock(&journal->j_list_lock);
+			get_bh(bh);
+			wait_on_buffer(bh);
+			/* the journal_head may have gone by now */
+			BUFFER_TRACE(bh, "brelse");
+			__brelse(bh);
+			goto retry;
 		}
+		if (jh->b_transaction != NULL) {
+			transaction_t *t = jh->b_transaction;
+			tid_t tid = t->t_tid;
 
-		if (retry) {
-			spin_lock(&journal->j_list_lock);
-			goto restart;
+			transaction->t_chp_stats.cs_forced_to_close++;
+			spin_unlock(&journal->j_list_lock);
+			if (unlikely(journal->j_flags & JBD2_UNMOUNT))
+				/*
+				 * The journal thread is dead; so
+				 * starting and waiting for a commit
+				 * to finish will cause us to wait for
+				 * a _very_ long time.
+				 */
+				printk(KERN_ERR
+		"JBD2: %s: Waiting for Godot: block %llu\n",
+		journal->j_devname, (unsigned long long) bh->b_blocknr);
+
+			jbd2_log_start_commit(journal, tid);
+			jbd2_log_wait_commit(journal, tid);
+			goto retry;
+		}
+		if (!buffer_dirty(bh)) {
+			if (unlikely(buffer_write_io_error(bh)) && !result)
+				result = -EIO;
+			get_bh(bh);
+			BUFFER_TRACE(bh, "remove from checkpoint");
+			__jbd2_journal_remove_checkpoint(jh);
+			spin_unlock(&journal->j_list_lock);
+			__brelse(bh);
+			goto retry;
 		}
 		/*
-		 * Now we have cleaned up the first transaction's checkpoint
-		 * list. Let's clean up the second one
+		 * Important: we are about to write the buffer, and
+		 * possibly block, while still holding the journal
+		 * lock.  We cannot afford to let the transaction
+		 * logic start messing around with this buffer before
+		 * we write it to disk, as that would break
+		 * recoverability.
 		 */
-		err = __wait_cp_io(journal, transaction);
-		if (!result)
-			result = err;
+		BUFFER_TRACE(bh, "queue");
+		get_bh(bh);
+		J_ASSERT_BH(bh, !buffer_jwrite(bh));
+		journal->j_chkpt_bhs[batch_count++] = bh;
+		__buffer_relink_io(jh);
+		transaction->t_chp_stats.cs_written++;
+		if ((batch_count == JBD2_NR_BATCH) ||
+		    need_resched() ||
+		    spin_needbreak(&journal->j_list_lock))
+			goto unlock_and_flush;
 	}
+
+	if (batch_count) {
+		unlock_and_flush:
+			spin_unlock(&journal->j_list_lock);
+		retry:
+			if (batch_count)
+				__flush_batch(journal, &batch_count);
+			spin_lock(&journal->j_list_lock);
+			goto restart;
+	}
+
+	/*
+	 * Now we issued all of the transaction's buffers, let's deal
+	 * with the buffers that are out for I/O.
+	 */
+	err = __wait_cp_io(journal, transaction);
+	if (!result)
+		result = err;
 out:
 	spin_unlock(&journal->j_list_lock);
 	if (result < 0)
-- 
cgit v1.1


From 88fe1acb5bedfcba5f42fcdf165493ee587ba643 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 1 Sep 2014 21:26:09 -0400
Subject: jbd2: fold __wait_cp_io into jbd2_log_do_checkpoint()

__wait_cp_io() is only called by jbd2_log_do_checkpoint().  Fold it in
to make it a bit easier to understand.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/jbd2/checkpoint.c | 87 +++++++++++++++++++---------------------------------
 1 file changed, 31 insertions(+), 56 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index f1507e5..18c7a8d 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -183,58 +183,6 @@ void __jbd2_log_wait_for_space(journal_t *journal)
 	}
 }
 
-/*
- * Clean up transaction's list of buffers submitted for io.
- * We wait for any pending IO to complete and remove any clean
- * buffers. Note that we take the buffers in the opposite ordering
- * from the one in which they were submitted for IO.
- *
- * Return 0 on success, and return <0 if some buffers have failed
- * to be written out.
- *
- * Called with j_list_lock held.
- */
-static int __wait_cp_io(journal_t *journal, transaction_t *transaction)
-{
-	struct journal_head *jh;
-	struct buffer_head *bh;
-	tid_t this_tid;
-	int released = 0;
-	int ret = 0;
-
-	this_tid = transaction->t_tid;
-restart:
-	/* Did somebody clean up the transaction in the meanwhile? */
-	if (journal->j_checkpoint_transactions != transaction ||
-			transaction->t_tid != this_tid)
-		return ret;
-	while (!released && transaction->t_checkpoint_io_list) {
-		jh = transaction->t_checkpoint_io_list;
-		bh = jh2bh(jh);
-		get_bh(bh);
-		if (buffer_locked(bh)) {
-			spin_unlock(&journal->j_list_lock);
-			wait_on_buffer(bh);
-			/* the journal_head may have gone by now */
-			BUFFER_TRACE(bh, "brelse");
-			__brelse(bh);
-			spin_lock(&journal->j_list_lock);
-			goto restart;
-		}
-		if (unlikely(buffer_write_io_error(bh)))
-			ret = -EIO;
-
-		/*
-		 * Now in whatever state the buffer currently is, we know that
-		 * it has been written out and so we can drop it from the list
-		 */
-		released = __jbd2_journal_remove_checkpoint(jh);
-		__brelse(bh);
-	}
-
-	return ret;
-}
-
 static void
 __flush_batch(journal_t *journal, int *batch_count)
 {
@@ -268,7 +216,7 @@ int jbd2_log_do_checkpoint(journal_t *journal)
 	struct buffer_head	*bh;
 	transaction_t		*transaction;
 	tid_t			this_tid;
-	int			err, result, batch_count = 0;
+	int			result, batch_count = 0, done = 0;
 
 	jbd_debug(1, "Start checkpoint\n");
 
@@ -384,9 +332,36 @@ restart:
 	 * Now we issued all of the transaction's buffers, let's deal
 	 * with the buffers that are out for I/O.
 	 */
-	err = __wait_cp_io(journal, transaction);
-	if (!result)
-		result = err;
+restart2:
+	/* Did somebody clean up the transaction in the meanwhile? */
+	if (journal->j_checkpoint_transactions != transaction ||
+	    transaction->t_tid != this_tid)
+		goto out;
+
+	while (!done && transaction->t_checkpoint_io_list) {
+		jh = transaction->t_checkpoint_io_list;
+		bh = jh2bh(jh);
+		get_bh(bh);
+		if (buffer_locked(bh)) {
+			spin_unlock(&journal->j_list_lock);
+			wait_on_buffer(bh);
+			/* the journal_head may have gone by now */
+			BUFFER_TRACE(bh, "brelse");
+			__brelse(bh);
+			spin_lock(&journal->j_list_lock);
+			goto restart2;
+		}
+		if (unlikely(buffer_write_io_error(bh)) && !result)
+			result = -EIO;
+
+		/*
+		 * Now in whatever state the buffer currently is, we
+		 * know that it has been written out and so we can
+		 * drop it from the list
+		 */
+		done = __jbd2_journal_remove_checkpoint(jh);
+		__brelse(bh);
+	}
 out:
 	spin_unlock(&journal->j_list_lock);
 	if (result < 0)
-- 
cgit v1.1


From 45f1a9c3f63db3d4562c16062a51740801fbd88c Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Mon, 1 Sep 2014 21:34:09 -0400
Subject: ext4: enable block_validity by default

Enable by default the block_validity feature, which checks for
collisions between newly allocated blocks and critical system
metadata.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/super.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 896e452..7194a51 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3519,8 +3519,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		set_opt(sb, ERRORS_CONT);
 	else
 		set_opt(sb, ERRORS_RO);
-	if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY)
-		set_opt(sb, BLOCK_VALIDITY);
+	/* block_validity enabled by default; disable with noblock_validity */
+	set_opt(sb, BLOCK_VALIDITY);
 	if (def_mount_opts & EXT4_DEFM_DISCARD)
 		set_opt(sb, DISCARD);
 
-- 
cgit v1.1


From d91bd2c1d78d8d22f9f721aae84650a08239b509 Mon Sep 17 00:00:00 2001
From: Seunghun Lee <waydi1@gmail.com>
Date: Mon, 1 Sep 2014 22:15:30 -0400
Subject: ext4: fix comments about get_blocks

get_blocks is renamed to get_block.

Signed-off-by: Seunghun Lee <waydi1@gmail.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/inode.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index cc95dca..4a16b0c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -590,7 +590,7 @@ found:
 	/*
 	 * New blocks allocate and/or writing to unwritten extent
 	 * will possibly result in updating i_data, so we take
-	 * the write lock of i_data_sem, and call get_blocks()
+	 * the write lock of i_data_sem, and call get_block()
 	 * with create == 1 flag.
 	 */
 	down_write(&EXT4_I(inode)->i_data_sem);
@@ -1529,7 +1529,7 @@ out_unlock:
 }
 
 /*
- * This is a special get_blocks_t callback which is used by
+ * This is a special get_block_t callback which is used by
  * ext4_da_write_begin().  It will either return mapped block or
  * reserve space for a single block.
  *
-- 
cgit v1.1


From e963bb1de415ab06693357336c1bec664753e1e2 Mon Sep 17 00:00:00 2001
From: Zheng Liu <wenqing.lz@taobao.com>
Date: Mon, 1 Sep 2014 22:22:13 -0400
Subject: ext4: improve extents status tree trace point

This commit improves the trace point of extents status tree.  We rename
trace_ext4_es_shrink_enter in ext4_es_count() because it is also used
in ext4_es_scan() and we can not identify them from the result.

Further this commit fixes a variable name in trace point in order to
keep consistency with others.

Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Jan Kara <jack@suse.cz>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/extents_status.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index bdd400c..95da65c 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -1019,7 +1019,7 @@ static unsigned long ext4_es_count(struct shrinker *shrink,
 
 	sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker);
 	nr = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
-	trace_ext4_es_shrink_enter(sbi->s_sb, sc->nr_to_scan, nr);
+	trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr);
 	return nr;
 }
 
@@ -1032,14 +1032,14 @@ static unsigned long ext4_es_scan(struct shrinker *shrink,
 	int ret, nr_shrunk;
 
 	ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
-	trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret);
+	trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret);
 
 	if (!nr_to_scan)
 		return ret;
 
 	nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL);
 
-	trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret);
+	trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret);
 	return nr_shrunk;
 }
 
-- 
cgit v1.1


From eb68d0e2fc5a4e5c06324ea5f485fccbae626d05 Mon Sep 17 00:00:00 2001
From: Zheng Liu <wenqing.lz@taobao.com>
Date: Mon, 1 Sep 2014 22:26:49 -0400
Subject: ext4: track extent status tree shrinker delay statictics

This commit adds some statictics in extent status tree shrinker.  The
purpose to add these is that we want to collect more details when we
encounter a stall caused by extent status tree shrinker.  Here we count
the following statictics:
  stats:
    the number of all objects on all extent status trees
    the number of reclaimable objects on lru list
    cache hits/misses
    the last sorted interval
    the number of inodes on lru list
  average:
    scan time for shrinking some objects
    the number of shrunk objects
  maximum:
    the inode that has max nr. of objects on lru list
    the maximum scan time for shrinking some objects

The output looks like below:
  $ cat /proc/fs/ext4/sda1/es_shrinker_info
  stats:
    28228 objects
    6341 reclaimable objects
    5281/631 cache hits/misses
    586 ms last sorted interval
    250 inodes on lru list
  average:
    153 us scan time
    128 shrunk objects
  maximum:
    255 inode (255 objects, 198 reclaimable)
    125723 us max scan time

If the lru list has never been sorted, the following line will not be
printed:
    586ms last sorted interval
If there is an empty lru list, the following lines also will not be
printed:
    250 inodes on lru list
  ...
  maximum:
    255 inode (255 objects, 198 reclaimable)
    0 us max scan time

Meanwhile in this commit a new trace point is defined to print some
details in __ext4_es_shrink().

Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Jan Kara <jack@suse.cz>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4.h           |   4 +-
 fs/ext4/extents_status.c | 186 ++++++++++++++++++++++++++++++++++++++++++++---
 fs/ext4/extents_status.h |  13 +++-
 fs/ext4/super.c          |  11 +--
 4 files changed, 193 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index c07f43f..00fd822 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -891,6 +891,7 @@ struct ext4_inode_info {
 	struct ext4_es_tree i_es_tree;
 	rwlock_t i_es_lock;
 	struct list_head i_es_lru;
+	unsigned int i_es_all_nr;	/* protected by i_es_lock */
 	unsigned int i_es_lru_nr;	/* protected by i_es_lock */
 	unsigned long i_touch_when;	/* jiffies of last accessing */
 
@@ -1331,8 +1332,7 @@ struct ext4_sb_info {
 	/* Reclaim extents from extent status tree */
 	struct shrinker s_es_shrinker;
 	struct list_head s_es_lru;
-	unsigned long s_es_last_sorted;
-	struct percpu_counter s_extent_cache_cnt;
+	struct ext4_es_stats s_es_stats;
 	struct mb_cache *s_mb_cache;
 	spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
 
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 95da65c..09fd576 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -11,6 +11,8 @@
  */
 #include <linux/rbtree.h>
 #include <linux/list_sort.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
 #include "ext4.h"
 #include "extents_status.h"
 
@@ -313,19 +315,27 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
 	 */
 	if (!ext4_es_is_delayed(es)) {
 		EXT4_I(inode)->i_es_lru_nr++;
-		percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt);
+		percpu_counter_inc(&EXT4_SB(inode->i_sb)->
+					s_es_stats.es_stats_lru_cnt);
 	}
 
+	EXT4_I(inode)->i_es_all_nr++;
+	percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
+
 	return es;
 }
 
 static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
 {
+	EXT4_I(inode)->i_es_all_nr--;
+	percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
+
 	/* Decrease the lru counter when this es is not delayed */
 	if (!ext4_es_is_delayed(es)) {
 		BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0);
 		EXT4_I(inode)->i_es_lru_nr--;
-		percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt);
+		percpu_counter_dec(&EXT4_SB(inode->i_sb)->
+					s_es_stats.es_stats_lru_cnt);
 	}
 
 	kmem_cache_free(ext4_es_cachep, es);
@@ -729,6 +739,7 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
 			  struct extent_status *es)
 {
 	struct ext4_es_tree *tree;
+	struct ext4_es_stats *stats;
 	struct extent_status *es1 = NULL;
 	struct rb_node *node;
 	int found = 0;
@@ -765,11 +776,15 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
 	}
 
 out:
+	stats = &EXT4_SB(inode->i_sb)->s_es_stats;
 	if (found) {
 		BUG_ON(!es1);
 		es->es_lblk = es1->es_lblk;
 		es->es_len = es1->es_len;
 		es->es_pblk = es1->es_pblk;
+		stats->es_stats_cache_hits++;
+	} else {
+		stats->es_stats_cache_misses++;
 	}
 
 	read_unlock(&EXT4_I(inode)->i_es_lock);
@@ -931,11 +946,16 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
 			    struct ext4_inode_info *locked_ei)
 {
 	struct ext4_inode_info *ei;
+	struct ext4_es_stats *es_stats;
 	struct list_head *cur, *tmp;
 	LIST_HEAD(skipped);
+	ktime_t start_time;
+	u64 scan_time;
 	int nr_shrunk = 0;
 	int retried = 0, skip_precached = 1, nr_skipped = 0;
 
+	es_stats = &sbi->s_es_stats;
+	start_time = ktime_get();
 	spin_lock(&sbi->s_es_lru_lock);
 
 retry:
@@ -946,7 +966,8 @@ retry:
 		 * If we have already reclaimed all extents from extent
 		 * status tree, just stop the loop immediately.
 		 */
-		if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0)
+		if (percpu_counter_read_positive(
+				&es_stats->es_stats_lru_cnt) == 0)
 			break;
 
 		ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
@@ -956,7 +977,7 @@ retry:
 		 * time.  Normally we try hard to avoid shrinking
 		 * precached inodes, but we will as a last resort.
 		 */
-		if ((sbi->s_es_last_sorted < ei->i_touch_when) ||
+		if ((es_stats->es_stats_last_sorted < ei->i_touch_when) ||
 		    (skip_precached && ext4_test_inode_state(&ei->vfs_inode,
 						EXT4_STATE_EXT_PRECACHED))) {
 			nr_skipped++;
@@ -990,7 +1011,7 @@ retry:
 	if ((nr_shrunk == 0) && nr_skipped && !retried) {
 		retried++;
 		list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
-		sbi->s_es_last_sorted = jiffies;
+		es_stats->es_stats_last_sorted = jiffies;
 		ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info,
 				      i_es_lru);
 		/*
@@ -1008,6 +1029,22 @@ retry:
 	if (locked_ei && nr_shrunk == 0)
 		nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan);
 
+	scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
+	if (likely(es_stats->es_stats_scan_time))
+		es_stats->es_stats_scan_time = (scan_time +
+				es_stats->es_stats_scan_time*3) / 4;
+	else
+		es_stats->es_stats_scan_time = scan_time;
+	if (scan_time > es_stats->es_stats_max_scan_time)
+		es_stats->es_stats_max_scan_time = scan_time;
+	if (likely(es_stats->es_stats_shrunk))
+		es_stats->es_stats_shrunk = (nr_shrunk +
+				es_stats->es_stats_shrunk*3) / 4;
+	else
+		es_stats->es_stats_shrunk = nr_shrunk;
+
+	trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time, skip_precached,
+			     nr_skipped, retried);
 	return nr_shrunk;
 }
 
@@ -1018,7 +1055,7 @@ static unsigned long ext4_es_count(struct shrinker *shrink,
 	struct ext4_sb_info *sbi;
 
 	sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker);
-	nr = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
+	nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt);
 	trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr);
 	return nr;
 }
@@ -1031,7 +1068,7 @@ static unsigned long ext4_es_scan(struct shrinker *shrink,
 	int nr_to_scan = sc->nr_to_scan;
 	int ret, nr_shrunk;
 
-	ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
+	ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt);
 	trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret);
 
 	if (!nr_to_scan)
@@ -1043,19 +1080,148 @@ static unsigned long ext4_es_scan(struct shrinker *shrink,
 	return nr_shrunk;
 }
 
-void ext4_es_register_shrinker(struct ext4_sb_info *sbi)
+static void *ext4_es_seq_shrinker_info_start(struct seq_file *seq, loff_t *pos)
+{
+	return *pos ? NULL : SEQ_START_TOKEN;
+}
+
+static void *
+ext4_es_seq_shrinker_info_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	return NULL;
+}
+
+static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v)
+{
+	struct ext4_sb_info *sbi = seq->private;
+	struct ext4_es_stats *es_stats = &sbi->s_es_stats;
+	struct ext4_inode_info *ei, *max = NULL;
+	unsigned int inode_cnt = 0;
+
+	if (v != SEQ_START_TOKEN)
+		return 0;
+
+	/* here we just find an inode that has the max nr. of objects */
+	spin_lock(&sbi->s_es_lru_lock);
+	list_for_each_entry(ei, &sbi->s_es_lru, i_es_lru) {
+		inode_cnt++;
+		if (max && max->i_es_all_nr < ei->i_es_all_nr)
+			max = ei;
+		else if (!max)
+			max = ei;
+	}
+	spin_unlock(&sbi->s_es_lru_lock);
+
+	seq_printf(seq, "stats:\n  %lld objects\n  %lld reclaimable objects\n",
+		   percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
+		   percpu_counter_sum_positive(&es_stats->es_stats_lru_cnt));
+	seq_printf(seq, "  %lu/%lu cache hits/misses\n",
+		   es_stats->es_stats_cache_hits,
+		   es_stats->es_stats_cache_misses);
+	if (es_stats->es_stats_last_sorted != 0)
+		seq_printf(seq, "  %u ms last sorted interval\n",
+			   jiffies_to_msecs(jiffies -
+					    es_stats->es_stats_last_sorted));
+	if (inode_cnt)
+		seq_printf(seq, "  %d inodes on lru list\n", inode_cnt);
+
+	seq_printf(seq, "average:\n  %llu us scan time\n",
+	    div_u64(es_stats->es_stats_scan_time, 1000));
+	seq_printf(seq, "  %lu shrunk objects\n", es_stats->es_stats_shrunk);
+	if (inode_cnt)
+		seq_printf(seq,
+		    "maximum:\n  %lu inode (%u objects, %u reclaimable)\n"
+		    "  %llu us max scan time\n",
+		    max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_lru_nr,
+		    div_u64(es_stats->es_stats_max_scan_time, 1000));
+
+	return 0;
+}
+
+static void ext4_es_seq_shrinker_info_stop(struct seq_file *seq, void *v)
+{
+}
+
+static const struct seq_operations ext4_es_seq_shrinker_info_ops = {
+	.start = ext4_es_seq_shrinker_info_start,
+	.next  = ext4_es_seq_shrinker_info_next,
+	.stop  = ext4_es_seq_shrinker_info_stop,
+	.show  = ext4_es_seq_shrinker_info_show,
+};
+
+static int
+ext4_es_seq_shrinker_info_open(struct inode *inode, struct file *file)
+{
+	int ret;
+
+	ret = seq_open(file, &ext4_es_seq_shrinker_info_ops);
+	if (!ret) {
+		struct seq_file *m = file->private_data;
+		m->private = PDE_DATA(inode);
+	}
+
+	return ret;
+}
+
+static int
+ext4_es_seq_shrinker_info_release(struct inode *inode, struct file *file)
+{
+	return seq_release(inode, file);
+}
+
+static const struct file_operations ext4_es_seq_shrinker_info_fops = {
+	.owner		= THIS_MODULE,
+	.open		= ext4_es_seq_shrinker_info_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= ext4_es_seq_shrinker_info_release,
+};
+
+int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
 {
+	int err;
+
 	INIT_LIST_HEAD(&sbi->s_es_lru);
 	spin_lock_init(&sbi->s_es_lru_lock);
-	sbi->s_es_last_sorted = 0;
+	sbi->s_es_stats.es_stats_last_sorted = 0;
+	sbi->s_es_stats.es_stats_shrunk = 0;
+	sbi->s_es_stats.es_stats_cache_hits = 0;
+	sbi->s_es_stats.es_stats_cache_misses = 0;
+	sbi->s_es_stats.es_stats_scan_time = 0;
+	sbi->s_es_stats.es_stats_max_scan_time = 0;
+	err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0);
+	if (err)
+		return err;
+	err = percpu_counter_init(&sbi->s_es_stats.es_stats_lru_cnt, 0);
+	if (err)
+		goto err1;
+
 	sbi->s_es_shrinker.scan_objects = ext4_es_scan;
 	sbi->s_es_shrinker.count_objects = ext4_es_count;
 	sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
-	register_shrinker(&sbi->s_es_shrinker);
+	err = register_shrinker(&sbi->s_es_shrinker);
+	if (err)
+		goto err2;
+
+	if (sbi->s_proc)
+		proc_create_data("es_shrinker_info", S_IRUGO, sbi->s_proc,
+				 &ext4_es_seq_shrinker_info_fops, sbi);
+
+	return 0;
+
+err2:
+	percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt);
+err1:
+	percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
+	return err;
 }
 
 void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
 {
+	if (sbi->s_proc)
+		remove_proc_entry("es_shrinker_info", sbi->s_proc);
+	percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
+	percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt);
 	unregister_shrinker(&sbi->s_es_shrinker);
 }
 
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index f1b62a4..efd5f97 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -64,6 +64,17 @@ struct ext4_es_tree {
 	struct extent_status *cache_es;	/* recently accessed extent */
 };
 
+struct ext4_es_stats {
+	unsigned long es_stats_last_sorted;
+	unsigned long es_stats_shrunk;
+	unsigned long es_stats_cache_hits;
+	unsigned long es_stats_cache_misses;
+	u64 es_stats_scan_time;
+	u64 es_stats_max_scan_time;
+	struct percpu_counter es_stats_all_cnt;
+	struct percpu_counter es_stats_lru_cnt;
+};
+
 extern int __init ext4_init_es(void);
 extern void ext4_exit_es(void);
 extern void ext4_es_init_tree(struct ext4_es_tree *tree);
@@ -138,7 +149,7 @@ static inline void ext4_es_store_pblock_status(struct extent_status *es,
 		       (pb & ~ES_MASK));
 }
 
-extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi);
+extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi);
 extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
 extern void ext4_es_lru_add(struct inode *inode);
 extern void ext4_es_lru_del(struct inode *inode);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 7194a51..487c65b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -820,7 +820,6 @@ static void ext4_put_super(struct super_block *sb)
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
 	percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
-	percpu_counter_destroy(&sbi->s_extent_cache_cnt);
 	brelse(sbi->s_sbh);
 #ifdef CONFIG_QUOTA
 	for (i = 0; i < MAXQUOTAS; i++)
@@ -885,6 +884,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 	ext4_es_init_tree(&ei->i_es_tree);
 	rwlock_init(&ei->i_es_lock);
 	INIT_LIST_HEAD(&ei->i_es_lru);
+	ei->i_es_all_nr = 0;
 	ei->i_es_lru_nr = 0;
 	ei->i_touch_when = 0;
 	ei->i_reserved_data_blocks = 0;
@@ -3890,12 +3890,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_err_report.data = (unsigned long) sb;
 
 	/* Register extent status tree shrinker */
-	ext4_es_register_shrinker(sbi);
-
-	if ((err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0)) != 0) {
-		ext4_msg(sb, KERN_ERR, "insufficient memory");
+	if (ext4_es_register_shrinker(sbi))
 		goto failed_mount3;
-	}
 
 	sbi->s_stripe = ext4_get_stripe_size(sbi);
 	sbi->s_extent_max_zeroout_kb = 32;
@@ -4225,10 +4221,9 @@ failed_mount_wq:
 		jbd2_journal_destroy(sbi->s_journal);
 		sbi->s_journal = NULL;
 	}
-failed_mount3:
 	ext4_es_unregister_shrinker(sbi);
+failed_mount3:
 	del_timer_sync(&sbi->s_err_report);
-	percpu_counter_destroy(&sbi->s_extent_cache_cnt);
 	if (sbi->s_mmp_tsk)
 		kthread_stop(sbi->s_mmp_tsk);
 failed_mount2:
-- 
cgit v1.1


From a521100231f816f8cdd9c8e77da14ff1e42c2b17 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 4 Sep 2014 18:06:25 -0400
Subject: ext4: pass allocation_request struct to ext4_(alloc,splice)_branch

Instead of initializing the allocation_request structure in
ext4_alloc_branch(), set it up in ext4_ind_map_blocks(), and then pass
it to ext4_alloc_branch() and ext4_splice_branch().

This allows ext4_ind_map_blocks to pass flags in the allocation
request structure without having to add Yet Another argument to
ext4_alloc_branch().

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/indirect.c | 82 +++++++++++++++++++++++++-----------------------------
 1 file changed, 38 insertions(+), 44 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index e75f840..69af0cd 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -318,34 +318,22 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
  *	ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain
  *	as described above and return 0.
  */
-static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
-			     ext4_lblk_t iblock, int indirect_blks,
-			     int *blks, ext4_fsblk_t goal,
-			     ext4_lblk_t *offsets, Indirect *branch)
+static int ext4_alloc_branch(handle_t *handle,
+			     struct ext4_allocation_request *ar,
+			     int indirect_blks, ext4_lblk_t *offsets,
+			     Indirect *branch)
 {
-	struct ext4_allocation_request	ar;
 	struct buffer_head *		bh;
 	ext4_fsblk_t			b, new_blocks[4];
 	__le32				*p;
 	int				i, j, err, len = 1;
 
-	/*
-	 * Set up for the direct block allocation
-	 */
-	memset(&ar, 0, sizeof(ar));
-	ar.inode = inode;
-	ar.len = *blks;
-	ar.logical = iblock;
-	if (S_ISREG(inode->i_mode))
-		ar.flags = EXT4_MB_HINT_DATA;
-
 	for (i = 0; i <= indirect_blks; i++) {
 		if (i == indirect_blks) {
-			ar.goal = goal;
-			new_blocks[i] = ext4_mb_new_blocks(handle, &ar, &err);
+			new_blocks[i] = ext4_mb_new_blocks(handle, ar, &err);
 		} else
-			goal = new_blocks[i] = ext4_new_meta_blocks(handle, inode,
-							goal, 0, NULL, &err);
+			ar->goal = new_blocks[i] = ext4_new_meta_blocks(handle,
+				    ar->inode, ar->goal, 0, NULL, &err);
 		if (err) {
 			i--;
 			goto failed;
@@ -354,7 +342,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
 		if (i == 0)
 			continue;
 
-		bh = branch[i].bh = sb_getblk(inode->i_sb, new_blocks[i-1]);
+		bh = branch[i].bh = sb_getblk(ar->inode->i_sb, new_blocks[i-1]);
 		if (unlikely(!bh)) {
 			err = -ENOMEM;
 			goto failed;
@@ -372,7 +360,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
 		b = new_blocks[i];
 
 		if (i == indirect_blks)
-			len = ar.len;
+			len = ar->len;
 		for (j = 0; j < len; j++)
 			*p++ = cpu_to_le32(b++);
 
@@ -381,11 +369,10 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
 		unlock_buffer(bh);
 
 		BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-		err = ext4_handle_dirty_metadata(handle, inode, bh);
+		err = ext4_handle_dirty_metadata(handle, ar->inode, bh);
 		if (err)
 			goto failed;
 	}
-	*blks = ar.len;
 	return 0;
 failed:
 	for (; i >= 0; i--) {
@@ -396,10 +383,10 @@ failed:
 		 * existing before ext4_alloc_branch() was called.
 		 */
 		if (i > 0 && i != indirect_blks && branch[i].bh)
-			ext4_forget(handle, 1, inode, branch[i].bh,
+			ext4_forget(handle, 1, ar->inode, branch[i].bh,
 				    branch[i].bh->b_blocknr);
-		ext4_free_blocks(handle, inode, NULL, new_blocks[i],
-				 (i == indirect_blks) ? ar.len : 1, 0);
+		ext4_free_blocks(handle, ar->inode, NULL, new_blocks[i],
+				 (i == indirect_blks) ? ar->len : 1, 0);
 	}
 	return err;
 }
@@ -419,9 +406,9 @@ failed:
  * inode (->i_blocks, etc.). In case of success we end up with the full
  * chain to new block and return 0.
  */
-static int ext4_splice_branch(handle_t *handle, struct inode *inode,
-			      ext4_lblk_t block, Indirect *where, int num,
-			      int blks)
+static int ext4_splice_branch(handle_t *handle,
+			      struct ext4_allocation_request *ar,
+			      Indirect *where, int num)
 {
 	int i;
 	int err = 0;
@@ -446,9 +433,9 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
 	 * Update the host buffer_head or inode to point to more just allocated
 	 * direct blocks blocks
 	 */
-	if (num == 0 && blks > 1) {
+	if (num == 0 && ar->len > 1) {
 		current_block = le32_to_cpu(where->key) + 1;
-		for (i = 1; i < blks; i++)
+		for (i = 1; i < ar->len; i++)
 			*(where->p + i) = cpu_to_le32(current_block++);
 	}
 
@@ -465,14 +452,14 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
 		 */
 		jbd_debug(5, "splicing indirect only\n");
 		BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");
-		err = ext4_handle_dirty_metadata(handle, inode, where->bh);
+		err = ext4_handle_dirty_metadata(handle, ar->inode, where->bh);
 		if (err)
 			goto err_out;
 	} else {
 		/*
 		 * OK, we spliced it into the inode itself on a direct block.
 		 */
-		ext4_mark_inode_dirty(handle, inode);
+		ext4_mark_inode_dirty(handle, ar->inode);
 		jbd_debug(5, "splicing direct\n");
 	}
 	return err;
@@ -484,11 +471,11 @@ err_out:
 		 * need to revoke the block, which is why we don't
 		 * need to set EXT4_FREE_BLOCKS_METADATA.
 		 */
-		ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
+		ext4_free_blocks(handle, ar->inode, where[i].bh, 0, 1,
 				 EXT4_FREE_BLOCKS_FORGET);
 	}
-	ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key),
-			 blks, 0);
+	ext4_free_blocks(handle, ar->inode, NULL, le32_to_cpu(where[num].key),
+			 ar->len, 0);
 
 	return err;
 }
@@ -525,11 +512,11 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
 			struct ext4_map_blocks *map,
 			int flags)
 {
+	struct ext4_allocation_request ar;
 	int err = -EIO;
 	ext4_lblk_t offsets[4];
 	Indirect chain[4];
 	Indirect *partial;
-	ext4_fsblk_t goal;
 	int indirect_blks;
 	int blocks_to_boundary = 0;
 	int depth;
@@ -579,7 +566,14 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
 		return -ENOSPC;
 	}
 
-	goal = ext4_find_goal(inode, map->m_lblk, partial);
+	/* Set up for the direct block allocation */
+	memset(&ar, 0, sizeof(ar));
+	ar.inode = inode;
+	ar.logical = map->m_lblk;
+	if (S_ISREG(inode->i_mode))
+		ar.flags = EXT4_MB_HINT_DATA;
+
+	ar.goal = ext4_find_goal(inode, map->m_lblk, partial);
 
 	/* the number of blocks need to allocate for [d,t]indirect blocks */
 	indirect_blks = (chain + depth) - partial - 1;
@@ -588,13 +582,13 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
 	 * Next look up the indirect map to count the totoal number of
 	 * direct blocks to allocate for this branch.
 	 */
-	count = ext4_blks_to_allocate(partial, indirect_blks,
-				      map->m_len, blocks_to_boundary);
+	ar.len = ext4_blks_to_allocate(partial, indirect_blks,
+				       map->m_len, blocks_to_boundary);
+
 	/*
 	 * Block out ext4_truncate while we alter the tree
 	 */
-	err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks,
-				&count, goal,
+	err = ext4_alloc_branch(handle, &ar, indirect_blks,
 				offsets + (partial - chain), partial);
 
 	/*
@@ -605,14 +599,14 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
 	 * may need to return -EAGAIN upwards in the worst case.  --sct
 	 */
 	if (!err)
-		err = ext4_splice_branch(handle, inode, map->m_lblk,
-					 partial, indirect_blks, count);
+		err = ext4_splice_branch(handle, &ar, partial, indirect_blks);
 	if (err)
 		goto cleanup;
 
 	map->m_flags |= EXT4_MAP_NEW;
 
 	ext4_update_inode_fsync_trans(handle, inode, 1);
+	count = ar.len;
 got_it:
 	map->m_flags |= EXT4_MAP_MAPPED;
 	map->m_pblk = le32_to_cpu(chain[depth-1].key);
-- 
cgit v1.1


From e3cf5d5d9a86df1c5e413bdd3725c25a16ff854c Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 4 Sep 2014 18:07:25 -0400
Subject: ext4: prepare to drop EXT4_STATE_DELALLOC_RESERVED

The EXT4_STATE_DELALLOC_RESERVED flag was originally implemented
because it was too hard to make sure the mballoc and get_block flags
could be reliably passed down through all of the codepaths that end up
calling ext4_mb_new_blocks().

Since then, we have mb_flags passed down through most of the code
paths, so getting rid of EXT4_STATE_DELALLOC_RESERVED isn't as tricky
as it used to.

This commit plumbs in the last of what is required, and then adds a
WARN_ON check to make sure we haven't missed anything.  If this passes
a full regression test run, we can then drop
EXT4_STATE_DELALLOC_RESERVED.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/balloc.c   |  3 +--
 fs/ext4/extents.c  |  6 +++++-
 fs/ext4/indirect.c |  6 +++++-
 fs/ext4/mballoc.c  | 10 ++++++----
 fs/ext4/xattr.c    |  6 ------
 5 files changed, 17 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 581ef40..d70f154 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -636,8 +636,7 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 	 * Account for the allocated meta blocks.  We will never
 	 * fail EDQUOT for metdata, but we do account for it.
 	 */
-	if (!(*errp) &&
-	    ext4_test_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED)) {
+	if (!(*errp) && (flags & EXT4_MB_DELALLOC_RESERVED)) {
 		spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
 		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 		dquot_alloc_block_nofail(inode,
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 3ac1686..8170b32 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1933,6 +1933,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 	ext4_lblk_t next;
 	int mb_flags = 0, unwritten;
 
+	if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+		mb_flags |= EXT4_MB_DELALLOC_RESERVED;
 	if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
 		EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
 		return -EIO;
@@ -2054,7 +2056,7 @@ prepend:
 	 * We're gonna add a new leaf in the tree.
 	 */
 	if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
-		mb_flags = EXT4_MB_USE_RESERVED;
+		mb_flags |= EXT4_MB_USE_RESERVED;
 	err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags,
 				       ppath, newext);
 	if (err)
@@ -4438,6 +4440,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 		ar.flags = 0;
 	if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
 		ar.flags |= EXT4_MB_HINT_NOPREALLOC;
+	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+		ar.flags |= EXT4_MB_DELALLOC_RESERVED;
 	newblock = ext4_mb_new_blocks(handle, &ar, &err);
 	if (!newblock)
 		goto out2;
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 69af0cd..36b3696 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -333,7 +333,9 @@ static int ext4_alloc_branch(handle_t *handle,
 			new_blocks[i] = ext4_mb_new_blocks(handle, ar, &err);
 		} else
 			ar->goal = new_blocks[i] = ext4_new_meta_blocks(handle,
-				    ar->inode, ar->goal, 0, NULL, &err);
+					ar->inode, ar->goal,
+					ar->flags & EXT4_MB_DELALLOC_RESERVED,
+					NULL, &err);
 		if (err) {
 			i--;
 			goto failed;
@@ -572,6 +574,8 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
 	ar.logical = map->m_lblk;
 	if (S_ISREG(inode->i_mode))
 		ar.flags = EXT4_MB_HINT_DATA;
+	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+		ar.flags |= EXT4_MB_DELALLOC_RESERVED;
 
 	ar.goal = ext4_find_goal(inode, map->m_lblk, partial);
 
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 8b0f9ef..15dffda 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4415,9 +4415,12 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 	 * EDQUOT check, as blocks and quotas have been already
 	 * reserved when data being copied into pagecache.
 	 */
-	if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED))
+	if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED)) {
+		WARN_ON((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0);
 		ar->flags |= EXT4_MB_DELALLOC_RESERVED;
-	else {
+	}
+
+	if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) {
 		/* Without delayed allocation we need to verify
 		 * there is enough free blocks to do block allocation
 		 * and verify allocation doesn't exceed the quota limits.
@@ -4528,8 +4531,7 @@ out:
 	if (inquota && ar->len < inquota)
 		dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
 	if (!ar->len) {
-		if (!ext4_test_inode_state(ar->inode,
-					   EXT4_STATE_DELALLOC_RESERVED))
+		if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0)
 			/* release all the reserved blocks if non delalloc */
 			percpu_counter_sub(&sbi->s_dirtyclusters_counter,
 						reserv_clstrs);
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index e738733..da4df70 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -899,14 +899,8 @@ inserted:
 			if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
 				goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
 
-			/*
-			 * take i_data_sem because we will test
-			 * i_delalloc_reserved_flag in ext4_mb_new_blocks
-			 */
-			down_read(&EXT4_I(inode)->i_data_sem);
 			block = ext4_new_meta_blocks(handle, inode, goal, 0,
 						     NULL, &error);
-			up_read((&EXT4_I(inode)->i_data_sem));
 			if (error)
 				goto cleanup;
 
-- 
cgit v1.1


From 754cfed6bbcfdea6afb14f2686f7f8d71e94d4e2 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 4 Sep 2014 18:08:22 -0400
Subject: ext4: drop the EXT4_STATE_DELALLOC_RESERVED flag

Having done a full regression test, we can now drop the
DELALLOC_RESERVED state flag.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/ext4.h    |  1 -
 fs/ext4/inode.c   | 20 ++++----------------
 fs/ext4/mballoc.c | 10 ----------
 3 files changed, 4 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 00fd822..4855800 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1400,7 +1400,6 @@ enum {
 	EXT4_STATE_EXT_MIGRATE,		/* Inode is migrating */
 	EXT4_STATE_DIO_UNWRITTEN,	/* need convert on dio done*/
 	EXT4_STATE_NEWENTRY,		/* File just added to dir */
-	EXT4_STATE_DELALLOC_RESERVED,	/* blks already reserved for delalloc */
 	EXT4_STATE_DIOREAD_LOCK,	/* Disable support for dio read
 					   nolocking */
 	EXT4_STATE_MAY_INLINE_DATA,	/* may have in-inode data */
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4a16b0c..d5dd7d4 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -596,14 +596,6 @@ found:
 	down_write(&EXT4_I(inode)->i_data_sem);
 
 	/*
-	 * if the caller is from delayed allocation writeout path
-	 * we have already reserved fs blocks for allocation
-	 * let the underlying get_block() function know to
-	 * avoid double accounting
-	 */
-	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
-		ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
-	/*
 	 * We need to check for EXT4 here because migrate
 	 * could have changed the inode type in between
 	 */
@@ -631,8 +623,6 @@ found:
 			(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
 			ext4_da_update_reserve_space(inode, retval, 1);
 	}
-	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
-		ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
 
 	if (retval > 0) {
 		unsigned int status;
@@ -2004,12 +1994,10 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
 	 * in data loss.  So use reserved blocks to allocate metadata if
 	 * possible.
 	 *
-	 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if the blocks
-	 * in question are delalloc blocks.  This affects functions in many
-	 * different parts of the allocation call path.  This flag exists
-	 * primarily because we don't want to change *many* call functions, so
-	 * ext4_map_blocks() will set the EXT4_STATE_DELALLOC_RESERVED flag
-	 * once the inode's allocation semaphore is taken.
+	 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if
+	 * the blocks in question are delalloc blocks.  This indicates
+	 * that the blocks and quotas has already been checked when
+	 * the data was copied into the page cache.
 	 */
 	get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
 			   EXT4_GET_BLOCKS_METADATA_NOFAIL;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 15dffda..65cca28 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4410,16 +4410,6 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 	if (IS_NOQUOTA(ar->inode))
 		ar->flags |= EXT4_MB_USE_ROOT_BLOCKS;
 
-	/*
-	 * For delayed allocation, we could skip the ENOSPC and
-	 * EDQUOT check, as blocks and quotas have been already
-	 * reserved when data being copied into pagecache.
-	 */
-	if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED)) {
-		WARN_ON((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0);
-		ar->flags |= EXT4_MB_DELALLOC_RESERVED;
-	}
-
 	if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) {
 		/* Without delayed allocation we need to verify
 		 * there is enough free blocks to do block allocation
-- 
cgit v1.1


From dc6e8d669cf5cb3ff84707c372c0a2a8a5e80845 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 4 Sep 2014 18:09:22 -0400
Subject: jbd2: don't call get_bh() before calling
 __jbd2_journal_remove_checkpoint()

The __jbd2_journal_remove_checkpoint() doesn't require an elevated
b_count; indeed, until the jh structure gets released by the call to
jbd2_journal_put_journal_head(), the bh's b_count is elevated by
virtue of the existence of the jh structure.

Suggested-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/jbd2/checkpoint.c | 19 +++++--------------
 1 file changed, 5 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 18c7a8d..90d6091 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -96,15 +96,8 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
 
 	if (jh->b_transaction == NULL && !buffer_locked(bh) &&
 	    !buffer_dirty(bh) && !buffer_write_io_error(bh)) {
-		/*
-		 * Get our reference so that bh cannot be freed before
-		 * we unlock it
-		 */
-		get_bh(bh);
 		JBUFFER_TRACE(jh, "remove from checkpoint list");
 		ret = __jbd2_journal_remove_checkpoint(jh) + 1;
-		BUFFER_TRACE(bh, "release");
-		__brelse(bh);
 	}
 	return ret;
 }
@@ -216,7 +209,7 @@ int jbd2_log_do_checkpoint(journal_t *journal)
 	struct buffer_head	*bh;
 	transaction_t		*transaction;
 	tid_t			this_tid;
-	int			result, batch_count = 0, done = 0;
+	int			result, batch_count = 0;
 
 	jbd_debug(1, "Start checkpoint\n");
 
@@ -291,11 +284,9 @@ restart:
 		if (!buffer_dirty(bh)) {
 			if (unlikely(buffer_write_io_error(bh)) && !result)
 				result = -EIO;
-			get_bh(bh);
 			BUFFER_TRACE(bh, "remove from checkpoint");
 			__jbd2_journal_remove_checkpoint(jh);
 			spin_unlock(&journal->j_list_lock);
-			__brelse(bh);
 			goto retry;
 		}
 		/*
@@ -338,12 +329,12 @@ restart2:
 	    transaction->t_tid != this_tid)
 		goto out;
 
-	while (!done && transaction->t_checkpoint_io_list) {
+	while (transaction->t_checkpoint_io_list) {
 		jh = transaction->t_checkpoint_io_list;
 		bh = jh2bh(jh);
-		get_bh(bh);
 		if (buffer_locked(bh)) {
 			spin_unlock(&journal->j_list_lock);
+			get_bh(bh);
 			wait_on_buffer(bh);
 			/* the journal_head may have gone by now */
 			BUFFER_TRACE(bh, "brelse");
@@ -359,8 +350,8 @@ restart2:
 		 * know that it has been written out and so we can
 		 * drop it from the list
 		 */
-		done = __jbd2_journal_remove_checkpoint(jh);
-		__brelse(bh);
+		if (__jbd2_journal_remove_checkpoint(jh))
+			break;
 	}
 out:
 	spin_unlock(&journal->j_list_lock);
-- 
cgit v1.1


From 0e5ecf0a762627b949141df1d83094a9b0eb54a8 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 4 Sep 2014 18:09:29 -0400
Subject: jbd2: optimize jbd2_log_do_checkpoint() a bit

When we discover written out buffer in transaction checkpoint list we
don't have to recheck validity of a transaction. Either this is the
last buffer in a transaction - and then we are done - or this isn't
and then we can just take another buffer from the checkpoint list
without dropping j_list_lock.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/jbd2/checkpoint.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 90d6091..9ffb19c 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -285,9 +285,10 @@ restart:
 			if (unlikely(buffer_write_io_error(bh)) && !result)
 				result = -EIO;
 			BUFFER_TRACE(bh, "remove from checkpoint");
-			__jbd2_journal_remove_checkpoint(jh);
-			spin_unlock(&journal->j_list_lock);
-			goto retry;
+			if (__jbd2_journal_remove_checkpoint(jh))
+				/* The transaction was released; we're done */
+				goto out;
+			continue;
 		}
 		/*
 		 * Important: we are about to write the buffer, and
-- 
cgit v1.1


From d26e2c4d72c2f2a38246f618480864fe3224929c Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 4 Sep 2014 18:09:29 -0400
Subject: ext4: renumber EXT4_EX_* flags to avoid flag aliasing problems

Suggested-by: Andreas Dilger <adilger@dilger.ca>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4855800..f70c3fc 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -569,7 +569,6 @@ enum {
 #define EXT4_GET_BLOCKS_NO_PUT_HOLE		0x0200
 	/* Convert written extents to unwritten */
 #define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN	0x0400
-/* DO NOT ASSIGN ADDITIONAL FLAG VALUES WITHOUT ADJUSTING THE FLAGS BELOW */
 
 /*
  * The bit position of these flags must not overlap with any of the
@@ -580,8 +579,8 @@ enum {
  * caching the extents when reading from the extent tree while a
  * truncate or punch hole operation is in progress.
  */
-#define EXT4_EX_NOCACHE				0x0800
-#define EXT4_EX_FORCE_CACHE			0x1000
+#define EXT4_EX_NOCACHE				0x40000000
+#define EXT4_EX_FORCE_CACHE			0x20000000
 
 /*
  * Flags used by ext4_free_blocks
-- 
cgit v1.1


From 3b5e6454aaf6b4439b19400d8365e2ec2d24e411 Mon Sep 17 00:00:00 2001
From: Gioh Kim <gioh.kim@lge.com>
Date: Thu, 4 Sep 2014 22:04:42 -0400
Subject: fs/buffer.c: support buffer cache allocations with gfp modifiers

A buffer cache is allocated from movable area because it is referred
for a while and released soon.  But some filesystems are taking buffer
cache for a long time and it can disturb page migration.

New APIs are introduced to allocate buffer cache with user specific
flag.  *_gfp APIs are for user want to set page allocation flag for
page cache allocation.  And *_unmovable APIs are for the user wants to
allocate page cache from non-movable area.

Signed-off-by: Gioh Kim <gioh.kim@lge.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 fs/buffer.c | 45 ++++++++++++++++++++++++++-------------------
 1 file changed, 26 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index 8f05111..9a6029e 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -993,7 +993,7 @@ init_page_buffers(struct page *page, struct block_device *bdev,
  */
 static int
 grow_dev_page(struct block_device *bdev, sector_t block,
-		pgoff_t index, int size, int sizebits)
+	      pgoff_t index, int size, int sizebits, gfp_t gfp)
 {
 	struct inode *inode = bdev->bd_inode;
 	struct page *page;
@@ -1002,8 +1002,8 @@ grow_dev_page(struct block_device *bdev, sector_t block,
 	int ret = 0;		/* Will call free_more_memory() */
 	gfp_t gfp_mask;
 
-	gfp_mask = mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS;
-	gfp_mask |= __GFP_MOVABLE;
+	gfp_mask = (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS) | gfp;
+
 	/*
 	 * XXX: __getblk_slow() can not really deal with failure and
 	 * will endlessly loop on improvised global reclaim.  Prefer
@@ -1058,7 +1058,7 @@ failed:
  * that page was dirty, the buffers are set dirty also.
  */
 static int
-grow_buffers(struct block_device *bdev, sector_t block, int size)
+grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp)
 {
 	pgoff_t index;
 	int sizebits;
@@ -1085,11 +1085,12 @@ grow_buffers(struct block_device *bdev, sector_t block, int size)
 	}
 
 	/* Create a page with the proper size buffers.. */
-	return grow_dev_page(bdev, block, index, size, sizebits);
+	return grow_dev_page(bdev, block, index, size, sizebits, gfp);
 }
 
-static struct buffer_head *
-__getblk_slow(struct block_device *bdev, sector_t block, int size)
+struct buffer_head *
+__getblk_slow(struct block_device *bdev, sector_t block,
+	     unsigned size, gfp_t gfp)
 {
 	/* Size must be multiple of hard sectorsize */
 	if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
@@ -1111,13 +1112,14 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
 		if (bh)
 			return bh;
 
-		ret = grow_buffers(bdev, block, size);
+		ret = grow_buffers(bdev, block, size, gfp);
 		if (ret < 0)
 			return NULL;
 		if (ret == 0)
 			free_more_memory();
 	}
 }
+EXPORT_SYMBOL(__getblk_slow);
 
 /*
  * The relationship between dirty buffers and dirty pages:
@@ -1371,24 +1373,25 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
 EXPORT_SYMBOL(__find_get_block);
 
 /*
- * __getblk will locate (and, if necessary, create) the buffer_head
+ * __getblk_gfp() will locate (and, if necessary, create) the buffer_head
  * which corresponds to the passed block_device, block and size. The
  * returned buffer has its reference count incremented.
  *
- * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
- * attempt is failing.  FIXME, perhaps?
+ * __getblk_gfp() will lock up the machine if grow_dev_page's
+ * try_to_free_buffers() attempt is failing.  FIXME, perhaps?
  */
 struct buffer_head *
-__getblk(struct block_device *bdev, sector_t block, unsigned size)
+__getblk_gfp(struct block_device *bdev, sector_t block,
+	     unsigned size, gfp_t gfp)
 {
 	struct buffer_head *bh = __find_get_block(bdev, block, size);
 
 	might_sleep();
 	if (bh == NULL)
-		bh = __getblk_slow(bdev, block, size);
+		bh = __getblk_slow(bdev, block, size, gfp);
 	return bh;
 }
-EXPORT_SYMBOL(__getblk);
+EXPORT_SYMBOL(__getblk_gfp);
 
 /*
  * Do async read-ahead on a buffer..
@@ -1404,24 +1407,28 @@ void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
 EXPORT_SYMBOL(__breadahead);
 
 /**
- *  __bread() - reads a specified block and returns the bh
+ *  __bread_gfp() - reads a specified block and returns the bh
  *  @bdev: the block_device to read from
  *  @block: number of block
  *  @size: size (in bytes) to read
- * 
+ *  @gfp: page allocation flag
+ *
  *  Reads a specified block, and returns buffer head that contains it.
+ *  The page cache can be allocated from non-movable area
+ *  not to prevent page migration if you set gfp to zero.
  *  It returns NULL if the block was unreadable.
  */
 struct buffer_head *
-__bread(struct block_device *bdev, sector_t block, unsigned size)
+__bread_gfp(struct block_device *bdev, sector_t block,
+		   unsigned size, gfp_t gfp)
 {
-	struct buffer_head *bh = __getblk(bdev, block, size);
+	struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
 
 	if (likely(bh) && !buffer_uptodate(bh))
 		bh = __bread_slow(bh);
 	return bh;
 }
-EXPORT_SYMBOL(__bread);
+EXPORT_SYMBOL(__bread_gfp);
 
 /*
  * invalidate_bh_lrus() is called rarely - but not only at unmount.
-- 
cgit v1.1


From a8ac900b8163703340a2fdad11c32f96b8fe686d Mon Sep 17 00:00:00 2001
From: Gioh Kim <gioh.kim@lge.com>
Date: Thu, 4 Sep 2014 22:36:15 -0400
Subject: ext4: use non-movable memory for the ext4 superblock

Since the ext4 superblock is not released until the file system is
unmounted, allocate the buffer cache entry for the ext4 superblock out
of the non-moveable are to allow page migrations and thus CMA
allocations to more easily succeed if the CMA area is limited.

Signed-off-by: Gioh Kim <gioh.kim@lge.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/super.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 487c65b..4b81747 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3436,7 +3436,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		logical_sb_block = sb_block;
 	}
 
-	if (!(bh = sb_bread(sb, logical_sb_block))) {
+	if (!(bh = sb_bread_unmovable(sb, logical_sb_block))) {
 		ext4_msg(sb, KERN_ERR, "unable to read superblock");
 		goto out_fail;
 	}
@@ -3646,7 +3646,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		brelse(bh);
 		logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
 		offset = do_div(logical_sb_block, blocksize);
-		bh = sb_bread(sb, logical_sb_block);
+		bh = sb_bread_unmovable(sb, logical_sb_block);
 		if (!bh) {
 			ext4_msg(sb, KERN_ERR,
 			       "Can't read superblock on 2nd try");
@@ -3868,7 +3868,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 
 	for (i = 0; i < db_count; i++) {
 		block = descriptor_loc(sb, logical_sb_block, i);
-		sbi->s_group_desc[i] = sb_bread(sb, block);
+		sbi->s_group_desc[i] = sb_bread_unmovable(sb, block);
 		if (!sbi->s_group_desc[i]) {
 			ext4_msg(sb, KERN_ERR,
 			       "can't read group descriptor %d", i);
-- 
cgit v1.1


From a49058fab2912296f068759490ac69ba43b43861 Mon Sep 17 00:00:00 2001
From: Gioh Kim <gioh.kim@lge.com>
Date: Thu, 4 Sep 2014 22:36:35 -0400
Subject: jbd/jbd2: use non-movable memory for the jbd superblock

Sicne the jbd/jbd2 superblock is not released until the file system is
unmounted, allocate the buffer cache from the non-moveable area to
allow page migration and CMA allocations to more easily succeed.

Signed-off-by: Gioh Kim <gioh.kim@lge.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 fs/jbd/journal.c  | 2 +-
 fs/jbd2/journal.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 06fe11e..aab8549 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -886,7 +886,7 @@ journal_t * journal_init_inode (struct inode *inode)
 		goto out_err;
 	}
 
-	bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
+	bh = getblk_unmovable(journal->j_dev, blocknr, journal->j_blocksize);
 	if (!bh) {
 		printk(KERN_ERR
 		       "%s: Cannot get buffer for journal superblock\n",
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 19d74d8..415041c 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1237,7 +1237,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
 		goto out_err;
 	}
 
-	bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
+	bh = getblk_unmovable(journal->j_dev, blocknr, journal->j_blocksize);
 	if (!bh) {
 		printk(KERN_ERR
 		       "%s: Cannot get buffer for journal superblock\n",
-- 
cgit v1.1


From a2d4a646e619541e803fb52636964df39aed94b7 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 11 Sep 2014 11:15:15 -0400
Subject: ext4: don't use MAXQUOTAS value

MAXQUOTAS value defines maximum number of quota types VFS supports.
This isn't necessarily the number of types ext4 supports. Although
ext4 will support project quotas, use ext4 private definition for
consistency with other filesystems.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4.h      |  5 ++++-
 fs/ext4/ext4_jbd2.h |  6 +++---
 fs/ext4/super.c     | 22 +++++++++++-----------
 3 files changed, 18 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index f70c3fc..1eb5b7b 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1175,6 +1175,9 @@ struct ext4_super_block {
 #define EXT4_MF_MNTDIR_SAMPLED	0x0001
 #define EXT4_MF_FS_ABORTED	0x0002	/* Fatal error detected */
 
+/* Number of quota types we support */
+#define EXT4_MAXQUOTAS 2
+
 /*
  * fourth extended-fs super-block data in memory
  */
@@ -1238,7 +1241,7 @@ struct ext4_sb_info {
 	u32 s_min_batch_time;
 	struct block_device *journal_bdev;
 #ifdef CONFIG_QUOTA
-	char *s_qf_names[MAXQUOTAS];		/* Names of quota files with journalled quota */
+	char *s_qf_names[EXT4_MAXQUOTAS];	/* Names of quota files with journalled quota */
 	int s_jquota_fmt;			/* Format of quota to use */
 #endif
 	unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 17c00ff..9c5b49f 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -102,9 +102,9 @@
 #define EXT4_QUOTA_INIT_BLOCKS(sb) 0
 #define EXT4_QUOTA_DEL_BLOCKS(sb) 0
 #endif
-#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb))
-#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
-#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
+#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb))
+#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
+#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
 
 static inline int ext4_jbd2_credits_xattr(struct inode *inode)
 {
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 4b81747..a318a2d 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -822,7 +822,7 @@ static void ext4_put_super(struct super_block *sb)
 	percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
 	brelse(sbi->s_sbh);
 #ifdef CONFIG_QUOTA
-	for (i = 0; i < MAXQUOTAS; i++)
+	for (i = 0; i < EXT4_MAXQUOTAS; i++)
 		kfree(sbi->s_qf_names[i]);
 #endif
 
@@ -2207,7 +2207,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 	/* Needed for iput() to work correctly and not trash data */
 	sb->s_flags |= MS_ACTIVE;
 	/* Turn on quotas so that they are updated correctly */
-	for (i = 0; i < MAXQUOTAS; i++) {
+	for (i = 0; i < EXT4_MAXQUOTAS; i++) {
 		if (EXT4_SB(sb)->s_qf_names[i]) {
 			int ret = ext4_quota_on_mount(sb, i);
 			if (ret < 0)
@@ -2263,7 +2263,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 		       PLURAL(nr_truncates));
 #ifdef CONFIG_QUOTA
 	/* Turn quotas off */
-	for (i = 0; i < MAXQUOTAS; i++) {
+	for (i = 0; i < EXT4_MAXQUOTAS; i++) {
 		if (sb_dqopt(sb)->files[i])
 			dquot_quota_off(sb, i);
 	}
@@ -4238,7 +4238,7 @@ failed_mount:
 		remove_proc_entry(sb->s_id, ext4_proc_root);
 	}
 #ifdef CONFIG_QUOTA
-	for (i = 0; i < MAXQUOTAS; i++)
+	for (i = 0; i < EXT4_MAXQUOTAS; i++)
 		kfree(sbi->s_qf_names[i]);
 #endif
 	ext4_blkdev_remove(sbi);
@@ -4765,7 +4765,7 @@ struct ext4_mount_options {
 	u32 s_min_batch_time, s_max_batch_time;
 #ifdef CONFIG_QUOTA
 	int s_jquota_fmt;
-	char *s_qf_names[MAXQUOTAS];
+	char *s_qf_names[EXT4_MAXQUOTAS];
 #endif
 };
 
@@ -4795,7 +4795,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	old_opts.s_max_batch_time = sbi->s_max_batch_time;
 #ifdef CONFIG_QUOTA
 	old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
-	for (i = 0; i < MAXQUOTAS; i++)
+	for (i = 0; i < EXT4_MAXQUOTAS; i++)
 		if (sbi->s_qf_names[i]) {
 			old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i],
 							 GFP_KERNEL);
@@ -4956,7 +4956,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 
 #ifdef CONFIG_QUOTA
 	/* Release old quota file names */
-	for (i = 0; i < MAXQUOTAS; i++)
+	for (i = 0; i < EXT4_MAXQUOTAS; i++)
 		kfree(old_opts.s_qf_names[i]);
 	if (enable_quota) {
 		if (sb_any_quota_suspended(sb))
@@ -4985,7 +4985,7 @@ restore_opts:
 	sbi->s_max_batch_time = old_opts.s_max_batch_time;
 #ifdef CONFIG_QUOTA
 	sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
-	for (i = 0; i < MAXQUOTAS; i++) {
+	for (i = 0; i < EXT4_MAXQUOTAS; i++) {
 		kfree(sbi->s_qf_names[i]);
 		sbi->s_qf_names[i] = old_opts.s_qf_names[i];
 	}
@@ -5188,7 +5188,7 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
 {
 	int err;
 	struct inode *qf_inode;
-	unsigned long qf_inums[MAXQUOTAS] = {
+	unsigned long qf_inums[EXT4_MAXQUOTAS] = {
 		le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
 		le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum)
 	};
@@ -5216,13 +5216,13 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
 static int ext4_enable_quotas(struct super_block *sb)
 {
 	int type, err = 0;
-	unsigned long qf_inums[MAXQUOTAS] = {
+	unsigned long qf_inums[EXT4_MAXQUOTAS] = {
 		le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
 		le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum)
 	};
 
 	sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE;
-	for (type = 0; type < MAXQUOTAS; type++) {
+	for (type = 0; type < EXT4_MAXQUOTAS; type++) {
 		if (qf_inums[type]) {
 			err = ext4_quota_enable(sb, type, QFMT_VFS_V1,
 						DQUOT_USAGE_ENABLED);
-- 
cgit v1.1


From 52c198c6820f68b6fbe1d83f76e34a82bf736024 Mon Sep 17 00:00:00 2001
From: Lukas Czerner <lczerner@redhat.com>
Date: Thu, 11 Sep 2014 11:18:13 -0400
Subject: ext4: add sysfs entry showing whether the fs contains errors

Currently there is no easy way to tell that the mounted file system
contains errors other than checking for log messages, or reading the
information directly from superblock.

This patch adds new sysfs entries:

errors_count		(number of fs errors we encounter)
first_error_time	(unix timestamp for the first error we see)
last_error_time		(unix timestamp for the last error we see)

If the file system is not marked as containing errors then any of the
file will return 0. Otherwise it will contain valid information. More
details about the errors should as always be found in the logs.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/super.c | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a318a2d..ff889e1 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2548,6 +2548,16 @@ static ssize_t sbi_ui_store(struct ext4_attr *a,
 	return count;
 }
 
+static ssize_t es_ui_show(struct ext4_attr *a,
+			   struct ext4_sb_info *sbi, char *buf)
+{
+
+	unsigned int *ui = (unsigned int *) (((char *) sbi->s_es) +
+			   a->u.offset);
+
+	return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
+}
+
 static ssize_t reserved_clusters_show(struct ext4_attr *a,
 				  struct ext4_sb_info *sbi, char *buf)
 {
@@ -2601,14 +2611,29 @@ static struct ext4_attr ext4_attr_##_name = {			\
 		.offset = offsetof(struct ext4_sb_info, _elname),\
 	},							\
 }
+
+#define EXT4_ATTR_OFFSET_ES(_name,_mode,_show,_store,_elname)		\
+static struct ext4_attr ext4_attr_##_name = {				\
+	.attr = {.name = __stringify(_name), .mode = _mode },		\
+	.show	= _show,						\
+	.store	= _store,						\
+	.u = {								\
+		.offset = offsetof(struct ext4_super_block, _elname),	\
+	},								\
+}
+
 #define EXT4_ATTR(name, mode, show, store) \
 static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
 
 #define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL)
 #define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL)
 #define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store)
+
+#define EXT4_RO_ATTR_ES_UI(name, elname)	\
+	EXT4_ATTR_OFFSET_ES(name, 0444, es_ui_show, NULL, elname)
 #define EXT4_RW_ATTR_SBI_UI(name, elname)	\
 	EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname)
+
 #define ATTR_LIST(name) &ext4_attr_##name.attr
 #define EXT4_DEPRECATED_ATTR(_name, _val)	\
 static struct ext4_attr ext4_attr_##_name = {			\
@@ -2641,6 +2666,9 @@ EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.int
 EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
 EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
 EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
+EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
+EXT4_RO_ATTR_ES_UI(first_error_time, s_first_error_time);
+EXT4_RO_ATTR_ES_UI(last_error_time, s_last_error_time);
 
 static struct attribute *ext4_attrs[] = {
 	ATTR_LIST(delayed_allocation_blocks),
@@ -2664,6 +2692,9 @@ static struct attribute *ext4_attrs[] = {
 	ATTR_LIST(warning_ratelimit_burst),
 	ATTR_LIST(msg_ratelimit_interval_ms),
 	ATTR_LIST(msg_ratelimit_burst),
+	ATTR_LIST(errors_count),
+	ATTR_LIST(first_error_time),
+	ATTR_LIST(last_error_time),
 	NULL,
 };
 
-- 
cgit v1.1


From c7f725435adcf2ade4b9152ee33339d28f4cc330 Mon Sep 17 00:00:00 2001
From: Lukas Czerner <lczerner@redhat.com>
Date: Thu, 11 Sep 2014 11:27:58 -0400
Subject: ext4: provide separate operations for sysfs feature files

Currently sysfs feature files uses ext4_attr_ops as the file operations
to show/store data. However the feature files is not supposed to contain
any data at all, the sole existence of the file means that the module
support the feature. Moreover, none of the sysfs feature attributes
actually register show/store functions so that would not be a problem.

However if a sysfs feature attribute register a show or store function
we might be in trouble because the kobject in this case is _not_ embedded
in the ext4_sb_info structure as ext4_attr_show/store expect.

So just to be safe, provide separate empty sysfs_ops to use in
ext4_feat_ktype. This might safe us from potential problems in the
future. As a bonus we can "store" something more descriptive than
nothing in the files, so let it contain "enabled" to make it clear that
the feature is really present in the module.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/super.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index ff889e1..2766b8e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2754,9 +2754,25 @@ static void ext4_feat_release(struct kobject *kobj)
 	complete(&ext4_feat->f_kobj_unregister);
 }
 
+static ssize_t ext4_feat_show(struct kobject *kobj,
+			      struct attribute *attr, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "supported\n");
+}
+
+/*
+ * We can not use ext4_attr_show/store because it relies on the kobject
+ * being embedded in the ext4_sb_info structure which is definitely not
+ * true in this case.
+ */
+static const struct sysfs_ops ext4_feat_ops = {
+	.show	= ext4_feat_show,
+	.store	= NULL,
+};
+
 static struct kobj_type ext4_feat_ktype = {
 	.default_attrs	= ext4_feat_attrs,
-	.sysfs_ops	= &ext4_attr_ops,
+	.sysfs_ops	= &ext4_feat_ops,
 	.release	= ext4_feat_release,
 };
 
-- 
cgit v1.1


From feb8c6d3dd0f2cc0e1c3376d099cf298c5f2c2c8 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Thu, 11 Sep 2014 11:38:21 -0400
Subject: jbd2: fix journal checksum feature flag handling

Clear all three journal checksum feature flags before turning on
whichever journal checksum options we want.  Rearrange the error
checking so that newer flags get complained about first.

Reported-by: TR Reardon <thomas_reardon@hotmail.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/super.c   | 11 ++++++-----
 fs/jbd2/journal.c | 16 ++++++++--------
 2 files changed, 14 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 2766b8e..fb219b9 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3237,6 +3237,10 @@ static int set_journal_csum_feature_set(struct super_block *sb)
 		incompat = 0;
 	}
 
+	jbd2_journal_clear_features(sbi->s_journal,
+			JBD2_FEATURE_COMPAT_CHECKSUM, 0,
+			JBD2_FEATURE_INCOMPAT_CSUM_V3 |
+			JBD2_FEATURE_INCOMPAT_CSUM_V2);
 	if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
 		ret = jbd2_journal_set_features(sbi->s_journal,
 				compat, 0,
@@ -3249,11 +3253,8 @@ static int set_journal_csum_feature_set(struct super_block *sb)
 		jbd2_journal_clear_features(sbi->s_journal, 0, 0,
 				JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
 	} else {
-		jbd2_journal_clear_features(sbi->s_journal,
-				JBD2_FEATURE_COMPAT_CHECKSUM, 0,
-				JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT |
-				JBD2_FEATURE_INCOMPAT_CSUM_V3 |
-				JBD2_FEATURE_INCOMPAT_CSUM_V2);
+		jbd2_journal_clear_features(sbi->s_journal, 0, 0,
+				JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
 	}
 
 	return ret;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 415041c..e4dc747 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1522,14 +1522,6 @@ static int journal_get_superblock(journal_t *journal)
 		goto out;
 	}
 
-	if (jbd2_journal_has_csum_v2or3(journal) &&
-	    JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM)) {
-		/* Can't have checksum v1 and v2 on at the same time! */
-		printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2 "
-		       "at the same time!\n");
-		goto out;
-	}
-
 	if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2) &&
 	    JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
 		/* Can't have checksum v2 and v3 at the same time! */
@@ -1538,6 +1530,14 @@ static int journal_get_superblock(journal_t *journal)
 		goto out;
 	}
 
+	if (jbd2_journal_has_csum_v2or3(journal) &&
+	    JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM)) {
+		/* Can't have checksum v1 and v2 on at the same time! */
+		printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 "
+		       "at the same time!\n");
+		goto out;
+	}
+
 	if (!jbd2_verify_csum_type(journal, sb)) {
 		printk(KERN_ERR "JBD2: Unknown checksum type\n");
 		goto out;
-- 
cgit v1.1


From df4763bea5b04d8eed941cfe3df51f22cfe95570 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Thu, 11 Sep 2014 11:44:36 -0400
Subject: ext4: validate external journal superblock checksum

If the external journal device has metadata_csum enabled, verify
that the superblock checksum matches the block before we try to
mount.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/super.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index fb219b9..2632017 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4414,6 +4414,15 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
 		goto out_bdev;
 	}
 
+	if ((le32_to_cpu(es->s_feature_ro_compat) &
+	     EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
+	    es->s_checksum != ext4_superblock_csum(sb, es)) {
+		ext4_msg(sb, KERN_ERR, "external journal has "
+				       "corrupt superblock");
+		brelse(bh);
+		goto out_bdev;
+	}
+
 	if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
 		ext4_msg(sb, KERN_ERR, "journal UUID does not match");
 		brelse(bh);
-- 
cgit v1.1


From 684de5748660e16e185754697ac0afa9e18297f6 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Thu, 11 Sep 2014 11:45:12 -0400
Subject: ext4: don't keep using page if inline conversion fails

If inline->extent conversion fails (most probably due to ENOSPC) and
we release the temporary page that we allocated to transfer the file
contents, don't keep using the page pointer after releasing the page.
This occasionally leads to complaints about evicting locked pages or
hangs when blocksize > pagesize, because it's possible for the page to
get reallocated elsewhere in the meantime.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Tao Ma <tm@tao.ma>
---
 fs/ext4/inline.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index bea662b..378aadf 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -594,6 +594,7 @@ retry:
 	if (ret) {
 		unlock_page(page);
 		page_cache_release(page);
+		page = NULL;
 		ext4_orphan_add(handle, inode);
 		up_write(&EXT4_I(inode)->xattr_sem);
 		sem_held = 0;
@@ -613,7 +614,8 @@ retry:
 	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
 		goto retry;
 
-	block_commit_write(page, from, to);
+	if (page)
+		block_commit_write(page, from, to);
 out:
 	if (page) {
 		unlock_page(page);
-- 
cgit v1.1


From a0626e75954078cfacddb00a4545dde821170bc5 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Tue, 16 Sep 2014 14:34:59 -0400
Subject: ext4: check EA value offset when loading

When loading extended attributes, check each entry's value offset to
make sure it doesn't collide with the entries.

Without this check it is easy to crash the kernel by mounting a
malicious FS containing a file with an EA wherein e_value_offs = 0 and
e_value_size > 0 and then deleting the EA, which corrupts the name
list.

(See the f_ea_value_crash test's FS image in e2fsprogs for an example.)

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@vger.kernel.org
---
 fs/ext4/xattr.c | 32 ++++++++++++++++++++++++--------
 1 file changed, 24 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index da4df70..42823ab 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -190,14 +190,28 @@ ext4_listxattr(struct dentry *dentry, char *buffer, size_t size)
 }
 
 static int
-ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end)
+ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end,
+		       void *value_start)
 {
-	while (!IS_LAST_ENTRY(entry)) {
-		struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(entry);
+	struct ext4_xattr_entry *e = entry;
+
+	while (!IS_LAST_ENTRY(e)) {
+		struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e);
 		if ((void *)next >= end)
 			return -EIO;
-		entry = next;
+		e = next;
 	}
+
+	while (!IS_LAST_ENTRY(entry)) {
+		if (entry->e_value_size != 0 &&
+		    (value_start + le16_to_cpu(entry->e_value_offs) <
+		     (void *)e + sizeof(__u32) ||
+		     value_start + le16_to_cpu(entry->e_value_offs) +
+		    le32_to_cpu(entry->e_value_size) > end))
+			return -EIO;
+		entry = EXT4_XATTR_NEXT(entry);
+	}
+
 	return 0;
 }
 
@@ -214,7 +228,8 @@ ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh)
 		return -EIO;
 	if (!ext4_xattr_block_csum_verify(inode, bh->b_blocknr, BHDR(bh)))
 		return -EIO;
-	error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size);
+	error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size,
+				       bh->b_data);
 	if (!error)
 		set_buffer_verified(bh);
 	return error;
@@ -331,7 +346,7 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
 	header = IHDR(inode, raw_inode);
 	entry = IFIRST(header);
 	end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
-	error = ext4_xattr_check_names(entry, end);
+	error = ext4_xattr_check_names(entry, end, entry);
 	if (error)
 		goto cleanup;
 	error = ext4_xattr_find_entry(&entry, name_index, name,
@@ -463,7 +478,7 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 	raw_inode = ext4_raw_inode(&iloc);
 	header = IHDR(inode, raw_inode);
 	end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
-	error = ext4_xattr_check_names(IFIRST(header), end);
+	error = ext4_xattr_check_names(IFIRST(header), end, IFIRST(header));
 	if (error)
 		goto cleanup;
 	error = ext4_xattr_list_entries(dentry, IFIRST(header),
@@ -980,7 +995,8 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
 	is->s.here = is->s.first;
 	is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
 	if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
-		error = ext4_xattr_check_names(IFIRST(header), is->s.end);
+		error = ext4_xattr_check_names(IFIRST(header), is->s.end,
+					       IFIRST(header));
 		if (error)
 			return error;
 		/* Find the named attribute. */
-- 
cgit v1.1


From 064d83892e9ba547f7d4eae22cbca066d95210ce Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Tue, 16 Sep 2014 14:43:09 -0400
Subject: jbd2: free bh when descriptor block checksum fails

Free the buffer head if the journal descriptor block fails checksum
verification.

This is the jbd2 port of the e2fsprogs patch "e2fsck: free bh on csum
verify error in do_one_pass".

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Eric Sandeen <sandeen@redhat.com>
Cc: stable@vger.kernel.org
---
 fs/jbd2/recovery.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 9b329b5..bcbef08 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -525,6 +525,7 @@ static int do_one_pass(journal_t *journal,
 			    !jbd2_descr_block_csum_verify(journal,
 							  bh->b_data)) {
 				err = -EIO;
+				brelse(bh);
 				goto failed;
 			}
 
-- 
cgit v1.1


From 1245799f752fa817a030b3b4448466e83ee7d61d Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Tue, 16 Sep 2014 14:50:50 -0400
Subject: jbd2: jbd2_log_wait_for_space improve error detetcion

If EIO happens after we have dropped j_state_lock, we won't notice
that the journal has been aborted.  So it is reasonable to move this
check after we have grabbed the j_checkpoint_mutex and re-grabbed the
j_state_lock.  This patch helps to prevent false positive complain
after EIO.

#DMESG:
__jbd2_log_wait_for_space: needed 8448 blocks and only had 8386 space available
__jbd2_log_wait_for_space: no way to get more journal space in ram1-8
------------[ cut here ]------------
WARNING: CPU: 15 PID: 6739 at fs/jbd2/checkpoint.c:168 __jbd2_log_wait_for_space+0x188/0x200()
Modules linked in: brd iTCO_wdt lpc_ich mfd_core igb ptp dm_mirror dm_region_hash dm_log dm_mod
CPU: 15 PID: 6739 Comm: fsstress Tainted: G        W      3.17.0-rc2-00429-g684de57 #139
Hardware name: Intel Corporation W2600CR/W2600CR, BIOS SE5C600.86B.99.99.x028.061320111235 06/13/2011
 00000000000000a8 ffff88077aaab878 ffffffff815c1a8c 00000000000000a8
 0000000000000000 ffff88077aaab8b8 ffffffff8106ce8c ffff88077aaab898
 ffff8807c57e6000 ffff8807c57e6028 0000000000002100 ffff8807c57e62f0
Call Trace:
 [<ffffffff815c1a8c>] dump_stack+0x51/0x6d
 [<ffffffff8106ce8c>] warn_slowpath_common+0x8c/0xc0
 [<ffffffff8106ceda>] warn_slowpath_null+0x1a/0x20
 [<ffffffff812419f8>] __jbd2_log_wait_for_space+0x188/0x200
 [<ffffffff8123be9a>] start_this_handle+0x4da/0x7b0
 [<ffffffff810990e5>] ? local_clock+0x25/0x30
 [<ffffffff810aba87>] ? lockdep_init_map+0xe7/0x180
 [<ffffffff8123c5bc>] jbd2__journal_start+0xdc/0x1d0
 [<ffffffff811f2414>] ? __ext4_new_inode+0x7f4/0x1330
 [<ffffffff81222a38>] __ext4_journal_start_sb+0xf8/0x110
 [<ffffffff811f2414>] __ext4_new_inode+0x7f4/0x1330
 [<ffffffff810ac359>] ? lock_release_holdtime+0x29/0x190
 [<ffffffff812025bb>] ext4_create+0x8b/0x150
 [<ffffffff8117fe3b>] vfs_create+0x7b/0xb0
 [<ffffffff8118097b>] do_last+0x7db/0xcf0
 [<ffffffff8117e31d>] ? inode_permission+0x4d/0x50
 [<ffffffff811845d2>] path_openat+0x242/0x590
 [<ffffffff81191a76>] ? __alloc_fd+0x36/0x140
 [<ffffffff81184a6a>] do_filp_open+0x4a/0xb0
 [<ffffffff81191b61>] ? __alloc_fd+0x121/0x140
 [<ffffffff81172f20>] do_sys_open+0x170/0x220
 [<ffffffff8117300e>] SyS_open+0x1e/0x20
 [<ffffffff811715d6>] SyS_creat+0x16/0x20
 [<ffffffff815c7e12>] system_call_fastpath+0x16/0x1b
---[ end trace cd71c831f82059db ]---

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/jbd2/checkpoint.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 9ffb19c..1fbf599 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -115,8 +115,6 @@ void __jbd2_log_wait_for_space(journal_t *journal)
 
 	nblocks = jbd2_space_needed(journal);
 	while (jbd2_log_space_left(journal) < nblocks) {
-		if (journal->j_flags & JBD2_ABORT)
-			return;
 		write_unlock(&journal->j_state_lock);
 		mutex_lock(&journal->j_checkpoint_mutex);
 
@@ -132,6 +130,10 @@ void __jbd2_log_wait_for_space(journal_t *journal)
 		 * trace for forensic evidence.
 		 */
 		write_lock(&journal->j_state_lock);
+		if (journal->j_flags & JBD2_ABORT) {
+			mutex_unlock(&journal->j_checkpoint_mutex);
+			return;
+		}
 		spin_lock(&journal->j_list_lock);
 		nblocks = jbd2_space_needed(journal);
 		space_left = jbd2_log_space_left(journal);
-- 
cgit v1.1


From 844749764b416ee2c4ba2da328c04eaad7388242 Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Tue, 16 Sep 2014 14:52:03 -0400
Subject: ext4: explicitly inform user about orphan list cleanup

Production fs likely compiled/mounted w/o jbd debugging, so orphan
list clearing will be silent.

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 2632017..028935f 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2191,7 +2191,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 	if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
 		/* don't clear list on RO mount w/ errors */
 		if (es->s_last_orphan && !(s_flags & MS_RDONLY)) {
-			jbd_debug(1, "Errors on filesystem, "
+			ext4_msg(sb, KERN_INFO, "Errors on filesystem, "
 				  "clearing orphan list.\n");
 			es->s_last_orphan = 0;
 		}
-- 
cgit v1.1


From cc97f1a7c7eed970e674b84be0e68f479c80228d Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 18 Sep 2014 00:42:16 -0400
Subject: jbd2: avoid pointless scanning of checkpoint lists

Yuanhan has reported that when he is running fsync(2) heavy workload
creating new files over ramdisk, significant amount of time is spent in
__jbd2_journal_clean_checkpoint_list() trying to clean old transactions
(but they cannot be cleaned up because flusher hasn't yet checkpointed
those buffers). The workload can be generated by:
  fs_mark -d /fs/ram0/1 -D 2 -N 2560 -n 1000000 -L 1 -S 1 -s 4096

Reduce the amount of scanning by stopping to scan the transaction list
once we find a transaction that cannot be checkpointed. Note that this
way of cleaning is still enough to keep freeing space in the journal
after fully checkpointed transactions.

Reported-and-tested-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/jbd2/checkpoint.c | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 1fbf599..3ab4c5e 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -420,7 +420,6 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
  * Find all the written-back checkpoint buffers in the given list and
  * release them.
  *
- * Called with the journal locked.
  * Called with j_list_lock held.
  * Returns number of buffers reaped (for debug)
  */
@@ -440,12 +439,12 @@ static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
 		jh = next_jh;
 		next_jh = jh->b_cpnext;
 		ret = __try_to_free_cp_buf(jh);
-		if (ret) {
-			freed++;
-			if (ret == 2) {
-				*released = 1;
-				return freed;
-			}
+		if (!ret)
+			return freed;
+		freed++;
+		if (ret == 2) {
+			*released = 1;
+			return freed;
 		}
 		/*
 		 * This function only frees up some memory
@@ -465,7 +464,6 @@ static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
  *
  * Find all the written-back checkpoint buffers in the journal and release them.
  *
- * Called with the journal locked.
  * Called with j_list_lock held.
  * Returns number of buffers reaped (for debug)
  */
@@ -473,7 +471,8 @@ static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
 int __jbd2_journal_clean_checkpoint_list(journal_t *journal)
 {
 	transaction_t *transaction, *last_transaction, *next_transaction;
-	int ret = 0;
+	int ret;
+	int freed = 0;
 	int released;
 
 	transaction = journal->j_checkpoint_transactions;
@@ -485,17 +484,21 @@ int __jbd2_journal_clean_checkpoint_list(journal_t *journal)
 	do {
 		transaction = next_transaction;
 		next_transaction = transaction->t_cpnext;
-		ret += journal_clean_one_cp_list(transaction->
+		ret = journal_clean_one_cp_list(transaction->
 				t_checkpoint_list, &released);
 		/*
 		 * This function only frees up some memory if possible so we
 		 * dont have an obligation to finish processing. Bail out if
 		 * preemption requested:
 		 */
-		if (need_resched())
+		if (need_resched()) {
+			freed += ret;
 			goto out;
-		if (released)
+		}
+		if (released) {
+			freed += ret;
 			continue;
+		}
 		/*
 		 * It is essential that we are as careful as in the case of
 		 * t_checkpoint_list with removing the buffer from the list as
@@ -503,11 +506,12 @@ int __jbd2_journal_clean_checkpoint_list(journal_t *journal)
 		 */
 		ret += journal_clean_one_cp_list(transaction->
 				t_checkpoint_io_list, &released);
-		if (need_resched())
+		freed += ret;
+		if (need_resched() || !ret)
 			goto out;
 	} while (transaction != last_transaction);
 out:
-	return ret;
+	return freed;
 }
 
 /*
-- 
cgit v1.1


From 50849db32a9f529235a84bcc84a6b8e631b1d0ec Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 18 Sep 2014 00:58:12 -0400
Subject: jbd2: simplify calling convention around
 __jbd2_journal_clean_checkpoint_list

__jbd2_journal_clean_checkpoint_list() returns number of buffers it
freed but noone was using the value so just stop doing that. This
also allows for simplifying the calling convention for
journal_clean_once_cp_list().

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/jbd2/checkpoint.c | 56 ++++++++++++++++++++++------------------------------
 1 file changed, 24 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 3ab4c5e..988b32e 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -421,16 +421,15 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
  * release them.
  *
  * Called with j_list_lock held.
- * Returns number of buffers reaped (for debug)
+ * Returns 1 if we freed the transaction, 0 otherwise.
  */
-
-static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
+static int journal_clean_one_cp_list(struct journal_head *jh)
 {
 	struct journal_head *last_jh;
 	struct journal_head *next_jh = jh;
-	int ret, freed = 0;
+	int ret;
+	int freed = 0;
 
-	*released = 0;
 	if (!jh)
 		return 0;
 
@@ -441,11 +440,9 @@ static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
 		ret = __try_to_free_cp_buf(jh);
 		if (!ret)
 			return freed;
-		freed++;
-		if (ret == 2) {
-			*released = 1;
-			return freed;
-		}
+		if (ret == 2)
+			return 1;
+		freed = 1;
 		/*
 		 * This function only frees up some memory
 		 * if possible so we dont have an obligation
@@ -465,53 +462,48 @@ static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
  * Find all the written-back checkpoint buffers in the journal and release them.
  *
  * Called with j_list_lock held.
- * Returns number of buffers reaped (for debug)
  */
-
-int __jbd2_journal_clean_checkpoint_list(journal_t *journal)
+void __jbd2_journal_clean_checkpoint_list(journal_t *journal)
 {
 	transaction_t *transaction, *last_transaction, *next_transaction;
 	int ret;
-	int freed = 0;
-	int released;
 
 	transaction = journal->j_checkpoint_transactions;
 	if (!transaction)
-		goto out;
+		return;
 
 	last_transaction = transaction->t_cpprev;
 	next_transaction = transaction;
 	do {
 		transaction = next_transaction;
 		next_transaction = transaction->t_cpnext;
-		ret = journal_clean_one_cp_list(transaction->
-				t_checkpoint_list, &released);
+		ret = journal_clean_one_cp_list(transaction->t_checkpoint_list);
 		/*
 		 * This function only frees up some memory if possible so we
 		 * dont have an obligation to finish processing. Bail out if
 		 * preemption requested:
 		 */
-		if (need_resched()) {
-			freed += ret;
-			goto out;
-		}
-		if (released) {
-			freed += ret;
+		if (need_resched())
+			return;
+		if (ret)
 			continue;
-		}
 		/*
 		 * It is essential that we are as careful as in the case of
 		 * t_checkpoint_list with removing the buffer from the list as
 		 * we can possibly see not yet submitted buffers on io_list
 		 */
-		ret += journal_clean_one_cp_list(transaction->
-				t_checkpoint_io_list, &released);
-		freed += ret;
-		if (need_resched() || !ret)
-			goto out;
+		ret = journal_clean_one_cp_list(transaction->
+				t_checkpoint_io_list);
+		if (need_resched())
+			return;
+		/*
+		 * Stop scanning if we couldn't free the transaction. This
+		 * avoids pointless scanning of transactions which still
+		 * weren't checkpointed.
+		 */
+		if (!ret)
+			return;
 	} while (transaction != last_transaction);
-out:
-	return freed;
 }
 
 /*
-- 
cgit v1.1


From 279bf6d390933d5353ab298fcc306c391a961469 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 18 Sep 2014 01:12:15 -0400
Subject: ext4: don't check quota format when there are no quota files

The check whether quota format is set even though there are no
quota files with journalled quota is pointless and it actually
makes it impossible to turn off journalled quotas (as there's
no way to unset journalled quota format). Just remove the check.

CC: stable@vger.kernel.org
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/super.c | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 028935f..115e27d 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1712,13 +1712,6 @@ static int parse_options(char *options, struct super_block *sb,
 					"not specified");
 			return 0;
 		}
-	} else {
-		if (sbi->s_jquota_fmt) {
-			ext4_msg(sb, KERN_ERR, "journaled quota format "
-					"specified with no journaling "
-					"enabled");
-			return 0;
-		}
 	}
 #endif
 	if (test_opt(sb, DIOREAD_NOLOCK)) {
-- 
cgit v1.1


From bda3253043c54a705c8352096194ab6216e2e5c1 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 18 Sep 2014 16:12:37 -0400
Subject: ext4: fold ext4_sync_fs_nojournal() into ext4_sync_fs()

This allows us to eliminate duplicate code, and eventually allow us to
also fold ext4_sops and ext4_nojournal_sops together.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/super.c | 36 +++++++++++++-----------------------
 1 file changed, 13 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 115e27d..4770c98 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -70,7 +70,6 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
 static void ext4_clear_journal_err(struct super_block *sb,
 				   struct ext4_super_block *es);
 static int ext4_sync_fs(struct super_block *sb, int wait);
-static int ext4_sync_fs_nojournal(struct super_block *sb, int wait);
 static int ext4_remount(struct super_block *sb, int *flags, char *data);
 static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int ext4_unfreeze(struct super_block *sb);
@@ -1131,7 +1130,7 @@ static const struct super_operations ext4_nojournal_sops = {
 	.dirty_inode	= ext4_dirty_inode,
 	.drop_inode	= ext4_drop_inode,
 	.evict_inode	= ext4_evict_inode,
-	.sync_fs	= ext4_sync_fs_nojournal,
+	.sync_fs	= ext4_sync_fs,
 	.put_super	= ext4_put_super,
 	.statfs		= ext4_statfs,
 	.remount_fs	= ext4_remount,
@@ -4718,15 +4717,19 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
 	 * being sent at the end of the function. But we can skip it if
 	 * transaction_commit will do it for us.
 	 */
-	target = jbd2_get_latest_transaction(sbi->s_journal);
-	if (wait && sbi->s_journal->j_flags & JBD2_BARRIER &&
-	    !jbd2_trans_will_send_data_barrier(sbi->s_journal, target))
+	if (sbi->s_journal) {
+		target = jbd2_get_latest_transaction(sbi->s_journal);
+		if (wait && sbi->s_journal->j_flags & JBD2_BARRIER &&
+		    !jbd2_trans_will_send_data_barrier(sbi->s_journal, target))
+			needs_barrier = true;
+
+		if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
+			if (wait)
+				ret = jbd2_log_wait_commit(sbi->s_journal,
+							   target);
+		}
+	} else if (wait && test_opt(sb, BARRIER))
 		needs_barrier = true;
-
-	if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
-		if (wait)
-			ret = jbd2_log_wait_commit(sbi->s_journal, target);
-	}
 	if (needs_barrier) {
 		int err;
 		err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
@@ -4737,19 +4740,6 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
 	return ret;
 }
 
-static int ext4_sync_fs_nojournal(struct super_block *sb, int wait)
-{
-	int ret = 0;
-
-	trace_ext4_sync_fs(sb, wait);
-	flush_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
-	dquot_writeback_dquots(sb, -1);
-	if (wait && test_opt(sb, BARRIER))
-		ret = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
-
-	return ret;
-}
-
 /*
  * LVM calls this function before a (read-only) snapshot is created.  This
  * gives us a chance to flush the journal completely and mark the fs clean.
-- 
cgit v1.1


From bb0445765866e5b1607af81e2f48ca5a8efbeed8 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 18 Sep 2014 17:12:02 -0400
Subject: ext4: support freezing ext2 (nojournal) file systems

Through an oversight, when we added nojournal support to ext4, we
didn't add support to allow file system freezing.  This is relatively
easy to add, so let's do it.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reported-by: Dexuan Cui <decui@microsoft.com>
---
 fs/ext4/super.c | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 4770c98..4db537b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1131,6 +1131,8 @@ static const struct super_operations ext4_nojournal_sops = {
 	.drop_inode	= ext4_drop_inode,
 	.evict_inode	= ext4_evict_inode,
 	.sync_fs	= ext4_sync_fs,
+	.freeze_fs	= ext4_freeze,
+	.unfreeze_fs	= ext4_unfreeze,
 	.put_super	= ext4_put_super,
 	.statfs		= ext4_statfs,
 	.remount_fs	= ext4_remount,
@@ -4758,23 +4760,26 @@ static int ext4_freeze(struct super_block *sb)
 
 	journal = EXT4_SB(sb)->s_journal;
 
-	/* Now we set up the journal barrier. */
-	jbd2_journal_lock_updates(journal);
+	if (journal) {
+		/* Now we set up the journal barrier. */
+		jbd2_journal_lock_updates(journal);
 
-	/*
-	 * Don't clear the needs_recovery flag if we failed to flush
-	 * the journal.
-	 */
-	error = jbd2_journal_flush(journal);
-	if (error < 0)
-		goto out;
+		/*
+		 * Don't clear the needs_recovery flag if we failed to
+		 * flush the journal.
+		 */
+		error = jbd2_journal_flush(journal);
+		if (error < 0)
+			goto out;
+	}
 
 	/* Journal blocked and flushed, clear needs_recovery flag. */
 	EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 	error = ext4_commit_super(sb, 1);
 out:
-	/* we rely on upper layer to stop further updates */
-	jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+	if (journal)
+		/* we rely on upper layer to stop further updates */
+		jbd2_journal_unlock_updates(journal);
 	return error;
 }
 
-- 
cgit v1.1


From f6e63f90809946d410c42045577cb159fedabf8c Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 18 Sep 2014 17:12:30 -0400
Subject: ext4: fold ext4_nojournal_sops into ext4_sops

There's no longer any need to have a separate set of super_operations
for nojournal mode.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/super.c | 27 +--------------------------
 1 file changed, 1 insertion(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 4db537b..1070d6e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1123,27 +1123,6 @@ static const struct super_operations ext4_sops = {
 	.bdev_try_to_free_page = bdev_try_to_free_page,
 };
 
-static const struct super_operations ext4_nojournal_sops = {
-	.alloc_inode	= ext4_alloc_inode,
-	.destroy_inode	= ext4_destroy_inode,
-	.write_inode	= ext4_write_inode,
-	.dirty_inode	= ext4_dirty_inode,
-	.drop_inode	= ext4_drop_inode,
-	.evict_inode	= ext4_evict_inode,
-	.sync_fs	= ext4_sync_fs,
-	.freeze_fs	= ext4_freeze,
-	.unfreeze_fs	= ext4_unfreeze,
-	.put_super	= ext4_put_super,
-	.statfs		= ext4_statfs,
-	.remount_fs	= ext4_remount,
-	.show_options	= ext4_show_options,
-#ifdef CONFIG_QUOTA
-	.quota_read	= ext4_quota_read,
-	.quota_write	= ext4_quota_write,
-#endif
-	.bdev_try_to_free_page = bdev_try_to_free_page,
-};
-
 static const struct export_operations ext4_export_ops = {
 	.fh_to_dentry = ext4_fh_to_dentry,
 	.fh_to_parent = ext4_fh_to_parent,
@@ -3941,11 +3920,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	/*
 	 * set up enough so that it can read an inode
 	 */
-	if (!test_opt(sb, NOLOAD) &&
-	    EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
-		sb->s_op = &ext4_sops;
-	else
-		sb->s_op = &ext4_nojournal_sops;
+	sb->s_op = &ext4_sops;
 	sb->s_export_op = &ext4_export_ops;
 	sb->s_xattr = ext4_xattr_handlers;
 #ifdef CONFIG_QUOTA
-- 
cgit v1.1


From 90a8020278c1598fafd071736a0846b38510309c Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 1 Oct 2014 21:49:18 -0400
Subject: vfs: fix data corruption when blocksize < pagesize for mmaped data

->page_mkwrite() is used by filesystems to allocate blocks under a page
which is becoming writeably mmapped in some process' address space. This
allows a filesystem to return a page fault if there is not enough space
available, user exceeds quota or similar problem happens, rather than
silently discarding data later when writepage is called.

However VFS fails to call ->page_mkwrite() in all the cases where
filesystems need it when blocksize < pagesize. For example when
blocksize = 1024, pagesize = 4096 the following is problematic:
  ftruncate(fd, 0);
  pwrite(fd, buf, 1024, 0);
  map = mmap(NULL, 1024, PROT_WRITE, MAP_SHARED, fd, 0);
  map[0] = 'a';       ----> page_mkwrite() for index 0 is called
  ftruncate(fd, 10000); /* or even pwrite(fd, buf, 1, 10000) */
  mremap(map, 1024, 10000, 0);
  map[4095] = 'a';    ----> no page_mkwrite() called

At the moment ->page_mkwrite() is called, filesystem can allocate only
one block for the page because i_size == 1024. Otherwise it would create
blocks beyond i_size which is generally undesirable. But later at
->writepage() time, we also need to store data at offset 4095 but we
don't have block allocated for it.

This patch introduces a helper function filesystems can use to have
->page_mkwrite() called at all the necessary moments.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@vger.kernel.org
---
 fs/buffer.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index 9a6029e..6dc1475 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2087,6 +2087,7 @@ int generic_write_end(struct file *file, struct address_space *mapping,
 			struct page *page, void *fsdata)
 {
 	struct inode *inode = mapping->host;
+	loff_t old_size = inode->i_size;
 	int i_size_changed = 0;
 
 	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
@@ -2106,6 +2107,8 @@ int generic_write_end(struct file *file, struct address_space *mapping,
 	unlock_page(page);
 	page_cache_release(page);
 
+	if (old_size < pos)
+		pagecache_isize_extended(inode, old_size, pos);
 	/*
 	 * Don't mark the inode dirty under page lock. First, it unnecessarily
 	 * makes the holding time of page lock longer. Second, it forces lock
-- 
cgit v1.1


From d6320cbfc92910a3e5f10c42d98c231c98db4f60 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 1 Oct 2014 21:49:46 -0400
Subject: ext4: fix mmap data corruption when blocksize < pagesize

Use truncate_isize_extended() when hole is being created in a file so that
->page_mkwrite() will get called for the partial tail page if it is
mmaped (see the first patch in the series for details).

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@vger.kernel.org
---
 fs/ext4/inode.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d5dd7d4..0918452 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4514,8 +4514,12 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 				ext4_orphan_del(NULL, inode);
 				goto err_out;
 			}
-		} else
+		} else {
+			loff_t oldsize = inode->i_size;
+
 			i_size_write(inode, attr->ia_size);
+			pagecache_isize_extended(inode, oldsize, inode->i_size);
+		}
 
 		/*
 		 * Blocks are going to be removed from the inode. Wait
-- 
cgit v1.1


From bce92d566a57893e98ec83e4e5447f860d2889b7 Mon Sep 17 00:00:00 2001
From: Li Xi <pkuelelixi@gmail.com>
Date: Wed, 1 Oct 2014 22:11:06 -0400
Subject: ext4: fix return value of ext4_do_update_inode

When ext4_do_update_inode() gets error from ext4_inode_blocks_set(),
error number should be returned.

Signed-off-by: Li Xi <lixi@ddn.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/inode.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0918452..41c4f97 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4204,7 +4204,8 @@ static int ext4_do_update_inode(handle_t *handle,
 	EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
 	EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
 
-	if (ext4_inode_blocks_set(handle, raw_inode, ei)) {
+	err = ext4_inode_blocks_set(handle, raw_inode, ei);
+	if (err) {
 		spin_unlock(&ei->i_raw_lock);
 		goto out_brelse;
 	}
-- 
cgit v1.1


From c5d311926da483951bd5da637ed65de8614d1901 Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Wed, 1 Oct 2014 22:23:15 -0400
Subject: ext4: fix over-defensive complaint after journal abort

Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4_jbd2.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 0074e0d..3445035 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -256,8 +256,8 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
 	set_buffer_prio(bh);
 	if (ext4_handle_valid(handle)) {
 		err = jbd2_journal_dirty_metadata(handle, bh);
-		/* Errors can only happen if there is a bug */
-		if (WARN_ON_ONCE(err)) {
+		/* Errors can only happen due to aborted journal or a nasty bug */
+		if (!is_handle_aborted(handle) && WARN_ON_ONCE(err)) {
 			ext4_journal_abort_handle(where, line, __func__, bh,
 						  handle, err);
 			if (inode == NULL) {
-- 
cgit v1.1


From dfe076c106f63cf6bcd375c56db9c8c89a088dab Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Wed, 1 Oct 2014 22:26:17 -0400
Subject: ext4: get rid of code duplication

Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/mballoc.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 65cca28..eab825f 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3155,9 +3155,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
 			 "start %lu, size %lu, fe_logical %lu",
 			 (unsigned long) start, (unsigned long) size,
 			 (unsigned long) ac->ac_o_ex.fe_logical);
+		BUG();
 	}
-	BUG_ON(start + size <= ac->ac_o_ex.fe_logical &&
-			start > ac->ac_o_ex.fe_logical);
 	BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
 
 	/* now prepare goal request */
-- 
cgit v1.1


From be5cd90ddaf471e676fad6ced29e69e8610c5d20 Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Wed, 1 Oct 2014 22:57:09 -0400
Subject: ext4: optimize block allocation on grow indepth

It is reasonable to prepend newly created index to older one.

[ Dropped no longer used function parameter newext. -tytso ]

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/extents.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 8170b32..c3ed9af2 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1263,16 +1263,24 @@ cleanup:
  *   just created block
  */
 static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
-				 unsigned int flags,
-				 struct ext4_extent *newext)
+				 unsigned int flags)
 {
 	struct ext4_extent_header *neh;
 	struct buffer_head *bh;
-	ext4_fsblk_t newblock;
+	ext4_fsblk_t newblock, goal = 0;
+	struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
 	int err = 0;
 
-	newblock = ext4_ext_new_meta_block(handle, inode, NULL,
-		newext, &err, flags);
+	/* Try to prepend new index to old one */
+	if (ext_depth(inode))
+		goal = ext4_idx_pblock(EXT_FIRST_INDEX(ext_inode_hdr(inode)));
+	if (goal > le32_to_cpu(es->s_first_data_block)) {
+		flags |= EXT4_MB_HINT_TRY_GOAL;
+		goal--;
+	} else
+		goal = ext4_inode_to_goal_block(inode);
+	newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
+					NULL, &err);
 	if (newblock == 0)
 		return err;
 
@@ -1373,7 +1381,7 @@ repeat:
 			err = PTR_ERR(path);
 	} else {
 		/* tree is full, time to grow in depth */
-		err = ext4_ext_grow_indepth(handle, inode, mb_flags, newext);
+		err = ext4_ext_grow_indepth(handle, inode, mb_flags);
 		if (err)
 			goto out;
 
-- 
cgit v1.1


From 3e67cfad22230ebed85c56cbe413876f33fea82b Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Fri, 3 Oct 2014 12:47:23 -0400
Subject: ext4: grab missed write_count for EXT4_IOC_SWAP_BOOT

Otherwise this provokes complain like follows:
WARNING: CPU: 12 PID: 5795 at fs/ext4/ext4_jbd2.c:48 ext4_journal_check_start+0x4e/0xa0()
Modules linked in: brd iTCO_wdt lpc_ich mfd_core igb ptp dm_mirror dm_region_hash dm_log dm_mod
CPU: 12 PID: 5795 Comm: python Not tainted 3.17.0-rc2-00175-gae5344f #158
Hardware name: Intel Corporation W2600CR/W2600CR, BIOS SE5C600.86B.99.99.x028.061320111235 06/13/2011
 0000000000000030 ffff8808116cfd28 ffffffff815c7dfc 0000000000000030
 0000000000000000 ffff8808116cfd68 ffffffff8106ce8c ffff8808116cfdc8
 ffff880813b16000 ffff880806ad6ae8 ffffffff81202008 0000000000000000
Call Trace:
 [<ffffffff815c7dfc>] dump_stack+0x51/0x6d
 [<ffffffff8106ce8c>] warn_slowpath_common+0x8c/0xc0
 [<ffffffff81202008>] ? ext4_ioctl+0x9e8/0xeb0
 [<ffffffff8106ceda>] warn_slowpath_null+0x1a/0x20
 [<ffffffff8122867e>] ext4_journal_check_start+0x4e/0xa0
 [<ffffffff81228c10>] __ext4_journal_start_sb+0x90/0x110
 [<ffffffff81202008>] ext4_ioctl+0x9e8/0xeb0
 [<ffffffff8107b0bd>] ? ptrace_stop+0x24d/0x2f0
 [<ffffffff81088530>] ? alloc_pid+0x480/0x480
 [<ffffffff8107b1f2>] ? ptrace_do_notify+0x92/0xb0
 [<ffffffff81186545>] do_vfs_ioctl+0x4e5/0x550
 [<ffffffff815cdbcb>] ? _raw_spin_unlock_irq+0x2b/0x40
 [<ffffffff81186603>] SyS_ioctl+0x53/0x80
 [<ffffffff815ce2ce>] tracesys+0xd0/0xd5

Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@vger.kernel.org
---
 fs/ext4/ioctl.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 0f2252e..3d5de16 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -532,9 +532,17 @@ group_add_out:
 	}
 
 	case EXT4_IOC_SWAP_BOOT:
+	{
+		int err;
 		if (!(filp->f_mode & FMODE_WRITE))
 			return -EBADF;
-		return swap_inode_boot_loader(sb, inode);
+		err = mnt_want_write_file(filp);
+		if (err)
+			return err;
+		err = swap_inode_boot_loader(sb, inode);
+		mnt_drop_write_file(filp);
+		return err;
+	}
 
 	case EXT4_IOC_RESIZE_FS: {
 		ext4_fsblk_t n_blocks_count;
-- 
cgit v1.1


From e2bfb088fac03c0f621886a04cffc7faa2b49b1d Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sun, 5 Oct 2014 22:47:07 -0400
Subject: ext4: don't orphan or truncate the boot loader inode

The boot loader inode (inode #5) should never be visible in the
directory hierarchy, but it's possible if the file system is corrupted
that there will be a directory entry that points at inode #5.  In
order to avoid accidentally trashing it, when such a directory inode
is opened, the inode will be marked as a bad inode, so that it's not
possible to modify (or read) the inode from userspace.

Unfortunately, when we unlink this (invalid/illegal) directory entry,
we will put the bad inode on the ophan list, and then when try to
unlink the directory, we don't actually remove the bad inode from the
orphan list before freeing in-memory inode structure.  This means the
in-memory orphan list is corrupted, leading to a kernel oops.

In addition, avoid truncating a bad inode in ext4_destroy_inode(),
since truncating the boot loader inode is not a smart thing to do.

Reported-by: Sami Liedes <sami.liedes@iki.fi>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@vger.kernel.org
---
 fs/ext4/inode.c | 7 +++----
 fs/ext4/namei.c | 2 +-
 2 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 41c4f97..59983b2 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -224,16 +224,15 @@ void ext4_evict_inode(struct inode *inode)
 		goto no_delete;
 	}
 
-	if (!is_bad_inode(inode))
-		dquot_initialize(inode);
+	if (is_bad_inode(inode))
+		goto no_delete;
+	dquot_initialize(inode);
 
 	if (ext4_should_order_data(inode))
 		ext4_begin_ordered_truncate(inode, 0);
 	truncate_inode_pages_final(&inode->i_data);
 
 	WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
-	if (is_bad_inode(inode))
-		goto no_delete;
 
 	/*
 	 * Protect us against freezing - iput() caller didn't have to have any
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 51705f8..a2a9d40 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2544,7 +2544,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
 	int err = 0, rc;
 	bool dirty = false;
 
-	if (!sbi->s_journal)
+	if (!sbi->s_journal || is_bad_inode(inode))
 		return 0;
 
 	WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
-- 
cgit v1.1


From f4bb2981024fc91b23b4d09a8817c415396dbabb Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sun, 5 Oct 2014 22:56:00 -0400
Subject: ext4: add ext4_iget_normal() which is to be used for dir tree lookups

If there is a corrupted file system which has directory entries that
point at reserved, metadata inodes, prohibit them from being used by
treating them the same way we treat Boot Loader inodes --- that is,
mark them to be bad inodes.  This prohibits them from being opened,
deleted, or modified via chmod, chown, utimes, etc.

In particular, this prevents a corrupted file system which has a
directory entry which points at the journal inode from being deleted
and its blocks released, after which point Much Hilarity Ensues.

Reported-by: Sami Liedes <sami.liedes@iki.fi>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@vger.kernel.org
---
 fs/ext4/ext4.h  | 1 +
 fs/ext4/inode.c | 7 +++++++
 fs/ext4/namei.c | 4 ++--
 fs/ext4/super.c | 2 +-
 4 files changed, 11 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 1eb5b7b..012e89b 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2109,6 +2109,7 @@ int do_journal_get_write_access(handle_t *handle,
 #define CONVERT_INLINE_DATA	 2
 
 extern struct inode *ext4_iget(struct super_block *, unsigned long);
+extern struct inode *ext4_iget_normal(struct super_block *, unsigned long);
 extern int  ext4_write_inode(struct inode *, struct writeback_control *);
 extern int  ext4_setattr(struct dentry *, struct iattr *);
 extern int  ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 59983b2..e204d8a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4104,6 +4104,13 @@ bad_inode:
 	return ERR_PTR(ret);
 }
 
+struct inode *ext4_iget_normal(struct super_block *sb, unsigned long ino)
+{
+	if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)
+		return ERR_PTR(-EIO);
+	return ext4_iget(sb, ino);
+}
+
 static int ext4_inode_blocks_set(handle_t *handle,
 				struct ext4_inode *raw_inode,
 				struct ext4_inode_info *ei)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index a2a9d40..7037ecf 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1417,7 +1417,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
 					 dentry);
 			return ERR_PTR(-EIO);
 		}
-		inode = ext4_iget(dir->i_sb, ino);
+		inode = ext4_iget_normal(dir->i_sb, ino);
 		if (inode == ERR_PTR(-ESTALE)) {
 			EXT4_ERROR_INODE(dir,
 					 "deleted inode referenced: %u",
@@ -1450,7 +1450,7 @@ struct dentry *ext4_get_parent(struct dentry *child)
 		return ERR_PTR(-EIO);
 	}
 
-	return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino));
+	return d_obtain_alias(ext4_iget_normal(child->d_inode->i_sb, ino));
 }
 
 /*
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 1070d6e..a0811cc 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1001,7 +1001,7 @@ static struct inode *ext4_nfs_get_inode(struct super_block *sb,
 	 * Currently we don't know the generation for parent directory, so
 	 * a generation of 0 means "accept any"
 	 */
-	inode = ext4_iget(sb, ino);
+	inode = ext4_iget_normal(sb, ino);
 	if (IS_ERR(inode))
 		return ERR_CAST(inode);
 	if (generation && inode->i_generation != generation) {
-- 
cgit v1.1


From 0ff8947fc5f700172b37cbca811a38eb9cb81e08 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Sat, 11 Oct 2014 19:51:17 -0400
Subject: ext4: fix reservation overflow in ext4_da_write_begin

Delalloc write journal reservations only reserve 1 credit,
to update the inode if necessary.  However, it may happen
once in a filesystem's lifetime that a file will cross
the 2G threshold, and require the LARGE_FILE feature to
be set in the superblock as well, if it was not set already.

This overruns the transaction reservation, and can be
demonstrated simply on any ext4 filesystem without the LARGE_FILE
feature already set:

dd if=/dev/zero of=testfile bs=1 seek=2147483646 count=1 \
	conv=notrunc of=testfile
sync
dd if=/dev/zero of=testfile bs=1 seek=2147483647 count=1 \
	conv=notrunc of=testfile

leads to:

EXT4-fs: ext4_do_update_inode:4296: aborting transaction: error 28 in __ext4_handle_dirty_super
EXT4-fs error (device loop0) in ext4_do_update_inode:4301: error 28
EXT4-fs error (device loop0) in ext4_reserve_inode_write:4757: Readonly filesystem
EXT4-fs error (device loop0) in ext4_dirty_inode:4876: error 28
EXT4-fs error (device loop0) in ext4_da_write_end:2685: error 28

Adjust the number of credits based on whether the flag is
already set, and whether the current write may extend past the
LARGE_FILE limit.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
Cc: stable@vger.kernel.org
---
 fs/ext4/inode.c | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e204d8a..0dd9150 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2495,6 +2495,20 @@ static int ext4_nonda_switch(struct super_block *sb)
 	return 0;
 }
 
+/* We always reserve for an inode update; the superblock could be there too */
+static int ext4_da_write_credits(struct inode *inode, loff_t pos, unsigned len)
+{
+	if (likely(EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+				EXT4_FEATURE_RO_COMPAT_LARGE_FILE)))
+		return 1;
+
+	if (pos + len <= 0x7fffffffULL)
+		return 1;
+
+	/* We might need to update the superblock to set LARGE_FILE */
+	return 2;
+}
+
 static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
 			       loff_t pos, unsigned len, unsigned flags,
 			       struct page **pagep, void **fsdata)
@@ -2545,7 +2559,8 @@ retry_grab:
 	 * of file which has an already mapped buffer.
 	 */
 retry_journal:
-	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 1);
+	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
+				ext4_da_write_credits(inode, pos, len));
 	if (IS_ERR(handle)) {
 		page_cache_release(page);
 		return PTR_ERR(handle);
-- 
cgit v1.1


From 65dd8327eb055a393a413a2214f70a9a10ff7ad6 Mon Sep 17 00:00:00 2001
From: Xiaoguang Wang <wangxg.fnst@cn.fujitsu.com>
Date: Sat, 11 Oct 2014 19:56:34 -0400
Subject: ext4: delete useless comments about ext4_move_extents

In patch 'ext4: refactor ext4_move_extents code base',  Dmitry Monakhov has
refactored ext4_move_extents' implementation, but forgot to update the
corresponding comments, this patch will try to delete some useless comments.

Reviewed-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: Xiaoguang Wang <wangxg.fnst@cn.fujitsu.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/move_extent.c | 59 ++++++---------------------------------------------
 1 file changed, 6 insertions(+), 53 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 5d78063..9f2311b 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -120,33 +120,12 @@ out:
 }
 
 /**
- * mext_replace_branches - Replace original extents with new extents
- *
- * @handle:		journal handle
- * @orig_inode:		original inode
- * @donor_inode:	donor inode
- * @from:		block offset of orig_inode
- * @count:		block count to be replaced
- * @err:		pointer to save return value
- *
- * Replace original inode extents and donor inode extents page by page.
- * We implement this replacement in the following three steps:
- * 1. Save the block information of original and donor inodes into
- *    dummy extents.
- * 2. Change the block information of original inode to point at the
- *    donor inode blocks.
- * 3. Change the block information of donor inode to point at the saved
- *    original inode blocks in the dummy extents.
- *
- * Return replaced block count.
- */
-
-/**
  * mext_page_double_lock - Grab and lock pages on both @inode1 and @inode2
  *
  * @inode1:	the inode structure
  * @inode2:	the inode structure
- * @index:	page index
+ * @index1:	page index
+ * @index2:	page index
  * @page:	result page vector
  *
  * Grab two locked pages for inode's by inode order
@@ -266,13 +245,14 @@ out:
  * @o_filp:			file structure of original file
  * @donor_inode:		donor inode
  * @orig_page_offset:		page index on original file
+ * @donor_page_offset:		page index on donor file
  * @data_offset_in_page:	block index where data swapping starts
  * @block_len_in_page:		the number of blocks to be swapped
  * @unwritten:			orig extent is unwritten or not
  * @err:			pointer to save return value
  *
  * Save the data in original inode blocks and replace original inode extents
- * with donor inode extents by calling mext_replace_branches().
+ * with donor inode extents by calling ext4_swap_extents().
  * Finally, write out the saved data in new original inode blocks. Return
  * replaced block count.
  */
@@ -551,41 +531,14 @@ mext_check_arguments(struct inode *orig_inode,
  *
  * @o_filp:		file structure of the original file
  * @d_filp:		file structure of the donor file
- * @orig_start:		start offset in block for orig
- * @donor_start:	start offset in block for donor
+ * @orig_blk:		start offset in block for orig
+ * @donor_blk:		start offset in block for donor
  * @len:		the number of blocks to be moved
  * @moved_len:		moved block length
  *
  * This function returns 0 and moved block length is set in moved_len
  * if succeed, otherwise returns error value.
  *
- * Note: ext4_move_extents() proceeds the following order.
- * 1:ext4_move_extents() calculates the last block number of moving extent
- *   function by the start block number (orig_start) and the number of blocks
- *   to be moved (len) specified as arguments.
- *   If the {orig, donor}_start points a hole, the extent's start offset
- *   pointed by ext_cur (current extent), holecheck_path, orig_path are set
- *   after hole behind.
- * 2:Continue step 3 to step 5, until the holecheck_path points to last_extent
- *   or the ext_cur exceeds the block_end which is last logical block number.
- * 3:To get the length of continues area, call mext_next_extent()
- *   specified with the ext_cur (initial value is holecheck_path) re-cursive,
- *   until find un-continuous extent, the start logical block number exceeds
- *   the block_end or the extent points to the last extent.
- * 4:Exchange the original inode data with donor inode data
- *   from orig_page_offset to seq_end_page.
- *   The start indexes of data are specified as arguments.
- *   That of the original inode is orig_page_offset,
- *   and the donor inode is also orig_page_offset
- *   (To easily handle blocksize != pagesize case, the offset for the
- *   donor inode is block unit).
- * 5:Update holecheck_path and orig_path to points a next proceeding extent,
- *   then returns to step 2.
- * 6:Release holecheck_path, orig_path and set the len to moved_len
- *   which shows the number of moved blocks.
- *   The moved_len is useful for the command to calculate the file offset
- *   for starting next move extent ioctl.
- * 7:Return 0 on success, or a negative error value on failure.
  */
 int
 ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
-- 
cgit v1.1


From 9aa5d32ba269bec0e7eaba2697a986a7b0bc8528 Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Mon, 13 Oct 2014 03:36:16 -0400
Subject: ext4: Replace open coded mdata csum feature to helper function

Besides the fact that this replacement improves code readability
it also protects from errors caused direct EXT4_S(sb)->s_es manipulation
which may result attempt to use uninitialized  csum machinery.

#Testcase_BEGIN
IMG=/dev/ram0
MNT=/mnt
mkfs.ext4 $IMG
mount $IMG $MNT
#Enable feature directly on disk, on mounted fs
tune2fs -O metadata_csum  $IMG
# Provoke metadata update, likey result in OOPS
touch $MNT/test
umount $MNT
#Testcase_END

# Replacement script
@@
expression E;
@@
- EXT4_HAS_RO_COMPAT_FEATURE(E, EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)
+ ext4_has_metadata_csum(E)

https://bugzilla.kernel.org/show_bug.cgi?id=82201

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@vger.kernel.org
---
 fs/ext4/bitmap.c  | 12 ++++--------
 fs/ext4/ext4.h    |  8 ++++++++
 fs/ext4/extents.c |  6 ++----
 fs/ext4/ialloc.c  |  3 +--
 fs/ext4/inline.c  |  3 +--
 fs/ext4/inode.c   |  9 +++------
 fs/ext4/ioctl.c   |  3 +--
 fs/ext4/mmp.c     |  6 ++----
 fs/ext4/namei.c   | 39 +++++++++++++--------------------------
 fs/ext4/resize.c  |  3 +--
 fs/ext4/super.c   | 15 +++++----------
 fs/ext4/xattr.c   |  6 ++----
 12 files changed, 43 insertions(+), 70 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index 3285aa5..b610779 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -24,8 +24,7 @@ int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
 	__u32 provided, calculated;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
-	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
-					EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+	if (!ext4_has_metadata_csum(sb))
 		return 1;
 
 	provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo);
@@ -46,8 +45,7 @@ void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
 	__u32 csum;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
-	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
-					EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+	if (!ext4_has_metadata_csum(sb))
 		return;
 
 	csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
@@ -65,8 +63,7 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8;
 
-	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
-					EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+	if (!ext4_has_metadata_csum(sb))
 		return 1;
 
 	provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo);
@@ -91,8 +88,7 @@ void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
 	__u32 csum;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
-	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
-			EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+	if (!ext4_has_metadata_csum(sb))
 		return;
 
 	csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 012e89b..1483d9c 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2337,6 +2337,14 @@ static inline int ext4_has_group_desc_csum(struct super_block *sb)
 					  EXT4_FEATURE_RO_COMPAT_METADATA_CSUM);
 }
 
+static inline int ext4_has_metadata_csum(struct super_block *sb)
+{
+	WARN_ON_ONCE(EXT4_HAS_RO_COMPAT_FEATURE(sb,
+			EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
+		     !EXT4_SB(sb)->s_chksum_driver);
+
+	return (EXT4_SB(sb)->s_chksum_driver != NULL);
+}
 static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
 {
 	return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) |
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index c3ed9af2..37043d0 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -73,8 +73,7 @@ static int ext4_extent_block_csum_verify(struct inode *inode,
 {
 	struct ext4_extent_tail *et;
 
-	if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
-		EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+	if (!ext4_has_metadata_csum(inode->i_sb))
 		return 1;
 
 	et = find_ext4_extent_tail(eh);
@@ -88,8 +87,7 @@ static void ext4_extent_block_csum_set(struct inode *inode,
 {
 	struct ext4_extent_tail *et;
 
-	if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
-		EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+	if (!ext4_has_metadata_csum(inode->i_sb))
 		return;
 
 	et = find_ext4_extent_tail(eh);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 5b87fc3..8012a5d 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1011,8 +1011,7 @@ got:
 	spin_unlock(&sbi->s_next_gen_lock);
 
 	/* Precompute checksum seed for inode metadata */
-	if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
-			EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
+	if (ext4_has_metadata_csum(sb)) {
 		__u32 csum;
 		__le32 inum = cpu_to_le32(inode->i_ino);
 		__le32 gen = cpu_to_le32(inode->i_generation);
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 378aadf..3ea6269 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -1128,8 +1128,7 @@ static int ext4_finish_convert_inline_dir(handle_t *handle,
 	memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE,
 		inline_size - EXT4_INLINE_DOTDOT_SIZE);
 
-	if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
-				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+	if (ext4_has_metadata_csum(inode->i_sb))
 		csum_size = sizeof(struct ext4_dir_entry_tail);
 
 	inode->i_size = inode->i_sb->s_blocksize;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0dd9150..e9777f9 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -83,8 +83,7 @@ static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw,
 
 	if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
 	    cpu_to_le32(EXT4_OS_LINUX) ||
-	    !EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
-		EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+	    !ext4_has_metadata_csum(inode->i_sb))
 		return 1;
 
 	provided = le16_to_cpu(raw->i_checksum_lo);
@@ -105,8 +104,7 @@ static void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
 
 	if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
 	    cpu_to_le32(EXT4_OS_LINUX) ||
-	    !EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
-		EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+	    !ext4_has_metadata_csum(inode->i_sb))
 		return;
 
 	csum = ext4_inode_csum(inode, raw, ei);
@@ -3928,8 +3926,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 		ei->i_extra_isize = 0;
 
 	/* Precompute checksum seed for inode metadata */
-	if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
-			EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
+	if (ext4_has_metadata_csum(sb)) {
 		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 		__u32 csum;
 		__le32 inum = cpu_to_le32(inode->i_ino);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 3d5de16..bfda18a 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -331,8 +331,7 @@ flags_out:
 		if (!inode_owner_or_capable(inode))
 			return -EPERM;
 
-		if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
-				EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
+		if (ext4_has_metadata_csum(inode->i_sb)) {
 			ext4_warning(sb, "Setting inode version is not "
 				     "supported with metadata_csum enabled.");
 			return -ENOTTY;
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 32bce84..8313ca3 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -20,8 +20,7 @@ static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp)
 
 static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp)
 {
-	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
-				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+	if (!ext4_has_metadata_csum(sb))
 		return 1;
 
 	return mmp->mmp_checksum == ext4_mmp_csum(sb, mmp);
@@ -29,8 +28,7 @@ static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp)
 
 static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp)
 {
-	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
-				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+	if (!ext4_has_metadata_csum(sb))
 		return;
 
 	mmp->mmp_checksum = ext4_mmp_csum(sb, mmp);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 7037ecf..61756f9 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -124,8 +124,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
 		       "directory leaf block found instead of index block");
 		return ERR_PTR(-EIO);
 	}
-	if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
-					EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) ||
+	if (!ext4_has_metadata_csum(inode->i_sb) ||
 	    buffer_verified(bh))
 		return bh;
 
@@ -338,8 +337,7 @@ int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent)
 {
 	struct ext4_dir_entry_tail *t;
 
-	if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
-					EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+	if (!ext4_has_metadata_csum(inode->i_sb))
 		return 1;
 
 	t = get_dirent_tail(inode, dirent);
@@ -360,8 +358,7 @@ static void ext4_dirent_csum_set(struct inode *inode,
 {
 	struct ext4_dir_entry_tail *t;
 
-	if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
-					EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+	if (!ext4_has_metadata_csum(inode->i_sb))
 		return;
 
 	t = get_dirent_tail(inode, dirent);
@@ -436,8 +433,7 @@ static int ext4_dx_csum_verify(struct inode *inode,
 	struct dx_tail *t;
 	int count_offset, limit, count;
 
-	if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
-					EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+	if (!ext4_has_metadata_csum(inode->i_sb))
 		return 1;
 
 	c = get_dx_countlimit(inode, dirent, &count_offset);
@@ -466,8 +462,7 @@ static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent)
 	struct dx_tail *t;
 	int count_offset, limit, count;
 
-	if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
-					EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+	if (!ext4_has_metadata_csum(inode->i_sb))
 		return;
 
 	c = get_dx_countlimit(inode, dirent, &count_offset);
@@ -555,8 +550,7 @@ static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
 	unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
 		EXT4_DIR_REC_LEN(2) - infosize;
 
-	if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
-				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+	if (ext4_has_metadata_csum(dir->i_sb))
 		entry_space -= sizeof(struct dx_tail);
 	return entry_space / sizeof(struct dx_entry);
 }
@@ -565,8 +559,7 @@ static inline unsigned dx_node_limit(struct inode *dir)
 {
 	unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
 
-	if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
-				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+	if (ext4_has_metadata_csum(dir->i_sb))
 		entry_space -= sizeof(struct dx_tail);
 	return entry_space / sizeof(struct dx_entry);
 }
@@ -1524,8 +1517,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
 	int	csum_size = 0;
 	int	err = 0, i;
 
-	if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
-				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+	if (ext4_has_metadata_csum(dir->i_sb))
 		csum_size = sizeof(struct ext4_dir_entry_tail);
 
 	bh2 = ext4_append(handle, dir, &newblock);
@@ -1691,8 +1683,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
 	int		csum_size = 0;
 	int		err;
 
-	if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
-				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+	if (ext4_has_metadata_csum(inode->i_sb))
 		csum_size = sizeof(struct ext4_dir_entry_tail);
 
 	if (!de) {
@@ -1759,8 +1750,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 	struct fake_dirent *fde;
 	int		csum_size = 0;
 
-	if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
-				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+	if (ext4_has_metadata_csum(inode->i_sb))
 		csum_size = sizeof(struct ext4_dir_entry_tail);
 
 	blocksize =  dir->i_sb->s_blocksize;
@@ -1877,8 +1867,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
 	ext4_lblk_t block, blocks;
 	int	csum_size = 0;
 
-	if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
-				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+	if (ext4_has_metadata_csum(inode->i_sb))
 		csum_size = sizeof(struct ext4_dir_entry_tail);
 
 	sb = dir->i_sb;
@@ -2142,8 +2131,7 @@ static int ext4_delete_entry(handle_t *handle,
 			return err;
 	}
 
-	if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
-				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+	if (ext4_has_metadata_csum(dir->i_sb))
 		csum_size = sizeof(struct ext4_dir_entry_tail);
 
 	BUFFER_TRACE(bh, "get_write_access");
@@ -2362,8 +2350,7 @@ static int ext4_init_new_dir(handle_t *handle, struct inode *dir,
 	int csum_size = 0;
 	int err;
 
-	if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
-				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+	if (ext4_has_metadata_csum(dir->i_sb))
 		csum_size = sizeof(struct ext4_dir_entry_tail);
 
 	if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index bb0e80f..d5afb0a 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1210,8 +1210,7 @@ static int ext4_set_bitmap_checksums(struct super_block *sb,
 {
 	struct buffer_head *bh;
 
-	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
-					EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+	if (!ext4_has_metadata_csum(sb))
 		return 0;
 
 	bh = ext4_get_bitmap(sb, group_data->inode_bitmap);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a0811cc..5afe42d 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -140,8 +140,7 @@ static __le32 ext4_superblock_csum(struct super_block *sb,
 static int ext4_superblock_csum_verify(struct super_block *sb,
 				       struct ext4_super_block *es)
 {
-	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
-				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+	if (!ext4_has_metadata_csum(sb))
 		return 1;
 
 	return es->s_checksum == ext4_superblock_csum(sb, es);
@@ -151,8 +150,7 @@ void ext4_superblock_csum_set(struct super_block *sb)
 {
 	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
 
-	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
-		EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+	if (!ext4_has_metadata_csum(sb))
 		return;
 
 	es->s_checksum = ext4_superblock_csum(sb, es);
@@ -1989,8 +1987,7 @@ static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
 	__u16 crc = 0;
 	__le32 le_group = cpu_to_le32(block_group);
 
-	if ((sbi->s_es->s_feature_ro_compat &
-	     cpu_to_le32(EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))) {
+	if (ext4_has_metadata_csum(sbi->s_sb)) {
 		/* Use new metadata_csum algorithm */
 		__le16 save_csum;
 		__u32 csum32;
@@ -3199,8 +3196,7 @@ static int set_journal_csum_feature_set(struct super_block *sb)
 	int compat, incompat;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
-	if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
-				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
+	if (ext4_has_metadata_csum(sb)) {
 		/* journal checksum v3 */
 		compat = 0;
 		incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3;
@@ -3508,8 +3504,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	}
 
 	/* Precompute checksum seed for all metadata */
-	if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
-			EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+	if (ext4_has_metadata_csum(sb))
 		sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
 					       sizeof(es->s_uuid));
 
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 42823ab..1e09fc7 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -142,8 +142,7 @@ static int ext4_xattr_block_csum_verify(struct inode *inode,
 					sector_t block_nr,
 					struct ext4_xattr_header *hdr)
 {
-	if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
-		EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
+	if (ext4_has_metadata_csum(inode->i_sb) &&
 	    (hdr->h_checksum != ext4_xattr_block_csum(inode, block_nr, hdr)))
 		return 0;
 	return 1;
@@ -153,8 +152,7 @@ static void ext4_xattr_block_csum_set(struct inode *inode,
 				      sector_t block_nr,
 				      struct ext4_xattr_header *hdr)
 {
-	if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
-		EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+	if (!ext4_has_metadata_csum(inode->i_sb))
 		return;
 
 	hdr->h_checksum = ext4_xattr_block_csum(inode, block_nr, hdr);
-- 
cgit v1.1


From aef4885ae14f1df75b58395c5314d71f613d26d9 Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Mon, 13 Oct 2014 03:42:12 -0400
Subject: ext4: move error report out of atomic context in
 ext4_init_block_bitmap()

Error report likely result in IO so it is bad idea to do it from
atomic context.

This patch should fix following issue:

BUG: sleeping function called from invalid context at include/linux/buffer_head.h:349
in_atomic(): 1, irqs_disabled(): 0, pid: 137, name: kworker/u128:1
5 locks held by kworker/u128:1/137:
 #0:  ("writeback"){......}, at: [<ffffffff81085618>] process_one_work+0x228/0x4d0
 #1:  ((&(&wb->dwork)->work)){......}, at: [<ffffffff81085618>] process_one_work+0x228/0x4d0
 #2:  (jbd2_handle){......}, at: [<ffffffff81242622>] start_this_handle+0x712/0x7b0
 #3:  (&ei->i_data_sem){......}, at: [<ffffffff811fa387>] ext4_map_blocks+0x297/0x430
 #4:  (&(&bgl->locks[i].lock)->rlock){......}, at: [<ffffffff811f3180>] ext4_read_block_bitmap_nowait+0x5d0/0x630
CPU: 3 PID: 137 Comm: kworker/u128:1 Not tainted 3.17.0-rc2-00184-g82752e4 #165
Hardware name: Intel Corporation W2600CR/W2600CR, BIOS SE5C600.86B.99.99.x028.061320111235 06/13/2011
Workqueue: writeback bdi_writeback_workfn (flush-1:0)
 0000000000000411 ffff880813777288 ffffffff815c7fdc ffff880813777288
 ffff880813a8bba0 ffff8808137772a8 ffffffff8108fb30 ffff880803e01e38
 ffff880803e01e38 ffff8808137772c8 ffffffff811a8d53 ffff88080ecc6000
Call Trace:
 [<ffffffff815c7fdc>] dump_stack+0x51/0x6d
 [<ffffffff8108fb30>] __might_sleep+0xf0/0x100
 [<ffffffff811a8d53>] __sync_dirty_buffer+0x43/0xe0
 [<ffffffff811a8e03>] sync_dirty_buffer+0x13/0x20
 [<ffffffff8120f581>] ext4_commit_super+0x1d1/0x230
 [<ffffffff8120fa03>] save_error_info+0x23/0x30
 [<ffffffff8120fd06>] __ext4_error+0xb6/0xd0
 [<ffffffff8120f260>] ? ext4_group_desc_csum+0x140/0x190
 [<ffffffff811f2d8c>] ext4_read_block_bitmap_nowait+0x1dc/0x630
 [<ffffffff8122e23a>] ext4_mb_init_cache+0x21a/0x8f0
 [<ffffffff8113ae95>] ? lru_cache_add+0x55/0x60
 [<ffffffff8112e16c>] ? add_to_page_cache_lru+0x6c/0x80
 [<ffffffff8122eaa0>] ext4_mb_init_group+0x190/0x280
 [<ffffffff8122ec51>] ext4_mb_good_group+0xc1/0x190
 [<ffffffff8123309a>] ext4_mb_regular_allocator+0x17a/0x410
 [<ffffffff8122c821>] ? ext4_mb_use_preallocated+0x31/0x380
 [<ffffffff81233535>] ? ext4_mb_new_blocks+0x205/0x8e0
 [<ffffffff8116ed5c>] ? kmem_cache_alloc+0xfc/0x180
 [<ffffffff812335b0>] ext4_mb_new_blocks+0x280/0x8e0
 [<ffffffff8116f2c4>] ? __kmalloc+0x144/0x1c0
 [<ffffffff81221797>] ? ext4_find_extent+0x97/0x320
 [<ffffffff812257f4>] ext4_ext_map_blocks+0xbc4/0x1050
 [<ffffffff811fa387>] ? ext4_map_blocks+0x297/0x430
 [<ffffffff811fa3ab>] ext4_map_blocks+0x2bb/0x430
 [<ffffffff81200e43>] ? ext4_init_io_end+0x23/0x50
 [<ffffffff811feb44>] ext4_writepages+0x564/0xaf0
 [<ffffffff815cde3b>] ? _raw_spin_unlock+0x2b/0x40
 [<ffffffff810ac7bd>] ? lock_release_non_nested+0x2fd/0x3c0
 [<ffffffff811a009e>] ? writeback_sb_inodes+0x10e/0x490
 [<ffffffff811a009e>] ? writeback_sb_inodes+0x10e/0x490
 [<ffffffff811377e3>] do_writepages+0x23/0x40
 [<ffffffff8119c8ce>] __writeback_single_inode+0x9e/0x280
 [<ffffffff811a026b>] writeback_sb_inodes+0x2db/0x490
 [<ffffffff811a0664>] wb_writeback+0x174/0x2d0
 [<ffffffff810ac359>] ? lock_release_holdtime+0x29/0x190
 [<ffffffff811a0863>] wb_do_writeback+0xa3/0x200
 [<ffffffff811a0a40>] bdi_writeback_workfn+0x80/0x230
 [<ffffffff81085618>] ? process_one_work+0x228/0x4d0
 [<ffffffff810856cd>] process_one_work+0x2dd/0x4d0
 [<ffffffff81085618>] ? process_one_work+0x228/0x4d0
 [<ffffffff81085c1d>] worker_thread+0x35d/0x460
 [<ffffffff810858c0>] ? process_one_work+0x4d0/0x4d0
 [<ffffffff810858c0>] ? process_one_work+0x4d0/0x4d0
 [<ffffffff8108a885>] kthread+0xf5/0x100
 [<ffffffff810990e5>] ? local_clock+0x25/0x30
 [<ffffffff8108a790>] ? __init_kthread_worker+0x70/0x70
 [<ffffffff815ce2ac>] ret_from_fork+0x7c/0xb0
 [<ffffffff8108a790>] ? __init_kthread_work

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@vger.kernel.org
---
 fs/ext4/balloc.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index d70f154..83a6f49 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -176,7 +176,7 @@ static unsigned int num_clusters_in_group(struct super_block *sb,
 }
 
 /* Initializes an uninitialized block bitmap */
-static void ext4_init_block_bitmap(struct super_block *sb,
+static int ext4_init_block_bitmap(struct super_block *sb,
 				   struct buffer_head *bh,
 				   ext4_group_t block_group,
 				   struct ext4_group_desc *gdp)
@@ -192,7 +192,6 @@ static void ext4_init_block_bitmap(struct super_block *sb,
 	/* If checksum is bad mark all blocks used to prevent allocation
 	 * essentially implementing a per-group read-only flag. */
 	if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
-		ext4_error(sb, "Checksum bad for group %u", block_group);
 		grp = ext4_get_group_info(sb, block_group);
 		if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
 			percpu_counter_sub(&sbi->s_freeclusters_counter,
@@ -205,7 +204,7 @@ static void ext4_init_block_bitmap(struct super_block *sb,
 					   count);
 		}
 		set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
-		return;
+		return -EIO;
 	}
 	memset(bh->b_data, 0, sb->s_blocksize);
 
@@ -243,6 +242,7 @@ static void ext4_init_block_bitmap(struct super_block *sb,
 			     sb->s_blocksize * 8, bh->b_data);
 	ext4_block_bitmap_csum_set(sb, block_group, gdp, bh);
 	ext4_group_desc_csum_set(sb, block_group, gdp);
+	return 0;
 }
 
 /* Return the number of free blocks in a block group.  It is used when
@@ -438,11 +438,15 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
 	}
 	ext4_lock_group(sb, block_group);
 	if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
-		ext4_init_block_bitmap(sb, bh, block_group, desc);
+		int err;
+
+		err = ext4_init_block_bitmap(sb, bh, block_group, desc);
 		set_bitmap_uptodate(bh);
 		set_buffer_uptodate(bh);
 		ext4_unlock_group(sb, block_group);
 		unlock_buffer(bh);
+		if (err)
+			ext4_error(sb, "Checksum bad for grp %u", block_group);
 		return bh;
 	}
 	ext4_unlock_group(sb, block_group);
-- 
cgit v1.1


From 813d32f91333e4c33d5a19b67167c4bae42dae75 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Tue, 14 Oct 2014 02:35:49 -0400
Subject: ext4: check s_chksum_driver when looking for bg csum presence

Convert the ext4_has_group_desc_csum predicate to look for a checksum
driver instead of the metadata_csum flag and change the bg checksum
calculation function to look for GDT_CSUM before taking the crc16
path.

Without this patch, if we mount with ^uninit_bg,^metadata_csum and
later metadata_csum gets turned on by accident, the block group
checksum functions will incorrectly assume that checksumming is
enabled (metadata_csum) but that crc16 should be used
(!s_chksum_driver).  This is totally wrong, so fix the predicate
and the checksum formula selection.

(Granted, if the metadata_csum feature bit gets enabled on a live FS
then something underhanded is going on, but we could at least avoid
writing garbage into the on-disk fields.)

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Dmitry Monakhov <dmonakhov@openvz.org>
Cc: stable@vger.kernel.org
---
 fs/ext4/ext4.h  | 4 ++--
 fs/ext4/super.c | 4 ++++
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 1483d9c..c55a1fa 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2333,8 +2333,8 @@ extern int ext4_register_li_request(struct super_block *sb,
 static inline int ext4_has_group_desc_csum(struct super_block *sb)
 {
 	return EXT4_HAS_RO_COMPAT_FEATURE(sb,
-					  EXT4_FEATURE_RO_COMPAT_GDT_CSUM |
-					  EXT4_FEATURE_RO_COMPAT_METADATA_CSUM);
+					  EXT4_FEATURE_RO_COMPAT_GDT_CSUM) ||
+	       (EXT4_SB(sb)->s_chksum_driver != NULL);
 }
 
 static inline int ext4_has_metadata_csum(struct super_block *sb)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 5afe42d..e96b6ec 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2005,6 +2005,10 @@ static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
 	}
 
 	/* old crc16 code */
+	if (!(sbi->s_es->s_feature_ro_compat &
+	      cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)))
+		return 0;
+
 	offset = offsetof(struct ext4_group_desc, bg_checksum);
 
 	crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid));
-- 
cgit v1.1


From aa281ac631008b9c18c405c8880007789f659c7d Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <ooo@electrozaur.com>
Date: Sun, 19 Oct 2014 19:38:58 +0300
Subject: Boaz Harrosh - Fix broken email address

I no longer have access to the Panasas email.
So change to an email that can always reach me.

Signed-off-by: Boaz Harrosh <ooo@electrozaur.com>
---
 fs/exofs/Kbuild                     | 2 +-
 fs/exofs/common.h                   | 2 +-
 fs/exofs/dir.c                      | 2 +-
 fs/exofs/exofs.h                    | 2 +-
 fs/exofs/file.c                     | 2 +-
 fs/exofs/inode.c                    | 2 +-
 fs/exofs/namei.c                    | 2 +-
 fs/exofs/ore.c                      | 4 ++--
 fs/exofs/ore_raid.c                 | 2 +-
 fs/exofs/ore_raid.h                 | 2 +-
 fs/exofs/super.c                    | 2 +-
 fs/exofs/symlink.c                  | 2 +-
 fs/exofs/sys.c                      | 2 +-
 fs/nfs/objlayout/objio_osd.c        | 2 +-
 fs/nfs/objlayout/objlayout.c        | 2 +-
 fs/nfs/objlayout/objlayout.h        | 2 +-
 fs/nfs/objlayout/pnfs_osd_xdr_cli.c | 2 +-
 17 files changed, 18 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
index 389ba83..b47c7b8 100644
--- a/fs/exofs/Kbuild
+++ b/fs/exofs/Kbuild
@@ -4,7 +4,7 @@
 # Copyright (C) 2008 Panasas Inc.  All rights reserved.
 #
 # Authors:
-#   Boaz Harrosh <bharrosh@panasas.com>
+#   Boaz Harrosh <ooo@electrozaur.com>
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2
diff --git a/fs/exofs/common.h b/fs/exofs/common.h
index 3bbd469..7d88ef5 100644
--- a/fs/exofs/common.h
+++ b/fs/exofs/common.h
@@ -4,7 +4,7 @@
  * Copyright (C) 2005, 2006
  * Avishay Traeger (avishay@gmail.com)
  * Copyright (C) 2008, 2009
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
  *
  * Copyrights for code taken from ext2:
  *     Copyright (C) 1992, 1993, 1994, 1995
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index 49f51ab..d7defd5 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -2,7 +2,7 @@
  * Copyright (C) 2005, 2006
  * Avishay Traeger (avishay@gmail.com)
  * Copyright (C) 2008, 2009
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
  *
  * Copyrights for code taken from ext2:
  *     Copyright (C) 1992, 1993, 1994, 1995
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index fffe86f..ad9cac6 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -2,7 +2,7 @@
  * Copyright (C) 2005, 2006
  * Avishay Traeger (avishay@gmail.com)
  * Copyright (C) 2008, 2009
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
  *
  * Copyrights for code taken from ext2:
  *     Copyright (C) 1992, 1993, 1994, 1995
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index 71bf8e4..1a376b4 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -2,7 +2,7 @@
  * Copyright (C) 2005, 2006
  * Avishay Traeger (avishay@gmail.com)
  * Copyright (C) 2008, 2009
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
  *
  * Copyrights for code taken from ext2:
  *     Copyright (C) 1992, 1993, 1994, 1995
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 3f9cafd..f1d3d4e 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -2,7 +2,7 @@
  * Copyright (C) 2005, 2006
  * Avishay Traeger (avishay@gmail.com)
  * Copyright (C) 2008, 2009
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
  *
  * Copyrights for code taken from ext2:
  *     Copyright (C) 1992, 1993, 1994, 1995
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
index 4731fd9..2890746 100644
--- a/fs/exofs/namei.c
+++ b/fs/exofs/namei.c
@@ -2,7 +2,7 @@
  * Copyright (C) 2005, 2006
  * Avishay Traeger (avishay@gmail.com)
  * Copyright (C) 2008, 2009
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
  *
  * Copyrights for code taken from ext2:
  *     Copyright (C) 1992, 1993, 1994, 1995
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index cfc0205..7bd8ac8 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -2,7 +2,7 @@
  * Copyright (C) 2005, 2006
  * Avishay Traeger (avishay@gmail.com)
  * Copyright (C) 2008, 2009
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
  *
  * This file is part of exofs.
  *
@@ -29,7 +29,7 @@
 
 #include "ore_raid.h"
 
-MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>");
+MODULE_AUTHOR("Boaz Harrosh <ooo@electrozaur.com>");
 MODULE_DESCRIPTION("Objects Raid Engine ore.ko");
 MODULE_LICENSE("GPL");
 
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
index 84529b8..27cbdb6 100644
--- a/fs/exofs/ore_raid.c
+++ b/fs/exofs/ore_raid.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) 2011
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
  *
  * This file is part of the objects raid engine (ore).
  *
diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h
index cf6375d..a6e7467 100644
--- a/fs/exofs/ore_raid.h
+++ b/fs/exofs/ore_raid.h
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) from 2011
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
  *
  * This file is part of the objects raid engine (ore).
  *
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index ed73ed8..9596550 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -2,7 +2,7 @@
  * Copyright (C) 2005, 2006
  * Avishay Traeger (avishay@gmail.com)
  * Copyright (C) 2008, 2009
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
  *
  * Copyrights for code taken from ext2:
  *     Copyright (C) 1992, 1993, 1994, 1995
diff --git a/fs/exofs/symlink.c b/fs/exofs/symlink.c
index 4dd687c..832e262 100644
--- a/fs/exofs/symlink.c
+++ b/fs/exofs/symlink.c
@@ -2,7 +2,7 @@
  * Copyright (C) 2005, 2006
  * Avishay Traeger (avishay@gmail.com)
  * Copyright (C) 2008, 2009
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
  *
  * Copyrights for code taken from ext2:
  *     Copyright (C) 1992, 1993, 1994, 1995
diff --git a/fs/exofs/sys.c b/fs/exofs/sys.c
index 1b4f2f9..5e6a2c0 100644
--- a/fs/exofs/sys.c
+++ b/fs/exofs/sys.c
@@ -1,7 +1,7 @@
 /*
  * Copyright (C) 2012
  * Sachin Bhamare <sbhamare@panasas.com>
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
  *
  * This file is part of exofs.
  *
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index ae05278..5cbd8cf 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -5,7 +5,7 @@
  *  All rights reserved.
  *
  *  Benny Halevy <bhalevy@panasas.com>
- *  Boaz Harrosh <bharrosh@panasas.com>
+ *  Boaz Harrosh <ooo@electrozaur.com>
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License version 2
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index 697a16d..ad98842 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -5,7 +5,7 @@
  *  All rights reserved.
  *
  *  Benny Halevy <bhalevy@panasas.com>
- *  Boaz Harrosh <bharrosh@panasas.com>
+ *  Boaz Harrosh <ooo@electrozaur.com>
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License version 2
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
index fd13f1d..3472e83 100644
--- a/fs/nfs/objlayout/objlayout.h
+++ b/fs/nfs/objlayout/objlayout.h
@@ -6,7 +6,7 @@
  *  All rights reserved.
  *
  *  Benny Halevy <bhalevy@panasas.com>
- *  Boaz Harrosh <bharrosh@panasas.com>
+ *  Boaz Harrosh <ooo@electrozaur.com>
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License version 2
diff --git a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
index b3918f7..f093c7e 100644
--- a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
+++ b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
@@ -5,7 +5,7 @@
  *  All rights reserved.
  *
  *  Benny Halevy <bhalevy@panasas.com>
- *  Boaz Harrosh <bharrosh@panasas.com>
+ *  Boaz Harrosh <ooo@electrozaur.com>
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License version 2
-- 
cgit v1.1


From d1d84c9626bb3a519863b3ffc40d347166f9fb83 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 21 Aug 2014 15:04:31 -0400
Subject: nfsd4: fix response size estimation for OP_SEQUENCE

We added this new estimator function but forgot to hook it up.  The
effect is that NFSv4.1 (and greater) won't do zero-copy reads.

The estimate was also wrong by 8 bytes.

Fixes: ccae70a9ee41 "nfsd4: estimate sequence response size"
Cc: stable@vger.kernel.org
Reported-by: Chuck Lever <chucklever@gmail.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4proc.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index cdeb3cf..f4bd578 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1589,7 +1589,8 @@ static inline u32 nfsd4_rename_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op
 static inline u32 nfsd4_sequence_rsize(struct svc_rqst *rqstp,
 				       struct nfsd4_op *op)
 {
-	return NFS4_MAX_SESSIONID_LEN + 20;
+	return (op_encode_hdr_size
+		+ XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) * sizeof(__be32);
 }
 
 static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
@@ -1893,6 +1894,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
 		.op_func = (nfsd4op_func)nfsd4_sequence,
 		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
 		.op_name = "OP_SEQUENCE",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_sequence_rsize,
 	},
 	[OP_DESTROY_CLIENTID] = {
 		.op_func = (nfsd4op_func)nfsd4_destroy_clientid,
-- 
cgit v1.1


From b744c2ac4bbc040794efb33207d6ebc14f88ea2e Mon Sep 17 00:00:00 2001
From: Robert Elliott <elliott@hp.com>
Date: Tue, 21 Oct 2014 13:55:09 -0600
Subject: fs: merge I/O error prints into one line

buffer.c uses two printk calls to print these messages:
[67353.422338] Buffer I/O error on device sdr, logical block 212868488
[67353.422338] lost page write due to I/O error on sdr

In a busy system, they may be interleaved with other prints,
losing the context for the second message.  Merge them into
one line with one printk call so the prints are atomic.

Also, differentiate between async page writes, sync page writes, and
async page reads.

Also, shorten "device" to "dev" to match the block layer prints:
[67353.467906] blk_update_request: critical target error, dev sdr, sector
1707107328

Also, use %llu rather than %Lu.

Resulting prints look like:
[ 1356.437006] blk_update_request: critical target error, dev sdr, sector 1719693992
[ 1361.383522] quiet_error: 659876 callbacks suppressed
[ 1361.385816] Buffer I/O error on dev sdr, logical block 256902912, lost async page write
[ 1361.385819] Buffer I/O error on dev sdr, logical block 256903644, lost async page write

Signed-off-by: Robert Elliott <elliott@hp.com>
Reviewed-by: Webb Scales <webbnh@hp.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/buffer.c | 27 ++++++++-------------------
 1 file changed, 8 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index 6c48f20e..9d1da1d 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -137,12 +137,12 @@ static int quiet_error(struct buffer_head *bh)
 }
 
 
-static void buffer_io_error(struct buffer_head *bh)
+static void buffer_io_error(struct buffer_head *bh, char *msg)
 {
 	char b[BDEVNAME_SIZE];
-	printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
+	printk(KERN_ERR "Buffer I/O error on dev %s, logical block %llu%s\n",
 			bdevname(bh->b_bdev, b),
-			(unsigned long long)bh->b_blocknr);
+			(unsigned long long)bh->b_blocknr, msg);
 }
 
 /*
@@ -177,17 +177,11 @@ EXPORT_SYMBOL(end_buffer_read_sync);
 
 void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 {
-	char b[BDEVNAME_SIZE];
-
 	if (uptodate) {
 		set_buffer_uptodate(bh);
 	} else {
-		if (!quiet_error(bh)) {
-			buffer_io_error(bh);
-			printk(KERN_WARNING "lost page write due to "
-					"I/O error on %s\n",
-				       bdevname(bh->b_bdev, b));
-		}
+		if (!quiet_error(bh))
+			buffer_io_error(bh, ", lost sync page write");
 		set_buffer_write_io_error(bh);
 		clear_buffer_uptodate(bh);
 	}
@@ -305,7 +299,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
 	} else {
 		clear_buffer_uptodate(bh);
 		if (!quiet_error(bh))
-			buffer_io_error(bh);
+			buffer_io_error(bh, ", async page read");
 		SetPageError(page);
 	}
 
@@ -353,7 +347,6 @@ still_busy:
  */
 void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 {
-	char b[BDEVNAME_SIZE];
 	unsigned long flags;
 	struct buffer_head *first;
 	struct buffer_head *tmp;
@@ -365,12 +358,8 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 	if (uptodate) {
 		set_buffer_uptodate(bh);
 	} else {
-		if (!quiet_error(bh)) {
-			buffer_io_error(bh);
-			printk(KERN_WARNING "lost page write due to "
-					"I/O error on %s\n",
-			       bdevname(bh->b_bdev, b));
-		}
+		if (!quiet_error(bh))
+			buffer_io_error(bh, ", lost async page write");
 		set_bit(AS_EIO, &page->mapping->flags);
 		set_buffer_write_io_error(bh);
 		clear_buffer_uptodate(bh);
-- 
cgit v1.1


From 432f16e64f50fd4999a476543d04dd52f7a2d753 Mon Sep 17 00:00:00 2001
From: Robert Elliott <elliott@hp.com>
Date: Tue, 21 Oct 2014 13:55:11 -0600
Subject: fs: clarify rate limit suppressed buffer I/O errors

When quiet_error applies rate limiting to buffer_io_error calls, what the
they apply to is unclear because the name is so generic, particularly
if the messages are interleaved with others:

[ 1936.063572] quiet_error: 664293 callbacks suppressed
[ 1936.065297] Buffer I/O error on dev sdr, logical block 257429952, lost async page write
[ 1936.067814] Buffer I/O error on dev sdr, logical block 257429953, lost async page write

Also, the function uses printk_ratelimit(), although printk.h includes a
comment advising "Please don't use... Instead use printk_ratelimited()."

Change buffer_io_error to check the BH_Quiet bit itself, drop the
printk_ratelimit call, and print using printk_ratelimited.

This makes the messages look like:

[  387.208839] buffer_io_error: 676394 callbacks suppressed
[  387.210693] Buffer I/O error on dev sdr, logical block 211291776, lost async page write
[  387.213432] Buffer I/O error on dev sdr, logical block 211291777, lost async page write

Signed-off-by: Robert Elliott <elliott@hp.com>
Reviewed-by: Webb Scales <webbnh@hp.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/buffer.c | 23 +++++++----------------
 1 file changed, 7 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index 9d1da1d..20805db 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -128,19 +128,13 @@ __clear_page_buffers(struct page *page)
 	page_cache_release(page);
 }
 
-
-static int quiet_error(struct buffer_head *bh)
-{
-	if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit())
-		return 0;
-	return 1;
-}
-
-
 static void buffer_io_error(struct buffer_head *bh, char *msg)
 {
 	char b[BDEVNAME_SIZE];
-	printk(KERN_ERR "Buffer I/O error on dev %s, logical block %llu%s\n",
+
+	if (!test_bit(BH_Quiet, &bh->b_state))
+		printk_ratelimited(KERN_ERR
+			"Buffer I/O error on dev %s, logical block %llu%s\n",
 			bdevname(bh->b_bdev, b),
 			(unsigned long long)bh->b_blocknr, msg);
 }
@@ -180,8 +174,7 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 	if (uptodate) {
 		set_buffer_uptodate(bh);
 	} else {
-		if (!quiet_error(bh))
-			buffer_io_error(bh, ", lost sync page write");
+		buffer_io_error(bh, ", lost sync page write");
 		set_buffer_write_io_error(bh);
 		clear_buffer_uptodate(bh);
 	}
@@ -298,8 +291,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
 		set_buffer_uptodate(bh);
 	} else {
 		clear_buffer_uptodate(bh);
-		if (!quiet_error(bh))
-			buffer_io_error(bh, ", async page read");
+		buffer_io_error(bh, ", async page read");
 		SetPageError(page);
 	}
 
@@ -358,8 +350,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 	if (uptodate) {
 		set_buffer_uptodate(bh);
 	} else {
-		if (!quiet_error(bh))
-			buffer_io_error(bh, ", lost async page write");
+		buffer_io_error(bh, ", lost async page write");
 		set_bit(AS_EIO, &page->mapping->flags);
 		set_buffer_write_io_error(bh);
 		clear_buffer_uptodate(bh);
-- 
cgit v1.1


From 7938db449bbc55bbeb164bec7af406212e7e98f1 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 16 Sep 2014 22:23:10 +0200
Subject: ext3: Don't check quota format when there are no quota files

The check whether quota format is set even though there are no
quota files with journalled quota is pointless and it actually
makes it impossible to turn off journalled quotas (as there's
no way to unset journalled quota format). Just remove the check.

CC: stable@vger.kernel.org
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext3/super.c | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 7015db0..eb742d0 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1354,13 +1354,6 @@ set_qf_format:
 					"not specified.");
 			return 0;
 		}
-	} else {
-		if (sbi->s_jquota_fmt) {
-			ext3_msg(sb, KERN_ERR, "error: journaled quota format "
-					"specified with no journaling "
-					"enabled.");
-			return 0;
-		}
 	}
 #endif
 	return 1;
-- 
cgit v1.1


From 474d2605d119479e5aa050f738632e63589d4bb5 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 22 Oct 2014 09:06:49 +0200
Subject: quota: Properly return errors from dquot_writeback_dquots()

Due to a switched left and right side of an assignment,
dquot_writeback_dquots() never returned error. This could result in
errors during quota writeback to not be reported to userspace properly.
Fix it.

CC: stable@vger.kernel.org
Coverity-id: 1226884
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/quota/dquot.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 8b663b2..6b45272 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -634,7 +634,7 @@ int dquot_writeback_dquots(struct super_block *sb, int type)
 			dqstats_inc(DQST_LOOKUPS);
 			err = sb->dq_op->write_dquot(dquot);
 			if (!ret && err)
-				err = ret;
+				ret = err;
 			dqput(dquot);
 			spin_lock(&dq_list_lock);
 		}
-- 
cgit v1.1


From 3c9cafe05ff002eb84d438a02f3c8d468720463b Mon Sep 17 00:00:00 2001
From: Sasha Levin <sasha.levin@oracle.com>
Date: Tue, 21 Oct 2014 16:43:55 -0400
Subject: fs, jbd: use a more generic hash function

While the hash function used by the revoke hashtable is good somewhere else,
it's not really good here.

The default hash shift (8) means that one third of the hashing function
gets lost (and is undefined anyways (8 - 12 = negative shift)):

	"(block << (hash_shift - 12))) & (table->hash_size - 1)"

Instead, just use the kernel's generic hash function that gets used everywhere
else.

Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/jbd/revoke.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index 8898bbd..dcead63 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -93,6 +93,7 @@
 #include <linux/bio.h>
 #endif
 #include <linux/log2.h>
+#include <linux/hash.h>
 
 static struct kmem_cache *revoke_record_cache;
 static struct kmem_cache *revoke_table_cache;
@@ -129,15 +130,11 @@ static void flush_descriptor(journal_t *, struct journal_head *, int, int);
 
 /* Utility functions to maintain the revoke table */
 
-/* Borrowed from buffer.c: this is a tried and tested block hash function */
 static inline int hash(journal_t *journal, unsigned int block)
 {
 	struct jbd_revoke_table_s *table = journal->j_revoke;
-	int hash_shift = table->hash_shift;
 
-	return ((block << (hash_shift - 6)) ^
-		(block >> 13) ^
-		(block << (hash_shift - 12))) & (table->hash_size - 1);
+	return hash_32(block, table->hash_shift);
 }
 
 static int insert_revoke_hash(journal_t *journal, unsigned int blocknr,
-- 
cgit v1.1


From 51904b08072a8bf2b9ed74d1bd7a5300a614471d Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Wed, 22 Oct 2014 14:46:29 -0400
Subject: nfsd4: fix crash on unknown operation number

Unknown operation numbers are caught in nfsd4_decode_compound() which
sets op->opnum to OP_ILLEGAL and op->status to nfserr_op_illegal.  The
error causes the main loop in nfsd4_proc_compound() to skip most
processing.  But nfsd4_proc_compound also peeks ahead at the next
operation in one case and doesn't take similar precautions there.

Cc: stable@vger.kernel.org
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4proc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index f4bd578..0beb023 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1272,7 +1272,8 @@ static bool need_wrongsec_check(struct svc_rqst *rqstp)
 	 */
 	if (argp->opcnt == resp->opcnt)
 		return false;
-
+	if (next->opnum == OP_ILLEGAL)
+		return false;
 	nextd = OPDESC(next);
 	/*
 	 * Rest of 2.6.3.1.1: certain operations will return WRONGSEC
-- 
cgit v1.1


From 4aa7c6346be395bdf776f82bbb2e3e2bc60bdd2b Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Fri, 24 Oct 2014 00:14:35 +0200
Subject: vfs: add i_op->dentry_open()

Add a new inode operation i_op->dentry_open().  This is for stacked filesystems
that want to return a struct file from a different filesystem.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/namei.c |  9 ++++++---
 fs/open.c  | 23 +++++++++++++++++++++--
 2 files changed, 27 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 43927d1..75306b3 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3064,9 +3064,12 @@ finish_open_created:
 	error = may_open(&nd->path, acc_mode, open_flag);
 	if (error)
 		goto out;
-	file->f_path.mnt = nd->path.mnt;
-	error = finish_open(file, nd->path.dentry, NULL, opened);
-	if (error) {
+
+	BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */
+	error = vfs_open(&nd->path, file, current_cred());
+	if (!error) {
+		*opened |= FILE_OPENED;
+	} else {
 		if (error == -EOPENSTALE)
 			goto stale_open;
 		goto out;
diff --git a/fs/open.c b/fs/open.c
index d6fd3ac..de92c13 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -823,8 +823,7 @@ struct file *dentry_open(const struct path *path, int flags,
 	f = get_empty_filp();
 	if (!IS_ERR(f)) {
 		f->f_flags = flags;
-		f->f_path = *path;
-		error = do_dentry_open(f, NULL, cred);
+		error = vfs_open(path, f, cred);
 		if (!error) {
 			/* from now on we need fput() to dispose of f */
 			error = open_check_o_direct(f);
@@ -841,6 +840,26 @@ struct file *dentry_open(const struct path *path, int flags,
 }
 EXPORT_SYMBOL(dentry_open);
 
+/**
+ * vfs_open - open the file at the given path
+ * @path: path to open
+ * @filp: newly allocated file with f_flag initialized
+ * @cred: credentials to use
+ */
+int vfs_open(const struct path *path, struct file *filp,
+	     const struct cred *cred)
+{
+	struct inode *inode = path->dentry->d_inode;
+
+	if (inode->i_op->dentry_open)
+		return inode->i_op->dentry_open(path->dentry, filp, cred);
+	else {
+		filp->f_path = *path;
+		return do_dentry_open(filp, NULL, cred);
+	}
+}
+EXPORT_SYMBOL(vfs_open);
+
 static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op)
 {
 	int lookup_flags = 0;
-- 
cgit v1.1


From 1c118596a7682912106c80007102ce0184c77780 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Fri, 24 Oct 2014 00:14:35 +0200
Subject: vfs: export do_splice_direct() to modules

Export do_splice_direct() to modules.  Needed by overlay filesystem.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/internal.h | 6 ------
 fs/splice.c   | 1 +
 2 files changed, 1 insertion(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/internal.h b/fs/internal.h
index 9477f8f..0f0626a 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -139,12 +139,6 @@ extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan,
 extern int rw_verify_area(int, struct file *, const loff_t *, size_t);
 
 /*
- * splice.c
- */
-extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
-		loff_t *opos, size_t len, unsigned int flags);
-
-/*
  * pipe.c
  */
 extern const struct file_operations pipefifo_fops;
diff --git a/fs/splice.c b/fs/splice.c
index f5cb9ba..75c6058 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1330,6 +1330,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
 
 	return ret;
 }
+EXPORT_SYMBOL(do_splice_direct);
 
 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
 			       struct pipe_inode_info *opipe,
-- 
cgit v1.1


From bd5d08569cc379f8366663a61558a9ce17c2e460 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Fri, 24 Oct 2014 00:14:35 +0200
Subject: vfs: export __inode_permission() to modules

We need to be able to check inode permissions (but not filesystem implied
permissions) for stackable filesystems.  Expose this interface for overlayfs.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/internal.h | 1 -
 fs/namei.c    | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/internal.h b/fs/internal.h
index 0f0626a..757ba2a 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -47,7 +47,6 @@ extern void __init chrdev_init(void);
 /*
  * namei.c
  */
-extern int __inode_permission(struct inode *, int);
 extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *);
 extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
 			   const char *, unsigned int, struct path *);
diff --git a/fs/namei.c b/fs/namei.c
index 75306b3..d944f6d 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -416,6 +416,7 @@ int __inode_permission(struct inode *inode, int mask)
 
 	return security_inode_permission(inode, mask);
 }
+EXPORT_SYMBOL(__inode_permission);
 
 /**
  * sb_permission - Check superblock-level permissions
-- 
cgit v1.1


From c771d683a62e5d36bc46036f5c07f4f5bb7dda61 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Fri, 24 Oct 2014 00:14:36 +0200
Subject: vfs: introduce clone_private_mount()

Overlayfs needs a private clone of the mount, so create a function for
this and export to modules.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/namespace.c | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index fbba8b1..5b66b2b 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1686,6 +1686,33 @@ void drop_collected_mounts(struct vfsmount *mnt)
 	namespace_unlock();
 }
 
+/**
+ * clone_private_mount - create a private clone of a path
+ *
+ * This creates a new vfsmount, which will be the clone of @path.  The new will
+ * not be attached anywhere in the namespace and will be private (i.e. changes
+ * to the originating mount won't be propagated into this).
+ *
+ * Release with mntput().
+ */
+struct vfsmount *clone_private_mount(struct path *path)
+{
+	struct mount *old_mnt = real_mount(path->mnt);
+	struct mount *new_mnt;
+
+	if (IS_MNT_UNBINDABLE(old_mnt))
+		return ERR_PTR(-EINVAL);
+
+	down_read(&namespace_sem);
+	new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
+	up_read(&namespace_sem);
+	if (IS_ERR(new_mnt))
+		return ERR_CAST(new_mnt);
+
+	return &new_mnt->mnt;
+}
+EXPORT_SYMBOL_GPL(clone_private_mount);
+
 int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
 		   struct vfsmount *root)
 {
-- 
cgit v1.1


From cbdf35bcb833bfd00f0925d7a9a33a21f41ea582 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Fri, 24 Oct 2014 00:14:36 +0200
Subject: vfs: export check_sticky()

It's already duplicated in btrfs and about to be used in overlayfs too.

Move the sticky bit check to an inline helper and call the out-of-line
helper only in the unlikly case of the sticky bit being set.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/btrfs/ioctl.c | 20 +-------------------
 fs/namei.c       |  9 ++-------
 2 files changed, 3 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8d2b76e..4399f0c 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -765,23 +765,6 @@ out:
 	return ret;
 }
 
-/*  copy of check_sticky in fs/namei.c()
-* It's inline, so penalty for filesystems that don't use sticky bit is
-* minimal.
-*/
-static inline int btrfs_check_sticky(struct inode *dir, struct inode *inode)
-{
-	kuid_t fsuid = current_fsuid();
-
-	if (!(dir->i_mode & S_ISVTX))
-		return 0;
-	if (uid_eq(inode->i_uid, fsuid))
-		return 0;
-	if (uid_eq(dir->i_uid, fsuid))
-		return 0;
-	return !capable(CAP_FOWNER);
-}
-
 /*  copy of may_delete in fs/namei.c()
  *	Check whether we can remove a link victim from directory dir, check
  *  whether the type of victim is right.
@@ -817,8 +800,7 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
 		return error;
 	if (IS_APPEND(dir))
 		return -EPERM;
-	if (btrfs_check_sticky(dir, victim->d_inode)||
-		IS_APPEND(victim->d_inode)||
+	if (check_sticky(dir, victim->d_inode) || IS_APPEND(victim->d_inode) ||
 	    IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode))
 		return -EPERM;
 	if (isdir) {
diff --git a/fs/namei.c b/fs/namei.c
index d944f6d..77fd536 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2384,22 +2384,17 @@ kern_path_mountpoint(int dfd, const char *name, struct path *path,
 }
 EXPORT_SYMBOL(kern_path_mountpoint);
 
-/*
- * It's inline, so penalty for filesystems that don't use sticky bit is
- * minimal.
- */
-static inline int check_sticky(struct inode *dir, struct inode *inode)
+int __check_sticky(struct inode *dir, struct inode *inode)
 {
 	kuid_t fsuid = current_fsuid();
 
-	if (!(dir->i_mode & S_ISVTX))
-		return 0;
 	if (uid_eq(inode->i_uid, fsuid))
 		return 0;
 	if (uid_eq(dir->i_uid, fsuid))
 		return 0;
 	return !capable_wrt_inode_uidgid(inode, CAP_FOWNER);
 }
+EXPORT_SYMBOL(__check_sticky);
 
 /*
  *	Check whether we can remove a link victim from directory dir, check
-- 
cgit v1.1


From 787fb6bc9682ec7c05fb5d9561b57100fbc1cc41 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Fri, 24 Oct 2014 00:14:36 +0200
Subject: vfs: add whiteout support

Whiteout isn't actually a new file type, but is represented as a char
device (Linus's idea) with 0/0 device number.

This has several advantages compared to introducing a new whiteout file
type:

 - no userspace API changes (e.g. trivial to make backups of upper layer
   filesystem, without losing whiteouts)

 - no fs image format changes (you can boot an old kernel/fsck without
   whiteout support and things won't break)

 - implementation is trivial

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/namei.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 77fd536..d20191c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -4346,6 +4346,20 @@ SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newna
 	return sys_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
 }
 
+int vfs_whiteout(struct inode *dir, struct dentry *dentry)
+{
+	int error = may_create(dir, dentry);
+	if (error)
+		return error;
+
+	if (!dir->i_op->mknod)
+		return -EPERM;
+
+	return dir->i_op->mknod(dir, dentry,
+				S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
+}
+EXPORT_SYMBOL(vfs_whiteout);
+
 int readlink_copy(char __user *buffer, int buflen, const char *link)
 {
 	int len = PTR_ERR(link);
-- 
cgit v1.1


From 0d7a855526dd672e114aff2ac22b60fc6f155b08 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Fri, 24 Oct 2014 00:14:37 +0200
Subject: vfs: add RENAME_WHITEOUT

This adds a new RENAME_WHITEOUT flag.  This flag makes rename() create a
whiteout of source.  The whiteout creation is atomic relative to the
rename.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/namei.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index d20191c..42df664 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -4209,12 +4209,16 @@ SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
 	bool should_retry = false;
 	int error;
 
-	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
+	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
 		return -EINVAL;
 
-	if ((flags & RENAME_NOREPLACE) && (flags & RENAME_EXCHANGE))
+	if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) &&
+	    (flags & RENAME_EXCHANGE))
 		return -EINVAL;
 
+	if ((flags & RENAME_WHITEOUT) && !capable(CAP_MKNOD))
+		return -EPERM;
+
 retry:
 	from = user_path_parent(olddfd, oldname, &oldnd, lookup_flags);
 	if (IS_ERR(from)) {
-- 
cgit v1.1


From cd808deced431b66b5fa4e5c193cb7ec0059eaff Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Fri, 24 Oct 2014 00:14:37 +0200
Subject: ext4: support RENAME_WHITEOUT

Add whiteout support to ext4_rename().  A whiteout inode (chrdev/0,0) is
created before the rename takes place.  The whiteout inode is added to the
old entry instead of deleting it.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/ext4/namei.c | 95 ++++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 78 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 603e4eb..aba86e8 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -3190,6 +3190,39 @@ static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent)
 	}
 }
 
+static struct inode *ext4_whiteout_for_rename(struct ext4_renament *ent,
+					      int credits, handle_t **h)
+{
+	struct inode *wh;
+	handle_t *handle;
+	int retries = 0;
+
+	/*
+	 * for inode block, sb block, group summaries,
+	 * and inode bitmap
+	 */
+	credits += (EXT4_MAXQUOTAS_TRANS_BLOCKS(ent->dir->i_sb) +
+		    EXT4_XATTR_TRANS_BLOCKS + 4);
+retry:
+	wh = ext4_new_inode_start_handle(ent->dir, S_IFCHR | WHITEOUT_MODE,
+					 &ent->dentry->d_name, 0, NULL,
+					 EXT4_HT_DIR, credits);
+
+	handle = ext4_journal_current_handle();
+	if (IS_ERR(wh)) {
+		if (handle)
+			ext4_journal_stop(handle);
+		if (PTR_ERR(wh) == -ENOSPC &&
+		    ext4_should_retry_alloc(ent->dir->i_sb, &retries))
+			goto retry;
+	} else {
+		*h = handle;
+		init_special_inode(wh, wh->i_mode, WHITEOUT_DEV);
+		wh->i_op = &ext4_special_inode_operations;
+	}
+	return wh;
+}
+
 /*
  * Anybody can rename anything with this: the permission checks are left to the
  * higher-level routines.
@@ -3199,7 +3232,8 @@ static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent)
  * This comes from rename(const char *oldpath, const char *newpath)
  */
 static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
-		       struct inode *new_dir, struct dentry *new_dentry)
+		       struct inode *new_dir, struct dentry *new_dentry,
+		       unsigned int flags)
 {
 	handle_t *handle = NULL;
 	struct ext4_renament old = {
@@ -3214,6 +3248,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 	};
 	int force_reread;
 	int retval;
+	struct inode *whiteout = NULL;
+	int credits;
+	u8 old_file_type;
 
 	dquot_initialize(old.dir);
 	dquot_initialize(new.dir);
@@ -3252,11 +3289,17 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (new.inode && !test_opt(new.dir->i_sb, NO_AUTO_DA_ALLOC))
 		ext4_alloc_da_blocks(old.inode);
 
-	handle = ext4_journal_start(old.dir, EXT4_HT_DIR,
-		(2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) +
-		 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2));
-	if (IS_ERR(handle))
-		return PTR_ERR(handle);
+	credits = (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) +
+		   EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2);
+	if (!(flags & RENAME_WHITEOUT)) {
+		handle = ext4_journal_start(old.dir, EXT4_HT_DIR, credits);
+		if (IS_ERR(handle))
+			return PTR_ERR(handle);
+	} else {
+		whiteout = ext4_whiteout_for_rename(&old, credits, &handle);
+		if (IS_ERR(whiteout))
+			return PTR_ERR(whiteout);
+	}
 
 	if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir))
 		ext4_handle_sync(handle);
@@ -3284,13 +3327,26 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 	 */
 	force_reread = (new.dir->i_ino == old.dir->i_ino &&
 			ext4_test_inode_flag(new.dir, EXT4_INODE_INLINE_DATA));
+
+	old_file_type = old.de->file_type;
+	if (whiteout) {
+		/*
+		 * Do this before adding a new entry, so the old entry is sure
+		 * to be still pointing to the valid old entry.
+		 */
+		retval = ext4_setent(handle, &old, whiteout->i_ino,
+				     EXT4_FT_CHRDEV);
+		if (retval)
+			goto end_rename;
+		ext4_mark_inode_dirty(handle, whiteout);
+	}
 	if (!new.bh) {
 		retval = ext4_add_entry(handle, new.dentry, old.inode);
 		if (retval)
 			goto end_rename;
 	} else {
 		retval = ext4_setent(handle, &new,
-				     old.inode->i_ino, old.de->file_type);
+				     old.inode->i_ino, old_file_type);
 		if (retval)
 			goto end_rename;
 	}
@@ -3305,10 +3361,12 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 	old.inode->i_ctime = ext4_current_time(old.inode);
 	ext4_mark_inode_dirty(handle, old.inode);
 
-	/*
-	 * ok, that's it
-	 */
-	ext4_rename_delete(handle, &old, force_reread);
+	if (!whiteout) {
+		/*
+		 * ok, that's it
+		 */
+		ext4_rename_delete(handle, &old, force_reread);
+	}
 
 	if (new.inode) {
 		ext4_dec_count(handle, new.inode);
@@ -3344,6 +3402,12 @@ end_rename:
 	brelse(old.dir_bh);
 	brelse(old.bh);
 	brelse(new.bh);
+	if (whiteout) {
+		if (retval)
+			drop_nlink(whiteout);
+		unlock_new_inode(whiteout);
+		iput(whiteout);
+	}
 	if (handle)
 		ext4_journal_stop(handle);
 	return retval;
@@ -3476,18 +3540,15 @@ static int ext4_rename2(struct inode *old_dir, struct dentry *old_dentry,
 			struct inode *new_dir, struct dentry *new_dentry,
 			unsigned int flags)
 {
-	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
+	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
 		return -EINVAL;
 
 	if (flags & RENAME_EXCHANGE) {
 		return ext4_cross_rename(old_dir, old_dentry,
 					 new_dir, new_dentry);
 	}
-	/*
-	 * Existence checking was done by the VFS, otherwise "RENAME_NOREPLACE"
-	 * is equivalent to regular rename.
-	 */
-	return ext4_rename(old_dir, old_dentry, new_dir, new_dentry);
+
+	return ext4_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
 }
 
 /*
-- 
cgit v1.1


From e9be9d5e76e34872f0c37d72e25bc27fe9e2c54c Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Fri, 24 Oct 2014 00:14:38 +0200
Subject: overlay filesystem

Overlayfs allows one, usually read-write, directory tree to be
overlaid onto another, read-only directory tree.  All modifications
go to the upper, writable layer.

This type of mechanism is most often used for live CDs but there's a
wide variety of other uses.

The implementation differs from other "union filesystem"
implementations in that after a file is opened all operations go
directly to the underlying, lower or upper, filesystems.  This
simplifies the implementation and allows native performance in these
cases.

The dentry tree is duplicated from the underlying filesystems, this
enables fast cached lookups without adding special support into the
VFS.  This uses slightly more memory than union mounts, but dentries
are relatively small.

Currently inodes are duplicated as well, but it is a possible
optimization to share inodes for non-directories.

Opening non directories results in the open forwarded to the
underlying filesystem.  This makes the behavior very similar to union
mounts (with the same limitations vs. fchmod/fchown on O_RDONLY file
descriptors).

Usage:

  mount -t overlayfs overlayfs -olowerdir=/lower,upperdir=/upper/upper,workdir=/upper/work /overlay

The following cotributions have been folded into this patch:

Neil Brown <neilb@suse.de>:
 - minimal remount support
 - use correct seek function for directories
 - initialise is_real before use
 - rename ovl_fill_cache to ovl_dir_read

Felix Fietkau <nbd@openwrt.org>:
 - fix a deadlock in ovl_dir_read_merged
 - fix a deadlock in ovl_remove_whiteouts

Erez Zadok <ezk@fsl.cs.sunysb.edu>
 - fix cleanup after WARN_ON

Sedat Dilek <sedat.dilek@googlemail.com>
 - fix up permission to confirm to new API

Robin Dong <hao.bigrat@gmail.com>
 - fix possible leak in ovl_new_inode
 - create new inode in ovl_link

Andy Whitcroft <apw@canonical.com>
 - switch to __inode_permission()
 - copy up i_uid/i_gid from the underlying inode

AV:
 - ovl_copy_up_locked() - dput(ERR_PTR(...)) on two failure exits
 - ovl_clear_empty() - one failure exit forgetting to do unlock_rename(),
   lack of check for udir being the parent of upper, dropping and regaining
   the lock on udir (which would require _another_ check for parent being
   right).
 - bogus d_drop() in copyup and rename [fix from your mail]
 - copyup/remove and copyup/rename races [fix from your mail]
 - ovl_dir_fsync() leaving ERR_PTR() in ->realfile
 - ovl_entry_free() is pointless - it's just a kfree_rcu()
 - fold ovl_do_lookup() into ovl_lookup()
 - manually assigning ->d_op is wrong.  Just use ->s_d_op.
 [patches picked from Miklos]:
 * copyup/remove and copyup/rename races
 * bogus d_drop() in copyup and rename

Also thanks to the following people for testing and reporting bugs:

  Jordi Pujol <jordipujolp@gmail.com>
  Andy Whitcroft <apw@canonical.com>
  Michal Suchanek <hramrach@centrum.cz>
  Felix Fietkau <nbd@openwrt.org>
  Erez Zadok <ezk@fsl.cs.sunysb.edu>
  Randy Dunlap <rdunlap@xenotime.net>

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/Kconfig               |   1 +
 fs/Makefile              |   1 +
 fs/overlayfs/Kconfig     |  10 +
 fs/overlayfs/Makefile    |   7 +
 fs/overlayfs/copy_up.c   | 414 +++++++++++++++++++++
 fs/overlayfs/dir.c       | 921 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/overlayfs/inode.c     | 425 ++++++++++++++++++++++
 fs/overlayfs/overlayfs.h | 191 ++++++++++
 fs/overlayfs/readdir.c   | 587 ++++++++++++++++++++++++++++++
 fs/overlayfs/super.c     | 727 +++++++++++++++++++++++++++++++++++++
 10 files changed, 3284 insertions(+)
 create mode 100644 fs/overlayfs/Kconfig
 create mode 100644 fs/overlayfs/Makefile
 create mode 100644 fs/overlayfs/copy_up.c
 create mode 100644 fs/overlayfs/dir.c
 create mode 100644 fs/overlayfs/inode.c
 create mode 100644 fs/overlayfs/overlayfs.h
 create mode 100644 fs/overlayfs/readdir.c
 create mode 100644 fs/overlayfs/super.c

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index db5dc15..664991a 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -67,6 +67,7 @@ source "fs/quota/Kconfig"
 
 source "fs/autofs4/Kconfig"
 source "fs/fuse/Kconfig"
+source "fs/overlayfs/Kconfig"
 
 menu "Caches"
 
diff --git a/fs/Makefile b/fs/Makefile
index 90c8852..34a1b9de 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -104,6 +104,7 @@ obj-$(CONFIG_QNX6FS_FS)		+= qnx6/
 obj-$(CONFIG_AUTOFS4_FS)	+= autofs4/
 obj-$(CONFIG_ADFS_FS)		+= adfs/
 obj-$(CONFIG_FUSE_FS)		+= fuse/
+obj-$(CONFIG_OVERLAYFS_FS)	+= overlayfs/
 obj-$(CONFIG_UDF_FS)		+= udf/
 obj-$(CONFIG_SUN_OPENPROMFS)	+= openpromfs/
 obj-$(CONFIG_OMFS_FS)		+= omfs/
diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig
new file mode 100644
index 0000000..e601259
--- /dev/null
+++ b/fs/overlayfs/Kconfig
@@ -0,0 +1,10 @@
+config OVERLAYFS_FS
+	tristate "Overlay filesystem support"
+	help
+	  An overlay filesystem combines two filesystems - an 'upper' filesystem
+	  and a 'lower' filesystem.  When a name exists in both filesystems, the
+	  object in the 'upper' filesystem is visible while the object in the
+	  'lower' filesystem is either hidden or, in the case of directories,
+	  merged with the 'upper' object.
+
+	  For more information see Documentation/filesystems/overlayfs.txt
diff --git a/fs/overlayfs/Makefile b/fs/overlayfs/Makefile
new file mode 100644
index 0000000..8f91889
--- /dev/null
+++ b/fs/overlayfs/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the overlay filesystem.
+#
+
+obj-$(CONFIG_OVERLAYFS_FS) += overlayfs.o
+
+overlayfs-objs := super.o inode.o dir.o readdir.o copy_up.o
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
new file mode 100644
index 0000000..ea10a87
--- /dev/null
+++ b/fs/overlayfs/copy_up.c
@@ -0,0 +1,414 @@
+/*
+ *
+ * Copyright (C) 2011 Novell Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/splice.h>
+#include <linux/xattr.h>
+#include <linux/security.h>
+#include <linux/uaccess.h>
+#include <linux/sched.h>
+#include <linux/namei.h>
+#include "overlayfs.h"
+
+#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
+
+int ovl_copy_xattr(struct dentry *old, struct dentry *new)
+{
+	ssize_t list_size, size;
+	char *buf, *name, *value;
+	int error;
+
+	if (!old->d_inode->i_op->getxattr ||
+	    !new->d_inode->i_op->getxattr)
+		return 0;
+
+	list_size = vfs_listxattr(old, NULL, 0);
+	if (list_size <= 0) {
+		if (list_size == -EOPNOTSUPP)
+			return 0;
+		return list_size;
+	}
+
+	buf = kzalloc(list_size, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	error = -ENOMEM;
+	value = kmalloc(XATTR_SIZE_MAX, GFP_KERNEL);
+	if (!value)
+		goto out;
+
+	list_size = vfs_listxattr(old, buf, list_size);
+	if (list_size <= 0) {
+		error = list_size;
+		goto out_free_value;
+	}
+
+	for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
+		size = vfs_getxattr(old, name, value, XATTR_SIZE_MAX);
+		if (size <= 0) {
+			error = size;
+			goto out_free_value;
+		}
+		error = vfs_setxattr(new, name, value, size, 0);
+		if (error)
+			goto out_free_value;
+	}
+
+out_free_value:
+	kfree(value);
+out:
+	kfree(buf);
+	return error;
+}
+
+static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
+{
+	struct file *old_file;
+	struct file *new_file;
+	loff_t old_pos = 0;
+	loff_t new_pos = 0;
+	int error = 0;
+
+	if (len == 0)
+		return 0;
+
+	old_file = ovl_path_open(old, O_RDONLY);
+	if (IS_ERR(old_file))
+		return PTR_ERR(old_file);
+
+	new_file = ovl_path_open(new, O_WRONLY);
+	if (IS_ERR(new_file)) {
+		error = PTR_ERR(new_file);
+		goto out_fput;
+	}
+
+	/* FIXME: copy up sparse files efficiently */
+	while (len) {
+		size_t this_len = OVL_COPY_UP_CHUNK_SIZE;
+		long bytes;
+
+		if (len < this_len)
+			this_len = len;
+
+		if (signal_pending_state(TASK_KILLABLE, current)) {
+			error = -EINTR;
+			break;
+		}
+
+		bytes = do_splice_direct(old_file, &old_pos,
+					 new_file, &new_pos,
+					 this_len, SPLICE_F_MOVE);
+		if (bytes <= 0) {
+			error = bytes;
+			break;
+		}
+		WARN_ON(old_pos != new_pos);
+
+		len -= bytes;
+	}
+
+	fput(new_file);
+out_fput:
+	fput(old_file);
+	return error;
+}
+
+static char *ovl_read_symlink(struct dentry *realdentry)
+{
+	int res;
+	char *buf;
+	struct inode *inode = realdentry->d_inode;
+	mm_segment_t old_fs;
+
+	res = -EINVAL;
+	if (!inode->i_op->readlink)
+		goto err;
+
+	res = -ENOMEM;
+	buf = (char *) __get_free_page(GFP_KERNEL);
+	if (!buf)
+		goto err;
+
+	old_fs = get_fs();
+	set_fs(get_ds());
+	/* The cast to a user pointer is valid due to the set_fs() */
+	res = inode->i_op->readlink(realdentry,
+				    (char __user *)buf, PAGE_SIZE - 1);
+	set_fs(old_fs);
+	if (res < 0) {
+		free_page((unsigned long) buf);
+		goto err;
+	}
+	buf[res] = '\0';
+
+	return buf;
+
+err:
+	return ERR_PTR(res);
+}
+
+static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
+{
+	struct iattr attr = {
+		.ia_valid =
+		     ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
+		.ia_atime = stat->atime,
+		.ia_mtime = stat->mtime,
+	};
+
+	return notify_change(upperdentry, &attr, NULL);
+}
+
+int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
+{
+	int err = 0;
+
+	if (!S_ISLNK(stat->mode)) {
+		struct iattr attr = {
+			.ia_valid = ATTR_MODE,
+			.ia_mode = stat->mode,
+		};
+		err = notify_change(upperdentry, &attr, NULL);
+	}
+	if (!err) {
+		struct iattr attr = {
+			.ia_valid = ATTR_UID | ATTR_GID,
+			.ia_uid = stat->uid,
+			.ia_gid = stat->gid,
+		};
+		err = notify_change(upperdentry, &attr, NULL);
+	}
+	if (!err)
+		ovl_set_timestamps(upperdentry, stat);
+
+	return err;
+
+}
+
+static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
+			      struct dentry *dentry, struct path *lowerpath,
+			      struct kstat *stat, struct iattr *attr,
+			      const char *link)
+{
+	struct inode *wdir = workdir->d_inode;
+	struct inode *udir = upperdir->d_inode;
+	struct dentry *newdentry = NULL;
+	struct dentry *upper = NULL;
+	umode_t mode = stat->mode;
+	int err;
+
+	newdentry = ovl_lookup_temp(workdir, dentry);
+	err = PTR_ERR(newdentry);
+	if (IS_ERR(newdentry))
+		goto out;
+
+	upper = lookup_one_len(dentry->d_name.name, upperdir,
+			       dentry->d_name.len);
+	err = PTR_ERR(upper);
+	if (IS_ERR(upper))
+		goto out1;
+
+	/* Can't properly set mode on creation because of the umask */
+	stat->mode &= S_IFMT;
+	err = ovl_create_real(wdir, newdentry, stat, link, NULL, true);
+	stat->mode = mode;
+	if (err)
+		goto out2;
+
+	if (S_ISREG(stat->mode)) {
+		struct path upperpath;
+		ovl_path_upper(dentry, &upperpath);
+		BUG_ON(upperpath.dentry != NULL);
+		upperpath.dentry = newdentry;
+
+		err = ovl_copy_up_data(lowerpath, &upperpath, stat->size);
+		if (err)
+			goto out_cleanup;
+	}
+
+	err = ovl_copy_xattr(lowerpath->dentry, newdentry);
+	if (err)
+		goto out_cleanup;
+
+	mutex_lock(&newdentry->d_inode->i_mutex);
+	err = ovl_set_attr(newdentry, stat);
+	if (!err && attr)
+		err = notify_change(newdentry, attr, NULL);
+	mutex_unlock(&newdentry->d_inode->i_mutex);
+	if (err)
+		goto out_cleanup;
+
+	err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
+	if (err)
+		goto out_cleanup;
+
+	ovl_dentry_update(dentry, newdentry);
+	newdentry = NULL;
+
+	/*
+	 * Non-directores become opaque when copied up.
+	 */
+	if (!S_ISDIR(stat->mode))
+		ovl_dentry_set_opaque(dentry, true);
+out2:
+	dput(upper);
+out1:
+	dput(newdentry);
+out:
+	return err;
+
+out_cleanup:
+	ovl_cleanup(wdir, newdentry);
+	goto out;
+}
+
+/*
+ * Copy up a single dentry
+ *
+ * Directory renames only allowed on "pure upper" (already created on
+ * upper filesystem, never copied up).  Directories which are on lower or
+ * are merged may not be renamed.  For these -EXDEV is returned and
+ * userspace has to deal with it.  This means, when copying up a
+ * directory we can rely on it and ancestors being stable.
+ *
+ * Non-directory renames start with copy up of source if necessary.  The
+ * actual rename will only proceed once the copy up was successful.  Copy
+ * up uses upper parent i_mutex for exclusion.  Since rename can change
+ * d_parent it is possible that the copy up will lock the old parent.  At
+ * that point the file will have already been copied up anyway.
+ */
+int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
+		    struct path *lowerpath, struct kstat *stat,
+		    struct iattr *attr)
+{
+	struct dentry *workdir = ovl_workdir(dentry);
+	int err;
+	struct kstat pstat;
+	struct path parentpath;
+	struct dentry *upperdir;
+	struct dentry *upperdentry;
+	const struct cred *old_cred;
+	struct cred *override_cred;
+	char *link = NULL;
+
+	ovl_path_upper(parent, &parentpath);
+	upperdir = parentpath.dentry;
+
+	err = vfs_getattr(&parentpath, &pstat);
+	if (err)
+		return err;
+
+	if (S_ISLNK(stat->mode)) {
+		link = ovl_read_symlink(lowerpath->dentry);
+		if (IS_ERR(link))
+			return PTR_ERR(link);
+	}
+
+	err = -ENOMEM;
+	override_cred = prepare_creds();
+	if (!override_cred)
+		goto out_free_link;
+
+	override_cred->fsuid = stat->uid;
+	override_cred->fsgid = stat->gid;
+	/*
+	 * CAP_SYS_ADMIN for copying up extended attributes
+	 * CAP_DAC_OVERRIDE for create
+	 * CAP_FOWNER for chmod, timestamp update
+	 * CAP_FSETID for chmod
+	 * CAP_CHOWN for chown
+	 * CAP_MKNOD for mknod
+	 */
+	cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
+	cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
+	cap_raise(override_cred->cap_effective, CAP_FOWNER);
+	cap_raise(override_cred->cap_effective, CAP_FSETID);
+	cap_raise(override_cred->cap_effective, CAP_CHOWN);
+	cap_raise(override_cred->cap_effective, CAP_MKNOD);
+	old_cred = override_creds(override_cred);
+
+	err = -EIO;
+	if (lock_rename(workdir, upperdir) != NULL) {
+		pr_err("overlayfs: failed to lock workdir+upperdir\n");
+		goto out_unlock;
+	}
+	upperdentry = ovl_dentry_upper(dentry);
+	if (upperdentry) {
+		unlock_rename(workdir, upperdir);
+		err = 0;
+		/* Raced with another copy-up?  Do the setattr here */
+		if (attr) {
+			mutex_lock(&upperdentry->d_inode->i_mutex);
+			err = notify_change(upperdentry, attr, NULL);
+			mutex_unlock(&upperdentry->d_inode->i_mutex);
+		}
+		goto out_put_cred;
+	}
+
+	err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath,
+				 stat, attr, link);
+	if (!err) {
+		/* Restore timestamps on parent (best effort) */
+		ovl_set_timestamps(upperdir, &pstat);
+	}
+out_unlock:
+	unlock_rename(workdir, upperdir);
+out_put_cred:
+	revert_creds(old_cred);
+	put_cred(override_cred);
+
+out_free_link:
+	if (link)
+		free_page((unsigned long) link);
+
+	return err;
+}
+
+int ovl_copy_up(struct dentry *dentry)
+{
+	int err;
+
+	err = 0;
+	while (!err) {
+		struct dentry *next;
+		struct dentry *parent;
+		struct path lowerpath;
+		struct kstat stat;
+		enum ovl_path_type type = ovl_path_type(dentry);
+
+		if (type != OVL_PATH_LOWER)
+			break;
+
+		next = dget(dentry);
+		/* find the topmost dentry not yet copied up */
+		for (;;) {
+			parent = dget_parent(next);
+
+			type = ovl_path_type(parent);
+			if (type != OVL_PATH_LOWER)
+				break;
+
+			dput(next);
+			next = parent;
+		}
+
+		ovl_path_lower(next, &lowerpath);
+		err = vfs_getattr(&lowerpath, &stat);
+		if (!err)
+			err = ovl_copy_up_one(parent, next, &lowerpath, &stat, NULL);
+
+		dput(parent);
+		dput(next);
+	}
+
+	return err;
+}
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
new file mode 100644
index 0000000..15cd91a
--- /dev/null
+++ b/fs/overlayfs/dir.c
@@ -0,0 +1,921 @@
+/*
+ *
+ * Copyright (C) 2011 Novell Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/xattr.h>
+#include <linux/security.h>
+#include <linux/cred.h>
+#include "overlayfs.h"
+
+void ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
+{
+	int err;
+
+	dget(wdentry);
+	if (S_ISDIR(wdentry->d_inode->i_mode))
+		err = ovl_do_rmdir(wdir, wdentry);
+	else
+		err = ovl_do_unlink(wdir, wdentry);
+	dput(wdentry);
+
+	if (err) {
+		pr_err("overlayfs: cleanup of '%pd2' failed (%i)\n",
+		       wdentry, err);
+	}
+}
+
+struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry)
+{
+	struct dentry *temp;
+	char name[20];
+
+	snprintf(name, sizeof(name), "#%lx", (unsigned long) dentry);
+
+	temp = lookup_one_len(name, workdir, strlen(name));
+	if (!IS_ERR(temp) && temp->d_inode) {
+		pr_err("overlayfs: workdir/%s already exists\n", name);
+		dput(temp);
+		temp = ERR_PTR(-EIO);
+	}
+
+	return temp;
+}
+
+/* caller holds i_mutex on workdir */
+static struct dentry *ovl_whiteout(struct dentry *workdir,
+				   struct dentry *dentry)
+{
+	int err;
+	struct dentry *whiteout;
+	struct inode *wdir = workdir->d_inode;
+
+	whiteout = ovl_lookup_temp(workdir, dentry);
+	if (IS_ERR(whiteout))
+		return whiteout;
+
+	err = ovl_do_whiteout(wdir, whiteout);
+	if (err) {
+		dput(whiteout);
+		whiteout = ERR_PTR(err);
+	}
+
+	return whiteout;
+}
+
+int ovl_create_real(struct inode *dir, struct dentry *newdentry,
+		    struct kstat *stat, const char *link,
+		    struct dentry *hardlink, bool debug)
+{
+	int err;
+
+	if (newdentry->d_inode)
+		return -ESTALE;
+
+	if (hardlink) {
+		err = ovl_do_link(hardlink, dir, newdentry, debug);
+	} else {
+		switch (stat->mode & S_IFMT) {
+		case S_IFREG:
+			err = ovl_do_create(dir, newdentry, stat->mode, debug);
+			break;
+
+		case S_IFDIR:
+			err = ovl_do_mkdir(dir, newdentry, stat->mode, debug);
+			break;
+
+		case S_IFCHR:
+		case S_IFBLK:
+		case S_IFIFO:
+		case S_IFSOCK:
+			err = ovl_do_mknod(dir, newdentry,
+					   stat->mode, stat->rdev, debug);
+			break;
+
+		case S_IFLNK:
+			err = ovl_do_symlink(dir, newdentry, link, debug);
+			break;
+
+		default:
+			err = -EPERM;
+		}
+	}
+	if (!err && WARN_ON(!newdentry->d_inode)) {
+		/*
+		 * Not quite sure if non-instantiated dentry is legal or not.
+		 * VFS doesn't seem to care so check and warn here.
+		 */
+		err = -ENOENT;
+	}
+	return err;
+}
+
+static int ovl_set_opaque(struct dentry *upperdentry)
+{
+	return ovl_do_setxattr(upperdentry, ovl_opaque_xattr, "y", 1, 0);
+}
+
+static void ovl_remove_opaque(struct dentry *upperdentry)
+{
+	int err;
+
+	err = ovl_do_removexattr(upperdentry, ovl_opaque_xattr);
+	if (err) {
+		pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n",
+			upperdentry->d_name.name, err);
+	}
+}
+
+static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
+			 struct kstat *stat)
+{
+	int err;
+	enum ovl_path_type type;
+	struct path realpath;
+
+	type = ovl_path_real(dentry, &realpath);
+	err = vfs_getattr(&realpath, stat);
+	if (err)
+		return err;
+
+	stat->dev = dentry->d_sb->s_dev;
+	stat->ino = dentry->d_inode->i_ino;
+
+	/*
+	 * It's probably not worth it to count subdirs to get the
+	 * correct link count.  nlink=1 seems to pacify 'find' and
+	 * other utilities.
+	 */
+	if (type == OVL_PATH_MERGE)
+		stat->nlink = 1;
+
+	return 0;
+}
+
+static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
+			    struct kstat *stat, const char *link,
+			    struct dentry *hardlink)
+{
+	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
+	struct inode *udir = upperdir->d_inode;
+	struct dentry *newdentry;
+	int err;
+
+	mutex_lock_nested(&udir->i_mutex, I_MUTEX_PARENT);
+	newdentry = lookup_one_len(dentry->d_name.name, upperdir,
+				   dentry->d_name.len);
+	err = PTR_ERR(newdentry);
+	if (IS_ERR(newdentry))
+		goto out_unlock;
+	err = ovl_create_real(udir, newdentry, stat, link, hardlink, false);
+	if (err)
+		goto out_dput;
+
+	ovl_dentry_version_inc(dentry->d_parent);
+	ovl_dentry_update(dentry, newdentry);
+	ovl_copyattr(newdentry->d_inode, inode);
+	d_instantiate(dentry, inode);
+	newdentry = NULL;
+out_dput:
+	dput(newdentry);
+out_unlock:
+	mutex_unlock(&udir->i_mutex);
+	return err;
+}
+
+static int ovl_lock_rename_workdir(struct dentry *workdir,
+				   struct dentry *upperdir)
+{
+	/* Workdir should not be the same as upperdir */
+	if (workdir == upperdir)
+		goto err;
+
+	/* Workdir should not be subdir of upperdir and vice versa */
+	if (lock_rename(workdir, upperdir) != NULL)
+		goto err_unlock;
+
+	return 0;
+
+err_unlock:
+	unlock_rename(workdir, upperdir);
+err:
+	pr_err("overlayfs: failed to lock workdir+upperdir\n");
+	return -EIO;
+}
+
+static struct dentry *ovl_clear_empty(struct dentry *dentry,
+				      struct list_head *list)
+{
+	struct dentry *workdir = ovl_workdir(dentry);
+	struct inode *wdir = workdir->d_inode;
+	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
+	struct inode *udir = upperdir->d_inode;
+	struct path upperpath;
+	struct dentry *upper;
+	struct dentry *opaquedir;
+	struct kstat stat;
+	int err;
+
+	err = ovl_lock_rename_workdir(workdir, upperdir);
+	if (err)
+		goto out;
+
+	ovl_path_upper(dentry, &upperpath);
+	err = vfs_getattr(&upperpath, &stat);
+	if (err)
+		goto out_unlock;
+
+	err = -ESTALE;
+	if (!S_ISDIR(stat.mode))
+		goto out_unlock;
+	upper = upperpath.dentry;
+	if (upper->d_parent->d_inode != udir)
+		goto out_unlock;
+
+	opaquedir = ovl_lookup_temp(workdir, dentry);
+	err = PTR_ERR(opaquedir);
+	if (IS_ERR(opaquedir))
+		goto out_unlock;
+
+	err = ovl_create_real(wdir, opaquedir, &stat, NULL, NULL, true);
+	if (err)
+		goto out_dput;
+
+	err = ovl_copy_xattr(upper, opaquedir);
+	if (err)
+		goto out_cleanup;
+
+	err = ovl_set_opaque(opaquedir);
+	if (err)
+		goto out_cleanup;
+
+	mutex_lock(&opaquedir->d_inode->i_mutex);
+	err = ovl_set_attr(opaquedir, &stat);
+	mutex_unlock(&opaquedir->d_inode->i_mutex);
+	if (err)
+		goto out_cleanup;
+
+	err = ovl_do_rename(wdir, opaquedir, udir, upper, RENAME_EXCHANGE);
+	if (err)
+		goto out_cleanup;
+
+	ovl_cleanup_whiteouts(upper, list);
+	ovl_cleanup(wdir, upper);
+	unlock_rename(workdir, upperdir);
+
+	/* dentry's upper doesn't match now, get rid of it */
+	d_drop(dentry);
+
+	return opaquedir;
+
+out_cleanup:
+	ovl_cleanup(wdir, opaquedir);
+out_dput:
+	dput(opaquedir);
+out_unlock:
+	unlock_rename(workdir, upperdir);
+out:
+	return ERR_PTR(err);
+}
+
+static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry,
+						enum ovl_path_type type)
+{
+	int err;
+	struct dentry *ret = NULL;
+	LIST_HEAD(list);
+
+	err = ovl_check_empty_dir(dentry, &list);
+	if (err)
+		ret = ERR_PTR(err);
+	else if (type == OVL_PATH_MERGE)
+		ret = ovl_clear_empty(dentry, &list);
+
+	ovl_cache_free(&list);
+
+	return ret;
+}
+
+static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
+				    struct kstat *stat, const char *link,
+				    struct dentry *hardlink)
+{
+	struct dentry *workdir = ovl_workdir(dentry);
+	struct inode *wdir = workdir->d_inode;
+	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
+	struct inode *udir = upperdir->d_inode;
+	struct dentry *upper;
+	struct dentry *newdentry;
+	int err;
+
+	err = ovl_lock_rename_workdir(workdir, upperdir);
+	if (err)
+		goto out;
+
+	newdentry = ovl_lookup_temp(workdir, dentry);
+	err = PTR_ERR(newdentry);
+	if (IS_ERR(newdentry))
+		goto out_unlock;
+
+	upper = lookup_one_len(dentry->d_name.name, upperdir,
+			       dentry->d_name.len);
+	err = PTR_ERR(upper);
+	if (IS_ERR(upper))
+		goto out_dput;
+
+	err = ovl_create_real(wdir, newdentry, stat, link, hardlink, true);
+	if (err)
+		goto out_dput2;
+
+	if (S_ISDIR(stat->mode)) {
+		err = ovl_set_opaque(newdentry);
+		if (err)
+			goto out_cleanup;
+
+		err = ovl_do_rename(wdir, newdentry, udir, upper,
+				    RENAME_EXCHANGE);
+		if (err)
+			goto out_cleanup;
+
+		ovl_cleanup(wdir, upper);
+	} else {
+		err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
+		if (err)
+			goto out_cleanup;
+	}
+	ovl_dentry_version_inc(dentry->d_parent);
+	ovl_dentry_update(dentry, newdentry);
+	ovl_copyattr(newdentry->d_inode, inode);
+	d_instantiate(dentry, inode);
+	newdentry = NULL;
+out_dput2:
+	dput(upper);
+out_dput:
+	dput(newdentry);
+out_unlock:
+	unlock_rename(workdir, upperdir);
+out:
+	return err;
+
+out_cleanup:
+	ovl_cleanup(wdir, newdentry);
+	goto out_dput2;
+}
+
+static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev,
+			      const char *link, struct dentry *hardlink)
+{
+	int err;
+	struct inode *inode;
+	struct kstat stat = {
+		.mode = mode,
+		.rdev = rdev,
+	};
+
+	err = -ENOMEM;
+	inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata);
+	if (!inode)
+		goto out;
+
+	err = ovl_copy_up(dentry->d_parent);
+	if (err)
+		goto out_iput;
+
+	if (!ovl_dentry_is_opaque(dentry)) {
+		err = ovl_create_upper(dentry, inode, &stat, link, hardlink);
+	} else {
+		const struct cred *old_cred;
+		struct cred *override_cred;
+
+		err = -ENOMEM;
+		override_cred = prepare_creds();
+		if (!override_cred)
+			goto out_iput;
+
+		/*
+		 * CAP_SYS_ADMIN for setting opaque xattr
+		 * CAP_DAC_OVERRIDE for create in workdir, rename
+		 * CAP_FOWNER for removing whiteout from sticky dir
+		 */
+		cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
+		cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
+		cap_raise(override_cred->cap_effective, CAP_FOWNER);
+		old_cred = override_creds(override_cred);
+
+		err = ovl_create_over_whiteout(dentry, inode, &stat, link,
+					       hardlink);
+
+		revert_creds(old_cred);
+		put_cred(override_cred);
+	}
+
+	if (!err)
+		inode = NULL;
+out_iput:
+	iput(inode);
+out:
+	return err;
+}
+
+static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
+			     const char *link)
+{
+	int err;
+
+	err = ovl_want_write(dentry);
+	if (!err) {
+		err = ovl_create_or_link(dentry, mode, rdev, link, NULL);
+		ovl_drop_write(dentry);
+	}
+
+	return err;
+}
+
+static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+		      bool excl)
+{
+	return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL);
+}
+
+static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL);
+}
+
+static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
+		     dev_t rdev)
+{
+	/* Don't allow creation of "whiteout" on overlay */
+	if (S_ISCHR(mode) && rdev == WHITEOUT_DEV)
+		return -EPERM;
+
+	return ovl_create_object(dentry, mode, rdev, NULL);
+}
+
+static int ovl_symlink(struct inode *dir, struct dentry *dentry,
+		       const char *link)
+{
+	return ovl_create_object(dentry, S_IFLNK, 0, link);
+}
+
+static int ovl_link(struct dentry *old, struct inode *newdir,
+		    struct dentry *new)
+{
+	int err;
+	struct dentry *upper;
+
+	err = ovl_want_write(old);
+	if (err)
+		goto out;
+
+	err = ovl_copy_up(old);
+	if (err)
+		goto out_drop_write;
+
+	upper = ovl_dentry_upper(old);
+	err = ovl_create_or_link(new, upper->d_inode->i_mode, 0, NULL, upper);
+
+out_drop_write:
+	ovl_drop_write(old);
+out:
+	return err;
+}
+
+static int ovl_remove_and_whiteout(struct dentry *dentry,
+				   enum ovl_path_type type, bool is_dir)
+{
+	struct dentry *workdir = ovl_workdir(dentry);
+	struct inode *wdir = workdir->d_inode;
+	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
+	struct inode *udir = upperdir->d_inode;
+	struct dentry *whiteout;
+	struct dentry *upper;
+	struct dentry *opaquedir = NULL;
+	int err;
+
+	if (is_dir) {
+		opaquedir = ovl_check_empty_and_clear(dentry, type);
+		err = PTR_ERR(opaquedir);
+		if (IS_ERR(opaquedir))
+			goto out;
+	}
+
+	err = ovl_lock_rename_workdir(workdir, upperdir);
+	if (err)
+		goto out_dput;
+
+	whiteout = ovl_whiteout(workdir, dentry);
+	err = PTR_ERR(whiteout);
+	if (IS_ERR(whiteout))
+		goto out_unlock;
+
+	if (type == OVL_PATH_LOWER) {
+		upper = lookup_one_len(dentry->d_name.name, upperdir,
+					   dentry->d_name.len);
+		err = PTR_ERR(upper);
+		if (IS_ERR(upper))
+			goto kill_whiteout;
+
+		err = ovl_do_rename(wdir, whiteout, udir, upper, 0);
+		dput(upper);
+		if (err)
+			goto kill_whiteout;
+	} else {
+		int flags = 0;
+
+		upper = ovl_dentry_upper(dentry);
+		if (opaquedir)
+			upper = opaquedir;
+		err = -ESTALE;
+		if (upper->d_parent != upperdir)
+			goto kill_whiteout;
+
+		if (is_dir)
+			flags |= RENAME_EXCHANGE;
+
+		err = ovl_do_rename(wdir, whiteout, udir, upper, flags);
+		if (err)
+			goto kill_whiteout;
+
+		if (is_dir)
+			ovl_cleanup(wdir, upper);
+	}
+	ovl_dentry_version_inc(dentry->d_parent);
+out_d_drop:
+	d_drop(dentry);
+	dput(whiteout);
+out_unlock:
+	unlock_rename(workdir, upperdir);
+out_dput:
+	dput(opaquedir);
+out:
+	return err;
+
+kill_whiteout:
+	ovl_cleanup(wdir, whiteout);
+	goto out_d_drop;
+}
+
+static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
+{
+	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
+	struct inode *dir = upperdir->d_inode;
+	struct dentry *upper = ovl_dentry_upper(dentry);
+	int err;
+
+	mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+	err = -ESTALE;
+	if (upper->d_parent == upperdir) {
+		/* Don't let d_delete() think it can reset d_inode */
+		dget(upper);
+		if (is_dir)
+			err = vfs_rmdir(dir, upper);
+		else
+			err = vfs_unlink(dir, upper, NULL);
+		dput(upper);
+		ovl_dentry_version_inc(dentry->d_parent);
+	}
+
+	/*
+	 * Keeping this dentry hashed would mean having to release
+	 * upperpath/lowerpath, which could only be done if we are the
+	 * sole user of this dentry.  Too tricky...  Just unhash for
+	 * now.
+	 */
+	d_drop(dentry);
+	mutex_unlock(&dir->i_mutex);
+
+	return err;
+}
+
+static inline int ovl_check_sticky(struct dentry *dentry)
+{
+	struct inode *dir = ovl_dentry_real(dentry->d_parent)->d_inode;
+	struct inode *inode = ovl_dentry_real(dentry)->d_inode;
+
+	if (check_sticky(dir, inode))
+		return -EPERM;
+
+	return 0;
+}
+
+static int ovl_do_remove(struct dentry *dentry, bool is_dir)
+{
+	enum ovl_path_type type;
+	int err;
+
+	err = ovl_check_sticky(dentry);
+	if (err)
+		goto out;
+
+	err = ovl_want_write(dentry);
+	if (err)
+		goto out;
+
+	err = ovl_copy_up(dentry->d_parent);
+	if (err)
+		goto out_drop_write;
+
+	type = ovl_path_type(dentry);
+	if (type == OVL_PATH_PURE_UPPER) {
+		err = ovl_remove_upper(dentry, is_dir);
+	} else {
+		const struct cred *old_cred;
+		struct cred *override_cred;
+
+		err = -ENOMEM;
+		override_cred = prepare_creds();
+		if (!override_cred)
+			goto out_drop_write;
+
+		/*
+		 * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
+		 * CAP_DAC_OVERRIDE for create in workdir, rename
+		 * CAP_FOWNER for removing whiteout from sticky dir
+		 * CAP_FSETID for chmod of opaque dir
+		 * CAP_CHOWN for chown of opaque dir
+		 */
+		cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
+		cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
+		cap_raise(override_cred->cap_effective, CAP_FOWNER);
+		cap_raise(override_cred->cap_effective, CAP_FSETID);
+		cap_raise(override_cred->cap_effective, CAP_CHOWN);
+		old_cred = override_creds(override_cred);
+
+		err = ovl_remove_and_whiteout(dentry, type, is_dir);
+
+		revert_creds(old_cred);
+		put_cred(override_cred);
+	}
+out_drop_write:
+	ovl_drop_write(dentry);
+out:
+	return err;
+}
+
+static int ovl_unlink(struct inode *dir, struct dentry *dentry)
+{
+	return ovl_do_remove(dentry, false);
+}
+
+static int ovl_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	return ovl_do_remove(dentry, true);
+}
+
+static int ovl_rename2(struct inode *olddir, struct dentry *old,
+		       struct inode *newdir, struct dentry *new,
+		       unsigned int flags)
+{
+	int err;
+	enum ovl_path_type old_type;
+	enum ovl_path_type new_type;
+	struct dentry *old_upperdir;
+	struct dentry *new_upperdir;
+	struct dentry *olddentry;
+	struct dentry *newdentry;
+	struct dentry *trap;
+	bool old_opaque;
+	bool new_opaque;
+	bool new_create = false;
+	bool cleanup_whiteout = false;
+	bool overwrite = !(flags & RENAME_EXCHANGE);
+	bool is_dir = S_ISDIR(old->d_inode->i_mode);
+	bool new_is_dir = false;
+	struct dentry *opaquedir = NULL;
+	const struct cred *old_cred = NULL;
+	struct cred *override_cred = NULL;
+
+	err = -EINVAL;
+	if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE))
+		goto out;
+
+	flags &= ~RENAME_NOREPLACE;
+
+	err = ovl_check_sticky(old);
+	if (err)
+		goto out;
+
+	/* Don't copy up directory trees */
+	old_type = ovl_path_type(old);
+	err = -EXDEV;
+	if ((old_type == OVL_PATH_LOWER || old_type == OVL_PATH_MERGE) && is_dir)
+		goto out;
+
+	if (new->d_inode) {
+		err = ovl_check_sticky(new);
+		if (err)
+			goto out;
+
+		if (S_ISDIR(new->d_inode->i_mode))
+			new_is_dir = true;
+
+		new_type = ovl_path_type(new);
+		err = -EXDEV;
+		if (!overwrite && (new_type == OVL_PATH_LOWER || new_type == OVL_PATH_MERGE) && new_is_dir)
+			goto out;
+
+		err = 0;
+		if (new_type == OVL_PATH_LOWER && old_type == OVL_PATH_LOWER) {
+			if (ovl_dentry_lower(old)->d_inode ==
+			    ovl_dentry_lower(new)->d_inode)
+				goto out;
+		}
+		if (new_type != OVL_PATH_LOWER && old_type != OVL_PATH_LOWER) {
+			if (ovl_dentry_upper(old)->d_inode ==
+			    ovl_dentry_upper(new)->d_inode)
+				goto out;
+		}
+	} else {
+		if (ovl_dentry_is_opaque(new))
+			new_type = OVL_PATH_UPPER;
+		else
+			new_type = OVL_PATH_PURE_UPPER;
+	}
+
+	err = ovl_want_write(old);
+	if (err)
+		goto out;
+
+	err = ovl_copy_up(old);
+	if (err)
+		goto out_drop_write;
+
+	err = ovl_copy_up(new->d_parent);
+	if (err)
+		goto out_drop_write;
+	if (!overwrite) {
+		err = ovl_copy_up(new);
+		if (err)
+			goto out_drop_write;
+	}
+
+	old_opaque = old_type != OVL_PATH_PURE_UPPER;
+	new_opaque = new_type != OVL_PATH_PURE_UPPER;
+
+	if (old_opaque || new_opaque) {
+		err = -ENOMEM;
+		override_cred = prepare_creds();
+		if (!override_cred)
+			goto out_drop_write;
+
+		/*
+		 * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
+		 * CAP_DAC_OVERRIDE for create in workdir
+		 * CAP_FOWNER for removing whiteout from sticky dir
+		 * CAP_FSETID for chmod of opaque dir
+		 * CAP_CHOWN for chown of opaque dir
+		 */
+		cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
+		cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
+		cap_raise(override_cred->cap_effective, CAP_FOWNER);
+		cap_raise(override_cred->cap_effective, CAP_FSETID);
+		cap_raise(override_cred->cap_effective, CAP_CHOWN);
+		old_cred = override_creds(override_cred);
+	}
+
+	if (overwrite && (new_type == OVL_PATH_LOWER || new_type == OVL_PATH_MERGE) && new_is_dir) {
+		opaquedir = ovl_check_empty_and_clear(new, new_type);
+		err = PTR_ERR(opaquedir);
+		if (IS_ERR(opaquedir)) {
+			opaquedir = NULL;
+			goto out_revert_creds;
+		}
+	}
+
+	if (overwrite) {
+		if (old_opaque) {
+			if (new->d_inode || !new_opaque) {
+				/* Whiteout source */
+				flags |= RENAME_WHITEOUT;
+			} else {
+				/* Switch whiteouts */
+				flags |= RENAME_EXCHANGE;
+			}
+		} else if (is_dir && !new->d_inode && new_opaque) {
+			flags |= RENAME_EXCHANGE;
+			cleanup_whiteout = true;
+		}
+	}
+
+	old_upperdir = ovl_dentry_upper(old->d_parent);
+	new_upperdir = ovl_dentry_upper(new->d_parent);
+
+	trap = lock_rename(new_upperdir, old_upperdir);
+
+	olddentry = ovl_dentry_upper(old);
+	newdentry = ovl_dentry_upper(new);
+	if (newdentry) {
+		if (opaquedir) {
+			newdentry = opaquedir;
+			opaquedir = NULL;
+		} else {
+			dget(newdentry);
+		}
+	} else {
+		new_create = true;
+		newdentry = lookup_one_len(new->d_name.name, new_upperdir,
+					   new->d_name.len);
+		err = PTR_ERR(newdentry);
+		if (IS_ERR(newdentry))
+			goto out_unlock;
+	}
+
+	err = -ESTALE;
+	if (olddentry->d_parent != old_upperdir)
+		goto out_dput;
+	if (newdentry->d_parent != new_upperdir)
+		goto out_dput;
+	if (olddentry == trap)
+		goto out_dput;
+	if (newdentry == trap)
+		goto out_dput;
+
+	if (is_dir && !old_opaque && new_opaque) {
+		err = ovl_set_opaque(olddentry);
+		if (err)
+			goto out_dput;
+	}
+	if (!overwrite && new_is_dir && old_opaque && !new_opaque) {
+		err = ovl_set_opaque(newdentry);
+		if (err)
+			goto out_dput;
+	}
+
+	if (old_opaque || new_opaque) {
+		err = ovl_do_rename(old_upperdir->d_inode, olddentry,
+				    new_upperdir->d_inode, newdentry,
+				    flags);
+	} else {
+		/* No debug for the plain case */
+		BUG_ON(flags & ~RENAME_EXCHANGE);
+		err = vfs_rename(old_upperdir->d_inode, olddentry,
+				 new_upperdir->d_inode, newdentry,
+				 NULL, flags);
+	}
+
+	if (err) {
+		if (is_dir && !old_opaque && new_opaque)
+			ovl_remove_opaque(olddentry);
+		if (!overwrite && new_is_dir && old_opaque && !new_opaque)
+			ovl_remove_opaque(newdentry);
+		goto out_dput;
+	}
+
+	if (is_dir && old_opaque && !new_opaque)
+		ovl_remove_opaque(olddentry);
+	if (!overwrite && new_is_dir && !old_opaque && new_opaque)
+		ovl_remove_opaque(newdentry);
+
+	if (old_opaque != new_opaque) {
+		ovl_dentry_set_opaque(old, new_opaque);
+		if (!overwrite)
+			ovl_dentry_set_opaque(new, old_opaque);
+	}
+
+	if (cleanup_whiteout)
+		ovl_cleanup(old_upperdir->d_inode, newdentry);
+
+	ovl_dentry_version_inc(old->d_parent);
+	ovl_dentry_version_inc(new->d_parent);
+
+out_dput:
+	dput(newdentry);
+out_unlock:
+	unlock_rename(new_upperdir, old_upperdir);
+out_revert_creds:
+	if (old_opaque || new_opaque) {
+		revert_creds(old_cred);
+		put_cred(override_cred);
+	}
+out_drop_write:
+	ovl_drop_write(old);
+out:
+	dput(opaquedir);
+	return err;
+}
+
+const struct inode_operations ovl_dir_inode_operations = {
+	.lookup		= ovl_lookup,
+	.mkdir		= ovl_mkdir,
+	.symlink	= ovl_symlink,
+	.unlink		= ovl_unlink,
+	.rmdir		= ovl_rmdir,
+	.rename2	= ovl_rename2,
+	.link		= ovl_link,
+	.setattr	= ovl_setattr,
+	.create		= ovl_create,
+	.mknod		= ovl_mknod,
+	.permission	= ovl_permission,
+	.getattr	= ovl_dir_getattr,
+	.setxattr	= ovl_setxattr,
+	.getxattr	= ovl_getxattr,
+	.listxattr	= ovl_listxattr,
+	.removexattr	= ovl_removexattr,
+};
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
new file mode 100644
index 0000000..af2d18c
--- /dev/null
+++ b/fs/overlayfs/inode.c
@@ -0,0 +1,425 @@
+/*
+ *
+ * Copyright (C) 2011 Novell Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/xattr.h>
+#include "overlayfs.h"
+
+static int ovl_copy_up_last(struct dentry *dentry, struct iattr *attr,
+			    bool no_data)
+{
+	int err;
+	struct dentry *parent;
+	struct kstat stat;
+	struct path lowerpath;
+
+	parent = dget_parent(dentry);
+	err = ovl_copy_up(parent);
+	if (err)
+		goto out_dput_parent;
+
+	ovl_path_lower(dentry, &lowerpath);
+	err = vfs_getattr(&lowerpath, &stat);
+	if (err)
+		goto out_dput_parent;
+
+	if (no_data)
+		stat.size = 0;
+
+	err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat, attr);
+
+out_dput_parent:
+	dput(parent);
+	return err;
+}
+
+int ovl_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	int err;
+	struct dentry *upperdentry;
+
+	err = ovl_want_write(dentry);
+	if (err)
+		goto out;
+
+	upperdentry = ovl_dentry_upper(dentry);
+	if (upperdentry) {
+		mutex_lock(&upperdentry->d_inode->i_mutex);
+		err = notify_change(upperdentry, attr, NULL);
+		mutex_unlock(&upperdentry->d_inode->i_mutex);
+	} else {
+		err = ovl_copy_up_last(dentry, attr, false);
+	}
+	ovl_drop_write(dentry);
+out:
+	return err;
+}
+
+static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry,
+			 struct kstat *stat)
+{
+	struct path realpath;
+
+	ovl_path_real(dentry, &realpath);
+	return vfs_getattr(&realpath, stat);
+}
+
+int ovl_permission(struct inode *inode, int mask)
+{
+	struct ovl_entry *oe;
+	struct dentry *alias = NULL;
+	struct inode *realinode;
+	struct dentry *realdentry;
+	bool is_upper;
+	int err;
+
+	if (S_ISDIR(inode->i_mode)) {
+		oe = inode->i_private;
+	} else if (mask & MAY_NOT_BLOCK) {
+		return -ECHILD;
+	} else {
+		/*
+		 * For non-directories find an alias and get the info
+		 * from there.
+		 */
+		alias = d_find_any_alias(inode);
+		if (WARN_ON(!alias))
+			return -ENOENT;
+
+		oe = alias->d_fsdata;
+	}
+
+	realdentry = ovl_entry_real(oe, &is_upper);
+
+	/* Careful in RCU walk mode */
+	realinode = ACCESS_ONCE(realdentry->d_inode);
+	if (!realinode) {
+		WARN_ON(!(mask & MAY_NOT_BLOCK));
+		err = -ENOENT;
+		goto out_dput;
+	}
+
+	if (mask & MAY_WRITE) {
+		umode_t mode = realinode->i_mode;
+
+		/*
+		 * Writes will always be redirected to upper layer, so
+		 * ignore lower layer being read-only.
+		 *
+		 * If the overlay itself is read-only then proceed
+		 * with the permission check, don't return EROFS.
+		 * This will only happen if this is the lower layer of
+		 * another overlayfs.
+		 *
+		 * If upper fs becomes read-only after the overlay was
+		 * constructed return EROFS to prevent modification of
+		 * upper layer.
+		 */
+		err = -EROFS;
+		if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) &&
+		    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
+			goto out_dput;
+	}
+
+	err = __inode_permission(realinode, mask);
+out_dput:
+	dput(alias);
+	return err;
+}
+
+
+struct ovl_link_data {
+	struct dentry *realdentry;
+	void *cookie;
+};
+
+static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	void *ret;
+	struct dentry *realdentry;
+	struct inode *realinode;
+
+	realdentry = ovl_dentry_real(dentry);
+	realinode = realdentry->d_inode;
+
+	if (WARN_ON(!realinode->i_op->follow_link))
+		return ERR_PTR(-EPERM);
+
+	ret = realinode->i_op->follow_link(realdentry, nd);
+	if (IS_ERR(ret))
+		return ret;
+
+	if (realinode->i_op->put_link) {
+		struct ovl_link_data *data;
+
+		data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL);
+		if (!data) {
+			realinode->i_op->put_link(realdentry, nd, ret);
+			return ERR_PTR(-ENOMEM);
+		}
+		data->realdentry = realdentry;
+		data->cookie = ret;
+
+		return data;
+	} else {
+		return NULL;
+	}
+}
+
+static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
+{
+	struct inode *realinode;
+	struct ovl_link_data *data = c;
+
+	if (!data)
+		return;
+
+	realinode = data->realdentry->d_inode;
+	realinode->i_op->put_link(data->realdentry, nd, data->cookie);
+	kfree(data);
+}
+
+static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
+{
+	struct path realpath;
+	struct inode *realinode;
+
+	ovl_path_real(dentry, &realpath);
+	realinode = realpath.dentry->d_inode;
+
+	if (!realinode->i_op->readlink)
+		return -EINVAL;
+
+	touch_atime(&realpath);
+
+	return realinode->i_op->readlink(realpath.dentry, buf, bufsiz);
+}
+
+
+static bool ovl_is_private_xattr(const char *name)
+{
+	return strncmp(name, "trusted.overlay.", 14) == 0;
+}
+
+int ovl_setxattr(struct dentry *dentry, const char *name,
+		 const void *value, size_t size, int flags)
+{
+	int err;
+	struct dentry *upperdentry;
+
+	err = ovl_want_write(dentry);
+	if (err)
+		goto out;
+
+	err = -EPERM;
+	if (ovl_is_private_xattr(name))
+		goto out_drop_write;
+
+	err = ovl_copy_up(dentry);
+	if (err)
+		goto out_drop_write;
+
+	upperdentry = ovl_dentry_upper(dentry);
+	err = vfs_setxattr(upperdentry, name, value, size, flags);
+
+out_drop_write:
+	ovl_drop_write(dentry);
+out:
+	return err;
+}
+
+ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
+		     void *value, size_t size)
+{
+	if (ovl_path_type(dentry->d_parent) == OVL_PATH_MERGE &&
+	    ovl_is_private_xattr(name))
+		return -ENODATA;
+
+	return vfs_getxattr(ovl_dentry_real(dentry), name, value, size);
+}
+
+ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
+{
+	ssize_t res;
+	int off;
+
+	res = vfs_listxattr(ovl_dentry_real(dentry), list, size);
+	if (res <= 0 || size == 0)
+		return res;
+
+	if (ovl_path_type(dentry->d_parent) != OVL_PATH_MERGE)
+		return res;
+
+	/* filter out private xattrs */
+	for (off = 0; off < res;) {
+		char *s = list + off;
+		size_t slen = strlen(s) + 1;
+
+		BUG_ON(off + slen > res);
+
+		if (ovl_is_private_xattr(s)) {
+			res -= slen;
+			memmove(s, s + slen, res - off);
+		} else {
+			off += slen;
+		}
+	}
+
+	return res;
+}
+
+int ovl_removexattr(struct dentry *dentry, const char *name)
+{
+	int err;
+	struct path realpath;
+	enum ovl_path_type type;
+
+	err = ovl_want_write(dentry);
+	if (err)
+		goto out;
+
+	if (ovl_path_type(dentry->d_parent) == OVL_PATH_MERGE &&
+	    ovl_is_private_xattr(name))
+		goto out_drop_write;
+
+	type = ovl_path_real(dentry, &realpath);
+	if (type == OVL_PATH_LOWER) {
+		err = vfs_getxattr(realpath.dentry, name, NULL, 0);
+		if (err < 0)
+			goto out_drop_write;
+
+		err = ovl_copy_up(dentry);
+		if (err)
+			goto out_drop_write;
+
+		ovl_path_upper(dentry, &realpath);
+	}
+
+	err = vfs_removexattr(realpath.dentry, name);
+out_drop_write:
+	ovl_drop_write(dentry);
+out:
+	return err;
+}
+
+static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
+				  struct dentry *realdentry)
+{
+	if (type != OVL_PATH_LOWER)
+		return false;
+
+	if (special_file(realdentry->d_inode->i_mode))
+		return false;
+
+	if (!(OPEN_FMODE(flags) & FMODE_WRITE) && !(flags & O_TRUNC))
+		return false;
+
+	return true;
+}
+
+static int ovl_dentry_open(struct dentry *dentry, struct file *file,
+		    const struct cred *cred)
+{
+	int err;
+	struct path realpath;
+	enum ovl_path_type type;
+	bool want_write = false;
+
+	type = ovl_path_real(dentry, &realpath);
+	if (ovl_open_need_copy_up(file->f_flags, type, realpath.dentry)) {
+		want_write = true;
+		err = ovl_want_write(dentry);
+		if (err)
+			goto out;
+
+		if (file->f_flags & O_TRUNC)
+			err = ovl_copy_up_last(dentry, NULL, true);
+		else
+			err = ovl_copy_up(dentry);
+		if (err)
+			goto out_drop_write;
+
+		ovl_path_upper(dentry, &realpath);
+	}
+
+	err = vfs_open(&realpath, file, cred);
+out_drop_write:
+	if (want_write)
+		ovl_drop_write(dentry);
+out:
+	return err;
+}
+
+static const struct inode_operations ovl_file_inode_operations = {
+	.setattr	= ovl_setattr,
+	.permission	= ovl_permission,
+	.getattr	= ovl_getattr,
+	.setxattr	= ovl_setxattr,
+	.getxattr	= ovl_getxattr,
+	.listxattr	= ovl_listxattr,
+	.removexattr	= ovl_removexattr,
+	.dentry_open	= ovl_dentry_open,
+};
+
+static const struct inode_operations ovl_symlink_inode_operations = {
+	.setattr	= ovl_setattr,
+	.follow_link	= ovl_follow_link,
+	.put_link	= ovl_put_link,
+	.readlink	= ovl_readlink,
+	.getattr	= ovl_getattr,
+	.setxattr	= ovl_setxattr,
+	.getxattr	= ovl_getxattr,
+	.listxattr	= ovl_listxattr,
+	.removexattr	= ovl_removexattr,
+};
+
+struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
+			    struct ovl_entry *oe)
+{
+	struct inode *inode;
+
+	inode = new_inode(sb);
+	if (!inode)
+		return NULL;
+
+	mode &= S_IFMT;
+
+	inode->i_ino = get_next_ino();
+	inode->i_mode = mode;
+	inode->i_flags |= S_NOATIME | S_NOCMTIME;
+
+	switch (mode) {
+	case S_IFDIR:
+		inode->i_private = oe;
+		inode->i_op = &ovl_dir_inode_operations;
+		inode->i_fop = &ovl_dir_operations;
+		break;
+
+	case S_IFLNK:
+		inode->i_op = &ovl_symlink_inode_operations;
+		break;
+
+	case S_IFREG:
+	case S_IFSOCK:
+	case S_IFBLK:
+	case S_IFCHR:
+	case S_IFIFO:
+		inode->i_op = &ovl_file_inode_operations;
+		break;
+
+	default:
+		WARN(1, "illegal file type: %i\n", mode);
+		iput(inode);
+		inode = NULL;
+	}
+
+	return inode;
+
+}
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
new file mode 100644
index 0000000..814bed3
--- /dev/null
+++ b/fs/overlayfs/overlayfs.h
@@ -0,0 +1,191 @@
+/*
+ *
+ * Copyright (C) 2011 Novell Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+
+struct ovl_entry;
+
+enum ovl_path_type {
+	OVL_PATH_PURE_UPPER,
+	OVL_PATH_UPPER,
+	OVL_PATH_MERGE,
+	OVL_PATH_LOWER,
+};
+
+extern const char *ovl_opaque_xattr;
+
+static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	int err = vfs_rmdir(dir, dentry);
+	pr_debug("rmdir(%pd2) = %i\n", dentry, err);
+	return err;
+}
+
+static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry)
+{
+	int err = vfs_unlink(dir, dentry, NULL);
+	pr_debug("unlink(%pd2) = %i\n", dentry, err);
+	return err;
+}
+
+static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir,
+			      struct dentry *new_dentry, bool debug)
+{
+	int err = vfs_link(old_dentry, dir, new_dentry, NULL);
+	if (debug) {
+		pr_debug("link(%pd2, %pd2) = %i\n",
+			 old_dentry, new_dentry, err);
+	}
+	return err;
+}
+
+static inline int ovl_do_create(struct inode *dir, struct dentry *dentry,
+			     umode_t mode, bool debug)
+{
+	int err = vfs_create(dir, dentry, mode, true);
+	if (debug)
+		pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err);
+	return err;
+}
+
+static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry,
+			       umode_t mode, bool debug)
+{
+	int err = vfs_mkdir(dir, dentry, mode);
+	if (debug)
+		pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err);
+	return err;
+}
+
+static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry,
+			       umode_t mode, dev_t dev, bool debug)
+{
+	int err = vfs_mknod(dir, dentry, mode, dev);
+	if (debug) {
+		pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n",
+			 dentry, mode, dev, err);
+	}
+	return err;
+}
+
+static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry,
+				 const char *oldname, bool debug)
+{
+	int err = vfs_symlink(dir, dentry, oldname);
+	if (debug)
+		pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err);
+	return err;
+}
+
+static inline int ovl_do_setxattr(struct dentry *dentry, const char *name,
+				  const void *value, size_t size, int flags)
+{
+	int err = vfs_setxattr(dentry, name, value, size, flags);
+	pr_debug("setxattr(%pd2, \"%s\", \"%*s\", 0x%x) = %i\n",
+		 dentry, name, (int) size, (char *) value, flags, err);
+	return err;
+}
+
+static inline int ovl_do_removexattr(struct dentry *dentry, const char *name)
+{
+	int err = vfs_removexattr(dentry, name);
+	pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err);
+	return err;
+}
+
+static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
+				struct inode *newdir, struct dentry *newdentry,
+				unsigned int flags)
+{
+	int err;
+
+	pr_debug("rename2(%pd2, %pd2, 0x%x)\n",
+		 olddentry, newdentry, flags);
+
+	err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags);
+
+	if (err) {
+		pr_debug("...rename2(%pd2, %pd2, ...) = %i\n",
+			 olddentry, newdentry, err);
+	}
+	return err;
+}
+
+static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry)
+{
+	int err = vfs_whiteout(dir, dentry);
+	pr_debug("whiteout(%pd2) = %i\n", dentry, err);
+	return err;
+}
+
+enum ovl_path_type ovl_path_type(struct dentry *dentry);
+u64 ovl_dentry_version_get(struct dentry *dentry);
+void ovl_dentry_version_inc(struct dentry *dentry);
+void ovl_path_upper(struct dentry *dentry, struct path *path);
+void ovl_path_lower(struct dentry *dentry, struct path *path);
+enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
+struct dentry *ovl_dentry_upper(struct dentry *dentry);
+struct dentry *ovl_dentry_lower(struct dentry *dentry);
+struct dentry *ovl_dentry_real(struct dentry *dentry);
+struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper);
+struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry);
+void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache);
+struct dentry *ovl_workdir(struct dentry *dentry);
+int ovl_want_write(struct dentry *dentry);
+void ovl_drop_write(struct dentry *dentry);
+bool ovl_dentry_is_opaque(struct dentry *dentry);
+void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque);
+bool ovl_is_whiteout(struct dentry *dentry);
+void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry);
+struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
+			  unsigned int flags);
+struct file *ovl_path_open(struct path *path, int flags);
+
+struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry,
+				struct kstat *stat, const char *link);
+
+/* readdir.c */
+extern const struct file_operations ovl_dir_operations;
+int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
+void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
+void ovl_cache_free(struct list_head *list);
+
+/* inode.c */
+int ovl_setattr(struct dentry *dentry, struct iattr *attr);
+int ovl_permission(struct inode *inode, int mask);
+int ovl_setxattr(struct dentry *dentry, const char *name,
+		 const void *value, size_t size, int flags);
+ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
+		     void *value, size_t size);
+ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
+int ovl_removexattr(struct dentry *dentry, const char *name);
+
+struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
+			    struct ovl_entry *oe);
+static inline void ovl_copyattr(struct inode *from, struct inode *to)
+{
+	to->i_uid = from->i_uid;
+	to->i_gid = from->i_gid;
+}
+
+/* dir.c */
+extern const struct inode_operations ovl_dir_inode_operations;
+struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry);
+int ovl_create_real(struct inode *dir, struct dentry *newdentry,
+		    struct kstat *stat, const char *link,
+		    struct dentry *hardlink, bool debug);
+void ovl_cleanup(struct inode *dir, struct dentry *dentry);
+
+/* copy_up.c */
+int ovl_copy_up(struct dentry *dentry);
+int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
+		    struct path *lowerpath, struct kstat *stat,
+		    struct iattr *attr);
+int ovl_copy_xattr(struct dentry *old, struct dentry *new);
+int ovl_set_attr(struct dentry *upper, struct kstat *stat);
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
new file mode 100644
index 0000000..c6787f8
--- /dev/null
+++ b/fs/overlayfs/readdir.c
@@ -0,0 +1,587 @@
+/*
+ *
+ * Copyright (C) 2011 Novell Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/namei.h>
+#include <linux/file.h>
+#include <linux/xattr.h>
+#include <linux/rbtree.h>
+#include <linux/security.h>
+#include <linux/cred.h>
+#include "overlayfs.h"
+
+struct ovl_cache_entry {
+	const char *name;
+	unsigned int len;
+	unsigned int type;
+	u64 ino;
+	bool is_whiteout;
+	struct list_head l_node;
+	struct rb_node node;
+};
+
+struct ovl_dir_cache {
+	long refcount;
+	u64 version;
+	struct list_head entries;
+};
+
+struct ovl_readdir_data {
+	struct dir_context ctx;
+	bool is_merge;
+	struct rb_root *root;
+	struct list_head *list;
+	struct list_head *middle;
+	int count;
+	int err;
+};
+
+struct ovl_dir_file {
+	bool is_real;
+	bool is_upper;
+	struct ovl_dir_cache *cache;
+	struct ovl_cache_entry cursor;
+	struct file *realfile;
+	struct file *upperfile;
+};
+
+static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n)
+{
+	return container_of(n, struct ovl_cache_entry, node);
+}
+
+static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
+						    const char *name, int len)
+{
+	struct rb_node *node = root->rb_node;
+	int cmp;
+
+	while (node) {
+		struct ovl_cache_entry *p = ovl_cache_entry_from_node(node);
+
+		cmp = strncmp(name, p->name, len);
+		if (cmp > 0)
+			node = p->node.rb_right;
+		else if (cmp < 0 || len < p->len)
+			node = p->node.rb_left;
+		else
+			return p;
+	}
+
+	return NULL;
+}
+
+static struct ovl_cache_entry *ovl_cache_entry_new(const char *name, int len,
+						   u64 ino, unsigned int d_type)
+{
+	struct ovl_cache_entry *p;
+
+	p = kmalloc(sizeof(*p) + len + 1, GFP_KERNEL);
+	if (p) {
+		char *name_copy = (char *) (p + 1);
+		memcpy(name_copy, name, len);
+		name_copy[len] = '\0';
+		p->name = name_copy;
+		p->len = len;
+		p->type = d_type;
+		p->ino = ino;
+		p->is_whiteout = false;
+	}
+
+	return p;
+}
+
+static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
+				  const char *name, int len, u64 ino,
+				  unsigned int d_type)
+{
+	struct rb_node **newp = &rdd->root->rb_node;
+	struct rb_node *parent = NULL;
+	struct ovl_cache_entry *p;
+
+	while (*newp) {
+		int cmp;
+		struct ovl_cache_entry *tmp;
+
+		parent = *newp;
+		tmp = ovl_cache_entry_from_node(*newp);
+		cmp = strncmp(name, tmp->name, len);
+		if (cmp > 0)
+			newp = &tmp->node.rb_right;
+		else if (cmp < 0 || len < tmp->len)
+			newp = &tmp->node.rb_left;
+		else
+			return 0;
+	}
+
+	p = ovl_cache_entry_new(name, len, ino, d_type);
+	if (p == NULL)
+		return -ENOMEM;
+
+	list_add_tail(&p->l_node, rdd->list);
+	rb_link_node(&p->node, parent, newp);
+	rb_insert_color(&p->node, rdd->root);
+
+	return 0;
+}
+
+static int ovl_fill_lower(struct ovl_readdir_data *rdd,
+			  const char *name, int namelen,
+			  loff_t offset, u64 ino, unsigned int d_type)
+{
+	struct ovl_cache_entry *p;
+
+	p = ovl_cache_entry_find(rdd->root, name, namelen);
+	if (p) {
+		list_move_tail(&p->l_node, rdd->middle);
+	} else {
+		p = ovl_cache_entry_new(name, namelen, ino, d_type);
+		if (p == NULL)
+			rdd->err = -ENOMEM;
+		else
+			list_add_tail(&p->l_node, rdd->middle);
+	}
+
+	return rdd->err;
+}
+
+void ovl_cache_free(struct list_head *list)
+{
+	struct ovl_cache_entry *p;
+	struct ovl_cache_entry *n;
+
+	list_for_each_entry_safe(p, n, list, l_node)
+		kfree(p);
+
+	INIT_LIST_HEAD(list);
+}
+
+static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
+{
+	struct ovl_dir_cache *cache = od->cache;
+
+	list_del(&od->cursor.l_node);
+	WARN_ON(cache->refcount <= 0);
+	cache->refcount--;
+	if (!cache->refcount) {
+		if (ovl_dir_cache(dentry) == cache)
+			ovl_set_dir_cache(dentry, NULL);
+
+		ovl_cache_free(&cache->entries);
+		kfree(cache);
+	}
+}
+
+static int ovl_fill_merge(void *buf, const char *name, int namelen,
+			  loff_t offset, u64 ino, unsigned int d_type)
+{
+	struct ovl_readdir_data *rdd = buf;
+
+	rdd->count++;
+	if (!rdd->is_merge)
+		return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type);
+	else
+		return ovl_fill_lower(rdd, name, namelen, offset, ino, d_type);
+}
+
+static inline int ovl_dir_read(struct path *realpath,
+			       struct ovl_readdir_data *rdd)
+{
+	struct file *realfile;
+	int err;
+
+	realfile = ovl_path_open(realpath, O_RDONLY | O_DIRECTORY);
+	if (IS_ERR(realfile))
+		return PTR_ERR(realfile);
+
+	rdd->ctx.pos = 0;
+	do {
+		rdd->count = 0;
+		rdd->err = 0;
+		err = iterate_dir(realfile, &rdd->ctx);
+		if (err >= 0)
+			err = rdd->err;
+	} while (!err && rdd->count);
+	fput(realfile);
+
+	return err;
+}
+
+static void ovl_dir_reset(struct file *file)
+{
+	struct ovl_dir_file *od = file->private_data;
+	struct ovl_dir_cache *cache = od->cache;
+	struct dentry *dentry = file->f_path.dentry;
+	enum ovl_path_type type = ovl_path_type(dentry);
+
+	if (cache && ovl_dentry_version_get(dentry) != cache->version) {
+		ovl_cache_put(od, dentry);
+		od->cache = NULL;
+	}
+	WARN_ON(!od->is_real && type != OVL_PATH_MERGE);
+	if (od->is_real && type == OVL_PATH_MERGE)
+		od->is_real = false;
+}
+
+static int ovl_dir_mark_whiteouts(struct dentry *dir,
+				  struct ovl_readdir_data *rdd)
+{
+	struct ovl_cache_entry *p;
+	struct dentry *dentry;
+	const struct cred *old_cred;
+	struct cred *override_cred;
+
+	override_cred = prepare_creds();
+	if (!override_cred) {
+		ovl_cache_free(rdd->list);
+		return -ENOMEM;
+	}
+
+	/*
+	 * CAP_DAC_OVERRIDE for lookup
+	 */
+	cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
+	old_cred = override_creds(override_cred);
+
+	mutex_lock(&dir->d_inode->i_mutex);
+	list_for_each_entry(p, rdd->list, l_node) {
+		if (!p->name)
+			continue;
+
+		if (p->type != DT_CHR)
+			continue;
+
+		dentry = lookup_one_len(p->name, dir, p->len);
+		if (IS_ERR(dentry))
+			continue;
+
+		p->is_whiteout = ovl_is_whiteout(dentry);
+		dput(dentry);
+	}
+	mutex_unlock(&dir->d_inode->i_mutex);
+
+	revert_creds(old_cred);
+	put_cred(override_cred);
+
+	return 0;
+}
+
+static inline int ovl_dir_read_merged(struct path *upperpath,
+				      struct path *lowerpath,
+				      struct list_head *list)
+{
+	int err;
+	struct rb_root root = RB_ROOT;
+	struct list_head middle;
+	struct ovl_readdir_data rdd = {
+		.ctx.actor = ovl_fill_merge,
+		.list = list,
+		.root = &root,
+		.is_merge = false,
+	};
+
+	if (upperpath->dentry) {
+		err = ovl_dir_read(upperpath, &rdd);
+		if (err)
+			goto out;
+
+		if (lowerpath->dentry) {
+			err = ovl_dir_mark_whiteouts(upperpath->dentry, &rdd);
+			if (err)
+				goto out;
+		}
+	}
+	if (lowerpath->dentry) {
+		/*
+		 * Insert lowerpath entries before upperpath ones, this allows
+		 * offsets to be reasonably constant
+		 */
+		list_add(&middle, rdd.list);
+		rdd.middle = &middle;
+		rdd.is_merge = true;
+		err = ovl_dir_read(lowerpath, &rdd);
+		list_del(&middle);
+	}
+out:
+	return err;
+
+}
+
+static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
+{
+	struct ovl_cache_entry *p;
+	loff_t off = 0;
+
+	list_for_each_entry(p, &od->cache->entries, l_node) {
+		if (!p->name)
+			continue;
+		if (off >= pos)
+			break;
+		off++;
+	}
+	list_move_tail(&od->cursor.l_node, &p->l_node);
+}
+
+static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
+{
+	int res;
+	struct path lowerpath;
+	struct path upperpath;
+	struct ovl_dir_cache *cache;
+
+	cache = ovl_dir_cache(dentry);
+	if (cache && ovl_dentry_version_get(dentry) == cache->version) {
+		cache->refcount++;
+		return cache;
+	}
+	ovl_set_dir_cache(dentry, NULL);
+
+	cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL);
+	if (!cache)
+		return ERR_PTR(-ENOMEM);
+
+	cache->refcount = 1;
+	INIT_LIST_HEAD(&cache->entries);
+
+	ovl_path_lower(dentry, &lowerpath);
+	ovl_path_upper(dentry, &upperpath);
+
+	res = ovl_dir_read_merged(&upperpath, &lowerpath, &cache->entries);
+	if (res) {
+		ovl_cache_free(&cache->entries);
+		kfree(cache);
+		return ERR_PTR(res);
+	}
+
+	cache->version = ovl_dentry_version_get(dentry);
+	ovl_set_dir_cache(dentry, cache);
+
+	return cache;
+}
+
+static int ovl_iterate(struct file *file, struct dir_context *ctx)
+{
+	struct ovl_dir_file *od = file->private_data;
+	struct dentry *dentry = file->f_path.dentry;
+
+	if (!ctx->pos)
+		ovl_dir_reset(file);
+
+	if (od->is_real)
+		return iterate_dir(od->realfile, ctx);
+
+	if (!od->cache) {
+		struct ovl_dir_cache *cache;
+
+		cache = ovl_cache_get(dentry);
+		if (IS_ERR(cache))
+			return PTR_ERR(cache);
+
+		od->cache = cache;
+		ovl_seek_cursor(od, ctx->pos);
+	}
+
+	while (od->cursor.l_node.next != &od->cache->entries) {
+		struct ovl_cache_entry *p;
+
+		p = list_entry(od->cursor.l_node.next, struct ovl_cache_entry, l_node);
+		/* Skip cursors */
+		if (p->name) {
+			if (!p->is_whiteout) {
+				if (!dir_emit(ctx, p->name, p->len, p->ino, p->type))
+					break;
+			}
+			ctx->pos++;
+		}
+		list_move(&od->cursor.l_node, &p->l_node);
+	}
+	return 0;
+}
+
+static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
+{
+	loff_t res;
+	struct ovl_dir_file *od = file->private_data;
+
+	mutex_lock(&file_inode(file)->i_mutex);
+	if (!file->f_pos)
+		ovl_dir_reset(file);
+
+	if (od->is_real) {
+		res = vfs_llseek(od->realfile, offset, origin);
+		file->f_pos = od->realfile->f_pos;
+	} else {
+		res = -EINVAL;
+
+		switch (origin) {
+		case SEEK_CUR:
+			offset += file->f_pos;
+			break;
+		case SEEK_SET:
+			break;
+		default:
+			goto out_unlock;
+		}
+		if (offset < 0)
+			goto out_unlock;
+
+		if (offset != file->f_pos) {
+			file->f_pos = offset;
+			if (od->cache)
+				ovl_seek_cursor(od, offset);
+		}
+		res = offset;
+	}
+out_unlock:
+	mutex_unlock(&file_inode(file)->i_mutex);
+
+	return res;
+}
+
+static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
+			 int datasync)
+{
+	struct ovl_dir_file *od = file->private_data;
+	struct dentry *dentry = file->f_path.dentry;
+	struct file *realfile = od->realfile;
+
+	/*
+	 * Need to check if we started out being a lower dir, but got copied up
+	 */
+	if (!od->is_upper && ovl_path_type(dentry) == OVL_PATH_MERGE) {
+		struct inode *inode = file_inode(file);
+
+		mutex_lock(&inode->i_mutex);
+		realfile = od->upperfile;
+		if (!realfile) {
+			struct path upperpath;
+
+			ovl_path_upper(dentry, &upperpath);
+			realfile = ovl_path_open(&upperpath, O_RDONLY);
+			if (IS_ERR(realfile)) {
+				mutex_unlock(&inode->i_mutex);
+				return PTR_ERR(realfile);
+			}
+			od->upperfile = realfile;
+		}
+		mutex_unlock(&inode->i_mutex);
+	}
+
+	return vfs_fsync_range(realfile, start, end, datasync);
+}
+
+static int ovl_dir_release(struct inode *inode, struct file *file)
+{
+	struct ovl_dir_file *od = file->private_data;
+
+	if (od->cache) {
+		mutex_lock(&inode->i_mutex);
+		ovl_cache_put(od, file->f_path.dentry);
+		mutex_unlock(&inode->i_mutex);
+	}
+	fput(od->realfile);
+	if (od->upperfile)
+		fput(od->upperfile);
+	kfree(od);
+
+	return 0;
+}
+
+static int ovl_dir_open(struct inode *inode, struct file *file)
+{
+	struct path realpath;
+	struct file *realfile;
+	struct ovl_dir_file *od;
+	enum ovl_path_type type;
+
+	od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL);
+	if (!od)
+		return -ENOMEM;
+
+	type = ovl_path_real(file->f_path.dentry, &realpath);
+	realfile = ovl_path_open(&realpath, file->f_flags);
+	if (IS_ERR(realfile)) {
+		kfree(od);
+		return PTR_ERR(realfile);
+	}
+	INIT_LIST_HEAD(&od->cursor.l_node);
+	od->realfile = realfile;
+	od->is_real = (type != OVL_PATH_MERGE);
+	od->is_upper = (type != OVL_PATH_LOWER);
+	file->private_data = od;
+
+	return 0;
+}
+
+const struct file_operations ovl_dir_operations = {
+	.read		= generic_read_dir,
+	.open		= ovl_dir_open,
+	.iterate	= ovl_iterate,
+	.llseek		= ovl_dir_llseek,
+	.fsync		= ovl_dir_fsync,
+	.release	= ovl_dir_release,
+};
+
+int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
+{
+	int err;
+	struct path lowerpath;
+	struct path upperpath;
+	struct ovl_cache_entry *p;
+
+	ovl_path_upper(dentry, &upperpath);
+	ovl_path_lower(dentry, &lowerpath);
+
+	err = ovl_dir_read_merged(&upperpath, &lowerpath, list);
+	if (err)
+		return err;
+
+	err = 0;
+
+	list_for_each_entry(p, list, l_node) {
+		if (p->is_whiteout)
+			continue;
+
+		if (p->name[0] == '.') {
+			if (p->len == 1)
+				continue;
+			if (p->len == 2 && p->name[1] == '.')
+				continue;
+		}
+		err = -ENOTEMPTY;
+		break;
+	}
+
+	return err;
+}
+
+void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
+{
+	struct ovl_cache_entry *p;
+
+	mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_PARENT);
+	list_for_each_entry(p, list, l_node) {
+		struct dentry *dentry;
+
+		if (!p->is_whiteout)
+			continue;
+
+		dentry = lookup_one_len(p->name, upper, p->len);
+		if (IS_ERR(dentry)) {
+			pr_err("overlayfs: lookup '%s/%.*s' failed (%i)\n",
+			       upper->d_name.name, p->len, p->name,
+			       (int) PTR_ERR(dentry));
+			continue;
+		}
+		ovl_cleanup(upper->d_inode, dentry);
+		dput(dentry);
+	}
+	mutex_unlock(&upper->d_inode->i_mutex);
+}
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
new file mode 100644
index 0000000..227710a
--- /dev/null
+++ b/fs/overlayfs/super.c
@@ -0,0 +1,727 @@
+/*
+ *
+ * Copyright (C) 2011 Novell Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/xattr.h>
+#include <linux/security.h>
+#include <linux/mount.h>
+#include <linux/slab.h>
+#include <linux/parser.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include "overlayfs.h"
+
+MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
+MODULE_DESCRIPTION("Overlay filesystem");
+MODULE_LICENSE("GPL");
+
+/* private information held for overlayfs's superblock */
+struct ovl_fs {
+	struct vfsmount *upper_mnt;
+	struct vfsmount *lower_mnt;
+	struct dentry *workdir;
+};
+
+struct ovl_dir_cache;
+
+/* private information held for every overlayfs dentry */
+struct ovl_entry {
+	struct dentry *__upperdentry;
+	struct dentry *lowerdentry;
+	struct ovl_dir_cache *cache;
+	union {
+		struct {
+			u64 version;
+			bool opaque;
+		};
+		struct rcu_head rcu;
+	};
+};
+
+const char *ovl_opaque_xattr = "trusted.overlay.opaque";
+
+
+enum ovl_path_type ovl_path_type(struct dentry *dentry)
+{
+	struct ovl_entry *oe = dentry->d_fsdata;
+
+	if (oe->__upperdentry) {
+		if (oe->lowerdentry) {
+			if (S_ISDIR(dentry->d_inode->i_mode))
+				return OVL_PATH_MERGE;
+			else
+				return OVL_PATH_UPPER;
+		} else {
+			if (oe->opaque)
+				return OVL_PATH_UPPER;
+			else
+				return OVL_PATH_PURE_UPPER;
+		}
+	} else {
+		return OVL_PATH_LOWER;
+	}
+}
+
+static struct dentry *ovl_upperdentry_dereference(struct ovl_entry *oe)
+{
+	struct dentry *upperdentry = ACCESS_ONCE(oe->__upperdentry);
+	/*
+	 * Make sure to order reads to upperdentry wrt ovl_dentry_update()
+	 */
+	smp_read_barrier_depends();
+	return upperdentry;
+}
+
+void ovl_path_upper(struct dentry *dentry, struct path *path)
+{
+	struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+	struct ovl_entry *oe = dentry->d_fsdata;
+
+	path->mnt = ofs->upper_mnt;
+	path->dentry = ovl_upperdentry_dereference(oe);
+}
+
+enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path)
+{
+
+	enum ovl_path_type type = ovl_path_type(dentry);
+
+	if (type == OVL_PATH_LOWER)
+		ovl_path_lower(dentry, path);
+	else
+		ovl_path_upper(dentry, path);
+
+	return type;
+}
+
+struct dentry *ovl_dentry_upper(struct dentry *dentry)
+{
+	struct ovl_entry *oe = dentry->d_fsdata;
+
+	return ovl_upperdentry_dereference(oe);
+}
+
+struct dentry *ovl_dentry_lower(struct dentry *dentry)
+{
+	struct ovl_entry *oe = dentry->d_fsdata;
+
+	return oe->lowerdentry;
+}
+
+struct dentry *ovl_dentry_real(struct dentry *dentry)
+{
+	struct ovl_entry *oe = dentry->d_fsdata;
+	struct dentry *realdentry;
+
+	realdentry = ovl_upperdentry_dereference(oe);
+	if (!realdentry)
+		realdentry = oe->lowerdentry;
+
+	return realdentry;
+}
+
+struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper)
+{
+	struct dentry *realdentry;
+
+	realdentry = ovl_upperdentry_dereference(oe);
+	if (realdentry) {
+		*is_upper = true;
+	} else {
+		realdentry = oe->lowerdentry;
+		*is_upper = false;
+	}
+	return realdentry;
+}
+
+struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry)
+{
+	struct ovl_entry *oe = dentry->d_fsdata;
+
+	return oe->cache;
+}
+
+void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache)
+{
+	struct ovl_entry *oe = dentry->d_fsdata;
+
+	oe->cache = cache;
+}
+
+void ovl_path_lower(struct dentry *dentry, struct path *path)
+{
+	struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+	struct ovl_entry *oe = dentry->d_fsdata;
+
+	path->mnt = ofs->lower_mnt;
+	path->dentry = oe->lowerdentry;
+}
+
+int ovl_want_write(struct dentry *dentry)
+{
+	struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+	return mnt_want_write(ofs->upper_mnt);
+}
+
+void ovl_drop_write(struct dentry *dentry)
+{
+	struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+	mnt_drop_write(ofs->upper_mnt);
+}
+
+struct dentry *ovl_workdir(struct dentry *dentry)
+{
+	struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+	return ofs->workdir;
+}
+
+bool ovl_dentry_is_opaque(struct dentry *dentry)
+{
+	struct ovl_entry *oe = dentry->d_fsdata;
+	return oe->opaque;
+}
+
+void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque)
+{
+	struct ovl_entry *oe = dentry->d_fsdata;
+	oe->opaque = opaque;
+}
+
+void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry)
+{
+	struct ovl_entry *oe = dentry->d_fsdata;
+
+	WARN_ON(!mutex_is_locked(&upperdentry->d_parent->d_inode->i_mutex));
+	WARN_ON(oe->__upperdentry);
+	BUG_ON(!upperdentry->d_inode);
+	/*
+	 * Make sure upperdentry is consistent before making it visible to
+	 * ovl_upperdentry_dereference().
+	 */
+	smp_wmb();
+	oe->__upperdentry = upperdentry;
+}
+
+void ovl_dentry_version_inc(struct dentry *dentry)
+{
+	struct ovl_entry *oe = dentry->d_fsdata;
+
+	WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
+	oe->version++;
+}
+
+u64 ovl_dentry_version_get(struct dentry *dentry)
+{
+	struct ovl_entry *oe = dentry->d_fsdata;
+
+	WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
+	return oe->version;
+}
+
+bool ovl_is_whiteout(struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+
+	return inode && IS_WHITEOUT(inode);
+}
+
+static bool ovl_is_opaquedir(struct dentry *dentry)
+{
+	int res;
+	char val;
+	struct inode *inode = dentry->d_inode;
+
+	if (!S_ISDIR(inode->i_mode) || !inode->i_op->getxattr)
+		return false;
+
+	res = inode->i_op->getxattr(dentry, ovl_opaque_xattr, &val, 1);
+	if (res == 1 && val == 'y')
+		return true;
+
+	return false;
+}
+
+static void ovl_dentry_release(struct dentry *dentry)
+{
+	struct ovl_entry *oe = dentry->d_fsdata;
+
+	if (oe) {
+		dput(oe->__upperdentry);
+		dput(oe->lowerdentry);
+		kfree_rcu(oe, rcu);
+	}
+}
+
+static const struct dentry_operations ovl_dentry_operations = {
+	.d_release = ovl_dentry_release,
+};
+
+static struct ovl_entry *ovl_alloc_entry(void)
+{
+	return kzalloc(sizeof(struct ovl_entry), GFP_KERNEL);
+}
+
+static inline struct dentry *ovl_lookup_real(struct dentry *dir,
+					     struct qstr *name)
+{
+	struct dentry *dentry;
+
+	mutex_lock(&dir->d_inode->i_mutex);
+	dentry = lookup_one_len(name->name, dir, name->len);
+	mutex_unlock(&dir->d_inode->i_mutex);
+
+	if (IS_ERR(dentry)) {
+		if (PTR_ERR(dentry) == -ENOENT)
+			dentry = NULL;
+	} else if (!dentry->d_inode) {
+		dput(dentry);
+		dentry = NULL;
+	}
+	return dentry;
+}
+
+struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
+			  unsigned int flags)
+{
+	struct ovl_entry *oe;
+	struct dentry *upperdir;
+	struct dentry *lowerdir;
+	struct dentry *upperdentry = NULL;
+	struct dentry *lowerdentry = NULL;
+	struct inode *inode = NULL;
+	int err;
+
+	err = -ENOMEM;
+	oe = ovl_alloc_entry();
+	if (!oe)
+		goto out;
+
+	upperdir = ovl_dentry_upper(dentry->d_parent);
+	lowerdir = ovl_dentry_lower(dentry->d_parent);
+
+	if (upperdir) {
+		upperdentry = ovl_lookup_real(upperdir, &dentry->d_name);
+		err = PTR_ERR(upperdentry);
+		if (IS_ERR(upperdentry))
+			goto out_put_dir;
+
+		if (lowerdir && upperdentry) {
+			if (ovl_is_whiteout(upperdentry)) {
+				dput(upperdentry);
+				upperdentry = NULL;
+				oe->opaque = true;
+			} else if (ovl_is_opaquedir(upperdentry)) {
+				oe->opaque = true;
+			}
+		}
+	}
+	if (lowerdir && !oe->opaque) {
+		lowerdentry = ovl_lookup_real(lowerdir, &dentry->d_name);
+		err = PTR_ERR(lowerdentry);
+		if (IS_ERR(lowerdentry))
+			goto out_dput_upper;
+	}
+
+	if (lowerdentry && upperdentry &&
+	    (!S_ISDIR(upperdentry->d_inode->i_mode) ||
+	     !S_ISDIR(lowerdentry->d_inode->i_mode))) {
+		dput(lowerdentry);
+		lowerdentry = NULL;
+		oe->opaque = true;
+	}
+
+	if (lowerdentry || upperdentry) {
+		struct dentry *realdentry;
+
+		realdentry = upperdentry ? upperdentry : lowerdentry;
+		err = -ENOMEM;
+		inode = ovl_new_inode(dentry->d_sb, realdentry->d_inode->i_mode,
+				      oe);
+		if (!inode)
+			goto out_dput;
+		ovl_copyattr(realdentry->d_inode, inode);
+	}
+
+	oe->__upperdentry = upperdentry;
+	oe->lowerdentry = lowerdentry;
+
+	dentry->d_fsdata = oe;
+	d_add(dentry, inode);
+
+	return NULL;
+
+out_dput:
+	dput(lowerdentry);
+out_dput_upper:
+	dput(upperdentry);
+out_put_dir:
+	kfree(oe);
+out:
+	return ERR_PTR(err);
+}
+
+struct file *ovl_path_open(struct path *path, int flags)
+{
+	return dentry_open(path, flags, current_cred());
+}
+
+static void ovl_put_super(struct super_block *sb)
+{
+	struct ovl_fs *ufs = sb->s_fs_info;
+
+	dput(ufs->workdir);
+	mntput(ufs->upper_mnt);
+	mntput(ufs->lower_mnt);
+
+	kfree(ufs);
+}
+
+static const struct super_operations ovl_super_operations = {
+	.put_super	= ovl_put_super,
+};
+
+struct ovl_config {
+	char *lowerdir;
+	char *upperdir;
+	char *workdir;
+};
+
+enum {
+	OPT_LOWERDIR,
+	OPT_UPPERDIR,
+	OPT_WORKDIR,
+	OPT_ERR,
+};
+
+static const match_table_t ovl_tokens = {
+	{OPT_LOWERDIR,			"lowerdir=%s"},
+	{OPT_UPPERDIR,			"upperdir=%s"},
+	{OPT_WORKDIR,			"workdir=%s"},
+	{OPT_ERR,			NULL}
+};
+
+static int ovl_parse_opt(char *opt, struct ovl_config *config)
+{
+	char *p;
+
+	config->upperdir = NULL;
+	config->lowerdir = NULL;
+	config->workdir = NULL;
+
+	while ((p = strsep(&opt, ",")) != NULL) {
+		int token;
+		substring_t args[MAX_OPT_ARGS];
+
+		if (!*p)
+			continue;
+
+		token = match_token(p, ovl_tokens, args);
+		switch (token) {
+		case OPT_UPPERDIR:
+			kfree(config->upperdir);
+			config->upperdir = match_strdup(&args[0]);
+			if (!config->upperdir)
+				return -ENOMEM;
+			break;
+
+		case OPT_LOWERDIR:
+			kfree(config->lowerdir);
+			config->lowerdir = match_strdup(&args[0]);
+			if (!config->lowerdir)
+				return -ENOMEM;
+			break;
+
+		case OPT_WORKDIR:
+			kfree(config->workdir);
+			config->workdir = match_strdup(&args[0]);
+			if (!config->workdir)
+				return -ENOMEM;
+			break;
+
+		default:
+			return -EINVAL;
+		}
+	}
+	return 0;
+}
+
+#define OVL_WORKDIR_NAME "work"
+
+static struct dentry *ovl_workdir_create(struct vfsmount *mnt,
+					 struct dentry *dentry)
+{
+	struct inode *dir = dentry->d_inode;
+	struct dentry *work;
+	int err;
+	bool retried = false;
+
+	err = mnt_want_write(mnt);
+	if (err)
+		return ERR_PTR(err);
+
+	mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+retry:
+	work = lookup_one_len(OVL_WORKDIR_NAME, dentry,
+			      strlen(OVL_WORKDIR_NAME));
+
+	if (!IS_ERR(work)) {
+		struct kstat stat = {
+			.mode = S_IFDIR | 0,
+		};
+
+		if (work->d_inode) {
+			err = -EEXIST;
+			if (retried)
+				goto out_dput;
+
+			retried = true;
+			ovl_cleanup(dir, work);
+			dput(work);
+			goto retry;
+		}
+
+		err = ovl_create_real(dir, work, &stat, NULL, NULL, true);
+		if (err)
+			goto out_dput;
+	}
+out_unlock:
+	mutex_unlock(&dir->i_mutex);
+	mnt_drop_write(mnt);
+
+	return work;
+
+out_dput:
+	dput(work);
+	work = ERR_PTR(err);
+	goto out_unlock;
+}
+
+static int ovl_mount_dir(const char *name, struct path *path)
+{
+	int err;
+
+	err = kern_path(name, LOOKUP_FOLLOW, path);
+	if (err) {
+		pr_err("overlayfs: failed to resolve '%s': %i\n", name, err);
+		err = -EINVAL;
+	}
+	return err;
+}
+
+static bool ovl_is_allowed_fs_type(struct dentry *root)
+{
+	const struct dentry_operations *dop = root->d_op;
+
+	/*
+	 * We don't support:
+	 *  - automount filesystems
+	 *  - filesystems with revalidate (FIXME for lower layer)
+	 *  - filesystems with case insensitive names
+	 */
+	if (dop &&
+	    (dop->d_manage || dop->d_automount ||
+	     dop->d_revalidate || dop->d_weak_revalidate ||
+	     dop->d_compare || dop->d_hash)) {
+		return false;
+	}
+	return true;
+}
+
+/* Workdir should not be subdir of upperdir and vice versa */
+static bool ovl_workdir_ok(struct dentry *workdir, struct dentry *upperdir)
+{
+	bool ok = false;
+
+	if (workdir != upperdir) {
+		ok = (lock_rename(workdir, upperdir) == NULL);
+		unlock_rename(workdir, upperdir);
+	}
+	return ok;
+}
+
+static int ovl_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct path lowerpath;
+	struct path upperpath;
+	struct path workpath;
+	struct inode *root_inode;
+	struct dentry *root_dentry;
+	struct ovl_entry *oe;
+	struct ovl_fs *ufs;
+	struct ovl_config config;
+	int err;
+
+	err = ovl_parse_opt((char *) data, &config);
+	if (err)
+		goto out;
+
+	/* FIXME: workdir is not needed for a R/O mount */
+	err = -EINVAL;
+	if (!config.upperdir || !config.lowerdir || !config.workdir) {
+		pr_err("overlayfs: missing upperdir or lowerdir or workdir\n");
+		goto out_free_config;
+	}
+
+	err = -ENOMEM;
+	ufs = kmalloc(sizeof(struct ovl_fs), GFP_KERNEL);
+	if (!ufs)
+		goto out_free_config;
+
+	oe = ovl_alloc_entry();
+	if (oe == NULL)
+		goto out_free_ufs;
+
+	err = ovl_mount_dir(config.upperdir, &upperpath);
+	if (err)
+		goto out_free_oe;
+
+	err = ovl_mount_dir(config.lowerdir, &lowerpath);
+	if (err)
+		goto out_put_upperpath;
+
+	err = ovl_mount_dir(config.workdir, &workpath);
+	if (err)
+		goto out_put_lowerpath;
+
+	err = -EINVAL;
+	if (!S_ISDIR(upperpath.dentry->d_inode->i_mode) ||
+	    !S_ISDIR(lowerpath.dentry->d_inode->i_mode) ||
+	    !S_ISDIR(workpath.dentry->d_inode->i_mode)) {
+		pr_err("overlayfs: upperdir or lowerdir or workdir not a directory\n");
+		goto out_put_workpath;
+	}
+
+	if (upperpath.mnt != workpath.mnt) {
+		pr_err("overlayfs: workdir and upperdir must reside under the same mount\n");
+		goto out_put_workpath;
+	}
+	if (!ovl_workdir_ok(workpath.dentry, upperpath.dentry)) {
+		pr_err("overlayfs: workdir and upperdir must be separate subtrees\n");
+		goto out_put_workpath;
+	}
+
+	if (!ovl_is_allowed_fs_type(upperpath.dentry)) {
+		pr_err("overlayfs: filesystem of upperdir is not supported\n");
+		goto out_put_workpath;
+	}
+
+	if (!ovl_is_allowed_fs_type(lowerpath.dentry)) {
+		pr_err("overlayfs: filesystem of lowerdir is not supported\n");
+		goto out_put_workpath;
+	}
+
+	ufs->upper_mnt = clone_private_mount(&upperpath);
+	err = PTR_ERR(ufs->upper_mnt);
+	if (IS_ERR(ufs->upper_mnt)) {
+		pr_err("overlayfs: failed to clone upperpath\n");
+		goto out_put_workpath;
+	}
+
+	ufs->lower_mnt = clone_private_mount(&lowerpath);
+	err = PTR_ERR(ufs->lower_mnt);
+	if (IS_ERR(ufs->lower_mnt)) {
+		pr_err("overlayfs: failed to clone lowerpath\n");
+		goto out_put_upper_mnt;
+	}
+
+	ufs->workdir = ovl_workdir_create(ufs->upper_mnt, workpath.dentry);
+	err = PTR_ERR(ufs->workdir);
+	if (IS_ERR(ufs->workdir)) {
+		pr_err("overlayfs: failed to create directory %s/%s\n",
+		       config.workdir, OVL_WORKDIR_NAME);
+		goto out_put_lower_mnt;
+	}
+
+	/*
+	 * Make lower_mnt R/O.  That way fchmod/fchown on lower file
+	 * will fail instead of modifying lower fs.
+	 */
+	ufs->lower_mnt->mnt_flags |= MNT_READONLY;
+
+	/* If the upper fs is r/o, we mark overlayfs r/o too */
+	if (ufs->upper_mnt->mnt_sb->s_flags & MS_RDONLY)
+		sb->s_flags |= MS_RDONLY;
+
+	sb->s_d_op = &ovl_dentry_operations;
+
+	err = -ENOMEM;
+	root_inode = ovl_new_inode(sb, S_IFDIR, oe);
+	if (!root_inode)
+		goto out_put_workdir;
+
+	root_dentry = d_make_root(root_inode);
+	if (!root_dentry)
+		goto out_put_workdir;
+
+	mntput(upperpath.mnt);
+	mntput(lowerpath.mnt);
+	path_put(&workpath);
+
+	oe->__upperdentry = upperpath.dentry;
+	oe->lowerdentry = lowerpath.dentry;
+
+	root_dentry->d_fsdata = oe;
+
+	sb->s_op = &ovl_super_operations;
+	sb->s_root = root_dentry;
+	sb->s_fs_info = ufs;
+
+	return 0;
+
+out_put_workdir:
+	dput(ufs->workdir);
+out_put_lower_mnt:
+	mntput(ufs->lower_mnt);
+out_put_upper_mnt:
+	mntput(ufs->upper_mnt);
+out_put_workpath:
+	path_put(&workpath);
+out_put_lowerpath:
+	path_put(&lowerpath);
+out_put_upperpath:
+	path_put(&upperpath);
+out_free_oe:
+	kfree(oe);
+out_free_ufs:
+	kfree(ufs);
+out_free_config:
+	kfree(config.lowerdir);
+	kfree(config.upperdir);
+	kfree(config.workdir);
+out:
+	return err;
+}
+
+static struct dentry *ovl_mount(struct file_system_type *fs_type, int flags,
+				const char *dev_name, void *raw_data)
+{
+	return mount_nodev(fs_type, flags, raw_data, ovl_fill_super);
+}
+
+static struct file_system_type ovl_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "overlayfs",
+	.mount		= ovl_mount,
+	.kill_sb	= kill_anon_super,
+};
+MODULE_ALIAS_FS("overlayfs");
+
+static int __init ovl_init(void)
+{
+	return register_filesystem(&ovl_fs_type);
+}
+
+static void __exit ovl_exit(void)
+{
+	unregister_filesystem(&ovl_fs_type);
+}
+
+module_init(ovl_init);
+module_exit(ovl_exit);
-- 
cgit v1.1


From cc2596392af3b1404421aaef828a255303c46f93 Mon Sep 17 00:00:00 2001
From: Andy Whitcroft <apw@canonical.com>
Date: Fri, 24 Oct 2014 00:14:38 +0200
Subject: overlayfs: add statfs support

Add support for statfs to the overlayfs filesystem.  As the upper layer
is the target of all write operations assume that the space in that
filesystem is the space in the overlayfs.  There will be some inaccuracy as
overwriting a file will copy it up and consume space we were not expecting,
but it is better than nothing.

Use the upper layer dentry and mount from the overlayfs root inode,
passing the statfs call to that filesystem.

Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/overlayfs/super.c | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

(limited to 'fs')

diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 227710a..aaf562b 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -16,17 +16,21 @@
 #include <linux/parser.h>
 #include <linux/module.h>
 #include <linux/sched.h>
+#include <linux/statfs.h>
 #include "overlayfs.h"
 
 MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
 MODULE_DESCRIPTION("Overlay filesystem");
 MODULE_LICENSE("GPL");
 
+#define OVERLAYFS_SUPER_MAGIC 0x794c764f
+
 /* private information held for overlayfs's superblock */
 struct ovl_fs {
 	struct vfsmount *upper_mnt;
 	struct vfsmount *lower_mnt;
 	struct dentry *workdir;
+	long lower_namelen;
 };
 
 struct ovl_dir_cache;
@@ -383,8 +387,35 @@ static void ovl_put_super(struct super_block *sb)
 	kfree(ufs);
 }
 
+/**
+ * ovl_statfs
+ * @sb: The overlayfs super block
+ * @buf: The struct kstatfs to fill in with stats
+ *
+ * Get the filesystem statistics.  As writes always target the upper layer
+ * filesystem pass the statfs to the same filesystem.
+ */
+static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+	struct dentry *root_dentry = dentry->d_sb->s_root;
+	struct path path;
+	int err;
+
+	ovl_path_upper(root_dentry, &path);
+
+	err = vfs_statfs(&path, buf);
+	if (!err) {
+		buf->f_namelen = max(buf->f_namelen, ofs->lower_namelen);
+		buf->f_type = OVERLAYFS_SUPER_MAGIC;
+	}
+
+	return err;
+}
+
 static const struct super_operations ovl_super_operations = {
 	.put_super	= ovl_put_super,
+	.statfs		= ovl_statfs,
 };
 
 struct ovl_config {
@@ -556,6 +587,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
 	struct ovl_entry *oe;
 	struct ovl_fs *ufs;
 	struct ovl_config config;
+	struct kstatfs statfs;
 	int err;
 
 	err = ovl_parse_opt((char *) data, &config);
@@ -617,6 +649,13 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
 		goto out_put_workpath;
 	}
 
+	err = vfs_statfs(&lowerpath, &statfs);
+	if (err) {
+		pr_err("overlayfs: statfs failed on lowerpath\n");
+		goto out_put_workpath;
+	}
+	ufs->lower_namelen = statfs.f_namelen;
+
 	ufs->upper_mnt = clone_private_mount(&upperpath);
 	err = PTR_ERR(ufs->upper_mnt);
 	if (IS_ERR(ufs->upper_mnt)) {
@@ -669,6 +708,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
 
 	root_dentry->d_fsdata = oe;
 
+	sb->s_magic = OVERLAYFS_SUPER_MAGIC;
 	sb->s_op = &ovl_super_operations;
 	sb->s_root = root_dentry;
 	sb->s_fs_info = ufs;
-- 
cgit v1.1


From f45827e84186af152492c6d0dcf4105b4a605f9b Mon Sep 17 00:00:00 2001
From: Erez Zadok <ezk@fsl.cs.sunysb.edu>
Date: Fri, 24 Oct 2014 00:14:38 +0200
Subject: overlayfs: implement show_options

This is useful because of the stacking nature of overlayfs.  Users like to
find out (via /proc/mounts) which lower/upper directory were used at mount
time.

AV: even failing ovl_parse_opt() could've done some kstrdup()
AV: failure of ovl_alloc_entry() should end up with ENOMEM, not EINVAL

Signed-off-by: Erez Zadok <ezk@cs.sunysb.edu>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/overlayfs/super.c | 76 +++++++++++++++++++++++++++++++++-------------------
 1 file changed, 48 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index aaf562b..7dcc24e 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -17,6 +17,7 @@
 #include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/statfs.h>
+#include <linux/seq_file.h>
 #include "overlayfs.h"
 
 MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
@@ -25,12 +26,20 @@ MODULE_LICENSE("GPL");
 
 #define OVERLAYFS_SUPER_MAGIC 0x794c764f
 
+struct ovl_config {
+	char *lowerdir;
+	char *upperdir;
+	char *workdir;
+};
+
 /* private information held for overlayfs's superblock */
 struct ovl_fs {
 	struct vfsmount *upper_mnt;
 	struct vfsmount *lower_mnt;
 	struct dentry *workdir;
 	long lower_namelen;
+	/* pathnames of lower and upper dirs, for show_options */
+	struct ovl_config config;
 };
 
 struct ovl_dir_cache;
@@ -384,6 +393,9 @@ static void ovl_put_super(struct super_block *sb)
 	mntput(ufs->upper_mnt);
 	mntput(ufs->lower_mnt);
 
+	kfree(ufs->config.lowerdir);
+	kfree(ufs->config.upperdir);
+	kfree(ufs->config.workdir);
 	kfree(ufs);
 }
 
@@ -413,15 +425,27 @@ static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf)
 	return err;
 }
 
+/**
+ * ovl_show_options
+ *
+ * Prints the mount options for a given superblock.
+ * Returns zero; does not fail.
+ */
+static int ovl_show_options(struct seq_file *m, struct dentry *dentry)
+{
+	struct super_block *sb = dentry->d_sb;
+	struct ovl_fs *ufs = sb->s_fs_info;
+
+	seq_printf(m, ",lowerdir=%s", ufs->config.lowerdir);
+	seq_printf(m, ",upperdir=%s", ufs->config.upperdir);
+	seq_printf(m, ",workdir=%s", ufs->config.workdir);
+	return 0;
+}
+
 static const struct super_operations ovl_super_operations = {
 	.put_super	= ovl_put_super,
 	.statfs		= ovl_statfs,
-};
-
-struct ovl_config {
-	char *lowerdir;
-	char *upperdir;
-	char *workdir;
+	.show_options	= ovl_show_options,
 };
 
 enum {
@@ -442,10 +466,6 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config)
 {
 	char *p;
 
-	config->upperdir = NULL;
-	config->lowerdir = NULL;
-	config->workdir = NULL;
-
 	while ((p = strsep(&opt, ",")) != NULL) {
 		int token;
 		substring_t args[MAX_OPT_ARGS];
@@ -586,39 +606,40 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
 	struct dentry *root_dentry;
 	struct ovl_entry *oe;
 	struct ovl_fs *ufs;
-	struct ovl_config config;
 	struct kstatfs statfs;
 	int err;
 
-	err = ovl_parse_opt((char *) data, &config);
-	if (err)
+	err = -ENOMEM;
+	ufs = kzalloc(sizeof(struct ovl_fs), GFP_KERNEL);
+	if (!ufs)
 		goto out;
 
+	err = ovl_parse_opt((char *) data, &ufs->config);
+	if (err)
+		goto out_free_config;
+
 	/* FIXME: workdir is not needed for a R/O mount */
 	err = -EINVAL;
-	if (!config.upperdir || !config.lowerdir || !config.workdir) {
+	if (!ufs->config.upperdir || !ufs->config.lowerdir ||
+	    !ufs->config.workdir) {
 		pr_err("overlayfs: missing upperdir or lowerdir or workdir\n");
 		goto out_free_config;
 	}
 
 	err = -ENOMEM;
-	ufs = kmalloc(sizeof(struct ovl_fs), GFP_KERNEL);
-	if (!ufs)
-		goto out_free_config;
-
 	oe = ovl_alloc_entry();
 	if (oe == NULL)
-		goto out_free_ufs;
+		goto out_free_config;
 
-	err = ovl_mount_dir(config.upperdir, &upperpath);
+	err = ovl_mount_dir(ufs->config.upperdir, &upperpath);
 	if (err)
 		goto out_free_oe;
 
-	err = ovl_mount_dir(config.lowerdir, &lowerpath);
+	err = ovl_mount_dir(ufs->config.lowerdir, &lowerpath);
 	if (err)
 		goto out_put_upperpath;
 
-	err = ovl_mount_dir(config.workdir, &workpath);
+	err = ovl_mount_dir(ufs->config.workdir, &workpath);
 	if (err)
 		goto out_put_lowerpath;
 
@@ -674,7 +695,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
 	err = PTR_ERR(ufs->workdir);
 	if (IS_ERR(ufs->workdir)) {
 		pr_err("overlayfs: failed to create directory %s/%s\n",
-		       config.workdir, OVL_WORKDIR_NAME);
+		       ufs->config.workdir, OVL_WORKDIR_NAME);
 		goto out_put_lower_mnt;
 	}
 
@@ -729,12 +750,11 @@ out_put_upperpath:
 	path_put(&upperpath);
 out_free_oe:
 	kfree(oe);
-out_free_ufs:
-	kfree(ufs);
 out_free_config:
-	kfree(config.lowerdir);
-	kfree(config.upperdir);
-	kfree(config.workdir);
+	kfree(ufs->config.lowerdir);
+	kfree(ufs->config.upperdir);
+	kfree(ufs->config.workdir);
+	kfree(ufs);
 out:
 	return err;
 }
-- 
cgit v1.1


From 69c433ed2ecd2d3264efd7afec4439524b319121 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Fri, 24 Oct 2014 00:14:39 +0200
Subject: fs: limit filesystem stacking depth

Add a simple read-only counter to super_block that indicates how deep this
is in the stack of filesystems.  Previously ecryptfs was the only stackable
filesystem and it explicitly disallowed multiple layers of itself.

Overlayfs, however, can be stacked recursively and also may be stacked
on top of ecryptfs or vice versa.

To limit the kernel stack usage we must limit the depth of the
filesystem stack.  Initially the limit is set to 2.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/ecryptfs/main.c   | 7 +++++++
 fs/overlayfs/super.c | 9 +++++++++
 2 files changed, 16 insertions(+)

(limited to 'fs')

diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 1b119d3..c4cd1fd 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -566,6 +566,13 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
 	s->s_maxbytes = path.dentry->d_sb->s_maxbytes;
 	s->s_blocksize = path.dentry->d_sb->s_blocksize;
 	s->s_magic = ECRYPTFS_SUPER_MAGIC;
+	s->s_stack_depth = path.dentry->d_sb->s_stack_depth + 1;
+
+	rc = -EINVAL;
+	if (s->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) {
+		pr_err("eCryptfs: maximum fs stacking depth exceeded\n");
+		goto out_free;
+	}
 
 	inode = ecryptfs_get_inode(path.dentry->d_inode, s);
 	rc = PTR_ERR(inode);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 7dcc24e..08b704c 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -677,6 +677,15 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
 	}
 	ufs->lower_namelen = statfs.f_namelen;
 
+	sb->s_stack_depth = max(upperpath.mnt->mnt_sb->s_stack_depth,
+				lowerpath.mnt->mnt_sb->s_stack_depth) + 1;
+
+	err = -EINVAL;
+	if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) {
+		pr_err("overlayfs: maximum fs stacking depth exceeded\n");
+		goto out_put_workpath;
+	}
+
 	ufs->upper_mnt = clone_private_mount(&upperpath);
 	err = PTR_ERR(ufs->upper_mnt);
 	if (IS_ERR(ufs->upper_mnt)) {
-- 
cgit v1.1


From 51486b900ee92856b977eacfc5bfbe6565028070 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 23 Oct 2014 13:26:21 -0400
Subject: fix inode leaks on d_splice_alias() failure exits

d_splice_alias() callers expect it to either stash the inode reference
into a new alias, or drop the inode reference.  That makes it possible
to just return d_splice_alias() result from ->lookup() instance, without
any extra housekeeping required.

Unfortunately, that should include the failure exits.  If d_splice_alias()
returns an error, it leaves the dentry it has been given negative and
thus it *must* drop the inode reference.  Easily fixed, but it goes way
back and will need backporting.

Cc: stable@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index d5a23fd..3ffef7f 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2673,11 +2673,13 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
 			if (!IS_ROOT(new)) {
 				spin_unlock(&inode->i_lock);
 				dput(new);
+				iput(inode);
 				return ERR_PTR(-EIO);
 			}
 			if (d_ancestor(new, dentry)) {
 				spin_unlock(&inode->i_lock);
 				dput(new);
+				iput(inode);
 				return ERR_PTR(-EIO);
 			}
 			write_seqlock(&rename_lock);
-- 
cgit v1.1


From 3d268c9b136f51385f9d041f3f2424501b257388 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 23 Oct 2014 22:56:05 -0400
Subject: overlayfs: don't hold ->i_mutex over opening the real directory

just use it to serialize the assignment

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/overlayfs/readdir.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index c6787f8..b7d9fb0 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -458,20 +458,27 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
 	if (!od->is_upper && ovl_path_type(dentry) == OVL_PATH_MERGE) {
 		struct inode *inode = file_inode(file);
 
-		mutex_lock(&inode->i_mutex);
 		realfile = od->upperfile;
 		if (!realfile) {
 			struct path upperpath;
 
 			ovl_path_upper(dentry, &upperpath);
 			realfile = ovl_path_open(&upperpath, O_RDONLY);
-			if (IS_ERR(realfile)) {
-				mutex_unlock(&inode->i_mutex);
-				return PTR_ERR(realfile);
+			mutex_lock(&inode->i_mutex);
+			if (!od->upperfile) {
+				if (IS_ERR(realfile)) {
+					mutex_unlock(&inode->i_mutex);
+					return PTR_ERR(realfile);
+				}
+				od->upperfile = realfile;
+			} else {
+				/* somebody has beaten us to it */
+				if (!IS_ERR(realfile))
+					fput(realfile);
+				realfile = od->upperfile;
 			}
-			od->upperfile = realfile;
+			mutex_unlock(&inode->i_mutex);
 		}
-		mutex_unlock(&inode->i_mutex);
 	}
 
 	return vfs_fsync_range(realfile, start, end, datasync);
-- 
cgit v1.1


From 68bf8611076a8e4bee8bc8d03ff28bd1e9a9c631 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 23 Oct 2014 22:58:56 -0400
Subject: overlayfs: make ovl_cache_entry->name an array instead of pointer

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/overlayfs/readdir.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index b7d9fb0..9c9872b 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -18,13 +18,13 @@
 #include "overlayfs.h"
 
 struct ovl_cache_entry {
-	const char *name;
 	unsigned int len;
 	unsigned int type;
 	u64 ino;
 	bool is_whiteout;
 	struct list_head l_node;
 	struct rb_node node;
+	char name[];
 };
 
 struct ovl_dir_cache {
@@ -82,13 +82,12 @@ static struct ovl_cache_entry *ovl_cache_entry_new(const char *name, int len,
 						   u64 ino, unsigned int d_type)
 {
 	struct ovl_cache_entry *p;
+	size_t size = offsetof(struct ovl_cache_entry, name[len + 1]);
 
-	p = kmalloc(sizeof(*p) + len + 1, GFP_KERNEL);
+	p = kmalloc(size, GFP_KERNEL);
 	if (p) {
-		char *name_copy = (char *) (p + 1);
-		memcpy(name_copy, name, len);
-		name_copy[len] = '\0';
-		p->name = name_copy;
+		memcpy(p->name, name, len);
+		p->name[len] = '\0';
 		p->len = len;
 		p->type = d_type;
 		p->ino = ino;
-- 
cgit v1.1


From 49be4fb9cc3431fc4ebc71c764db848483b2a16c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 23 Oct 2014 23:00:53 -0400
Subject: overlayfs: embed root into overlay_readdir_data

no sense having it a pointer - all instances have it pointing to
local variable in the same stack frame

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/overlayfs/readdir.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 9c9872b..a9ee2c1 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -36,7 +36,7 @@ struct ovl_dir_cache {
 struct ovl_readdir_data {
 	struct dir_context ctx;
 	bool is_merge;
-	struct rb_root *root;
+	struct rb_root root;
 	struct list_head *list;
 	struct list_head *middle;
 	int count;
@@ -101,7 +101,7 @@ static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
 				  const char *name, int len, u64 ino,
 				  unsigned int d_type)
 {
-	struct rb_node **newp = &rdd->root->rb_node;
+	struct rb_node **newp = &rdd->root.rb_node;
 	struct rb_node *parent = NULL;
 	struct ovl_cache_entry *p;
 
@@ -126,7 +126,7 @@ static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
 
 	list_add_tail(&p->l_node, rdd->list);
 	rb_link_node(&p->node, parent, newp);
-	rb_insert_color(&p->node, rdd->root);
+	rb_insert_color(&p->node, &rdd->root);
 
 	return 0;
 }
@@ -137,7 +137,7 @@ static int ovl_fill_lower(struct ovl_readdir_data *rdd,
 {
 	struct ovl_cache_entry *p;
 
-	p = ovl_cache_entry_find(rdd->root, name, namelen);
+	p = ovl_cache_entry_find(&rdd->root, name, namelen);
 	if (p) {
 		list_move_tail(&p->l_node, rdd->middle);
 	} else {
@@ -277,12 +277,11 @@ static inline int ovl_dir_read_merged(struct path *upperpath,
 				      struct list_head *list)
 {
 	int err;
-	struct rb_root root = RB_ROOT;
 	struct list_head middle;
 	struct ovl_readdir_data rdd = {
 		.ctx.actor = ovl_fill_merge,
 		.list = list,
-		.root = &root,
+		.root = RB_ROOT,
 		.is_merge = false,
 	};
 
-- 
cgit v1.1


From db6ec212b53abc29a5bb6ac8c810010fc28d5191 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 23 Oct 2014 23:03:03 -0400
Subject: overlayfs: embed middle into overlay_readdir_data

same story...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/overlayfs/readdir.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index a9ee2c1..910553f 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -38,7 +38,7 @@ struct ovl_readdir_data {
 	bool is_merge;
 	struct rb_root root;
 	struct list_head *list;
-	struct list_head *middle;
+	struct list_head middle;
 	int count;
 	int err;
 };
@@ -139,13 +139,13 @@ static int ovl_fill_lower(struct ovl_readdir_data *rdd,
 
 	p = ovl_cache_entry_find(&rdd->root, name, namelen);
 	if (p) {
-		list_move_tail(&p->l_node, rdd->middle);
+		list_move_tail(&p->l_node, &rdd->middle);
 	} else {
 		p = ovl_cache_entry_new(name, namelen, ino, d_type);
 		if (p == NULL)
 			rdd->err = -ENOMEM;
 		else
-			list_add_tail(&p->l_node, rdd->middle);
+			list_add_tail(&p->l_node, &rdd->middle);
 	}
 
 	return rdd->err;
@@ -277,7 +277,6 @@ static inline int ovl_dir_read_merged(struct path *upperpath,
 				      struct list_head *list)
 {
 	int err;
-	struct list_head middle;
 	struct ovl_readdir_data rdd = {
 		.ctx.actor = ovl_fill_merge,
 		.list = list,
@@ -301,11 +300,10 @@ static inline int ovl_dir_read_merged(struct path *upperpath,
 		 * Insert lowerpath entries before upperpath ones, this allows
 		 * offsets to be reasonably constant
 		 */
-		list_add(&middle, rdd.list);
-		rdd.middle = &middle;
+		list_add(&rdd.middle, rdd.list);
 		rdd.is_merge = true;
 		err = ovl_dir_read(lowerpath, &rdd);
-		list_del(&middle);
+		list_del(&rdd.middle);
 	}
 out:
 	return err;
-- 
cgit v1.1


From 21e7626b12f25770e2975bc7c7b2e1d5b1d58a57 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Mon, 27 Oct 2014 13:52:21 +0100
Subject: btrfs: use macro accessors in superblock validation checks

The initial patch c926093ec516f5d316 (btrfs: add more superblock checks)
did not properly use the macro accessors that wrap endianness and the
code would not work correctly on big endian machines.

Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/disk-io.c | 43 ++++++++++++++++++++++---------------------
 1 file changed, 22 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2409718..1ae1661 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3817,19 +3817,19 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
 	struct btrfs_super_block *sb = fs_info->super_copy;
 	int ret = 0;
 
-	if (sb->root_level > BTRFS_MAX_LEVEL) {
-		printk(KERN_ERR "BTRFS: tree_root level too big: %d > %d\n",
-				sb->root_level, BTRFS_MAX_LEVEL);
+	if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
+		printk(KERN_ERR "BTRFS: tree_root level too big: %d >= %d\n",
+				btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
 		ret = -EINVAL;
 	}
-	if (sb->chunk_root_level > BTRFS_MAX_LEVEL) {
-		printk(KERN_ERR "BTRFS: chunk_root level too big: %d > %d\n",
-				sb->chunk_root_level, BTRFS_MAX_LEVEL);
+	if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) {
+		printk(KERN_ERR "BTRFS: chunk_root level too big: %d >= %d\n",
+				btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL);
 		ret = -EINVAL;
 	}
-	if (sb->log_root_level > BTRFS_MAX_LEVEL) {
-		printk(KERN_ERR "BTRFS: log_root level too big: %d > %d\n",
-				sb->log_root_level, BTRFS_MAX_LEVEL);
+	if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) {
+		printk(KERN_ERR "BTRFS: log_root level too big: %d >= %d\n",
+				btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL);
 		ret = -EINVAL;
 	}
 
@@ -3837,15 +3837,15 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
 	 * The common minimum, we don't know if we can trust the nodesize/sectorsize
 	 * items yet, they'll be verified later. Issue just a warning.
 	 */
-	if (!IS_ALIGNED(sb->root, 4096))
+	if (!IS_ALIGNED(btrfs_super_root(sb), 4096))
 		printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
 				sb->root);
-	if (!IS_ALIGNED(sb->chunk_root, 4096))
+	if (!IS_ALIGNED(btrfs_super_chunk_root(sb), 4096))
 		printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
 				sb->chunk_root);
-	if (!IS_ALIGNED(sb->log_root, 4096))
+	if (!IS_ALIGNED(btrfs_super_log_root(sb), 4096))
 		printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
-				sb->log_root);
+				btrfs_super_log_root(sb));
 
 	if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) {
 		printk(KERN_ERR "BTRFS: dev_item UUID does not match fsid: %pU != %pU\n",
@@ -3857,13 +3857,13 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
 	 * Hint to catch really bogus numbers, bitflips or so, more exact checks are
 	 * done later
 	 */
-	if (sb->num_devices > (1UL << 31))
+	if (btrfs_super_num_devices(sb) > (1UL << 31))
 		printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n",
-				sb->num_devices);
+				btrfs_super_num_devices(sb));
 
-	if (sb->bytenr != BTRFS_SUPER_INFO_OFFSET) {
+	if (btrfs_super_bytenr(sb) != BTRFS_SUPER_INFO_OFFSET) {
 		printk(KERN_ERR "BTRFS: super offset mismatch %llu != %u\n",
-				sb->bytenr, BTRFS_SUPER_INFO_OFFSET);
+				btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET);
 		ret = -EINVAL;
 	}
 
@@ -3871,14 +3871,15 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
 	 * The generation is a global counter, we'll trust it more than the others
 	 * but it's still possible that it's the one that's wrong.
 	 */
-	if (sb->generation < sb->chunk_root_generation)
+	if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb))
 		printk(KERN_WARNING
 			"BTRFS: suspicious: generation < chunk_root_generation: %llu < %llu\n",
-			sb->generation, sb->chunk_root_generation);
-	if (sb->generation < sb->cache_generation && sb->cache_generation != (u64)-1)
+			btrfs_super_generation(sb), btrfs_super_chunk_root_generation(sb));
+	if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb)
+	    && btrfs_super_cache_generation(sb) != (u64)-1)
 		printk(KERN_WARNING
 			"BTRFS: suspicious: generation < cache_generation: %llu < %llu\n",
-			sb->generation, sb->cache_generation);
+			btrfs_super_generation(sb), btrfs_super_cache_generation(sb));
 
 	return ret;
 }
-- 
cgit v1.1


From 1a4ed8fdca077d2489ec47d548451be69389e926 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 27 Oct 2014 10:44:24 +0000
Subject: Btrfs: fix invalid leaf slot access in btrfs_lookup_extent()

If we couldn't find our extent item, we accessed the current slot
(path->slots[0]) to check if it corresponds to an equivalent skinny
metadata item. However this slot could be beyond our last item in the
leaf (i.e. path->slots[0] >= btrfs_header_nritems(leaf)), in which case
we shouldn't process it.

Since btrfs_lookup_extent() is only used to find extent items for data
extents, fix this by removing completely the logic that looks up for an
equivalent skinny metadata item, since it can not exist.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ctree.h       |  2 +-
 fs/btrfs/extent-tree.c | 10 ++--------
 fs/btrfs/tree-log.c    |  2 +-
 3 files changed, 4 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index d557264e..fe69edd 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3276,7 +3276,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, unsigned long count);
 int btrfs_async_run_delayed_refs(struct btrfs_root *root,
 				 unsigned long count, int wait);
-int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
+int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len);
 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root, u64 bytenr,
 			     u64 offset, int metadata, u64 *refs, u64 *flags);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0d599ba..87c0b46f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -710,8 +710,8 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
 	rcu_read_unlock();
 }
 
-/* simple helper to search for an existing extent at a given offset */
-int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
+/* simple helper to search for an existing data extent at a given offset */
+int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len)
 {
 	int ret;
 	struct btrfs_key key;
@@ -726,12 +726,6 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
 	key.type = BTRFS_EXTENT_ITEM_KEY;
 	ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
 				0, 0);
-	if (ret > 0) {
-		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
-		if (key.objectid == start &&
-		    key.type == BTRFS_METADATA_ITEM_KEY)
-			ret = 0;
-	}
 	btrfs_free_path(path);
 	return ret;
 }
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 2b26dad..6d58d72 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -672,7 +672,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 			 * is this extent already allocated in the extent
 			 * allocation tree?  If so, just add a reference
 			 */
-			ret = btrfs_lookup_extent(root, ins.objectid,
+			ret = btrfs_lookup_data_extent(root, ins.objectid,
 						ins.offset);
 			if (ret == 0) {
 				ret = btrfs_inc_extent_ref(trans, root,
-- 
cgit v1.1


From 5ed5f5884116e3841da626d201ef068f23232a3a Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Wed, 15 Oct 2014 17:19:59 -0400
Subject: Btrfs: properly clean up btrfs_end_io_wq_cache

In one of Dave's cleanup commits he forgot to call btrfs_end_io_wq_exit on
unload, which makes us unable to unload and then re-load the btrfs module.  This
fixes the problem.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: David Sterba <dsterba@suse.cz>
Reviewed-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/super.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index a2b97ef..54bd91e 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2151,6 +2151,7 @@ static void __exit exit_btrfs_fs(void)
 	extent_map_exit();
 	extent_io_exit();
 	btrfs_interface_exit();
+	btrfs_end_io_wq_exit();
 	unregister_filesystem(&btrfs_fs_type);
 	btrfs_exit_sysfs();
 	btrfs_cleanup_fs_uuids();
-- 
cgit v1.1


From d05a2b4cd97071462e77e6a7a8f109c36307182a Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 27 Oct 2014 09:19:52 +0000
Subject: Btrfs: fix race that makes btrfs_lookup_extent_info miss skinny
 extent items

We have a race that can lead us to miss skinny extent items in the function
btrfs_lookup_extent_info() when the skinny metadata feature is enabled.
So basically the sequence of steps is:

1) We search in the extent tree for the skinny extent, which returns > 0
   (not found);

2) We check the previous item in the returned leaf for a non-skinny extent,
   and we don't find it;

3) Because we didn't find the non-skinny extent in step 2), we release our
   path to search the extent tree again, but this time for a non-skinny
   extent key;

4) Right after we released our path in step 3), a skinny extent was inserted
   in the extent tree (delayed refs were run) - our second extent tree search
   will miss it, because it's not looking for a skinny extent;

5) After the second search returned (with ret > 0), we look for any delayed
   ref for our extent's bytenr (and we do it while holding a read lock on the
   leaf), but we won't find any, as such delayed ref had just run and completed
   after we released out path in step 3) before doing the second search.

Fix this by removing completely the path release and re-search logic. This is
safe, because if we seach for a metadata item and we don't find it, we have the
guarantee that the returned leaf is the one where the item would be inserted,
and so path->slots[0] > 0 and path->slots[0] - 1 must be the slot where the
non-skinny extent item is if it exists. The only case where path->slots[0] is
zero is when there are no smaller keys in the tree (i.e. no left siblings for
our leaf), in which case the re-search logic isn't needed as well.

This race has been present since the introduction of skinny metadata (change
3173a18f70554fe7880bb2d85c7da566e364eb3c).

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent-tree.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 87c0b46f..a84e00d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -780,7 +780,6 @@ search_again:
 	else
 		key.type = BTRFS_EXTENT_ITEM_KEY;
 
-again:
 	ret = btrfs_search_slot(trans, root->fs_info->extent_root,
 				&key, path, 0, 0);
 	if (ret < 0)
@@ -796,13 +795,6 @@ again:
 			    key.offset == root->nodesize)
 				ret = 0;
 		}
-		if (ret) {
-			key.objectid = bytenr;
-			key.type = BTRFS_EXTENT_ITEM_KEY;
-			key.offset = root->nodesize;
-			btrfs_release_path(path);
-			goto again;
-		}
 	}
 
 	if (ret == 0) {
-- 
cgit v1.1


From a6bbce54efa9145dbcf3029c885549f7ebc40a3b Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 29 Oct 2014 08:22:18 +1100
Subject: xfs: bulkstat doesn't release AGI buffer on error

The recent refactoring of the bulkstat code left a small landmine in
the code. If a inobt read fails, then the tree walk is aborted and
returns without releasing the AGI buffer or freeing the cursor. This
can lead to a subsequent bulkstat call hanging trying to grab the
AGI buffer again.

cc: <stable@vger.kernel.org>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_itable.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index f1deb96..ef8ea05 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -427,7 +427,7 @@ xfs_bulkstat(
 
 			error = xfs_bulkstat_grab_ichunk(cur, agino, &icount, &r);
 			if (error)
-				break;
+				goto del_cursor;
 			if (icount) {
 				irbp->ir_startino = r.ir_startino;
 				irbp->ir_freecount = r.ir_freecount;
@@ -442,7 +442,7 @@ xfs_bulkstat(
 			error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &tmp);
 		}
 		if (error)
-			break;
+			goto del_cursor;
 
 		/*
 		 * Loop through inode btree records in this ag,
@@ -454,7 +454,7 @@ xfs_bulkstat(
 			error = xfs_inobt_get_rec(cur, &r, &i);
 			if (error || i == 0) {
 				end_of_ag = 1;
-				break;
+				goto del_cursor;
 			}
 
 			/*
@@ -476,13 +476,17 @@ xfs_bulkstat(
 			error = xfs_btree_increment(cur, 0, &tmp);
 			cond_resched();
 		}
+
 		/*
-		 * Drop the btree buffers and the agi buffer.
-		 * We can't hold any of the locks these represent
-		 * when calling iget.
+		 * Drop the btree buffers and the agi buffer as we can't hold any
+		 * of the locks these represent when calling iget. If there is a
+		 * pending error, then we are done.
 		 */
+del_cursor:
 		xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
 		xfs_buf_relse(agbp);
+		if (error)
+			break;
 		/*
 		 * Now format all the good inodes into the user's buffer.
 		 */
-- 
cgit v1.1


From d45f00ae43e63eff1b3d79df20610ae1ef645ebd Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 28 Oct 2014 18:27:28 -0400
Subject: overlayfs: barriers for opening upper-layer directory

make sure that
	a) all stores done by opening struct file don't leak past storing
the reference in od->upperfile
	b) the lockless side has read dependency barrier

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/overlayfs/readdir.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 910553f..8c8ce9d 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -454,12 +454,13 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
 	if (!od->is_upper && ovl_path_type(dentry) == OVL_PATH_MERGE) {
 		struct inode *inode = file_inode(file);
 
-		realfile = od->upperfile;
+		realfile =lockless_dereference(od->upperfile);
 		if (!realfile) {
 			struct path upperpath;
 
 			ovl_path_upper(dentry, &upperpath);
 			realfile = ovl_path_open(&upperpath, O_RDONLY);
+			smp_mb__before_spinlock();
 			mutex_lock(&inode->i_mutex);
 			if (!od->upperfile) {
 				if (IS_ERR(realfile)) {
-- 
cgit v1.1


From c2096537d40f026672c4c6adfcd7247ce5799604 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Mon, 27 Oct 2014 13:48:48 +0100
Subject: ovl: fix check for cursor

ovl_cache_entry.name is now an array not a pointer, so it makes no sense
test for it being NULL.

Detected by coverity.

From: Miklos Szeredi <mszeredi@suse.cz>
Fixes: 68bf8611076a ("overlayfs: make ovl_cache_entry->name an array instead of
+pointer")
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/overlayfs/readdir.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 8c8ce9d..3fbf0d3 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -21,9 +21,10 @@ struct ovl_cache_entry {
 	unsigned int len;
 	unsigned int type;
 	u64 ino;
-	bool is_whiteout;
 	struct list_head l_node;
 	struct rb_node node;
+	bool is_whiteout;
+	bool is_cursor;
 	char name[];
 };
 
@@ -251,7 +252,7 @@ static int ovl_dir_mark_whiteouts(struct dentry *dir,
 
 	mutex_lock(&dir->d_inode->i_mutex);
 	list_for_each_entry(p, rdd->list, l_node) {
-		if (!p->name)
+		if (p->is_cursor)
 			continue;
 
 		if (p->type != DT_CHR)
@@ -307,7 +308,6 @@ static inline int ovl_dir_read_merged(struct path *upperpath,
 	}
 out:
 	return err;
-
 }
 
 static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
@@ -316,7 +316,7 @@ static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
 	loff_t off = 0;
 
 	list_for_each_entry(p, &od->cache->entries, l_node) {
-		if (!p->name)
+		if (p->is_cursor)
 			continue;
 		if (off >= pos)
 			break;
@@ -389,7 +389,7 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx)
 
 		p = list_entry(od->cursor.l_node.next, struct ovl_cache_entry, l_node);
 		/* Skip cursors */
-		if (p->name) {
+		if (!p->is_cursor) {
 			if (!p->is_whiteout) {
 				if (!dir_emit(ctx, p->name, p->len, p->ino, p->type))
 					break;
@@ -519,6 +519,7 @@ static int ovl_dir_open(struct inode *inode, struct file *file)
 	od->realfile = realfile;
 	od->is_real = (type != OVL_PATH_MERGE);
 	od->is_upper = (type != OVL_PATH_LOWER);
+	od->cursor.is_cursor = true;
 	file->private_data = od;
 
 	return 0;
-- 
cgit v1.1


From d1b72cc6d8cb766c802fdc70a5edc2f0ba8a2b57 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Mon, 27 Oct 2014 15:42:01 +0100
Subject: overlayfs: fix lockdep misannotation

In an overlay directory that shadows an empty lower directory, say
/mnt/a/empty102, do:

 	touch /mnt/a/empty102/x
 	unlink /mnt/a/empty102/x
 	rmdir /mnt/a/empty102

It's actually harmless, but needs another level of nesting between
I_MUTEX_CHILD and I_MUTEX_NORMAL.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Tested-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c             | 2 +-
 fs/overlayfs/readdir.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 42df664..922f270 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2497,7 +2497,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
 	}
 
 	mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
-	mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
+	mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT2);
 	return NULL;
 }
 EXPORT_SYMBOL(lock_rename);
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 3fbf0d3..401f084 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -571,7 +571,7 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
 {
 	struct ovl_cache_entry *p;
 
-	mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_PARENT);
+	mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_CHILD);
 	list_for_each_entry(p, list, l_node) {
 		struct dentry *dentry;
 
-- 
cgit v1.1


From f643ff550afbc82a2bc7026f4a6d64427e4fbc99 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 28 Oct 2014 18:37:40 -0400
Subject: isofs_cmp(): we'll never see a dentry for . or ..

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/isofs/namei.c | 20 ++------------------
 1 file changed, 2 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index 9529564..6f6dd0c 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -18,23 +18,6 @@ static int
 isofs_cmp(struct dentry *dentry, const char *compare, int dlen)
 {
 	struct qstr qstr;
-
-	if (!compare)
-		return 1;
-
-	/* check special "." and ".." files */
-	if (dlen == 1) {
-		/* "." */
-		if (compare[0] == 0) {
-			if (!dentry->d_name.len)
-				return 0;
-			compare = ".";
-		} else if (compare[0] == 1) {
-			compare = "..";
-			dlen = 2;
-		}
-	}
-
 	qstr.name = compare;
 	qstr.len = dlen;
 	return dentry->d_op->d_compare(NULL, NULL, dentry->d_name.len, dentry->d_name.name, &qstr);
@@ -146,7 +129,8 @@ isofs_find_entry(struct inode *dir, struct dentry *dentry,
 				(!(de->flags[-sbi->s_high_sierra] & 1))) &&
 			(sbi->s_showassoc ||
 				(!(de->flags[-sbi->s_high_sierra] & 4)))) {
-			match = (isofs_cmp(dentry, dpnt, dlen) == 0);
+			if (dpnt && (dlen > 1 || dpnt[0] > 1))
+				match = (isofs_cmp(dentry, dpnt, dlen) == 0);
 		}
 		if (match) {
 			isofs_normalize_block_and_offset(de,
-- 
cgit v1.1


From 6424babfd68dd8a83d9c60a5242d27038856599f Mon Sep 17 00:00:00 2001
From: Jerry Hoemann <jerry.hoemann@hp.com>
Date: Wed, 29 Oct 2014 14:50:22 -0700
Subject: fsnotify: next_i is freed during fsnotify_unmount_inodes.

During file system stress testing on 3.10 and 3.12 based kernels, the
umount command occasionally hung in fsnotify_unmount_inodes in the
section of code:

                spin_lock(&inode->i_lock);
                if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
                        spin_unlock(&inode->i_lock);
                        continue;
                }

As this section of code holds the global inode_sb_list_lock, eventually
the system hangs trying to acquire the lock.

Multiple crash dumps showed:

The inode->i_state == 0x60 and i_count == 0 and i_sb_list would point
back at itself.  As this is not the value of list upon entry to the
function, the kernel never exits the loop.

To help narrow down problem, the call to list_del_init in
inode_sb_list_del was changed to list_del.  This poisons the pointers in
the i_sb_list and causes a kernel to panic if it transverse a freed
inode.

Subsequent stress testing paniced in fsnotify_unmount_inodes at the
bottom of the list_for_each_entry_safe loop showing next_i had become
free.

We believe the root cause of the problem is that next_i is being freed
during the window of time that the list_for_each_entry_safe loop
temporarily releases inode_sb_list_lock to call fsnotify and
fsnotify_inode_delete.

The code in fsnotify_unmount_inodes attempts to prevent the freeing of
inode and next_i by calling __iget.  However, the code doesn't do the
__iget call on next_i

	if i_count == 0 or
	if i_state & (I_FREEING | I_WILL_FREE)

The patch addresses this issue by advancing next_i in the above two cases
until we either find a next_i which we can __iget or we reach the end of
the list.  This makes the handling of next_i more closely match the
handling of the variable "inode."

The time to reproduce the hang is highly variable (from hours to days.) We
ran the stress test on a 3.10 kernel with the proposed patch for a week
without failure.

During list_for_each_entry_safe, next_i is becoming free causing
the loop to never terminate.  Advance next_i in those cases where
__iget is not done.

Signed-off-by: Jerry Hoemann <jerry.hoemann@hp.com>
Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Cc: Ken Helias <kenhelias@firemail.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/notify/inode_mark.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 9ce0622..e849714 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -288,20 +288,25 @@ void fsnotify_unmount_inodes(struct list_head *list)
 		spin_unlock(&inode->i_lock);
 
 		/* In case the dropping of a reference would nuke next_i. */
-		if ((&next_i->i_sb_list != list) &&
-		    atomic_read(&next_i->i_count)) {
+		while (&next_i->i_sb_list != list) {
 			spin_lock(&next_i->i_lock);
-			if (!(next_i->i_state & (I_FREEING | I_WILL_FREE))) {
+			if (!(next_i->i_state & (I_FREEING | I_WILL_FREE)) &&
+						atomic_read(&next_i->i_count)) {
 				__iget(next_i);
 				need_iput = next_i;
+				spin_unlock(&next_i->i_lock);
+				break;
 			}
 			spin_unlock(&next_i->i_lock);
+			next_i = list_entry(next_i->i_sb_list.next,
+						struct inode, i_sb_list);
 		}
 
 		/*
-		 * We can safely drop inode_sb_list_lock here because we hold
-		 * references on both inode and next_i.  Also no new inodes
-		 * will be added since the umount has begun.
+		 * We can safely drop inode_sb_list_lock here because either
+		 * we actually hold references on both inode and next_i or
+		 * end of list.  Also no new inodes will be added since the
+		 * umount has begun.
 		 */
 		spin_unlock(&inode_sb_list_lock);
 
-- 
cgit v1.1


From d3556babd7facb8fbc596bada0d67139e3b22330 Mon Sep 17 00:00:00 2001
From: Richard Weinberger <richard@nod.at>
Date: Wed, 29 Oct 2014 14:50:53 -0700
Subject: ocfs2: fix d_splice_alias() return code checking

d_splice_alias() can return a valid dentry, NULL or an ERR_PTR.
Currently the code checks not for ERR_PTR and will cuase an oops in
ocfs2_dentry_attach_lock().  Fix this by using IS_ERR_OR_NULL().

Signed-off-by: Richard Weinberger <richard@nod.at>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/namei.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 8add6f1..b931e04 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -158,7 +158,7 @@ bail_add:
 		 * NOTE: This dentry already has ->d_op set from
 		 * ocfs2_get_parent() and ocfs2_get_dentry()
 		 */
-		if (ret)
+		if (!IS_ERR_OR_NULL(ret))
 			dentry = ret;
 
 		status = ocfs2_dentry_attach_lock(dentry, inode,
-- 
cgit v1.1


From 7a19dee116c8fae7ba7a778043c245194289f5a2 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 30 Oct 2014 10:34:52 +1100
Subject: xfs: Check error during inode btree iteration in xfs_bulkstat()

xfs_bulkstat() doesn't check error return from xfs_btree_increment(). In
case of specific fs corruption that could result in xfs_bulkstat()
entering an infinite loop because we would be looping over the same
chunk over and over again. Fix the problem by checking the return value
and terminating the loop properly.

Coverity-id: 1231338
cc: <stable@vger.kernel.org>
Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jie Liu <jeff.u.liu@gmail.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_itable.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index ef8ea05..7765ff7 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -474,6 +474,10 @@ xfs_bulkstat(
 			 */
 			agino = r.ir_startino + XFS_INODES_PER_CHUNK;
 			error = xfs_btree_increment(cur, 0, &tmp);
+			if (error) {
+				end_of_ag = 1;
+				goto del_cursor;
+			}
 			cond_resched();
 		}
 
-- 
cgit v1.1


From 5d11fb4b9a1d90983452c029b5e1377af78fda49 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Thu, 30 Oct 2014 10:35:11 +1100
Subject: xfs: rework zero range to prevent invalid i_size updates

The zero range operation is analogous to fallocate with the exception of
converting the range to zeroes. E.g., it attempts to allocate zeroed
blocks over the range specified by the caller. The XFS implementation
kills all delalloc blocks currently over the aligned range, converts the
range to allocated zero blocks (unwritten extents) and handles the
partial pages at the ends of the range by sending writes through the
pagecache.

The current implementation suffers from several problems associated with
inode size. If the aligned range covers an extending I/O, said I/O is
discarded and an inode size update from a previous write never makes it
to disk. Further, if an unaligned zero range extends beyond eof, the
page write induced for the partial end page can itself increase the
inode size, even if the zero range request is not supposed to update
i_size (via KEEP_SIZE, similar to an fallocate beyond EOF).

The latter behavior not only incorrectly increases the inode size, but
can lead to stray delalloc blocks on the inode. Typically, post-eof
preallocation blocks are either truncated on release or inode eviction
or explicitly written to by xfs_zero_eof() on natural file size
extension. If the inode size increases due to zero range, however,
associated blocks leak into the address space having never been
converted or mapped to pagecache pages. A direct I/O to such an
uncovered range cannot convert the extent via writeback and will BUG().
For example:

$ xfs_io -fc "pwrite 0 128k" -c "fzero -k 1m 54321" <file>
...
$ xfs_io -d -c "pread 128k 128k" <file>
<BUG>

If the entire delalloc extent happens to not have page coverage
whatsoever (e.g., delalloc conversion couldn't find a large enough free
space extent), even a full file writeback won't convert what's left of
the extent and we'll assert on inode eviction.

Rework xfs_zero_file_space() to avoid buffered I/O for partial pages.
Use the existing hole punch and prealloc mechanisms as primitives for
zero range. This implementation is not efficient nor ideal as we
writeback dirty data over the range and remove existing extents rather
than convert to unwrittern. The former writeback, however, is currently
the only mechanism available to ensure consistency between pagecache and
extent state. Even a pagecache truncate/delalloc punch prior to hole
punch has lead to inconsistencies due to racing with writeback.

This provides a consistent, correct implementation of zero range that
survives fsstress/fsx testing without assert failures. The
implementation can be optimized from this point forward once the
fundamental issue of pagecache and delalloc extent state consistency is
addressed.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_bmap_util.c | 72 ++++++++++++++------------------------------------
 1 file changed, 20 insertions(+), 52 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 92e8f99..2810026 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1338,7 +1338,10 @@ xfs_free_file_space(
 	goto out;
 }
 
-
+/*
+ * Preallocate and zero a range of a file. This mechanism has the allocation
+ * semantics of fallocate and in addition converts data in the range to zeroes.
+ */
 int
 xfs_zero_file_space(
 	struct xfs_inode	*ip,
@@ -1346,65 +1349,30 @@ xfs_zero_file_space(
 	xfs_off_t		len)
 {
 	struct xfs_mount	*mp = ip->i_mount;
-	uint			granularity;
-	xfs_off_t		start_boundary;
-	xfs_off_t		end_boundary;
+	uint			blksize;
 	int			error;
 
 	trace_xfs_zero_file_space(ip);
 
-	granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
+	blksize = 1 << mp->m_sb.sb_blocklog;
 
 	/*
-	 * Round the range of extents we are going to convert inwards.  If the
-	 * offset is aligned, then it doesn't get changed so we zero from the
-	 * start of the block offset points to.
+	 * Punch a hole and prealloc the range. We use hole punch rather than
+	 * unwritten extent conversion for two reasons:
+	 *
+	 * 1.) Hole punch handles partial block zeroing for us.
+	 *
+	 * 2.) If prealloc returns ENOSPC, the file range is still zero-valued
+	 * by virtue of the hole punch.
 	 */
-	start_boundary = round_up(offset, granularity);
-	end_boundary = round_down(offset + len, granularity);
-
-	ASSERT(start_boundary >= offset);
-	ASSERT(end_boundary <= offset + len);
-
-	if (start_boundary < end_boundary - 1) {
-		/*
-		 * Writeback the range to ensure any inode size updates due to
-		 * appending writes make it to disk (otherwise we could just
-		 * punch out the delalloc blocks).
-		 */
-		error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
-				start_boundary, end_boundary - 1);
-		if (error)
-			goto out;
-		truncate_pagecache_range(VFS_I(ip), start_boundary,
-					 end_boundary - 1);
-
-		/* convert the blocks */
-		error = xfs_alloc_file_space(ip, start_boundary,
-					end_boundary - start_boundary - 1,
-					XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT);
-		if (error)
-			goto out;
-
-		/* We've handled the interior of the range, now for the edges */
-		if (start_boundary != offset) {
-			error = xfs_iozero(ip, offset, start_boundary - offset);
-			if (error)
-				goto out;
-		}
-
-		if (end_boundary != offset + len)
-			error = xfs_iozero(ip, end_boundary,
-					   offset + len - end_boundary);
-
-	} else {
-		/*
-		 * It's either a sub-granularity range or the range spanned lies
-		 * partially across two adjacent blocks.
-		 */
-		error = xfs_iozero(ip, offset, len);
-	}
+	error = xfs_free_file_space(ip, offset, len);
+	if (error)
+		goto out;
 
+	error = xfs_alloc_file_space(ip, round_down(offset, blksize),
+				     round_up(offset + len, blksize) -
+				     round_down(offset, blksize),
+				     XFS_BMAPI_PREALLOC);
 out:
 	return error;
 
-- 
cgit v1.1


From 9378c6768e4fca48971e7b6a9075bc006eda981d Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 30 Oct 2014 10:52:57 -0400
Subject: ext4: fix overflow when updating superblock backups after resize

When there are no meta block groups update_backups() will compute the
backup block in 32-bit arithmetics thus possibly overflowing the block
number and corrupting the filesystem. OTOH filesystems without meta
block groups larger than 16 TB should be rare. Fix the problem by doing
the counting in 64-bit arithmetics.

Coverity-id: 741252
CC: stable@vger.kernel.org
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Lukas Czerner <lczerner@redhat.com>
---
 fs/ext4/resize.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index f298c60..ca45883 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1081,7 +1081,7 @@ static void update_backups(struct super_block *sb, int blk_off, char *data,
 			break;
 
 		if (meta_bg == 0)
-			backup_block = group * bpg + blk_off;
+			backup_block = ((ext4_fsblk_t)group) * bpg + blk_off;
 		else
 			backup_block = (ext4_group_first_block_no(sb, group) +
 					ext4_bg_has_super(sb, group));
-- 
cgit v1.1


From 599a9b77ab289d85c2d5c8607624efbe1f552b0f Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 30 Oct 2014 10:53:16 -0400
Subject: ext4: fix oops when loading block bitmap failed

When we fail to load block bitmap in __ext4_new_inode() we will
dereference NULL pointer in ext4_journal_get_write_access(). So check
for error from ext4_read_block_bitmap().

Coverity-id: 989065
Cc: stable@vger.kernel.org
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ialloc.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 8012a5d..ac644c3 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -887,6 +887,10 @@ got:
 		struct buffer_head *block_bitmap_bh;
 
 		block_bitmap_bh = ext4_read_block_bitmap(sb, group);
+		if (!block_bitmap_bh) {
+			err = -EIO;
+			goto out;
+		}
 		BUFFER_TRACE(block_bitmap_bh, "get block bitmap access");
 		err = ext4_journal_get_write_access(handle, block_bitmap_bh);
 		if (err) {
-- 
cgit v1.1


From 98c1a7593fa355fda7f5a5940c8bf5326ca964ba Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Thu, 30 Oct 2014 10:53:16 -0400
Subject: ext4: enable journal checksum when metadata checksum feature enabled

If metadata checksumming is turned on for the FS, we need to tell the
journal to use checksumming too.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@vger.kernel.org
---
 fs/ext4/super.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 1eda6ab..5c11e21 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3526,6 +3526,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
 	set_opt(sb, POSIX_ACL);
 #endif
+	/* don't forget to enable journal_csum when metadata_csum is enabled. */
+	if (ext4_has_metadata_csum(sb))
+		set_opt(sb, JOURNAL_CHECKSUM);
+
 	if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
 		set_opt(sb, JOURNAL_DATA);
 	else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
-- 
cgit v1.1


From 6b992ff25658367089db4a82666e232b65d55eae Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Thu, 30 Oct 2014 10:53:16 -0400
Subject: ext4: disallow changing journal_csum option during remount

ext4 does not permit changing the metadata or journal checksum feature
flag while mounted.  Until we decide to support that, don't allow a
remount to change the journal_csum flag (right now we silently fail to
change anything).

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/super.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 5c11e21..96059e0 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4845,6 +4845,14 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 		goto restore_opts;
 	}
 
+	if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
+	    test_opt(sb, JOURNAL_CHECKSUM)) {
+		ext4_msg(sb, KERN_ERR, "changing journal_checksum "
+			 "during remount not supported");
+		err = -EINVAL;
+		goto restore_opts;
+	}
+
 	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
 		if (test_opt2(sb, EXPLICIT_DELALLOC)) {
 			ext4_msg(sb, KERN_ERR, "can't mount with "
-- 
cgit v1.1


From 50460fe8c6d1d95b16427936e351f277a1c72d43 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Thu, 30 Oct 2014 10:53:16 -0400
Subject: ext4: remove extent status procfs files if journal load fails

If we can't load the journal, remove the procfs files for the extent
status information file to avoid leaking resources.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@vger.kernel.org
---
 fs/ext4/super.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 96059e0..2c9e686 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3947,7 +3947,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) &&
 	    !(sb->s_flags & MS_RDONLY))
 		if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
-			goto failed_mount3;
+			goto failed_mount3a;
 
 	/*
 	 * The first inode we look at is the journal inode.  Don't try
@@ -3956,7 +3956,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	if (!test_opt(sb, NOLOAD) &&
 	    EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
 		if (ext4_load_journal(sb, es, journal_devnum))
-			goto failed_mount3;
+			goto failed_mount3a;
 	} else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
 	      EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
 		ext4_msg(sb, KERN_ERR, "required journal recovery "
@@ -4244,6 +4244,7 @@ failed_mount_wq:
 		jbd2_journal_destroy(sbi->s_journal);
 		sbi->s_journal = NULL;
 	}
+failed_mount3a:
 	ext4_es_unregister_shrinker(sbi);
 failed_mount3:
 	del_timer_sync(&sbi->s_err_report);
-- 
cgit v1.1


From a41537e69b4aa43f0fea02498c2595a81267383b Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Thu, 30 Oct 2014 10:53:16 -0400
Subject: ext4: prevent bugon on race between write/fcntl

O_DIRECT flags can be toggeled via fcntl(F_SETFL). But this value checked
twice inside ext4_file_write_iter() and __generic_file_write() which
result in BUG_ON inside ext4_direct_IO.

Let's initialize iocb->private unconditionally.

TESTCASE: xfstest:generic/036  https://patchwork.ozlabs.org/patch/402445/

#TYPICAL STACK TRACE:
kernel BUG at fs/ext4/inode.c:2960!
invalid opcode: 0000 [#1] SMP
Modules linked in: brd iTCO_wdt lpc_ich mfd_core igb ptp dm_mirror dm_region_hash dm_log dm_mod
CPU: 6 PID: 5505 Comm: aio-dio-fcntl-r Not tainted 3.17.0-rc2-00176-gff5c017 #161
Hardware name: Intel Corporation W2600CR/W2600CR, BIOS SE5C600.86B.99.99.x028.061320111235 06/13/2011
task: ffff88080e95a7c0 ti: ffff88080f908000 task.ti: ffff88080f908000
RIP: 0010:[<ffffffff811fabf2>]  [<ffffffff811fabf2>] ext4_direct_IO+0x162/0x3d0
RSP: 0018:ffff88080f90bb58  EFLAGS: 00010246
RAX: 0000000000000400 RBX: ffff88080fdb2a28 RCX: 00000000a802c818
RDX: 0000040000080000 RSI: ffff88080d8aeb80 RDI: 0000000000000001
RBP: ffff88080f90bbc8 R08: 0000000000000000 R09: 0000000000001581
R10: 0000000000000000 R11: 0000000000000000 R12: ffff88080d8aeb80
R13: ffff88080f90bbf8 R14: ffff88080fdb28c8 R15: ffff88080fdb2a28
FS:  00007f23b2055700(0000) GS:ffff880818400000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f23b2045000 CR3: 000000080cedf000 CR4: 00000000000407e0
Stack:
 ffff88080f90bb98 0000000000000000 7ffffffffffffffe ffff88080fdb2c30
 0000000000000200 0000000000000200 0000000000000001 0000000000000200
 ffff88080f90bbc8 ffff88080fdb2c30 ffff88080f90be08 0000000000000200
Call Trace:
 [<ffffffff8112ca9d>] generic_file_direct_write+0xed/0x180
 [<ffffffff8112f2b2>] __generic_file_write_iter+0x222/0x370
 [<ffffffff811f495b>] ext4_file_write_iter+0x34b/0x400
 [<ffffffff811bd709>] ? aio_run_iocb+0x239/0x410
 [<ffffffff811bd709>] ? aio_run_iocb+0x239/0x410
 [<ffffffff810990e5>] ? local_clock+0x25/0x30
 [<ffffffff810abd94>] ? __lock_acquire+0x274/0x700
 [<ffffffff811f4610>] ? ext4_unwritten_wait+0xb0/0xb0
 [<ffffffff811bd756>] aio_run_iocb+0x286/0x410
 [<ffffffff810990e5>] ? local_clock+0x25/0x30
 [<ffffffff810ac359>] ? lock_release_holdtime+0x29/0x190
 [<ffffffff811bc05b>] ? lookup_ioctx+0x4b/0xf0
 [<ffffffff811bde3b>] do_io_submit+0x55b/0x740
 [<ffffffff811bdcaa>] ? do_io_submit+0x3ca/0x740
 [<ffffffff811be030>] SyS_io_submit+0x10/0x20
 [<ffffffff815ce192>] system_call_fastpath+0x16/0x1b
Code: 01 48 8b 80 f0 01 00 00 48 8b 18 49 8b 45 10 0f 85 f1 01 00 00 48 03 45 c8 48 3b 43 48 0f 8f e3 01 00 00 49 83 7c
24 18 00 75 04 <0f> 0b eb fe f0 ff 83 ec 01 00 00 49 8b 44 24 18 8b 00 85 c0 89
RIP  [<ffffffff811fabf2>] ext4_direct_IO+0x162/0x3d0
 RSP <ffff88080f90bb58>

Reported-by: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Cc: stable@vger.kernel.org
---
 fs/ext4/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index aca7b24..8131be8 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -137,10 +137,10 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 			iov_iter_truncate(from, sbi->s_bitmap_maxbytes - pos);
 	}
 
+	iocb->private = &overwrite;
 	if (o_direct) {
 		blk_start_plug(&plug);
 
-		iocb->private = &overwrite;
 
 		/* check whether we do a DIO overwrite or not */
 		if (ext4_should_dioread_nolock(inode) && !aio_mutex &&
-- 
cgit v1.1


From d48458d4a768cece43f80a081a26cf912877da9c Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 30 Oct 2014 10:53:17 -0400
Subject: jbd2: use a better hash function for the revoke table

The old hash function didn't work well for 64-bit block numbers, and
used undefined (negative) shift right behavior.  Use the generic
64-bit hash function instead.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reported-by: Andrey Ryabinin <a.ryabinin@samsung.com>
---
 fs/jbd2/revoke.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index d5e95a1..c6cbaef 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -92,6 +92,7 @@
 #include <linux/init.h>
 #include <linux/bio.h>
 #include <linux/log2.h>
+#include <linux/hash.h>
 #endif
 
 static struct kmem_cache *jbd2_revoke_record_cache;
@@ -130,16 +131,9 @@ static void flush_descriptor(journal_t *, struct buffer_head *, int, int);
 
 /* Utility functions to maintain the revoke table */
 
-/* Borrowed from buffer.c: this is a tried and tested block hash function */
 static inline int hash(journal_t *journal, unsigned long long block)
 {
-	struct jbd2_revoke_table_s *table = journal->j_revoke;
-	int hash_shift = table->hash_shift;
-	int hash = (int)block ^ (int)((block >> 31) >> 1);
-
-	return ((hash << (hash_shift - 6)) ^
-		(hash >> 13) ^
-		(hash << (hash_shift - 12))) & (table->hash_size - 1);
+	return hash_64(block, journal->j_revoke->hash_shift);
 }
 
 static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr,
-- 
cgit v1.1


From 6050d47adcadbb53582434d919ed7f038d936712 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 30 Oct 2014 10:53:17 -0400
Subject: ext4: bail out from make_indexed_dir() on first error

When ext4_handle_dirty_dx_node() or ext4_handle_dirty_dirent_node()
fail, there's really something wrong with the fs and there's no point in
continuing further. Just return error from make_indexed_dir() in that
case. Also initialize frames array so that if we return early due to
error, dx_release() doesn't try to dereference uninitialized memory
(which could happen also due to error in do_split()).

Coverity-id: 741300
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@vger.kernel.org
---
 fs/ext4/namei.c | 28 ++++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 123798c..4262118 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1816,31 +1816,39 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 		hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
 	hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
 	ext4fs_dirhash(name, namelen, &hinfo);
+	memset(frames, 0, sizeof(frames));
 	frame = frames;
 	frame->entries = entries;
 	frame->at = entries;
 	frame->bh = bh;
 	bh = bh2;
 
-	ext4_handle_dirty_dx_node(handle, dir, frame->bh);
-	ext4_handle_dirty_dirent_node(handle, dir, bh);
+	retval = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
+	if (retval)
+		goto out_frames;	
+	retval = ext4_handle_dirty_dirent_node(handle, dir, bh);
+	if (retval)
+		goto out_frames;	
 
 	de = do_split(handle,dir, &bh, frame, &hinfo);
 	if (IS_ERR(de)) {
-		/*
-		 * Even if the block split failed, we have to properly write
-		 * out all the changes we did so far. Otherwise we can end up
-		 * with corrupted filesystem.
-		 */
-		ext4_mark_inode_dirty(handle, dir);
-		dx_release(frames);
-		return PTR_ERR(de);
+		retval = PTR_ERR(de);
+		goto out_frames;
 	}
 	dx_release(frames);
 
 	retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
 	brelse(bh);
 	return retval;
+out_frames:
+	/*
+	 * Even if the block split failed, we have to properly write
+	 * out all the changes we did so far. Otherwise we can end up
+	 * with corrupted filesystem.
+	 */
+	ext4_mark_inode_dirty(handle, dir);
+	dx_release(frames);
+	return retval;
 }
 
 /*
-- 
cgit v1.1


From 4f879ca687a5f2473b952937ce92c795a39019b4 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 30 Oct 2014 10:53:17 -0400
Subject: ext4: bail early when clearing inode journal flag fails

When clearing inode journal flag, we call jbd2_journal_flush() to force
all the journalled data to their final locations. Currently we ignore
when this fails and continue clearing inode journal flag. This isn't a
big problem because when jbd2_journal_flush() fails, journal is likely
aborted anyway. But it can still lead to somewhat confusing results so
rather bail out early.

Coverity-id: 989044
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/inode.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e9777f9..3356ab5 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4959,7 +4959,12 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 	if (val)
 		ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
 	else {
-		jbd2_journal_flush(journal);
+		err = jbd2_journal_flush(journal);
+		if (err < 0) {
+			jbd2_journal_unlock_updates(journal);
+			ext4_inode_resume_unlocked_dio(inode);
+			return err;
+		}
 		ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
 	}
 	ext4_set_aops(inode);
-- 
cgit v1.1


From ae9e9c6aeea6f91ccb4fb369d7dd8f1a8b5f6a58 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 30 Oct 2014 10:53:17 -0400
Subject: ext4: make ext4_ext_convert_to_initialized() return proper number of
 blocks

ext4_ext_convert_to_initialized() can return more blocks than are
actually allocated from map->m_lblk in case where initial part of the
on-disk extent is zeroed out. Luckily this doesn't have serious
consequences because the caller currently uses the return value
only to unmap metadata buffers. Anyway this is a data
corruption/exposure problem waiting to happen so fix it.

Coverity-id: 1226848
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/extents.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 37043d0..0b16fb4 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3603,11 +3603,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 		}
 	}
 
-	allocated = ext4_split_extent(handle, inode, ppath,
-				      &split_map, split_flag, flags);
-	if (allocated < 0)
-		err = allocated;
-
+	err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag,
+				flags);
+	if (err > 0)
+		err = 0;
 out:
 	/* If we have gotten a failure, don't zero out status tree */
 	if (!err)
-- 
cgit v1.1


From 69a91c237ab0ebe4e9fdeaf6d0090c85275594ec Mon Sep 17 00:00:00 2001
From: Eric Rannaud <e@nanocritical.com>
Date: Thu, 30 Oct 2014 01:51:01 -0700
Subject: fs: allow open(dir, O_TMPFILE|..., 0) with mode 0

The man page for open(2) indicates that when O_CREAT is specified, the
'mode' argument applies only to future accesses to the file:

	Note that this mode applies only to future accesses of the newly
	created file; the open() call that creates a read-only file
	may well return a read/write file descriptor.

The man page for open(2) implies that 'mode' is treated identically by
O_CREAT and O_TMPFILE.

O_TMPFILE, however, behaves differently:

	int fd = open("/tmp", O_TMPFILE | O_RDWR, 0);
	assert(fd == -1);
	assert(errno == EACCES);

	int fd = open("/tmp", O_TMPFILE | O_RDWR, 0600);
	assert(fd > 0);

For O_CREAT, do_last() sets acc_mode to MAY_OPEN only:

	if (*opened & FILE_CREATED) {
		/* Don't check for write permission, don't truncate */
		open_flag &= ~O_TRUNC;
		will_truncate = false;
		acc_mode = MAY_OPEN;
		path_to_nameidata(path, nd);
		goto finish_open_created;
	}

But for O_TMPFILE, do_tmpfile() passes the full op->acc_mode to
may_open().

This patch lines up the behavior of O_TMPFILE with O_CREAT. After the
inode is created, may_open() is called with acc_mode = MAY_OPEN, in
do_tmpfile().

A different, but related glibc bug revealed the discrepancy:
https://sourceware.org/bugzilla/show_bug.cgi?id=17523

The glibc lazily loads the 'mode' argument of open() and openat() using
va_arg() only if O_CREAT is present in 'flags' (to support both the 2
argument and the 3 argument forms of open; same idea for openat()).
However, the glibc ignores the 'mode' argument if O_TMPFILE is in
'flags'.

On x86_64, for open(), it magically works anyway, as 'mode' is in
RDX when entering open(), and is still in RDX on SYSCALL, which is where
the kernel looks for the 3rd argument of a syscall.

But openat() is not quite so lucky: 'mode' is in RCX when entering the
glibc wrapper for openat(), while the kernel looks for the 4th argument
of a syscall in R10. Indeed, the syscall calling convention differs from
the regular calling convention in this respect on x86_64. So the kernel
sees mode = 0 when trying to use glibc openat() with O_TMPFILE, and
fails with EACCES.

Signed-off-by: Eric Rannaud <e@nanocritical.com>
Acked-by: Andy Lutomirski <luto@amacapital.net>
Cc: stable@vger.kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/namei.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 42df664..7851289 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3154,7 +3154,8 @@ static int do_tmpfile(int dfd, struct filename *pathname,
 	if (error)
 		goto out2;
 	audit_inode(pathname, nd->path.dentry, 0);
-	error = may_open(&nd->path, op->acc_mode, op->open_flag);
+	/* Don't check for other permissions, the inode was just created */
+	error = may_open(&nd->path, MAY_OPEN, op->open_flag);
 	if (error)
 		goto out2;
 	file->f_path.mnt = nd->path.mnt;
-- 
cgit v1.1


From b0afd8e5db7b11aa9078e82e7f9abc30dc35a3c1 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 28 Oct 2014 18:40:11 -0400
Subject: isofs: don't bother with ->d_op for normal case

we only need it for joliet and case-insensitive mounts

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/isofs/inode.c | 24 ++----------------------
 fs/isofs/namei.c |  2 ++
 2 files changed, 4 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 881b3bd..fe839b9 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -29,13 +29,9 @@
 #define BEQUIET
 
 static int isofs_hashi(const struct dentry *parent, struct qstr *qstr);
-static int isofs_hash(const struct dentry *parent, struct qstr *qstr);
 static int isofs_dentry_cmpi(const struct dentry *parent,
 		const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name);
-static int isofs_dentry_cmp(const struct dentry *parent,
-		const struct dentry *dentry,
-		unsigned int len, const char *str, const struct qstr *name);
 
 #ifdef CONFIG_JOLIET
 static int isofs_hashi_ms(const struct dentry *parent, struct qstr *qstr);
@@ -135,10 +131,6 @@ static const struct super_operations isofs_sops = {
 
 static const struct dentry_operations isofs_dentry_ops[] = {
 	{
-		.d_hash		= isofs_hash,
-		.d_compare	= isofs_dentry_cmp,
-	},
-	{
 		.d_hash		= isofs_hashi,
 		.d_compare	= isofs_dentry_cmpi,
 	},
@@ -258,25 +250,12 @@ static int isofs_dentry_cmp_common(
 }
 
 static int
-isofs_hash(const struct dentry *dentry, struct qstr *qstr)
-{
-	return isofs_hash_common(qstr, 0);
-}
-
-static int
 isofs_hashi(const struct dentry *dentry, struct qstr *qstr)
 {
 	return isofs_hashi_common(qstr, 0);
 }
 
 static int
-isofs_dentry_cmp(const struct dentry *parent, const struct dentry *dentry,
-		unsigned int len, const char *str, const struct qstr *name)
-{
-	return isofs_dentry_cmp_common(len, str, name, 0, 0);
-}
-
-static int
 isofs_dentry_cmpi(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
@@ -930,7 +909,8 @@ root_found:
 	if (opt.check == 'r')
 		table++;
 
-	s->s_d_op = &isofs_dentry_ops[table];
+	if (table)
+		s->s_d_op = &isofs_dentry_ops[table - 1];
 
 	/* get the root dentry */
 	s->s_root = d_make_root(inode);
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index 6f6dd0c..7b543e6 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -20,6 +20,8 @@ isofs_cmp(struct dentry *dentry, const char *compare, int dlen)
 	struct qstr qstr;
 	qstr.name = compare;
 	qstr.len = dlen;
+	if (likely(!dentry->d_op))
+		return dentry->d_name.len != dlen || memcmp(dentry->d_name.name, compare, dlen);
 	return dentry->d_op->d_compare(NULL, NULL, dentry->d_name.len, dentry->d_name.name, &qstr);
 }
 
-- 
cgit v1.1


From b2de525f095708b2adbadaec3f1e4017a23d1e09 Mon Sep 17 00:00:00 2001
From: David Jeffery <djeffery@redhat.com>
Date: Mon, 29 Sep 2014 10:21:10 -0400
Subject: Return short read or 0 at end of a raw device, not EIO

Author: David Jeffery <djeffery@redhat.com>
Changes to the basic direct I/O code have broken the raw driver when reading
to the end of a raw device.  Instead of returning a short read for a read that
extends partially beyond the device's end or 0 when at the end of the device,
these reads now return EIO.

The raw driver needs the same end of device handling as was added for normal
block devices.  Using blkdev_read_iter, which has the needed size checks,
prevents the EIO conditions at the end of the device.

Signed-off-by: David Jeffery <djeffery@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/block_dev.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index cc9d411..1d9c9f3 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1585,7 +1585,7 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
 }
 EXPORT_SYMBOL_GPL(blkdev_write_iter);
 
-static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
+ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *bd_inode = file->f_mapping->host;
@@ -1599,6 +1599,7 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	iov_iter_truncate(to, size);
 	return generic_file_read_iter(iocb, to);
 }
+EXPORT_SYMBOL_GPL(blkdev_read_iter);
 
 /*
  * Try to release a page associated with block device when the system
-- 
cgit v1.1


From 9f2f7d4c8dfcf4617af5de6ea381b91deac3db48 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Fri, 31 Oct 2014 20:02:42 +0100
Subject: ovl: initialize ->is_cursor

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/overlayfs/readdir.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 401f084..4e9d7c1 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -93,6 +93,7 @@ static struct ovl_cache_entry *ovl_cache_entry_new(const char *name, int len,
 		p->type = d_type;
 		p->ino = ino;
 		p->is_whiteout = false;
+		p->is_cursor = false;
 	}
 
 	return p;
-- 
cgit v1.1


From 6e5aafb27419f32575b27ef9d6a31e5d54661aca Mon Sep 17 00:00:00 2001
From: Chris Mason <clm@fb.com>
Date: Tue, 4 Nov 2014 06:59:04 -0800
Subject: Btrfs: fix kfree on list_head in btrfs_lookup_csums_range error
 cleanup

If we hit any errors in btrfs_lookup_csums_range, we'll loop through all
the csums we allocate and free them.  But the code was using list_entry
incorrectly, and ended up trying to free the on-stack list_head instead.

This bug came from commit 0678b6185

btrfs: Don't BUG_ON kzalloc error in btrfs_lookup_csums_range()

Signed-off-by: Chris Mason <clm@fb.com>
Reported-by: Erik Berg <btrfs@slipsprogrammoer.no>
cc: stable@vger.kernel.org # 3.3 or newer
---
 fs/btrfs/file-item.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 783a943..84a2d18 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -413,7 +413,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 	ret = 0;
 fail:
 	while (ret < 0 && !list_empty(&tmplist)) {
-		sums = list_entry(&tmplist, struct btrfs_ordered_sum, list);
+		sums = list_entry(tmplist.next, struct btrfs_ordered_sum, list);
 		list_del(&sums->list);
 		kfree(sums);
 	}
-- 
cgit v1.1


From 3f822c6264954660babce757fb45792fd3af273e Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Tue, 4 Nov 2014 16:11:03 +0100
Subject: ovl: don't poison cursor

ovl_cache_put() can be called from ovl_dir_reset() if the cache needs to be
rebuilt.  We did list_del() on the cursor, which results in an Oops on the
poisoned pointer in ovl_seek_cursor().

Reported-by: Jordi Pujol Palomer <jordipujolp@gmail.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Tested-by: Jordi Pujol Palomer <jordipujolp@gmail.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/overlayfs/readdir.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 4e9d7c1..2a7ef4f 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -168,7 +168,7 @@ static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
 {
 	struct ovl_dir_cache *cache = od->cache;
 
-	list_del(&od->cursor.l_node);
+	list_del_init(&od->cursor.l_node);
 	WARN_ON(cache->refcount <= 0);
 	cache->refcount--;
 	if (!cache->refcount) {
-- 
cgit v1.1


From 7e8631e8b9d4e9f698c09c7e7309c96249180ff9 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 5 Nov 2014 15:18:29 -0500
Subject: fix breakage in o2net_send_tcp_msg()

uninitialized msghdr.  Broken in "ocfs2: don't open-code kernel_recvmsg()"
by me ;-/

Cc: stable@vger.kernel.org # 3.15+
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ocfs2/cluster/tcp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 97de0fb..a960440 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -925,7 +925,7 @@ static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec,
 			      size_t veclen, size_t total)
 {
 	int ret;
-	struct msghdr msg;
+	struct msghdr msg = {.msg_flags = 0,};
 
 	if (sock == NULL) {
 		ret = -EINVAL;
-- 
cgit v1.1


From afa947cb52a8e73fe71915a0b0af6fcf98dfbe1a Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 7 Nov 2014 08:29:57 +1100
Subject: xfs: bulkstat btree walk doesn't terminate

The bulkstat code has several different ways of detecting the end of
an AG when doing a walk. They are not consistently detected, and the
code that checks for the end of AG conditions is not consistently
coded. Hence the are conditions where the walk code can get stuck in
an endless loop making no progress and not triggering any
termination conditions.

Convert all the "tmp/i" status return codes from btree operations
to a common name (stat) and apply end-of-ag detection to these
operations consistently.

cc: <stable@vger.kernel.org> # 3.17
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_itable.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 7765ff7..16737cb 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -356,7 +356,6 @@ xfs_bulkstat(
 	int			end_of_ag; /* set if we've seen the ag end */
 	int			error;	/* error code */
 	int                     fmterror;/* bulkstat formatter result */
-	int			i;	/* loop index */
 	int			icount;	/* count of inodes good in irbuf */
 	size_t			irbsize; /* size of irec buffer in bytes */
 	xfs_ino_t		ino;	/* inode number (filesystem) */
@@ -366,11 +365,11 @@ xfs_bulkstat(
 	xfs_ino_t		lastino; /* last inode number returned */
 	int			nirbuf;	/* size of irbuf */
 	int			rval;	/* return value error code */
-	int			tmp;	/* result value from btree calls */
 	int			ubcount; /* size of user's buffer */
 	int			ubleft;	/* bytes left in user's buffer */
 	char			__user *ubufp;	/* pointer into user's buffer */
 	int			ubelem;	/* spaces used in user's buffer */
+	int			stat;
 
 	/*
 	 * Get the last inode value, see if there's nothing to do.
@@ -436,13 +435,15 @@ xfs_bulkstat(
 				agino = r.ir_startino + XFS_INODES_PER_CHUNK;
 			}
 			/* Increment to the next record */
-			error = xfs_btree_increment(cur, 0, &tmp);
+			error = xfs_btree_increment(cur, 0, &stat);
 		} else {
 			/* Start of ag.  Lookup the first inode chunk */
-			error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &tmp);
+			error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &stat);
 		}
-		if (error)
+		if (error || stat == 0) {
+			end_of_ag = 1;
 			goto del_cursor;
+		}
 
 		/*
 		 * Loop through inode btree records in this ag,
@@ -451,8 +452,8 @@ xfs_bulkstat(
 		while (irbp < irbufend && icount < ubcount) {
 			struct xfs_inobt_rec_incore	r;
 
-			error = xfs_inobt_get_rec(cur, &r, &i);
-			if (error || i == 0) {
+			error = xfs_inobt_get_rec(cur, &r, &stat);
+			if (error || stat == 0) {
 				end_of_ag = 1;
 				goto del_cursor;
 			}
@@ -473,8 +474,8 @@ xfs_bulkstat(
 			 * Set agino to after this chunk and bump the cursor.
 			 */
 			agino = r.ir_startino + XFS_INODES_PER_CHUNK;
-			error = xfs_btree_increment(cur, 0, &tmp);
-			if (error) {
+			error = xfs_btree_increment(cur, 0, &stat);
+			if (error || stat == 0) {
 				end_of_ag = 1;
 				goto del_cursor;
 			}
-- 
cgit v1.1


From bf4a5af20d25ecc8876978ad34b8db83b4235f3c Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 7 Nov 2014 08:30:30 +1100
Subject: xfs: bulkstat chunk formatting cursor is broken

The xfs_bulkstat_agichunk formatting cursor takes buffer values from
the main loop and passes them via the structure to the chunk
formatter, and the writes the changed values back into the main loop
local variables. Unfortunately, this complex dance is full of corner
cases that aren't handled correctly.

The biggest problem is that it is double handling the information in
both the main loop and the chunk formatting function, leading to
inconsistent updates and endless loops where progress is not made.

To fix this, push the struct xfs_bulkstat_agichunk outwards to be
the primary holder of user buffer information. this removes the
double handling in the main loop.

Also, pass the last inode processed by the chunk formatter as a
separate parameter as it purely an output variable and is not
related to the user buffer consumption cursor.

Finally, the chunk formatting code is not shared by anyone, so make
it local to xfs_itable.c.

cc: <stable@vger.kernel.org> # 3.17
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_itable.c | 59 +++++++++++++++++++++++++----------------------------
 fs/xfs/xfs_itable.h | 16 ---------------
 2 files changed, 28 insertions(+), 47 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 16737cb..50a3e59 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -262,20 +262,26 @@ xfs_bulkstat_grab_ichunk(
 
 #define XFS_BULKSTAT_UBLEFT(ubleft)	((ubleft) >= statstruct_size)
 
+struct xfs_bulkstat_agichunk {
+	char		__user **ac_ubuffer;/* pointer into user's buffer */
+	int		ac_ubleft;	/* bytes left in user's buffer */
+	int		ac_ubelem;	/* spaces used in user's buffer */
+};
+
 /*
  * Process inodes in chunk with a pointer to a formatter function
  * that will iget the inode and fill in the appropriate structure.
  */
-int
+static int
 xfs_bulkstat_ag_ichunk(
 	struct xfs_mount		*mp,
 	xfs_agnumber_t			agno,
 	struct xfs_inobt_rec_incore	*irbp,
 	bulkstat_one_pf			formatter,
 	size_t				statstruct_size,
-	struct xfs_bulkstat_agichunk	*acp)
+	struct xfs_bulkstat_agichunk	*acp,
+	xfs_ino_t			*lastino)
 {
-	xfs_ino_t			lastino = acp->ac_lastino;
 	char				__user **ubufp = acp->ac_ubuffer;
 	int				ubleft = acp->ac_ubleft;
 	int				ubelem = acp->ac_ubelem;
@@ -295,7 +301,7 @@ xfs_bulkstat_ag_ichunk(
 
 		/* Skip if this inode is free */
 		if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free) {
-			lastino = ino;
+			*lastino = ino;
 			continue;
 		}
 
@@ -313,7 +319,7 @@ xfs_bulkstat_ag_ichunk(
 				ubleft = 0;
 				break;
 			}
-			lastino = ino;
+			*lastino = ino;
 			continue;
 		}
 		if (fmterror == BULKSTAT_RV_GIVEUP) {
@@ -325,10 +331,9 @@ xfs_bulkstat_ag_ichunk(
 			*ubufp += ubused;
 		ubleft -= ubused;
 		ubelem++;
-		lastino = ino;
+		*lastino = ino;
 	}
 
-	acp->ac_lastino = lastino;
 	acp->ac_ubleft = ubleft;
 	acp->ac_ubelem = ubelem;
 
@@ -355,7 +360,6 @@ xfs_bulkstat(
 	xfs_btree_cur_t		*cur;	/* btree cursor for ialloc btree */
 	int			end_of_ag; /* set if we've seen the ag end */
 	int			error;	/* error code */
-	int                     fmterror;/* bulkstat formatter result */
 	int			icount;	/* count of inodes good in irbuf */
 	size_t			irbsize; /* size of irec buffer in bytes */
 	xfs_ino_t		ino;	/* inode number (filesystem) */
@@ -366,10 +370,8 @@ xfs_bulkstat(
 	int			nirbuf;	/* size of irbuf */
 	int			rval;	/* return value error code */
 	int			ubcount; /* size of user's buffer */
-	int			ubleft;	/* bytes left in user's buffer */
-	char			__user *ubufp;	/* pointer into user's buffer */
-	int			ubelem;	/* spaces used in user's buffer */
 	int			stat;
+	struct xfs_bulkstat_agichunk ac;
 
 	/*
 	 * Get the last inode value, see if there's nothing to do.
@@ -386,11 +388,13 @@ xfs_bulkstat(
 	}
 
 	ubcount = *ubcountp; /* statstruct's */
-	ubleft = ubcount * statstruct_size; /* bytes */
-	*ubcountp = ubelem = 0;
+	ac.ac_ubuffer = &ubuffer;
+	ac.ac_ubleft = ubcount * statstruct_size; /* bytes */;
+	ac.ac_ubelem = 0;
+
+	*ubcountp = 0;
 	*done = 0;
-	fmterror = 0;
-	ubufp = ubuffer;
+
 	irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4);
 	if (!irbuf)
 		return -ENOMEM;
@@ -402,7 +406,7 @@ xfs_bulkstat(
 	 * inode returned; 0 means start of the allocation group.
 	 */
 	rval = 0;
-	while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) {
+	while (XFS_BULKSTAT_UBLEFT(ac.ac_ubleft) && agno < mp->m_sb.sb_agcount) {
 		cond_resched();
 		error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
 		if (error)
@@ -497,28 +501,21 @@ del_cursor:
 		 */
 		irbufend = irbp;
 		for (irbp = irbuf;
-		     irbp < irbufend && XFS_BULKSTAT_UBLEFT(ubleft); irbp++) {
-			struct xfs_bulkstat_agichunk ac;
-
-			ac.ac_lastino = lastino;
-			ac.ac_ubuffer = &ubuffer;
-			ac.ac_ubleft = ubleft;
-			ac.ac_ubelem = ubelem;
+		     irbp < irbufend && XFS_BULKSTAT_UBLEFT(ac.ac_ubleft);
+		     irbp++) {
 			error = xfs_bulkstat_ag_ichunk(mp, agno, irbp,
-					formatter, statstruct_size, &ac);
+					formatter, statstruct_size, &ac,
+					&lastino);
 			if (error)
 				rval = error;
 
-			lastino = ac.ac_lastino;
-			ubleft = ac.ac_ubleft;
-			ubelem = ac.ac_ubelem;
-
 			cond_resched();
 		}
+
 		/*
 		 * Set up for the next loop iteration.
 		 */
-		if (XFS_BULKSTAT_UBLEFT(ubleft)) {
+		if (XFS_BULKSTAT_UBLEFT(ac.ac_ubleft)) {
 			if (end_of_ag) {
 				agno++;
 				agino = 0;
@@ -531,11 +528,11 @@ del_cursor:
 	 * Done, we're either out of filesystem or space to put the data.
 	 */
 	kmem_free(irbuf);
-	*ubcountp = ubelem;
+	*ubcountp = ac.ac_ubelem;
 	/*
 	 * Found some inodes, return them now and return the error next time.
 	 */
-	if (ubelem)
+	if (ac.ac_ubelem)
 		rval = 0;
 	if (agno >= mp->m_sb.sb_agcount) {
 		/*
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index aaed080..6ea8b39 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -30,22 +30,6 @@ typedef int (*bulkstat_one_pf)(struct xfs_mount	*mp,
 			       int		*ubused,
 			       int		*stat);
 
-struct xfs_bulkstat_agichunk {
-	xfs_ino_t	ac_lastino;	/* last inode returned */
-	char		__user **ac_ubuffer;/* pointer into user's buffer */
-	int		ac_ubleft;	/* bytes left in user's buffer */
-	int		ac_ubelem;	/* spaces used in user's buffer */
-};
-
-int
-xfs_bulkstat_ag_ichunk(
-	struct xfs_mount		*mp,
-	xfs_agnumber_t			agno,
-	struct xfs_inobt_rec_incore	*irbp,
-	bulkstat_one_pf			formatter,
-	size_t				statstruct_size,
-	struct xfs_bulkstat_agichunk	*acp);
-
 /*
  * Values for stat return value.
  */
-- 
cgit v1.1


From 2b831ac6bc87d3cbcbb1a8816827b6923403e461 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 7 Nov 2014 08:30:58 +1100
Subject: xfs: bulkstat chunk-formatter has issues

The loop construct has issues:
	- clustidx is completely unused, so remove it.
	- the loop tries to be smart by terminating when the
	  "freecount" tells it that all inodes are free. Just drop
	  it as in most cases we have to scan all inodes in the
	  chunk anyway.
	- move the "user buffer left" condition check to the only
	  point where we consume space int eh user buffer.
	- move the initialisation of agino out of the loop, leaving
	  just a simple loop control logic using the clusteridx.

Also, double handling of the user buffer variables leads to problems
tracking the current state - use the cursor variables directly
rather than keeping local copies and then having to update the
cursor before returning.

cc: <stable@vger.kernel.org> # 3.17
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_itable.c | 58 ++++++++++++++++++++++-------------------------------
 1 file changed, 24 insertions(+), 34 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 50a3e59..7ea2b11 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -283,59 +283,49 @@ xfs_bulkstat_ag_ichunk(
 	xfs_ino_t			*lastino)
 {
 	char				__user **ubufp = acp->ac_ubuffer;
-	int				ubleft = acp->ac_ubleft;
-	int				ubelem = acp->ac_ubelem;
-	int				chunkidx, clustidx;
+	int				chunkidx;
 	int				error = 0;
 	xfs_agino_t			agino;
 
-	for (agino = irbp->ir_startino, chunkidx = clustidx = 0;
-	     XFS_BULKSTAT_UBLEFT(ubleft) &&
-	     irbp->ir_freecount < XFS_INODES_PER_CHUNK;
-	     chunkidx++, clustidx++, agino++) {
-		int		fmterror;	/* bulkstat formatter result */
+	agino = irbp->ir_startino;
+	for (chunkidx = 0; chunkidx < XFS_INODES_PER_CHUNK;
+	     chunkidx++, agino++) {
+		int		fmterror;
 		int		ubused;
 		xfs_ino_t	ino = XFS_AGINO_TO_INO(mp, agno, agino);
 
-		ASSERT(chunkidx < XFS_INODES_PER_CHUNK);
-
 		/* Skip if this inode is free */
 		if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free) {
 			*lastino = ino;
 			continue;
 		}
 
-		/*
-		 * Count used inodes as free so we can tell when the
-		 * chunk is used up.
-		 */
-		irbp->ir_freecount++;
-
 		/* Get the inode and fill in a single buffer */
 		ubused = statstruct_size;
-		error = formatter(mp, ino, *ubufp, ubleft, &ubused, &fmterror);
-		if (fmterror == BULKSTAT_RV_NOTHING) {
-			if (error && error != -ENOENT && error != -EINVAL) {
-				ubleft = 0;
-				break;
-			}
-			*lastino = ino;
-			continue;
-		}
-		if (fmterror == BULKSTAT_RV_GIVEUP) {
-			ubleft = 0;
+		error = formatter(mp, ino, *ubufp, acp->ac_ubleft,
+				  &ubused, &fmterror);
+		if (fmterror == BULKSTAT_RV_GIVEUP ||
+		    (error && error != -ENOENT && error != -EINVAL)) {
+			acp->ac_ubleft = 0;
 			ASSERT(error);
 			break;
 		}
-		if (*ubufp)
-			*ubufp += ubused;
-		ubleft -= ubused;
-		ubelem++;
+
+		/* be careful not to leak error if at end of chunk */
+		if (fmterror == BULKSTAT_RV_NOTHING || error) {
+			*lastino = ino;
+			error = 0;
+			continue;
+		}
+
+		*ubufp += ubused;
+		acp->ac_ubleft -= ubused;
+		acp->ac_ubelem++;
 		*lastino = ino;
-	}
 
-	acp->ac_ubleft = ubleft;
-	acp->ac_ubelem = ubelem;
+		if (acp->ac_ubleft < statstruct_size)
+			break;
+	}
 
 	return error;
 }
-- 
cgit v1.1


From 6e57c542cb7e0e580eb53ae76a77875c7d92b4b1 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 7 Nov 2014 08:31:13 +1100
Subject: xfs: bulkstat main loop logic is a mess

There are a bunch of variables tha tare more wildy scoped than they
need to be, obfuscated user buffer checks and tortured "next inode"
tracking. This all needs cleaning up to expose the real issues that
need fixing.

cc: <stable@vger.kernel.org> # 3.17
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_itable.c | 56 +++++++++++++++++++++++------------------------------
 1 file changed, 24 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 7ea2b11..acae335 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -348,30 +348,23 @@ xfs_bulkstat(
 	xfs_agino_t		agino;	/* inode # in allocation group */
 	xfs_agnumber_t		agno;	/* allocation group number */
 	xfs_btree_cur_t		*cur;	/* btree cursor for ialloc btree */
-	int			end_of_ag; /* set if we've seen the ag end */
-	int			error;	/* error code */
-	int			icount;	/* count of inodes good in irbuf */
 	size_t			irbsize; /* size of irec buffer in bytes */
-	xfs_ino_t		ino;	/* inode number (filesystem) */
-	xfs_inobt_rec_incore_t	*irbp;	/* current irec buffer pointer */
 	xfs_inobt_rec_incore_t	*irbuf;	/* start of irec buffer */
-	xfs_inobt_rec_incore_t	*irbufend; /* end of good irec buffer entries */
 	xfs_ino_t		lastino; /* last inode number returned */
 	int			nirbuf;	/* size of irbuf */
 	int			rval;	/* return value error code */
 	int			ubcount; /* size of user's buffer */
-	int			stat;
 	struct xfs_bulkstat_agichunk ac;
+	int			error = 0;
 
 	/*
 	 * Get the last inode value, see if there's nothing to do.
 	 */
-	ino = (xfs_ino_t)*lastinop;
-	lastino = ino;
-	agno = XFS_INO_TO_AGNO(mp, ino);
-	agino = XFS_INO_TO_AGINO(mp, ino);
+	lastino = *lastinop;
+	agno = XFS_INO_TO_AGNO(mp, lastino);
+	agino = XFS_INO_TO_AGINO(mp, lastino);
 	if (agno >= mp->m_sb.sb_agcount ||
-	    ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
+	    lastino != XFS_AGINO_TO_INO(mp, agno, agino)) {
 		*done = 1;
 		*ubcountp = 0;
 		return 0;
@@ -396,8 +389,13 @@ xfs_bulkstat(
 	 * inode returned; 0 means start of the allocation group.
 	 */
 	rval = 0;
-	while (XFS_BULKSTAT_UBLEFT(ac.ac_ubleft) && agno < mp->m_sb.sb_agcount) {
-		cond_resched();
+	while (agno < mp->m_sb.sb_agcount) {
+		struct xfs_inobt_rec_incore	*irbp = irbuf;
+		struct xfs_inobt_rec_incore	*irbufend = irbuf + nirbuf;
+		bool				end_of_ag = false;
+		int				icount = 0;
+		int				stat;
+
 		error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
 		if (error)
 			break;
@@ -407,10 +405,6 @@ xfs_bulkstat(
 		 */
 		cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno,
 					    XFS_BTNUM_INO);
-		irbp = irbuf;
-		irbufend = irbuf + nirbuf;
-		end_of_ag = 0;
-		icount = 0;
 		if (agino > 0) {
 			/*
 			 * In the middle of an allocation group, we need to get
@@ -435,7 +429,7 @@ xfs_bulkstat(
 			error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &stat);
 		}
 		if (error || stat == 0) {
-			end_of_ag = 1;
+			end_of_ag = true;
 			goto del_cursor;
 		}
 
@@ -448,7 +442,7 @@ xfs_bulkstat(
 
 			error = xfs_inobt_get_rec(cur, &r, &stat);
 			if (error || stat == 0) {
-				end_of_ag = 1;
+				end_of_ag = true;
 				goto del_cursor;
 			}
 
@@ -470,7 +464,7 @@ xfs_bulkstat(
 			agino = r.ir_startino + XFS_INODES_PER_CHUNK;
 			error = xfs_btree_increment(cur, 0, &stat);
 			if (error || stat == 0) {
-				end_of_ag = 1;
+				end_of_ag = true;
 				goto del_cursor;
 			}
 			cond_resched();
@@ -491,7 +485,7 @@ del_cursor:
 		 */
 		irbufend = irbp;
 		for (irbp = irbuf;
-		     irbp < irbufend && XFS_BULKSTAT_UBLEFT(ac.ac_ubleft);
+		     irbp < irbufend && ac.ac_ubleft >= statstruct_size;
 		     irbp++) {
 			error = xfs_bulkstat_ag_ichunk(mp, agno, irbp,
 					formatter, statstruct_size, &ac,
@@ -502,17 +496,15 @@ del_cursor:
 			cond_resched();
 		}
 
-		/*
-		 * Set up for the next loop iteration.
-		 */
-		if (XFS_BULKSTAT_UBLEFT(ac.ac_ubleft)) {
-			if (end_of_ag) {
-				agno++;
-				agino = 0;
-			} else
-				agino = XFS_INO_TO_AGINO(mp, lastino);
-		} else
+		/* If we've run out of space, we are done */
+		if (ac.ac_ubleft < statstruct_size)
 			break;
+
+		if (end_of_ag) {
+			agno++;
+			agino = 0;
+		} else
+			agino = XFS_INO_TO_AGINO(mp, lastino);
 	}
 	/*
 	 * Done, we're either out of filesystem or space to put the data.
-- 
cgit v1.1


From febe3cbe38b0bc0a925906dc90e8d59048851f87 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 7 Nov 2014 08:31:15 +1100
Subject: xfs: bulkstat error handling is broken

The error propagation is a horror - xfs_bulkstat() returns
a rval variable which is only set if there are formatter errors. Any
sort of btree walk error or corruption will cause the bulkstat walk
to terminate but will not pass an error back to userspace. Worse
is the fact that formatter errors will also be ignored if any inodes
were correctly formatted into the user buffer.

Hence bulkstat can fail badly yet still report success to userspace.
This causes significant issues with xfsdump not dumping everything
in the filesystem yet reporting success. It's not until a restore
fails that there is any indication that the dump was bad and tha
bulkstat failed. This patch now triggers xfsdump to fail with
bulkstat errors rather than silently missing files in the dump.

This now causes bulkstat to fail when the lastino cookie does not
fall inside an existing inode chunk. The pre-3.17 code tolerated
that error by allowing the code to move to the next inode chunk
as the agino target is guaranteed to fall into the next btree
record.

With the fixes up to this point in the series, xfsdump now passes on
the troublesome filesystem image that exposes all these bugs.

cc: <stable@vger.kernel.org>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
---
 fs/xfs/xfs_itable.c | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index acae335..ff3f431 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -236,8 +236,10 @@ xfs_bulkstat_grab_ichunk(
 	XFS_WANT_CORRUPTED_RETURN(stat == 1);
 
 	/* Check if the record contains the inode in request */
-	if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino)
-		return -EINVAL;
+	if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino) {
+		*icount = 0;
+		return 0;
+	}
 
 	idx = agino - irec->ir_startino + 1;
 	if (idx < XFS_INODES_PER_CHUNK &&
@@ -352,7 +354,6 @@ xfs_bulkstat(
 	xfs_inobt_rec_incore_t	*irbuf;	/* start of irec buffer */
 	xfs_ino_t		lastino; /* last inode number returned */
 	int			nirbuf;	/* size of irbuf */
-	int			rval;	/* return value error code */
 	int			ubcount; /* size of user's buffer */
 	struct xfs_bulkstat_agichunk ac;
 	int			error = 0;
@@ -388,7 +389,6 @@ xfs_bulkstat(
 	 * Loop over the allocation groups, starting from the last
 	 * inode returned; 0 means start of the allocation group.
 	 */
-	rval = 0;
 	while (agno < mp->m_sb.sb_agcount) {
 		struct xfs_inobt_rec_incore	*irbp = irbuf;
 		struct xfs_inobt_rec_incore	*irbufend = irbuf + nirbuf;
@@ -491,13 +491,16 @@ del_cursor:
 					formatter, statstruct_size, &ac,
 					&lastino);
 			if (error)
-				rval = error;
+				break;
 
 			cond_resched();
 		}
 
-		/* If we've run out of space, we are done */
-		if (ac.ac_ubleft < statstruct_size)
+		/*
+		 * If we've run out of space or had a formatting error, we
+		 * are now done
+		 */
+		if (ac.ac_ubleft < statstruct_size || error)
 			break;
 
 		if (end_of_ag) {
@@ -511,11 +514,17 @@ del_cursor:
 	 */
 	kmem_free(irbuf);
 	*ubcountp = ac.ac_ubelem;
+
 	/*
-	 * Found some inodes, return them now and return the error next time.
+	 * We found some inodes, so clear the error status and return them.
+	 * The lastino pointer will point directly at the inode that triggered
+	 * any error that occurred, so on the next call the error will be
+	 * triggered again and propagated to userspace as there will be no
+	 * formatted inodes in the buffer.
 	 */
 	if (ac.ac_ubelem)
-		rval = 0;
+		error = 0;
+
 	if (agno >= mp->m_sb.sb_agcount) {
 		/*
 		 * If we ran out of filesystem, mark lastino as off
@@ -527,7 +536,7 @@ del_cursor:
 	} else
 		*lastinop = (xfs_ino_t)lastino;
 
-	return rval;
+	return error;
 }
 
 int
-- 
cgit v1.1


From 002758992693ae63c04122603ea9261a0a58d728 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 7 Nov 2014 08:33:52 +1100
Subject: xfs: track bulkstat progress by agino

The bulkstat main loop progress is tracked by the "lastino"
variable, which is a full 64 bit inode. However, the loop actually
works on agno/agino pairs, and so there's a significant disconnect
between the rest of the loop and the main cursor. Convert this to
use the agino, and pass the agino into the chunk formatting function
and convert it too.

This gets rid of the inconsistency in the loop processing, and
finally makes it simple for us to skip inodes at any point in the
loop simply by incrementing the agino cursor.

cc: <stable@vger.kernel.org> # 3.17
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_itable.c | 71 +++++++++++++++++++++++++----------------------------
 1 file changed, 34 insertions(+), 37 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index ff3f431..894924a 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -282,30 +282,31 @@ xfs_bulkstat_ag_ichunk(
 	bulkstat_one_pf			formatter,
 	size_t				statstruct_size,
 	struct xfs_bulkstat_agichunk	*acp,
-	xfs_ino_t			*lastino)
+	xfs_agino_t			*last_agino)
 {
 	char				__user **ubufp = acp->ac_ubuffer;
 	int				chunkidx;
 	int				error = 0;
-	xfs_agino_t			agino;
+	xfs_agino_t			agino = irbp->ir_startino;
 
-	agino = irbp->ir_startino;
 	for (chunkidx = 0; chunkidx < XFS_INODES_PER_CHUNK;
 	     chunkidx++, agino++) {
 		int		fmterror;
 		int		ubused;
-		xfs_ino_t	ino = XFS_AGINO_TO_INO(mp, agno, agino);
+
+		/* inode won't fit in buffer, we are done */
+		if (acp->ac_ubleft < statstruct_size)
+			break;
 
 		/* Skip if this inode is free */
-		if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free) {
-			*lastino = ino;
+		if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free)
 			continue;
-		}
 
 		/* Get the inode and fill in a single buffer */
 		ubused = statstruct_size;
-		error = formatter(mp, ino, *ubufp, acp->ac_ubleft,
-				  &ubused, &fmterror);
+		error = formatter(mp, XFS_AGINO_TO_INO(mp, agno, agino),
+				  *ubufp, acp->ac_ubleft, &ubused, &fmterror);
+
 		if (fmterror == BULKSTAT_RV_GIVEUP ||
 		    (error && error != -ENOENT && error != -EINVAL)) {
 			acp->ac_ubleft = 0;
@@ -315,7 +316,6 @@ xfs_bulkstat_ag_ichunk(
 
 		/* be careful not to leak error if at end of chunk */
 		if (fmterror == BULKSTAT_RV_NOTHING || error) {
-			*lastino = ino;
 			error = 0;
 			continue;
 		}
@@ -323,12 +323,18 @@ xfs_bulkstat_ag_ichunk(
 		*ubufp += ubused;
 		acp->ac_ubleft -= ubused;
 		acp->ac_ubelem++;
-		*lastino = ino;
-
-		if (acp->ac_ubleft < statstruct_size)
-			break;
 	}
 
+	/*
+	 * Post-update *last_agino. At this point, agino will always point one
+	 * inode past the last inode we processed successfully. Hence we
+	 * substract that inode when setting the *last_agino cursor so that we
+	 * return the correct cookie to userspace. On the next bulkstat call,
+	 * the inode under the lastino cookie will be skipped as we have already
+	 * processed it here.
+	 */
+	*last_agino = agino - 1;
+
 	return error;
 }
 
@@ -352,7 +358,6 @@ xfs_bulkstat(
 	xfs_btree_cur_t		*cur;	/* btree cursor for ialloc btree */
 	size_t			irbsize; /* size of irec buffer in bytes */
 	xfs_inobt_rec_incore_t	*irbuf;	/* start of irec buffer */
-	xfs_ino_t		lastino; /* last inode number returned */
 	int			nirbuf;	/* size of irbuf */
 	int			ubcount; /* size of user's buffer */
 	struct xfs_bulkstat_agichunk ac;
@@ -361,11 +366,10 @@ xfs_bulkstat(
 	/*
 	 * Get the last inode value, see if there's nothing to do.
 	 */
-	lastino = *lastinop;
-	agno = XFS_INO_TO_AGNO(mp, lastino);
-	agino = XFS_INO_TO_AGINO(mp, lastino);
+	agno = XFS_INO_TO_AGNO(mp, *lastinop);
+	agino = XFS_INO_TO_AGINO(mp, *lastinop);
 	if (agno >= mp->m_sb.sb_agcount ||
-	    lastino != XFS_AGINO_TO_INO(mp, agno, agino)) {
+	    *lastinop != XFS_AGINO_TO_INO(mp, agno, agino)) {
 		*done = 1;
 		*ubcountp = 0;
 		return 0;
@@ -420,7 +424,6 @@ xfs_bulkstat(
 				irbp->ir_freecount = r.ir_freecount;
 				irbp->ir_free = r.ir_free;
 				irbp++;
-				agino = r.ir_startino + XFS_INODES_PER_CHUNK;
 			}
 			/* Increment to the next record */
 			error = xfs_btree_increment(cur, 0, &stat);
@@ -458,10 +461,6 @@ xfs_bulkstat(
 				irbp++;
 				icount += XFS_INODES_PER_CHUNK - r.ir_freecount;
 			}
-			/*
-			 * Set agino to after this chunk and bump the cursor.
-			 */
-			agino = r.ir_startino + XFS_INODES_PER_CHUNK;
 			error = xfs_btree_increment(cur, 0, &stat);
 			if (error || stat == 0) {
 				end_of_ag = true;
@@ -481,7 +480,9 @@ del_cursor:
 		if (error)
 			break;
 		/*
-		 * Now format all the good inodes into the user's buffer.
+		 * Now format all the good inodes into the user's buffer. The
+		 * call to xfs_bulkstat_ag_ichunk() sets up the agino pointer
+		 * for the next loop iteration.
 		 */
 		irbufend = irbp;
 		for (irbp = irbuf;
@@ -489,7 +490,7 @@ del_cursor:
 		     irbp++) {
 			error = xfs_bulkstat_ag_ichunk(mp, agno, irbp,
 					formatter, statstruct_size, &ac,
-					&lastino);
+					&agino);
 			if (error)
 				break;
 
@@ -506,8 +507,7 @@ del_cursor:
 		if (end_of_ag) {
 			agno++;
 			agino = 0;
-		} else
-			agino = XFS_INO_TO_AGINO(mp, lastino);
+		}
 	}
 	/*
 	 * Done, we're either out of filesystem or space to put the data.
@@ -525,16 +525,13 @@ del_cursor:
 	if (ac.ac_ubelem)
 		error = 0;
 
-	if (agno >= mp->m_sb.sb_agcount) {
-		/*
-		 * If we ran out of filesystem, mark lastino as off
-		 * the end of the filesystem, so the next call
-		 * will return immediately.
-		 */
-		*lastinop = (xfs_ino_t)XFS_AGINO_TO_INO(mp, agno, 0);
+	/*
+	 * If we ran out of filesystem, lastino will point off the end of
+	 * the filesystem so the next call will return immediately.
+	 */
+	*lastinop = XFS_AGINO_TO_INO(mp, agno, agino);
+	if (agno >= mp->m_sb.sb_agcount)
 		*done = 1;
-	} else
-		*lastinop = (xfs_ino_t)lastino;
 
 	return error;
 }
-- 
cgit v1.1