From aa9660196b250b850c9d06046c9f3b1eb965a708 Mon Sep 17 00:00:00 2001
From: Carlos Maiolino <cmaiolino@redhat.com>
Date: Mon, 1 Oct 2012 16:50:54 -0300
Subject: ext3: fix possible non-initialized variable on
 htree_dirblock_to_tree()

This is a backport of ext4 commit 90b0a9732 which fixes a possible
non-initialized variable on htree_dirblock_to_tree().
Ext3 has the same non initialized variable, but, in any case it will be
initialized by ext3_get_blocks_handle(), which will avoid the bug to be
triggered, but, the non-initialized variable by htree_dirblock_to_tree() is
still a bug.

Signed-off-by: Carlos Maiolino <cmaiolino@redhat.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext3/namei.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 8f4fdda..7f6c938 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -559,7 +559,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
 {
 	struct buffer_head *bh;
 	struct ext3_dir_entry_2 *de, *top;
-	int err, count = 0;
+	int err = 0, count = 0;
 
 	dxtrace(printk("In htree dirblock_to_tree: block %d\n", block));
 	if (!(bh = ext3_bread (NULL, dir, block, 0, &err)))
-- 
cgit v1.1


From c3d59ad6ab0b3d01c10f326bbc9b089424a3a5c4 Mon Sep 17 00:00:00 2001
From: Carlos Maiolino <cmaiolino@redhat.com>
Date: Tue, 2 Oct 2012 23:59:23 -0300
Subject: ext3: ext3_bread usage audit

This is the ext3 version of the same patch applied to Ext4, where such goal is
to audit the usage of ext3_bread() due a possible misinterpretion of its return
value.

Focused on directory blocks, a NULL value returned from ext3_bread() means a
hole, which cannot exist into a directory inode. It can pass undetected after a
fix in an uninitialized error variable.

The (now) initialized variable into ext3_getblk() may lead to a zero'ed return
value of ext3_bread() to its callers, which can make the caller do not detect
the hole in the directory inode.

This patch creates a new wrapper function ext3_dir_bread() which checks for
holes properly, reports error, and returns EIO in that case.

Signed-off-by: Carlos Maiolino <cmaiolino@redhat.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext3/namei.c | 38 ++++++++++++++++++++------------------
 fs/ext3/namei.h | 19 +++++++++++++++++++
 2 files changed, 39 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 7f6c938..890b894 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -46,8 +46,7 @@ static struct buffer_head *ext3_append(handle_t *handle,
 
 	*block = inode->i_size >> inode->i_sb->s_blocksize_bits;
 
-	bh = ext3_bread(handle, inode, *block, 1, err);
-	if (bh) {
+	if ((bh = ext3_dir_bread(handle, inode, *block, 1, err))) {
 		inode->i_size += inode->i_sb->s_blocksize;
 		EXT3_I(inode)->i_disksize = inode->i_size;
 		*err = ext3_journal_get_write_access(handle, bh);
@@ -339,8 +338,10 @@ dx_probe(struct qstr *entry, struct inode *dir,
 	u32 hash;
 
 	frame->bh = NULL;
-	if (!(bh = ext3_bread (NULL,dir, 0, 0, err)))
+	if (!(bh = ext3_dir_bread(NULL, dir, 0, 0, err))) {
+		*err = ERR_BAD_DX_DIR;
 		goto fail;
+	}
 	root = (struct dx_root *) bh->b_data;
 	if (root->info.hash_version != DX_HASH_TEA &&
 	    root->info.hash_version != DX_HASH_HALF_MD4 &&
@@ -436,8 +437,10 @@ dx_probe(struct qstr *entry, struct inode *dir,
 		frame->entries = entries;
 		frame->at = at;
 		if (!indirect--) return frame;
-		if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err)))
+		if (!(bh = ext3_dir_bread(NULL, dir, dx_get_block(at), 0, err))) {
+			*err = ERR_BAD_DX_DIR;
 			goto fail2;
+		}
 		at = entries = ((struct dx_node *) bh->b_data)->entries;
 		if (dx_get_limit(entries) != dx_node_limit (dir)) {
 			ext3_warning(dir->i_sb, __func__,
@@ -535,8 +538,8 @@ static int ext3_htree_next_block(struct inode *dir, __u32 hash,
 	 * block so no check is necessary
 	 */
 	while (num_frames--) {
-		if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at),
-				      0, &err)))
+		if (!(bh = ext3_dir_bread(NULL, dir, dx_get_block(p->at),
+					  0, &err)))
 			return err; /* Failure */
 		p++;
 		brelse (p->bh);
@@ -562,7 +565,8 @@ static int htree_dirblock_to_tree(struct file *dir_file,
 	int err = 0, count = 0;
 
 	dxtrace(printk("In htree dirblock_to_tree: block %d\n", block));
-	if (!(bh = ext3_bread (NULL, dir, block, 0, &err)))
+
+	if (!(bh = ext3_dir_bread(NULL, dir, block, 0, &err)))
 		return err;
 
 	de = (struct ext3_dir_entry_2 *) bh->b_data;
@@ -976,7 +980,7 @@ static struct buffer_head * ext3_dx_find_entry(struct inode *dir,
 		return NULL;
 	do {
 		block = dx_get_block(frame->at);
-		if (!(bh = ext3_bread (NULL,dir, block, 0, err)))
+		if (!(bh = ext3_dir_bread (NULL, dir, block, 0, err)))
 			goto errout;
 
 		retval = search_dirblock(bh, dir, entry,
@@ -1458,9 +1462,9 @@ static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
 	}
 	blocks = dir->i_size >> sb->s_blocksize_bits;
 	for (block = 0; block < blocks; block++) {
-		bh = ext3_bread(handle, dir, block, 0, &retval);
-		if(!bh)
+		if (!(bh = ext3_dir_bread(handle, dir, block, 0, &retval)))
 			return retval;
+
 		retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
 		if (retval != -ENOSPC)
 			return retval;
@@ -1500,7 +1504,7 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
 	entries = frame->entries;
 	at = frame->at;
 
-	if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
+	if (!(bh = ext3_dir_bread(handle, dir, dx_get_block(frame->at), 0, &err)))
 		goto cleanup;
 
 	BUFFER_TRACE(bh, "get_write_access");
@@ -1790,8 +1794,7 @@ retry:
 	inode->i_op = &ext3_dir_inode_operations;
 	inode->i_fop = &ext3_dir_operations;
 	inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
-	dir_block = ext3_bread (handle, inode, 0, 1, &err);
-	if (!dir_block)
+	if (!(dir_block = ext3_dir_bread(handle, inode, 0, 1, &err)))
 		goto out_clear_inode;
 
 	BUFFER_TRACE(dir_block, "get_write_access");
@@ -1859,7 +1862,7 @@ static int empty_dir (struct inode * inode)
 
 	sb = inode->i_sb;
 	if (inode->i_size < EXT3_DIR_REC_LEN(1) + EXT3_DIR_REC_LEN(2) ||
-	    !(bh = ext3_bread (NULL, inode, 0, 0, &err))) {
+	    !(bh = ext3_dir_bread(NULL, inode, 0, 0, &err))) {
 		if (err)
 			ext3_error(inode->i_sb, __func__,
 				   "error %d reading directory #%lu offset 0",
@@ -1890,9 +1893,8 @@ static int empty_dir (struct inode * inode)
 			(void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
 			err = 0;
 			brelse (bh);
-			bh = ext3_bread (NULL, inode,
-				offset >> EXT3_BLOCK_SIZE_BITS(sb), 0, &err);
-			if (!bh) {
+			if (!(bh = ext3_dir_bread (NULL, inode,
+				offset >> EXT3_BLOCK_SIZE_BITS(sb), 0, &err))) {
 				if (err)
 					ext3_error(sb, __func__,
 						   "error %d reading directory"
@@ -2388,7 +2390,7 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
 				goto end_rename;
 		}
 		retval = -EIO;
-		dir_bh = ext3_bread (handle, old_inode, 0, 0, &retval);
+		dir_bh = ext3_dir_bread(handle, old_inode, 0, 0, &retval);
 		if (!dir_bh)
 			goto end_rename;
 		if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
diff --git a/fs/ext3/namei.h b/fs/ext3/namei.h
index f2ce2b0..46304d8 100644
--- a/fs/ext3/namei.h
+++ b/fs/ext3/namei.h
@@ -6,3 +6,22 @@
 */
 
 extern struct dentry *ext3_get_parent(struct dentry *child);
+
+static inline struct buffer_head *ext3_dir_bread(handle_t *handle,
+						 struct inode *inode,
+						 int block, int create,
+						 int *err)
+{
+	struct buffer_head *bh;
+
+	bh = ext3_bread(handle, inode, block, create, err);
+
+	if (!bh && !(*err)) {
+		*err = -EIO;
+		ext3_error(inode->i_sb, __func__,
+			   "Directory hole detected on inode %lu\n",
+			   inode->i_ino);
+		return NULL;
+	}
+	return bh;
+}
-- 
cgit v1.1


From ae2cf4284e198684cad8f654923dd8062ee46f88 Mon Sep 17 00:00:00 2001
From: Zhao Hongjiang <zhaohongjiang@huawei.com>
Date: Tue, 9 Oct 2012 13:44:36 +0800
Subject: ext2: fix return values on parse_options() failure

parse_options() in ext2 should return 0 when parse the mount options fails.

Signed-off-by: Zhao Hongjiang <zhaohongjiang@huawei.com>
Reviewed-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext2/super.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 6c205d0..fa04d02 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -469,7 +469,7 @@ static int parse_options(char *options, struct super_block *sb)
 			uid = make_kuid(current_user_ns(), option);
 			if (!uid_valid(uid)) {
 				ext2_msg(sb, KERN_ERR, "Invalid uid value %d", option);
-				return -1;
+				return 0;
 
 			}
 			sbi->s_resuid = uid;
@@ -480,7 +480,7 @@ static int parse_options(char *options, struct super_block *sb)
 			gid = make_kgid(current_user_ns(), option);
 			if (!gid_valid(gid)) {
 				ext2_msg(sb, KERN_ERR, "Invalid gid value %d", option);
-				return -1;
+				return 0;
 			}
 			sbi->s_resgid = gid;
 			break;
-- 
cgit v1.1


From 66415f6b50f5bcbc56c78df54aacee73f3d03a3d Mon Sep 17 00:00:00 2001
From: Zhao Hongjiang <zhaohongjiang@huawei.com>
Date: Tue, 9 Oct 2012 13:48:47 +0800
Subject: ext3: fix return values on parse_options() failure

parse_options() in ext3 should return 0 when parse the mount options fails.

Signed-off-by: Zhao Hongjiang <zhaohongjiang@huawei.com>
Reviewed-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext3/super.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 17ae5c8..ebf8312 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1001,7 +1001,7 @@ static int parse_options (char *options, struct super_block *sb,
 			uid = make_kuid(current_user_ns(), option);
 			if (!uid_valid(uid)) {
 				ext3_msg(sb, KERN_ERR, "Invalid uid value %d", option);
-				return -1;
+				return 0;
 
 			}
 			sbi->s_resuid = uid;
@@ -1012,7 +1012,7 @@ static int parse_options (char *options, struct super_block *sb,
 			gid = make_kgid(current_user_ns(), option);
 			if (!gid_valid(gid)) {
 				ext3_msg(sb, KERN_ERR, "Invalid gid value %d", option);
-				return -1;
+				return 0;
 			}
 			sbi->s_resgid = gid;
 			break;
-- 
cgit v1.1


From 6c29c50fda25c2ac2ce45a9b042ff7e424aa8eac Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 9 Oct 2012 23:30:17 +0200
Subject: quota: Silence warning about PRJQUOTA not being handled in
 need_print_warning()

PRJQUOTA value of quota type should never reach need_print_warning() since XFS
(which is the only fs which uses that type) doesn't use generic functions
calling this function. Anyway, add PRJQUOTA case to the switch to make gcc
happy.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/quota/dquot.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 557a9c20..05ae3c9 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1160,6 +1160,8 @@ static int need_print_warning(struct dquot_warn *warn)
 			return uid_eq(current_fsuid(), warn->w_dq_id.uid);
 		case GRPQUOTA:
 			return in_group_p(warn->w_dq_id.gid);
+		case PRJQUOTA:	/* Never taken... Just make gcc happy */
+			return 0;
 	}
 	return 0;
 }
-- 
cgit v1.1


From 45525b26a46cd593cb72070304c4cd7c8391bd37 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 16 Oct 2012 13:30:07 -0400
Subject: fix a leak in replace_fd() users

replace_fd() began with "eats a reference, tries to insert into
descriptor table" semantics; at some point I'd switched it to
much saner current behaviour ("try to insert into descriptor
table, grabbing a new reference if inserted; caller should do
fput() in any case"), but forgot to update the callers.
Mea culpa...

[Spotted by Pavel Roskin, who has really weird system with pipe-fed
coredumps as part of what he considers a normal boot ;-)]

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/coredump.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/coredump.c b/fs/coredump.c
index fd37fac..ce47379 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -450,11 +450,12 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
 
 	cp->file = files[1];
 
-	replace_fd(0, files[0], 0);
+	err = replace_fd(0, files[0], 0);
+	fput(files[0]);
 	/* and disallow core files too */
 	current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};
 
-	return 0;
+	return err;
 }
 
 void do_coredump(siginfo_t *siginfo, struct pt_regs *regs)
-- 
cgit v1.1


From 32f8516a8c733d281faa9f6666b509035246505c Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Tue, 16 Oct 2012 17:31:23 -0700
Subject: mm, mempolicy: fix printing stack contents in numa_maps

When reading /proc/pid/numa_maps, it's possible to return the contents of
the stack where the mempolicy string should be printed if the policy gets
freed from beneath us.

This happens because mpol_to_str() may return an error the
stack-allocated buffer is then printed without ever being stored.

There are two possible error conditions in mpol_to_str():

 - if the buffer allocated is insufficient for the string to be stored,
   and

 - if the mempolicy has an invalid mode.

The first error condition is not triggered in any of the callers to
mpol_to_str(): at least 50 bytes is always allocated on the stack and this
is sufficient for the string to be written.  A future patch should convert
this into BUILD_BUG_ON() since we know the maximum strlen possible, but
that's not -rc material.

The second error condition is possible if a race occurs in dropping a
reference to a task's mempolicy causing it to be freed during the read().
The slab poison value is then used for the mode and mpol_to_str() returns
-EINVAL.

This race is only possible because get_vma_policy() believes that
mm->mmap_sem protects task->mempolicy, which isn't true.  The exit path
does not hold mm->mmap_sem when dropping the reference or setting
task->mempolicy to NULL: it uses task_lock(task) instead.

Thus, it's required for the caller of a task mempolicy to hold
task_lock(task) while grabbing the mempolicy and reading it.  Callers with
a vma policy store their mempolicy earlier and can simply increment the
reference count so it's guaranteed not to be freed.

Reported-by: Dave Jones <davej@redhat.com>
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 79827ce..14df880 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1158,6 +1158,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
 	struct vm_area_struct *vma = v;
 	struct numa_maps *md = &numa_priv->md;
 	struct file *file = vma->vm_file;
+	struct task_struct *task = proc_priv->task;
 	struct mm_struct *mm = vma->vm_mm;
 	struct mm_walk walk = {};
 	struct mempolicy *pol;
@@ -1177,9 +1178,11 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
 	walk.private = md;
 	walk.mm = mm;
 
-	pol = get_vma_policy(proc_priv->task, vma, vma->vm_start);
+	task_lock(task);
+	pol = get_vma_policy(task, vma, vma->vm_start);
 	mpol_to_str(buffer, sizeof(buffer), pol, 0);
 	mpol_cond_put(pol);
+	task_unlock(task);
 
 	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
 
@@ -1189,7 +1192,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
 	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
 		seq_printf(m, " heap");
 	} else {
-		pid_t tid = vm_is_stack(proc_priv->task, vma, is_pid);
+		pid_t tid = vm_is_stack(task, vma, is_pid);
 		if (tid != 0) {
 			/*
 			 * Thread stack in /proc/PID/task/TID/maps or
-- 
cgit v1.1


From cd0b16c1c3cda12dbed1f8de8f1a9b0591990724 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 13 Oct 2012 00:30:28 -0400
Subject: NLM: nlm_lookup_file() may return NLMv4-specific error codes

If the filehandle is stale, or open access is denied for some reason,
nlm_fopen() may return one of the NLMv4-specific error codes nlm4_stale_fh
or nlm4_failed. These get passed right through nlm_lookup_file(),
and so when nlmsvc_retrieve_args() calls the latter, it needs to filter
the result through the cast_status() machinery.

Failure to do so, will trigger the BUG_ON() in encode_nlm_stat...

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Reported-by: Larry McVoy <lm@bitmover.com>
Cc: stable@kernel.org
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/lockd/clntxdr.c | 2 +-
 fs/lockd/svcproc.c | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c
index d269ada..982d267 100644
--- a/fs/lockd/clntxdr.c
+++ b/fs/lockd/clntxdr.c
@@ -223,7 +223,7 @@ static void encode_nlm_stat(struct xdr_stream *xdr,
 {
 	__be32 *p;
 
-	BUG_ON(be32_to_cpu(stat) > NLM_LCK_DENIED_GRACE_PERIOD);
+	WARN_ON_ONCE(be32_to_cpu(stat) > NLM_LCK_DENIED_GRACE_PERIOD);
 	p = xdr_reserve_space(xdr, 4);
 	*p = stat;
 }
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 3009a36..21171f0 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -68,7 +68,8 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
 
 	/* Obtain file pointer. Not used by FREE_ALL call. */
 	if (filp != NULL) {
-		if ((error = nlm_lookup_file(rqstp, &file, &lock->fh)) != 0)
+		error = cast_status(nlm_lookup_file(rqstp, &file, &lock->fh));
+		if (error != 0)
 			goto no_locks;
 		*filp = file;
 
-- 
cgit v1.1


From 43385846968b082bb6c174e8b17479e5123b8d73 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Wed, 17 Oct 2012 20:41:15 -0700
Subject: fs, xattr: fix bug when removing a name not in xattr list

Commit 38f38657444d ("xattr: extract simple_xattr code from tmpfs") moved
some code from tmpfs but introduced a subtle bug along the way.

If the name passed to simple_xattr_remove() does not exist in the list of
xattrs, then it is possible to call kfree(new_xattr) when new_xattr is
actually initialized to itself on the stack via uninitialized_var().

This causes a BUG() since the memory was not allocated via the slab
allocator and was not bypassed through to the page allocator because it
was too large.

Initialize the local variable to NULL so the kfree() never takes place.

Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: David Rientjes <rientjes@google.com>
Acked-by: Hugh Dickins <hughd@google.com>
Acked-by: Aristeu Rozanski <aris@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/xattr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xattr.c b/fs/xattr.c
index e164ddd..e21c119 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -846,7 +846,7 @@ static int __simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
 			      const void *value, size_t size, int flags)
 {
 	struct simple_xattr *xattr;
-	struct simple_xattr *uninitialized_var(new_xattr);
+	struct simple_xattr *new_xattr = NULL;
 	int err = 0;
 
 	/* value == NULL means remove */
-- 
cgit v1.1


From 9e7814404b77c3e8920bee4277162bf3a7460505 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Fri, 19 Oct 2012 17:00:55 +0900
Subject: hold task->mempolicy while numa_maps scans.

  /proc/<pid>/numa_maps scans vma and show mempolicy under
  mmap_sem. It sometimes accesses task->mempolicy which can
  be freed without mmap_sem and numa_maps can show some
  garbage while scanning.

This patch tries to take reference count of task->mempolicy at reading
numa_maps before calling get_vma_policy(). By this, task->mempolicy
will not be freed until numa_maps reaches its end.

V2->v3
  -  updated comments to be more verbose.
  -  removed task_lock() in numa_maps code.
V1->V2
  -  access task->mempolicy only once and remember it.  Becase kernel/exit.c
     can overwrite it.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: David Rientjes <rientjes@google.com>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/internal.h |  4 ++++
 fs/proc/task_mmu.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 51 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index cceaab0..43973b0 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -12,6 +12,7 @@
 #include <linux/sched.h>
 #include <linux/proc_fs.h>
 struct  ctl_table_header;
+struct  mempolicy;
 
 extern struct proc_dir_entry proc_root;
 #ifdef CONFIG_PROC_SYSCTL
@@ -74,6 +75,9 @@ struct proc_maps_private {
 #ifdef CONFIG_MMU
 	struct vm_area_struct *tail_vma;
 #endif
+#ifdef CONFIG_NUMA
+	struct mempolicy *task_mempolicy;
+#endif
 };
 
 void proc_init_inodecache(void);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 14df880..90c63f9 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -90,10 +90,55 @@ static void pad_len_spaces(struct seq_file *m, int len)
 	seq_printf(m, "%*c", len, ' ');
 }
 
+#ifdef CONFIG_NUMA
+/*
+ * These functions are for numa_maps but called in generic **maps seq_file
+ * ->start(), ->stop() ops.
+ *
+ * numa_maps scans all vmas under mmap_sem and checks their mempolicy.
+ * Each mempolicy object is controlled by reference counting. The problem here
+ * is how to avoid accessing dead mempolicy object.
+ *
+ * Because we're holding mmap_sem while reading seq_file, it's safe to access
+ * each vma's mempolicy, no vma objects will never drop refs to mempolicy.
+ *
+ * A task's mempolicy (task->mempolicy) has different behavior. task->mempolicy
+ * is set and replaced under mmap_sem but unrefed and cleared under task_lock().
+ * So, without task_lock(), we cannot trust get_vma_policy() because we cannot
+ * gurantee the task never exits under us. But taking task_lock() around
+ * get_vma_plicy() causes lock order problem.
+ *
+ * To access task->mempolicy without lock, we hold a reference count of an
+ * object pointed by task->mempolicy and remember it. This will guarantee
+ * that task->mempolicy points to an alive object or NULL in numa_maps accesses.
+ */
+static void hold_task_mempolicy(struct proc_maps_private *priv)
+{
+	struct task_struct *task = priv->task;
+
+	task_lock(task);
+	priv->task_mempolicy = task->mempolicy;
+	mpol_get(priv->task_mempolicy);
+	task_unlock(task);
+}
+static void release_task_mempolicy(struct proc_maps_private *priv)
+{
+	mpol_put(priv->task_mempolicy);
+}
+#else
+static void hold_task_mempolicy(struct proc_maps_private *priv)
+{
+}
+static void release_task_mempolicy(struct proc_maps_private *priv)
+{
+}
+#endif
+
 static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma)
 {
 	if (vma && vma != priv->tail_vma) {
 		struct mm_struct *mm = vma->vm_mm;
+		release_task_mempolicy(priv);
 		up_read(&mm->mmap_sem);
 		mmput(mm);
 	}
@@ -132,7 +177,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
 
 	tail_vma = get_gate_vma(priv->task->mm);
 	priv->tail_vma = tail_vma;
-
+	hold_task_mempolicy(priv);
 	/* Start with last addr hint */
 	vma = find_vma(mm, last_addr);
 	if (last_addr && vma) {
@@ -159,6 +204,7 @@ out:
 	if (vma)
 		return vma;
 
+	release_task_mempolicy(priv);
 	/* End of vmas has been reached */
 	m->version = (tail_vma != NULL)? 0: -1UL;
 	up_read(&mm->mmap_sem);
@@ -1178,11 +1224,9 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
 	walk.private = md;
 	walk.mm = mm;
 
-	task_lock(task);
 	pol = get_vma_policy(task, vma, vma->vm_start);
 	mpol_to_str(buffer, sizeof(buffer), pol, 0);
 	mpol_cond_put(pol);
-	task_unlock(task);
 
 	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
 
-- 
cgit v1.1