Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs

Pull second vfs pile from Al Viro: "The stuff in there: fsfreeze deadlock fixes by Jan (essentially, the deadlock reproduced by xfstests 068), symlink and hardlink restriction patches, plus assorted cleanups and fixes. Note that another fsfreeze deadlock (emergency thaw one) is *not* dealt with - the series by Fernando conflicts a lot with Jan's, breaks userland ABI (FIFREEZE semantics gets changed) and trades the deadlock for massive vfsmount leak; this is going to be handled next cycle. There probably will be another pull request, but that stuff won't be in it." Fix up trivial conflicts due to unrelated changes next to each other in drivers/{staging/gdm72xx/usb_boot.c, usb/gadget/storage_common.c} * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs: (54 commits) delousing target_core_file a bit Documentation: Correct s_umount state for freeze_fs/unfreeze_fs fs: Remove old freezing mechanism ext2: Implement freezing btrfs: Convert to new freezing mechanism nilfs2: Convert to new freezing mechanism ntfs: Convert to new freezing mechanism fuse: Convert to new freezing mechanism gfs2: Convert to new freezing mechanism ocfs2: Convert to new freezing mechanism xfs: Convert to new freezing code ext4: Convert to new freezing mechanism fs: Protect write paths by sb_start_write - sb_end_write fs: Skip atime update on frozen filesystem fs: Add freezing handling to mnt_want_write() / mnt_drop_write() fs: Improve filesystem freezing handling switch the protection of percpu_counter list to spinlock nfsd: Push mnt_want_write() outside of i_mutex btrfs: Push mnt_want_write() outside of i_mutex fat: Push mnt_want_write() outside of i_mutex ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2012-08-01 10:26:23 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2012-08-01 10:26:23 -0700
commit: a0e881b7c189fa2bd76c024dbff91e79511c971d (patch)
tree: 0c801918565b08921d21aceee5b326f64d998f5f /fs/super.c
parent: eff0d13f3823f35d70228cd151d2a2c89288ff32 (diff)
parent: dbc6e0222d79e78925fe20733844a796a4b72cf9 (diff)
download: op-kernel-dev-a0e881b7c189fa2bd76c024dbff91e79511c971d.zip
op-kernel-dev-a0e881b7c189fa2bd76c024dbff91e79511c971d.tar.gz
1 files changed, 230 insertions, 22 deletions
diff --git a/fs/super.c b/fs/super.c
index 4bf7144..b05cf47 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -33,12 +33,19 @@
 #include <linux/rculist_bl.h>
 #include <linux/cleancache.h>
 #include <linux/fsnotify.h>
+#include <linux/lockdep.h>
 #include "internal.h"
 
 
 LIST_HEAD(super_blocks);
 DEFINE_SPINLOCK(sb_lock);
 
+static char *sb_writers_name[SB_FREEZE_LEVELS] = {
+	"sb_writers",
+	"sb_pagefaults",
+	"sb_internal",
+};
+
 /*
  * One thing we have to be careful of with a per-sb shrinker is that we don't
  * drop the last active reference to the superblock from within the shrinker.
@@ -102,6 +109,35 @@ static int prune_super(struct shrinker *shrink, struct shrink_control *sc)
 	return total_objects;
 }
 
+static int init_sb_writers(struct super_block *s, struct file_system_type *type)
+{
+	int err;
+	int i;
+
+	for (i = 0; i < SB_FREEZE_LEVELS; i++) {
+		err = percpu_counter_init(&s->s_writers.counter[i], 0);
+		if (err < 0)
+			goto err_out;
+		lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i],
+				 &type->s_writers_key[i], 0);
+	}
+	init_waitqueue_head(&s->s_writers.wait);
+	init_waitqueue_head(&s->s_writers.wait_unfrozen);
+	return 0;
+err_out:
+	while (--i >= 0)
+		percpu_counter_destroy(&s->s_writers.counter[i]);
+	return err;
+}
+
+static void destroy_sb_writers(struct super_block *s)
+{
+	int i;
+
+	for (i = 0; i < SB_FREEZE_LEVELS; i++)
+		percpu_counter_destroy(&s->s_writers.counter[i]);
+}
+
 /**
  *	alloc_super	-	create new superblock
  *	@type:	filesystem type superblock should belong to
@@ -117,18 +153,19 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
 
 	if (s) {
 		if (security_sb_alloc(s)) {
+			/*
+			 * We cannot call security_sb_free() without
+			 * security_sb_alloc() succeeding. So bail out manually
+			 */
 			kfree(s);
 			s = NULL;
 			goto out;
 		}
 #ifdef CONFIG_SMP
 		s->s_files = alloc_percpu(struct list_head);
-		if (!s->s_files) {
-			security_sb_free(s);
-			kfree(s);
-			s = NULL;
-			goto out;
-		} else {
+		if (!s->s_files)
+			goto err_out;
+		else {
 			int i;
 
 			for_each_possible_cpu(i)
@@ -137,6 +174,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
 #else
 		INIT_LIST_HEAD(&s->s_files);
 #endif
+		if (init_sb_writers(s, type))
+			goto err_out;
 		s->s_flags = flags;
 		s->s_bdi = &default_backing_dev_info;
 		INIT_HLIST_NODE(&s->s_instances);
@@ -178,7 +217,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
 		mutex_init(&s->s_dquot.dqio_mutex);
 		mutex_init(&s->s_dquot.dqonoff_mutex);
 		init_rwsem(&s->s_dquot.dqptr_sem);
-		init_waitqueue_head(&s->s_wait_unfrozen);
 		s->s_maxbytes = MAX_NON_LFS;
 		s->s_op = &default_op;
 		s->s_time_gran = 1000000000;
@@ -190,6 +228,16 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
 	}
 out:
 	return s;
+err_out:
+	security_sb_free(s);
+#ifdef CONFIG_SMP
+	if (s->s_files)
+		free_percpu(s->s_files);
+#endif
+	destroy_sb_writers(s);
+	kfree(s);
+	s = NULL;
+	goto out;
 }
 
 /**
@@ -203,6 +251,7 @@ static inline void destroy_super(struct super_block *s)
 #ifdef CONFIG_SMP
 	free_percpu(s->s_files);
 #endif
+	destroy_sb_writers(s);
 	security_sb_free(s);
 	WARN_ON(!list_empty(&s->s_mounts));
 	kfree(s->s_subtype);
@@ -651,10 +700,11 @@ struct super_block *get_super_thawed(struct block_device *bdev)
 {
 	while (1) {
 		struct super_block *s = get_super(bdev);
-		if (!s || s->s_frozen == SB_UNFROZEN)
+		if (!s || s->s_writers.frozen == SB_UNFROZEN)
 			return s;
 		up_read(&s->s_umount);
-		vfs_check_frozen(s, SB_FREEZE_WRITE);
+		wait_event(s->s_writers.wait_unfrozen,
+			   s->s_writers.frozen == SB_UNFROZEN);
 		put_super(s);
 	}
 }
@@ -732,7 +782,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 	int retval;
 	int remount_ro;
 
-	if (sb->s_frozen != SB_UNFROZEN)
+	if (sb->s_writers.frozen != SB_UNFROZEN)
 		return -EBUSY;
 
 #ifdef CONFIG_BLOCK
@@ -1163,6 +1213,120 @@ out:
 	return ERR_PTR(error);
 }
 
+/*
+ * This is an internal function, please use sb_end_{write,pagefault,intwrite}
+ * instead.
+ */
+void __sb_end_write(struct super_block *sb, int level)
+{
+	percpu_counter_dec(&sb->s_writers.counter[level-1]);
+	/*
+	 * Make sure s_writers are updated before we wake up waiters in
+	 * freeze_super().
+	 */
+	smp_mb();
+	if (waitqueue_active(&sb->s_writers.wait))
+		wake_up(&sb->s_writers.wait);
+	rwsem_release(&sb->s_writers.lock_map[level-1], 1, _RET_IP_);
+}
+EXPORT_SYMBOL(__sb_end_write);
+
+#ifdef CONFIG_LOCKDEP
+/*
+ * We want lockdep to tell us about possible deadlocks with freezing but
+ * it's it bit tricky to properly instrument it. Getting a freeze protection
+ * works as getting a read lock but there are subtle problems. XFS for example
+ * gets freeze protection on internal level twice in some cases, which is OK
+ * only because we already hold a freeze protection also on higher level. Due
+ * to these cases we have to tell lockdep we are doing trylock when we
+ * already hold a freeze protection for a higher freeze level.
+ */
+static void acquire_freeze_lock(struct super_block *sb, int level, bool trylock,
+				unsigned long ip)
+{
+	int i;
+
+	if (!trylock) {
+		for (i = 0; i < level - 1; i++)
+			if (lock_is_held(&sb->s_writers.lock_map[i])) {
+				trylock = true;
+				break;
+			}
+	}
+	rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, trylock, ip);
+}
+#endif
+
+/*
+ * This is an internal function, please use sb_start_{write,pagefault,intwrite}
+ * instead.
+ */
+int __sb_start_write(struct super_block *sb, int level, bool wait)
+{
+retry:
+	if (unlikely(sb->s_writers.frozen >= level)) {
+		if (!wait)
+			return 0;
+		wait_event(sb->s_writers.wait_unfrozen,
+			   sb->s_writers.frozen < level);
+	}
+
+#ifdef CONFIG_LOCKDEP
+	acquire_freeze_lock(sb, level, !wait, _RET_IP_);
+#endif
+	percpu_counter_inc(&sb->s_writers.counter[level-1]);
+	/*
+	 * Make sure counter is updated before we check for frozen.
+	 * freeze_super() first sets frozen and then checks the counter.
+	 */
+	smp_mb();
+	if (unlikely(sb->s_writers.frozen >= level)) {
+		__sb_end_write(sb, level);
+		goto retry;
+	}
+	return 1;
+}
+EXPORT_SYMBOL(__sb_start_write);
+
+/**
+ * sb_wait_write - wait until all writers to given file system finish
+ * @sb: the super for which we wait
+ * @level: type of writers we wait for (normal vs page fault)
+ *
+ * This function waits until there are no writers of given type to given file
+ * system. Caller of this function should make sure there can be no new writers
+ * of type @level before calling this function. Otherwise this function can
+ * livelock.
+ */
+static void sb_wait_write(struct super_block *sb, int level)
+{
+	s64 writers;
+
+	/*
+	 * We just cycle-through lockdep here so that it does not complain
+	 * about returning with lock to userspace
+	 */
+	rwsem_acquire(&sb->s_writers.lock_map[level-1], 0, 0, _THIS_IP_);
+	rwsem_release(&sb->s_writers.lock_map[level-1], 1, _THIS_IP_);
+
+	do {
+		DEFINE_WAIT(wait);
+
+		/*
+		 * We use a barrier in prepare_to_wait() to separate setting
+		 * of frozen and checking of the counter
+		 */
+		prepare_to_wait(&sb->s_writers.wait, &wait,
+				TASK_UNINTERRUPTIBLE);
+
+		writers = percpu_counter_sum(&sb->s_writers.counter[level-1]);
+		if (writers)
+			schedule();
+
+		finish_wait(&sb->s_writers.wait, &wait);
+	} while (writers);
+}
+
 /**
  * freeze_super - lock the filesystem and force it into a consistent state
  * @sb: the super to lock
@@ -1170,6 +1334,31 @@ out:
  * Syncs the super to make sure the filesystem is consistent and calls the fs's
  * freeze_fs.  Subsequent calls to this without first thawing the fs will return
  * -EBUSY.
+ *
+ * During this function, sb->s_writers.frozen goes through these values:
+ *
+ * SB_UNFROZEN: File system is normal, all writes progress as usual.
+ *
+ * SB_FREEZE_WRITE: The file system is in the process of being frozen.  New
+ * writes should be blocked, though page faults are still allowed. We wait for
+ * all writes to complete and then proceed to the next stage.
+ *
+ * SB_FREEZE_PAGEFAULT: Freezing continues. Now also page faults are blocked
+ * but internal fs threads can still modify the filesystem (although they
+ * should not dirty new pages or inodes), writeback can run etc. After waiting
+ * for all running page faults we sync the filesystem which will clean all
+ * dirty pages and inodes (no new dirty pages or inodes can be created when
+ * sync is running).
+ *
+ * SB_FREEZE_FS: The file system is frozen. Now all internal sources of fs
+ * modification are blocked (e.g. XFS preallocation truncation on inode
+ * reclaim). This is usually implemented by blocking new transactions for
+ * filesystems that have them and need this additional guard. After all
+ * internal writers are finished we call ->freeze_fs() to finish filesystem
+ * freezing. Then we transition to SB_FREEZE_COMPLETE state. This state is
+ * mostly auxiliary for filesystems to verify they do not modify frozen fs.
+ *
+ * sb->s_writers.frozen is protected by sb->s_umount.
  */
 int freeze_super(struct super_block *sb)
 {
@@ -1177,7 +1366,7 @@ int freeze_super(struct super_block *sb)
 
 	atomic_inc(&sb->s_active);
 	down_write(&sb->s_umount);
-	if (sb->s_frozen) {
+	if (sb->s_writers.frozen != SB_UNFROZEN) {
 		deactivate_locked_super(sb);
 		return -EBUSY;
 	}
@@ -1188,33 +1377,53 @@ int freeze_super(struct super_block *sb)
 	}
 
 	if (sb->s_flags & MS_RDONLY) {
-		sb->s_frozen = SB_FREEZE_TRANS;
-		smp_wmb();
+		/* Nothing to do really... */
+		sb->s_writers.frozen = SB_FREEZE_COMPLETE;
 		up_write(&sb->s_umount);
 		return 0;
 	}
 
-	sb->s_frozen = SB_FREEZE_WRITE;
+	/* From now on, no new normal writers can start */
+	sb->s_writers.frozen = SB_FREEZE_WRITE;
+	smp_wmb();
+
+	/* Release s_umount to preserve sb_start_write -> s_umount ordering */
+	up_write(&sb->s_umount);
+
+	sb_wait_write(sb, SB_FREEZE_WRITE);
+
+	/* Now we go and block page faults... */
+	down_write(&sb->s_umount);
+	sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
 	smp_wmb();
 
+	sb_wait_write(sb, SB_FREEZE_PAGEFAULT);
+
+	/* All writers are done so after syncing there won't be dirty data */
 	sync_filesystem(sb);
 
-	sb->s_frozen = SB_FREEZE_TRANS;
+	/* Now wait for internal filesystem counter */
+	sb->s_writers.frozen = SB_FREEZE_FS;
 	smp_wmb();
+	sb_wait_write(sb, SB_FREEZE_FS);
 
-	sync_blockdev(sb->s_bdev);
 	if (sb->s_op->freeze_fs) {
 		ret = sb->s_op->freeze_fs(sb);
 		if (ret) {
 			printk(KERN_ERR
 				"VFS:Filesystem freeze failed\n");
-			sb->s_frozen = SB_UNFROZEN;
+			sb->s_writers.frozen = SB_UNFROZEN;
 			smp_wmb();
-			wake_up(&sb->s_wait_unfrozen);
+			wake_up(&sb->s_writers.wait_unfrozen);
 			deactivate_locked_super(sb);
 			return ret;
 		}
 	}
+	/*
+	 * This is just for debugging purposes so that fs can warn if it
+	 * sees write activity when frozen is set to SB_FREEZE_COMPLETE.
+	 */
+	sb->s_writers.frozen = SB_FREEZE_COMPLETE;
 	up_write(&sb->s_umount);
 	return 0;
 }
@@ -1231,7 +1440,7 @@ int thaw_super(struct super_block *sb)
 	int error;
 
 	down_write(&sb->s_umount);
-	if (sb->s_frozen == SB_UNFROZEN) {
+	if (sb->s_writers.frozen == SB_UNFROZEN) {
 		up_write(&sb->s_umount);
 		return -EINVAL;
 	}
@@ -1244,16 +1453,15 @@ int thaw_super(struct super_block *sb)
 		if (error) {
 			printk(KERN_ERR
 				"VFS:Filesystem thaw failed\n");
-			sb->s_frozen = SB_FREEZE_TRANS;
 			up_write(&sb->s_umount);
 			return error;
 		}
 	}
 
 out:
-	sb->s_frozen = SB_UNFROZEN;
+	sb->s_writers.frozen = SB_UNFROZEN;
 	smp_wmb();
-	wake_up(&sb->s_wait_unfrozen);
+	wake_up(&sb->s_writers.wait_unfrozen);
 	deactivate_locked_super(sb);
 
 	return 0;
author	Linus Torvalds <torvalds@linux-foundation.org>	2012-08-01 10:26:23 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-08-01 10:26:23 -0700
commit	a0e881b7c189fa2bd76c024dbff91e79511c971d (patch)
tree	0c801918565b08921d21aceee5b326f64d998f5f /fs/super.c
parent	eff0d13f3823f35d70228cd151d2a2c89288ff32 (diff)
parent	dbc6e0222d79e78925fe20733844a796a4b72cf9 (diff)
download	op-kernel-dev-a0e881b7c189fa2bd76c024dbff91e79511c971d.zip op-kernel-dev-a0e881b7c189fa2bd76c024dbff91e79511c971d.tar.gz