188 files changed, 6549 insertions, 5386 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index f3aa9b0..979992d 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -121,9 +121,25 @@ config TMPFS
 
 	  See <file:Documentation/filesystems/tmpfs.txt> for details.
 
+config TMPFS_XATTR
+	bool "Tmpfs extended attributes"
+	depends on TMPFS
+	default n
+	help
+	  Extended attributes are name:value pairs associated with inodes by
+	  the kernel or by users (see the attr(5) manual page, or visit
+	  <http://acl.bestbits.at/> for details).
+
+	  Currently this enables support for the trusted.* and
+	  security.* namespaces.
+
+	  If unsure, say N.
+
+	  You need this for POSIX ACL support on tmpfs.
+
 config TMPFS_POSIX_ACL
 	bool "Tmpfs POSIX Access Control Lists"
-	depends on TMPFS
+	depends on TMPFS_XATTR
 	select GENERIC_ACL
 	help
 	  POSIX Access Control Lists (ACLs) support permissions for users and
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 397d305..1bffbe0 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -820,6 +820,8 @@ static int load_flat_shared_library(int id, struct lib_info *libs)
 	int res;
 	char buf[16];
 
+	memset(&bprm, 0, sizeof(bprm));
+
 	/* Create the file name */
 	sprintf(buf, "/lib/lib%d.so", id);
 
@@ -835,6 +837,12 @@ static int load_flat_shared_library(int id, struct lib_info *libs)
 	if (!bprm.cred)
 		goto out;
 
+	/* We don't really care about recalculating credentials at this point
+	 * as we're past the point of no return and are dealing with shared
+	 * libraries.
+	 */
+	bprm.cred_prepared = 1;
+
 	res = prepare_binprm(&bprm);
 
 	if (!IS_ERR_VALUE(res))
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index ba41da5..96fcfa5 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -10,6 +10,7 @@
 #include <linux/swap.h>
 #include <linux/writeback.h>
 #include <linux/pagevec.h>
+#include <linux/prefetch.h>
 #include "extent_io.h"
 #include "extent_map.h"
 #include "compat.h"
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 199a801..f340f7c 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -709,7 +709,7 @@ again:
 	WARN_ON(cur->checked);
 	if (!list_empty(&cur->upper)) {
 		/*
-		 * the backref was added previously when processsing
+		 * the backref was added previously when processing
 		 * backref of type BTRFS_TREE_BLOCK_REF_KEY
 		 */
 		BUG_ON(!list_is_singular(&cur->upper));
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 7cb0f7f..75c47cd 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -7,6 +7,7 @@ config CIFS
 	select CRYPTO_MD5
 	select CRYPTO_HMAC
 	select CRYPTO_ARC4
+	select CRYPTO_DES
 	help
 	  This is the client VFS module for the Common Internet File System
 	  (CIFS) protocol which is the successor to the Server Message Block
@@ -152,16 +153,28 @@ config CIFS_ACL
 	    Allows to fetch CIFS/NTFS ACL from the server.  The DACL blob
 	    is handed over to the application/caller.
 
-config CIFS_EXPERIMENTAL
-	  bool "CIFS Experimental Features (EXPERIMENTAL)"
+config CIFS_SMB2
+	bool "SMB2 network file system support (EXPERIMENTAL)"
+	depends on EXPERIMENTAL && INET && BROKEN
+	select NLS
+	select KEYS
+	select FSCACHE
+	select DNS_RESOLVER
+
+	help
+	  This enables experimental support for the SMB2 (Server Message Block
+	  version 2) protocol. The SMB2 protocol is the successor to the
+	  popular CIFS and SMB network file sharing protocols. SMB2 is the
+	  native file sharing mechanism for recent versions of Windows
+	  operating systems (since Vista).  SMB2 enablement will eventually
+	  allow users better performance, security and features, than would be
+	  possible with cifs. Note that smb2 mount options also are simpler
+	  (compared to cifs) due to protocol improvements.
+
+	  Unless you are a developer or tester, say N.
+
+config CIFS_NFSD_EXPORT
+	  bool "Allow nfsd to export CIFS file system (EXPERIMENTAL)"
 	  depends on CIFS && EXPERIMENTAL
 	  help
-	    Enables cifs features under testing. These features are
-	    experimental and currently include DFS support and directory
-	    change notification ie fcntl(F_DNOTIFY), as well as the upcall
-	    mechanism which will be used for Kerberos session negotiation
-	    and uid remapping.  Some of these features also may depend on
-	    setting a value of 1 to the pseudo-file /proc/fs/cifs/Experimental
-	    (which is disabled by default). See the file fs/cifs/README
-	    for more details.  If unsure, say N.
-
+	   Allows NFS server to export a CIFS mounted share (nfsd over cifs)
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index d875584..005d524 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -4,7 +4,7 @@
 obj-$(CONFIG_CIFS) += cifs.o
 
 cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
-	  link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \
+	  link.o misc.o netmisc.o smbencrypt.o transport.o asn1.o \
 	  cifs_unicode.o nterr.o xattr.o cifsencrypt.o \
 	  readdir.o ioctl.o sess.o export.o
 
diff --git a/fs/cifs/README b/fs/cifs/README
index 74ab165f..4a3ca0e 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -704,18 +704,6 @@ the start of smb requests and responses can be enabled via:
 
 	echo 1 > /proc/fs/cifs/traceSMB
 
-Two other experimental features are under development. To test these
-requires enabling CONFIG_CIFS_EXPERIMENTAL
-
-	cifsacl support needed to retrieve approximated mode bits based on
-		the contents on the CIFS ACL.
-
-	lease support: cifs will check the oplock state before calling into
-	the vfs to see if we can grant a lease on a file.
-
-	DNOTIFY fcntl: needed for support of directory change 
-			    notification and perhaps later for file leases)
-
 Per share (per client mount) statistics are available in /proc/fs/cifs/Stats
 if the kernel was configured with cifs statistics enabled.  The statistics
 represent the number of successful (ie non-zero return code from the server) 
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 30d01bc..18f4272 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -63,7 +63,7 @@ void cifs_dump_detail(struct smb_hdr *smb)
 	cERROR(1, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d",
 		  smb->Command, smb->Status.CifsError,
 		  smb->Flags, smb->Flags2, smb->Mid, smb->Pid);
-	cERROR(1, "smb buf %p len %d", smb, smbCalcSize_LE(smb));
+	cERROR(1, "smb buf %p len %d", smb, smbCalcSize(smb));
 }
 
 
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index ac51cd2..a9d5692 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -58,9 +58,7 @@ struct cifs_sb_info {
 	unsigned int mnt_cifs_flags;
 	int	prepathlen;
 	char   *prepath; /* relative path under the share to mount to */
-#ifdef CONFIG_CIFS_DFS_UPCALL
-	char   *mountdata; /* mount options received at mount time */
-#endif
+	char   *mountdata; /* options received at mount time or via DFS refs */
 	struct backing_dev_info bdi;
 	struct delayed_work prune_tlinks;
 };
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index 644dd882..6d02fd5 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -82,6 +82,9 @@ int cifs_strtoUCS(__le16 *, const char *, int, const struct nls_table *);
 char *cifs_strndup_from_ucs(const char *src, const int maxlen,
 			    const bool is_unicode,
 			    const struct nls_table *codepage);
+extern int cifsConvertToUCS(__le16 *target, const char *source, int maxlen,
+			const struct nls_table *cp, int mapChars);
+
 #endif
 
 /*
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index beeebf1..f3c6fb9 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -23,24 +23,16 @@
 
 #include <linux/fs.h>
 #include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/keyctl.h>
+#include <linux/key-type.h>
+#include <keys/user-type.h>
 #include "cifspdu.h"
 #include "cifsglob.h"
 #include "cifsacl.h"
 #include "cifsproto.h"
 #include "cifs_debug.h"
 
-
-static struct cifs_wksid wksidarr[NUM_WK_SIDS] = {
-	{{1, 0, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0} }, "null user"},
-	{{1, 1, {0, 0, 0, 0, 0, 1}, {0, 0, 0, 0, 0} }, "nobody"},
-	{{1, 1, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(11), 0, 0, 0, 0} }, "net-users"},
-	{{1, 1, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(18), 0, 0, 0, 0} }, "sys"},
-	{{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(544), 0, 0, 0} }, "root"},
-	{{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(545), 0, 0, 0} }, "users"},
-	{{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(546), 0, 0, 0} }, "guest"} }
-;
-
-
 /* security id for everyone/world system group */
 static const struct cifs_sid sid_everyone = {
 	1, 1, {0, 0, 0, 0, 0, 1}, {0} };
@@ -50,50 +42,385 @@ static const struct cifs_sid sid_authusers = {
 /* group users */
 static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} };
 
+const struct cred *root_cred;
 
-int match_sid(struct cifs_sid *ctsid)
+static void
+shrink_idmap_tree(struct rb_root *root, int nr_to_scan, int *nr_rem,
+			int *nr_del)
 {
-	int i, j;
-	int num_subauth, num_sat, num_saw;
-	struct cifs_sid *cwsid;
+	struct rb_node *node;
+	struct rb_node *tmp;
+	struct cifs_sid_id *psidid;
+
+	node = rb_first(root);
+	while (node) {
+		tmp = node;
+		node = rb_next(tmp);
+		psidid = rb_entry(tmp, struct cifs_sid_id, rbnode);
+		if (nr_to_scan == 0 || *nr_del == nr_to_scan)
+			++(*nr_rem);
+		else {
+			if (time_after(jiffies, psidid->time + SID_MAP_EXPIRE)
+						&& psidid->refcount == 0) {
+				rb_erase(tmp, root);
+				++(*nr_del);
+			} else
+				++(*nr_rem);
+		}
+	}
+}
+
+/*
+ * Run idmap cache shrinker.
+ */
+static int
+cifs_idmap_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
+{
+	int nr_del = 0;
+	int nr_rem = 0;
+	struct rb_root *root;
+
+	root = &uidtree;
+	spin_lock(&siduidlock);
+	shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del);
+	spin_unlock(&siduidlock);
+
+	root = &gidtree;
+	spin_lock(&sidgidlock);
+	shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del);
+	spin_unlock(&sidgidlock);
+
+	return nr_rem;
+}
+
+static struct shrinker cifs_shrinker = {
+	.shrink = cifs_idmap_shrinker,
+	.seeks = DEFAULT_SEEKS,
+};
+
+static int
+cifs_idmap_key_instantiate(struct key *key, const void *data, size_t datalen)
+{
+	char *payload;
+
+	payload = kmalloc(datalen, GFP_KERNEL);
+	if (!payload)
+		return -ENOMEM;
+
+	memcpy(payload, data, datalen);
+	key->payload.data = payload;
+	return 0;
+}
+
+static inline void
+cifs_idmap_key_destroy(struct key *key)
+{
+	kfree(key->payload.data);
+}
 
-	if (!ctsid)
-		return -1;
+struct key_type cifs_idmap_key_type = {
+	.name        = "cifs.idmap",
+	.instantiate = cifs_idmap_key_instantiate,
+	.destroy     = cifs_idmap_key_destroy,
+	.describe    = user_describe,
+	.match       = user_match,
+};
+
+static void
+sid_to_str(struct cifs_sid *sidptr, char *sidstr)
+{
+	int i;
+	unsigned long saval;
+	char *strptr;
 
-	for (i = 0; i < NUM_WK_SIDS; ++i) {
-		cwsid = &(wksidarr[i].cifssid);
+	strptr = sidstr;
 
-		/* compare the revision */
-		if (ctsid->revision != cwsid->revision)
-			continue;
+	sprintf(strptr, "%s", "S");
+	strptr = sidstr + strlen(sidstr);
 
-		/* compare all of the six auth values */
-		for (j = 0; j < 6; ++j) {
-			if (ctsid->authority[j] != cwsid->authority[j])
-				break;
+	sprintf(strptr, "-%d", sidptr->revision);
+	strptr = sidstr + strlen(sidstr);
+
+	for (i = 0; i < 6; ++i) {
+		if (sidptr->authority[i]) {
+			sprintf(strptr, "-%d", sidptr->authority[i]);
+			strptr = sidstr + strlen(sidstr);
 		}
-		if (j < 6)
-			continue; /* all of the auth values did not match */
-
-		/* compare all of the subauth values if any */
-		num_sat = ctsid->num_subauth;
-		num_saw = cwsid->num_subauth;
-		num_subauth = num_sat < num_saw ? num_sat : num_saw;
-		if (num_subauth) {
-			for (j = 0; j < num_subauth; ++j) {
-				if (ctsid->sub_auth[j] != cwsid->sub_auth[j])
-					break;
-			}
-			if (j < num_subauth)
-				continue; /* all sub_auth values do not match */
+	}
+
+	for (i = 0; i < sidptr->num_subauth; ++i) {
+		saval = le32_to_cpu(sidptr->sub_auth[i]);
+		sprintf(strptr, "-%ld", saval);
+		strptr = sidstr + strlen(sidstr);
+	}
+}
+
+static void
+id_rb_insert(struct rb_root *root, struct cifs_sid *sidptr,
+		struct cifs_sid_id **psidid, char *typestr)
+{
+	int rc;
+	char *strptr;
+	struct rb_node *node = root->rb_node;
+	struct rb_node *parent = NULL;
+	struct rb_node **linkto = &(root->rb_node);
+	struct cifs_sid_id *lsidid;
+
+	while (node) {
+		lsidid = rb_entry(node, struct cifs_sid_id, rbnode);
+		parent = node;
+		rc = compare_sids(sidptr, &((lsidid)->sid));
+		if (rc > 0) {
+			linkto = &(node->rb_left);
+			node = node->rb_left;
+		} else if (rc < 0) {
+			linkto = &(node->rb_right);
+			node = node->rb_right;
+		}
+	}
+
+	memcpy(&(*psidid)->sid, sidptr, sizeof(struct cifs_sid));
+	(*psidid)->time = jiffies - (SID_MAP_RETRY + 1);
+	(*psidid)->refcount = 0;
+
+	sprintf((*psidid)->sidstr, "%s", typestr);
+	strptr = (*psidid)->sidstr + strlen((*psidid)->sidstr);
+	sid_to_str(&(*psidid)->sid, strptr);
+
+	clear_bit(SID_ID_PENDING, &(*psidid)->state);
+	clear_bit(SID_ID_MAPPED, &(*psidid)->state);
+
+	rb_link_node(&(*psidid)->rbnode, parent, linkto);
+	rb_insert_color(&(*psidid)->rbnode, root);
+}
+
+static struct cifs_sid_id *
+id_rb_search(struct rb_root *root, struct cifs_sid *sidptr)
+{
+	int rc;
+	struct rb_node *node = root->rb_node;
+	struct cifs_sid_id *lsidid;
+
+	while (node) {
+		lsidid = rb_entry(node, struct cifs_sid_id, rbnode);
+		rc = compare_sids(sidptr, &((lsidid)->sid));
+		if (rc > 0) {
+			node = node->rb_left;
+		} else if (rc < 0) {
+			node = node->rb_right;
+		} else /* node found */
+			return lsidid;
+	}
+
+	return NULL;
+}
+
+static int
+sidid_pending_wait(void *unused)
+{
+	schedule();
+	return signal_pending(current) ? -ERESTARTSYS : 0;
+}
+
+static int
+sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid,
+		struct cifs_fattr *fattr, uint sidtype)
+{
+	int rc;
+	unsigned long cid;
+	struct key *idkey;
+	const struct cred *saved_cred;
+	struct cifs_sid_id *psidid, *npsidid;
+	struct rb_root *cidtree;
+	spinlock_t *cidlock;
+
+	if (sidtype == SIDOWNER) {
+		cid = cifs_sb->mnt_uid; /* default uid, in case upcall fails */
+		cidlock = &siduidlock;
+		cidtree = &uidtree;
+	} else if (sidtype == SIDGROUP) {
+		cid = cifs_sb->mnt_gid; /* default gid, in case upcall fails */
+		cidlock = &sidgidlock;
+		cidtree = &gidtree;
+	} else
+		return -ENOENT;
+
+	spin_lock(cidlock);
+	psidid = id_rb_search(cidtree, psid);
+
+	if (!psidid) { /* node does not exist, allocate one & attempt adding */
+		spin_unlock(cidlock);
+		npsidid = kzalloc(sizeof(struct cifs_sid_id), GFP_KERNEL);
+		if (!npsidid)
+			return -ENOMEM;
+
+		npsidid->sidstr = kmalloc(SIDLEN, GFP_KERNEL);
+		if (!npsidid->sidstr) {
+			kfree(npsidid);
+			return -ENOMEM;
+		}
+
+		spin_lock(cidlock);
+		psidid = id_rb_search(cidtree, psid);
+		if (psidid) { /* node happened to get inserted meanwhile */
+			++psidid->refcount;
+			spin_unlock(cidlock);
+			kfree(npsidid->sidstr);
+			kfree(npsidid);
+		} else {
+			psidid = npsidid;
+			id_rb_insert(cidtree, psid, &psidid,
+					sidtype == SIDOWNER ? "os:" : "gs:");
+			++psidid->refcount;
+			spin_unlock(cidlock);
 		}
+	} else {
+		++psidid->refcount;
+		spin_unlock(cidlock);
+	}
+
+	/*
+	 * If we are here, it is safe to access psidid and its fields
+	 * since a reference was taken earlier while holding the spinlock.
+	 * A reference on the node is put without holding the spinlock
+	 * and it is OK to do so in this case, shrinker will not erase
+	 * this node until all references are put and we do not access
+	 * any fields of the node after a reference is put .
+	 */
+	if (test_bit(SID_ID_MAPPED, &psidid->state)) {
+		cid = psidid->id;
+		psidid->time = jiffies; /* update ts for accessing */
+		goto sid_to_id_out;
+	}
 
-		cFYI(1, "matching sid: %s\n", wksidarr[i].sidname);
-		return 0; /* sids compare/match */
+	if (time_after(psidid->time + SID_MAP_RETRY, jiffies))
+		goto sid_to_id_out;
+
+	if (!test_and_set_bit(SID_ID_PENDING, &psidid->state)) {
+		saved_cred = override_creds(root_cred);
+		idkey = request_key(&cifs_idmap_key_type, psidid->sidstr, "");
+		if (IS_ERR(idkey))
+			cFYI(1, "%s: Can't map SID to an id", __func__);
+		else {
+			cid = *(unsigned long *)idkey->payload.value;
+			psidid->id = cid;
+			set_bit(SID_ID_MAPPED, &psidid->state);
+			key_put(idkey);
+			kfree(psidid->sidstr);
+		}
+		revert_creds(saved_cred);
+		psidid->time = jiffies; /* update ts for accessing */
+		clear_bit(SID_ID_PENDING, &psidid->state);
+		wake_up_bit(&psidid->state, SID_ID_PENDING);
+	} else {
+		rc = wait_on_bit(&psidid->state, SID_ID_PENDING,
+				sidid_pending_wait, TASK_INTERRUPTIBLE);
+		if (rc) {
+			cFYI(1, "%s: sidid_pending_wait interrupted %d",
+					__func__, rc);
+			--psidid->refcount; /* decremented without spinlock */
+			return rc;
+		}
+		if (test_bit(SID_ID_MAPPED, &psidid->state))
+			cid = psidid->id;
 	}
 
-	cFYI(1, "No matching sid");
-	return -1;
+sid_to_id_out:
+	--psidid->refcount; /* decremented without spinlock */
+	if (sidtype == SIDOWNER)
+		fattr->cf_uid = cid;
+	else
+		fattr->cf_gid = cid;
+
+	return 0;
+}
+
+int
+init_cifs_idmap(void)
+{
+	struct cred *cred;
+	struct key *keyring;
+	int ret;
+
+	cFYI(1, "Registering the %s key type\n", cifs_idmap_key_type.name);
+
+	/* create an override credential set with a special thread keyring in
+	 * which requests are cached
+	 *
+	 * this is used to prevent malicious redirections from being installed
+	 * with add_key().
+	 */
+	cred = prepare_kernel_cred(NULL);
+	if (!cred)
+		return -ENOMEM;
+
+	keyring = key_alloc(&key_type_keyring, ".cifs_idmap", 0, 0, cred,
+			    (KEY_POS_ALL & ~KEY_POS_SETATTR) |
+			    KEY_USR_VIEW | KEY_USR_READ,
+			    KEY_ALLOC_NOT_IN_QUOTA);
+	if (IS_ERR(keyring)) {
+		ret = PTR_ERR(keyring);
+		goto failed_put_cred;
+	}
+
+	ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL);
+	if (ret < 0)
+		goto failed_put_key;
+
+	ret = register_key_type(&cifs_idmap_key_type);
+	if (ret < 0)
+		goto failed_put_key;
+
+	/* instruct request_key() to use this special keyring as a cache for
+	 * the results it looks up */
+	cred->thread_keyring = keyring;
+	cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
+	root_cred = cred;
+
+	spin_lock_init(&siduidlock);
+	uidtree = RB_ROOT;
+	spin_lock_init(&sidgidlock);
+	gidtree = RB_ROOT;
+
+	register_shrinker(&cifs_shrinker);
+
+	cFYI(1, "cifs idmap keyring: %d\n", key_serial(keyring));
+	return 0;
+
+failed_put_key:
+	key_put(keyring);
+failed_put_cred:
+	put_cred(cred);
+	return ret;
+}
+
+void
+exit_cifs_idmap(void)
+{
+	key_revoke(root_cred->thread_keyring);
+	unregister_key_type(&cifs_idmap_key_type);
+	put_cred(root_cred);
+	unregister_shrinker(&cifs_shrinker);
+	cFYI(1, "Unregistered %s key type\n", cifs_idmap_key_type.name);
+}
+
+void
+cifs_destroy_idmaptrees(void)
+{
+	struct rb_root *root;
+	struct rb_node *node;
+
+	root = &uidtree;
+	spin_lock(&siduidlock);
+	while ((node = rb_first(root)))
+		rb_erase(node, root);
+	spin_unlock(&siduidlock);
+
+	root = &gidtree;
+	spin_lock(&sidgidlock);
+	while ((node = rb_first(root)))
+		rb_erase(node, root);
+	spin_unlock(&sidgidlock);
 }
 
 /* if the two SIDs (roughly equivalent to a UUID for a user or group) are
@@ -104,16 +431,24 @@ int compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid)
 	int num_subauth, num_sat, num_saw;
 
 	if ((!ctsid) || (!cwsid))
-		return 0;
+		return 1;
 
 	/* compare the revision */
-	if (ctsid->revision != cwsid->revision)
-		return 0;
+	if (ctsid->revision != cwsid->revision) {
+		if (ctsid->revision > cwsid->revision)
+			return 1;
+		else
+			return -1;
+	}
 
 	/* compare all of the six auth values */
 	for (i = 0; i < 6; ++i) {
-		if (ctsid->authority[i] != cwsid->authority[i])
-			return 0;
+		if (ctsid->authority[i] != cwsid->authority[i]) {
+			if (ctsid->authority[i] > cwsid->authority[i])
+				return 1;
+			else
+				return -1;
+		}
 	}
 
 	/* compare all of the subauth values if any */
@@ -122,12 +457,16 @@ int compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid)
 	num_subauth = num_sat < num_saw ? num_sat : num_saw;
 	if (num_subauth) {
 		for (i = 0; i < num_subauth; ++i) {
-			if (ctsid->sub_auth[i] != cwsid->sub_auth[i])
-				return 0;
+			if (ctsid->sub_auth[i] != cwsid->sub_auth[i]) {
+				if (ctsid->sub_auth[i] > cwsid->sub_auth[i])
+					return 1;
+				else
+					return -1;
+			}
 		}
 	}
 
-	return 1; /* sids compare/match */
+	return 0; /* sids compare/match */
 }
 
 
@@ -382,22 +721,22 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
 #ifdef CONFIG_CIFS_DEBUG2
 			dump_ace(ppace[i], end_of_acl);
 #endif
-			if (compare_sids(&(ppace[i]->sid), pownersid))
+			if (compare_sids(&(ppace[i]->sid), pownersid) == 0)
 				access_flags_to_mode(ppace[i]->access_req,
 						     ppace[i]->type,
 						     &fattr->cf_mode,
 						     &user_mask);
-			if (compare_sids(&(ppace[i]->sid), pgrpsid))
+			if (compare_sids(&(ppace[i]->sid), pgrpsid) == 0)
 				access_flags_to_mode(ppace[i]->access_req,
 						     ppace[i]->type,
 						     &fattr->cf_mode,
 						     &group_mask);
-			if (compare_sids(&(ppace[i]->sid), &sid_everyone))
+			if (compare_sids(&(ppace[i]->sid), &sid_everyone) == 0)
 				access_flags_to_mode(ppace[i]->access_req,
 						     ppace[i]->type,
 						     &fattr->cf_mode,
 						     &other_mask);
-			if (compare_sids(&(ppace[i]->sid), &sid_authusers))
+			if (compare_sids(&(ppace[i]->sid), &sid_authusers) == 0)
 				access_flags_to_mode(ppace[i]->access_req,
 						     ppace[i]->type,
 						     &fattr->cf_mode,
@@ -475,10 +814,10 @@ static int parse_sid(struct cifs_sid *psid, char *end_of_acl)
 
 
 /* Convert CIFS ACL to POSIX form */
-static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
-			  struct cifs_fattr *fattr)
+static int parse_sec_desc(struct cifs_sb_info *cifs_sb,
+		struct cifs_ntsd *pntsd, int acl_len, struct cifs_fattr *fattr)
 {
-	int rc;
+	int rc = 0;
 	struct cifs_sid *owner_sid_ptr, *group_sid_ptr;
 	struct cifs_acl *dacl_ptr; /* no need for SACL ptr */
 	char *end_of_acl = ((char *)pntsd) + acl_len;
@@ -500,12 +839,26 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
 		 le32_to_cpu(pntsd->sacloffset), dacloffset);
 /*	cifs_dump_mem("owner_sid: ", owner_sid_ptr, 64); */
 	rc = parse_sid(owner_sid_ptr, end_of_acl);
-	if (rc)
+	if (rc) {
+		cFYI(1, "%s: Error %d parsing Owner SID", __func__, rc);
+		return rc;
+	}
+	rc = sid_to_id(cifs_sb, owner_sid_ptr, fattr, SIDOWNER);
+	if (rc) {
+		cFYI(1, "%s: Error %d mapping Owner SID to uid", __func__, rc);
 		return rc;
+	}
 
 	rc = parse_sid(group_sid_ptr, end_of_acl);
-	if (rc)
+	if (rc) {
+		cFYI(1, "%s: Error %d mapping Owner SID to gid", __func__, rc);
 		return rc;
+	}
+	rc = sid_to_id(cifs_sb, group_sid_ptr, fattr, SIDGROUP);
+	if (rc) {
+		cFYI(1, "%s: Error %d mapping Group SID to gid", __func__, rc);
+		return rc;
+	}
 
 	if (dacloffset)
 		parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr,
@@ -520,7 +873,7 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
 	memcpy((void *)(&(cifscred->gsid)), (void *)group_sid_ptr,
 			sizeof(struct cifs_sid)); */
 
-	return 0;
+	return rc;
 }
 
 
@@ -688,7 +1041,7 @@ out:
 }
 
 /* Set an ACL on the server */
-static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
+int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
 				struct inode *inode, const char *path)
 {
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
@@ -727,7 +1080,7 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
 		rc = PTR_ERR(pntsd);
 		cERROR(1, "%s: error %d getting sec desc", __func__, rc);
 	} else {
-		rc = parse_sec_desc(pntsd, acllen, fattr);
+		rc = parse_sec_desc(cifs_sb, pntsd, acllen, fattr);
 		kfree(pntsd);
 		if (rc)
 			cERROR(1, "parse sec desc failed rc = %d", rc);
diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h
index c4ae7d0..5c902c7 100644
--- a/fs/cifs/cifsacl.h
+++ b/fs/cifs/cifsacl.h
@@ -39,6 +39,15 @@
 #define ACCESS_ALLOWED	0
 #define ACCESS_DENIED	1
 
+#define SIDOWNER 1
+#define SIDGROUP 2
+#define SIDLEN 150 /* S- 1 revision- 6 authorities- max 5 sub authorities */
+
+#define SID_ID_MAPPED 0
+#define SID_ID_PENDING 1
+#define SID_MAP_EXPIRE (3600 * HZ) /* map entry expires after one hour */
+#define SID_MAP_RETRY (300 * HZ)   /* wait 5 minutes for next attempt to map */
+
 struct cifs_ntsd {
 	__le16 revision; /* revision level */
 	__le16 type;
@@ -74,7 +83,21 @@ struct cifs_wksid {
 	char sidname[SIDNAMELENGTH];
 } __attribute__((packed));
 
-extern int match_sid(struct cifs_sid *);
+struct cifs_sid_id {
+	unsigned int refcount; /* increment with spinlock, decrement without */
+	unsigned long id;
+	unsigned long time;
+	unsigned long state;
+	char *sidstr;
+	struct rb_node rbnode;
+	struct cifs_sid sid;
+};
+
+#ifdef __KERNEL__
+extern struct key_type cifs_idmap_key_type;
+extern const struct cred *root_cred;
+#endif /* KERNEL */
+
 extern int compare_sids(const struct cifs_sid *, const struct cifs_sid *);
 
 #endif /* _CIFSACL_H */
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index d1a016b..45c3f78 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -60,7 +60,7 @@ static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
 		server->session_key.response, server->session_key.len);
 
 	crypto_shash_update(&server->secmech.sdescmd5->shash,
-		cifs_pdu->Protocol, cifs_pdu->smb_buf_length);
+		cifs_pdu->Protocol, be32_to_cpu(cifs_pdu->smb_buf_length));
 
 	rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature);
 
@@ -268,10 +268,11 @@ int setup_ntlm_response(struct cifsSesInfo *ses)
 }
 
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
-void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
+int calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
 			char *lnm_session_key)
 {
 	int i;
+	int rc;
 	char password_with_pad[CIFS_ENCPWD_SIZE];
 
 	memset(password_with_pad, 0, CIFS_ENCPWD_SIZE);
@@ -282,7 +283,7 @@ void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
 		memset(lnm_session_key, 0, CIFS_SESS_KEY_SIZE);
 		memcpy(lnm_session_key, password_with_pad,
 			CIFS_ENCPWD_SIZE);
-		return;
+		return 0;
 	}
 
 	/* calculate old style session key */
@@ -299,10 +300,9 @@ void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
 	for (i = 0; i < CIFS_ENCPWD_SIZE; i++)
 		password_with_pad[i] = toupper(password_with_pad[i]);
 
-	SMBencrypt(password_with_pad, cryptkey, lnm_session_key);
+	rc = SMBencrypt(password_with_pad, cryptkey, lnm_session_key);
 
-	/* clear password before we return/free memory */
-	memset(password_with_pad, 0, CIFS_ENCPWD_SIZE);
+	return rc;
 }
 #endif /* CIFS_WEAK_PW_HASH */
 
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 5c412b3..493b74c 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -128,29 +128,22 @@ cifs_read_super(struct super_block *sb, void *data,
 	}
 	cifs_sb->bdi.ra_pages = default_backing_dev_info.ra_pages;
 
-#ifdef CONFIG_CIFS_DFS_UPCALL
-	/* copy mount params to sb for use in submounts */
-	/* BB: should we move this after the mount so we
-	 * do not have to do the copy on failed mounts?
-	 * BB: May be it is better to do simple copy before
-	 * complex operation (mount), and in case of fail
-	 * just exit instead of doing mount and attempting
-	 * undo it if this copy fails?*/
+	/*
+	 * Copy mount params to sb for use in submounts. Better to do
+	 * the copy here and deal with the error before cleanup gets
+	 * complicated post-mount.
+	 */
 	if (data) {
-		int len = strlen(data);
-		cifs_sb->mountdata = kzalloc(len + 1, GFP_KERNEL);
+		cifs_sb->mountdata = kstrndup(data, PAGE_SIZE, GFP_KERNEL);
 		if (cifs_sb->mountdata == NULL) {
 			bdi_destroy(&cifs_sb->bdi);
 			kfree(sb->s_fs_info);
 			sb->s_fs_info = NULL;
 			return -ENOMEM;
 		}
-		strncpy(cifs_sb->mountdata, data, len + 1);
-		cifs_sb->mountdata[len] = '\0';
 	}
-#endif
 
-	rc = cifs_mount(sb, cifs_sb, data, devname);
+	rc = cifs_mount(sb, cifs_sb, devname);
 
 	if (rc) {
 		if (!silent)
@@ -163,7 +156,7 @@ cifs_read_super(struct super_block *sb, void *data,
 	sb->s_bdi = &cifs_sb->bdi;
 	sb->s_blocksize = CIFS_MAX_MSGSIZE;
 	sb->s_blocksize_bits = 14;	/* default 2**14 = CIFS_MAX_MSGSIZE */
-	inode = cifs_root_iget(sb, ROOT_I);
+	inode = cifs_root_iget(sb);
 
 	if (IS_ERR(inode)) {
 		rc = PTR_ERR(inode);
@@ -184,12 +177,12 @@ cifs_read_super(struct super_block *sb, void *data,
 	else
 		sb->s_d_op = &cifs_dentry_ops;
 
-#ifdef CONFIG_CIFS_EXPERIMENTAL
+#ifdef CIFS_NFSD_EXPORT
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
 		cFYI(1, "export ops supported");
 		sb->s_export_op = &cifs_export_ops;
 	}
-#endif /* EXPERIMENTAL */
+#endif /* CIFS_NFSD_EXPORT */
 
 	return 0;
 
@@ -202,12 +195,10 @@ out_no_root:
 
 out_mount_failed:
 	if (cifs_sb) {
-#ifdef CONFIG_CIFS_DFS_UPCALL
 		if (cifs_sb->mountdata) {
 			kfree(cifs_sb->mountdata);
 			cifs_sb->mountdata = NULL;
 		}
-#endif
 		unload_nls(cifs_sb->local_nls);
 		bdi_destroy(&cifs_sb->bdi);
 		kfree(cifs_sb);
@@ -231,12 +222,10 @@ cifs_put_super(struct super_block *sb)
 	rc = cifs_umount(sb, cifs_sb);
 	if (rc)
 		cERROR(1, "cifs_umount failed with return code %d", rc);
-#ifdef CONFIG_CIFS_DFS_UPCALL
 	if (cifs_sb->mountdata) {
 		kfree(cifs_sb->mountdata);
 		cifs_sb->mountdata = NULL;
 	}
-#endif
 
 	unload_nls(cifs_sb->local_nls);
 	bdi_destroy(&cifs_sb->bdi);
@@ -618,16 +607,31 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
 {
 	/* origin == SEEK_END => we must revalidate the cached file length */
 	if (origin == SEEK_END) {
-		int retval;
-
-		/* some applications poll for the file length in this strange
-		   way so we must seek to end on non-oplocked files by
-		   setting the revalidate time to zero */
-		CIFS_I(file->f_path.dentry->d_inode)->time = 0;
-
-		retval = cifs_revalidate_file(file);
-		if (retval < 0)
-			return (loff_t)retval;
+		int rc;
+		struct inode *inode = file->f_path.dentry->d_inode;
+
+		/*
+		 * We need to be sure that all dirty pages are written and the
+		 * server has the newest file length.
+		 */
+		if (!CIFS_I(inode)->clientCanCacheRead && inode->i_mapping &&
+		    inode->i_mapping->nrpages != 0) {
+			rc = filemap_fdatawait(inode->i_mapping);
+			if (rc) {
+				mapping_set_error(inode->i_mapping, rc);
+				return rc;
+			}
+		}
+		/*
+		 * Some applications poll for the file length in this strange
+		 * way so we must seek to end on non-oplocked files by
+		 * setting the revalidate time to zero.
+		 */
+		CIFS_I(inode)->time = 0;
+
+		rc = cifs_revalidate_file_attr(file);
+		if (rc < 0)
+			return (loff_t)rc;
 	}
 	return generic_file_llseek_unlocked(file, offset, origin);
 }
@@ -760,10 +764,11 @@ const struct file_operations cifs_file_strict_ops = {
 };
 
 const struct file_operations cifs_file_direct_ops = {
-	/* no aio, no readv -
-	   BB reevaluate whether they can be done with directio, no cache */
-	.read = cifs_user_read,
-	.write = cifs_user_write,
+	/* BB reevaluate whether they can be done with directio, no cache */
+	.read = do_sync_read,
+	.write = do_sync_write,
+	.aio_read = cifs_user_readv,
+	.aio_write = cifs_user_writev,
 	.open = cifs_open,
 	.release = cifs_close,
 	.lock = cifs_lock,
@@ -815,10 +820,11 @@ const struct file_operations cifs_file_strict_nobrl_ops = {
 };
 
 const struct file_operations cifs_file_direct_nobrl_ops = {
-	/* no mmap, no aio, no readv -
-	   BB reevaluate whether they can be done with directio, no cache */
-	.read = cifs_user_read,
-	.write = cifs_user_write,
+	/* BB reevaluate whether they can be done with directio, no cache */
+	.read = do_sync_read,
+	.write = do_sync_write,
+	.aio_read = cifs_user_readv,
+	.aio_write = cifs_user_writev,
 	.open = cifs_open,
 	.release = cifs_close,
 	.fsync = cifs_fsync,
@@ -981,10 +987,10 @@ init_cifs(void)
 	int rc = 0;
 	cifs_proc_init();
 	INIT_LIST_HEAD(&cifs_tcp_ses_list);
-#ifdef CONFIG_CIFS_EXPERIMENTAL
+#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */
 	INIT_LIST_HEAD(&GlobalDnotifyReqList);
 	INIT_LIST_HEAD(&GlobalDnotifyRsp_Q);
-#endif
+#endif /* was needed for dnotify, and will be needed for inotify when VFS fix */
 /*
  *  Initialize Global counters
  */
@@ -1033,22 +1039,33 @@ init_cifs(void)
 	if (rc)
 		goto out_destroy_mids;
 
-	rc = register_filesystem(&cifs_fs_type);
-	if (rc)
-		goto out_destroy_request_bufs;
 #ifdef CONFIG_CIFS_UPCALL
 	rc = register_key_type(&cifs_spnego_key_type);
 	if (rc)
-		goto out_unregister_filesystem;
-#endif
+		goto out_destroy_request_bufs;
+#endif /* CONFIG_CIFS_UPCALL */
+
+#ifdef CONFIG_CIFS_ACL
+	rc = init_cifs_idmap();
+	if (rc)
+		goto out_register_key_type;
+#endif /* CONFIG_CIFS_ACL */
+
+	rc = register_filesystem(&cifs_fs_type);
+	if (rc)
+		goto out_init_cifs_idmap;
 
 	return 0;
 
-#ifdef CONFIG_CIFS_UPCALL
-out_unregister_filesystem:
-	unregister_filesystem(&cifs_fs_type);
+out_init_cifs_idmap:
+#ifdef CONFIG_CIFS_ACL
+	exit_cifs_idmap();
+out_register_key_type:
 #endif
+#ifdef CONFIG_CIFS_UPCALL
+	unregister_key_type(&cifs_spnego_key_type);
 out_destroy_request_bufs:
+#endif
 	cifs_destroy_request_bufs();
 out_destroy_mids:
 	cifs_destroy_mids();
@@ -1070,6 +1087,10 @@ exit_cifs(void)
 #ifdef CONFIG_CIFS_DFS_UPCALL
 	cifs_dfs_release_automount_timer();
 #endif
+#ifdef CONFIG_CIFS_ACL
+	cifs_destroy_idmaptrees();
+	exit_cifs_idmap();
+#endif
 #ifdef CONFIG_CIFS_UPCALL
 	unregister_key_type(&cifs_spnego_key_type);
 #endif
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index a9371b6..64313f7 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -47,7 +47,7 @@ extern void cifs_sb_deactive(struct super_block *sb);
 
 /* Functions related to inodes */
 extern const struct inode_operations cifs_dir_inode_ops;
-extern struct inode *cifs_root_iget(struct super_block *, unsigned long);
+extern struct inode *cifs_root_iget(struct super_block *);
 extern int cifs_create(struct inode *, struct dentry *, int,
 		       struct nameidata *);
 extern struct dentry *cifs_lookup(struct inode *, struct dentry *,
@@ -59,9 +59,11 @@ extern int cifs_mkdir(struct inode *, struct dentry *, int);
 extern int cifs_rmdir(struct inode *, struct dentry *);
 extern int cifs_rename(struct inode *, struct dentry *, struct inode *,
 		       struct dentry *);
+extern int cifs_revalidate_file_attr(struct file *filp);
+extern int cifs_revalidate_dentry_attr(struct dentry *);
 extern int cifs_revalidate_file(struct file *filp);
 extern int cifs_revalidate_dentry(struct dentry *);
-extern void cifs_invalidate_mapping(struct inode *inode);
+extern int cifs_invalidate_mapping(struct inode *inode);
 extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
 extern int cifs_setattr(struct dentry *, struct iattr *);
 
@@ -80,12 +82,12 @@ extern const struct file_operations cifs_file_strict_nobrl_ops;
 extern int cifs_open(struct inode *inode, struct file *file);
 extern int cifs_close(struct inode *inode, struct file *file);
 extern int cifs_closedir(struct inode *inode, struct file *file);
-extern ssize_t cifs_user_read(struct file *file, char __user *read_data,
-			      size_t read_size, loff_t *poffset);
+extern ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov,
+			       unsigned long nr_segs, loff_t pos);
 extern ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
 				 unsigned long nr_segs, loff_t pos);
-extern ssize_t cifs_user_write(struct file *file, const char __user *write_data,
-			       size_t write_size, loff_t *poffset);
+extern ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,
+				unsigned long nr_segs, loff_t pos);
 extern ssize_t cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
 				  unsigned long nr_segs, loff_t pos);
 extern int cifs_lock(struct file *, int, struct file_lock *);
@@ -123,9 +125,9 @@ extern ssize_t	cifs_getxattr(struct dentry *, const char *, void *, size_t);
 extern ssize_t	cifs_listxattr(struct dentry *, char *, size_t);
 extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 
-#ifdef CONFIG_CIFS_EXPERIMENTAL
+#ifdef CIFS_NFSD_EXPORT
 extern const struct export_operations cifs_export_ops;
-#endif /* EXPERIMENTAL */
+#endif /* CIFS_NFSD_EXPORT */
 
-#define CIFS_VERSION   "1.71"
+#define CIFS_VERSION   "1.72"
 #endif				/* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index a5d1106..76b4517 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -274,7 +274,8 @@ struct cifsSesInfo {
 	int capabilities;
 	char serverName[SERVER_NAME_LEN_WITH_NULL * 2];	/* BB make bigger for
 				TCP names - will ipv6 and sctp addresses fit? */
-	char *user_name;
+	char *user_name;	/* must not be null except during init of sess
+				   and after mount option parsing we fill it */
 	char *domainName;
 	char *password;
 	struct session_key auth_key;
@@ -780,10 +781,12 @@ GLOBAL_EXTERN spinlock_t		cifs_tcp_ses_lock;
  */
 GLOBAL_EXTERN spinlock_t	cifs_file_list_lock;
 
+#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */
 /* Outstanding dir notify requests */
 GLOBAL_EXTERN struct list_head GlobalDnotifyReqList;
 /* DirNotify response queue */
 GLOBAL_EXTERN struct list_head GlobalDnotifyRsp_Q;
+#endif /* was needed for dnotify, and will be needed for inotify when VFS fix */
 
 /*
  * Global transaction id (XID) information
@@ -830,6 +833,11 @@ GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/
 /* reconnect after this many failed echo attempts */
 GLOBAL_EXTERN unsigned short echo_retries;
 
+GLOBAL_EXTERN struct rb_root uidtree;
+GLOBAL_EXTERN struct rb_root gidtree;
+GLOBAL_EXTERN spinlock_t siduidlock;
+GLOBAL_EXTERN spinlock_t sidgidlock;
+
 void cifs_oplock_break(struct work_struct *work);
 void cifs_oplock_break_get(struct cifsFileInfo *cfile);
 void cifs_oplock_break_put(struct cifsFileInfo *cfile);
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index b5c8cc5..de3aa28 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -397,9 +397,9 @@
 #define GETU32(var)  (*((__u32 *)var))	/* BB check for endian issues */
 
 struct smb_hdr {
-	__u32 smb_buf_length;	/* big endian on wire *//* BB length is only two
-		or three bytes - with one or two byte type preceding it that are
-		zero - we could mask the type byte off just in case BB */
+	__be32 smb_buf_length;	/* BB length is only two (rarely three) bytes,
+		with one or two byte "type" preceding it that will be
+		zero - we could mask the type byte off */
 	__u8 Protocol[4];
 	__u8 Command;
 	union {
@@ -428,43 +428,28 @@ struct smb_hdr {
 	__u8 WordCount;
 } __attribute__((packed));
 
-/* given a pointer to an smb_hdr retrieve a char pointer to the byte count */
-#define BCC(smb_var) ((unsigned char *)(smb_var) + sizeof(struct smb_hdr) + \
-			 (2 * (smb_var)->WordCount))
+/* given a pointer to an smb_hdr, retrieve a void pointer to the ByteCount */
+static inline void *
+BCC(struct smb_hdr *smb)
+{
+	return (void *)smb + sizeof(*smb) + 2 * smb->WordCount;
+}
 
 /* given a pointer to an smb_hdr retrieve the pointer to the byte area */
 #define pByteArea(smb_var) (BCC(smb_var) + 2)
 
-/* get the converted ByteCount for a SMB packet and return it */
-static inline __u16
-get_bcc(struct smb_hdr *hdr)
-{
-	__u16 *bc_ptr = (__u16 *)BCC(hdr);
-
-	return get_unaligned(bc_ptr);
-}
-
 /* get the unconverted ByteCount for a SMB packet and return it */
 static inline __u16
-get_bcc_le(struct smb_hdr *hdr)
+get_bcc(struct smb_hdr *hdr)
 {
 	__le16 *bc_ptr = (__le16 *)BCC(hdr);
 
 	return get_unaligned_le16(bc_ptr);
 }
 
-/* set the ByteCount for a SMB packet in host-byte order */
-static inline void
-put_bcc(__u16 count, struct smb_hdr *hdr)
-{
-	__u16 *bc_ptr = (__u16 *)BCC(hdr);
-
-	put_unaligned(count, bc_ptr);
-}
-
 /* set the ByteCount for a SMB packet in little-endian */
 static inline void
-put_bcc_le(__u16 count, struct smb_hdr *hdr)
+put_bcc(__u16 count, struct smb_hdr *hdr)
 {
 	__le16 *bc_ptr = (__le16 *)BCC(hdr);
 
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 8096f27..6e69e06 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -53,6 +53,9 @@ do {								\
 	cFYI(1, "CIFS VFS: leaving %s (xid = %d) rc = %d",	\
 	     __func__, curr_xid, (int)rc);			\
 } while (0)
+extern int init_cifs_idmap(void);
+extern void exit_cifs_idmap(void);
+extern void cifs_destroy_idmaptrees(void);
 extern char *build_path_from_dentry(struct dentry *);
 extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb,
 					struct cifsTconInfo *tcon);
@@ -90,7 +93,6 @@ extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
 extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool);
 extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool);
 extern unsigned int smbCalcSize(struct smb_hdr *ptr);
-extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr);
 extern int decode_negTokenInit(unsigned char *security_blob, int length,
 			struct TCP_Server_Info *server);
 extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len);
@@ -143,8 +145,10 @@ extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb,
 extern int mode_to_cifs_acl(struct inode *inode, const char *path, __u64);
 extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *,
 					const char *, u32 *);
+extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *,
+				const char *);
 
-extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *,
+extern int cifs_mount(struct super_block *, struct cifs_sb_info *,
 			const char *);
 extern int cifs_umount(struct super_block *, struct cifs_sb_info *);
 extern void cifs_dfs_release_automount_timer(void);
@@ -304,12 +308,13 @@ extern int CIFSSMBUnixQuerySymLink(const int xid,
 			struct cifsTconInfo *tcon,
 			const unsigned char *searchName, char **syminfo,
 			const struct nls_table *nls_codepage);
+#ifdef CONFIG_CIFS_SYMLINK_EXPERIMENTAL
 extern int CIFSSMBQueryReparseLinkInfo(const int xid,
 			struct cifsTconInfo *tcon,
 			const unsigned char *searchName,
 			char *symlinkinfo, const int buflen, __u16 fid,
 			const struct nls_table *nls_codepage);
-
+#endif /* temporarily unused until cifs_symlink fixed */
 extern int CIFSSMBOpen(const int xid, struct cifsTconInfo *tcon,
 			const char *fileName, const int disposition,
 			const int access_flags, const int omode,
@@ -348,8 +353,6 @@ extern int CIFSGetSrvInodeNumber(const int xid, struct cifsTconInfo *tcon,
 			const unsigned char *searchName, __u64 *inode_number,
 			const struct nls_table *nls_codepage,
 			int remap_special_chars);
-extern int cifsConvertToUCS(__le16 *target, const char *source, int maxlen,
-			const struct nls_table *cp, int mapChars);
 
 extern int CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
 			const __u16 netfid, const __u64 len,
@@ -383,9 +386,15 @@ extern void cifs_crypto_shash_release(struct TCP_Server_Info *);
 extern int calc_seckey(struct cifsSesInfo *);
 
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
-extern void calc_lanman_hash(const char *password, const char *cryptkey,
+extern int calc_lanman_hash(const char *password, const char *cryptkey,
 				bool encrypt, char *lnm_session_key);
 #endif /* CIFS_WEAK_PW_HASH */
+#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */
+extern int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
+			const int notify_subdirs, const __u16 netfid,
+			__u32 filter, struct file *file, int multishot,
+			const struct nls_table *nls_codepage);
+#endif /* was needed for dnotify, and will be needed for inotify when VFS fix */
 extern int CIFSSMBCopy(int xid,
 			struct cifsTconInfo *source_tcon,
 			const char *fromName,
@@ -393,10 +402,6 @@ extern int CIFSSMBCopy(int xid,
 			const char *toName, const int flags,
 			const struct nls_table *nls_codepage,
 			int remap_special_chars);
-extern int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
-			const int notify_subdirs, const __u16 netfid,
-			__u32 filter, struct file *file, int multishot,
-			const struct nls_table *nls_codepage);
 extern ssize_t CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon,
 			const unsigned char *searchName,
 			const unsigned char *ea_name, char *EAData,
@@ -427,9 +432,6 @@ extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr,
 		struct cifs_sb_info *cifs_sb, int xid);
 extern int mdfour(unsigned char *, unsigned char *, int);
 extern int E_md4hash(const unsigned char *passwd, unsigned char *p16);
-extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8,
-			unsigned char *p24);
-extern void E_P16(unsigned char *p14, unsigned char *p16);
-extern void E_P24(unsigned char *p21, const unsigned char *c8,
+extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8,
 			unsigned char *p24);
 #endif			/* _CIFSPROTO_H */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index df959ba..83df937 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -339,12 +339,13 @@ static int validate_t2(struct smb_t2_rsp *pSMB)
 	    get_unaligned_le16(&pSMB->t2_rsp.DataOffset) > 1024)
 		goto vt2_err;
 
-	/* check that bcc is at least as big as parms + data */
-	/* check that bcc is less than negotiated smb buffer */
 	total_size = get_unaligned_le16(&pSMB->t2_rsp.ParameterCount);
 	if (total_size >= 512)
 		goto vt2_err;
 
+	/* check that bcc is at least as big as parms + data, and that it is
+	 * less than negotiated smb buffer
+	 */
 	total_size += get_unaligned_le16(&pSMB->t2_rsp.DataCount);
 	if (total_size > get_bcc(&pSMB->hdr) ||
 	    total_size >= CIFSMaxBufSize + MAX_CIFS_HDR_SIZE)
@@ -357,6 +358,13 @@ vt2_err:
 	return -EINVAL;
 }
 
+static inline void inc_rfc1001_len(void *pSMB, int count)
+{
+	struct smb_hdr *hdr = (struct smb_hdr *)pSMB;
+
+	be32_add_cpu(&hdr->smb_buf_length, count);
+}
+
 int
 CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 {
@@ -409,7 +417,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 		count += strlen(protocols[i].name) + 1;
 		/* null at end of source and target buffers anyway */
 	}
-	pSMB->hdr.smb_buf_length += count;
+	inc_rfc1001_len(pSMB, count);
 	pSMB->ByteCount = cpu_to_le16(count);
 
 	rc = SendReceive(xid, ses, (struct smb_hdr *) pSMB,
@@ -541,10 +549,6 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 		server->secType = RawNTLMSSP;
 	else if (secFlags & CIFSSEC_MAY_LANMAN)
 		server->secType = LANMAN;
-/* #ifdef CONFIG_CIFS_EXPERIMENTAL
-	else if (secFlags & CIFSSEC_MAY_PLNTXT)
-		server->secType = ??
-#endif */
 	else {
 		rc = -EOPNOTSUPP;
 		cERROR(1, "Invalid security type");
@@ -578,7 +582,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 
 	if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC) &&
 		(server->capabilities & CAP_EXTENDED_SECURITY)) {
-		count = pSMBr->ByteCount;
+		count = get_bcc(&pSMBr->hdr);
 		if (count < 16) {
 			rc = -EIO;
 			goto neg_err_exit;
@@ -732,9 +736,9 @@ CIFSSMBEcho(struct TCP_Server_Info *server)
 	smb->hdr.Tid = 0xffff;
 	smb->hdr.WordCount = 1;
 	put_unaligned_le16(1, &smb->EchoCount);
-	put_bcc_le(1, &smb->hdr);
+	put_bcc(1, &smb->hdr);
 	smb->Data[0] = 'a';
-	smb->hdr.smb_buf_length += 3;
+	inc_rfc1001_len(smb, 3);
 
 	rc = cifs_call_async(server, (struct smb_hdr *)smb,
 				cifs_echo_callback, server);
@@ -852,7 +856,7 @@ PsxDelete:
 	pSMB->TotalParameterCount = pSMB->ParameterCount;
 	pSMB->InformationLevel = cpu_to_le16(SMB_POSIX_UNLINK);
 	pSMB->Reserved4 = 0;
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -898,7 +902,7 @@ DelFileRetry:
 	pSMB->SearchAttributes =
 	    cpu_to_le16(ATTR_READONLY | ATTR_HIDDEN | ATTR_SYSTEM);
 	pSMB->BufferFormat = 0x04;
-	pSMB->hdr.smb_buf_length += name_len + 1;
+	inc_rfc1001_len(pSMB, name_len + 1);
 	pSMB->ByteCount = cpu_to_le16(name_len + 1);
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -942,7 +946,7 @@ RmDirRetry:
 	}
 
 	pSMB->BufferFormat = 0x04;
-	pSMB->hdr.smb_buf_length += name_len + 1;
+	inc_rfc1001_len(pSMB, name_len + 1);
 	pSMB->ByteCount = cpu_to_le16(name_len + 1);
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -985,7 +989,7 @@ MkDirRetry:
 	}
 
 	pSMB->BufferFormat = 0x04;
-	pSMB->hdr.smb_buf_length += name_len + 1;
+	inc_rfc1001_len(pSMB, name_len + 1);
 	pSMB->ByteCount = cpu_to_le16(name_len + 1);
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -1063,7 +1067,7 @@ PsxCreat:
 	pSMB->TotalParameterCount = pSMB->ParameterCount;
 	pSMB->InformationLevel = cpu_to_le16(SMB_POSIX_OPEN);
 	pSMB->Reserved4 = 0;
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -1075,7 +1079,7 @@ PsxCreat:
 	cFYI(1, "copying inode info");
 	rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
-	if (rc || (pSMBr->ByteCount < sizeof(OPEN_PSX_RSP))) {
+	if (rc || get_bcc(&pSMBr->hdr) < sizeof(OPEN_PSX_RSP)) {
 		rc = -EIO;	/* bad smb */
 		goto psx_create_err;
 	}
@@ -1096,7 +1100,7 @@ PsxCreat:
 		pRetData->Type = cpu_to_le32(-1); /* unknown */
 		cFYI(DBG2, "unknown type");
 	} else {
-		if (pSMBr->ByteCount < sizeof(OPEN_PSX_RSP)
+		if (get_bcc(&pSMBr->hdr) < sizeof(OPEN_PSX_RSP)
 					+ sizeof(FILE_UNIX_BASIC_INFO)) {
 			cERROR(1, "Open response data too small");
 			pRetData->Type = cpu_to_le32(-1);
@@ -1228,7 +1232,7 @@ OldOpenRetry:
 	pSMB->Sattr = cpu_to_le16(ATTR_HIDDEN | ATTR_SYSTEM | ATTR_DIRECTORY);
 	pSMB->OpenFunction = cpu_to_le16(convert_disposition(openDisposition));
 	count += name_len;
-	pSMB->hdr.smb_buf_length += count;
+	inc_rfc1001_len(pSMB, count);
 
 	pSMB->ByteCount = cpu_to_le16(count);
 	/* long_op set to 1 to allow for oplock break timeouts */
@@ -1341,7 +1345,7 @@ openRetry:
 	    SECURITY_CONTEXT_TRACKING | SECURITY_EFFECTIVE_ONLY;
 
 	count += name_len;
-	pSMB->hdr.smb_buf_length += count;
+	inc_rfc1001_len(pSMB, count);
 
 	pSMB->ByteCount = cpu_to_le16(count);
 	/* long_op set to 1 to allow for oplock break timeouts */
@@ -1426,7 +1430,7 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
 	}
 
 	iov[0].iov_base = (char *)pSMB;
-	iov[0].iov_len = pSMB->hdr.smb_buf_length + 4;
+	iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4;
 	rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovecs */,
 			 &resp_buf_type, CIFS_LOG_ERROR);
 	cifs_stats_inc(&tcon->num_reads);
@@ -1560,7 +1564,7 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
 
 	pSMB->DataLengthLow = cpu_to_le16(bytes_sent & 0xFFFF);
 	pSMB->DataLengthHigh = cpu_to_le16(bytes_sent >> 16);
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
 
 	if (wct == 14)
 		pSMB->ByteCount = cpu_to_le16(byte_count);
@@ -1644,11 +1648,12 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
 
 	pSMB->DataLengthLow = cpu_to_le16(count & 0xFFFF);
 	pSMB->DataLengthHigh = cpu_to_le16(count >> 16);
-	smb_hdr_len = pSMB->hdr.smb_buf_length + 1; /* hdr + 1 byte pad */
+	/* header + 1 byte pad */
+	smb_hdr_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 1;
 	if (wct == 14)
-		pSMB->hdr.smb_buf_length += count+1;
+		inc_rfc1001_len(pSMB, count + 1);
 	else /* wct == 12 */
-		pSMB->hdr.smb_buf_length += count+5; /* smb data starts later */
+		inc_rfc1001_len(pSMB, count + 5); /* smb data starts later */
 	if (wct == 14)
 		pSMB->ByteCount = cpu_to_le16(count + 1);
 	else /* wct == 12 */ /* bigger pad, smaller smb hdr, keep offset ok */ {
@@ -1748,7 +1753,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
 		/* oplock break */
 		count = 0;
 	}
-	pSMB->hdr.smb_buf_length += count;
+	inc_rfc1001_len(pSMB, count);
 	pSMB->ByteCount = cpu_to_le16(count);
 
 	if (waitFlag) {
@@ -1839,14 +1844,14 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
 	pSMB->Fid = smb_file_id;
 	pSMB->InformationLevel = cpu_to_le16(SMB_SET_POSIX_LOCK);
 	pSMB->Reserved4 = 0;
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 	if (waitFlag) {
 		rc = SendReceiveBlockingLock(xid, tcon, (struct smb_hdr *) pSMB,
 			(struct smb_hdr *) pSMBr, &bytes_returned);
 	} else {
 		iov[0].iov_base = (char *)pSMB;
-		iov[0].iov_len = pSMB->hdr.smb_buf_length + 4;
+		iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4;
 		rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovecs */,
 				&resp_buf_type, timeout);
 		pSMB = NULL; /* request buf already freed by SendReceive2. Do
@@ -1862,7 +1867,7 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
 		__u16 data_count;
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
-		if (rc || (pSMBr->ByteCount < sizeof(struct cifs_posix_lock))) {
+		if (rc || get_bcc(&pSMBr->hdr) < sizeof(*parm_data)) {
 			rc = -EIO;      /* bad smb */
 			goto plk_err_exit;
 		}
@@ -2012,7 +2017,7 @@ renameRetry:
 	}
 
 	count = 1 /* 1st signature byte */  + name_len + name_len2;
-	pSMB->hdr.smb_buf_length += count;
+	inc_rfc1001_len(pSMB, count);
 	pSMB->ByteCount = cpu_to_le16(count);
 
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -2092,7 +2097,7 @@ int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
 	pSMB->InformationLevel =
 		cpu_to_le16(SMB_SET_FILE_RENAME_INFORMATION);
 	pSMB->Reserved4 = 0;
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 	rc = SendReceive(xid, pTcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -2159,7 +2164,7 @@ copyRetry:
 	}
 
 	count = 1 /* 1st signature byte */  + name_len + name_len2;
-	pSMB->hdr.smb_buf_length += count;
+	inc_rfc1001_len(pSMB, count);
 	pSMB->ByteCount = cpu_to_le16(count);
 
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -2249,7 +2254,7 @@ createSymLinkRetry:
 	pSMB->DataOffset = cpu_to_le16(offset);
 	pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_LINK);
 	pSMB->Reserved4 = 0;
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -2335,7 +2340,7 @@ createHardLinkRetry:
 	pSMB->DataOffset = cpu_to_le16(offset);
 	pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_HLINK);
 	pSMB->Reserved4 = 0;
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -2406,7 +2411,7 @@ winCreateHardLinkRetry:
 	}
 
 	count = 1 /* string type byte */  + name_len + name_len2;
-	pSMB->hdr.smb_buf_length += count;
+	inc_rfc1001_len(pSMB, count);
 	pSMB->ByteCount = cpu_to_le16(count);
 
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -2477,7 +2482,7 @@ querySymLinkRetry:
 	pSMB->ParameterCount = pSMB->TotalParameterCount;
 	pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_UNIX_LINK);
 	pSMB->Reserved4 = 0;
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -2489,7 +2494,7 @@ querySymLinkRetry:
 
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 		/* BB also check enough total bytes returned */
-		if (rc || (pSMBr->ByteCount < 2))
+		if (rc || get_bcc(&pSMBr->hdr) < 2)
 			rc = -EIO;
 		else {
 			bool is_unicode;
@@ -2516,7 +2521,17 @@ querySymLinkRetry:
 	return rc;
 }
 
-#ifdef CONFIG_CIFS_EXPERIMENTAL
+#ifdef CONFIG_CIFS_SYMLINK_EXPERIMENTAL
+/*
+ *	Recent Windows versions now create symlinks more frequently
+ *	and they use the "reparse point" mechanism below.  We can of course
+ *	do symlinks nicely to Samba and other servers which support the
+ *	CIFS Unix Extensions and we can also do SFU symlinks and "client only"
+ *	"MF" symlinks optionally, but for recent Windows we really need to
+ *	reenable the code below and fix the cifs_symlink callers to handle this.
+ *	In the interim this code has been moved to its own config option so
+ *	it is not compiled in by default until callers fixed up and more tested.
+ */
 int
 CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
 			const unsigned char *searchName,
@@ -2561,14 +2576,14 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
 	} else {		/* decode response */
 		__u32 data_offset = le32_to_cpu(pSMBr->DataOffset);
 		__u32 data_count = le32_to_cpu(pSMBr->DataCount);
-		if ((pSMBr->ByteCount < 2) || (data_offset > 512)) {
-		/* BB also check enough total bytes returned */
+		if (get_bcc(&pSMBr->hdr) < 2 || data_offset > 512) {
+			/* BB also check enough total bytes returned */
 			rc = -EIO;	/* bad smb */
 			goto qreparse_out;
 		}
 		if (data_count && (data_count < 2048)) {
 			char *end_of_smb = 2 /* sizeof byte count */ +
-				pSMBr->ByteCount + (char *)&pSMBr->ByteCount;
+			       get_bcc(&pSMBr->hdr) + (char *)&pSMBr->ByteCount;
 
 			struct reparse_data *reparse_buf =
 						(struct reparse_data *)
@@ -2618,7 +2633,7 @@ qreparse_out:
 
 	return rc;
 }
-#endif /* CIFS_EXPERIMENTAL */
+#endif /* CIFS_SYMLINK_EXPERIMENTAL */ /* BB temporarily unused */
 
 #ifdef CONFIG_CIFS_POSIX
 
@@ -2814,7 +2829,7 @@ queryAclRetry:
 	pSMB->ParameterCount = pSMB->TotalParameterCount;
 	pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_POSIX_ACL);
 	pSMB->Reserved4 = 0;
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -2826,8 +2841,8 @@ queryAclRetry:
 		/* decode response */
 
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
-		if (rc || (pSMBr->ByteCount < 2))
 		/* BB also check enough total bytes returned */
+		if (rc || get_bcc(&pSMBr->hdr) < 2)
 			rc = -EIO;      /* bad smb */
 		else {
 			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
@@ -2908,7 +2923,7 @@ setAclRetry:
 	pSMB->ParameterCount = cpu_to_le16(params);
 	pSMB->TotalParameterCount = pSMB->ParameterCount;
 	pSMB->Reserved4 = 0;
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -2966,7 +2981,7 @@ GetExtAttrRetry:
 	pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_ATTR_FLAGS);
 	pSMB->Pad = 0;
 	pSMB->Fid = netfid;
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
 	pSMB->t2.ByteCount = cpu_to_le16(byte_count);
 
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -2976,8 +2991,8 @@ GetExtAttrRetry:
 	} else {
 		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
-		if (rc || (pSMBr->ByteCount < 2))
 		/* BB also check enough total bytes returned */
+		if (rc || get_bcc(&pSMBr->hdr) < 2)
 			/* If rc should we check for EOPNOSUPP and
 			   disable the srvino flag? or in caller? */
 			rc = -EIO;      /* bad smb */
@@ -3052,6 +3067,7 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata,
 	char *end_of_smb;
 	__u32 data_count, data_offset, parm_count, parm_offset;
 	struct smb_com_ntransact_rsp *pSMBr;
+	u16 bcc;
 
 	*pdatalen = 0;
 	*pparmlen = 0;
@@ -3061,8 +3077,8 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata,
 
 	pSMBr = (struct smb_com_ntransact_rsp *)buf;
 
-	/* ByteCount was converted from little endian in SendReceive */
-	end_of_smb = 2 /* sizeof byte count */ + pSMBr->ByteCount +
+	bcc = get_bcc(&pSMBr->hdr);
+	end_of_smb = 2 /* sizeof byte count */ + bcc +
 			(char *)&pSMBr->ByteCount;
 
 	data_offset = le32_to_cpu(pSMBr->DataOffset);
@@ -3088,7 +3104,7 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata,
 			*ppdata, data_count, (data_count + *ppdata),
 			end_of_smb, pSMBr);
 		return -EINVAL;
-	} else if (parm_count + data_count > pSMBr->ByteCount) {
+	} else if (parm_count + data_count > bcc) {
 		cFYI(1, "parm count and data count larger than SMB");
 		return -EINVAL;
 	}
@@ -3124,9 +3140,9 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
 	pSMB->AclFlags = cpu_to_le32(CIFS_ACL_OWNER | CIFS_ACL_GROUP |
 				     CIFS_ACL_DACL);
 	pSMB->ByteCount = cpu_to_le16(11); /* 3 bytes pad + 8 bytes parm */
-	pSMB->hdr.smb_buf_length += 11;
+	inc_rfc1001_len(pSMB, 11);
 	iov[0].iov_base = (char *)pSMB;
-	iov[0].iov_len = pSMB->hdr.smb_buf_length + 4;
+	iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4;
 
 	rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovec */, &buf_type,
 			 0);
@@ -3235,10 +3251,9 @@ setCifsAclRetry:
 		memcpy((char *) &pSMBr->hdr.Protocol + data_offset,
 			(char *) pntsd,
 			acllen);
-		pSMB->hdr.smb_buf_length += (byte_count + data_count);
-
+		inc_rfc1001_len(pSMB, byte_count + data_count);
 	} else
-		pSMB->hdr.smb_buf_length += byte_count;
+		inc_rfc1001_len(pSMB, byte_count);
 
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 		(struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -3289,7 +3304,7 @@ QInfRetry:
 	}
 	pSMB->BufferFormat = 0x04;
 	name_len++; /* account for buffer type byte */
-	pSMB->hdr.smb_buf_length += (__u16) name_len;
+	inc_rfc1001_len(pSMB, (__u16)name_len);
 	pSMB->ByteCount = cpu_to_le16(name_len);
 
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -3364,7 +3379,7 @@ QFileInfoRetry:
 	pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_ALL_INFO);
 	pSMB->Pad = 0;
 	pSMB->Fid = netfid;
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
 
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -3375,7 +3390,7 @@ QFileInfoRetry:
 
 		if (rc) /* BB add auto retry on EOPNOTSUPP? */
 			rc = -EIO;
-		else if (pSMBr->ByteCount < 40)
+		else if (get_bcc(&pSMBr->hdr) < 40)
 			rc = -EIO;	/* bad smb */
 		else if (pFindData) {
 			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
@@ -3451,7 +3466,7 @@ QPathInfoRetry:
 	else
 		pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_ALL_INFO);
 	pSMB->Reserved4 = 0;
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -3463,9 +3478,9 @@ QPathInfoRetry:
 
 		if (rc) /* BB add auto retry on EOPNOTSUPP? */
 			rc = -EIO;
-		else if (!legacy && (pSMBr->ByteCount < 40))
+		else if (!legacy && get_bcc(&pSMBr->hdr) < 40)
 			rc = -EIO;	/* bad smb */
-		else if (legacy && (pSMBr->ByteCount < 24))
+		else if (legacy && get_bcc(&pSMBr->hdr) < 24)
 			rc = -EIO;  /* 24 or 26 expected but we do not read
 					last field */
 		else if (pFindData) {
@@ -3532,7 +3547,7 @@ UnixQFileInfoRetry:
 	pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC);
 	pSMB->Pad = 0;
 	pSMB->Fid = netfid;
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
 
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -3541,7 +3556,7 @@ UnixQFileInfoRetry:
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
-		if (rc || (pSMBr->ByteCount < sizeof(FILE_UNIX_BASIC_INFO))) {
+		if (rc || get_bcc(&pSMBr->hdr) < sizeof(FILE_UNIX_BASIC_INFO)) {
 			cERROR(1, "Malformed FILE_UNIX_BASIC_INFO response.\n"
 				   "Unix Extensions can be disabled on mount "
 				   "by specifying the nosfu mount option.");
@@ -3617,7 +3632,7 @@ UnixQPathInfoRetry:
 	pSMB->ParameterCount = pSMB->TotalParameterCount;
 	pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC);
 	pSMB->Reserved4 = 0;
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -3627,7 +3642,7 @@ UnixQPathInfoRetry:
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
-		if (rc || (pSMBr->ByteCount < sizeof(FILE_UNIX_BASIC_INFO))) {
+		if (rc || get_bcc(&pSMBr->hdr) < sizeof(FILE_UNIX_BASIC_INFO)) {
 			cERROR(1, "Malformed FILE_UNIX_BASIC_INFO response.\n"
 				   "Unix Extensions can be disabled on mount "
 				   "by specifying the nosfu mount option.");
@@ -3731,7 +3746,7 @@ findFirstRetry:
 
 	/* BB what should we set StorageType to? Does it matter? BB */
 	pSMB->SearchStorageType = 0;
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -3860,7 +3875,7 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
 	byte_count = params + 1 /* pad */ ;
 	pSMB->TotalParameterCount = cpu_to_le16(params);
 	pSMB->ParameterCount = pSMB->TotalParameterCount;
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -4022,7 +4037,7 @@ GetInodeNumberRetry:
 	pSMB->ParameterCount = pSMB->TotalParameterCount;
 	pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_INTERNAL_INFO);
 	pSMB->Reserved4 = 0;
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -4032,8 +4047,8 @@ GetInodeNumberRetry:
 	} else {
 		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
-		if (rc || (pSMBr->ByteCount < 2))
 		/* BB also check enough total bytes returned */
+		if (rc || get_bcc(&pSMBr->hdr) < 2)
 			/* If rc should we check for EOPNOSUPP and
 			disable the srvino flag? or in caller? */
 			rc = -EIO;      /* bad smb */
@@ -4246,7 +4261,7 @@ getDFSRetry:
 	pSMB->ParameterCount = cpu_to_le16(params);
 	pSMB->TotalParameterCount = pSMB->ParameterCount;
 	pSMB->MaxReferralLevel = cpu_to_le16(3);
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 
 	rc = SendReceive(xid, ses, (struct smb_hdr *) pSMB,
@@ -4258,13 +4273,13 @@ getDFSRetry:
 	rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
 	/* BB Also check if enough total bytes returned? */
-	if (rc || (pSMBr->ByteCount < 17)) {
+	if (rc || get_bcc(&pSMBr->hdr) < 17) {
 		rc = -EIO;      /* bad smb */
 		goto GetDFSRefExit;
 	}
 
 	cFYI(1, "Decoding GetDFSRefer response BCC: %d  Offset %d",
-				pSMBr->ByteCount,
+				get_bcc(&pSMBr->hdr),
 				le16_to_cpu(pSMBr->t2.DataOffset));
 
 	/* parse returned result into more usable form */
@@ -4320,7 +4335,7 @@ oldQFSInfoRetry:
 	pSMB->Reserved3 = 0;
 	pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION);
 	pSMB->InformationLevel = cpu_to_le16(SMB_INFO_ALLOCATION);
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -4330,12 +4345,12 @@ oldQFSInfoRetry:
 	} else {                /* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
-		if (rc || (pSMBr->ByteCount < 18))
+		if (rc || get_bcc(&pSMBr->hdr) < 18)
 			rc = -EIO;      /* bad smb */
 		else {
 			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
 			cFYI(1, "qfsinf resp BCC: %d  Offset %d",
-				 pSMBr->ByteCount, data_offset);
+				 get_bcc(&pSMBr->hdr), data_offset);
 
 			response_data = (FILE_SYSTEM_ALLOC_INFO *)
 				(((char *) &pSMBr->hdr.Protocol) + data_offset);
@@ -4399,7 +4414,7 @@ QFSInfoRetry:
 	pSMB->Reserved3 = 0;
 	pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION);
 	pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FS_SIZE_INFO);
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -4409,7 +4424,7 @@ QFSInfoRetry:
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
-		if (rc || (pSMBr->ByteCount < 24))
+		if (rc || get_bcc(&pSMBr->hdr) < 24)
 			rc = -EIO;	/* bad smb */
 		else {
 			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
@@ -4479,7 +4494,7 @@ QFSAttributeRetry:
 	pSMB->Reserved3 = 0;
 	pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION);
 	pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FS_ATTRIBUTE_INFO);
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -4489,7 +4504,7 @@ QFSAttributeRetry:
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
-		if (rc || (pSMBr->ByteCount < 13)) {
+		if (rc || get_bcc(&pSMBr->hdr) < 13) {
 			/* BB also check if enough bytes returned */
 			rc = -EIO;	/* bad smb */
 		} else {
@@ -4550,7 +4565,7 @@ QFSDeviceRetry:
 	pSMB->Reserved3 = 0;
 	pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION);
 	pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FS_DEVICE_INFO);
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -4560,7 +4575,8 @@ QFSDeviceRetry:
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
-		if (rc || (pSMBr->ByteCount < sizeof(FILE_SYSTEM_DEVICE_INFO)))
+		if (rc || get_bcc(&pSMBr->hdr) <
+			  sizeof(FILE_SYSTEM_DEVICE_INFO))
 			rc = -EIO;	/* bad smb */
 		else {
 			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
@@ -4619,7 +4635,7 @@ QFSUnixRetry:
 	pSMB->Reserved3 = 0;
 	pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION);
 	pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_CIFS_UNIX_INFO);
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -4629,7 +4645,7 @@ QFSUnixRetry:
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
-		if (rc || (pSMBr->ByteCount < 13)) {
+		if (rc || get_bcc(&pSMBr->hdr) < 13) {
 			rc = -EIO;	/* bad smb */
 		} else {
 			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
@@ -4702,7 +4718,7 @@ SETFSUnixRetry:
 	pSMB->ClientUnixMinor = cpu_to_le16(CIFS_UNIX_MINOR_VERSION);
 	pSMB->ClientUnixCap = cpu_to_le64(cap);
 
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -4764,7 +4780,7 @@ QFSPosixRetry:
 	pSMB->Reserved3 = 0;
 	pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION);
 	pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_POSIX_FS_INFO);
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -4774,7 +4790,7 @@ QFSPosixRetry:
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
-		if (rc || (pSMBr->ByteCount < 13)) {
+		if (rc || get_bcc(&pSMBr->hdr) < 13) {
 			rc = -EIO;	/* bad smb */
 		} else {
 			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
@@ -4890,7 +4906,7 @@ SetEOFRetry:
 	pSMB->ParameterCount = cpu_to_le16(params);
 	pSMB->TotalParameterCount = pSMB->ParameterCount;
 	pSMB->Reserved4 = 0;
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
 	parm_data->FileSize = cpu_to_le64(size);
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -4969,7 +4985,7 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
 				cpu_to_le16(SMB_SET_FILE_END_OF_FILE_INFO);
 	}
 	pSMB->Reserved4 = 0;
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 	rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
 	if (rc) {
@@ -5037,7 +5053,7 @@ CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon,
 	else
 		pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_BASIC_INFO);
 	pSMB->Reserved4 = 0;
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 	memcpy(data_offset, data, sizeof(FILE_BASIC_INFO));
 	rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
@@ -5096,7 +5112,7 @@ CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon,
 	pSMB->Fid = fid;
 	pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_DISPOSITION_INFO);
 	pSMB->Reserved4 = 0;
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 	*data_offset = delete_file ? 1 : 0;
 	rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
@@ -5169,7 +5185,7 @@ SetTimesRetry:
 	else
 		pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_BASIC_INFO);
 	pSMB->Reserved4 = 0;
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
 	memcpy(data_offset, data, sizeof(FILE_BASIC_INFO));
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -5221,7 +5237,7 @@ SetAttrLgcyRetry:
 	}
 	pSMB->attr = cpu_to_le16(dos_attrs);
 	pSMB->BufferFormat = 0x04;
-	pSMB->hdr.smb_buf_length += name_len + 1;
+	inc_rfc1001_len(pSMB, name_len + 1);
 	pSMB->ByteCount = cpu_to_le16(name_len + 1);
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -5326,7 +5342,7 @@ CIFSSMBUnixSetFileInfo(const int xid, struct cifsTconInfo *tcon,
 	pSMB->Fid = fid;
 	pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_BASIC);
 	pSMB->Reserved4 = 0;
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 
 	cifs_fill_unix_set_info(data_offset, args);
@@ -5402,7 +5418,7 @@ setPermsRetry:
 	pSMB->TotalDataCount = pSMB->DataCount;
 	pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_BASIC);
 	pSMB->Reserved4 = 0;
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
 
 	cifs_fill_unix_set_info(data_offset, args);
 
@@ -5418,79 +5434,6 @@ setPermsRetry:
 	return rc;
 }
 
-int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
-		  const int notify_subdirs, const __u16 netfid,
-		  __u32 filter, struct file *pfile, int multishot,
-		  const struct nls_table *nls_codepage)
-{
-	int rc = 0;
-	struct smb_com_transaction_change_notify_req *pSMB = NULL;
-	struct smb_com_ntransaction_change_notify_rsp *pSMBr = NULL;
-	struct dir_notify_req *dnotify_req;
-	int bytes_returned;
-
-	cFYI(1, "In CIFSSMBNotify for file handle %d", (int)netfid);
-	rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB,
-		      (void **) &pSMBr);
-	if (rc)
-		return rc;
-
-	pSMB->TotalParameterCount = 0 ;
-	pSMB->TotalDataCount = 0;
-	pSMB->MaxParameterCount = cpu_to_le32(2);
-	/* BB find exact data count max from sess structure BB */
-	pSMB->MaxDataCount = 0; /* same in little endian or be */
-/* BB VERIFY verify which is correct for above BB */
-	pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf -
-					     MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
-
-	pSMB->MaxSetupCount = 4;
-	pSMB->Reserved = 0;
-	pSMB->ParameterOffset = 0;
-	pSMB->DataCount = 0;
-	pSMB->DataOffset = 0;
-	pSMB->SetupCount = 4; /* single byte does not need le conversion */
-	pSMB->SubCommand = cpu_to_le16(NT_TRANSACT_NOTIFY_CHANGE);
-	pSMB->ParameterCount = pSMB->TotalParameterCount;
-	if (notify_subdirs)
-		pSMB->WatchTree = 1; /* one byte - no le conversion needed */
-	pSMB->Reserved2 = 0;
-	pSMB->CompletionFilter = cpu_to_le32(filter);
-	pSMB->Fid = netfid; /* file handle always le */
-	pSMB->ByteCount = 0;
-
-	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
-			 (struct smb_hdr *)pSMBr, &bytes_returned,
-			 CIFS_ASYNC_OP);
-	if (rc) {
-		cFYI(1, "Error in Notify = %d", rc);
-	} else {
-		/* Add file to outstanding requests */
-		/* BB change to kmem cache alloc */
-		dnotify_req = kmalloc(
-						sizeof(struct dir_notify_req),
-						 GFP_KERNEL);
-		if (dnotify_req) {
-			dnotify_req->Pid = pSMB->hdr.Pid;
-			dnotify_req->PidHigh = pSMB->hdr.PidHigh;
-			dnotify_req->Mid = pSMB->hdr.Mid;
-			dnotify_req->Tid = pSMB->hdr.Tid;
-			dnotify_req->Uid = pSMB->hdr.Uid;
-			dnotify_req->netfid = netfid;
-			dnotify_req->pfile = pfile;
-			dnotify_req->filter = filter;
-			dnotify_req->multishot = multishot;
-			spin_lock(&GlobalMid_Lock);
-			list_add_tail(&dnotify_req->lhead,
-					&GlobalDnotifyReqList);
-			spin_unlock(&GlobalMid_Lock);
-		} else
-			rc = -ENOMEM;
-	}
-	cifs_buf_release(pSMB);
-	return rc;
-}
-
 #ifdef CONFIG_CIFS_XATTR
 /*
  * Do a path-based QUERY_ALL_EAS call and parse the result. This is a common
@@ -5560,7 +5503,7 @@ QAllEAsRetry:
 	pSMB->ParameterCount = pSMB->TotalParameterCount;
 	pSMB->InformationLevel = cpu_to_le16(SMB_INFO_QUERY_ALL_EAS);
 	pSMB->Reserved4 = 0;
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -5576,7 +5519,7 @@ QAllEAsRetry:
 	of these trans2 responses */
 
 	rc = validate_t2((struct smb_t2_rsp *)pSMBr);
-	if (rc || (pSMBr->ByteCount < 4)) {
+	if (rc || get_bcc(&pSMBr->hdr) < 4) {
 		rc = -EIO;	/* bad smb */
 		goto QAllEAsOut;
 	}
@@ -5773,7 +5716,7 @@ SetEARetry:
 	pSMB->ParameterCount = cpu_to_le16(params);
 	pSMB->TotalParameterCount = pSMB->ParameterCount;
 	pSMB->Reserved4 = 0;
-	pSMB->hdr.smb_buf_length += byte_count;
+	inc_rfc1001_len(pSMB, byte_count);
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -5787,5 +5730,99 @@ SetEARetry:
 
 	return rc;
 }
-
 #endif
+
+#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* BB unused temporarily */
+/*
+ *	Years ago the kernel added a "dnotify" function for Samba server,
+ *	to allow network clients (such as Windows) to display updated
+ *	lists of files in directory listings automatically when
+ *	files are added by one user when another user has the
+ *	same directory open on their desktop.  The Linux cifs kernel
+ *	client hooked into the kernel side of this interface for
+ *	the same reason, but ironically when the VFS moved from
+ *	"dnotify" to "inotify" it became harder to plug in Linux
+ *	network file system clients (the most obvious use case
+ *	for notify interfaces is when multiple users can update
+ *	the contents of the same directory - exactly what network
+ *	file systems can do) although the server (Samba) could
+ *	still use it.  For the short term we leave the worker
+ *	function ifdeffed out (below) until inotify is fixed
+ *	in the VFS to make it easier to plug in network file
+ *	system clients.  If inotify turns out to be permanently
+ *	incompatible for network fs clients, we could instead simply
+ *	expose this config flag by adding a future cifs (and smb2) notify ioctl.
+ */
+int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
+		  const int notify_subdirs, const __u16 netfid,
+		  __u32 filter, struct file *pfile, int multishot,
+		  const struct nls_table *nls_codepage)
+{
+	int rc = 0;
+	struct smb_com_transaction_change_notify_req *pSMB = NULL;
+	struct smb_com_ntransaction_change_notify_rsp *pSMBr = NULL;
+	struct dir_notify_req *dnotify_req;
+	int bytes_returned;
+
+	cFYI(1, "In CIFSSMBNotify for file handle %d", (int)netfid);
+	rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	pSMB->TotalParameterCount = 0 ;
+	pSMB->TotalDataCount = 0;
+	pSMB->MaxParameterCount = cpu_to_le32(2);
+	/* BB find exact data count max from sess structure BB */
+	pSMB->MaxDataCount = 0; /* same in little endian or be */
+/* BB VERIFY verify which is correct for above BB */
+	pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf -
+					     MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
+
+	pSMB->MaxSetupCount = 4;
+	pSMB->Reserved = 0;
+	pSMB->ParameterOffset = 0;
+	pSMB->DataCount = 0;
+	pSMB->DataOffset = 0;
+	pSMB->SetupCount = 4; /* single byte does not need le conversion */
+	pSMB->SubCommand = cpu_to_le16(NT_TRANSACT_NOTIFY_CHANGE);
+	pSMB->ParameterCount = pSMB->TotalParameterCount;
+	if (notify_subdirs)
+		pSMB->WatchTree = 1; /* one byte - no le conversion needed */
+	pSMB->Reserved2 = 0;
+	pSMB->CompletionFilter = cpu_to_le32(filter);
+	pSMB->Fid = netfid; /* file handle always le */
+	pSMB->ByteCount = 0;
+
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *)pSMBr, &bytes_returned,
+			 CIFS_ASYNC_OP);
+	if (rc) {
+		cFYI(1, "Error in Notify = %d", rc);
+	} else {
+		/* Add file to outstanding requests */
+		/* BB change to kmem cache alloc */
+		dnotify_req = kmalloc(
+						sizeof(struct dir_notify_req),
+						 GFP_KERNEL);
+		if (dnotify_req) {
+			dnotify_req->Pid = pSMB->hdr.Pid;
+			dnotify_req->PidHigh = pSMB->hdr.PidHigh;
+			dnotify_req->Mid = pSMB->hdr.Mid;
+			dnotify_req->Tid = pSMB->hdr.Tid;
+			dnotify_req->Uid = pSMB->hdr.Uid;
+			dnotify_req->netfid = netfid;
+			dnotify_req->pfile = pfile;
+			dnotify_req->filter = filter;
+			dnotify_req->multishot = multishot;
+			spin_lock(&GlobalMid_Lock);
+			list_add_tail(&dnotify_req->lhead,
+					&GlobalDnotifyReqList);
+			spin_unlock(&GlobalMid_Lock);
+		} else
+			rc = -ENOMEM;
+	}
+	cifs_buf_release(pSMB);
+	return rc;
+}
+#endif /* was needed for dnotify, and will be needed for inotify when VFS fix */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 277262a..da284e3 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -102,6 +102,7 @@ struct smb_vol {
 	bool fsc:1;	/* enable fscache */
 	bool mfsymlinks:1; /* use Minshall+French Symlinks */
 	bool multiuser:1;
+	bool use_smb2:1; /* force smb2 use on mount instead of cifs */
 	unsigned int rsize;
 	unsigned int wsize;
 	bool sockopt_tcp_nodelay:1;
@@ -316,19 +317,19 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
 	put_unaligned_le16(total_in_buf, &pSMBt->t2_rsp.DataCount);
 
 	/* fix up the BCC */
-	byte_count = get_bcc_le(pTargetSMB);
+	byte_count = get_bcc(pTargetSMB);
 	byte_count += total_in_buf2;
 	/* is the result too big for the field? */
 	if (byte_count > USHRT_MAX)
 		return -EPROTO;
-	put_bcc_le(byte_count, pTargetSMB);
+	put_bcc(byte_count, pTargetSMB);
 
-	byte_count = pTargetSMB->smb_buf_length;
+	byte_count = be32_to_cpu(pTargetSMB->smb_buf_length);
 	byte_count += total_in_buf2;
 	/* don't allow buffer to overflow */
 	if (byte_count > CIFSMaxBufSize)
 		return -ENOBUFS;
-	pTargetSMB->smb_buf_length = byte_count;
+	pTargetSMB->smb_buf_length = cpu_to_be32(byte_count);
 
 	memcpy(data_area_of_target, data_area_of_buf2, total_in_buf2);
 
@@ -495,8 +496,7 @@ incomplete_rcv:
 		/* Note that FC 1001 length is big endian on the wire,
 		but we convert it here so it is always manipulated
 		as host byte order */
-		pdu_length = be32_to_cpu((__force __be32)smb_buffer->smb_buf_length);
-		smb_buffer->smb_buf_length = pdu_length;
+		pdu_length = be32_to_cpu(smb_buffer->smb_buf_length);
 
 		cFYI(1, "rfc1002 length 0x%x", pdu_length+4);
 
@@ -735,7 +735,7 @@ multi_t2_fnd:
 		sock_release(csocket);
 		server->ssocket = NULL;
 	}
-	/* buffer usuallly freed in free_mid - need to free it here on exit */
+	/* buffer usually freed in free_mid - need to free it here on exit */
 	cifs_buf_release(bigbuf);
 	if (smallbuf) /* no sense logging a debug message if NULL */
 		cifs_small_buf_release(smallbuf);
@@ -818,10 +818,11 @@ extract_hostname(const char *unc)
 }
 
 static int
-cifs_parse_mount_options(char *options, const char *devname,
+cifs_parse_mount_options(const char *mountdata, const char *devname,
 			 struct smb_vol *vol)
 {
 	char *value, *data, *end;
+	char *mountdata_copy, *options;
 	unsigned int  temp_len, i, j;
 	char separator[2];
 	short int override_uid = -1;
@@ -861,9 +862,14 @@ cifs_parse_mount_options(char *options, const char *devname,
 
 	vol->actimeo = CIFS_DEF_ACTIMEO;
 
-	if (!options)
-		return 1;
+	if (!mountdata)
+		goto cifs_parse_mount_err;
+
+	mountdata_copy = kstrndup(mountdata, PAGE_SIZE, GFP_KERNEL);
+	if (!mountdata_copy)
+		goto cifs_parse_mount_err;
 
+	options = mountdata_copy;
 	end = options + strlen(options);
 	if (strncmp(options, "sep=", 4) == 0) {
 		if (options[4] != 0) {
@@ -889,17 +895,22 @@ cifs_parse_mount_options(char *options, const char *devname,
 			if (!value) {
 				printk(KERN_WARNING
 				       "CIFS: invalid or missing username\n");
-				return 1;	/* needs_arg; */
+				goto cifs_parse_mount_err;
 			} else if (!*value) {
 				/* null user, ie anonymous, authentication */
 				vol->nullauth = 1;
 			}
 			if (strnlen(value, MAX_USERNAME_SIZE) <
 						MAX_USERNAME_SIZE) {
-				vol->username = value;
+				vol->username = kstrdup(value, GFP_KERNEL);
+				if (!vol->username) {
+					printk(KERN_WARNING "CIFS: no memory "
+							    "for username\n");
+					goto cifs_parse_mount_err;
+				}
 			} else {
 				printk(KERN_WARNING "CIFS: username too long\n");
-				return 1;
+				goto cifs_parse_mount_err;
 			}
 		} else if (strnicmp(data, "pass", 4) == 0) {
 			if (!value) {
@@ -963,7 +974,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 				if (vol->password == NULL) {
 					printk(KERN_WARNING "CIFS: no memory "
 							    "for password\n");
-					return 1;
+					goto cifs_parse_mount_err;
 				}
 				for (i = 0, j = 0; i < temp_len; i++, j++) {
 					vol->password[j] = value[i];
@@ -979,7 +990,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 				if (vol->password == NULL) {
 					printk(KERN_WARNING "CIFS: no memory "
 							    "for password\n");
-					return 1;
+					goto cifs_parse_mount_err;
 				}
 				strcpy(vol->password, value);
 			}
@@ -989,11 +1000,16 @@ cifs_parse_mount_options(char *options, const char *devname,
 				vol->UNCip = NULL;
 			} else if (strnlen(value, INET6_ADDRSTRLEN) <
 							INET6_ADDRSTRLEN) {
-				vol->UNCip = value;
+				vol->UNCip = kstrdup(value, GFP_KERNEL);
+				if (!vol->UNCip) {
+					printk(KERN_WARNING "CIFS: no memory "
+							    "for UNC IP\n");
+					goto cifs_parse_mount_err;
+				}
 			} else {
 				printk(KERN_WARNING "CIFS: ip address "
 						    "too long\n");
-				return 1;
+				goto cifs_parse_mount_err;
 			}
 		} else if (strnicmp(data, "sec", 3) == 0) {
 			if (!value || !*value) {
@@ -1006,7 +1022,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 				/* vol->secFlg |= CIFSSEC_MUST_SEAL |
 					CIFSSEC_MAY_KRB5; */
 				cERROR(1, "Krb5 cifs privacy not supported");
-				return 1;
+				goto cifs_parse_mount_err;
 			} else if (strnicmp(value, "krb5", 4) == 0) {
 				vol->secFlg |= CIFSSEC_MAY_KRB5;
 			} else if (strnicmp(value, "ntlmsspi", 8) == 0) {
@@ -1036,7 +1052,23 @@ cifs_parse_mount_options(char *options, const char *devname,
 				vol->nullauth = 1;
 			} else {
 				cERROR(1, "bad security option: %s", value);
-				return 1;
+				goto cifs_parse_mount_err;
+			}
+		} else if (strnicmp(data, "vers", 3) == 0) {
+			if (!value || !*value) {
+				cERROR(1, "no protocol version specified"
+					  " after vers= mount option");
+			} else if ((strnicmp(value, "cifs", 4) == 0) ||
+				   (strnicmp(value, "1", 1) == 0)) {
+				/* this is the default */
+				continue;
+			} else if ((strnicmp(value, "smb2", 4) == 0) ||
+				   (strnicmp(value, "2", 1) == 0)) {
+#ifdef CONFIG_CIFS_SMB2
+				vol->use_smb2 = true;
+#else
+				cERROR(1, "smb2 support not enabled");
+#endif /* CONFIG_CIFS_SMB2 */
 			}
 		} else if ((strnicmp(data, "unc", 3) == 0)
 			   || (strnicmp(data, "target", 6) == 0)
@@ -1044,12 +1076,12 @@ cifs_parse_mount_options(char *options, const char *devname,
 			if (!value || !*value) {
 				printk(KERN_WARNING "CIFS: invalid path to "
 						    "network resource\n");
-				return 1;	/* needs_arg; */
+				goto cifs_parse_mount_err;
 			}
 			if ((temp_len = strnlen(value, 300)) < 300) {
 				vol->UNC = kmalloc(temp_len+1, GFP_KERNEL);
 				if (vol->UNC == NULL)
-					return 1;
+					goto cifs_parse_mount_err;
 				strcpy(vol->UNC, value);
 				if (strncmp(vol->UNC, "//", 2) == 0) {
 					vol->UNC[0] = '\\';
@@ -1058,27 +1090,32 @@ cifs_parse_mount_options(char *options, const char *devname,
 					printk(KERN_WARNING
 					       "CIFS: UNC Path does not begin "
 					       "with // or \\\\ \n");
-					return 1;
+					goto cifs_parse_mount_err;
 				}
 			} else {
 				printk(KERN_WARNING "CIFS: UNC name too long\n");
-				return 1;
+				goto cifs_parse_mount_err;
 			}
 		} else if ((strnicmp(data, "domain", 3) == 0)
 			   || (strnicmp(data, "workgroup", 5) == 0)) {
 			if (!value || !*value) {
 				printk(KERN_WARNING "CIFS: invalid domain name\n");
-				return 1;	/* needs_arg; */
+				goto cifs_parse_mount_err;
 			}
 			/* BB are there cases in which a comma can be valid in
 			a domain name and need special handling? */
 			if (strnlen(value, 256) < 256) {
-				vol->domainname = value;
+				vol->domainname = kstrdup(value, GFP_KERNEL);
+				if (!vol->domainname) {
+					printk(KERN_WARNING "CIFS: no memory "
+							    "for domainname\n");
+					goto cifs_parse_mount_err;
+				}
 				cFYI(1, "Domain name set");
 			} else {
 				printk(KERN_WARNING "CIFS: domain name too "
 						    "long\n");
-				return 1;
+				goto cifs_parse_mount_err;
 			}
 		} else if (strnicmp(data, "srcaddr", 7) == 0) {
 			vol->srcaddr.ss_family = AF_UNSPEC;
@@ -1086,7 +1123,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 			if (!value || !*value) {
 				printk(KERN_WARNING "CIFS: srcaddr value"
 				       " not specified.\n");
-				return 1;	/* needs_arg; */
+				goto cifs_parse_mount_err;
 			}
 			i = cifs_convert_address((struct sockaddr *)&vol->srcaddr,
 						 value, strlen(value));
@@ -1094,20 +1131,20 @@ cifs_parse_mount_options(char *options, const char *devname,
 				printk(KERN_WARNING "CIFS:  Could not parse"
 				       " srcaddr: %s\n",
 				       value);
-				return 1;
+				goto cifs_parse_mount_err;
 			}
 		} else if (strnicmp(data, "prefixpath", 10) == 0) {
 			if (!value || !*value) {
 				printk(KERN_WARNING
 					"CIFS: invalid path prefix\n");
-				return 1;       /* needs_argument */
+				goto cifs_parse_mount_err;
 			}
 			if ((temp_len = strnlen(value, 1024)) < 1024) {
 				if (value[0] != '/')
 					temp_len++;  /* missing leading slash */
 				vol->prepath = kmalloc(temp_len+1, GFP_KERNEL);
 				if (vol->prepath == NULL)
-					return 1;
+					goto cifs_parse_mount_err;
 				if (value[0] != '/') {
 					vol->prepath[0] = '/';
 					strcpy(vol->prepath+1, value);
@@ -1116,24 +1153,33 @@ cifs_parse_mount_options(char *options, const char *devname,
 				cFYI(1, "prefix path %s", vol->prepath);
 			} else {
 				printk(KERN_WARNING "CIFS: prefix too long\n");
-				return 1;
+				goto cifs_parse_mount_err;
 			}
 		} else if (strnicmp(data, "iocharset", 9) == 0) {
 			if (!value || !*value) {
 				printk(KERN_WARNING "CIFS: invalid iocharset "
 						    "specified\n");
-				return 1;	/* needs_arg; */
+				goto cifs_parse_mount_err;
 			}
 			if (strnlen(value, 65) < 65) {
-				if (strnicmp(value, "default", 7))
-					vol->iocharset = value;
+				if (strnicmp(value, "default", 7)) {
+					vol->iocharset = kstrdup(value,
+								 GFP_KERNEL);
+
+					if (!vol->iocharset) {
+						printk(KERN_WARNING "CIFS: no "
+								   "memory for"
+								   "charset\n");
+						goto cifs_parse_mount_err;
+					}
+				}
 				/* if iocharset not set then load_nls_default
 				   is used by caller */
 				cFYI(1, "iocharset set to %s", value);
 			} else {
 				printk(KERN_WARNING "CIFS: iocharset name "
 						    "too long.\n");
-				return 1;
+				goto cifs_parse_mount_err;
 			}
 		} else if (!strnicmp(data, "uid", 3) && value && *value) {
 			vol->linux_uid = simple_strtoul(value, &value, 0);
@@ -1246,7 +1292,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 				if (vol->actimeo > CIFS_MAX_ACTIMEO) {
 					cERROR(1, "CIFS: attribute cache"
 							"timeout too large");
-					return 1;
+					goto cifs_parse_mount_err;
 				}
 			}
 		} else if (strnicmp(data, "credentials", 4) == 0) {
@@ -1390,7 +1436,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 #ifndef CONFIG_CIFS_FSCACHE
 			cERROR(1, "FS-Cache support needs CONFIG_CIFS_FSCACHE"
 				  "kernel config option set");
-			return 1;
+			goto cifs_parse_mount_err;
 #endif
 			vol->fsc = true;
 		} else if (strnicmp(data, "mfsymlinks", 10) == 0) {
@@ -1405,12 +1451,12 @@ cifs_parse_mount_options(char *options, const char *devname,
 		if (devname == NULL) {
 			printk(KERN_WARNING "CIFS: Missing UNC name for mount "
 						"target\n");
-			return 1;
+			goto cifs_parse_mount_err;
 		}
 		if ((temp_len = strnlen(devname, 300)) < 300) {
 			vol->UNC = kmalloc(temp_len+1, GFP_KERNEL);
 			if (vol->UNC == NULL)
-				return 1;
+				goto cifs_parse_mount_err;
 			strcpy(vol->UNC, devname);
 			if (strncmp(vol->UNC, "//", 2) == 0) {
 				vol->UNC[0] = '\\';
@@ -1418,21 +1464,21 @@ cifs_parse_mount_options(char *options, const char *devname,
 			} else if (strncmp(vol->UNC, "\\\\", 2) != 0) {
 				printk(KERN_WARNING "CIFS: UNC Path does not "
 						    "begin with // or \\\\ \n");
-				return 1;
+				goto cifs_parse_mount_err;
 			}
 			value = strpbrk(vol->UNC+2, "/\\");
 			if (value)
 				*value = '\\';
 		} else {
 			printk(KERN_WARNING "CIFS: UNC name too long\n");
-			return 1;
+			goto cifs_parse_mount_err;
 		}
 	}
 
 	if (vol->multiuser && !(vol->secFlg & CIFSSEC_MAY_KRB5)) {
 		cERROR(1, "Multiuser mounts currently require krb5 "
 			  "authentication!");
-		return 1;
+		goto cifs_parse_mount_err;
 	}
 
 	if (vol->UNCip == NULL)
@@ -1450,7 +1496,12 @@ cifs_parse_mount_options(char *options, const char *devname,
 		printk(KERN_NOTICE "CIFS: ignoring forcegid mount option "
 				   "specified with no gid= option.\n");
 
+	kfree(mountdata_copy);
 	return 0;
+
+cifs_parse_mount_err:
+	kfree(mountdata_copy);
+	return 1;
 }
 
 /** Returns true if srcaddr isn't specified and rhs isn't
@@ -2280,7 +2331,7 @@ ip_rfc1001_connect(struct TCP_Server_Info *server)
 		smb_buf = (struct smb_hdr *)ses_init_buf;
 
 		/* sizeof RFC1002_SESSION_REQUEST with no scope */
-		smb_buf->smb_buf_length = 0x81000044;
+		smb_buf->smb_buf_length = cpu_to_be32(0x81000044);
 		rc = smb_send(server, smb_buf, 0x44);
 		kfree(ses_init_buf);
 		/*
@@ -2691,8 +2742,12 @@ cleanup_volume_info(struct smb_vol **pvolume_info)
 		return;
 
 	volume_info = *pvolume_info;
+	kfree(volume_info->username);
 	kzfree(volume_info->password);
 	kfree(volume_info->UNC);
+	kfree(volume_info->UNCip);
+	kfree(volume_info->domainname);
+	kfree(volume_info->iocharset);
 	kfree(volume_info->prepath);
 	kfree(volume_info);
 	*pvolume_info = NULL;
@@ -2729,11 +2784,65 @@ build_unc_path_to_root(const struct smb_vol *volume_info,
 	full_path[unc_len + cifs_sb->prepathlen] = 0; /* add trailing null */
 	return full_path;
 }
+
+/*
+ * Perform a dfs referral query for a share and (optionally) prefix
+ *
+ * If a referral is found, cifs_sb->mountdata will be (re-)allocated
+ * to a string containing updated options for the submount.  Otherwise it
+ * will be left untouched.
+ *
+ * Returns the rc from get_dfs_path to the caller, which can be used to
+ * determine whether there were referrals.
+ */
+static int
+expand_dfs_referral(int xid, struct cifsSesInfo *pSesInfo,
+		    struct smb_vol *volume_info, struct cifs_sb_info *cifs_sb,
+		    int check_prefix)
+{
+	int rc;
+	unsigned int num_referrals = 0;
+	struct dfs_info3_param *referrals = NULL;
+	char *full_path = NULL, *ref_path = NULL, *mdata = NULL;
+
+	full_path = build_unc_path_to_root(volume_info, cifs_sb);
+	if (IS_ERR(full_path))
+		return PTR_ERR(full_path);
+
+	/* For DFS paths, skip the first '\' of the UNC */
+	ref_path = check_prefix ? full_path + 1 : volume_info->UNC + 1;
+
+	rc = get_dfs_path(xid, pSesInfo , ref_path, cifs_sb->local_nls,
+			  &num_referrals, &referrals,
+			  cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+
+	if (!rc && num_referrals > 0) {
+		char *fake_devname = NULL;
+
+		mdata = cifs_compose_mount_options(cifs_sb->mountdata,
+						   full_path + 1, referrals,
+						   &fake_devname);
+
+		free_dfs_info_array(referrals, num_referrals);
+		kfree(fake_devname);
+
+		if (cifs_sb->mountdata != NULL)
+			kfree(cifs_sb->mountdata);
+
+		if (IS_ERR(mdata)) {
+			rc = PTR_ERR(mdata);
+			mdata = NULL;
+		}
+		cifs_sb->mountdata = mdata;
+	}
+	kfree(full_path);
+	return rc;
+}
 #endif
 
 int
 cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
-		char *mount_data_global, const char *devname)
+		const char *devname)
 {
 	int rc;
 	int xid;
@@ -2742,13 +2851,20 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 	struct cifsTconInfo *tcon;
 	struct TCP_Server_Info *srvTcp;
 	char   *full_path;
-	char *mount_data = mount_data_global;
 	struct tcon_link *tlink;
 #ifdef CONFIG_CIFS_DFS_UPCALL
-	struct dfs_info3_param *referrals = NULL;
-	unsigned int num_referrals = 0;
 	int referral_walks_count = 0;
 try_mount_again:
+	/* cleanup activities if we're chasing a referral */
+	if (referral_walks_count) {
+		if (tcon)
+			cifs_put_tcon(tcon);
+		else if (pSesInfo)
+			cifs_put_smb_ses(pSesInfo);
+
+		cleanup_volume_info(&volume_info);
+		FreeXid(xid);
+	}
 #endif
 	rc = 0;
 	tcon = NULL;
@@ -2765,7 +2881,8 @@ try_mount_again:
 		goto out;
 	}
 
-	if (cifs_parse_mount_options(mount_data, devname, volume_info)) {
+	if (cifs_parse_mount_options(cifs_sb->mountdata, devname,
+				     volume_info)) {
 		rc = -EINVAL;
 		goto out;
 	}
@@ -2861,6 +2978,24 @@ try_mount_again:
 			       (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE));
 
 remote_path_check:
+#ifdef CONFIG_CIFS_DFS_UPCALL
+	/*
+	 * Perform an unconditional check for whether there are DFS
+	 * referrals for this path without prefix, to provide support
+	 * for DFS referrals from w2k8 servers which don't seem to respond
+	 * with PATH_NOT_COVERED to requests that include the prefix.
+	 * Chase the referral if found, otherwise continue normally.
+	 */
+	if (referral_walks_count == 0) {
+		int refrc = expand_dfs_referral(xid, pSesInfo, volume_info,
+						cifs_sb, false);
+		if (!refrc) {
+			referral_walks_count++;
+			goto try_mount_again;
+		}
+	}
+#endif
+
 	/* check if a whole path (including prepath) is not remote */
 	if (!rc && tcon) {
 		/* build_path_to_root works only when we have a valid tcon */
@@ -2894,46 +3029,15 @@ remote_path_check:
 		if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) == 0)
 			convert_delimiter(cifs_sb->prepath,
 					CIFS_DIR_SEP(cifs_sb));
-		full_path = build_unc_path_to_root(volume_info, cifs_sb);
-		if (IS_ERR(full_path)) {
-			rc = PTR_ERR(full_path);
-			goto mount_fail_check;
-		}
-
-		cFYI(1, "Getting referral for: %s", full_path);
-		rc = get_dfs_path(xid, pSesInfo , full_path + 1,
-			cifs_sb->local_nls, &num_referrals, &referrals,
-			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
-		if (!rc && num_referrals > 0) {
-			char *fake_devname = NULL;
-
-			if (mount_data != mount_data_global)
-				kfree(mount_data);
 
-			mount_data = cifs_compose_mount_options(
-					cifs_sb->mountdata, full_path + 1,
-					referrals, &fake_devname);
+		rc = expand_dfs_referral(xid, pSesInfo, volume_info, cifs_sb,
+					 true);
 
-			free_dfs_info_array(referrals, num_referrals);
-			kfree(fake_devname);
-			kfree(full_path);
-
-			if (IS_ERR(mount_data)) {
-				rc = PTR_ERR(mount_data);
-				mount_data = NULL;
-				goto mount_fail_check;
-			}
-
-			if (tcon)
-				cifs_put_tcon(tcon);
-			else if (pSesInfo)
-				cifs_put_smb_ses(pSesInfo);
-
-			cleanup_volume_info(&volume_info);
+		if (!rc) {
 			referral_walks_count++;
-			FreeXid(xid);
 			goto try_mount_again;
 		}
+		goto mount_fail_check;
 #else /* No DFS support, return error on mount */
 		rc = -EOPNOTSUPP;
 #endif
@@ -2966,8 +3070,6 @@ remote_path_check:
 mount_fail_check:
 	/* on error free sesinfo and tcon struct if needed */
 	if (rc) {
-		if (mount_data != mount_data_global)
-			kfree(mount_data);
 		/* If find_unc succeeded then rc == 0 so we can not end */
 		/* up accidentally freeing someone elses tcon struct */
 		if (tcon)
@@ -3083,7 +3185,8 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 	bcc_ptr += strlen("?????");
 	bcc_ptr += 1;
 	count = bcc_ptr - &pSMB->Password[0];
-	pSMB->hdr.smb_buf_length += count;
+	pSMB->hdr.smb_buf_length = cpu_to_be32(be32_to_cpu(
+					pSMB->hdr.smb_buf_length) + count);
 	pSMB->ByteCount = cpu_to_le16(count);
 
 	rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response, &length,
@@ -3258,7 +3361,9 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
 	struct cifsSesInfo *ses;
 	struct cifsTconInfo *tcon = NULL;
 	struct smb_vol *vol_info;
-	char username[MAX_USERNAME_SIZE + 1];
+	char username[28]; /* big enough for "krb50x" + hex of ULONG_MAX 6+16 */
+			   /* We used to have this as MAX_USERNAME which is   */
+			   /* way too big now (256 instead of 32) */
 
 	vol_info = kzalloc(sizeof(*vol_info), GFP_KERNEL);
 	if (vol_info == NULL) {
diff --git a/fs/cifs/export.c b/fs/cifs/export.c
index 993f820..55d87ac 100644
--- a/fs/cifs/export.c
+++ b/fs/cifs/export.c
@@ -45,7 +45,7 @@
 #include "cifs_debug.h"
 #include "cifsfs.h"
 
-#ifdef CONFIG_CIFS_EXPERIMENTAL
+#ifdef CIFS_NFSD_EXPORT
 static struct dentry *cifs_get_parent(struct dentry *dentry)
 {
 	/* BB need to add code here eventually to enable export via NFSD */
@@ -63,5 +63,5 @@ const struct export_operations cifs_export_ops = {
 	.encode_fs =  */
 };
 
-#endif /* EXPERIMENTAL */
+#endif /* CIFS_NFSD_EXPORT */
 
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index faf5952..c672afe 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -857,95 +857,6 @@ cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
 		cifsi->server_eof = end_of_write;
 }
 
-ssize_t cifs_user_write(struct file *file, const char __user *write_data,
-	size_t write_size, loff_t *poffset)
-{
-	struct inode *inode = file->f_path.dentry->d_inode;
-	int rc = 0;
-	unsigned int bytes_written = 0;
-	unsigned int total_written;
-	struct cifs_sb_info *cifs_sb;
-	struct cifsTconInfo *pTcon;
-	int xid;
-	struct cifsFileInfo *open_file;
-	struct cifsInodeInfo *cifsi = CIFS_I(inode);
-
-	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-
-	/* cFYI(1, " write %d bytes to offset %lld of %s", write_size,
-	   *poffset, file->f_path.dentry->d_name.name); */
-
-	if (file->private_data == NULL)
-		return -EBADF;
-
-	open_file = file->private_data;
-	pTcon = tlink_tcon(open_file->tlink);
-
-	rc = generic_write_checks(file, poffset, &write_size, 0);
-	if (rc)
-		return rc;
-
-	xid = GetXid();
-
-	for (total_written = 0; write_size > total_written;
-	     total_written += bytes_written) {
-		rc = -EAGAIN;
-		while (rc == -EAGAIN) {
-			if (file->private_data == NULL) {
-				/* file has been closed on us */
-				FreeXid(xid);
-			/* if we have gotten here we have written some data
-			   and blocked, and the file has been freed on us while
-			   we blocked so return what we managed to write */
-				return total_written;
-			}
-			if (open_file->invalidHandle) {
-				/* we could deadlock if we called
-				   filemap_fdatawait from here so tell
-				   reopen_file not to flush data to server
-				   now */
-				rc = cifs_reopen_file(open_file, false);
-				if (rc != 0)
-					break;
-			}
-
-			rc = CIFSSMBWrite(xid, pTcon,
-				open_file->netfid,
-				min_t(const int, cifs_sb->wsize,
-				      write_size - total_written),
-				*poffset, &bytes_written,
-				NULL, write_data + total_written, 0);
-		}
-		if (rc || (bytes_written == 0)) {
-			if (total_written)
-				break;
-			else {
-				FreeXid(xid);
-				return rc;
-			}
-		} else {
-			cifs_update_eof(cifsi, *poffset, bytes_written);
-			*poffset += bytes_written;
-		}
-	}
-
-	cifs_stats_bytes_written(pTcon, total_written);
-
-/* Do not update local mtime - server will set its actual value on write
- *	inode->i_ctime = inode->i_mtime =
- * 		current_fs_time(inode->i_sb);*/
-	if (total_written > 0) {
-		spin_lock(&inode->i_lock);
-		if (*poffset > inode->i_size)
-			i_size_write(inode, *poffset);
-		spin_unlock(&inode->i_lock);
-	}
-	mark_inode_dirty_sync(inode);
-
-	FreeXid(xid);
-	return total_written;
-}
-
 static ssize_t cifs_write(struct cifsFileInfo *open_file,
 			  const char *write_data, size_t write_size,
 			  loff_t *poffset)
@@ -1420,9 +1331,10 @@ retry_write:
 	return rc;
 }
 
-static int cifs_writepage(struct page *page, struct writeback_control *wbc)
+static int
+cifs_writepage_locked(struct page *page, struct writeback_control *wbc)
 {
-	int rc = -EFAULT;
+	int rc;
 	int xid;
 
 	xid = GetXid();
@@ -1442,15 +1354,29 @@ static int cifs_writepage(struct page *page, struct writeback_control *wbc)
 	 * to fail to update with the state of the page correctly.
 	 */
 	set_page_writeback(page);
+retry_write:
 	rc = cifs_partialpagewrite(page, 0, PAGE_CACHE_SIZE);
-	SetPageUptodate(page); /* BB add check for error and Clearuptodate? */
-	unlock_page(page);
+	if (rc == -EAGAIN && wbc->sync_mode == WB_SYNC_ALL)
+		goto retry_write;
+	else if (rc == -EAGAIN)
+		redirty_page_for_writepage(wbc, page);
+	else if (rc != 0)
+		SetPageError(page);
+	else
+		SetPageUptodate(page);
 	end_page_writeback(page);
 	page_cache_release(page);
 	FreeXid(xid);
 	return rc;
 }
 
+static int cifs_writepage(struct page *page, struct writeback_control *wbc)
+{
+	int rc = cifs_writepage_locked(page, wbc);
+	unlock_page(page);
+	return rc;
+}
+
 static int cifs_write_end(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned copied,
 			struct page *page, void *fsdata)
@@ -1519,8 +1445,13 @@ int cifs_strict_fsync(struct file *file, int datasync)
 	cFYI(1, "Sync file - name: %s datasync: 0x%x",
 		file->f_path.dentry->d_name.name, datasync);
 
-	if (!CIFS_I(inode)->clientCanCacheRead)
-		cifs_invalidate_mapping(inode);
+	if (!CIFS_I(inode)->clientCanCacheRead) {
+		rc = cifs_invalidate_mapping(inode);
+		if (rc) {
+			cFYI(1, "rc: %d during invalidate phase", rc);
+			rc = 0; /* don't care about it in fsync */
+		}
+	}
 
 	tcon = tlink_tcon(smbfile->tlink);
 	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
@@ -1726,7 +1657,7 @@ cifs_iovec_write(struct file *file, const struct iovec *iov,
 	return total_written;
 }
 
-static ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,
+ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,
 				unsigned long nr_segs, loff_t pos)
 {
 	ssize_t written;
@@ -1849,17 +1780,7 @@ cifs_iovec_read(struct file *file, const struct iovec *iov,
 	return total_read;
 }
 
-ssize_t cifs_user_read(struct file *file, char __user *read_data,
-		       size_t read_size, loff_t *poffset)
-{
-	struct iovec iov;
-	iov.iov_base = read_data;
-	iov.iov_len = read_size;
-
-	return cifs_iovec_read(file, &iov, 1, poffset);
-}
-
-static ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov,
+ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov,
 			       unsigned long nr_segs, loff_t pos)
 {
 	ssize_t read;
@@ -1987,8 +1908,11 @@ int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma)
 
 	xid = GetXid();
 
-	if (!CIFS_I(inode)->clientCanCacheRead)
-		cifs_invalidate_mapping(inode);
+	if (!CIFS_I(inode)->clientCanCacheRead) {
+		rc = cifs_invalidate_mapping(inode);
+		if (rc)
+			return rc;
+	}
 
 	rc = generic_file_mmap(file, vma);
 	if (rc == 0)
@@ -2415,6 +2339,27 @@ static void cifs_invalidate_page(struct page *page, unsigned long offset)
 		cifs_fscache_invalidate_page(page, &cifsi->vfs_inode);
 }
 
+static int cifs_launder_page(struct page *page)
+{
+	int rc = 0;
+	loff_t range_start = page_offset(page);
+	loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_ALL,
+		.nr_to_write = 0,
+		.range_start = range_start,
+		.range_end = range_end,
+	};
+
+	cFYI(1, "Launder page: %p", page);
+
+	if (clear_page_dirty_for_io(page))
+		rc = cifs_writepage_locked(page, &wbc);
+
+	cifs_fscache_invalidate_page(page, page->mapping->host);
+	return rc;
+}
+
 void cifs_oplock_break(struct work_struct *work)
 {
 	struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
@@ -2486,7 +2431,7 @@ const struct address_space_operations cifs_addr_ops = {
 	.set_page_dirty = __set_page_dirty_nobuffers,
 	.releasepage = cifs_release_page,
 	.invalidatepage = cifs_invalidate_page,
-	/* .direct_IO = */
+	.launder_page = cifs_launder_page,
 };
 
 /*
@@ -2503,5 +2448,5 @@ const struct address_space_operations cifs_addr_ops_smallbuf = {
 	.set_page_dirty = __set_page_dirty_nobuffers,
 	.releasepage = cifs_release_page,
 	.invalidatepage = cifs_invalidate_page,
-	/* .direct_IO = */
+	.launder_page = cifs_launder_page,
 };
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 8852470..de02ed5 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -878,7 +878,7 @@ retry_iget5_locked:
 }
 
 /* gets root inode */
-struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
+struct inode *cifs_root_iget(struct super_block *sb)
 {
 	int xid;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
@@ -1683,71 +1683,70 @@ cifs_inode_needs_reval(struct inode *inode)
 /*
  * Zap the cache. Called when invalid_mapping flag is set.
  */
-void
+int
 cifs_invalidate_mapping(struct inode *inode)
 {
-	int rc;
+	int rc = 0;
 	struct cifsInodeInfo *cifs_i = CIFS_I(inode);
 
 	cifs_i->invalid_mapping = false;
 
-	/* write back any cached data */
 	if (inode->i_mapping && inode->i_mapping->nrpages != 0) {
-		rc = filemap_write_and_wait(inode->i_mapping);
-		mapping_set_error(inode->i_mapping, rc);
+		rc = invalidate_inode_pages2(inode->i_mapping);
+		if (rc) {
+			cERROR(1, "%s: could not invalidate inode %p", __func__,
+			       inode);
+			cifs_i->invalid_mapping = true;
+		}
 	}
-	invalidate_remote_inode(inode);
+
 	cifs_fscache_reset_inode_cookie(inode);
+	return rc;
 }
 
-int cifs_revalidate_file(struct file *filp)
+int cifs_revalidate_file_attr(struct file *filp)
 {
 	int rc = 0;
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
 
 	if (!cifs_inode_needs_reval(inode))
-		goto check_inval;
+		return rc;
 
 	if (tlink_tcon(cfile->tlink)->unix_ext)
 		rc = cifs_get_file_info_unix(filp);
 	else
 		rc = cifs_get_file_info(filp);
 
-check_inval:
-	if (CIFS_I(inode)->invalid_mapping)
-		cifs_invalidate_mapping(inode);
-
 	return rc;
 }
 
-/* revalidate a dentry's inode attributes */
-int cifs_revalidate_dentry(struct dentry *dentry)
+int cifs_revalidate_dentry_attr(struct dentry *dentry)
 {
 	int xid;
 	int rc = 0;
-	char *full_path = NULL;
 	struct inode *inode = dentry->d_inode;
 	struct super_block *sb = dentry->d_sb;
+	char *full_path = NULL;
 
 	if (inode == NULL)
 		return -ENOENT;
 
-	xid = GetXid();
-
 	if (!cifs_inode_needs_reval(inode))
-		goto check_inval;
+		return rc;
+
+	xid = GetXid();
 
 	/* can not safely grab the rename sem here if rename calls revalidate
 	   since that would deadlock */
 	full_path = build_path_from_dentry(dentry);
 	if (full_path == NULL) {
 		rc = -ENOMEM;
-		goto check_inval;
+		goto out;
 	}
 
-	cFYI(1, "Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld "
-		 "jiffies %ld", full_path, inode, inode->i_count.counter,
+	cFYI(1, "Update attributes: %s inode 0x%p count %d dentry: 0x%p d_time "
+		 "%ld jiffies %ld", full_path, inode, inode->i_count.counter,
 		 dentry, dentry->d_time, jiffies);
 
 	if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext)
@@ -1756,41 +1755,83 @@ int cifs_revalidate_dentry(struct dentry *dentry)
 		rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
 					 xid, NULL);
 
-check_inval:
-	if (CIFS_I(inode)->invalid_mapping)
-		cifs_invalidate_mapping(inode);
-
+out:
 	kfree(full_path);
 	FreeXid(xid);
 	return rc;
 }
 
+int cifs_revalidate_file(struct file *filp)
+{
+	int rc;
+	struct inode *inode = filp->f_path.dentry->d_inode;
+
+	rc = cifs_revalidate_file_attr(filp);
+	if (rc)
+		return rc;
+
+	if (CIFS_I(inode)->invalid_mapping)
+		rc = cifs_invalidate_mapping(inode);
+	return rc;
+}
+
+/* revalidate a dentry's inode attributes */
+int cifs_revalidate_dentry(struct dentry *dentry)
+{
+	int rc;
+	struct inode *inode = dentry->d_inode;
+
+	rc = cifs_revalidate_dentry_attr(dentry);
+	if (rc)
+		return rc;
+
+	if (CIFS_I(inode)->invalid_mapping)
+		rc = cifs_invalidate_mapping(inode);
+	return rc;
+}
+
 int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		 struct kstat *stat)
 {
 	struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb);
 	struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
-	int err = cifs_revalidate_dentry(dentry);
-
-	if (!err) {
-		generic_fillattr(dentry->d_inode, stat);
-		stat->blksize = CIFS_MAX_MSGSIZE;
-		stat->ino = CIFS_I(dentry->d_inode)->uniqueid;
+	struct inode *inode = dentry->d_inode;
+	int rc;
 
-		/*
-		 * If on a multiuser mount without unix extensions, and the
-		 * admin hasn't overridden them, set the ownership to the
-		 * fsuid/fsgid of the current process.
-		 */
-		if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) &&
-		    !tcon->unix_ext) {
-			if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID))
-				stat->uid = current_fsuid();
-			if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID))
-				stat->gid = current_fsgid();
+	/*
+	 * We need to be sure that all dirty pages are written and the server
+	 * has actual ctime, mtime and file length.
+	 */
+	if (!CIFS_I(inode)->clientCanCacheRead && inode->i_mapping &&
+	    inode->i_mapping->nrpages != 0) {
+		rc = filemap_fdatawait(inode->i_mapping);
+		if (rc) {
+			mapping_set_error(inode->i_mapping, rc);
+			return rc;
 		}
 	}
-	return err;
+
+	rc = cifs_revalidate_dentry_attr(dentry);
+	if (rc)
+		return rc;
+
+	generic_fillattr(inode, stat);
+	stat->blksize = CIFS_MAX_MSGSIZE;
+	stat->ino = CIFS_I(inode)->uniqueid;
+
+	/*
+	 * If on a multiuser mount without unix extensions, and the admin hasn't
+	 * overridden them, set the ownership to the fsuid/fsgid of the current
+	 * process.
+	 */
+	if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) &&
+	    !tcon->unix_ext) {
+		if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID))
+			stat->uid = current_fsuid();
+		if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID))
+			stat->gid = current_fsgid();
+	}
+	return rc;
 }
 
 static int cifs_truncate_page(struct address_space *mapping, loff_t from)
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 0c684ae..907531a 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -304,12 +304,10 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
 
 	memset(temp, 0, 256); /* bigger than MAX_CIFS_HDR_SIZE */
 
-	buffer->smb_buf_length =
+	buffer->smb_buf_length = cpu_to_be32(
 	    (2 * word_count) + sizeof(struct smb_hdr) -
 	    4 /*  RFC 1001 length field does not count */  +
-	    2 /* for bcc field itself */ ;
-	/* Note that this is the only network field that has to be converted
-	   to big endian and it is done just before we send it */
+	    2 /* for bcc field itself */) ;
 
 	buffer->Protocol[0] = 0xFF;
 	buffer->Protocol[1] = 'S';
@@ -424,7 +422,7 @@ check_smb_hdr(struct smb_hdr *smb, __u16 mid)
 int
 checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
 {
-	__u32 len = smb->smb_buf_length;
+	__u32 len = be32_to_cpu(smb->smb_buf_length);
 	__u32 clc_len;  /* calculated length */
 	cFYI(0, "checkSMB Length: 0x%x, smb_buf_length: 0x%x", length, len);
 
@@ -464,7 +462,7 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
 
 	if (check_smb_hdr(smb, mid))
 		return 1;
-	clc_len = smbCalcSize_LE(smb);
+	clc_len = smbCalcSize(smb);
 
 	if (4 + len != length) {
 		cERROR(1, "Length read does not match RFC1001 length %d",
@@ -521,7 +519,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 			(struct smb_com_transaction_change_notify_rsp *)buf;
 		struct file_notify_information *pnotify;
 		__u32 data_offset = 0;
-		if (get_bcc_le(buf) > sizeof(struct file_notify_information)) {
+		if (get_bcc(buf) > sizeof(struct file_notify_information)) {
 			data_offset = le32_to_cpu(pSMBr->DataOffset);
 
 			pnotify = (struct file_notify_information *)
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 79f641e..79b71c2 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -919,13 +919,6 @@ smbCalcSize(struct smb_hdr *ptr)
 		2 /* size of the bcc field */ + get_bcc(ptr));
 }
 
-unsigned int
-smbCalcSize_LE(struct smb_hdr *ptr)
-{
-	return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) +
-		2 /* size of the bcc field */ + get_bcc_le(ptr));
-}
-
 /* The following are taken from fs/ntfs/util.c */
 
 #define NTFS_TIME_OFFSET ((u64)(369*365 + 89) * 24 * 3600 * 10000000)
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 645114a..7dd4621 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -621,7 +621,7 @@ ssetup_ntlmssp_authenticate:
 	and rest of bcc area. This allows us to avoid
 	a large buffer 17K allocation */
 	iov[0].iov_base = (char *)pSMB;
-	iov[0].iov_len = smb_buf->smb_buf_length + 4;
+	iov[0].iov_len = be32_to_cpu(smb_buf->smb_buf_length) + 4;
 
 	/* setting this here allows the code at the end of the function
 	   to free the request buffer if there's an error */
@@ -656,7 +656,7 @@ ssetup_ntlmssp_authenticate:
 		 * to use challenge/response method (i.e. Password bit is 1).
 		 */
 
-		calc_lanman_hash(ses->password, ses->server->cryptkey,
+		rc = calc_lanman_hash(ses->password, ses->server->cryptkey,
 				 ses->server->secMode & SECMODE_PW_ENCRYPT ?
 					true : false, lnm_session_key);
 
@@ -859,9 +859,10 @@ ssetup_ntlmssp_authenticate:
 	iov[2].iov_len = (long) bcc_ptr - (long) str_area;
 
 	count = iov[1].iov_len + iov[2].iov_len;
-	smb_buf->smb_buf_length += count;
+	smb_buf->smb_buf_length =
+		cpu_to_be32(be32_to_cpu(smb_buf->smb_buf_length) + count);
 
-	put_bcc_le(count, smb_buf);
+	put_bcc(count, smb_buf);
 
 	rc = SendReceive2(xid, ses, iov, 3 /* num_iovecs */, &resp_buf_type,
 			  CIFS_LOG_ERROR);
diff --git a/fs/cifs/smbdes.c b/fs/cifs/smbdes.c
deleted file mode 100644
index 0472148..0000000
--- a/fs/cifs/smbdes.c
+++ /dev/null
@@ -1,418 +0,0 @@
-/*
-   Unix SMB/Netbios implementation.
-   Version 1.9.
-
-   a partial implementation of DES designed for use in the
-   SMB authentication protocol
-
-   Copyright (C) Andrew Tridgell 1998
-   Modified by Steve French (sfrench@us.ibm.com) 2002,2004
-
-   This program is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published by
-   the Free Software Foundation; either version 2 of the License, or
-   (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-*/
-
-/* NOTES:
-
-   This code makes no attempt to be fast! In fact, it is a very
-   slow implementation
-
-   This code is NOT a complete DES implementation. It implements only
-   the minimum necessary for SMB authentication, as used by all SMB
-   products (including every copy of Microsoft Windows95 ever sold)
-
-   In particular, it can only do a unchained forward DES pass. This
-   means it is not possible to use this code for encryption/decryption
-   of data, instead it is only useful as a "hash" algorithm.
-
-   There is no entry point into this code that allows normal DES operation.
-
-   I believe this means that this code does not come under ITAR
-   regulations but this is NOT a legal opinion. If you are concerned
-   about the applicability of ITAR regulations to this code then you
-   should confirm it for yourself (and maybe let me know if you come
-   up with a different answer to the one above)
-*/
-#include <linux/slab.h>
-#define uchar unsigned char
-
-static uchar perm1[56] = { 57, 49, 41, 33, 25, 17, 9,
-	1, 58, 50, 42, 34, 26, 18,
-	10, 2, 59, 51, 43, 35, 27,
-	19, 11, 3, 60, 52, 44, 36,
-	63, 55, 47, 39, 31, 23, 15,
-	7, 62, 54, 46, 38, 30, 22,
-	14, 6, 61, 53, 45, 37, 29,
-	21, 13, 5, 28, 20, 12, 4
-};
-
-static uchar perm2[48] = { 14, 17, 11, 24, 1, 5,
-	3, 28, 15, 6, 21, 10,
-	23, 19, 12, 4, 26, 8,
-	16, 7, 27, 20, 13, 2,
-	41, 52, 31, 37, 47, 55,
-	30, 40, 51, 45, 33, 48,
-	44, 49, 39, 56, 34, 53,
-	46, 42, 50, 36, 29, 32
-};
-
-static uchar perm3[64] = { 58, 50, 42, 34, 26, 18, 10, 2,
-	60, 52, 44, 36, 28, 20, 12, 4,
-	62, 54, 46, 38, 30, 22, 14, 6,
-	64, 56, 48, 40, 32, 24, 16, 8,
-	57, 49, 41, 33, 25, 17, 9, 1,
-	59, 51, 43, 35, 27, 19, 11, 3,
-	61, 53, 45, 37, 29, 21, 13, 5,
-	63, 55, 47, 39, 31, 23, 15, 7
-};
-
-static uchar perm4[48] = { 32, 1, 2, 3, 4, 5,
-	4, 5, 6, 7, 8, 9,
-	8, 9, 10, 11, 12, 13,
-	12, 13, 14, 15, 16, 17,
-	16, 17, 18, 19, 20, 21,
-	20, 21, 22, 23, 24, 25,
-	24, 25, 26, 27, 28, 29,
-	28, 29, 30, 31, 32, 1
-};
-
-static uchar perm5[32] = { 16, 7, 20, 21,
-	29, 12, 28, 17,
-	1, 15, 23, 26,
-	5, 18, 31, 10,
-	2, 8, 24, 14,
-	32, 27, 3, 9,
-	19, 13, 30, 6,
-	22, 11, 4, 25
-};
-
-static uchar perm6[64] = { 40, 8, 48, 16, 56, 24, 64, 32,
-	39, 7, 47, 15, 55, 23, 63, 31,
-	38, 6, 46, 14, 54, 22, 62, 30,
-	37, 5, 45, 13, 53, 21, 61, 29,
-	36, 4, 44, 12, 52, 20, 60, 28,
-	35, 3, 43, 11, 51, 19, 59, 27,
-	34, 2, 42, 10, 50, 18, 58, 26,
-	33, 1, 41, 9, 49, 17, 57, 25
-};
-
-static uchar sc[16] = { 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1 };
-
-static uchar sbox[8][4][16] = {
-	{{14, 4, 13, 1, 2, 15, 11, 8, 3, 10, 6, 12, 5, 9, 0, 7},
-	 {0, 15, 7, 4, 14, 2, 13, 1, 10, 6, 12, 11, 9, 5, 3, 8},
-	 {4, 1, 14, 8, 13, 6, 2, 11, 15, 12, 9, 7, 3, 10, 5, 0},
-	 {15, 12, 8, 2, 4, 9, 1, 7, 5, 11, 3, 14, 10, 0, 6, 13} },
-
-	{{15, 1, 8, 14, 6, 11, 3, 4, 9, 7, 2, 13, 12, 0, 5, 10},
-	 {3, 13, 4, 7, 15, 2, 8, 14, 12, 0, 1, 10, 6, 9, 11, 5},
-	 {0, 14, 7, 11, 10, 4, 13, 1, 5, 8, 12, 6, 9, 3, 2, 15},
-	 {13, 8, 10, 1, 3, 15, 4, 2, 11, 6, 7, 12, 0, 5, 14, 9} },
-
-	{{10, 0, 9, 14, 6, 3, 15, 5, 1, 13, 12, 7, 11, 4, 2, 8},
-	 {13, 7, 0, 9, 3, 4, 6, 10, 2, 8, 5, 14, 12, 11, 15, 1},
-	 {13, 6, 4, 9, 8, 15, 3, 0, 11, 1, 2, 12, 5, 10, 14, 7},
-	 {1, 10, 13, 0, 6, 9, 8, 7, 4, 15, 14, 3, 11, 5, 2, 12} },
-
-	{{7, 13, 14, 3, 0, 6, 9, 10, 1, 2, 8, 5, 11, 12, 4, 15},
-	 {13, 8, 11, 5, 6, 15, 0, 3, 4, 7, 2, 12, 1, 10, 14, 9},
-	 {10, 6, 9, 0, 12, 11, 7, 13, 15, 1, 3, 14, 5, 2, 8, 4},
-	 {3, 15, 0, 6, 10, 1, 13, 8, 9, 4, 5, 11, 12, 7, 2, 14} },
-
-	{{2, 12, 4, 1, 7, 10, 11, 6, 8, 5, 3, 15, 13, 0, 14, 9},
-	 {14, 11, 2, 12, 4, 7, 13, 1, 5, 0, 15, 10, 3, 9, 8, 6},
-	 {4, 2, 1, 11, 10, 13, 7, 8, 15, 9, 12, 5, 6, 3, 0, 14},
-	 {11, 8, 12, 7, 1, 14, 2, 13, 6, 15, 0, 9, 10, 4, 5, 3} },
-
-	{{12, 1, 10, 15, 9, 2, 6, 8, 0, 13, 3, 4, 14, 7, 5, 11},
-	 {10, 15, 4, 2, 7, 12, 9, 5, 6, 1, 13, 14, 0, 11, 3, 8},
-	 {9, 14, 15, 5, 2, 8, 12, 3, 7, 0, 4, 10, 1, 13, 11, 6},
-	 {4, 3, 2, 12, 9, 5, 15, 10, 11, 14, 1, 7, 6, 0, 8, 13} },
-
-	{{4, 11, 2, 14, 15, 0, 8, 13, 3, 12, 9, 7, 5, 10, 6, 1},
-	 {13, 0, 11, 7, 4, 9, 1, 10, 14, 3, 5, 12, 2, 15, 8, 6},
-	 {1, 4, 11, 13, 12, 3, 7, 14, 10, 15, 6, 8, 0, 5, 9, 2},
-	 {6, 11, 13, 8, 1, 4, 10, 7, 9, 5, 0, 15, 14, 2, 3, 12} },
-
-	{{13, 2, 8, 4, 6, 15, 11, 1, 10, 9, 3, 14, 5, 0, 12, 7},
-	 {1, 15, 13, 8, 10, 3, 7, 4, 12, 5, 6, 11, 0, 14, 9, 2},
-	 {7, 11, 4, 1, 9, 12, 14, 2, 0, 6, 10, 13, 15, 3, 5, 8},
-	 {2, 1, 14, 7, 4, 10, 8, 13, 15, 12, 9, 0, 3, 5, 6, 11} }
-};
-
-static void
-permute(char *out, char *in, uchar *p, int n)
-{
-	int i;
-	for (i = 0; i < n; i++)
-		out[i] = in[p[i] - 1];
-}
-
-static void
-lshift(char *d, int count, int n)
-{
-	char out[64];
-	int i;
-	for (i = 0; i < n; i++)
-		out[i] = d[(i + count) % n];
-	for (i = 0; i < n; i++)
-		d[i] = out[i];
-}
-
-static void
-concat(char *out, char *in1, char *in2, int l1, int l2)
-{
-	while (l1--)
-		*out++ = *in1++;
-	while (l2--)
-		*out++ = *in2++;
-}
-
-static void
-xor(char *out, char *in1, char *in2, int n)
-{
-	int i;
-	for (i = 0; i < n; i++)
-		out[i] = in1[i] ^ in2[i];
-}
-
-static void
-dohash(char *out, char *in, char *key, int forw)
-{
-	int i, j, k;
-	char *pk1;
-	char c[28];
-	char d[28];
-	char *cd;
-	char (*ki)[48];
-	char *pd1;
-	char l[32], r[32];
-	char *rl;
-
-	/* Have to reduce stack usage */
-	pk1 = kmalloc(56+56+64+64, GFP_KERNEL);
-	if (pk1 == NULL)
-		return;
-
-	ki = kmalloc(16*48, GFP_KERNEL);
-	if (ki == NULL) {
-		kfree(pk1);
-		return;
-	}
-
-	cd = pk1 + 56;
-	pd1 = cd  + 56;
-	rl = pd1 + 64;
-
-	permute(pk1, key, perm1, 56);
-
-	for (i = 0; i < 28; i++)
-		c[i] = pk1[i];
-	for (i = 0; i < 28; i++)
-		d[i] = pk1[i + 28];
-
-	for (i = 0; i < 16; i++) {
-		lshift(c, sc[i], 28);
-		lshift(d, sc[i], 28);
-
-		concat(cd, c, d, 28, 28);
-		permute(ki[i], cd, perm2, 48);
-	}
-
-	permute(pd1, in, perm3, 64);
-
-	for (j = 0; j < 32; j++) {
-		l[j] = pd1[j];
-		r[j] = pd1[j + 32];
-	}
-
-	for (i = 0; i < 16; i++) {
-		char *er;  /* er[48]  */
-		char *erk; /* erk[48] */
-		char b[8][6];
-		char *cb;  /* cb[32]  */
-		char *pcb; /* pcb[32] */
-		char *r2;  /* r2[32]  */
-
-		er = kmalloc(48+48+32+32+32, GFP_KERNEL);
-		if (er == NULL) {
-			kfree(pk1);
-			kfree(ki);
-			return;
-		}
-		erk = er+48;
-		cb  = erk+48;
-		pcb = cb+32;
-		r2  = pcb+32;
-
-		permute(er, r, perm4, 48);
-
-		xor(erk, er, ki[forw ? i : 15 - i], 48);
-
-		for (j = 0; j < 8; j++)
-			for (k = 0; k < 6; k++)
-				b[j][k] = erk[j * 6 + k];
-
-		for (j = 0; j < 8; j++) {
-			int m, n;
-			m = (b[j][0] << 1) | b[j][5];
-
-			n = (b[j][1] << 3) | (b[j][2] << 2) | (b[j][3] <<
-							       1) | b[j][4];
-
-			for (k = 0; k < 4; k++)
-				b[j][k] =
-				    (sbox[j][m][n] & (1 << (3 - k))) ? 1 : 0;
-		}
-
-		for (j = 0; j < 8; j++)
-			for (k = 0; k < 4; k++)
-				cb[j * 4 + k] = b[j][k];
-		permute(pcb, cb, perm5, 32);
-
-		xor(r2, l, pcb, 32);
-
-		for (j = 0; j < 32; j++)
-			l[j] = r[j];
-
-		for (j = 0; j < 32; j++)
-			r[j] = r2[j];
-
-		kfree(er);
-	}
-
-	concat(rl, r, l, 32, 32);
-
-	permute(out, rl, perm6, 64);
-	kfree(pk1);
-	kfree(ki);
-}
-
-static void
-str_to_key(unsigned char *str, unsigned char *key)
-{
-	int i;
-
-	key[0] = str[0] >> 1;
-	key[1] = ((str[0] & 0x01) << 6) | (str[1] >> 2);
-	key[2] = ((str[1] & 0x03) << 5) | (str[2] >> 3);
-	key[3] = ((str[2] & 0x07) << 4) | (str[3] >> 4);
-	key[4] = ((str[3] & 0x0F) << 3) | (str[4] >> 5);
-	key[5] = ((str[4] & 0x1F) << 2) | (str[5] >> 6);
-	key[6] = ((str[5] & 0x3F) << 1) | (str[6] >> 7);
-	key[7] = str[6] & 0x7F;
-	for (i = 0; i < 8; i++)
-		key[i] = (key[i] << 1);
-}
-
-static void
-smbhash(unsigned char *out, const unsigned char *in, unsigned char *key,
-	int forw)
-{
-	int i;
-	char *outb; /* outb[64] */
-	char *inb;  /* inb[64]  */
-	char *keyb; /* keyb[64] */
-	unsigned char key2[8];
-
-	outb = kmalloc(64 * 3, GFP_KERNEL);
-	if (outb == NULL)
-		return;
-
-	inb  = outb + 64;
-	keyb = inb +  64;
-
-	str_to_key(key, key2);
-
-	for (i = 0; i < 64; i++) {
-		inb[i] = (in[i / 8] & (1 << (7 - (i % 8)))) ? 1 : 0;
-		keyb[i] = (key2[i / 8] & (1 << (7 - (i % 8)))) ? 1 : 0;
-		outb[i] = 0;
-	}
-
-	dohash(outb, inb, keyb, forw);
-
-	for (i = 0; i < 8; i++)
-		out[i] = 0;
-
-	for (i = 0; i < 64; i++) {
-		if (outb[i])
-			out[i / 8] |= (1 << (7 - (i % 8)));
-	}
-	kfree(outb);
-}
-
-void
-E_P16(unsigned char *p14, unsigned char *p16)
-{
-	unsigned char sp8[8] =
-	    { 0x4b, 0x47, 0x53, 0x21, 0x40, 0x23, 0x24, 0x25 };
-	smbhash(p16, sp8, p14, 1);
-	smbhash(p16 + 8, sp8, p14 + 7, 1);
-}
-
-void
-E_P24(unsigned char *p21, const unsigned char *c8, unsigned char *p24)
-{
-	smbhash(p24, c8, p21, 1);
-	smbhash(p24 + 8, c8, p21 + 7, 1);
-	smbhash(p24 + 16, c8, p21 + 14, 1);
-}
-
-#if 0 /* currently unused */
-static void
-D_P16(unsigned char *p14, unsigned char *in, unsigned char *out)
-{
-	smbhash(out, in, p14, 0);
-	smbhash(out + 8, in + 8, p14 + 7, 0);
-}
-
-static void
-E_old_pw_hash(unsigned char *p14, unsigned char *in, unsigned char *out)
-{
-	smbhash(out, in, p14, 1);
-	smbhash(out + 8, in + 8, p14 + 7, 1);
-}
-/* these routines are currently unneeded, but may be
-	needed later */
-void
-cred_hash1(unsigned char *out, unsigned char *in, unsigned char *key)
-{
-	unsigned char buf[8];
-
-	smbhash(buf, in, key, 1);
-	smbhash(out, buf, key + 9, 1);
-}
-
-void
-cred_hash2(unsigned char *out, unsigned char *in, unsigned char *key)
-{
-	unsigned char buf[8];
-	static unsigned char key2[8];
-
-	smbhash(buf, in, key, 1);
-	key2[0] = key[7];
-	smbhash(out, buf, key2, 1);
-}
-
-void
-cred_hash3(unsigned char *out, unsigned char *in, unsigned char *key, int forw)
-{
-	static unsigned char key2[8];
-
-	smbhash(out, in, key, forw);
-	key2[0] = key[7];
-	smbhash(out + 8, in + 8, key2, forw);
-}
-#endif /* unneeded routines */
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index b5041c8..1525d5e 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -47,6 +47,88 @@
 #define SSVALX(buf,pos,val) (CVAL(buf,pos)=(val)&0xFF,CVAL(buf,pos+1)=(val)>>8)
 #define SSVAL(buf,pos,val) SSVALX((buf),(pos),((__u16)(val)))
 
+static void
+str_to_key(unsigned char *str, unsigned char *key)
+{
+	int i;
+
+	key[0] = str[0] >> 1;
+	key[1] = ((str[0] & 0x01) << 6) | (str[1] >> 2);
+	key[2] = ((str[1] & 0x03) << 5) | (str[2] >> 3);
+	key[3] = ((str[2] & 0x07) << 4) | (str[3] >> 4);
+	key[4] = ((str[3] & 0x0F) << 3) | (str[4] >> 5);
+	key[5] = ((str[4] & 0x1F) << 2) | (str[5] >> 6);
+	key[6] = ((str[5] & 0x3F) << 1) | (str[6] >> 7);
+	key[7] = str[6] & 0x7F;
+	for (i = 0; i < 8; i++)
+		key[i] = (key[i] << 1);
+}
+
+static int
+smbhash(unsigned char *out, const unsigned char *in, unsigned char *key)
+{
+	int rc;
+	unsigned char key2[8];
+	struct crypto_blkcipher *tfm_des;
+	struct scatterlist sgin, sgout;
+	struct blkcipher_desc desc;
+
+	str_to_key(key, key2);
+
+	tfm_des = crypto_alloc_blkcipher("ecb(des)", 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(tfm_des)) {
+		rc = PTR_ERR(tfm_des);
+		cERROR(1, "could not allocate des crypto API\n");
+		goto smbhash_err;
+	}
+
+	desc.tfm = tfm_des;
+
+	crypto_blkcipher_setkey(tfm_des, key2, 8);
+
+	sg_init_one(&sgin, in, 8);
+	sg_init_one(&sgout, out, 8);
+
+	rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, 8);
+	if (rc) {
+		cERROR(1, "could not encrypt crypt key rc: %d\n", rc);
+		crypto_free_blkcipher(tfm_des);
+		goto smbhash_err;
+	}
+
+smbhash_err:
+	return rc;
+}
+
+static int
+E_P16(unsigned char *p14, unsigned char *p16)
+{
+	int rc;
+	unsigned char sp8[8] =
+	    { 0x4b, 0x47, 0x53, 0x21, 0x40, 0x23, 0x24, 0x25 };
+
+	rc = smbhash(p16, sp8, p14);
+	if (rc)
+		return rc;
+	rc = smbhash(p16 + 8, sp8, p14 + 7);
+	return rc;
+}
+
+static int
+E_P24(unsigned char *p21, const unsigned char *c8, unsigned char *p24)
+{
+	int rc;
+
+	rc = smbhash(p24, c8, p21);
+	if (rc)
+		return rc;
+	rc = smbhash(p24 + 8, c8, p21 + 7);
+	if (rc)
+		return rc;
+	rc = smbhash(p24 + 16, c8, p21 + 14);
+	return rc;
+}
+
 /* produce a md4 message digest from data of length n bytes */
 int
 mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len)
@@ -87,40 +169,30 @@ mdfour_err:
 	return rc;
 }
 
-/* Does the des encryption from the NT or LM MD4 hash. */
-static void
-SMBOWFencrypt(unsigned char passwd[16], const unsigned char *c8,
-	      unsigned char p24[24])
-{
-	unsigned char p21[21];
-
-	memset(p21, '\0', 21);
-
-	memcpy(p21, passwd, 16);
-	E_P24(p21, c8, p24);
-}
-
 /*
    This implements the X/Open SMB password encryption
    It takes a password, a 8 byte "crypt key" and puts 24 bytes of
    encrypted password into p24 */
 /* Note that password must be uppercased and null terminated */
-void
+int
 SMBencrypt(unsigned char *passwd, const unsigned char *c8, unsigned char *p24)
 {
-	unsigned char p14[15], p21[21];
+	int rc;
+	unsigned char p14[14], p16[16], p21[21];
 
-	memset(p21, '\0', 21);
 	memset(p14, '\0', 14);
-	strncpy((char *) p14, (char *) passwd, 14);
+	memset(p16, '\0', 16);
+	memset(p21, '\0', 21);
 
-/*	strupper((char *)p14); *//* BB at least uppercase the easy range */
-	E_P16(p14, p21);
+	memcpy(p14, passwd, 14);
+	rc = E_P16(p14, p16);
+	if (rc)
+		return rc;
 
-	SMBOWFencrypt(p21, c8, p24);
+	memcpy(p21, p16, 16);
+	rc = E_P24(p21, c8, p24);
 
-	memset(p14, 0, 15);
-	memset(p21, 0, 21);
+	return rc;
 }
 
 /* Routines for Windows NT MD4 Hash functions. */
@@ -279,16 +351,18 @@ int
 SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24)
 {
 	int rc;
-	unsigned char p21[21];
+	unsigned char p16[16], p21[21];
 
+	memset(p16, '\0', 16);
 	memset(p21, '\0', 21);
 
-	rc = E_md4hash(passwd, p21);
+	rc = E_md4hash(passwd, p16);
 	if (rc) {
 		cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc);
 		return rc;
 	}
-	SMBOWFencrypt(p21, c8, p24);
+	memcpy(p21, p16, 16);
+	rc = E_P24(p21, c8, p24);
 	return rc;
 }
 
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 46d8756f..f2513fb 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -129,7 +129,7 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
 	unsigned int len = iov[0].iov_len;
 	unsigned int total_len;
 	int first_vec = 0;
-	unsigned int smb_buf_length = smb_buffer->smb_buf_length;
+	unsigned int smb_buf_length = be32_to_cpu(smb_buffer->smb_buf_length);
 	struct socket *ssocket = server->ssocket;
 
 	if (ssocket == NULL)
@@ -144,17 +144,10 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
 	else
 		smb_msg.msg_flags = MSG_NOSIGNAL;
 
-	/* smb header is converted in header_assemble. bcc and rest of SMB word
-	   area, and byte area if necessary, is converted to littleendian in
-	   cifssmb.c and RFC1001 len is converted to bigendian in smb_send
-	   Flags2 is converted in SendReceive */
-
-
 	total_len = 0;
 	for (i = 0; i < n_vec; i++)
 		total_len += iov[i].iov_len;
 
-	smb_buffer->smb_buf_length = cpu_to_be32(smb_buffer->smb_buf_length);
 	cFYI(1, "Sending smb:  total_len %d", total_len);
 	dump_smb(smb_buffer, len);
 
@@ -243,7 +236,7 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
 
 	/* Don't want to modify the buffer as a
 	   side effect of this call. */
-	smb_buffer->smb_buf_length = smb_buf_length;
+	smb_buffer->smb_buf_length = cpu_to_be32(smb_buf_length);
 
 	return rc;
 }
@@ -387,7 +380,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_hdr *in_buf,
 #ifdef CONFIG_CIFS_STATS2
 	atomic_inc(&server->inSend);
 #endif
-	rc = smb_send(server, in_buf, in_buf->smb_buf_length);
+	rc = smb_send(server, in_buf, be32_to_cpu(in_buf->smb_buf_length));
 #ifdef CONFIG_CIFS_STATS2
 	atomic_dec(&server->inSend);
 	mid->when_sent = jiffies;
@@ -422,7 +415,7 @@ SendReceiveNoRsp(const unsigned int xid, struct cifsSesInfo *ses,
 	int resp_buf_type;
 
 	iov[0].iov_base = (char *)in_buf;
-	iov[0].iov_len = in_buf->smb_buf_length + 4;
+	iov[0].iov_len = be32_to_cpu(in_buf->smb_buf_length) + 4;
 	flags |= CIFS_NO_RESP;
 	rc = SendReceive2(xid, ses, iov, 1, &resp_buf_type, flags);
 	cFYI(DBG2, "SendRcvNoRsp flags %d rc %d", flags, rc);
@@ -488,10 +481,10 @@ send_nt_cancel(struct TCP_Server_Info *server, struct smb_hdr *in_buf,
 	int rc = 0;
 
 	/* -4 for RFC1001 length and +2 for BCC field */
-	in_buf->smb_buf_length = sizeof(struct smb_hdr) - 4  + 2;
+	in_buf->smb_buf_length = cpu_to_be32(sizeof(struct smb_hdr) - 4  + 2);
 	in_buf->Command = SMB_COM_NT_CANCEL;
 	in_buf->WordCount = 0;
-	put_bcc_le(0, in_buf);
+	put_bcc(0, in_buf);
 
 	mutex_lock(&server->srv_mutex);
 	rc = cifs_sign_smb(in_buf, server, &mid->sequence_number);
@@ -499,7 +492,7 @@ send_nt_cancel(struct TCP_Server_Info *server, struct smb_hdr *in_buf,
 		mutex_unlock(&server->srv_mutex);
 		return rc;
 	}
-	rc = smb_send(server, in_buf, in_buf->smb_buf_length);
+	rc = smb_send(server, in_buf, be32_to_cpu(in_buf->smb_buf_length));
 	mutex_unlock(&server->srv_mutex);
 
 	cFYI(1, "issued NT_CANCEL for mid %u, rc = %d",
@@ -612,7 +605,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 		return rc;
 	}
 
-	receive_len = midQ->resp_buf->smb_buf_length;
+	receive_len = be32_to_cpu(midQ->resp_buf->smb_buf_length);
 
 	if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
 		cERROR(1, "Frame too large received.  Length: %d  Xid: %d",
@@ -651,11 +644,6 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 		rc = map_smb_to_linux_error(midQ->resp_buf,
 					    flags & CIFS_LOG_ERROR);
 
-		/* convert ByteCount if necessary */
-		if (receive_len >= sizeof(struct smb_hdr) - 4
-		    /* do not count RFC1001 header */  +
-		    (2 * midQ->resp_buf->WordCount) + 2 /* bcc */ )
-			put_bcc(get_bcc_le(midQ->resp_buf), midQ->resp_buf);
 		if ((flags & CIFS_NO_RESP) == 0)
 			midQ->resp_buf = NULL;  /* mark it so buf will
 						   not be freed by
@@ -698,9 +686,10 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 	   to the same server. We may make this configurable later or
 	   use ses->maxReq */
 
-	if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
+	if (be32_to_cpu(in_buf->smb_buf_length) > CIFSMaxBufSize +
+			MAX_CIFS_HDR_SIZE - 4) {
 		cERROR(1, "Illegal length, greater than maximum frame, %d",
-			   in_buf->smb_buf_length);
+			   be32_to_cpu(in_buf->smb_buf_length));
 		return -EIO;
 	}
 
@@ -733,7 +722,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 #ifdef CONFIG_CIFS_STATS2
 	atomic_inc(&ses->server->inSend);
 #endif
-	rc = smb_send(ses->server, in_buf, in_buf->smb_buf_length);
+	rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length));
 #ifdef CONFIG_CIFS_STATS2
 	atomic_dec(&ses->server->inSend);
 	midQ->when_sent = jiffies;
@@ -768,7 +757,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 		return rc;
 	}
 
-	receive_len = midQ->resp_buf->smb_buf_length;
+	receive_len = be32_to_cpu(midQ->resp_buf->smb_buf_length);
 
 	if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
 		cERROR(1, "Frame too large received.  Length: %d  Xid: %d",
@@ -781,7 +770,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 
 	if (midQ->resp_buf && out_buf
 	    && (midQ->midState == MID_RESPONSE_RECEIVED)) {
-		out_buf->smb_buf_length = receive_len;
+		out_buf->smb_buf_length = cpu_to_be32(receive_len);
 		memcpy((char *)out_buf + 4,
 		       (char *)midQ->resp_buf + 4,
 		       receive_len);
@@ -800,16 +789,10 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 			}
 		}
 
-		*pbytes_returned = out_buf->smb_buf_length;
+		*pbytes_returned = be32_to_cpu(out_buf->smb_buf_length);
 
 		/* BB special case reconnect tid and uid here? */
 		rc = map_smb_to_linux_error(out_buf, 0 /* no log */ );
-
-		/* convert ByteCount if necessary */
-		if (receive_len >= sizeof(struct smb_hdr) - 4
-		    /* do not count RFC1001 header */  +
-		    (2 * out_buf->WordCount) + 2 /* bcc */ )
-			put_bcc(get_bcc_le(midQ->resp_buf), midQ->resp_buf);
 	} else {
 		rc = -EIO;
 		cERROR(1, "Bad MID state?");
@@ -877,9 +860,10 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 	   to the same server. We may make this configurable later or
 	   use ses->maxReq */
 
-	if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
+	if (be32_to_cpu(in_buf->smb_buf_length) > CIFSMaxBufSize +
+			MAX_CIFS_HDR_SIZE - 4) {
 		cERROR(1, "Illegal length, greater than maximum frame, %d",
-			   in_buf->smb_buf_length);
+			   be32_to_cpu(in_buf->smb_buf_length));
 		return -EIO;
 	}
 
@@ -910,7 +894,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 #ifdef CONFIG_CIFS_STATS2
 	atomic_inc(&ses->server->inSend);
 #endif
-	rc = smb_send(ses->server, in_buf, in_buf->smb_buf_length);
+	rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length));
 #ifdef CONFIG_CIFS_STATS2
 	atomic_dec(&ses->server->inSend);
 	midQ->when_sent = jiffies;
@@ -977,7 +961,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 	if (rc != 0)
 		return rc;
 
-	receive_len = midQ->resp_buf->smb_buf_length;
+	receive_len = be32_to_cpu(midQ->resp_buf->smb_buf_length);
 	if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
 		cERROR(1, "Frame too large received.  Length: %d  Xid: %d",
 			receive_len, xid);
@@ -993,7 +977,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 		goto out;
 	}
 
-	out_buf->smb_buf_length = receive_len;
+	out_buf->smb_buf_length = cpu_to_be32(receive_len);
 	memcpy((char *)out_buf + 4,
 	       (char *)midQ->resp_buf + 4,
 	       receive_len);
@@ -1012,17 +996,11 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 		}
 	}
 
-	*pbytes_returned = out_buf->smb_buf_length;
+	*pbytes_returned = be32_to_cpu(out_buf->smb_buf_length);
 
 	/* BB special case reconnect tid and uid here? */
 	rc = map_smb_to_linux_error(out_buf, 0 /* no log */ );
 
-	/* convert ByteCount if necessary */
-	if (receive_len >= sizeof(struct smb_hdr) - 4
-	    /* do not count RFC1001 header */  +
-	    (2 * out_buf->WordCount) + 2 /* bcc */ )
-		put_bcc(get_bcc_le(out_buf), out_buf);
-
 out:
 	delete_mid(midQ);
 	if (rstart && rc == -EACCES)
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index eae2a14..912995e 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -112,6 +112,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
 	struct cifsTconInfo *pTcon;
 	struct super_block *sb;
 	char *full_path;
+	struct cifs_ntsd *pacl;
 
 	if (direntry == NULL)
 		return -EIO;
@@ -166,6 +167,25 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
 		rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value,
 			(__u16)value_size, cifs_sb->local_nls,
 			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+	} else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL,
+			strlen(CIFS_XATTR_CIFS_ACL)) == 0) {
+		pacl = kmalloc(value_size, GFP_KERNEL);
+		if (!pacl) {
+			cFYI(1, "%s: Can't allocate memory for ACL",
+					__func__);
+			rc = -ENOMEM;
+		} else {
+#ifdef CONFIG_CIFS_ACL
+			memcpy(pacl, ea_value, value_size);
+			rc = set_cifs_acl(pacl, value_size,
+				direntry->d_inode, full_path);
+			if (rc == 0) /* force revalidate of the inode */
+				CIFS_I(direntry->d_inode)->time = 0;
+			kfree(pacl);
+#else
+			cFYI(1, "Set CIFS ACL not supported yet");
+#endif /* CONFIG_CIFS_ACL */
+		}
 	} else {
 		int temp;
 		temp = strncmp(ea_name, POSIX_ACL_XATTR_ACCESS,
diff --git a/fs/compat.c b/fs/compat.c
index 72fe6cd..0ea00832 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1306,241 +1306,6 @@ compat_sys_openat(unsigned int dfd, const char __user *filename, int flags, int
 	return do_sys_open(dfd, filename, flags, mode);
 }
 
-/*
- * compat_count() counts the number of arguments/envelopes. It is basically
- * a copy of count() from fs/exec.c, except that it works with 32 bit argv
- * and envp pointers.
- */
-static int compat_count(compat_uptr_t __user *argv, int max)
-{
-	int i = 0;
-
-	if (argv != NULL) {
-		for (;;) {
-			compat_uptr_t p;
-
-			if (get_user(p, argv))
-				return -EFAULT;
-			if (!p)
-				break;
-			argv++;
-			if (i++ >= max)
-				return -E2BIG;
-
-			if (fatal_signal_pending(current))
-				return -ERESTARTNOHAND;
-			cond_resched();
-		}
-	}
-	return i;
-}
-
-/*
- * compat_copy_strings() is basically a copy of copy_strings() from fs/exec.c
- * except that it works with 32 bit argv and envp pointers.
- */
-static int compat_copy_strings(int argc, compat_uptr_t __user *argv,
-				struct linux_binprm *bprm)
-{
-	struct page *kmapped_page = NULL;
-	char *kaddr = NULL;
-	unsigned long kpos = 0;
-	int ret;
-
-	while (argc-- > 0) {
-		compat_uptr_t str;
-		int len;
-		unsigned long pos;
-
-		if (get_user(str, argv+argc) ||
-		    !(len = strnlen_user(compat_ptr(str), MAX_ARG_STRLEN))) {
-			ret = -EFAULT;
-			goto out;
-		}
-
-		if (len > MAX_ARG_STRLEN) {
-			ret = -E2BIG;
-			goto out;
-		}
-
-		/* We're going to work our way backwords. */
-		pos = bprm->p;
-		str += len;
-		bprm->p -= len;
-
-		while (len > 0) {
-			int offset, bytes_to_copy;
-
-			if (fatal_signal_pending(current)) {
-				ret = -ERESTARTNOHAND;
-				goto out;
-			}
-			cond_resched();
-
-			offset = pos % PAGE_SIZE;
-			if (offset == 0)
-				offset = PAGE_SIZE;
-
-			bytes_to_copy = offset;
-			if (bytes_to_copy > len)
-				bytes_to_copy = len;
-
-			offset -= bytes_to_copy;
-			pos -= bytes_to_copy;
-			str -= bytes_to_copy;
-			len -= bytes_to_copy;
-
-			if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
-				struct page *page;
-
-				page = get_arg_page(bprm, pos, 1);
-				if (!page) {
-					ret = -E2BIG;
-					goto out;
-				}
-
-				if (kmapped_page) {
-					flush_kernel_dcache_page(kmapped_page);
-					kunmap(kmapped_page);
-					put_page(kmapped_page);
-				}
-				kmapped_page = page;
-				kaddr = kmap(kmapped_page);
-				kpos = pos & PAGE_MASK;
-				flush_cache_page(bprm->vma, kpos,
-						 page_to_pfn(kmapped_page));
-			}
-			if (copy_from_user(kaddr+offset, compat_ptr(str),
-						bytes_to_copy)) {
-				ret = -EFAULT;
-				goto out;
-			}
-		}
-	}
-	ret = 0;
-out:
-	if (kmapped_page) {
-		flush_kernel_dcache_page(kmapped_page);
-		kunmap(kmapped_page);
-		put_page(kmapped_page);
-	}
-	return ret;
-}
-
-/*
- * compat_do_execve() is mostly a copy of do_execve(), with the exception
- * that it processes 32 bit argv and envp pointers.
- */
-int compat_do_execve(char * filename,
-	compat_uptr_t __user *argv,
-	compat_uptr_t __user *envp,
-	struct pt_regs * regs)
-{
-	struct linux_binprm *bprm;
-	struct file *file;
-	struct files_struct *displaced;
-	bool clear_in_exec;
-	int retval;
-
-	retval = unshare_files(&displaced);
-	if (retval)
-		goto out_ret;
-
-	retval = -ENOMEM;
-	bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
-	if (!bprm)
-		goto out_files;
-
-	retval = prepare_bprm_creds(bprm);
-	if (retval)
-		goto out_free;
-
-	retval = check_unsafe_exec(bprm);
-	if (retval < 0)
-		goto out_free;
-	clear_in_exec = retval;
-	current->in_execve = 1;
-
-	file = open_exec(filename);
-	retval = PTR_ERR(file);
-	if (IS_ERR(file))
-		goto out_unmark;
-
-	sched_exec();
-
-	bprm->file = file;
-	bprm->filename = filename;
-	bprm->interp = filename;
-
-	retval = bprm_mm_init(bprm);
-	if (retval)
-		goto out_file;
-
-	bprm->argc = compat_count(argv, MAX_ARG_STRINGS);
-	if ((retval = bprm->argc) < 0)
-		goto out;
-
-	bprm->envc = compat_count(envp, MAX_ARG_STRINGS);
-	if ((retval = bprm->envc) < 0)
-		goto out;
-
-	retval = prepare_binprm(bprm);
-	if (retval < 0)
-		goto out;
-
-	retval = copy_strings_kernel(1, &bprm->filename, bprm);
-	if (retval < 0)
-		goto out;
-
-	bprm->exec = bprm->p;
-	retval = compat_copy_strings(bprm->envc, envp, bprm);
-	if (retval < 0)
-		goto out;
-
-	retval = compat_copy_strings(bprm->argc, argv, bprm);
-	if (retval < 0)
-		goto out;
-
-	retval = search_binary_handler(bprm, regs);
-	if (retval < 0)
-		goto out;
-
-	/* execve succeeded */
-	current->fs->in_exec = 0;
-	current->in_execve = 0;
-	acct_update_integrals(current);
-	free_bprm(bprm);
-	if (displaced)
-		put_files_struct(displaced);
-	return retval;
-
-out:
-	if (bprm->mm) {
-		acct_arg_size(bprm, 0);
-		mmput(bprm->mm);
-	}
-
-out_file:
-	if (bprm->file) {
-		allow_write_access(bprm->file);
-		fput(bprm->file);
-	}
-
-out_unmark:
-	if (clear_in_exec)
-		current->fs->in_exec = 0;
-	current->in_execve = 0;
-
-out_free:
-	free_bprm(bprm);
-
-out_files:
-	if (displaced)
-		reset_files_struct(displaced);
-out_ret:
-	return retval;
-}
-
 #define __COMPAT_NFDBITS       (8 * sizeof(compat_ulong_t))
 
 static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,
diff --git a/fs/dcache.c b/fs/dcache.c
index 22a0ef4..37f72ee 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -35,6 +35,7 @@
 #include <linux/hardirq.h>
 #include <linux/bit_spinlock.h>
 #include <linux/rculist_bl.h>
+#include <linux/prefetch.h>
 #include "internal.h"
 
 /*
@@ -1219,7 +1220,7 @@ void shrink_dcache_parent(struct dentry * parent)
 EXPORT_SYMBOL(shrink_dcache_parent);
 
 /*
- * Scan `nr' dentries and return the number which remain.
+ * Scan `sc->nr_slab_to_reclaim' dentries and return the number which remain.
  *
  * We need to avoid reentering the filesystem if the caller is performing a
  * GFP_NOFS allocation attempt.  One example deadlock is:
@@ -1230,8 +1231,12 @@ EXPORT_SYMBOL(shrink_dcache_parent);
  *
  * In this case we return -1 to tell the caller that we baled.
  */
-static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
+static int shrink_dcache_memory(struct shrinker *shrink,
+				struct shrink_control *sc)
 {
+	int nr = sc->nr_to_scan;
+	gfp_t gfp_mask = sc->gfp_mask;
+
 	if (nr) {
 		if (!(gfp_mask & __GFP_FS))
 			return -1;
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 89d394d..90f7657 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -428,26 +428,17 @@ static ssize_t write_file_bool(struct file *file, const char __user *user_buf,
 			       size_t count, loff_t *ppos)
 {
 	char buf[32];
-	int buf_size;
+	size_t buf_size;
+	bool bv;
 	u32 *val = file->private_data;
 
 	buf_size = min(count, (sizeof(buf)-1));
 	if (copy_from_user(buf, user_buf, buf_size))
 		return -EFAULT;
 
-	switch (buf[0]) {
-	case 'y':
-	case 'Y':
-	case '1':
-		*val = 1;
-		break;
-	case 'n':
-	case 'N':
-	case '0':
-		*val = 0;
-		break;
-	}
-	
+	if (strtobool(buf, &bv) == 0)
+		*val = bv;
+
 	return count;
 }
 
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 0d329ff..9b026ea 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -100,6 +100,7 @@ struct dlm_cluster {
 	unsigned int cl_log_debug;
 	unsigned int cl_protocol;
 	unsigned int cl_timewarn_cs;
+	unsigned int cl_waitwarn_us;
 };
 
 enum {
@@ -114,6 +115,7 @@ enum {
 	CLUSTER_ATTR_LOG_DEBUG,
 	CLUSTER_ATTR_PROTOCOL,
 	CLUSTER_ATTR_TIMEWARN_CS,
+	CLUSTER_ATTR_WAITWARN_US,
 };
 
 struct cluster_attribute {
@@ -166,6 +168,7 @@ CLUSTER_ATTR(scan_secs, 1);
 CLUSTER_ATTR(log_debug, 0);
 CLUSTER_ATTR(protocol, 0);
 CLUSTER_ATTR(timewarn_cs, 1);
+CLUSTER_ATTR(waitwarn_us, 0);
 
 static struct configfs_attribute *cluster_attrs[] = {
 	[CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr,
@@ -179,6 +182,7 @@ static struct configfs_attribute *cluster_attrs[] = {
 	[CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug.attr,
 	[CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol.attr,
 	[CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs.attr,
+	[CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us.attr,
 	NULL,
 };
 
@@ -439,6 +443,7 @@ static struct config_group *make_cluster(struct config_group *g,
 	cl->cl_log_debug = dlm_config.ci_log_debug;
 	cl->cl_protocol = dlm_config.ci_protocol;
 	cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs;
+	cl->cl_waitwarn_us = dlm_config.ci_waitwarn_us;
 
 	space_list = &sps->ss_group;
 	comm_list = &cms->cs_group;
@@ -986,6 +991,7 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num)
 #define DEFAULT_LOG_DEBUG          0
 #define DEFAULT_PROTOCOL           0
 #define DEFAULT_TIMEWARN_CS      500 /* 5 sec = 500 centiseconds */
+#define DEFAULT_WAITWARN_US	   0
 
 struct dlm_config_info dlm_config = {
 	.ci_tcp_port = DEFAULT_TCP_PORT,
@@ -998,6 +1004,7 @@ struct dlm_config_info dlm_config = {
 	.ci_scan_secs = DEFAULT_SCAN_SECS,
 	.ci_log_debug = DEFAULT_LOG_DEBUG,
 	.ci_protocol = DEFAULT_PROTOCOL,
-	.ci_timewarn_cs = DEFAULT_TIMEWARN_CS
+	.ci_timewarn_cs = DEFAULT_TIMEWARN_CS,
+	.ci_waitwarn_us = DEFAULT_WAITWARN_US
 };
 
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
index 4f1d6fc..dd0ce24 100644
--- a/fs/dlm/config.h
+++ b/fs/dlm/config.h
@@ -28,6 +28,7 @@ struct dlm_config_info {
 	int ci_log_debug;
 	int ci_protocol;
 	int ci_timewarn_cs;
+	int ci_waitwarn_us;
 };
 
 extern struct dlm_config_info dlm_config;
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index b942049..0262451 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -209,6 +209,7 @@ struct dlm_args {
 #define DLM_IFL_WATCH_TIMEWARN	0x00400000
 #define DLM_IFL_TIMEOUT_CANCEL	0x00800000
 #define DLM_IFL_DEADLOCK_CANCEL	0x01000000
+#define DLM_IFL_STUB_MS		0x02000000 /* magic number for m_flags */
 #define DLM_IFL_USER		0x00000001
 #define DLM_IFL_ORPHAN		0x00000002
 
@@ -245,6 +246,7 @@ struct dlm_lkb {
 
 	int8_t			lkb_wait_type;	/* type of reply waiting for */
 	int8_t			lkb_wait_count;
+	int			lkb_wait_nodeid; /* for debugging */
 
 	struct list_head	lkb_idtbl_list;	/* lockspace lkbtbl */
 	struct list_head	lkb_statequeue;	/* rsb g/c/w list */
@@ -254,6 +256,7 @@ struct dlm_lkb {
 	struct list_head	lkb_ownqueue;	/* list of locks for a process */
 	struct list_head	lkb_time_list;
 	ktime_t			lkb_timestamp;
+	ktime_t			lkb_wait_time;
 	unsigned long		lkb_timeout_cs;
 
 	struct dlm_callback	lkb_callbacks[DLM_CALLBACKS_SIZE];
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 56d6bfc..f71d0b5 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -799,10 +799,84 @@ static int msg_reply_type(int mstype)
 	return -1;
 }
 
+static int nodeid_warned(int nodeid, int num_nodes, int *warned)
+{
+	int i;
+
+	for (i = 0; i < num_nodes; i++) {
+		if (!warned[i]) {
+			warned[i] = nodeid;
+			return 0;
+		}
+		if (warned[i] == nodeid)
+			return 1;
+	}
+	return 0;
+}
+
+void dlm_scan_waiters(struct dlm_ls *ls)
+{
+	struct dlm_lkb *lkb;
+	ktime_t zero = ktime_set(0, 0);
+	s64 us;
+	s64 debug_maxus = 0;
+	u32 debug_scanned = 0;
+	u32 debug_expired = 0;
+	int num_nodes = 0;
+	int *warned = NULL;
+
+	if (!dlm_config.ci_waitwarn_us)
+		return;
+
+	mutex_lock(&ls->ls_waiters_mutex);
+
+	list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
+		if (ktime_equal(lkb->lkb_wait_time, zero))
+			continue;
+
+		debug_scanned++;
+
+		us = ktime_to_us(ktime_sub(ktime_get(), lkb->lkb_wait_time));
+
+		if (us < dlm_config.ci_waitwarn_us)
+			continue;
+
+		lkb->lkb_wait_time = zero;
+
+		debug_expired++;
+		if (us > debug_maxus)
+			debug_maxus = us;
+
+		if (!num_nodes) {
+			num_nodes = ls->ls_num_nodes;
+			warned = kmalloc(GFP_KERNEL, num_nodes * sizeof(int));
+			if (warned)
+				memset(warned, 0, num_nodes * sizeof(int));
+		}
+		if (!warned)
+			continue;
+		if (nodeid_warned(lkb->lkb_wait_nodeid, num_nodes, warned))
+			continue;
+
+		log_error(ls, "waitwarn %x %lld %d us check connection to "
+			  "node %d", lkb->lkb_id, (long long)us,
+			  dlm_config.ci_waitwarn_us, lkb->lkb_wait_nodeid);
+	}
+	mutex_unlock(&ls->ls_waiters_mutex);
+
+	if (warned)
+		kfree(warned);
+
+	if (debug_expired)
+		log_debug(ls, "scan_waiters %u warn %u over %d us max %lld us",
+			  debug_scanned, debug_expired,
+			  dlm_config.ci_waitwarn_us, (long long)debug_maxus);
+}
+
 /* add/remove lkb from global waiters list of lkb's waiting for
    a reply from a remote node */
 
-static int add_to_waiters(struct dlm_lkb *lkb, int mstype)
+static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid)
 {
 	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
 	int error = 0;
@@ -842,6 +916,8 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype)
 
 	lkb->lkb_wait_count++;
 	lkb->lkb_wait_type = mstype;
+	lkb->lkb_wait_time = ktime_get();
+	lkb->lkb_wait_nodeid = to_nodeid; /* for debugging */
 	hold_lkb(lkb);
 	list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
  out:
@@ -961,10 +1037,10 @@ static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms)
 	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
 	int error;
 
-	if (ms != &ls->ls_stub_ms)
+	if (ms->m_flags != DLM_IFL_STUB_MS)
 		mutex_lock(&ls->ls_waiters_mutex);
 	error = _remove_from_waiters(lkb, ms->m_type, ms);
-	if (ms != &ls->ls_stub_ms)
+	if (ms->m_flags != DLM_IFL_STUB_MS)
 		mutex_unlock(&ls->ls_waiters_mutex);
 	return error;
 }
@@ -1157,6 +1233,16 @@ void dlm_adjust_timeouts(struct dlm_ls *ls)
 	list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list)
 		lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us);
 	mutex_unlock(&ls->ls_timeout_mutex);
+
+	if (!dlm_config.ci_waitwarn_us)
+		return;
+
+	mutex_lock(&ls->ls_waiters_mutex);
+	list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
+		if (ktime_to_us(lkb->lkb_wait_time))
+			lkb->lkb_wait_time = ktime_get();
+	}
+	mutex_unlock(&ls->ls_waiters_mutex);
 }
 
 /* lkb is master or local copy */
@@ -1376,14 +1462,8 @@ static void grant_lock_pending(struct dlm_rsb *r, struct dlm_lkb *lkb)
    ALTPR/ALTCW: our rqmode may have been changed to PR or CW to become
    compatible with other granted locks */
 
-static void munge_demoted(struct dlm_lkb *lkb, struct dlm_message *ms)
+static void munge_demoted(struct dlm_lkb *lkb)
 {
-	if (ms->m_type != DLM_MSG_CONVERT_REPLY) {
-		log_print("munge_demoted %x invalid reply type %d",
-			  lkb->lkb_id, ms->m_type);
-		return;
-	}
-
 	if (lkb->lkb_rqmode == DLM_LOCK_IV || lkb->lkb_grmode == DLM_LOCK_IV) {
 		log_print("munge_demoted %x invalid modes gr %d rq %d",
 			  lkb->lkb_id, lkb->lkb_grmode, lkb->lkb_rqmode);
@@ -2844,12 +2924,12 @@ static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
 	struct dlm_mhandle *mh;
 	int to_nodeid, error;
 
-	error = add_to_waiters(lkb, mstype);
+	to_nodeid = r->res_nodeid;
+
+	error = add_to_waiters(lkb, mstype, to_nodeid);
 	if (error)
 		return error;
 
-	to_nodeid = r->res_nodeid;
-
 	error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
 	if (error)
 		goto fail;
@@ -2880,9 +2960,9 @@ static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
 	/* down conversions go without a reply from the master */
 	if (!error && down_conversion(lkb)) {
 		remove_from_waiters(lkb, DLM_MSG_CONVERT_REPLY);
+		r->res_ls->ls_stub_ms.m_flags = DLM_IFL_STUB_MS;
 		r->res_ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY;
 		r->res_ls->ls_stub_ms.m_result = 0;
-		r->res_ls->ls_stub_ms.m_flags = lkb->lkb_flags;
 		__receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms);
 	}
 
@@ -2951,12 +3031,12 @@ static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb)
 	struct dlm_mhandle *mh;
 	int to_nodeid, error;
 
-	error = add_to_waiters(lkb, DLM_MSG_LOOKUP);
+	to_nodeid = dlm_dir_nodeid(r);
+
+	error = add_to_waiters(lkb, DLM_MSG_LOOKUP, to_nodeid);
 	if (error)
 		return error;
 
-	to_nodeid = dlm_dir_nodeid(r);
-
 	error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh);
 	if (error)
 		goto fail;
@@ -3070,6 +3150,9 @@ static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms)
 
 static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
 {
+	if (ms->m_flags == DLM_IFL_STUB_MS)
+		return;
+
 	lkb->lkb_sbflags = ms->m_sbflags;
 	lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
 		         (ms->m_flags & 0x0000FFFF);
@@ -3612,7 +3695,7 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
 		/* convert was queued on remote master */
 		receive_flags_reply(lkb, ms);
 		if (is_demoted(lkb))
-			munge_demoted(lkb, ms);
+			munge_demoted(lkb);
 		del_lkb(r, lkb);
 		add_lkb(r, lkb, DLM_LKSTS_CONVERT);
 		add_timeout(lkb);
@@ -3622,7 +3705,7 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
 		/* convert was granted on remote master */
 		receive_flags_reply(lkb, ms);
 		if (is_demoted(lkb))
-			munge_demoted(lkb, ms);
+			munge_demoted(lkb);
 		grant_lock_pc(r, lkb, ms);
 		queue_cast(r, lkb, 0);
 		break;
@@ -3996,15 +4079,17 @@ void dlm_receive_buffer(union dlm_packet *p, int nodeid)
 	dlm_put_lockspace(ls);
 }
 
-static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb)
+static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb,
+				   struct dlm_message *ms_stub)
 {
 	if (middle_conversion(lkb)) {
 		hold_lkb(lkb);
-		ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY;
-		ls->ls_stub_ms.m_result = -EINPROGRESS;
-		ls->ls_stub_ms.m_flags = lkb->lkb_flags;
-		ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid;
-		_receive_convert_reply(lkb, &ls->ls_stub_ms);
+		memset(ms_stub, 0, sizeof(struct dlm_message));
+		ms_stub->m_flags = DLM_IFL_STUB_MS;
+		ms_stub->m_type = DLM_MSG_CONVERT_REPLY;
+		ms_stub->m_result = -EINPROGRESS;
+		ms_stub->m_header.h_nodeid = lkb->lkb_nodeid;
+		_receive_convert_reply(lkb, ms_stub);
 
 		/* Same special case as in receive_rcom_lock_args() */
 		lkb->lkb_grmode = DLM_LOCK_IV;
@@ -4045,13 +4130,27 @@ static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb)
 void dlm_recover_waiters_pre(struct dlm_ls *ls)
 {
 	struct dlm_lkb *lkb, *safe;
+	struct dlm_message *ms_stub;
 	int wait_type, stub_unlock_result, stub_cancel_result;
 
+	ms_stub = kmalloc(GFP_KERNEL, sizeof(struct dlm_message));
+	if (!ms_stub) {
+		log_error(ls, "dlm_recover_waiters_pre no mem");
+		return;
+	}
+
 	mutex_lock(&ls->ls_waiters_mutex);
 
 	list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) {
-		log_debug(ls, "pre recover waiter lkid %x type %d flags %x",
-			  lkb->lkb_id, lkb->lkb_wait_type, lkb->lkb_flags);
+
+		/* exclude debug messages about unlocks because there can be so
+		   many and they aren't very interesting */
+
+		if (lkb->lkb_wait_type != DLM_MSG_UNLOCK) {
+			log_debug(ls, "recover_waiter %x nodeid %d "
+				  "msg %d to %d", lkb->lkb_id, lkb->lkb_nodeid,
+				  lkb->lkb_wait_type, lkb->lkb_wait_nodeid);
+		}
 
 		/* all outstanding lookups, regardless of destination  will be
 		   resent after recovery is done */
@@ -4097,26 +4196,28 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
 			break;
 
 		case DLM_MSG_CONVERT:
-			recover_convert_waiter(ls, lkb);
+			recover_convert_waiter(ls, lkb, ms_stub);
 			break;
 
 		case DLM_MSG_UNLOCK:
 			hold_lkb(lkb);
-			ls->ls_stub_ms.m_type = DLM_MSG_UNLOCK_REPLY;
-			ls->ls_stub_ms.m_result = stub_unlock_result;
-			ls->ls_stub_ms.m_flags = lkb->lkb_flags;
-			ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid;
-			_receive_unlock_reply(lkb, &ls->ls_stub_ms);
+			memset(ms_stub, 0, sizeof(struct dlm_message));
+			ms_stub->m_flags = DLM_IFL_STUB_MS;
+			ms_stub->m_type = DLM_MSG_UNLOCK_REPLY;
+			ms_stub->m_result = stub_unlock_result;
+			ms_stub->m_header.h_nodeid = lkb->lkb_nodeid;
+			_receive_unlock_reply(lkb, ms_stub);
 			dlm_put_lkb(lkb);
 			break;
 
 		case DLM_MSG_CANCEL:
 			hold_lkb(lkb);
-			ls->ls_stub_ms.m_type = DLM_MSG_CANCEL_REPLY;
-			ls->ls_stub_ms.m_result = stub_cancel_result;
-			ls->ls_stub_ms.m_flags = lkb->lkb_flags;
-			ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid;
-			_receive_cancel_reply(lkb, &ls->ls_stub_ms);
+			memset(ms_stub, 0, sizeof(struct dlm_message));
+			ms_stub->m_flags = DLM_IFL_STUB_MS;
+			ms_stub->m_type = DLM_MSG_CANCEL_REPLY;
+			ms_stub->m_result = stub_cancel_result;
+			ms_stub->m_header.h_nodeid = lkb->lkb_nodeid;
+			_receive_cancel_reply(lkb, ms_stub);
 			dlm_put_lkb(lkb);
 			break;
 
@@ -4127,6 +4228,7 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
 		schedule();
 	}
 	mutex_unlock(&ls->ls_waiters_mutex);
+	kfree(ms_stub);
 }
 
 static struct dlm_lkb *find_resend_waiter(struct dlm_ls *ls)
@@ -4191,8 +4293,8 @@ int dlm_recover_waiters_post(struct dlm_ls *ls)
 		ou = is_overlap_unlock(lkb);
 		err = 0;
 
-		log_debug(ls, "recover_waiters_post %x type %d flags %x %s",
-			  lkb->lkb_id, mstype, lkb->lkb_flags, r->res_name);
+		log_debug(ls, "recover_waiter %x nodeid %d msg %d r_nodeid %d",
+			  lkb->lkb_id, lkb->lkb_nodeid, mstype, r->res_nodeid);
 
 		/* At this point we assume that we won't get a reply to any
 		   previous op or overlap op on this lock.  First, do a big
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
index 88e93c8..265017a 100644
--- a/fs/dlm/lock.h
+++ b/fs/dlm/lock.h
@@ -24,6 +24,7 @@ int dlm_put_lkb(struct dlm_lkb *lkb);
 void dlm_scan_rsbs(struct dlm_ls *ls);
 int dlm_lock_recovery_try(struct dlm_ls *ls);
 void dlm_unlock_recovery(struct dlm_ls *ls);
+void dlm_scan_waiters(struct dlm_ls *ls);
 void dlm_scan_timeout(struct dlm_ls *ls);
 void dlm_adjust_timeouts(struct dlm_ls *ls);
 
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index f994a7d..14cbf40 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -243,7 +243,6 @@ static struct dlm_ls *find_ls_to_scan(void)
 static int dlm_scand(void *data)
 {
 	struct dlm_ls *ls;
-	int timeout_jiffies = dlm_config.ci_scan_secs * HZ;
 
 	while (!kthread_should_stop()) {
 		ls = find_ls_to_scan();
@@ -252,13 +251,14 @@ static int dlm_scand(void *data)
 				ls->ls_scan_time = jiffies;
 				dlm_scan_rsbs(ls);
 				dlm_scan_timeout(ls);
+				dlm_scan_waiters(ls);
 				dlm_unlock_recovery(ls);
 			} else {
 				ls->ls_scan_time += HZ;
 			}
-		} else {
-			schedule_timeout_interruptible(timeout_jiffies);
+			continue;
 		}
+		schedule_timeout_interruptible(dlm_config.ci_scan_secs * HZ);
 	}
 	return 0;
 }
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index 30d8b85..e2b8780 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -71,6 +71,36 @@ static void send_op(struct plock_op *op)
 	wake_up(&send_wq);
 }
 
+/* If a process was killed while waiting for the only plock on a file,
+   locks_remove_posix will not see any lock on the file so it won't
+   send an unlock-close to us to pass on to userspace to clean up the
+   abandoned waiter.  So, we have to insert the unlock-close when the
+   lock call is interrupted. */
+
+static void do_unlock_close(struct dlm_ls *ls, u64 number,
+			    struct file *file, struct file_lock *fl)
+{
+	struct plock_op *op;
+
+	op = kzalloc(sizeof(*op), GFP_NOFS);
+	if (!op)
+		return;
+
+	op->info.optype		= DLM_PLOCK_OP_UNLOCK;
+	op->info.pid		= fl->fl_pid;
+	op->info.fsid		= ls->ls_global_id;
+	op->info.number		= number;
+	op->info.start		= 0;
+	op->info.end		= OFFSET_MAX;
+	if (fl->fl_lmops && fl->fl_lmops->fl_grant)
+		op->info.owner	= (__u64) fl->fl_pid;
+	else
+		op->info.owner	= (__u64)(long) fl->fl_owner;
+
+	op->info.flags |= DLM_PLOCK_FL_CLOSE;
+	send_op(op);
+}
+
 int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
 		   int cmd, struct file_lock *fl)
 {
@@ -114,9 +144,19 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
 
 	send_op(op);
 
-	if (xop->callback == NULL)
-		wait_event(recv_wq, (op->done != 0));
-	else {
+	if (xop->callback == NULL) {
+		rv = wait_event_killable(recv_wq, (op->done != 0));
+		if (rv == -ERESTARTSYS) {
+			log_debug(ls, "dlm_posix_lock: wait killed %llx",
+				  (unsigned long long)number);
+			spin_lock(&ops_lock);
+			list_del(&op->list);
+			spin_unlock(&ops_lock);
+			kfree(xop);
+			do_unlock_close(ls, number, file, fl);
+			goto out;
+		}
+	} else {
 		rv = FILE_LOCK_DEFERRED;
 		goto out;
 	}
@@ -233,6 +273,13 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
 	else
 		op->info.owner	= (__u64)(long) fl->fl_owner;
 
+	if (fl->fl_flags & FL_CLOSE) {
+		op->info.flags |= DLM_PLOCK_FL_CLOSE;
+		send_op(op);
+		rv = 0;
+		goto out;
+	}
+
 	send_op(op);
 	wait_event(recv_wq, (op->done != 0));
 
@@ -334,7 +381,10 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count,
 	spin_lock(&ops_lock);
 	if (!list_empty(&send_list)) {
 		op = list_entry(send_list.next, struct plock_op, list);
-		list_move(&op->list, &recv_list);
+		if (op->info.flags & DLM_PLOCK_FL_CLOSE)
+			list_del(&op->list);
+		else
+			list_move(&op->list, &recv_list);
 		memcpy(&info, &op->info, sizeof(info));
 	}
 	spin_unlock(&ops_lock);
@@ -342,6 +392,13 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count,
 	if (!op)
 		return -EAGAIN;
 
+	/* there is no need to get a reply from userspace for unlocks
+	   that were generated by the vfs cleaning up for a close
+	   (the process did not make an unlock call). */
+
+	if (op->info.flags & DLM_PLOCK_FL_CLOSE)
+		kfree(op);
+
 	if (copy_to_user(u, &info, sizeof(info)))
 		return -EFAULT;
 	return sizeof(info);
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index d5ab3fe..e96bf3e 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -611,7 +611,6 @@ static ssize_t device_write(struct file *file, const char __user *buf,
 
  out_sig:
 	sigprocmask(SIG_SETMASK, &tmpsig, NULL);
-	recalc_sigpending();
  out_free:
 	kfree(kbuf);
 	return error;
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 98b77c8..c00e055 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -40,9 +40,12 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
 static void drop_slab(void)
 {
 	int nr_objects;
+	struct shrink_control shrink = {
+		.gfp_mask = GFP_KERNEL,
+	};
 
 	do {
-		nr_objects = shrink_slab(1000, GFP_KERNEL, 1000);
+		nr_objects = shrink_slab(&shrink, 1000, 1000);
 	} while (nr_objects > 10);
 }
 
diff --git a/fs/exec.c b/fs/exec.c
index 5e62d26..936f577 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -55,6 +55,7 @@
 #include <linux/fs_struct.h>
 #include <linux/pipe_fs_i.h>
 #include <linux/oom.h>
+#include <linux/compat.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -166,8 +167,13 @@ out:
 }
 
 #ifdef CONFIG_MMU
-
-void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
+/*
+ * The nascent bprm->mm is not visible until exec_mmap() but it can
+ * use a lot of memory, account these pages in current->mm temporary
+ * for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we
+ * change the counter back via acct_arg_size(0).
+ */
+static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
 {
 	struct mm_struct *mm = current->mm;
 	long diff = (long)(pages - bprm->vma_pages);
@@ -186,7 +192,7 @@ void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
 #endif
 }
 
-struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
+static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 		int write)
 {
 	struct page *page;
@@ -194,7 +200,7 @@ struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 
 #ifdef CONFIG_STACK_GROWSUP
 	if (write) {
-		ret = expand_stack_downwards(bprm->vma, pos);
+		ret = expand_downwards(bprm->vma, pos);
 		if (ret < 0)
 			return NULL;
 	}
@@ -305,11 +311,11 @@ static bool valid_arg_len(struct linux_binprm *bprm, long len)
 
 #else
 
-void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
+static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
 {
 }
 
-struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
+static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 		int write)
 {
 	struct page *page;
@@ -398,22 +404,56 @@ err:
 	return err;
 }
 
+struct user_arg_ptr {
+#ifdef CONFIG_COMPAT
+	bool is_compat;
+#endif
+	union {
+		const char __user *const __user *native;
+#ifdef CONFIG_COMPAT
+		compat_uptr_t __user *compat;
+#endif
+	} ptr;
+};
+
+static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr)
+{
+	const char __user *native;
+
+#ifdef CONFIG_COMPAT
+	if (unlikely(argv.is_compat)) {
+		compat_uptr_t compat;
+
+		if (get_user(compat, argv.ptr.compat + nr))
+			return ERR_PTR(-EFAULT);
+
+		return compat_ptr(compat);
+	}
+#endif
+
+	if (get_user(native, argv.ptr.native + nr))
+		return ERR_PTR(-EFAULT);
+
+	return native;
+}
+
 /*
  * count() counts the number of strings in array ARGV.
  */
-static int count(const char __user * const __user * argv, int max)
+static int count(struct user_arg_ptr argv, int max)
 {
 	int i = 0;
 
-	if (argv != NULL) {
+	if (argv.ptr.native != NULL) {
 		for (;;) {
-			const char __user * p;
+			const char __user *p = get_user_arg_ptr(argv, i);
 
-			if (get_user(p, argv))
-				return -EFAULT;
 			if (!p)
 				break;
-			argv++;
+
+			if (IS_ERR(p))
+				return -EFAULT;
+
 			if (i++ >= max)
 				return -E2BIG;
 
@@ -430,7 +470,7 @@ static int count(const char __user * const __user * argv, int max)
  * processes's memory to the new process's stack.  The call to get_user_pages()
  * ensures the destination page is created and not swapped out.
  */
-static int copy_strings(int argc, const char __user *const __user *argv,
+static int copy_strings(int argc, struct user_arg_ptr argv,
 			struct linux_binprm *bprm)
 {
 	struct page *kmapped_page = NULL;
@@ -443,16 +483,18 @@ static int copy_strings(int argc, const char __user *const __user *argv,
 		int len;
 		unsigned long pos;
 
-		if (get_user(str, argv+argc) ||
-				!(len = strnlen_user(str, MAX_ARG_STRLEN))) {
-			ret = -EFAULT;
+		ret = -EFAULT;
+		str = get_user_arg_ptr(argv, argc);
+		if (IS_ERR(str))
 			goto out;
-		}
 
-		if (!valid_arg_len(bprm, len)) {
-			ret = -E2BIG;
+		len = strnlen_user(str, MAX_ARG_STRLEN);
+		if (!len)
+			goto out;
+
+		ret = -E2BIG;
+		if (!valid_arg_len(bprm, len))
 			goto out;
-		}
 
 		/* We're going to work our way backwords. */
 		pos = bprm->p;
@@ -519,14 +561,19 @@ out:
 /*
  * Like copy_strings, but get argv and its values from kernel memory.
  */
-int copy_strings_kernel(int argc, const char *const *argv,
+int copy_strings_kernel(int argc, const char *const *__argv,
 			struct linux_binprm *bprm)
 {
 	int r;
 	mm_segment_t oldfs = get_fs();
+	struct user_arg_ptr argv = {
+		.ptr.native = (const char __user *const  __user *)__argv,
+	};
+
 	set_fs(KERNEL_DS);
-	r = copy_strings(argc, (const char __user *const  __user *)argv, bprm);
+	r = copy_strings(argc, argv, bprm);
 	set_fs(oldfs);
+
 	return r;
 }
 EXPORT_SYMBOL(copy_strings_kernel);
@@ -553,7 +600,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
 	unsigned long length = old_end - old_start;
 	unsigned long new_start = old_start - shift;
 	unsigned long new_end = old_end - shift;
-	struct mmu_gather *tlb;
+	struct mmu_gather tlb;
 
 	BUG_ON(new_start > new_end);
 
@@ -579,12 +626,12 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
 		return -ENOMEM;
 
 	lru_add_drain();
-	tlb = tlb_gather_mmu(mm, 0);
+	tlb_gather_mmu(&tlb, mm, 0);
 	if (new_end > old_start) {
 		/*
 		 * when the old and new regions overlap clear from new_end.
 		 */
-		free_pgd_range(tlb, new_end, old_end, new_end,
+		free_pgd_range(&tlb, new_end, old_end, new_end,
 			vma->vm_next ? vma->vm_next->vm_start : 0);
 	} else {
 		/*
@@ -593,10 +640,10 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
 		 * have constraints on va-space that make this illegal (IA64) -
 		 * for the others its just a little faster.
 		 */
-		free_pgd_range(tlb, old_start, old_end, new_end,
+		free_pgd_range(&tlb, old_start, old_end, new_end,
 			vma->vm_next ? vma->vm_next->vm_start : 0);
 	}
-	tlb_finish_mmu(tlb, new_end, old_end);
+	tlb_finish_mmu(&tlb, new_end, old_end);
 
 	/*
 	 * Shrink the vma to just the new range.  Always succeeds.
@@ -1004,6 +1051,7 @@ char *get_task_comm(char *buf, struct task_struct *tsk)
 	task_unlock(tsk);
 	return buf;
 }
+EXPORT_SYMBOL_GPL(get_task_comm);
 
 void set_task_comm(struct task_struct *tsk, char *buf)
 {
@@ -1379,10 +1427,10 @@ EXPORT_SYMBOL(search_binary_handler);
 /*
  * sys_execve() executes a new program.
  */
-int do_execve(const char * filename,
-	const char __user *const __user *argv,
-	const char __user *const __user *envp,
-	struct pt_regs * regs)
+static int do_execve_common(const char *filename,
+				struct user_arg_ptr argv,
+				struct user_arg_ptr envp,
+				struct pt_regs *regs)
 {
 	struct linux_binprm *bprm;
 	struct file *file;
@@ -1489,6 +1537,34 @@ out_ret:
 	return retval;
 }
 
+int do_execve(const char *filename,
+	const char __user *const __user *__argv,
+	const char __user *const __user *__envp,
+	struct pt_regs *regs)
+{
+	struct user_arg_ptr argv = { .ptr.native = __argv };
+	struct user_arg_ptr envp = { .ptr.native = __envp };
+	return do_execve_common(filename, argv, envp, regs);
+}
+
+#ifdef CONFIG_COMPAT
+int compat_do_execve(char *filename,
+	compat_uptr_t __user *__argv,
+	compat_uptr_t __user *__envp,
+	struct pt_regs *regs)
+{
+	struct user_arg_ptr argv = {
+		.is_compat = true,
+		.ptr.compat = __argv,
+	};
+	struct user_arg_ptr envp = {
+		.is_compat = true,
+		.ptr.compat = __envp,
+	};
+	return do_execve_common(filename, argv, envp, regs);
+}
+#endif
+
 void set_binfmt(struct linux_binfmt *new)
 {
 	struct mm_struct *mm = current->mm;
@@ -1659,6 +1735,7 @@ static int zap_process(struct task_struct *start, int exit_code)
 
 	t = start;
 	do {
+		task_clear_group_stop_pending(t);
 		if (t != current && t->mm) {
 			sigaddset(&t->pending.signal, SIGKILL);
 			signal_wake_up(t, 1);
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 0a78dae..1dd62ed 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -898,7 +898,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 		brelse(bh);
 
 		if (!sb_set_blocksize(sb, blocksize)) {
-			ext2_msg(sb, KERN_ERR, "error: blocksize is too small");
+			ext2_msg(sb, KERN_ERR,
+				"error: bad blocksize %d", blocksize);
 			goto failed_sbi;
 		}
 
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 32f3b86..34b6d9b 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1416,10 +1416,19 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 	frame->at = entries;
 	frame->bh = bh;
 	bh = bh2;
+	/*
+	 * Mark buffers dirty here so that if do_split() fails we write a
+	 * consistent set of buffers to disk.
+	 */
+	ext3_journal_dirty_metadata(handle, frame->bh);
+	ext3_journal_dirty_metadata(handle, bh);
 	de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
-	dx_release (frames);
-	if (!(de))
+	if (!de) {
+		ext3_mark_inode_dirty(handle, dir);
+		dx_release(frames);
 		return retval;
+	}
+	dx_release(frames);
 
 	return add_dirent_to_buf(handle, dentry, inode, de, bh);
 }
@@ -2189,6 +2198,7 @@ static int ext3_symlink (struct inode * dir,
 	handle_t *handle;
 	struct inode * inode;
 	int l, err, retries = 0;
+	int credits;
 
 	l = strlen(symname)+1;
 	if (l > dir->i_sb->s_blocksize)
@@ -2196,10 +2206,26 @@ static int ext3_symlink (struct inode * dir,
 
 	dquot_initialize(dir);
 
+	if (l > EXT3_N_BLOCKS * 4) {
+		/*
+		 * For non-fast symlinks, we just allocate inode and put it on
+		 * orphan list in the first transaction => we need bitmap,
+		 * group descriptor, sb, inode block, quota blocks.
+		 */
+		credits = 4 + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
+	} else {
+		/*
+		 * Fast symlink. We have to add entry to directory
+		 * (EXT3_DATA_TRANS_BLOCKS + EXT3_INDEX_EXTRA_TRANS_BLOCKS),
+		 * allocate new inode (bitmap, group descriptor, inode block,
+		 * quota blocks, sb is already counted in previous macros).
+		 */
+		credits = EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
+			  EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+			  EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
+	}
 retry:
-	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
-					EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 +
-					EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
+	handle = ext3_journal_start(dir, credits);
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 
@@ -2211,21 +2237,45 @@ retry:
 	if (IS_ERR(inode))
 		goto out_stop;
 
-	if (l > sizeof (EXT3_I(inode)->i_data)) {
+	if (l > EXT3_N_BLOCKS * 4) {
 		inode->i_op = &ext3_symlink_inode_operations;
 		ext3_set_aops(inode);
 		/*
-		 * page_symlink() calls into ext3_prepare/commit_write.
-		 * We have a transaction open.  All is sweetness.  It also sets
-		 * i_size in generic_commit_write().
+		 * We cannot call page_symlink() with transaction started
+		 * because it calls into ext3_write_begin() which acquires page
+		 * lock which ranks below transaction start (and it can also
+		 * wait for journal commit if we are running out of space). So
+		 * we have to stop transaction now and restart it when symlink
+		 * contents is written. 
+		 *
+		 * To keep fs consistent in case of crash, we have to put inode
+		 * to orphan list in the mean time.
 		 */
+		drop_nlink(inode);
+		err = ext3_orphan_add(handle, inode);
+		ext3_journal_stop(handle);
+		if (err)
+			goto err_drop_inode;
 		err = __page_symlink(inode, symname, l, 1);
+		if (err)
+			goto err_drop_inode;
+		/*
+		 * Now inode is being linked into dir (EXT3_DATA_TRANS_BLOCKS
+		 * + EXT3_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified
+		 */
+		handle = ext3_journal_start(dir,
+				EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
+				EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1);
+		if (IS_ERR(handle)) {
+			err = PTR_ERR(handle);
+			goto err_drop_inode;
+		}
+		inc_nlink(inode);
+		err = ext3_orphan_del(handle, inode);
 		if (err) {
+			ext3_journal_stop(handle);
 			drop_nlink(inode);
-			unlock_new_inode(inode);
-			ext3_mark_inode_dirty(handle, inode);
-			iput (inode);
-			goto out_stop;
+			goto err_drop_inode;
 		}
 	} else {
 		inode->i_op = &ext3_fast_symlink_inode_operations;
@@ -2239,6 +2289,10 @@ out_stop:
 	if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
 		goto retry;
 	return err;
+err_drop_inode:
+	unlock_new_inode(inode);
+	iput(inode);
+	return err;
 }
 
 static int ext3_link (struct dentry * old_dentry,
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index ae8200f..1cc7038e 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -151,6 +151,13 @@ static void fat_cache_add(struct inode *inode, struct fat_cache_id *new)
 			spin_unlock(&MSDOS_I(inode)->cache_lru_lock);
 
 			tmp = fat_cache_alloc(inode);
+			if (!tmp) {
+				spin_lock(&MSDOS_I(inode)->cache_lru_lock);
+				MSDOS_I(inode)->nr_caches--;
+				spin_unlock(&MSDOS_I(inode)->cache_lru_lock);
+				return;
+			}
+
 			spin_lock(&MSDOS_I(inode)->cache_lru_lock);
 			cache = fat_cache_merge(inode, new);
 			if (cache != NULL) {
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index ee42b9e..4ad6473 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -98,7 +98,7 @@ next:
 
 	*bh = sb_bread(sb, phys);
 	if (*bh == NULL) {
-		printk(KERN_ERR "FAT: Directory bread(block %llu) failed\n",
+		fat_msg(sb, KERN_ERR, "Directory bread(block %llu) failed",
 		       (llu)phys);
 		/* skip this block */
 		*pos = (iblock + 1) << sb->s_blocksize_bits;
@@ -136,9 +136,10 @@ static inline int fat_get_entry(struct inode *dir, loff_t *pos,
  * but ignore that right now.
  * Ahem... Stack smashing in ring 0 isn't fun. Fixed.
  */
-static int uni16_to_x8(unsigned char *ascii, const wchar_t *uni, int len,
-		       int uni_xlate, struct nls_table *nls)
+static int uni16_to_x8(struct super_block *sb, unsigned char *ascii,
+		       const wchar_t *uni, int len, struct nls_table *nls)
 {
+	int uni_xlate = MSDOS_SB(sb)->options.unicode_xlate;
 	const wchar_t *ip;
 	wchar_t ec;
 	unsigned char *op;
@@ -166,23 +167,23 @@ static int uni16_to_x8(unsigned char *ascii, const wchar_t *uni, int len,
 	}
 
 	if (unlikely(*ip)) {
-		printk(KERN_WARNING "FAT: filename was truncated while "
-		       "converting.");
+		fat_msg(sb, KERN_WARNING, "filename was truncated while "
+			"converting.");
 	}
 
 	*op = 0;
 	return (op - ascii);
 }
 
-static inline int fat_uni_to_x8(struct msdos_sb_info *sbi, const wchar_t *uni,
+static inline int fat_uni_to_x8(struct super_block *sb, const wchar_t *uni,
 				unsigned char *buf, int size)
 {
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
 	if (sbi->options.utf8)
 		return utf16s_to_utf8s(uni, FAT_MAX_UNI_CHARS,
 				UTF16_HOST_ENDIAN, buf, size);
 	else
-		return uni16_to_x8(buf, uni, size, sbi->options.unicode_xlate,
-				   sbi->nls_io);
+		return uni16_to_x8(sb, buf, uni, size, sbi->nls_io);
 }
 
 static inline int
@@ -419,7 +420,7 @@ parse_record:
 
 		/* Compare shortname */
 		bufuname[last_u] = 0x0000;
-		len = fat_uni_to_x8(sbi, bufuname, bufname, sizeof(bufname));
+		len = fat_uni_to_x8(sb, bufuname, bufname, sizeof(bufname));
 		if (fat_name_match(sbi, name, name_len, bufname, len))
 			goto found;
 
@@ -428,7 +429,7 @@ parse_record:
 			int size = PATH_MAX - FAT_MAX_UNI_SIZE;
 
 			/* Compare longname */
-			len = fat_uni_to_x8(sbi, unicode, longname, size);
+			len = fat_uni_to_x8(sb, unicode, longname, size);
 			if (fat_name_match(sbi, name, name_len, longname, len))
 				goto found;
 		}
@@ -545,7 +546,7 @@ parse_record:
 		if (nr_slots) {
 			void *longname = unicode + FAT_MAX_UNI_CHARS;
 			int size = PATH_MAX - FAT_MAX_UNI_SIZE;
-			int len = fat_uni_to_x8(sbi, unicode, longname, size);
+			int len = fat_uni_to_x8(sb, unicode, longname, size);
 
 			fill_name = longname;
 			fill_len = len;
@@ -621,7 +622,7 @@ parse_record:
 
 	if (isvfat) {
 		bufuname[j] = 0x0000;
-		i = fat_uni_to_x8(sbi, bufuname, bufname, sizeof(bufname));
+		i = fat_uni_to_x8(sb, bufuname, bufname, sizeof(bufname));
 	}
 	if (nr_slots) {
 		/* hack for fat_ioctl_filldir() */
@@ -979,6 +980,7 @@ static int __fat_remove_entries(struct inode *dir, loff_t pos, int nr_slots)
 
 int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo)
 {
+	struct super_block *sb = dir->i_sb;
 	struct msdos_dir_entry *de;
 	struct buffer_head *bh;
 	int err = 0, nr_slots;
@@ -1013,8 +1015,8 @@ int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo)
 		 */
 		err = __fat_remove_entries(dir, sinfo->slot_off, nr_slots);
 		if (err) {
-			printk(KERN_WARNING
-			       "FAT: Couldn't remove the long name slots\n");
+			fat_msg(sb, KERN_WARNING,
+			       "Couldn't remove the long name slots");
 		}
 	}
 
@@ -1265,7 +1267,7 @@ int fat_add_entries(struct inode *dir, void *slots, int nr_slots,
 		if (sbi->fat_bits != 32)
 			goto error;
 	} else if (MSDOS_I(dir)->i_start == 0) {
-		printk(KERN_ERR "FAT: Corrupted directory (i_pos %lld)\n",
+		fat_msg(sb, KERN_ERR, "Corrupted directory (i_pos %lld)",
 		       MSDOS_I(dir)->i_pos);
 		err = -EIO;
 		goto error;
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index f504089..8276cc2 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -319,19 +319,20 @@ extern struct inode *fat_build_inode(struct super_block *sb,
 			struct msdos_dir_entry *de, loff_t i_pos);
 extern int fat_sync_inode(struct inode *inode);
 extern int fat_fill_super(struct super_block *sb, void *data, int silent,
-			const struct inode_operations *fs_dir_inode_ops,
-			int isvfat, void (*setup)(struct super_block *));
+			  int isvfat, void (*setup)(struct super_block *));
 
 extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
 		            struct inode *i2);
 /* fat/misc.c */
 extern void
-__fat_fs_error(struct super_block *s, int report, const char *fmt, ...)
+__fat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
+	__attribute__ ((format (printf, 3, 4))) __cold;
+#define fat_fs_error(sb, fmt, args...)		\
+	__fat_fs_error(sb, 1, fmt , ## args)
+#define fat_fs_error_ratelimit(sb, fmt, args...) \
+	__fat_fs_error(sb, __ratelimit(&MSDOS_SB(sb)->ratelimit), fmt , ## args)
+void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...)
 	__attribute__ ((format (printf, 3, 4))) __cold;
-#define fat_fs_error(s, fmt, args...)		\
-	__fat_fs_error(s, 1, fmt , ## args)
-#define fat_fs_error_ratelimit(s, fmt, args...) \
-	__fat_fs_error(s, __ratelimit(&MSDOS_SB(s)->ratelimit), fmt , ## args)
 extern int fat_clusters_flush(struct super_block *sb);
 extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
 extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index b47d2c9..2e81ac0 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -95,7 +95,7 @@ static int fat12_ent_bread(struct super_block *sb, struct fat_entry *fatent,
 err_brelse:
 	brelse(bhs[0]);
 err:
-	printk(KERN_ERR "FAT: FAT read failed (blocknr %llu)\n", (llu)blocknr);
+	fat_msg(sb, KERN_ERR, "FAT read failed (blocknr %llu)", (llu)blocknr);
 	return -EIO;
 }
 
@@ -108,7 +108,7 @@ static int fat_ent_bread(struct super_block *sb, struct fat_entry *fatent,
 	fatent->fat_inode = MSDOS_SB(sb)->fat_inode;
 	fatent->bhs[0] = sb_bread(sb, blocknr);
 	if (!fatent->bhs[0]) {
-		printk(KERN_ERR "FAT: FAT read failed (blocknr %llu)\n",
+		fat_msg(sb, KERN_ERR, "FAT read failed (blocknr %llu)",
 		       (llu)blocknr);
 		return -EIO;
 	}
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 8d68690..cb8d839 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -581,7 +581,8 @@ static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_bavail = sbi->free_clusters;
 	buf->f_fsid.val[0] = (u32)id;
 	buf->f_fsid.val[1] = (u32)(id >> 32);
-	buf->f_namelen = sbi->options.isvfat ? FAT_LFN_LEN : 12;
+	buf->f_namelen =
+		(sbi->options.isvfat ? FAT_LFN_LEN : 12) * NLS_MAX_CHARSET_SIZE;
 
 	return 0;
 }
@@ -619,8 +620,8 @@ retry:
 
 	bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits);
 	if (!bh) {
-		printk(KERN_ERR "FAT: unable to read inode block "
-		       "for updating (i_pos %lld)\n", i_pos);
+		fat_msg(sb, KERN_ERR, "unable to read inode block "
+		       "for updating (i_pos %lld)", i_pos);
 		return -EIO;
 	}
 	spin_lock(&sbi->inode_hash_lock);
@@ -976,8 +977,8 @@ static const match_table_t vfat_tokens = {
 	{Opt_err, NULL}
 };
 
-static int parse_options(char *options, int is_vfat, int silent, int *debug,
-			 struct fat_mount_options *opts)
+static int parse_options(struct super_block *sb, char *options, int is_vfat,
+			 int silent, int *debug, struct fat_mount_options *opts)
 {
 	char *p;
 	substring_t args[MAX_OPT_ARGS];
@@ -1168,15 +1169,15 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
 
 		/* obsolete mount options */
 		case Opt_obsolate:
-			printk(KERN_INFO "FAT: \"%s\" option is obsolete, "
-			       "not supported now\n", p);
+			fat_msg(sb, KERN_INFO, "\"%s\" option is obsolete, "
+			       "not supported now", p);
 			break;
 		/* unknown option */
 		default:
 			if (!silent) {
-				printk(KERN_ERR
-				       "FAT: Unrecognized mount option \"%s\" "
-				       "or missing value\n", p);
+				fat_msg(sb, KERN_ERR,
+				       "Unrecognized mount option \"%s\" "
+				       "or missing value", p);
 			}
 			return -EINVAL;
 		}
@@ -1185,7 +1186,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
 out:
 	/* UTF-8 doesn't provide FAT semantics */
 	if (!strcmp(opts->iocharset, "utf8")) {
-		printk(KERN_ERR "FAT: utf8 is not a recommended IO charset"
+		fat_msg(sb, KERN_ERR, "utf8 is not a recommended IO charset"
 		       " for FAT filesystems, filesystem will be "
 		       "case sensitive!\n");
 	}
@@ -1238,8 +1239,7 @@ static int fat_read_root(struct inode *inode)
 /*
  * Read the super block of an MS-DOS FS.
  */
-int fat_fill_super(struct super_block *sb, void *data, int silent,
-		   const struct inode_operations *fs_dir_inode_ops, int isvfat,
+int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
 		   void (*setup)(struct super_block *))
 {
 	struct inode *root_inode = NULL, *fat_inode = NULL;
@@ -1268,11 +1268,10 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
 	sb->s_magic = MSDOS_SUPER_MAGIC;
 	sb->s_op = &fat_sops;
 	sb->s_export_op = &fat_export_ops;
-	sbi->dir_ops = fs_dir_inode_ops;
 	ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL,
 			     DEFAULT_RATELIMIT_BURST);
 
-	error = parse_options(data, isvfat, silent, &debug, &sbi->options);
+	error = parse_options(sb, data, isvfat, silent, &debug, &sbi->options);
 	if (error)
 		goto out_fail;
 
@@ -1282,20 +1281,20 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
 	sb_min_blocksize(sb, 512);
 	bh = sb_bread(sb, 0);
 	if (bh == NULL) {
-		printk(KERN_ERR "FAT: unable to read boot sector\n");
+		fat_msg(sb, KERN_ERR, "unable to read boot sector");
 		goto out_fail;
 	}
 
 	b = (struct fat_boot_sector *) bh->b_data;
 	if (!b->reserved) {
 		if (!silent)
-			printk(KERN_ERR "FAT: bogus number of reserved sectors\n");
+			fat_msg(sb, KERN_ERR, "bogus number of reserved sectors");
 		brelse(bh);
 		goto out_invalid;
 	}
 	if (!b->fats) {
 		if (!silent)
-			printk(KERN_ERR "FAT: bogus number of FAT structure\n");
+			fat_msg(sb, KERN_ERR, "bogus number of FAT structure");
 		brelse(bh);
 		goto out_invalid;
 	}
@@ -1308,7 +1307,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
 	media = b->media;
 	if (!fat_valid_media(media)) {
 		if (!silent)
-			printk(KERN_ERR "FAT: invalid media value (0x%02x)\n",
+			fat_msg(sb, KERN_ERR, "invalid media value (0x%02x)",
 			       media);
 		brelse(bh);
 		goto out_invalid;
@@ -1318,7 +1317,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
 	    || (logical_sector_size < 512)
 	    || (logical_sector_size > 4096)) {
 		if (!silent)
-			printk(KERN_ERR "FAT: bogus logical sector size %u\n",
+			fat_msg(sb, KERN_ERR, "bogus logical sector size %u",
 			       logical_sector_size);
 		brelse(bh);
 		goto out_invalid;
@@ -1326,15 +1325,15 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
 	sbi->sec_per_clus = b->sec_per_clus;
 	if (!is_power_of_2(sbi->sec_per_clus)) {
 		if (!silent)
-			printk(KERN_ERR "FAT: bogus sectors per cluster %u\n",
+			fat_msg(sb, KERN_ERR, "bogus sectors per cluster %u",
 			       sbi->sec_per_clus);
 		brelse(bh);
 		goto out_invalid;
 	}
 
 	if (logical_sector_size < sb->s_blocksize) {
-		printk(KERN_ERR "FAT: logical sector size too small for device"
-		       " (logical sector size = %u)\n", logical_sector_size);
+		fat_msg(sb, KERN_ERR, "logical sector size too small for device"
+		       " (logical sector size = %u)", logical_sector_size);
 		brelse(bh);
 		goto out_fail;
 	}
@@ -1342,14 +1341,14 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
 		brelse(bh);
 
 		if (!sb_set_blocksize(sb, logical_sector_size)) {
-			printk(KERN_ERR "FAT: unable to set blocksize %u\n",
+			fat_msg(sb, KERN_ERR, "unable to set blocksize %u",
 			       logical_sector_size);
 			goto out_fail;
 		}
 		bh = sb_bread(sb, 0);
 		if (bh == NULL) {
-			printk(KERN_ERR "FAT: unable to read boot sector"
-			       " (logical sector size = %lu)\n",
+			fat_msg(sb, KERN_ERR, "unable to read boot sector"
+			       " (logical sector size = %lu)",
 			       sb->s_blocksize);
 			goto out_fail;
 		}
@@ -1385,16 +1384,16 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
 
 		fsinfo_bh = sb_bread(sb, sbi->fsinfo_sector);
 		if (fsinfo_bh == NULL) {
-			printk(KERN_ERR "FAT: bread failed, FSINFO block"
-			       " (sector = %lu)\n", sbi->fsinfo_sector);
+			fat_msg(sb, KERN_ERR, "bread failed, FSINFO block"
+			       " (sector = %lu)", sbi->fsinfo_sector);
 			brelse(bh);
 			goto out_fail;
 		}
 
 		fsinfo = (struct fat_boot_fsinfo *)fsinfo_bh->b_data;
 		if (!IS_FSINFO(fsinfo)) {
-			printk(KERN_WARNING "FAT: Invalid FSINFO signature: "
-			       "0x%08x, 0x%08x (sector = %lu)\n",
+			fat_msg(sb, KERN_WARNING, "Invalid FSINFO signature: "
+			       "0x%08x, 0x%08x (sector = %lu)",
 			       le32_to_cpu(fsinfo->signature1),
 			       le32_to_cpu(fsinfo->signature2),
 			       sbi->fsinfo_sector);
@@ -1415,8 +1414,8 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
 	sbi->dir_entries = get_unaligned_le16(&b->dir_entries);
 	if (sbi->dir_entries & (sbi->dir_per_block - 1)) {
 		if (!silent)
-			printk(KERN_ERR "FAT: bogus directroy-entries per block"
-			       " (%u)\n", sbi->dir_entries);
+			fat_msg(sb, KERN_ERR, "bogus directroy-entries per block"
+			       " (%u)", sbi->dir_entries);
 		brelse(bh);
 		goto out_invalid;
 	}
@@ -1438,7 +1437,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
 	total_clusters = min(total_clusters, fat_clusters - FAT_START_ENT);
 	if (total_clusters > MAX_FAT(sb)) {
 		if (!silent)
-			printk(KERN_ERR "FAT: count of clusters too big (%u)\n",
+			fat_msg(sb, KERN_ERR, "count of clusters too big (%u)",
 			       total_clusters);
 		brelse(bh);
 		goto out_invalid;
@@ -1471,7 +1470,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
 	sprintf(buf, "cp%d", sbi->options.codepage);
 	sbi->nls_disk = load_nls(buf);
 	if (!sbi->nls_disk) {
-		printk(KERN_ERR "FAT: codepage %s not found\n", buf);
+		fat_msg(sb, KERN_ERR, "codepage %s not found", buf);
 		goto out_fail;
 	}
 
@@ -1479,7 +1478,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
 	if (sbi->options.isvfat) {
 		sbi->nls_io = load_nls(sbi->options.iocharset);
 		if (!sbi->nls_io) {
-			printk(KERN_ERR "FAT: IO charset %s not found\n",
+			fat_msg(sb, KERN_ERR, "IO charset %s not found",
 			       sbi->options.iocharset);
 			goto out_fail;
 		}
@@ -1503,7 +1502,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
 	insert_inode_hash(root_inode);
 	sb->s_root = d_alloc_root(root_inode);
 	if (!sb->s_root) {
-		printk(KERN_ERR "FAT: get root inode failed\n");
+		fat_msg(sb, KERN_ERR, "get root inode failed");
 		goto out_fail;
 	}
 
@@ -1512,8 +1511,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
 out_invalid:
 	error = -EINVAL;
 	if (!silent)
-		printk(KERN_INFO "VFS: Can't find a valid FAT filesystem"
-		       " on dev %s.\n", sb->s_id);
+		fat_msg(sb, KERN_INFO, "Can't find a valid FAT filesystem");
 
 out_fail:
 	if (fat_inode)
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 970e682..6d93360 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -20,30 +20,46 @@
  * In case the file system is remounted read-only, it can be made writable
  * again by remounting it.
  */
-void __fat_fs_error(struct super_block *s, int report, const char *fmt, ...)
+void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
 {
-	struct fat_mount_options *opts = &MSDOS_SB(s)->options;
+	struct fat_mount_options *opts = &MSDOS_SB(sb)->options;
 	va_list args;
+	struct va_format vaf;
 
 	if (report) {
-		printk(KERN_ERR "FAT: Filesystem error (dev %s)\n", s->s_id);
-
-		printk(KERN_ERR "    ");
 		va_start(args, fmt);
-		vprintk(fmt, args);
+		vaf.fmt = fmt;
+		vaf.va = &args;
+		printk(KERN_ERR "FAT-fs (%s): error, %pV\n", sb->s_id, &vaf);
 		va_end(args);
-		printk("\n");
 	}
 
 	if (opts->errors == FAT_ERRORS_PANIC)
-		panic("FAT: fs panic from previous error\n");
-	else if (opts->errors == FAT_ERRORS_RO && !(s->s_flags & MS_RDONLY)) {
-		s->s_flags |= MS_RDONLY;
-		printk(KERN_ERR "FAT: Filesystem has been set read-only\n");
+		panic("FAT-fs (%s): fs panic from previous error\n", sb->s_id);
+	else if (opts->errors == FAT_ERRORS_RO && !(sb->s_flags & MS_RDONLY)) {
+		sb->s_flags |= MS_RDONLY;
+		printk(KERN_ERR "FAT-fs (%s): Filesystem has been "
+				"set read-only\n", sb->s_id);
 	}
 }
 EXPORT_SYMBOL_GPL(__fat_fs_error);
 
+/**
+ * fat_msg() - print preformated FAT specific messages. Every thing what is
+ * not fat_fs_error() should be fat_msg().
+ */
+void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+	printk("%sFAT-fs (%s): %pV\n", level, sb->s_id, &vaf);
+	va_end(args);
+}
+
 /* Flushes the number of free clusters on FAT32 */
 /* XXX: Need to write one per FSINFO block.  Currently only writes 1 */
 int fat_clusters_flush(struct super_block *sb)
@@ -57,15 +73,15 @@ int fat_clusters_flush(struct super_block *sb)
 
 	bh = sb_bread(sb, sbi->fsinfo_sector);
 	if (bh == NULL) {
-		printk(KERN_ERR "FAT: bread failed in fat_clusters_flush\n");
+		fat_msg(sb, KERN_ERR, "bread failed in fat_clusters_flush");
 		return -EIO;
 	}
 
 	fsinfo = (struct fat_boot_fsinfo *)bh->b_data;
 	/* Sanity check */
 	if (!IS_FSINFO(fsinfo)) {
-		printk(KERN_ERR "FAT: Invalid FSINFO signature: "
-		       "0x%08x, 0x%08x (sector = %lu)\n",
+		fat_msg(sb, KERN_ERR, "Invalid FSINFO signature: "
+		       "0x%08x, 0x%08x (sector = %lu)",
 		       le32_to_cpu(fsinfo->signature1),
 		       le32_to_cpu(fsinfo->signature2),
 		       sbi->fsinfo_sector);
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 7114990..3b222da 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -659,14 +659,14 @@ static const struct inode_operations msdos_dir_inode_operations = {
 
 static void setup(struct super_block *sb)
 {
+	MSDOS_SB(sb)->dir_ops = &msdos_dir_inode_operations;
 	sb->s_d_op = &msdos_dentry_operations;
 	sb->s_flags |= MS_NOATIME;
 }
 
 static int msdos_fill_super(struct super_block *sb, void *data, int silent)
 {
-	return fat_fill_super(sb, data, silent, &msdos_dir_inode_operations,
-			     0, setup);
+	return fat_fill_super(sb, data, silent, 0, setup);
 }
 
 static struct dentry *msdos_mount(struct file_system_type *fs_type,
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index adae3fb..20b4ea5 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -1065,6 +1065,7 @@ static const struct inode_operations vfat_dir_inode_operations = {
 
 static void setup(struct super_block *sb)
 {
+	MSDOS_SB(sb)->dir_ops = &vfat_dir_inode_operations;
 	if (MSDOS_SB(sb)->options.name_check != 's')
 		sb->s_d_op = &vfat_ci_dentry_ops;
 	else
@@ -1073,8 +1074,7 @@ static void setup(struct super_block *sb)
 
 static int vfat_fill_super(struct super_block *sb, void *data, int silent)
 {
-	return fat_fill_super(sb, data, silent, &vfat_dir_inode_operations,
-			     1, setup);
+	return fat_fill_super(sb, data, silent, 1, setup);
 }
 
 static struct dentry *vfat_mount(struct file_system_type *fs_type,
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index 2ba6719..1a43114 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -272,7 +272,7 @@ vxfs_get_fake_inode(struct super_block *sbp, struct vxfs_inode_info *vip)
  * *ip:			VFS inode
  *
  * Description:
- *  vxfs_put_fake_inode frees all data asssociated with @ip.
+ *  vxfs_put_fake_inode frees all data associated with @ip.
  */
 void
 vxfs_put_fake_inode(struct inode *ip)
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 48a18f1..30afdfa 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -33,8 +33,6 @@ void fscache_enqueue_operation(struct fscache_operation *op)
 	_enter("{OBJ%x OP%x,%u}",
 	       op->object->debug_id, op->debug_id, atomic_read(&op->usage));
 
-	fscache_set_op_state(op, "EnQ");
-
 	ASSERT(list_empty(&op->pend_link));
 	ASSERT(op->processor != NULL);
 	ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE);
@@ -66,8 +64,6 @@ EXPORT_SYMBOL(fscache_enqueue_operation);
 static void fscache_run_op(struct fscache_object *object,
 			   struct fscache_operation *op)
 {
-	fscache_set_op_state(op, "Run");
-
 	object->n_in_progress++;
 	if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
 		wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
@@ -88,8 +84,6 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
 
 	_enter("{OBJ%x OP%x},", object->debug_id, op->debug_id);
 
-	fscache_set_op_state(op, "SubmitX");
-
 	spin_lock(&object->lock);
 	ASSERTCMP(object->n_ops, >=, object->n_in_progress);
 	ASSERTCMP(object->n_ops, >=, object->n_exclusive);
@@ -194,8 +188,6 @@ int fscache_submit_op(struct fscache_object *object,
 
 	ASSERTCMP(atomic_read(&op->usage), >, 0);
 
-	fscache_set_op_state(op, "Submit");
-
 	spin_lock(&object->lock);
 	ASSERTCMP(object->n_ops, >=, object->n_in_progress);
 	ASSERTCMP(object->n_ops, >=, object->n_exclusive);
@@ -335,8 +327,6 @@ void fscache_put_operation(struct fscache_operation *op)
 	if (!atomic_dec_and_test(&op->usage))
 		return;
 
-	fscache_set_op_state(op, "Put");
-
 	_debug("PUT OP");
 	if (test_and_set_bit(FSCACHE_OP_DEAD, &op->flags))
 		BUG();
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 41c441c..a2a5d19 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -155,11 +155,9 @@ static void fscache_attr_changed_op(struct fscache_operation *op)
 	fscache_stat(&fscache_n_attr_changed_calls);
 
 	if (fscache_object_is_active(object)) {
-		fscache_set_op_state(op, "CallFS");
 		fscache_stat(&fscache_n_cop_attr_changed);
 		ret = object->cache->ops->attr_changed(object);
 		fscache_stat_d(&fscache_n_cop_attr_changed);
-		fscache_set_op_state(op, "Done");
 		if (ret < 0)
 			fscache_abort_object(object);
 	}
@@ -190,7 +188,6 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
 
 	fscache_operation_init(op, fscache_attr_changed_op, NULL);
 	op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE);
-	fscache_set_op_name(op, "Attr");
 
 	spin_lock(&cookie->lock);
 
@@ -257,7 +254,6 @@ static struct fscache_retrieval *fscache_alloc_retrieval(
 	op->context	= context;
 	op->start_time	= jiffies;
 	INIT_LIST_HEAD(&op->to_do);
-	fscache_set_op_name(&op->op, "Retr");
 	return op;
 }
 
@@ -368,7 +364,6 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
 		_leave(" = -ENOMEM");
 		return -ENOMEM;
 	}
-	fscache_set_op_name(&op->op, "RetrRA1");
 
 	spin_lock(&cookie->lock);
 
@@ -487,7 +482,6 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
 	op = fscache_alloc_retrieval(mapping, end_io_func, context);
 	if (!op)
 		return -ENOMEM;
-	fscache_set_op_name(&op->op, "RetrRAN");
 
 	spin_lock(&cookie->lock);
 
@@ -589,7 +583,6 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
 	op = fscache_alloc_retrieval(page->mapping, NULL, NULL);
 	if (!op)
 		return -ENOMEM;
-	fscache_set_op_name(&op->op, "RetrAL1");
 
 	spin_lock(&cookie->lock);
 
@@ -662,8 +655,6 @@ static void fscache_write_op(struct fscache_operation *_op)
 
 	_enter("{OP%x,%d}", op->op.debug_id, atomic_read(&op->op.usage));
 
-	fscache_set_op_state(&op->op, "GetPage");
-
 	spin_lock(&object->lock);
 	cookie = object->cookie;
 
@@ -698,15 +689,12 @@ static void fscache_write_op(struct fscache_operation *_op)
 	spin_unlock(&cookie->stores_lock);
 	spin_unlock(&object->lock);
 
-	fscache_set_op_state(&op->op, "Store");
 	fscache_stat(&fscache_n_store_pages);
 	fscache_stat(&fscache_n_cop_write_page);
 	ret = object->cache->ops->write_page(op, page);
 	fscache_stat_d(&fscache_n_cop_write_page);
-	fscache_set_op_state(&op->op, "EndWrite");
 	fscache_end_page_write(object, page);
 	if (ret < 0) {
-		fscache_set_op_state(&op->op, "Abort");
 		fscache_abort_object(object);
 	} else {
 		fscache_enqueue_operation(&op->op);
@@ -778,7 +766,6 @@ int __fscache_write_page(struct fscache_cookie *cookie,
 	fscache_operation_init(&op->op, fscache_write_op,
 			       fscache_release_write_op);
 	op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING);
-	fscache_set_op_name(&op->op, "Write1");
 
 	ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
 	if (ret < 0)
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index f3d23ef..8612820 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -1,9 +1,9 @@
 ccflags-y := -I$(src)
 obj-$(CONFIG_GFS2_FS) += gfs2.o
 gfs2-y := acl.o bmap.o dir.o xattr.o glock.o \
-	glops.o inode.o log.o lops.o main.o meta_io.o \
+	glops.o log.o lops.o main.o meta_io.o \
 	aops.o dentry.o export.o file.o \
-	ops_fstype.o ops_inode.o quota.o \
+	ops_fstype.o inode.o quota.o \
 	recovery.o rgrp.o super.o sys.o trans.o util.o
 
 gfs2-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 0f5c4f9..802ac5e 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -1076,8 +1076,8 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
 		bd = bh->b_private;
 		if (bd && bd->bd_ail)
 			goto cannot_release;
-		gfs2_assert_warn(sdp, !buffer_pinned(bh));
-		gfs2_assert_warn(sdp, !buffer_dirty(bh));
+		if (buffer_pinned(bh) || buffer_dirty(bh))
+			goto not_possible;
 		bh = bh->b_this_page;
 	} while(bh != head);
 	gfs2_log_unlock(sdp);
@@ -1107,6 +1107,10 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
 	} while (bh != head);
 
 	return try_to_free_buffers(page);
+
+not_possible: /* Should never happen */
+	WARN_ON(buffer_dirty(bh));
+	WARN_ON(buffer_pinned(bh));
 cannot_release:
 	gfs2_log_unlock(sdp);
 	return 0;
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 74add2d..e65493a 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -780,6 +780,8 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
 	metadata = (height != ip->i_height - 1);
 	if (metadata)
 		revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
+	else if (ip->i_depth)
+		revokes = sdp->sd_inptrs;
 
 	if (ip != GFS2_I(sdp->sd_rindex))
 		error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index f789c57..091ee47 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -82,12 +82,9 @@
 struct qstr gfs2_qdot __read_mostly;
 struct qstr gfs2_qdotdot __read_mostly;
 
-typedef int (*leaf_call_t) (struct gfs2_inode *dip, u32 index, u32 len,
-			    u64 leaf_no, void *data);
 typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent,
 			    const struct qstr *name, void *opaque);
 
-
 int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
 			    struct buffer_head **bhp)
 {
@@ -1600,7 +1597,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
  */
 
 int gfs2_dir_add(struct inode *inode, const struct qstr *name,
-		 const struct gfs2_inode *nip, unsigned type)
+		 const struct gfs2_inode *nip)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct buffer_head *bh;
@@ -1616,7 +1613,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
 				return PTR_ERR(dent);
 			dent = gfs2_init_dirent(inode, dent, name, bh);
 			gfs2_inum_out(nip, dent);
-			dent->de_type = cpu_to_be16(type);
+			dent->de_type = cpu_to_be16(IF2DT(nip->i_inode.i_mode));
 			if (ip->i_diskflags & GFS2_DIF_EXHASH) {
 				leaf = (struct gfs2_leaf *)bh->b_data;
 				be16_add_cpu(&leaf->lf_entries, 1);
@@ -1628,6 +1625,8 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
 			gfs2_trans_add_bh(ip->i_gl, bh, 1);
 			ip->i_entries++;
 			ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
+			if (S_ISDIR(nip->i_inode.i_mode))
+				inc_nlink(&ip->i_inode);
 			gfs2_dinode_out(ip, bh->b_data);
 			brelse(bh);
 			error = 0;
@@ -1672,8 +1671,9 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
  * Returns: 0 on success, error code on failure
  */
 
-int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name)
+int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry)
 {
+	const struct qstr *name = &dentry->d_name;
 	struct gfs2_dirent *dent, *prev = NULL;
 	struct buffer_head *bh;
 	int error;
@@ -1714,6 +1714,8 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name)
 	gfs2_trans_add_bh(dip->i_gl, bh, 1);
 	dip->i_entries--;
 	dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME;
+	if (S_ISDIR(dentry->d_inode->i_mode))
+		drop_nlink(&dip->i_inode);
 	gfs2_dinode_out(dip, bh->b_data);
 	brelse(bh);
 	mark_inode_dirty(&dip->i_inode);
@@ -1768,94 +1770,20 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
 }
 
 /**
- * foreach_leaf - call a function for each leaf in a directory
- * @dip: the directory
- * @lc: the function to call for each each
- * @data: private data to pass to it
- *
- * Returns: errno
- */
-
-static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
-{
-	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
-	struct buffer_head *bh;
-	struct gfs2_leaf *leaf;
-	u32 hsize, len;
-	u32 ht_offset, lp_offset, ht_offset_cur = -1;
-	u32 index = 0;
-	__be64 *lp;
-	u64 leaf_no;
-	int error = 0;
-
-	hsize = 1 << dip->i_depth;
-	if (hsize * sizeof(u64) != i_size_read(&dip->i_inode)) {
-		gfs2_consist_inode(dip);
-		return -EIO;
-	}
-
-	lp = kmalloc(sdp->sd_hash_bsize, GFP_NOFS);
-	if (!lp)
-		return -ENOMEM;
-
-	while (index < hsize) {
-		lp_offset = index & (sdp->sd_hash_ptrs - 1);
-		ht_offset = index - lp_offset;
-
-		if (ht_offset_cur != ht_offset) {
-			error = gfs2_dir_read_data(dip, (char *)lp,
-						ht_offset * sizeof(__be64),
-						sdp->sd_hash_bsize, 1);
-			if (error != sdp->sd_hash_bsize) {
-				if (error >= 0)
-					error = -EIO;
-				goto out;
-			}
-			ht_offset_cur = ht_offset;
-		}
-
-		leaf_no = be64_to_cpu(lp[lp_offset]);
-		if (leaf_no) {
-			error = get_leaf(dip, leaf_no, &bh);
-			if (error)
-				goto out;
-			leaf = (struct gfs2_leaf *)bh->b_data;
-			len = 1 << (dip->i_depth - be16_to_cpu(leaf->lf_depth));
-			brelse(bh);
-
-			error = lc(dip, index, len, leaf_no, data);
-			if (error)
-				goto out;
-
-			index = (index & ~(len - 1)) + len;
-		} else
-			index++;
-	}
-
-	if (index != hsize) {
-		gfs2_consist_inode(dip);
-		error = -EIO;
-	}
-
-out:
-	kfree(lp);
-
-	return error;
-}
-
-/**
  * leaf_dealloc - Deallocate a directory leaf
  * @dip: the directory
  * @index: the hash table offset in the directory
  * @len: the number of pointers to this leaf
  * @leaf_no: the leaf number
- * @data: not used
+ * @leaf_bh: buffer_head for the starting leaf
+ * last_dealloc: 1 if this is the final dealloc for the leaf, else 0
  *
  * Returns: errno
  */
 
 static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
-			u64 leaf_no, void *data)
+			u64 leaf_no, struct buffer_head *leaf_bh,
+			int last_dealloc)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
 	struct gfs2_leaf *tmp_leaf;
@@ -1887,14 +1815,18 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
 		goto out_qs;
 
 	/*  Count the number of leaves  */
+	bh = leaf_bh;
 
 	for (blk = leaf_no; blk; blk = nblk) {
-		error = get_leaf(dip, blk, &bh);
-		if (error)
-			goto out_rlist;
+		if (blk != leaf_no) {
+			error = get_leaf(dip, blk, &bh);
+			if (error)
+				goto out_rlist;
+		}
 		tmp_leaf = (struct gfs2_leaf *)bh->b_data;
 		nblk = be64_to_cpu(tmp_leaf->lf_next);
-		brelse(bh);
+		if (blk != leaf_no)
+			brelse(bh);
 
 		gfs2_rlist_add(sdp, &rlist, blk);
 		l_blocks++;
@@ -1918,13 +1850,18 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
 	if (error)
 		goto out_rg_gunlock;
 
+	bh = leaf_bh;
+
 	for (blk = leaf_no; blk; blk = nblk) {
-		error = get_leaf(dip, blk, &bh);
-		if (error)
-			goto out_end_trans;
+		if (blk != leaf_no) {
+			error = get_leaf(dip, blk, &bh);
+			if (error)
+				goto out_end_trans;
+		}
 		tmp_leaf = (struct gfs2_leaf *)bh->b_data;
 		nblk = be64_to_cpu(tmp_leaf->lf_next);
-		brelse(bh);
+		if (blk != leaf_no)
+			brelse(bh);
 
 		gfs2_free_meta(dip, blk, 1);
 		gfs2_add_inode_blocks(&dip->i_inode, -1);
@@ -1942,6 +1879,10 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
 		goto out_end_trans;
 
 	gfs2_trans_add_bh(dip->i_gl, dibh, 1);
+	/* On the last dealloc, make this a regular file in case we crash.
+	   (We don't want to free these blocks a second time.)  */
+	if (last_dealloc)
+		dip->i_inode.i_mode = S_IFREG;
 	gfs2_dinode_out(dip, dibh->b_data);
 	brelse(dibh);
 
@@ -1975,29 +1916,67 @@ int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
 	struct buffer_head *bh;
-	int error;
+	struct gfs2_leaf *leaf;
+	u32 hsize, len;
+	u32 ht_offset, lp_offset, ht_offset_cur = -1;
+	u32 index = 0, next_index;
+	__be64 *lp;
+	u64 leaf_no;
+	int error = 0, last;
 
-	/* Dealloc on-disk leaves to FREEMETA state */
-	error = foreach_leaf(dip, leaf_dealloc, NULL);
-	if (error)
-		return error;
+	hsize = 1 << dip->i_depth;
+	if (hsize * sizeof(u64) != i_size_read(&dip->i_inode)) {
+		gfs2_consist_inode(dip);
+		return -EIO;
+	}
 
-	/* Make this a regular file in case we crash.
-	   (We don't want to free these blocks a second time.)  */
+	lp = kmalloc(sdp->sd_hash_bsize, GFP_NOFS);
+	if (!lp)
+		return -ENOMEM;
 
-	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
-	if (error)
-		return error;
+	while (index < hsize) {
+		lp_offset = index & (sdp->sd_hash_ptrs - 1);
+		ht_offset = index - lp_offset;
 
-	error = gfs2_meta_inode_buffer(dip, &bh);
-	if (!error) {
-		gfs2_trans_add_bh(dip->i_gl, bh, 1);
-		((struct gfs2_dinode *)bh->b_data)->di_mode =
-						cpu_to_be32(S_IFREG);
-		brelse(bh);
+		if (ht_offset_cur != ht_offset) {
+			error = gfs2_dir_read_data(dip, (char *)lp,
+						ht_offset * sizeof(__be64),
+						sdp->sd_hash_bsize, 1);
+			if (error != sdp->sd_hash_bsize) {
+				if (error >= 0)
+					error = -EIO;
+				goto out;
+			}
+			ht_offset_cur = ht_offset;
+		}
+
+		leaf_no = be64_to_cpu(lp[lp_offset]);
+		if (leaf_no) {
+			error = get_leaf(dip, leaf_no, &bh);
+			if (error)
+				goto out;
+			leaf = (struct gfs2_leaf *)bh->b_data;
+			len = 1 << (dip->i_depth - be16_to_cpu(leaf->lf_depth));
+
+			next_index = (index & ~(len - 1)) + len;
+			last = ((next_index >= hsize) ? 1 : 0);
+			error = leaf_dealloc(dip, index, len, leaf_no, bh,
+					     last);
+			brelse(bh);
+			if (error)
+				goto out;
+			index = next_index;
+		} else
+			index++;
 	}
 
-	gfs2_trans_end(sdp);
+	if (index != hsize) {
+		gfs2_consist_inode(dip);
+		error = -EIO;
+	}
+
+out:
+	kfree(lp);
 
 	return error;
 }
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index a98f644..e686af1 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -22,8 +22,8 @@ extern struct inode *gfs2_dir_search(struct inode *dir,
 extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
 			  const struct gfs2_inode *ip);
 extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
-			const struct gfs2_inode *ip, unsigned int type);
-extern int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename);
+			const struct gfs2_inode *ip);
+extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry);
 extern int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
 			 filldir_t filldir);
 extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index b5a5e60..fe9945f 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -139,7 +139,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
 	struct gfs2_sbd *sdp = sb->s_fs_info;
 	struct inode *inode;
 
-	inode = gfs2_ilookup(sb, inum->no_addr);
+	inode = gfs2_ilookup(sb, inum->no_addr, 0);
 	if (inode) {
 		if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) {
 			iput(inode);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index e483108..a9f5cbe 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -545,18 +545,10 @@ static int gfs2_close(struct inode *inode, struct file *file)
 /**
  * gfs2_fsync - sync the dirty data for a file (across the cluster)
  * @file: the file that points to the dentry (we ignore this)
- * @dentry: the dentry that points to the inode to sync
+ * @datasync: set if we can ignore timestamp changes
  *
- * The VFS will flush "normal" data for us. We only need to worry
- * about metadata here. For journaled data, we just do a log flush
- * as we can't avoid it. Otherwise we can just bale out if datasync
- * is set. For stuffed inodes we must flush the log in order to
- * ensure that all data is on disk.
- *
- * The call to write_inode_now() is there to write back metadata and
- * the inode itself. It does also try and write the data, but thats
- * (hopefully) a no-op due to the VFS having already called filemap_fdatawrite()
- * for us.
+ * The VFS will flush data for us. We only need to worry
+ * about metadata here.
  *
  * Returns: errno
  */
@@ -565,22 +557,20 @@ static int gfs2_fsync(struct file *file, int datasync)
 {
 	struct inode *inode = file->f_mapping->host;
 	int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC);
-	int ret = 0;
-
-	if (gfs2_is_jdata(GFS2_I(inode))) {
-		gfs2_log_flush(GFS2_SB(inode), GFS2_I(inode)->i_gl);
-		return 0;
-	}
+	struct gfs2_inode *ip = GFS2_I(inode);
+	int ret;
 
-	if (sync_state != 0) {
-		if (!datasync)
-			ret = write_inode_now(inode, 0);
+	if (datasync)
+		sync_state &= ~I_DIRTY_SYNC;
 
-		if (gfs2_is_stuffed(GFS2_I(inode)))
-			gfs2_log_flush(GFS2_SB(inode), GFS2_I(inode)->i_gl);
+	if (sync_state) {
+		ret = sync_inode_metadata(inode, 1);
+		if (ret)
+			return ret;
+		gfs2_ail_flush(ip->i_gl);
 	}
 
-	return ret;
+	return 0;
 }
 
 /**
@@ -826,6 +816,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
 	loff_t bytes, max_bytes;
 	struct gfs2_alloc *al;
 	int error;
+	loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1);
 	loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
 	next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
 
@@ -833,13 +824,15 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
 	if (mode & ~FALLOC_FL_KEEP_SIZE)
 		return -EOPNOTSUPP;
 
-	offset = (offset >> sdp->sd_sb.sb_bsize_shift) <<
-		 sdp->sd_sb.sb_bsize_shift;
+	offset &= bsize_mask;
 
 	len = next - offset;
 	bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2;
 	if (!bytes)
 		bytes = UINT_MAX;
+	bytes &= bsize_mask;
+	if (bytes == 0)
+		bytes = sdp->sd_sb.sb_bsize;
 
 	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
 	error = gfs2_glock_nq(&ip->i_gh);
@@ -870,6 +863,9 @@ retry:
 		if (error) {
 			if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
 				bytes >>= 1;
+				bytes &= bsize_mask;
+				if (bytes == 0)
+					bytes = sdp->sd_sb.sb_bsize;
 				goto retry;
 			}
 			goto out_qunlock;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 7a4fb63..2792a79 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -143,14 +143,9 @@ static int demote_ok(const struct gfs2_glock *gl)
 {
 	const struct gfs2_glock_operations *glops = gl->gl_ops;
 
-	/* assert_spin_locked(&gl->gl_spin); */
-
 	if (gl->gl_state == LM_ST_UNLOCKED)
 		return 0;
-	if (test_bit(GLF_LFLUSH, &gl->gl_flags))
-		return 0;
-	if ((gl->gl_name.ln_type != LM_TYPE_INODE) &&
-	    !list_empty(&gl->gl_holders))
+	if (!list_empty(&gl->gl_holders))
 		return 0;
 	if (glops->go_demote_ok)
 		return glops->go_demote_ok(gl);
@@ -158,6 +153,31 @@ static int demote_ok(const struct gfs2_glock *gl)
 }
 
 
+void gfs2_glock_add_to_lru(struct gfs2_glock *gl)
+{
+	spin_lock(&lru_lock);
+
+	if (!list_empty(&gl->gl_lru))
+		list_del_init(&gl->gl_lru);
+	else
+		atomic_inc(&lru_count);
+
+	list_add_tail(&gl->gl_lru, &lru_list);
+	set_bit(GLF_LRU, &gl->gl_flags);
+	spin_unlock(&lru_lock);
+}
+
+static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
+{
+	spin_lock(&lru_lock);
+	if (!list_empty(&gl->gl_lru)) {
+		list_del_init(&gl->gl_lru);
+		atomic_dec(&lru_count);
+		clear_bit(GLF_LRU, &gl->gl_flags);
+	}
+	spin_unlock(&lru_lock);
+}
+
 /**
  * __gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
  * @gl: the glock
@@ -168,24 +188,8 @@ static int demote_ok(const struct gfs2_glock *gl)
 
 static void __gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
 {
-	if (demote_ok(gl)) {
-		spin_lock(&lru_lock);
-
-		if (!list_empty(&gl->gl_lru))
-			list_del_init(&gl->gl_lru);
-		else
-			atomic_inc(&lru_count);
-
-		list_add_tail(&gl->gl_lru, &lru_list);
-		spin_unlock(&lru_lock);
-	}
-}
-
-void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
-{
-	spin_lock(&gl->gl_spin);
-	__gfs2_glock_schedule_for_reclaim(gl);
-	spin_unlock(&gl->gl_spin);
+	if (demote_ok(gl))
+		gfs2_glock_add_to_lru(gl);
 }
 
 /**
@@ -217,12 +221,7 @@ void gfs2_glock_put(struct gfs2_glock *gl)
 		spin_lock_bucket(gl->gl_hash);
 		hlist_bl_del_rcu(&gl->gl_list);
 		spin_unlock_bucket(gl->gl_hash);
-		spin_lock(&lru_lock);
-		if (!list_empty(&gl->gl_lru)) {
-			list_del_init(&gl->gl_lru);
-			atomic_dec(&lru_count);
-		}
-		spin_unlock(&lru_lock);
+		gfs2_glock_remove_from_lru(gl);
 		GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
 		GLOCK_BUG_ON(gl, mapping && mapping->nrpages);
 		trace_gfs2_glock_put(gl);
@@ -542,11 +541,6 @@ __acquires(&gl->gl_spin)
 	clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
 
 	gfs2_glock_hold(gl);
-	if (target != LM_ST_UNLOCKED && (gl->gl_state == LM_ST_SHARED ||
-	    gl->gl_state == LM_ST_DEFERRED) &&
-	    !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
-		lck_flags |= LM_FLAG_TRY_1CB;
-
 	if (sdp->sd_lockstruct.ls_ops->lm_lock)	{
 		/* lock_dlm */
 		ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags);
@@ -648,7 +642,7 @@ static void delete_work_func(struct work_struct *work)
 	/* Note: Unsafe to dereference ip as we don't hold right refs/locks */
 
 	if (ip)
-		inode = gfs2_ilookup(sdp->sd_vfs, no_addr);
+		inode = gfs2_ilookup(sdp->sd_vfs, no_addr, 1);
 	else
 		inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED);
 	if (inode && !IS_ERR(inode)) {
@@ -1025,6 +1019,9 @@ int gfs2_glock_nq(struct gfs2_holder *gh)
 	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
 		return -EIO;
 
+	if (test_bit(GLF_LRU, &gl->gl_flags))
+		gfs2_glock_remove_from_lru(gl);
+
 	spin_lock(&gl->gl_spin);
 	add_to_queue(gh);
 	if ((LM_FLAG_NOEXP & gh->gh_flags) &&
@@ -1082,7 +1079,8 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
 		    !test_bit(GLF_DEMOTE, &gl->gl_flags))
 			fast_path = 1;
 	}
-	__gfs2_glock_schedule_for_reclaim(gl);
+	if (!test_bit(GLF_LFLUSH, &gl->gl_flags))
+		__gfs2_glock_schedule_for_reclaim(gl);
 	trace_gfs2_glock_queue(gh, 0);
 	spin_unlock(&gl->gl_spin);
 	if (likely(fast_path))
@@ -1348,11 +1346,14 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
 }
 
 
-static int gfs2_shrink_glock_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
+static int gfs2_shrink_glock_memory(struct shrinker *shrink,
+				    struct shrink_control *sc)
 {
 	struct gfs2_glock *gl;
 	int may_demote;
 	int nr_skipped = 0;
+	int nr = sc->nr_to_scan;
+	gfp_t gfp_mask = sc->gfp_mask;
 	LIST_HEAD(skipped);
 
 	if (nr == 0)
@@ -1365,6 +1366,7 @@ static int gfs2_shrink_glock_memory(struct shrinker *shrink, int nr, gfp_t gfp_m
 	while(nr && !list_empty(&lru_list)) {
 		gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru);
 		list_del_init(&gl->gl_lru);
+		clear_bit(GLF_LRU, &gl->gl_flags);
 		atomic_dec(&lru_count);
 
 		/* Test for being demotable */
@@ -1387,6 +1389,7 @@ static int gfs2_shrink_glock_memory(struct shrinker *shrink, int nr, gfp_t gfp_m
 		}
 		nr_skipped++;
 		list_add(&gl->gl_lru, &skipped);
+		set_bit(GLF_LRU, &gl->gl_flags);
 	}
 	list_splice(&skipped, &lru_list);
 	atomic_add(nr_skipped, &lru_count);
@@ -1459,12 +1462,7 @@ static void thaw_glock(struct gfs2_glock *gl)
 
 static void clear_glock(struct gfs2_glock *gl)
 {
-	spin_lock(&lru_lock);
-	if (!list_empty(&gl->gl_lru)) {
-		list_del_init(&gl->gl_lru);
-		atomic_dec(&lru_count);
-	}
-	spin_unlock(&lru_lock);
+	gfs2_glock_remove_from_lru(gl);
 
 	spin_lock(&gl->gl_spin);
 	if (gl->gl_state != LM_ST_UNLOCKED)
@@ -1599,9 +1597,11 @@ static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
 	return 0;
 }
 
-static const char *gflags2str(char *buf, const unsigned long *gflags)
+static const char *gflags2str(char *buf, const struct gfs2_glock *gl)
 {
+	const unsigned long *gflags = &gl->gl_flags;
 	char *p = buf;
+
 	if (test_bit(GLF_LOCK, gflags))
 		*p++ = 'l';
 	if (test_bit(GLF_DEMOTE, gflags))
@@ -1624,6 +1624,10 @@ static const char *gflags2str(char *buf, const unsigned long *gflags)
 		*p++ = 'F';
 	if (test_bit(GLF_QUEUED, gflags))
 		*p++ = 'q';
+	if (test_bit(GLF_LRU, gflags))
+		*p++ = 'L';
+	if (gl->gl_object)
+		*p++ = 'o';
 	*p = 0;
 	return buf;
 }
@@ -1658,14 +1662,15 @@ static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
 	dtime *= 1000000/HZ; /* demote time in uSec */
 	if (!test_bit(GLF_DEMOTE, &gl->gl_flags))
 		dtime = 0;
-	gfs2_print_dbg(seq, "G:  s:%s n:%u/%llx f:%s t:%s d:%s/%llu a:%d r:%d\n",
+	gfs2_print_dbg(seq, "G:  s:%s n:%u/%llx f:%s t:%s d:%s/%llu a:%d v:%d r:%d\n",
 		  state2str(gl->gl_state),
 		  gl->gl_name.ln_type,
 		  (unsigned long long)gl->gl_name.ln_number,
-		  gflags2str(gflags_buf, &gl->gl_flags),
+		  gflags2str(gflags_buf, gl),
 		  state2str(gl->gl_target),
 		  state2str(gl->gl_demote_state), dtime,
 		  atomic_read(&gl->gl_ail_count),
+		  atomic_read(&gl->gl_revokes),
 		  atomic_read(&gl->gl_ref));
 
 	list_for_each_entry(gh, &gl->gl_holders, gh_list) {
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index aea1606..6b2f757 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -225,11 +225,10 @@ static inline int gfs2_glock_nq_init(struct gfs2_glock *gl,
 
 extern void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state);
 extern void gfs2_glock_complete(struct gfs2_glock *gl, int ret);
-extern void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
 extern void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
 extern void gfs2_glock_finish_truncate(struct gfs2_inode *ip);
 extern void gfs2_glock_thaw(struct gfs2_sbd *sdp);
-extern void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl);
+extern void gfs2_glock_add_to_lru(struct gfs2_glock *gl);
 extern void gfs2_glock_free(struct gfs2_glock *gl);
 
 extern int __init gfs2_glock_init(void);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 25eeb2b..8ef70f4 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -28,33 +28,18 @@
 #include "trans.h"
 
 /**
- * ail_empty_gl - remove all buffers for a given lock from the AIL
+ * __gfs2_ail_flush - remove all buffers for a given lock from the AIL
  * @gl: the glock
  *
  * None of the buffers should be dirty, locked, or pinned.
  */
 
-static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
+static void __gfs2_ail_flush(struct gfs2_glock *gl)
 {
 	struct gfs2_sbd *sdp = gl->gl_sbd;
 	struct list_head *head = &gl->gl_ail_list;
 	struct gfs2_bufdata *bd;
 	struct buffer_head *bh;
-	struct gfs2_trans tr;
-
-	memset(&tr, 0, sizeof(tr));
-	tr.tr_revokes = atomic_read(&gl->gl_ail_count);
-
-	if (!tr.tr_revokes)
-		return;
-
-	/* A shortened, inline version of gfs2_trans_begin() */
-	tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes, sizeof(u64));
-	tr.tr_ip = (unsigned long)__builtin_return_address(0);
-	INIT_LIST_HEAD(&tr.tr_list_buf);
-	gfs2_log_reserve(sdp, tr.tr_reserved);
-	BUG_ON(current->journal_info);
-	current->journal_info = &tr;
 
 	spin_lock(&sdp->sd_ail_lock);
 	while (!list_empty(head)) {
@@ -76,7 +61,47 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
 	}
 	gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
 	spin_unlock(&sdp->sd_ail_lock);
+}
+
+
+static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
+{
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	struct gfs2_trans tr;
+
+	memset(&tr, 0, sizeof(tr));
+	tr.tr_revokes = atomic_read(&gl->gl_ail_count);
+
+	if (!tr.tr_revokes)
+		return;
+
+	/* A shortened, inline version of gfs2_trans_begin() */
+	tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes, sizeof(u64));
+	tr.tr_ip = (unsigned long)__builtin_return_address(0);
+	INIT_LIST_HEAD(&tr.tr_list_buf);
+	gfs2_log_reserve(sdp, tr.tr_reserved);
+	BUG_ON(current->journal_info);
+	current->journal_info = &tr;
+
+	__gfs2_ail_flush(gl);
+
+	gfs2_trans_end(sdp);
+	gfs2_log_flush(sdp, NULL);
+}
+
+void gfs2_ail_flush(struct gfs2_glock *gl)
+{
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	unsigned int revokes = atomic_read(&gl->gl_ail_count);
+	int ret;
+
+	if (!revokes)
+		return;
 
+	ret = gfs2_trans_begin(sdp, 0, revokes);
+	if (ret)
+		return;
+	__gfs2_ail_flush(gl);
 	gfs2_trans_end(sdp);
 	gfs2_log_flush(sdp, NULL);
 }
@@ -227,6 +252,119 @@ static int inode_go_demote_ok(const struct gfs2_glock *gl)
 }
 
 /**
+ * gfs2_set_nlink - Set the inode's link count based on on-disk info
+ * @inode: The inode in question
+ * @nlink: The link count
+ *
+ * If the link count has hit zero, it must never be raised, whatever the
+ * on-disk inode might say. When new struct inodes are created the link
+ * count is set to 1, so that we can safely use this test even when reading
+ * in on disk information for the first time.
+ */
+
+static void gfs2_set_nlink(struct inode *inode, u32 nlink)
+{
+	/*
+	 * We will need to review setting the nlink count here in the
+	 * light of the forthcoming ro bind mount work. This is a reminder
+	 * to do that.
+	 */
+	if ((inode->i_nlink != nlink) && (inode->i_nlink != 0)) {
+		if (nlink == 0)
+			clear_nlink(inode);
+		else
+			inode->i_nlink = nlink;
+	}
+}
+
+static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
+{
+	const struct gfs2_dinode *str = buf;
+	struct timespec atime;
+	u16 height, depth;
+
+	if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr)))
+		goto corrupt;
+	ip->i_no_formal_ino = be64_to_cpu(str->di_num.no_formal_ino);
+	ip->i_inode.i_mode = be32_to_cpu(str->di_mode);
+	ip->i_inode.i_rdev = 0;
+	switch (ip->i_inode.i_mode & S_IFMT) {
+	case S_IFBLK:
+	case S_IFCHR:
+		ip->i_inode.i_rdev = MKDEV(be32_to_cpu(str->di_major),
+					   be32_to_cpu(str->di_minor));
+		break;
+	};
+
+	ip->i_inode.i_uid = be32_to_cpu(str->di_uid);
+	ip->i_inode.i_gid = be32_to_cpu(str->di_gid);
+	gfs2_set_nlink(&ip->i_inode, be32_to_cpu(str->di_nlink));
+	i_size_write(&ip->i_inode, be64_to_cpu(str->di_size));
+	gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
+	atime.tv_sec = be64_to_cpu(str->di_atime);
+	atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
+	if (timespec_compare(&ip->i_inode.i_atime, &atime) < 0)
+		ip->i_inode.i_atime = atime;
+	ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime);
+	ip->i_inode.i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec);
+	ip->i_inode.i_ctime.tv_sec = be64_to_cpu(str->di_ctime);
+	ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec);
+
+	ip->i_goal = be64_to_cpu(str->di_goal_meta);
+	ip->i_generation = be64_to_cpu(str->di_generation);
+
+	ip->i_diskflags = be32_to_cpu(str->di_flags);
+	gfs2_set_inode_flags(&ip->i_inode);
+	height = be16_to_cpu(str->di_height);
+	if (unlikely(height > GFS2_MAX_META_HEIGHT))
+		goto corrupt;
+	ip->i_height = (u8)height;
+
+	depth = be16_to_cpu(str->di_depth);
+	if (unlikely(depth > GFS2_DIR_MAX_DEPTH))
+		goto corrupt;
+	ip->i_depth = (u8)depth;
+	ip->i_entries = be32_to_cpu(str->di_entries);
+
+	ip->i_eattr = be64_to_cpu(str->di_eattr);
+	if (S_ISREG(ip->i_inode.i_mode))
+		gfs2_set_aops(&ip->i_inode);
+
+	return 0;
+corrupt:
+	gfs2_consist_inode(ip);
+	return -EIO;
+}
+
+/**
+ * gfs2_inode_refresh - Refresh the incore copy of the dinode
+ * @ip: The GFS2 inode
+ *
+ * Returns: errno
+ */
+
+int gfs2_inode_refresh(struct gfs2_inode *ip)
+{
+	struct buffer_head *dibh;
+	int error;
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (error)
+		return error;
+
+	if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), dibh, GFS2_METATYPE_DI)) {
+		brelse(dibh);
+		return -EIO;
+	}
+
+	error = gfs2_dinode_in(ip, dibh->b_data);
+	brelse(dibh);
+	clear_bit(GIF_INVALID, &ip->i_flags);
+
+	return error;
+}
+
+/**
  * inode_go_lock - operation done after an inode lock is locked by a process
  * @gl: the glock
  * @flags:
diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h
index b3aa2e3..6fce409 100644
--- a/fs/gfs2/glops.h
+++ b/fs/gfs2/glops.h
@@ -23,4 +23,6 @@ extern const struct gfs2_glock_operations gfs2_quota_glops;
 extern const struct gfs2_glock_operations gfs2_journal_glops;
 extern const struct gfs2_glock_operations *gfs2_glops_list[];
 
+extern void gfs2_ail_flush(struct gfs2_glock *gl);
+
 #endif /* __GLOPS_DOT_H__ */
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 870a89d..0a064e9 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -20,7 +20,6 @@
 
 #define DIO_WAIT	0x00000010
 #define DIO_METADATA	0x00000020
-#define DIO_ALL		0x00000100
 
 struct gfs2_log_operations;
 struct gfs2_log_element;
@@ -200,6 +199,8 @@ enum {
 	GLF_INITIAL			= 10,
 	GLF_FROZEN			= 11,
 	GLF_QUEUED			= 12,
+	GLF_LRU				= 13,
+	GLF_OBJECT			= 14, /* Used only for tracing */
 };
 
 struct gfs2_glock {
@@ -234,6 +235,7 @@ struct gfs2_glock {
 
 	struct list_head gl_ail_list;
 	atomic_t gl_ail_count;
+	atomic_t gl_revokes;
 	struct delayed_work gl_work;
 	struct work_struct gl_delete;
 	struct rcu_head gl_rcu;
@@ -374,8 +376,6 @@ struct gfs2_ail {
 	unsigned int ai_first;
 	struct list_head ai_ail1_list;
 	struct list_head ai_ail2_list;
-
-	u64 ai_sync_gen;
 };
 
 struct gfs2_journal_extent {
@@ -488,7 +488,6 @@ struct gfs2_sb_host {
 
 	char sb_lockproto[GFS2_LOCKNAME_LEN];
 	char sb_locktable[GFS2_LOCKNAME_LEN];
-	u8 sb_uuid[16];
 };
 
 /*
@@ -654,7 +653,6 @@ struct gfs2_sbd {
 	spinlock_t sd_ail_lock;
 	struct list_head sd_ail1_list;
 	struct list_head sd_ail2_list;
-	u64 sd_ail_sync_gen;
 
 	/* Replay stuff */
 
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 9134dcb..03e0c52 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1,23 +1,25 @@
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2004-2011 Red Hat, Inc.  All rights reserved.
  *
  * This copyrighted material is made available to anyone wishing to use,
  * modify, copy, or redistribute it subject to the terms and conditions
  * of the GNU General Public License version 2.
  */
 
-#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
+#include <linux/namei.h>
+#include <linux/mm.h>
+#include <linux/xattr.h>
 #include <linux/posix_acl.h>
-#include <linux/sort.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
+#include <linux/fiemap.h>
 #include <linux/security.h>
-#include <linux/time.h>
+#include <asm/uaccess.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -26,19 +28,14 @@
 #include "dir.h"
 #include "xattr.h"
 #include "glock.h"
-#include "glops.h"
 #include "inode.h"
-#include "log.h"
 #include "meta_io.h"
 #include "quota.h"
 #include "rgrp.h"
 #include "trans.h"
 #include "util.h"
-
-struct gfs2_inum_range_host {
-	u64 ir_start;
-	u64 ir_length;
-};
+#include "super.h"
+#include "glops.h"
 
 struct gfs2_skip_data {
 	u64 no_addr;
@@ -74,14 +71,14 @@ static int iget_set(struct inode *inode, void *opaque)
 	return 0;
 }
 
-struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr)
+struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr, int non_block)
 {
 	unsigned long hash = (unsigned long)no_addr;
 	struct gfs2_skip_data data;
 
 	data.no_addr = no_addr;
 	data.skipped = 0;
-	data.non_block = 0;
+	data.non_block = non_block;
 	return ilookup5(sb, hash, iget_test, &data);
 }
 
@@ -248,203 +245,6 @@ fail_iput:
 	goto fail;
 }
 
-static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
-{
-	const struct gfs2_dinode *str = buf;
-	struct timespec atime;
-	u16 height, depth;
-
-	if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr)))
-		goto corrupt;
-	ip->i_no_formal_ino = be64_to_cpu(str->di_num.no_formal_ino);
-	ip->i_inode.i_mode = be32_to_cpu(str->di_mode);
-	ip->i_inode.i_rdev = 0;
-	switch (ip->i_inode.i_mode & S_IFMT) {
-	case S_IFBLK:
-	case S_IFCHR:
-		ip->i_inode.i_rdev = MKDEV(be32_to_cpu(str->di_major),
-					   be32_to_cpu(str->di_minor));
-		break;
-	};
-
-	ip->i_inode.i_uid = be32_to_cpu(str->di_uid);
-	ip->i_inode.i_gid = be32_to_cpu(str->di_gid);
-	/*
-	 * We will need to review setting the nlink count here in the
-	 * light of the forthcoming ro bind mount work. This is a reminder
-	 * to do that.
-	 */
-	ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink);
-	i_size_write(&ip->i_inode, be64_to_cpu(str->di_size));
-	gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
-	atime.tv_sec = be64_to_cpu(str->di_atime);
-	atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
-	if (timespec_compare(&ip->i_inode.i_atime, &atime) < 0)
-		ip->i_inode.i_atime = atime;
-	ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime);
-	ip->i_inode.i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec);
-	ip->i_inode.i_ctime.tv_sec = be64_to_cpu(str->di_ctime);
-	ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec);
-
-	ip->i_goal = be64_to_cpu(str->di_goal_meta);
-	ip->i_generation = be64_to_cpu(str->di_generation);
-
-	ip->i_diskflags = be32_to_cpu(str->di_flags);
-	gfs2_set_inode_flags(&ip->i_inode);
-	height = be16_to_cpu(str->di_height);
-	if (unlikely(height > GFS2_MAX_META_HEIGHT))
-		goto corrupt;
-	ip->i_height = (u8)height;
-
-	depth = be16_to_cpu(str->di_depth);
-	if (unlikely(depth > GFS2_DIR_MAX_DEPTH))
-		goto corrupt;
-	ip->i_depth = (u8)depth;
-	ip->i_entries = be32_to_cpu(str->di_entries);
-
-	ip->i_eattr = be64_to_cpu(str->di_eattr);
-	if (S_ISREG(ip->i_inode.i_mode))
-		gfs2_set_aops(&ip->i_inode);
-
-	return 0;
-corrupt:
-	if (gfs2_consist_inode(ip))
-		gfs2_dinode_print(ip);
-	return -EIO;
-}
-
-/**
- * gfs2_inode_refresh - Refresh the incore copy of the dinode
- * @ip: The GFS2 inode
- *
- * Returns: errno
- */
-
-int gfs2_inode_refresh(struct gfs2_inode *ip)
-{
-	struct buffer_head *dibh;
-	int error;
-
-	error = gfs2_meta_inode_buffer(ip, &dibh);
-	if (error)
-		return error;
-
-	if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), dibh, GFS2_METATYPE_DI)) {
-		brelse(dibh);
-		return -EIO;
-	}
-
-	error = gfs2_dinode_in(ip, dibh->b_data);
-	brelse(dibh);
-	clear_bit(GIF_INVALID, &ip->i_flags);
-
-	return error;
-}
-
-int gfs2_dinode_dealloc(struct gfs2_inode *ip)
-{
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct gfs2_alloc *al;
-	struct gfs2_rgrpd *rgd;
-	int error;
-
-	if (gfs2_get_inode_blocks(&ip->i_inode) != 1) {
-		if (gfs2_consist_inode(ip))
-			gfs2_dinode_print(ip);
-		return -EIO;
-	}
-
-	al = gfs2_alloc_get(ip);
-	if (!al)
-		return -ENOMEM;
-
-	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
-	if (error)
-		goto out;
-
-	error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
-	if (error)
-		goto out_qs;
-
-	rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr);
-	if (!rgd) {
-		gfs2_consist_inode(ip);
-		error = -EIO;
-		goto out_rindex_relse;
-	}
-
-	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
-				   &al->al_rgd_gh);
-	if (error)
-		goto out_rindex_relse;
-
-	error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA, 1);
-	if (error)
-		goto out_rg_gunlock;
-
-	set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
-	set_bit(GLF_LFLUSH, &ip->i_gl->gl_flags);
-
-	gfs2_free_di(rgd, ip);
-
-	gfs2_trans_end(sdp);
-
-out_rg_gunlock:
-	gfs2_glock_dq_uninit(&al->al_rgd_gh);
-out_rindex_relse:
-	gfs2_glock_dq_uninit(&al->al_ri_gh);
-out_qs:
-	gfs2_quota_unhold(ip);
-out:
-	gfs2_alloc_put(ip);
-	return error;
-}
-
-/**
- * gfs2_change_nlink - Change nlink count on inode
- * @ip: The GFS2 inode
- * @diff: The change in the nlink count required
- *
- * Returns: errno
- */
-int gfs2_change_nlink(struct gfs2_inode *ip, int diff)
-{
-	struct buffer_head *dibh;
-	u32 nlink;
-	int error;
-
-	BUG_ON(diff != 1 && diff != -1);
-	nlink = ip->i_inode.i_nlink + diff;
-
-	/* If we are reducing the nlink count, but the new value ends up being
-	   bigger than the old one, we must have underflowed. */
-	if (diff < 0 && nlink > ip->i_inode.i_nlink) {
-		if (gfs2_consist_inode(ip))
-			gfs2_dinode_print(ip);
-		return -EIO;
-	}
-
-	error = gfs2_meta_inode_buffer(ip, &dibh);
-	if (error)
-		return error;
-
-	if (diff > 0)
-		inc_nlink(&ip->i_inode);
-	else
-		drop_nlink(&ip->i_inode);
-
-	ip->i_inode.i_ctime = CURRENT_TIME;
-
-	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-	gfs2_dinode_out(ip, dibh->b_data);
-	brelse(dibh);
-	mark_inode_dirty(&ip->i_inode);
-
-	if (ip->i_inode.i_nlink == 0)
-		gfs2_unlink_di(&ip->i_inode); /* mark inode unlinked */
-
-	return error;
-}
 
 struct inode *gfs2_lookup_simple(struct inode *dip, const char *name)
 {
@@ -543,7 +343,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
 
 	/*  Don't create entries in an unlinked directory  */
 	if (!dip->i_inode.i_nlink)
-		return -EPERM;
+		return -ENOENT;
 
 	error = gfs2_dir_check(&dip->i_inode, name, NULL);
 	switch (error) {
@@ -613,21 +413,44 @@ out:
 	return error;
 }
 
+static void gfs2_init_dir(struct buffer_head *dibh,
+			  const struct gfs2_inode *parent)
+{
+	struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
+	struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1);
+
+	gfs2_qstr2dirent(&gfs2_qdot, GFS2_DIRENT_SIZE(gfs2_qdot.len), dent);
+	dent->de_inum = di->di_num; /* already GFS2 endian */
+	dent->de_type = cpu_to_be16(DT_DIR);
+
+	dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1));
+	gfs2_qstr2dirent(&gfs2_qdotdot, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent);
+	gfs2_inum_out(parent, dent);
+	dent->de_type = cpu_to_be16(DT_DIR);
+	
+}
+
 /**
  * init_dinode - Fill in a new dinode structure
- * @dip: the directory this inode is being created in
+ * @dip: The directory this inode is being created in
  * @gl: The glock covering the new inode
- * @inum: the inode number
- * @mode: the file permissions
- * @uid:
- * @gid:
+ * @inum: The inode number
+ * @mode: The file permissions
+ * @uid: The uid of the new inode
+ * @gid: The gid of the new inode
+ * @generation: The generation number of the new inode
+ * @dev: The device number (if a device node)
+ * @symname: The symlink destination (if a symlink)
+ * @size: The inode size (ignored for directories)
+ * @bhp: The buffer head (returned to caller)
  *
  */
 
 static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
 			const struct gfs2_inum_host *inum, unsigned int mode,
 			unsigned int uid, unsigned int gid,
-			const u64 *generation, dev_t dev, struct buffer_head **bhp)
+			const u64 *generation, dev_t dev, const char *symname,
+			unsigned size, struct buffer_head **bhp)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
 	struct gfs2_dinode *di;
@@ -646,7 +469,7 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
 	di->di_uid = cpu_to_be32(uid);
 	di->di_gid = cpu_to_be32(gid);
 	di->di_nlink = 0;
-	di->di_size = 0;
+	di->di_size = cpu_to_be64(size);
 	di->di_blocks = cpu_to_be64(1);
 	di->di_atime = di->di_mtime = di->di_ctime = cpu_to_be64(tv.tv_sec);
 	di->di_major = cpu_to_be32(MAJOR(dev));
@@ -654,16 +477,6 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
 	di->di_goal_meta = di->di_goal_data = cpu_to_be64(inum->no_addr);
 	di->di_generation = cpu_to_be64(*generation);
 	di->di_flags = 0;
-
-	if (S_ISREG(mode)) {
-		if ((dip->i_diskflags & GFS2_DIF_INHERIT_JDATA) ||
-		    gfs2_tune_get(sdp, gt_new_files_jdata))
-			di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA);
-	} else if (S_ISDIR(mode)) {
-		di->di_flags |= cpu_to_be32(dip->i_diskflags &
-					    GFS2_DIF_INHERIT_JDATA);
-	}
-
 	di->__pad1 = 0;
 	di->di_payload_format = cpu_to_be32(S_ISDIR(mode) ? GFS2_FORMAT_DE : 0);
 	di->di_height = 0;
@@ -677,7 +490,26 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
 	di->di_mtime_nsec = cpu_to_be32(tv.tv_nsec);
 	di->di_ctime_nsec = cpu_to_be32(tv.tv_nsec);
 	memset(&di->di_reserved, 0, sizeof(di->di_reserved));
-	
+
+	switch(mode & S_IFMT) {	
+	case S_IFREG:
+		if ((dip->i_diskflags & GFS2_DIF_INHERIT_JDATA) ||
+		    gfs2_tune_get(sdp, gt_new_files_jdata))
+			di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA);
+		break;
+	case S_IFDIR:
+		di->di_flags |= cpu_to_be32(dip->i_diskflags &
+					    GFS2_DIF_INHERIT_JDATA);
+		di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA);
+		di->di_size = cpu_to_be64(sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode));
+		di->di_entries = cpu_to_be32(2);
+		gfs2_init_dir(dibh, dip);
+		break;
+	case S_IFLNK:
+		memcpy(dibh->b_data + sizeof(struct gfs2_dinode), symname, size);
+		break;
+	}
+
 	set_buffer_uptodate(dibh);
 
 	*bhp = dibh;
@@ -685,7 +517,8 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
 
 static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
 		       unsigned int mode, const struct gfs2_inum_host *inum,
-		       const u64 *generation, dev_t dev, struct buffer_head **bhp)
+		       const u64 *generation, dev_t dev, const char *symname,
+		       unsigned int size, struct buffer_head **bhp)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
 	unsigned int uid, gid;
@@ -707,7 +540,7 @@ static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
 	if (error)
 		goto out_quota;
 
-	init_dinode(dip, gl, inum, mode, uid, gid, generation, dev, bhp);
+	init_dinode(dip, gl, inum, mode, uid, gid, generation, dev, symname, size, bhp);
 	gfs2_quota_change(dip, +1, uid, gid);
 	gfs2_trans_end(sdp);
 
@@ -761,14 +594,16 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
 			goto fail_quota_locks;
 	}
 
-	error = gfs2_dir_add(&dip->i_inode, name, ip, IF2DT(ip->i_inode.i_mode));
+	error = gfs2_dir_add(&dip->i_inode, name, ip);
 	if (error)
 		goto fail_end_trans;
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 	if (error)
 		goto fail_end_trans;
-	ip->i_inode.i_nlink = 1;
+	inc_nlink(&ip->i_inode);
+	if (S_ISDIR(ip->i_inode.i_mode))
+		inc_nlink(&ip->i_inode);
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 	gfs2_dinode_out(ip, dibh->b_data);
 	brelse(dibh);
@@ -815,27 +650,25 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
 }
 
 /**
- * gfs2_createi - Create a new inode
- * @ghs: An array of two holders
- * @name: The name of the new file
- * @mode: the permissions on the new inode
- *
- * @ghs[0] is an initialized holder for the directory
- * @ghs[1] is the holder for the inode lock
+ * gfs2_create_inode - Create a new inode
+ * @dir: The parent directory
+ * @dentry: The new dentry
+ * @mode: The permissions on the new inode
+ * @dev: For device nodes, this is the device number
+ * @symname: For symlinks, this is the link destination
+ * @size: The initial size of the inode (ignored for directories)
  *
- * If the return value is not NULL, the glocks on both the directory and the new
- * file are held.  A transaction has been started and an inplace reservation
- * is held, as well.
- *
- * Returns: An inode
+ * Returns: 0 on success, or error code
  */
 
-struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
-			   unsigned int mode, dev_t dev)
+static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
+			     unsigned int mode, dev_t dev, const char *symname,
+			     unsigned int size)
 {
+	const struct qstr *name = &dentry->d_name;
+	struct gfs2_holder ghs[2];
 	struct inode *inode = NULL;
-	struct gfs2_inode *dip = ghs->gh_gl->gl_object;
-	struct inode *dir = &dip->i_inode;
+	struct gfs2_inode *dip = GFS2_I(dir);
 	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
 	struct gfs2_inum_host inum = { .no_addr = 0, .no_formal_ino = 0 };
 	int error;
@@ -843,10 +676,9 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
 	struct buffer_head *bh = NULL;
 
 	if (!name->len || name->len > GFS2_FNAMESIZE)
-		return ERR_PTR(-ENAMETOOLONG);
+		return -ENAMETOOLONG;
 
-	gfs2_holder_reinit(LM_ST_EXCLUSIVE, 0, ghs);
-	error = gfs2_glock_nq(ghs);
+	error = gfs2_glock_nq_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
 	if (error)
 		goto fail;
 
@@ -864,7 +696,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
 	if (error)
 		goto fail_gunlock;
 
-	error = make_dinode(dip, ghs[1].gh_gl, mode, &inum, &generation, dev, &bh);
+	error = make_dinode(dip, ghs[1].gh_gl, mode, &inum, &generation, dev, symname, size, &bh);
 	if (error)
 		goto fail_gunlock2;
 
@@ -891,18 +723,852 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
 
 	if (bh)
 		brelse(bh);
-	return inode;
+
+	gfs2_trans_end(sdp);
+	if (dip->i_alloc->al_rgd)
+		gfs2_inplace_release(dip);
+	gfs2_quota_unlock(dip);
+	gfs2_alloc_put(dip);
+	gfs2_glock_dq_uninit_m(2, ghs);
+	mark_inode_dirty(inode);
+	d_instantiate(dentry, inode);
+	return 0;
 
 fail_gunlock2:
 	gfs2_glock_dq_uninit(ghs + 1);
 	if (inode && !IS_ERR(inode))
 		iput(inode);
 fail_gunlock:
-	gfs2_glock_dq(ghs);
+	gfs2_glock_dq_uninit(ghs);
 fail:
 	if (bh)
 		brelse(bh);
-	return ERR_PTR(error);
+	return error;
+}
+
+/**
+ * gfs2_create - Create a file
+ * @dir: The directory in which to create the file
+ * @dentry: The dentry of the new file
+ * @mode: The mode of the new file
+ *
+ * Returns: errno
+ */
+
+static int gfs2_create(struct inode *dir, struct dentry *dentry,
+		       int mode, struct nameidata *nd)
+{
+	struct inode *inode;
+	int ret;
+
+	for (;;) {
+		ret = gfs2_create_inode(dir, dentry, S_IFREG | mode, 0, NULL, 0);
+		if (ret != -EEXIST || (nd && (nd->flags & LOOKUP_EXCL)))
+			return ret;
+
+		inode = gfs2_lookupi(dir, &dentry->d_name, 0);
+		if (inode) {
+			if (!IS_ERR(inode))
+				break;
+			return PTR_ERR(inode);
+		}
+	}
+
+	d_instantiate(dentry, inode);
+	return 0;
+}
+
+/**
+ * gfs2_lookup - Look up a filename in a directory and return its inode
+ * @dir: The directory inode
+ * @dentry: The dentry of the new inode
+ * @nd: passed from Linux VFS, ignored by us
+ *
+ * Called by the VFS layer. Lock dir and call gfs2_lookupi()
+ *
+ * Returns: errno
+ */
+
+static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
+				  struct nameidata *nd)
+{
+	struct inode *inode = NULL;
+
+	inode = gfs2_lookupi(dir, &dentry->d_name, 0);
+	if (inode && IS_ERR(inode))
+		return ERR_CAST(inode);
+
+	if (inode) {
+		struct gfs2_glock *gl = GFS2_I(inode)->i_gl;
+		struct gfs2_holder gh;
+		int error;
+		error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
+		if (error) {
+			iput(inode);
+			return ERR_PTR(error);
+		}
+		gfs2_glock_dq_uninit(&gh);
+		return d_splice_alias(inode, dentry);
+	}
+	d_add(dentry, inode);
+
+	return NULL;
+}
+
+/**
+ * gfs2_link - Link to a file
+ * @old_dentry: The inode to link
+ * @dir: Add link to this directory
+ * @dentry: The name of the link
+ *
+ * Link the inode in "old_dentry" into the directory "dir" with the
+ * name in "dentry".
+ *
+ * Returns: errno
+ */
+
+static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
+		     struct dentry *dentry)
+{
+	struct gfs2_inode *dip = GFS2_I(dir);
+	struct gfs2_sbd *sdp = GFS2_SB(dir);
+	struct inode *inode = old_dentry->d_inode;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder ghs[2];
+	struct buffer_head *dibh;
+	int alloc_required;
+	int error;
+
+	if (S_ISDIR(inode->i_mode))
+		return -EPERM;
+
+	gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
+
+	error = gfs2_glock_nq(ghs); /* parent */
+	if (error)
+		goto out_parent;
+
+	error = gfs2_glock_nq(ghs + 1); /* child */
+	if (error)
+		goto out_child;
+
+	error = -ENOENT;
+	if (inode->i_nlink == 0)
+		goto out_gunlock;
+
+	error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC, 0);
+	if (error)
+		goto out_gunlock;
+
+	error = gfs2_dir_check(dir, &dentry->d_name, NULL);
+	switch (error) {
+	case -ENOENT:
+		break;
+	case 0:
+		error = -EEXIST;
+	default:
+		goto out_gunlock;
+	}
+
+	error = -EINVAL;
+	if (!dip->i_inode.i_nlink)
+		goto out_gunlock;
+	error = -EFBIG;
+	if (dip->i_entries == (u32)-1)
+		goto out_gunlock;
+	error = -EPERM;
+	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+		goto out_gunlock;
+	error = -EINVAL;
+	if (!ip->i_inode.i_nlink)
+		goto out_gunlock;
+	error = -EMLINK;
+	if (ip->i_inode.i_nlink == (u32)-1)
+		goto out_gunlock;
+
+	alloc_required = error = gfs2_diradd_alloc_required(dir, &dentry->d_name);
+	if (error < 0)
+		goto out_gunlock;
+	error = 0;
+
+	if (alloc_required) {
+		struct gfs2_alloc *al = gfs2_alloc_get(dip);
+		if (!al) {
+			error = -ENOMEM;
+			goto out_gunlock;
+		}
+
+		error = gfs2_quota_lock_check(dip);
+		if (error)
+			goto out_alloc;
+
+		al->al_requested = sdp->sd_max_dirres;
+
+		error = gfs2_inplace_reserve(dip);
+		if (error)
+			goto out_gunlock_q;
+
+		error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
+					 gfs2_rg_blocks(al) +
+					 2 * RES_DINODE + RES_STATFS +
+					 RES_QUOTA, 0);
+		if (error)
+			goto out_ipres;
+	} else {
+		error = gfs2_trans_begin(sdp, 2 * RES_DINODE + RES_LEAF, 0);
+		if (error)
+			goto out_ipres;
+	}
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (error)
+		goto out_end_trans;
+
+	error = gfs2_dir_add(dir, &dentry->d_name, ip);
+	if (error)
+		goto out_brelse;
+
+	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+	inc_nlink(&ip->i_inode);
+	ip->i_inode.i_ctime = CURRENT_TIME;
+	gfs2_dinode_out(ip, dibh->b_data);
+	mark_inode_dirty(&ip->i_inode);
+
+out_brelse:
+	brelse(dibh);
+out_end_trans:
+	gfs2_trans_end(sdp);
+out_ipres:
+	if (alloc_required)
+		gfs2_inplace_release(dip);
+out_gunlock_q:
+	if (alloc_required)
+		gfs2_quota_unlock(dip);
+out_alloc:
+	if (alloc_required)
+		gfs2_alloc_put(dip);
+out_gunlock:
+	gfs2_glock_dq(ghs + 1);
+out_child:
+	gfs2_glock_dq(ghs);
+out_parent:
+	gfs2_holder_uninit(ghs);
+	gfs2_holder_uninit(ghs + 1);
+	if (!error) {
+		ihold(inode);
+		d_instantiate(dentry, inode);
+		mark_inode_dirty(inode);
+	}
+	return error;
+}
+
+/*
+ * gfs2_unlink_ok - check to see that a inode is still in a directory
+ * @dip: the directory
+ * @name: the name of the file
+ * @ip: the inode
+ *
+ * Assumes that the lock on (at least) @dip is held.
+ *
+ * Returns: 0 if the parent/child relationship is correct, errno if it isn't
+ */
+
+static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
+			  const struct gfs2_inode *ip)
+{
+	int error;
+
+	if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode))
+		return -EPERM;
+
+	if ((dip->i_inode.i_mode & S_ISVTX) &&
+	    dip->i_inode.i_uid != current_fsuid() &&
+	    ip->i_inode.i_uid != current_fsuid() && !capable(CAP_FOWNER))
+		return -EPERM;
+
+	if (IS_APPEND(&dip->i_inode))
+		return -EPERM;
+
+	error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, 0);
+	if (error)
+		return error;
+
+	error = gfs2_dir_check(&dip->i_inode, name, ip);
+	if (error)
+		return error;
+
+	return 0;
+}
+
+/**
+ * gfs2_unlink_inode - Removes an inode from its parent dir and unlinks it
+ * @dip: The parent directory
+ * @name: The name of the entry in the parent directory
+ * @bh: The inode buffer for the inode to be removed
+ * @inode: The inode to be removed
+ *
+ * Called with all the locks and in a transaction. This will only be
+ * called for a directory after it has been checked to ensure it is empty.
+ *
+ * Returns: 0 on success, or an error
+ */
+
+static int gfs2_unlink_inode(struct gfs2_inode *dip,
+			     const struct dentry *dentry,
+			     struct buffer_head *bh)
+{
+	struct inode *inode = dentry->d_inode;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	int error;
+
+	error = gfs2_dir_del(dip, dentry);
+	if (error)
+		return error;
+
+	ip->i_entries = 0;
+	inode->i_ctime = CURRENT_TIME;
+	if (S_ISDIR(inode->i_mode))
+		clear_nlink(inode);
+	else
+		drop_nlink(inode);
+	gfs2_trans_add_bh(ip->i_gl, bh, 1);
+	gfs2_dinode_out(ip, bh->b_data);
+	mark_inode_dirty(inode);
+	if (inode->i_nlink == 0)
+		gfs2_unlink_di(inode);
+	return 0;
+}
+
+
+/**
+ * gfs2_unlink - Unlink an inode (this does rmdir as well)
+ * @dir: The inode of the directory containing the inode to unlink
+ * @dentry: The file itself
+ *
+ * This routine uses the type of the inode as a flag to figure out
+ * whether this is an unlink or an rmdir.
+ *
+ * Returns: errno
+ */
+
+static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct gfs2_inode *dip = GFS2_I(dir);
+	struct gfs2_sbd *sdp = GFS2_SB(dir);
+	struct inode *inode = dentry->d_inode;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct buffer_head *bh;
+	struct gfs2_holder ghs[3];
+	struct gfs2_rgrpd *rgd;
+	struct gfs2_holder ri_gh;
+	int error;
+
+	error = gfs2_rindex_hold(sdp, &ri_gh);
+	if (error)
+		return error;
+
+	gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
+	gfs2_holder_init(ip->i_gl,  LM_ST_EXCLUSIVE, 0, ghs + 1);
+
+	rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr);
+	gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2);
+
+
+	error = gfs2_glock_nq(ghs); /* parent */
+	if (error)
+		goto out_parent;
+
+	error = gfs2_glock_nq(ghs + 1); /* child */
+	if (error)
+		goto out_child;
+
+	error = -ENOENT;
+	if (inode->i_nlink == 0)
+		goto out_rgrp;
+
+	if (S_ISDIR(inode->i_mode)) {
+		error = -ENOTEMPTY;
+		if (ip->i_entries > 2 || inode->i_nlink > 2)
+			goto out_rgrp;
+	}
+
+	error = gfs2_glock_nq(ghs + 2); /* rgrp */
+	if (error)
+		goto out_rgrp;
+
+	error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
+	if (error)
+		goto out_gunlock;
+
+	error = gfs2_trans_begin(sdp, 2*RES_DINODE + 3*RES_LEAF + RES_RG_BIT, 0);
+	if (error)
+		goto out_gunlock;
+
+	error = gfs2_meta_inode_buffer(ip, &bh);
+	if (error)
+		goto out_end_trans;
+
+	error = gfs2_unlink_inode(dip, dentry, bh);
+	brelse(bh);
+
+out_end_trans:
+	gfs2_trans_end(sdp);
+out_gunlock:
+	gfs2_glock_dq(ghs + 2);
+out_rgrp:
+	gfs2_holder_uninit(ghs + 2);
+	gfs2_glock_dq(ghs + 1);
+out_child:
+	gfs2_holder_uninit(ghs + 1);
+	gfs2_glock_dq(ghs);
+out_parent:
+	gfs2_holder_uninit(ghs);
+	gfs2_glock_dq_uninit(&ri_gh);
+	return error;
+}
+
+/**
+ * gfs2_symlink - Create a symlink
+ * @dir: The directory to create the symlink in
+ * @dentry: The dentry to put the symlink in
+ * @symname: The thing which the link points to
+ *
+ * Returns: errno
+ */
+
+static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
+			const char *symname)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(dir);
+	unsigned int size;
+
+	size = strlen(symname);
+	if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1)
+		return -ENAMETOOLONG;
+
+	return gfs2_create_inode(dir, dentry, S_IFLNK | S_IRWXUGO, 0, symname, size);
+}
+
+/**
+ * gfs2_mkdir - Make a directory
+ * @dir: The parent directory of the new one
+ * @dentry: The dentry of the new directory
+ * @mode: The mode of the new directory
+ *
+ * Returns: errno
+ */
+
+static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	return gfs2_create_inode(dir, dentry, S_IFDIR | mode, 0, NULL, 0);
+}
+
+/**
+ * gfs2_mknod - Make a special file
+ * @dir: The directory in which the special file will reside
+ * @dentry: The dentry of the special file
+ * @mode: The mode of the special file
+ * @dev: The device specification of the special file
+ *
+ */
+
+static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode,
+		      dev_t dev)
+{
+	return gfs2_create_inode(dir, dentry, mode, dev, NULL, 0);
+}
+
+/*
+ * gfs2_ok_to_move - check if it's ok to move a directory to another directory
+ * @this: move this
+ * @to: to here
+ *
+ * Follow @to back to the root and make sure we don't encounter @this
+ * Assumes we already hold the rename lock.
+ *
+ * Returns: errno
+ */
+
+static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
+{
+	struct inode *dir = &to->i_inode;
+	struct super_block *sb = dir->i_sb;
+	struct inode *tmp;
+	int error = 0;
+
+	igrab(dir);
+
+	for (;;) {
+		if (dir == &this->i_inode) {
+			error = -EINVAL;
+			break;
+		}
+		if (dir == sb->s_root->d_inode) {
+			error = 0;
+			break;
+		}
+
+		tmp = gfs2_lookupi(dir, &gfs2_qdotdot, 1);
+		if (IS_ERR(tmp)) {
+			error = PTR_ERR(tmp);
+			break;
+		}
+
+		iput(dir);
+		dir = tmp;
+	}
+
+	iput(dir);
+
+	return error;
+}
+
+/**
+ * gfs2_rename - Rename a file
+ * @odir: Parent directory of old file name
+ * @odentry: The old dentry of the file
+ * @ndir: Parent directory of new file name
+ * @ndentry: The new dentry of the file
+ *
+ * Returns: errno
+ */
+
+static int gfs2_rename(struct inode *odir, struct dentry *odentry,
+		       struct inode *ndir, struct dentry *ndentry)
+{
+	struct gfs2_inode *odip = GFS2_I(odir);
+	struct gfs2_inode *ndip = GFS2_I(ndir);
+	struct gfs2_inode *ip = GFS2_I(odentry->d_inode);
+	struct gfs2_inode *nip = NULL;
+	struct gfs2_sbd *sdp = GFS2_SB(odir);
+	struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }, ri_gh;
+	struct gfs2_rgrpd *nrgd;
+	unsigned int num_gh;
+	int dir_rename = 0;
+	int alloc_required = 0;
+	unsigned int x;
+	int error;
+
+	if (ndentry->d_inode) {
+		nip = GFS2_I(ndentry->d_inode);
+		if (ip == nip)
+			return 0;
+	}
+
+	error = gfs2_rindex_hold(sdp, &ri_gh);
+	if (error)
+		return error;
+
+	if (odip != ndip) {
+		error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE,
+					   0, &r_gh);
+		if (error)
+			goto out;
+
+		if (S_ISDIR(ip->i_inode.i_mode)) {
+			dir_rename = 1;
+			/* don't move a dirctory into it's subdir */
+			error = gfs2_ok_to_move(ip, ndip);
+			if (error)
+				goto out_gunlock_r;
+		}
+	}
+
+	num_gh = 1;
+	gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
+	if (odip != ndip) {
+		gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
+		num_gh++;
+	}
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
+	num_gh++;
+
+	if (nip) {
+		gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
+		num_gh++;
+		/* grab the resource lock for unlink flag twiddling 
+		 * this is the case of the target file already existing
+		 * so we unlink before doing the rename
+		 */
+		nrgd = gfs2_blk2rgrpd(sdp, nip->i_no_addr);
+		if (nrgd)
+			gfs2_holder_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh++);
+	}
+
+	for (x = 0; x < num_gh; x++) {
+		error = gfs2_glock_nq(ghs + x);
+		if (error)
+			goto out_gunlock;
+	}
+
+	error = -ENOENT;
+	if (ip->i_inode.i_nlink == 0)
+		goto out_gunlock;
+
+	/* Check out the old directory */
+
+	error = gfs2_unlink_ok(odip, &odentry->d_name, ip);
+	if (error)
+		goto out_gunlock;
+
+	/* Check out the new directory */
+
+	if (nip) {
+		error = gfs2_unlink_ok(ndip, &ndentry->d_name, nip);
+		if (error)
+			goto out_gunlock;
+
+		if (nip->i_inode.i_nlink == 0) {
+			error = -EAGAIN;
+			goto out_gunlock;
+		}
+
+		if (S_ISDIR(nip->i_inode.i_mode)) {
+			if (nip->i_entries < 2) {
+				gfs2_consist_inode(nip);
+				error = -EIO;
+				goto out_gunlock;
+			}
+			if (nip->i_entries > 2) {
+				error = -ENOTEMPTY;
+				goto out_gunlock;
+			}
+		}
+	} else {
+		error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC, 0);
+		if (error)
+			goto out_gunlock;
+
+		error = gfs2_dir_check(ndir, &ndentry->d_name, NULL);
+		switch (error) {
+		case -ENOENT:
+			error = 0;
+			break;
+		case 0:
+			error = -EEXIST;
+		default:
+			goto out_gunlock;
+		};
+
+		if (odip != ndip) {
+			if (!ndip->i_inode.i_nlink) {
+				error = -ENOENT;
+				goto out_gunlock;
+			}
+			if (ndip->i_entries == (u32)-1) {
+				error = -EFBIG;
+				goto out_gunlock;
+			}
+			if (S_ISDIR(ip->i_inode.i_mode) &&
+			    ndip->i_inode.i_nlink == (u32)-1) {
+				error = -EMLINK;
+				goto out_gunlock;
+			}
+		}
+	}
+
+	/* Check out the dir to be renamed */
+
+	if (dir_rename) {
+		error = gfs2_permission(odentry->d_inode, MAY_WRITE, 0);
+		if (error)
+			goto out_gunlock;
+	}
+
+	if (nip == NULL)
+		alloc_required = gfs2_diradd_alloc_required(ndir, &ndentry->d_name);
+	error = alloc_required;
+	if (error < 0)
+		goto out_gunlock;
+	error = 0;
+
+	if (alloc_required) {
+		struct gfs2_alloc *al = gfs2_alloc_get(ndip);
+		if (!al) {
+			error = -ENOMEM;
+			goto out_gunlock;
+		}
+
+		error = gfs2_quota_lock_check(ndip);
+		if (error)
+			goto out_alloc;
+
+		al->al_requested = sdp->sd_max_dirres;
+
+		error = gfs2_inplace_reserve_ri(ndip);
+		if (error)
+			goto out_gunlock_q;
+
+		error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
+					 gfs2_rg_blocks(al) +
+					 4 * RES_DINODE + 4 * RES_LEAF +
+					 RES_STATFS + RES_QUOTA + 4, 0);
+		if (error)
+			goto out_ipreserv;
+	} else {
+		error = gfs2_trans_begin(sdp, 4 * RES_DINODE +
+					 5 * RES_LEAF + 4, 0);
+		if (error)
+			goto out_gunlock;
+	}
+
+	/* Remove the target file, if it exists */
+
+	if (nip) {
+		struct buffer_head *bh;
+		error = gfs2_meta_inode_buffer(nip, &bh);
+		if (error)
+			goto out_end_trans;
+		error = gfs2_unlink_inode(ndip, ndentry, bh);
+		brelse(bh);
+	}
+
+	if (dir_rename) {
+		error = gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR);
+		if (error)
+			goto out_end_trans;
+	} else {
+		struct buffer_head *dibh;
+		error = gfs2_meta_inode_buffer(ip, &dibh);
+		if (error)
+			goto out_end_trans;
+		ip->i_inode.i_ctime = CURRENT_TIME;
+		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+		gfs2_dinode_out(ip, dibh->b_data);
+		brelse(dibh);
+	}
+
+	error = gfs2_dir_del(odip, odentry);
+	if (error)
+		goto out_end_trans;
+
+	error = gfs2_dir_add(ndir, &ndentry->d_name, ip);
+	if (error)
+		goto out_end_trans;
+
+out_end_trans:
+	gfs2_trans_end(sdp);
+out_ipreserv:
+	if (alloc_required)
+		gfs2_inplace_release(ndip);
+out_gunlock_q:
+	if (alloc_required)
+		gfs2_quota_unlock(ndip);
+out_alloc:
+	if (alloc_required)
+		gfs2_alloc_put(ndip);
+out_gunlock:
+	while (x--) {
+		gfs2_glock_dq(ghs + x);
+		gfs2_holder_uninit(ghs + x);
+	}
+out_gunlock_r:
+	if (r_gh.gh_gl)
+		gfs2_glock_dq_uninit(&r_gh);
+out:
+	gfs2_glock_dq_uninit(&ri_gh);
+	return error;
+}
+
+/**
+ * gfs2_follow_link - Follow a symbolic link
+ * @dentry: The dentry of the link
+ * @nd: Data that we pass to vfs_follow_link()
+ *
+ * This can handle symlinks of any size.
+ *
+ * Returns: 0 on success or error code
+ */
+
+static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
+	struct gfs2_holder i_gh;
+	struct buffer_head *dibh;
+	unsigned int size;
+	char *buf;
+	int error;
+
+	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
+	error = gfs2_glock_nq(&i_gh);
+	if (error) {
+		gfs2_holder_uninit(&i_gh);
+		nd_set_link(nd, ERR_PTR(error));
+		return NULL;
+	}
+
+	size = (unsigned int)i_size_read(&ip->i_inode);
+	if (size == 0) {
+		gfs2_consist_inode(ip);
+		buf = ERR_PTR(-EIO);
+		goto out;
+	}
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (error) {
+		buf = ERR_PTR(error);
+		goto out;
+	}
+
+	buf = kzalloc(size + 1, GFP_NOFS);
+	if (!buf)
+		buf = ERR_PTR(-ENOMEM);
+	else
+		memcpy(buf, dibh->b_data + sizeof(struct gfs2_dinode), size);
+	brelse(dibh);
+out:
+	gfs2_glock_dq_uninit(&i_gh);
+	nd_set_link(nd, buf);
+	return NULL;
+}
+
+static void gfs2_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
+{
+	char *s = nd_get_link(nd);
+	if (!IS_ERR(s))
+		kfree(s);
+}
+
+/**
+ * gfs2_permission -
+ * @inode: The inode
+ * @mask: The mask to be tested
+ * @flags: Indicates whether this is an RCU path walk or not
+ *
+ * This may be called from the VFS directly, or from within GFS2 with the
+ * inode locked, so we look to see if the glock is already locked and only
+ * lock the glock if its not already been done.
+ *
+ * Returns: errno
+ */
+
+int gfs2_permission(struct inode *inode, int mask, unsigned int flags)
+{
+	struct gfs2_inode *ip;
+	struct gfs2_holder i_gh;
+	int error;
+	int unlock = 0;
+
+
+	ip = GFS2_I(inode);
+	if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
+		if (flags & IPERM_FLAG_RCU)
+			return -ECHILD;
+		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
+		if (error)
+			return error;
+		unlock = 1;
+	}
+
+	if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
+		error = -EACCES;
+	else
+		error = generic_permission(inode, mask, flags, gfs2_check_acl);
+	if (unlock)
+		gfs2_glock_dq_uninit(&i_gh);
+
+	return error;
 }
 
 static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
@@ -928,8 +1594,6 @@ static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
  * @ip:
  * @attr:
  *
- * Called with a reference on the vnode.
- *
  * Returns: errno
  */
 
@@ -949,60 +1613,280 @@ int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
 	return error;
 }
 
-void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
-{
-	struct gfs2_dinode *str = buf;
-
-	str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
-	str->di_header.mh_type = cpu_to_be32(GFS2_METATYPE_DI);
-	str->di_header.mh_format = cpu_to_be32(GFS2_FORMAT_DI);
-	str->di_num.no_addr = cpu_to_be64(ip->i_no_addr);
-	str->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino);
-	str->di_mode = cpu_to_be32(ip->i_inode.i_mode);
-	str->di_uid = cpu_to_be32(ip->i_inode.i_uid);
-	str->di_gid = cpu_to_be32(ip->i_inode.i_gid);
-	str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink);
-	str->di_size = cpu_to_be64(i_size_read(&ip->i_inode));
-	str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
-	str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
-	str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec);
-	str->di_ctime = cpu_to_be64(ip->i_inode.i_ctime.tv_sec);
-
-	str->di_goal_meta = cpu_to_be64(ip->i_goal);
-	str->di_goal_data = cpu_to_be64(ip->i_goal);
-	str->di_generation = cpu_to_be64(ip->i_generation);
-
-	str->di_flags = cpu_to_be32(ip->i_diskflags);
-	str->di_height = cpu_to_be16(ip->i_height);
-	str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) &&
-					     !(ip->i_diskflags & GFS2_DIF_EXHASH) ?
-					     GFS2_FORMAT_DE : 0);
-	str->di_depth = cpu_to_be16(ip->i_depth);
-	str->di_entries = cpu_to_be32(ip->i_entries);
-
-	str->di_eattr = cpu_to_be64(ip->i_eattr);
-	str->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec);
-	str->di_mtime_nsec = cpu_to_be32(ip->i_inode.i_mtime.tv_nsec);
-	str->di_ctime_nsec = cpu_to_be32(ip->i_inode.i_ctime.tv_nsec);
-}
-
-void gfs2_dinode_print(const struct gfs2_inode *ip)
-{
-	printk(KERN_INFO "  no_formal_ino = %llu\n",
-	       (unsigned long long)ip->i_no_formal_ino);
-	printk(KERN_INFO "  no_addr = %llu\n",
-	       (unsigned long long)ip->i_no_addr);
-	printk(KERN_INFO "  i_size = %llu\n",
-	       (unsigned long long)i_size_read(&ip->i_inode));
-	printk(KERN_INFO "  blocks = %llu\n",
-	       (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode));
-	printk(KERN_INFO "  i_goal = %llu\n",
-	       (unsigned long long)ip->i_goal);
-	printk(KERN_INFO "  i_diskflags = 0x%.8X\n", ip->i_diskflags);
-	printk(KERN_INFO "  i_height = %u\n", ip->i_height);
-	printk(KERN_INFO "  i_depth = %u\n", ip->i_depth);
-	printk(KERN_INFO "  i_entries = %u\n", ip->i_entries);
-	printk(KERN_INFO "  i_eattr = %llu\n",
-	       (unsigned long long)ip->i_eattr);
+static int setattr_chown(struct inode *inode, struct iattr *attr)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	u32 ouid, ogid, nuid, ngid;
+	int error;
+
+	ouid = inode->i_uid;
+	ogid = inode->i_gid;
+	nuid = attr->ia_uid;
+	ngid = attr->ia_gid;
+
+	if (!(attr->ia_valid & ATTR_UID) || ouid == nuid)
+		ouid = nuid = NO_QUOTA_CHANGE;
+	if (!(attr->ia_valid & ATTR_GID) || ogid == ngid)
+		ogid = ngid = NO_QUOTA_CHANGE;
+
+	if (!gfs2_alloc_get(ip))
+		return -ENOMEM;
+
+	error = gfs2_quota_lock(ip, nuid, ngid);
+	if (error)
+		goto out_alloc;
+
+	if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
+		error = gfs2_quota_check(ip, nuid, ngid);
+		if (error)
+			goto out_gunlock_q;
+	}
+
+	error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_QUOTA, 0);
+	if (error)
+		goto out_gunlock_q;
+
+	error = gfs2_setattr_simple(ip, attr);
+	if (error)
+		goto out_end_trans;
+
+	if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
+		u64 blocks = gfs2_get_inode_blocks(&ip->i_inode);
+		gfs2_quota_change(ip, -blocks, ouid, ogid);
+		gfs2_quota_change(ip, blocks, nuid, ngid);
+	}
+
+out_end_trans:
+	gfs2_trans_end(sdp);
+out_gunlock_q:
+	gfs2_quota_unlock(ip);
+out_alloc:
+	gfs2_alloc_put(ip);
+	return error;
+}
+
+/**
+ * gfs2_setattr - Change attributes on an inode
+ * @dentry: The dentry which is changing
+ * @attr: The structure describing the change
+ *
+ * The VFS layer wants to change one or more of an inodes attributes.  Write
+ * that change out to disk.
+ *
+ * Returns: errno
+ */
+
+static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = dentry->d_inode;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder i_gh;
+	int error;
+
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
+	if (error)
+		return error;
+
+	error = -EPERM;
+	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+		goto out;
+
+	error = inode_change_ok(inode, attr);
+	if (error)
+		goto out;
+
+	if (attr->ia_valid & ATTR_SIZE)
+		error = gfs2_setattr_size(inode, attr->ia_size);
+	else if (attr->ia_valid & (ATTR_UID | ATTR_GID))
+		error = setattr_chown(inode, attr);
+	else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode))
+		error = gfs2_acl_chmod(ip, attr);
+	else
+		error = gfs2_setattr_simple(ip, attr);
+
+out:
+	gfs2_glock_dq_uninit(&i_gh);
+	if (!error)
+		mark_inode_dirty(inode);
+	return error;
+}
+
+/**
+ * gfs2_getattr - Read out an inode's attributes
+ * @mnt: The vfsmount the inode is being accessed from
+ * @dentry: The dentry to stat
+ * @stat: The inode's stats
+ *
+ * This may be called from the VFS directly, or from within GFS2 with the
+ * inode locked, so we look to see if the glock is already locked and only
+ * lock the glock if its not already been done. Note that its the NFS
+ * readdirplus operation which causes this to be called (from filldir)
+ * with the glock already held.
+ *
+ * Returns: errno
+ */
+
+static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
+			struct kstat *stat)
+{
+	struct inode *inode = dentry->d_inode;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder gh;
+	int error;
+	int unlock = 0;
+
+	if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
+		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
+		if (error)
+			return error;
+		unlock = 1;
+	}
+
+	generic_fillattr(inode, stat);
+	if (unlock)
+		gfs2_glock_dq_uninit(&gh);
+
+	return 0;
+}
+
+static int gfs2_setxattr(struct dentry *dentry, const char *name,
+			 const void *data, size_t size, int flags)
+{
+	struct inode *inode = dentry->d_inode;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder gh;
+	int ret;
+
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+	ret = gfs2_glock_nq(&gh);
+	if (ret == 0) {
+		ret = generic_setxattr(dentry, name, data, size, flags);
+		gfs2_glock_dq(&gh);
+	}
+	gfs2_holder_uninit(&gh);
+	return ret;
+}
+
+static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name,
+			     void *data, size_t size)
+{
+	struct inode *inode = dentry->d_inode;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder gh;
+	int ret;
+
+	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
+	ret = gfs2_glock_nq(&gh);
+	if (ret == 0) {
+		ret = generic_getxattr(dentry, name, data, size);
+		gfs2_glock_dq(&gh);
+	}
+	gfs2_holder_uninit(&gh);
+	return ret;
+}
+
+static int gfs2_removexattr(struct dentry *dentry, const char *name)
+{
+	struct inode *inode = dentry->d_inode;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder gh;
+	int ret;
+
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+	ret = gfs2_glock_nq(&gh);
+	if (ret == 0) {
+		ret = generic_removexattr(dentry, name);
+		gfs2_glock_dq(&gh);
+	}
+	gfs2_holder_uninit(&gh);
+	return ret;
+}
+
+static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		       u64 start, u64 len)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder gh;
+	int ret;
+
+	ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
+	if (ret)
+		return ret;
+
+	mutex_lock(&inode->i_mutex);
+
+	ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+	if (ret)
+		goto out;
+
+	if (gfs2_is_stuffed(ip)) {
+		u64 phys = ip->i_no_addr << inode->i_blkbits;
+		u64 size = i_size_read(inode);
+		u32 flags = FIEMAP_EXTENT_LAST|FIEMAP_EXTENT_NOT_ALIGNED|
+			    FIEMAP_EXTENT_DATA_INLINE;
+		phys += sizeof(struct gfs2_dinode);
+		phys += start;
+		if (start + len > size)
+			len = size - start;
+		if (start < size)
+			ret = fiemap_fill_next_extent(fieinfo, start, phys,
+						      len, flags);
+		if (ret == 1)
+			ret = 0;
+	} else {
+		ret = __generic_block_fiemap(inode, fieinfo, start, len,
+					     gfs2_block_map);
+	}
+
+	gfs2_glock_dq_uninit(&gh);
+out:
+	mutex_unlock(&inode->i_mutex);
+	return ret;
 }
 
+const struct inode_operations gfs2_file_iops = {
+	.permission = gfs2_permission,
+	.setattr = gfs2_setattr,
+	.getattr = gfs2_getattr,
+	.setxattr = gfs2_setxattr,
+	.getxattr = gfs2_getxattr,
+	.listxattr = gfs2_listxattr,
+	.removexattr = gfs2_removexattr,
+	.fiemap = gfs2_fiemap,
+};
+
+const struct inode_operations gfs2_dir_iops = {
+	.create = gfs2_create,
+	.lookup = gfs2_lookup,
+	.link = gfs2_link,
+	.unlink = gfs2_unlink,
+	.symlink = gfs2_symlink,
+	.mkdir = gfs2_mkdir,
+	.rmdir = gfs2_unlink,
+	.mknod = gfs2_mknod,
+	.rename = gfs2_rename,
+	.permission = gfs2_permission,
+	.setattr = gfs2_setattr,
+	.getattr = gfs2_getattr,
+	.setxattr = gfs2_setxattr,
+	.getxattr = gfs2_getxattr,
+	.listxattr = gfs2_listxattr,
+	.removexattr = gfs2_removexattr,
+	.fiemap = gfs2_fiemap,
+};
+
+const struct inode_operations gfs2_symlink_iops = {
+	.readlink = generic_readlink,
+	.follow_link = gfs2_follow_link,
+	.put_link = gfs2_put_link,
+	.permission = gfs2_permission,
+	.setattr = gfs2_setattr,
+	.getattr = gfs2_getattr,
+	.setxattr = gfs2_setxattr,
+	.getxattr = gfs2_getxattr,
+	.listxattr = gfs2_listxattr,
+	.removexattr = gfs2_removexattr,
+	.fiemap = gfs2_fiemap,
+};
+
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 099ca30..3160607 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -102,22 +102,16 @@ extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
 extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
 					 u64 *no_formal_ino,
 					 unsigned int blktype);
-extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
+extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr, int nonblock);
 
 extern int gfs2_inode_refresh(struct gfs2_inode *ip);
 
-extern int gfs2_dinode_dealloc(struct gfs2_inode *inode);
-extern int gfs2_change_nlink(struct gfs2_inode *ip, int diff);
 extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
 				  int is_root);
-extern struct inode *gfs2_createi(struct gfs2_holder *ghs,
-				  const struct qstr *name,
-				  unsigned int mode, dev_t dev);
 extern int gfs2_permission(struct inode *inode, int mask, unsigned int flags);
 extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
 extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
 extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
-extern void gfs2_dinode_print(const struct gfs2_inode *ip);
 
 extern const struct inode_operations gfs2_file_iops;
 extern const struct inode_operations gfs2_dir_iops;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 5b102c1..903115f 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -18,6 +18,7 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 #include <linux/bio.h>
+#include <linux/writeback.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -83,55 +84,97 @@ void gfs2_remove_from_ail(struct gfs2_bufdata *bd)
 /**
  * gfs2_ail1_start_one - Start I/O on a part of the AIL
  * @sdp: the filesystem
- * @tr: the part of the AIL
+ * @wbc: The writeback control structure
+ * @ai: The ail structure
  *
  */
 
-static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
+static int gfs2_ail1_start_one(struct gfs2_sbd *sdp,
+			       struct writeback_control *wbc,
+			       struct gfs2_ail *ai)
 __releases(&sdp->sd_ail_lock)
 __acquires(&sdp->sd_ail_lock)
 {
+	struct gfs2_glock *gl = NULL;
+	struct address_space *mapping;
 	struct gfs2_bufdata *bd, *s;
 	struct buffer_head *bh;
-	int retry;
 
-	do {
-		retry = 0;
+	list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list, bd_ail_st_list) {
+		bh = bd->bd_bh;
 
-		list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list,
-						 bd_ail_st_list) {
-			bh = bd->bd_bh;
+		gfs2_assert(sdp, bd->bd_ail == ai);
 
-			gfs2_assert(sdp, bd->bd_ail == ai);
+		if (!buffer_busy(bh)) {
+			if (!buffer_uptodate(bh))
+				gfs2_io_error_bh(sdp, bh);
+			list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list);
+			continue;
+		}
 
-			if (!buffer_busy(bh)) {
-				if (!buffer_uptodate(bh))
-					gfs2_io_error_bh(sdp, bh);
-				list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list);
-				continue;
-			}
+		if (!buffer_dirty(bh))
+			continue;
+		if (gl == bd->bd_gl)
+			continue;
+		gl = bd->bd_gl;
+		list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list);
+		mapping = bh->b_page->mapping;
+		if (!mapping)
+			continue;
+		spin_unlock(&sdp->sd_ail_lock);
+		generic_writepages(mapping, wbc);
+		spin_lock(&sdp->sd_ail_lock);
+		if (wbc->nr_to_write <= 0)
+			break;
+		return 1;
+	}
 
-			if (!buffer_dirty(bh))
-				continue;
+	return 0;
+}
 
-			list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list);
 
-			get_bh(bh);
-			spin_unlock(&sdp->sd_ail_lock);
-			lock_buffer(bh);
-			if (test_clear_buffer_dirty(bh)) {
-				bh->b_end_io = end_buffer_write_sync;
-				submit_bh(WRITE_SYNC, bh);
-			} else {
-				unlock_buffer(bh);
-				brelse(bh);
-			}
-			spin_lock(&sdp->sd_ail_lock);
-
-			retry = 1;
+/**
+ * gfs2_ail1_flush - start writeback of some ail1 entries 
+ * @sdp: The super block
+ * @wbc: The writeback control structure
+ *
+ * Writes back some ail1 entries, according to the limits in the
+ * writeback control structure
+ */
+
+void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc)
+{
+	struct list_head *head = &sdp->sd_ail1_list;
+	struct gfs2_ail *ai;
+
+	trace_gfs2_ail_flush(sdp, wbc, 1);
+	spin_lock(&sdp->sd_ail_lock);
+restart:
+	list_for_each_entry_reverse(ai, head, ai_list) {
+		if (wbc->nr_to_write <= 0)
 			break;
-		}
-	} while (retry);
+		if (gfs2_ail1_start_one(sdp, wbc, ai))
+			goto restart;
+	}
+	spin_unlock(&sdp->sd_ail_lock);
+	trace_gfs2_ail_flush(sdp, wbc, 0);
+}
+
+/**
+ * gfs2_ail1_start - start writeback of all ail1 entries
+ * @sdp: The superblock
+ */
+
+static void gfs2_ail1_start(struct gfs2_sbd *sdp)
+{
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_NONE,
+		.nr_to_write = LONG_MAX,
+		.range_start = 0,
+		.range_end = LLONG_MAX,
+	};
+
+	return gfs2_ail1_flush(sdp, &wbc);
 }
 
 /**
@@ -141,7 +184,7 @@ __acquires(&sdp->sd_ail_lock)
  *
  */
 
-static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int flags)
+static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
 {
 	struct gfs2_bufdata *bd, *s;
 	struct buffer_head *bh;
@@ -149,76 +192,63 @@ static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int fl
 	list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list,
 					 bd_ail_st_list) {
 		bh = bd->bd_bh;
-
 		gfs2_assert(sdp, bd->bd_ail == ai);
-
-		if (buffer_busy(bh)) {
-			if (flags & DIO_ALL)
-				continue;
-			else
-				break;
-		}
-
+		if (buffer_busy(bh))
+			continue;
 		if (!buffer_uptodate(bh))
 			gfs2_io_error_bh(sdp, bh);
-
 		list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list);
 	}
 
-	return list_empty(&ai->ai_ail1_list);
 }
 
-static void gfs2_ail1_start(struct gfs2_sbd *sdp)
-{
-	struct list_head *head;
-	u64 sync_gen;
-	struct gfs2_ail *ai;
-	int done = 0;
-
-	spin_lock(&sdp->sd_ail_lock);
-	head = &sdp->sd_ail1_list;
-	if (list_empty(head)) {
-		spin_unlock(&sdp->sd_ail_lock);
-		return;
-	}
-	sync_gen = sdp->sd_ail_sync_gen++;
-
-	while(!done) {
-		done = 1;
-		list_for_each_entry_reverse(ai, head, ai_list) {
-			if (ai->ai_sync_gen >= sync_gen)
-				continue;
-			ai->ai_sync_gen = sync_gen;
-			gfs2_ail1_start_one(sdp, ai); /* This may drop ail lock */
-			done = 0;
-			break;
-		}
-	}
-
-	spin_unlock(&sdp->sd_ail_lock);
-}
+/**
+ * gfs2_ail1_empty - Try to empty the ail1 lists
+ * @sdp: The superblock
+ *
+ * Tries to empty the ail1 lists, starting with the oldest first
+ */
 
-static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
+static int gfs2_ail1_empty(struct gfs2_sbd *sdp)
 {
 	struct gfs2_ail *ai, *s;
 	int ret;
 
 	spin_lock(&sdp->sd_ail_lock);
-
 	list_for_each_entry_safe_reverse(ai, s, &sdp->sd_ail1_list, ai_list) {
-		if (gfs2_ail1_empty_one(sdp, ai, flags))
+		gfs2_ail1_empty_one(sdp, ai);
+		if (list_empty(&ai->ai_ail1_list))
 			list_move(&ai->ai_list, &sdp->sd_ail2_list);
-		else if (!(flags & DIO_ALL))
+		else
 			break;
 	}
-
 	ret = list_empty(&sdp->sd_ail1_list);
-
 	spin_unlock(&sdp->sd_ail_lock);
 
 	return ret;
 }
 
+static void gfs2_ail1_wait(struct gfs2_sbd *sdp)
+{
+	struct gfs2_ail *ai;
+	struct gfs2_bufdata *bd;
+	struct buffer_head *bh;
+
+	spin_lock(&sdp->sd_ail_lock);
+	list_for_each_entry_reverse(ai, &sdp->sd_ail1_list, ai_list) {
+		list_for_each_entry(bd, &ai->ai_ail1_list, bd_ail_st_list) {
+			bh = bd->bd_bh;
+			if (!buffer_locked(bh))
+				continue;
+			get_bh(bh);
+			spin_unlock(&sdp->sd_ail_lock);
+			wait_on_buffer(bh);
+			brelse(bh);
+			return;
+		}
+	}
+	spin_unlock(&sdp->sd_ail_lock);
+}
 
 /**
  * gfs2_ail2_empty_one - Check whether or not a trans in the AIL has been synced
@@ -574,7 +604,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
 	set_buffer_uptodate(bh);
 	clear_buffer_dirty(bh);
 
-	gfs2_ail1_empty(sdp, 0);
+	gfs2_ail1_empty(sdp);
 	tail = current_tail(sdp);
 
 	lh = (struct gfs2_log_header *)bh->b_data;
@@ -869,9 +899,9 @@ void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
 	gfs2_log_flush(sdp, NULL);
 	for (;;) {
 		gfs2_ail1_start(sdp);
-		if (gfs2_ail1_empty(sdp, DIO_ALL))
+		gfs2_ail1_wait(sdp);
+		if (gfs2_ail1_empty(sdp))
 			break;
-		msleep(10);
 	}
 }
 
@@ -905,20 +935,20 @@ int gfs2_logd(void *data)
 
 		preflush = atomic_read(&sdp->sd_log_pinned);
 		if (gfs2_jrnl_flush_reqd(sdp) || t == 0) {
-			gfs2_ail1_empty(sdp, DIO_ALL);
+			gfs2_ail1_empty(sdp);
 			gfs2_log_flush(sdp, NULL);
-			gfs2_ail1_empty(sdp, DIO_ALL);
 		}
 
 		if (gfs2_ail_flush_reqd(sdp)) {
 			gfs2_ail1_start(sdp);
-			io_schedule();
-			gfs2_ail1_empty(sdp, 0);
+			gfs2_ail1_wait(sdp);
+			gfs2_ail1_empty(sdp);
 			gfs2_log_flush(sdp, NULL);
-			gfs2_ail1_empty(sdp, DIO_ALL);
 		}
 
-		wake_up(&sdp->sd_log_waitq);
+		if (!gfs2_ail_flush_reqd(sdp))
+			wake_up(&sdp->sd_log_waitq);
+
 		t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
 		if (freezing(current))
 			refrigerator();
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 0d007f9..ab06216 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -12,6 +12,7 @@
 
 #include <linux/list.h>
 #include <linux/spinlock.h>
+#include <linux/writeback.h>
 #include "incore.h"
 
 /**
@@ -59,6 +60,7 @@ extern struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
 extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
 extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
 extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
+extern void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc);
 
 extern void gfs2_log_shutdown(struct gfs2_sbd *sdp);
 extern void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 51d27f0..05bbb12 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -40,7 +40,7 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
 {
 	struct gfs2_bufdata *bd;
 
-	gfs2_assert_withdraw(sdp, test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags));
+	BUG_ON(!current->journal_info);
 
 	clear_buffer_dirty(bh);
 	if (test_set_buffer_pinned(bh))
@@ -65,6 +65,7 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
  * @sdp: the filesystem the buffer belongs to
  * @bh: The buffer to unpin
  * @ai:
+ * @flags: The inode dirty flags
  *
  */
 
@@ -73,10 +74,8 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
 {
 	struct gfs2_bufdata *bd = bh->b_private;
 
-	gfs2_assert_withdraw(sdp, buffer_uptodate(bh));
-
-	if (!buffer_pinned(bh))
-		gfs2_assert_withdraw(sdp, 0);
+	BUG_ON(!buffer_uptodate(bh));
+	BUG_ON(!buffer_pinned(bh));
 
 	lock_buffer(bh);
 	mark_buffer_dirty(bh);
@@ -95,8 +94,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
 	list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list);
 	spin_unlock(&sdp->sd_ail_lock);
 
-	if (test_and_clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags))
-		gfs2_glock_schedule_for_reclaim(bd->bd_gl);
+	clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
 	trace_gfs2_pin(bd, 0);
 	unlock_buffer(bh);
 	atomic_dec(&sdp->sd_log_pinned);
@@ -322,12 +320,16 @@ static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
 
 static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
 {
+	struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
+	struct gfs2_glock *gl = bd->bd_gl;
 	struct gfs2_trans *tr;
 
 	tr = current->journal_info;
 	tr->tr_touched = 1;
 	tr->tr_num_revoke++;
 	sdp->sd_log_num_revoke++;
+	atomic_inc(&gl->gl_revokes);
+	set_bit(GLF_LFLUSH, &gl->gl_flags);
 	list_add(&le->le_list, &sdp->sd_log_le_revoke);
 }
 
@@ -350,9 +352,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
 	ld->ld_data1 = cpu_to_be32(sdp->sd_log_num_revoke);
 	offset = sizeof(struct gfs2_log_descriptor);
 
-	while (!list_empty(head)) {
-		bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list);
-		list_del_init(&bd->bd_le.le_list);
+	list_for_each_entry(bd, head, bd_le.le_list) {
 		sdp->sd_log_num_revoke--;
 
 		if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) {
@@ -367,8 +367,6 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
 		}
 
 		*(__be64 *)(bh->b_data + offset) = cpu_to_be64(bd->bd_blkno);
-		kmem_cache_free(gfs2_bufdata_cachep, bd);
-
 		offset += sizeof(u64);
 	}
 	gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
@@ -376,6 +374,22 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
 	submit_bh(WRITE_SYNC, bh);
 }
 
+static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
+{
+	struct list_head *head = &sdp->sd_log_le_revoke;
+	struct gfs2_bufdata *bd;
+	struct gfs2_glock *gl;
+
+	while (!list_empty(head)) {
+		bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list);
+		list_del_init(&bd->bd_le.le_list);
+		gl = bd->bd_gl;
+		atomic_dec(&gl->gl_revokes);
+		clear_bit(GLF_LFLUSH, &gl->gl_flags);
+		kmem_cache_free(gfs2_bufdata_cachep, bd);
+	}
+}
+
 static void revoke_lo_before_scan(struct gfs2_jdesc *jd,
 				  struct gfs2_log_header_host *head, int pass)
 {
@@ -749,6 +763,7 @@ const struct gfs2_log_operations gfs2_buf_lops = {
 const struct gfs2_log_operations gfs2_revoke_lops = {
 	.lo_add = revoke_lo_add,
 	.lo_before_commit = revoke_lo_before_commit,
+	.lo_after_commit = revoke_lo_after_commit,
 	.lo_before_scan = revoke_lo_before_scan,
 	.lo_scan_elements = revoke_lo_scan_elements,
 	.lo_after_scan = revoke_lo_after_scan,
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 888a5f5..cfa327d 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -53,6 +53,7 @@ static void gfs2_init_glock_once(void *foo)
 	INIT_LIST_HEAD(&gl->gl_lru);
 	INIT_LIST_HEAD(&gl->gl_ail_list);
 	atomic_set(&gl->gl_ail_count, 0);
+	atomic_set(&gl->gl_revokes, 0);
 }
 
 static void gfs2_init_gl_aspace_once(void *foo)
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 675349b..747238c 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -31,6 +31,7 @@
 #include "rgrp.h"
 #include "trans.h"
 #include "util.h"
+#include "trace_gfs2.h"
 
 static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc)
 {
@@ -310,6 +311,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
 	struct gfs2_bufdata *bd = bh->b_private;
 
 	if (test_clear_buffer_pinned(bh)) {
+		trace_gfs2_pin(bd, 0);
 		atomic_dec(&sdp->sd_log_pinned);
 		list_del_init(&bd->bd_le.le_list);
 		if (meta) {
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index 6a1d9ba..22c5265 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -77,8 +77,6 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen);
 
 #define buffer_busy(bh) \
 ((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock) | (1ul << BH_Pinned)))
-#define buffer_in_io(bh) \
-((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock)))
 
 #endif /* __DIO_DOT_H__ */
 
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index d3c69eb..8ac9ae1 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -126,8 +126,10 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
  * changed.
  */
 
-static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent)
+static int gfs2_check_sb(struct gfs2_sbd *sdp, int silent)
 {
+	struct gfs2_sb_host *sb = &sdp->sd_sb;
+
 	if (sb->sb_magic != GFS2_MAGIC ||
 	    sb->sb_type != GFS2_METATYPE_SB) {
 		if (!silent)
@@ -157,8 +159,10 @@ static void end_bio_io_page(struct bio *bio, int error)
 	unlock_page(page);
 }
 
-static void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf)
+static void gfs2_sb_in(struct gfs2_sbd *sdp, const void *buf)
 {
+	struct gfs2_sb_host *sb = &sdp->sd_sb;
+	struct super_block *s = sdp->sd_vfs;
 	const struct gfs2_sb *str = buf;
 
 	sb->sb_magic = be32_to_cpu(str->sb_header.mh_magic);
@@ -175,7 +179,7 @@ static void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf)
 
 	memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN);
 	memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN);
-	memcpy(sb->sb_uuid, str->sb_uuid, 16);
+	memcpy(s->s_uuid, str->sb_uuid, 16);
 }
 
 /**
@@ -197,7 +201,7 @@ static void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf)
  * Returns: 0 on success or error
  */
 
-static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector)
+static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent)
 {
 	struct super_block *sb = sdp->sd_vfs;
 	struct gfs2_sb *p;
@@ -227,10 +231,10 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector)
 		return -EIO;
 	}
 	p = kmap(page);
-	gfs2_sb_in(&sdp->sd_sb, p);
+	gfs2_sb_in(sdp, p);
 	kunmap(page);
 	__free_page(page);
-	return 0;
+	return gfs2_check_sb(sdp, silent);
 }
 
 /**
@@ -247,17 +251,13 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent)
 	unsigned int x;
 	int error;
 
-	error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift);
+	error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift, silent);
 	if (error) {
 		if (!silent)
 			fs_err(sdp, "can't read superblock\n");
 		return error;
 	}
 
-	error = gfs2_check_sb(sdp, &sdp->sd_sb, silent);
-	if (error)
-		return error;
-
 	sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
 			       GFS2_BASIC_BLOCK_SHIFT;
 	sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
@@ -340,14 +340,10 @@ static int init_names(struct gfs2_sbd *sdp, int silent)
 	/*  Try to autodetect  */
 
 	if (!proto[0] || !table[0]) {
-		error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift);
+		error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift, silent);
 		if (error)
 			return error;
 
-		error = gfs2_check_sb(sdp, &sdp->sd_sb, silent);
-		if (error)
-			goto out;
-
 		if (!proto[0])
 			proto = sdp->sd_sb.sb_lockproto;
 		if (!table[0])
@@ -364,7 +360,6 @@ static int init_names(struct gfs2_sbd *sdp, int silent)
 	while ((table = strchr(table, '/')))
 		*table = '_';
 
-out:
 	return error;
 }
 
@@ -1119,8 +1114,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
 	if (sdp->sd_args.ar_statfs_quantum) {
 		sdp->sd_tune.gt_statfs_slow = 0;
 		sdp->sd_tune.gt_statfs_quantum = sdp->sd_args.ar_statfs_quantum;
-	}
-	else {
+	} else {
 		sdp->sd_tune.gt_statfs_slow = 1;
 		sdp->sd_tune.gt_statfs_quantum = 30;
 	}
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
deleted file mode 100644
index 09e436a..0000000
--- a/fs/gfs2/ops_inode.c
+++ /dev/null
@@ -1,1344 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/completion.h>
-#include <linux/buffer_head.h>
-#include <linux/namei.h>
-#include <linux/mm.h>
-#include <linux/xattr.h>
-#include <linux/posix_acl.h>
-#include <linux/gfs2_ondisk.h>
-#include <linux/crc32.h>
-#include <linux/fiemap.h>
-#include <asm/uaccess.h>
-
-#include "gfs2.h"
-#include "incore.h"
-#include "acl.h"
-#include "bmap.h"
-#include "dir.h"
-#include "xattr.h"
-#include "glock.h"
-#include "inode.h"
-#include "meta_io.h"
-#include "quota.h"
-#include "rgrp.h"
-#include "trans.h"
-#include "util.h"
-#include "super.h"
-
-/**
- * gfs2_create - Create a file
- * @dir: The directory in which to create the file
- * @dentry: The dentry of the new file
- * @mode: The mode of the new file
- *
- * Returns: errno
- */
-
-static int gfs2_create(struct inode *dir, struct dentry *dentry,
-		       int mode, struct nameidata *nd)
-{
-	struct gfs2_inode *dip = GFS2_I(dir);
-	struct gfs2_sbd *sdp = GFS2_SB(dir);
-	struct gfs2_holder ghs[2];
-	struct inode *inode;
-
-	gfs2_holder_init(dip->i_gl, 0, 0, ghs);
-
-	for (;;) {
-		inode = gfs2_createi(ghs, &dentry->d_name, S_IFREG | mode, 0);
-		if (!IS_ERR(inode)) {
-			gfs2_trans_end(sdp);
-			if (dip->i_alloc->al_rgd)
-				gfs2_inplace_release(dip);
-			gfs2_quota_unlock(dip);
-			gfs2_alloc_put(dip);
-			gfs2_glock_dq_uninit_m(2, ghs);
-			mark_inode_dirty(inode);
-			break;
-		} else if (PTR_ERR(inode) != -EEXIST ||
-			   (nd && nd->flags & LOOKUP_EXCL)) {
-			gfs2_holder_uninit(ghs);
-			return PTR_ERR(inode);
-		}
-
-		inode = gfs2_lookupi(dir, &dentry->d_name, 0);
-		if (inode) {
-			if (!IS_ERR(inode)) {
-				gfs2_holder_uninit(ghs);
-				break;
-			} else {
-				gfs2_holder_uninit(ghs);
-				return PTR_ERR(inode);
-			}
-		}
-	}
-
-	d_instantiate(dentry, inode);
-
-	return 0;
-}
-
-/**
- * gfs2_lookup - Look up a filename in a directory and return its inode
- * @dir: The directory inode
- * @dentry: The dentry of the new inode
- * @nd: passed from Linux VFS, ignored by us
- *
- * Called by the VFS layer. Lock dir and call gfs2_lookupi()
- *
- * Returns: errno
- */
-
-static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
-				  struct nameidata *nd)
-{
-	struct inode *inode = NULL;
-
-	inode = gfs2_lookupi(dir, &dentry->d_name, 0);
-	if (inode && IS_ERR(inode))
-		return ERR_CAST(inode);
-
-	if (inode) {
-		struct gfs2_glock *gl = GFS2_I(inode)->i_gl;
-		struct gfs2_holder gh;
-		int error;
-		error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
-		if (error) {
-			iput(inode);
-			return ERR_PTR(error);
-		}
-		gfs2_glock_dq_uninit(&gh);
-		return d_splice_alias(inode, dentry);
-	}
-	d_add(dentry, inode);
-
-	return NULL;
-}
-
-/**
- * gfs2_link - Link to a file
- * @old_dentry: The inode to link
- * @dir: Add link to this directory
- * @dentry: The name of the link
- *
- * Link the inode in "old_dentry" into the directory "dir" with the
- * name in "dentry".
- *
- * Returns: errno
- */
-
-static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
-		     struct dentry *dentry)
-{
-	struct gfs2_inode *dip = GFS2_I(dir);
-	struct gfs2_sbd *sdp = GFS2_SB(dir);
-	struct inode *inode = old_dentry->d_inode;
-	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_holder ghs[2];
-	int alloc_required;
-	int error;
-
-	if (S_ISDIR(inode->i_mode))
-		return -EPERM;
-
-	gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
-	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
-
-	error = gfs2_glock_nq(ghs); /* parent */
-	if (error)
-		goto out_parent;
-
-	error = gfs2_glock_nq(ghs + 1); /* child */
-	if (error)
-		goto out_child;
-
-	error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC, 0);
-	if (error)
-		goto out_gunlock;
-
-	error = gfs2_dir_check(dir, &dentry->d_name, NULL);
-	switch (error) {
-	case -ENOENT:
-		break;
-	case 0:
-		error = -EEXIST;
-	default:
-		goto out_gunlock;
-	}
-
-	error = -EINVAL;
-	if (!dip->i_inode.i_nlink)
-		goto out_gunlock;
-	error = -EFBIG;
-	if (dip->i_entries == (u32)-1)
-		goto out_gunlock;
-	error = -EPERM;
-	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
-		goto out_gunlock;
-	error = -EINVAL;
-	if (!ip->i_inode.i_nlink)
-		goto out_gunlock;
-	error = -EMLINK;
-	if (ip->i_inode.i_nlink == (u32)-1)
-		goto out_gunlock;
-
-	alloc_required = error = gfs2_diradd_alloc_required(dir, &dentry->d_name);
-	if (error < 0)
-		goto out_gunlock;
-	error = 0;
-
-	if (alloc_required) {
-		struct gfs2_alloc *al = gfs2_alloc_get(dip);
-		if (!al) {
-			error = -ENOMEM;
-			goto out_gunlock;
-		}
-
-		error = gfs2_quota_lock_check(dip);
-		if (error)
-			goto out_alloc;
-
-		al->al_requested = sdp->sd_max_dirres;
-
-		error = gfs2_inplace_reserve(dip);
-		if (error)
-			goto out_gunlock_q;
-
-		error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
-					 gfs2_rg_blocks(al) +
-					 2 * RES_DINODE + RES_STATFS +
-					 RES_QUOTA, 0);
-		if (error)
-			goto out_ipres;
-	} else {
-		error = gfs2_trans_begin(sdp, 2 * RES_DINODE + RES_LEAF, 0);
-		if (error)
-			goto out_ipres;
-	}
-
-	error = gfs2_dir_add(dir, &dentry->d_name, ip, IF2DT(inode->i_mode));
-	if (error)
-		goto out_end_trans;
-
-	error = gfs2_change_nlink(ip, +1);
-
-out_end_trans:
-	gfs2_trans_end(sdp);
-out_ipres:
-	if (alloc_required)
-		gfs2_inplace_release(dip);
-out_gunlock_q:
-	if (alloc_required)
-		gfs2_quota_unlock(dip);
-out_alloc:
-	if (alloc_required)
-		gfs2_alloc_put(dip);
-out_gunlock:
-	gfs2_glock_dq(ghs + 1);
-out_child:
-	gfs2_glock_dq(ghs);
-out_parent:
-	gfs2_holder_uninit(ghs);
-	gfs2_holder_uninit(ghs + 1);
-	if (!error) {
-		ihold(inode);
-		d_instantiate(dentry, inode);
-		mark_inode_dirty(inode);
-	}
-	return error;
-}
-
-/*
- * gfs2_unlink_ok - check to see that a inode is still in a directory
- * @dip: the directory
- * @name: the name of the file
- * @ip: the inode
- *
- * Assumes that the lock on (at least) @dip is held.
- *
- * Returns: 0 if the parent/child relationship is correct, errno if it isn't
- */
-
-static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
-			  const struct gfs2_inode *ip)
-{
-	int error;
-
-	if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode))
-		return -EPERM;
-
-	if ((dip->i_inode.i_mode & S_ISVTX) &&
-	    dip->i_inode.i_uid != current_fsuid() &&
-	    ip->i_inode.i_uid != current_fsuid() && !capable(CAP_FOWNER))
-		return -EPERM;
-
-	if (IS_APPEND(&dip->i_inode))
-		return -EPERM;
-
-	error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, 0);
-	if (error)
-		return error;
-
-	error = gfs2_dir_check(&dip->i_inode, name, ip);
-	if (error)
-		return error;
-
-	return 0;
-}
-
-/**
- * gfs2_unlink - Unlink a file
- * @dir: The inode of the directory containing the file to unlink
- * @dentry: The file itself
- *
- * Unlink a file.  Call gfs2_unlinki()
- *
- * Returns: errno
- */
-
-static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
-{
-	struct gfs2_inode *dip = GFS2_I(dir);
-	struct gfs2_sbd *sdp = GFS2_SB(dir);
-	struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
-	struct gfs2_holder ghs[3];
-	struct gfs2_rgrpd *rgd;
-	struct gfs2_holder ri_gh;
-	int error;
-
-	error = gfs2_rindex_hold(sdp, &ri_gh);
-	if (error)
-		return error;
-
-	gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
-	gfs2_holder_init(ip->i_gl,  LM_ST_EXCLUSIVE, 0, ghs + 1);
-
-	rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr);
-	gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2);
-
-
-	error = gfs2_glock_nq(ghs); /* parent */
-	if (error)
-		goto out_parent;
-
-	error = gfs2_glock_nq(ghs + 1); /* child */
-	if (error)
-		goto out_child;
-
-	error = gfs2_glock_nq(ghs + 2); /* rgrp */
-	if (error)
-		goto out_rgrp;
-
-	error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
-	if (error)
-		goto out_gunlock;
-
-	error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0);
-	if (error)
-		goto out_gunlock;
-
-	error = gfs2_dir_del(dip, &dentry->d_name);
-        if (error)
-                goto out_end_trans;
-
-	error = gfs2_change_nlink(ip, -1);
-
-out_end_trans:
-	gfs2_trans_end(sdp);
-out_gunlock:
-	gfs2_glock_dq(ghs + 2);
-out_rgrp:
-	gfs2_holder_uninit(ghs + 2);
-	gfs2_glock_dq(ghs + 1);
-out_child:
-	gfs2_holder_uninit(ghs + 1);
-	gfs2_glock_dq(ghs);
-out_parent:
-	gfs2_holder_uninit(ghs);
-	gfs2_glock_dq_uninit(&ri_gh);
-	return error;
-}
-
-/**
- * gfs2_symlink - Create a symlink
- * @dir: The directory to create the symlink in
- * @dentry: The dentry to put the symlink in
- * @symname: The thing which the link points to
- *
- * Returns: errno
- */
-
-static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
-			const char *symname)
-{
-	struct gfs2_inode *dip = GFS2_I(dir), *ip;
-	struct gfs2_sbd *sdp = GFS2_SB(dir);
-	struct gfs2_holder ghs[2];
-	struct inode *inode;
-	struct buffer_head *dibh;
-	int size;
-	int error;
-
-	/* Must be stuffed with a null terminator for gfs2_follow_link() */
-	size = strlen(symname);
-	if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1)
-		return -ENAMETOOLONG;
-
-	gfs2_holder_init(dip->i_gl, 0, 0, ghs);
-
-	inode = gfs2_createi(ghs, &dentry->d_name, S_IFLNK | S_IRWXUGO, 0);
-	if (IS_ERR(inode)) {
-		gfs2_holder_uninit(ghs);
-		return PTR_ERR(inode);
-	}
-
-	ip = ghs[1].gh_gl->gl_object;
-
-	i_size_write(inode, size);
-
-	error = gfs2_meta_inode_buffer(ip, &dibh);
-
-	if (!gfs2_assert_withdraw(sdp, !error)) {
-		gfs2_dinode_out(ip, dibh->b_data);
-		memcpy(dibh->b_data + sizeof(struct gfs2_dinode), symname,
-		       size);
-		brelse(dibh);
-	}
-
-	gfs2_trans_end(sdp);
-	if (dip->i_alloc->al_rgd)
-		gfs2_inplace_release(dip);
-	gfs2_quota_unlock(dip);
-	gfs2_alloc_put(dip);
-
-	gfs2_glock_dq_uninit_m(2, ghs);
-
-	d_instantiate(dentry, inode);
-	mark_inode_dirty(inode);
-
-	return 0;
-}
-
-/**
- * gfs2_mkdir - Make a directory
- * @dir: The parent directory of the new one
- * @dentry: The dentry of the new directory
- * @mode: The mode of the new directory
- *
- * Returns: errno
- */
-
-static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
-{
-	struct gfs2_inode *dip = GFS2_I(dir), *ip;
-	struct gfs2_sbd *sdp = GFS2_SB(dir);
-	struct gfs2_holder ghs[2];
-	struct inode *inode;
-	struct buffer_head *dibh;
-	int error;
-
-	gfs2_holder_init(dip->i_gl, 0, 0, ghs);
-
-	inode = gfs2_createi(ghs, &dentry->d_name, S_IFDIR | mode, 0);
-	if (IS_ERR(inode)) {
-		gfs2_holder_uninit(ghs);
-		return PTR_ERR(inode);
-	}
-
-	ip = ghs[1].gh_gl->gl_object;
-
-	ip->i_inode.i_nlink = 2;
-	i_size_write(inode, sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode));
-	ip->i_diskflags |= GFS2_DIF_JDATA;
-	ip->i_entries = 2;
-
-	error = gfs2_meta_inode_buffer(ip, &dibh);
-
-	if (!gfs2_assert_withdraw(sdp, !error)) {
-		struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
-		struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1);
-
-		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-		gfs2_qstr2dirent(&gfs2_qdot, GFS2_DIRENT_SIZE(gfs2_qdot.len), dent);
-		dent->de_inum = di->di_num; /* already GFS2 endian */
-		dent->de_type = cpu_to_be16(DT_DIR);
-		di->di_entries = cpu_to_be32(1);
-
-		dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1));
-		gfs2_qstr2dirent(&gfs2_qdotdot, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent);
-
-		gfs2_inum_out(dip, dent);
-		dent->de_type = cpu_to_be16(DT_DIR);
-
-		gfs2_dinode_out(ip, di);
-
-		brelse(dibh);
-	}
-
-	error = gfs2_change_nlink(dip, +1);
-	gfs2_assert_withdraw(sdp, !error); /* dip already pinned */
-
-	gfs2_trans_end(sdp);
-	if (dip->i_alloc->al_rgd)
-		gfs2_inplace_release(dip);
-	gfs2_quota_unlock(dip);
-	gfs2_alloc_put(dip);
-
-	gfs2_glock_dq_uninit_m(2, ghs);
-
-	d_instantiate(dentry, inode);
-	mark_inode_dirty(inode);
-
-	return 0;
-}
-
-/**
- * gfs2_rmdiri - Remove a directory
- * @dip: The parent directory of the directory to be removed
- * @name: The name of the directory to be removed
- * @ip: The GFS2 inode of the directory to be removed
- *
- * Assumes Glocks on dip and ip are held
- *
- * Returns: errno
- */
-
-static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
-		       struct gfs2_inode *ip)
-{
-	int error;
-
-	if (ip->i_entries != 2) {
-		if (gfs2_consist_inode(ip))
-			gfs2_dinode_print(ip);
-		return -EIO;
-	}
-
-	error = gfs2_dir_del(dip, name);
-	if (error)
-		return error;
-
-	error = gfs2_change_nlink(dip, -1);
-	if (error)
-		return error;
-
-	error = gfs2_dir_del(ip, &gfs2_qdot);
-	if (error)
-		return error;
-
-	error = gfs2_dir_del(ip, &gfs2_qdotdot);
-	if (error)
-		return error;
-
-	/* It looks odd, but it really should be done twice */
-	error = gfs2_change_nlink(ip, -1);
-	if (error)
-		return error;
-
-	error = gfs2_change_nlink(ip, -1);
-	if (error)
-		return error;
-
-	return error;
-}
-
-/**
- * gfs2_rmdir - Remove a directory
- * @dir: The parent directory of the directory to be removed
- * @dentry: The dentry of the directory to remove
- *
- * Remove a directory. Call gfs2_rmdiri()
- *
- * Returns: errno
- */
-
-static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
-{
-	struct gfs2_inode *dip = GFS2_I(dir);
-	struct gfs2_sbd *sdp = GFS2_SB(dir);
-	struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
-	struct gfs2_holder ghs[3];
-	struct gfs2_rgrpd *rgd;
-	struct gfs2_holder ri_gh;
-	int error;
-
-	error = gfs2_rindex_hold(sdp, &ri_gh);
-	if (error)
-		return error;
-	gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
-	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
-
-	rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr);
-	gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2);
-
-	error = gfs2_glock_nq(ghs); /* parent */
-	if (error)
-		goto out_parent;
-
-	error = gfs2_glock_nq(ghs + 1); /* child */
-	if (error)
-		goto out_child;
-
-	error = gfs2_glock_nq(ghs + 2); /* rgrp */
-	if (error)
-		goto out_rgrp;
-
-	error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
-	if (error)
-		goto out_gunlock;
-
-	if (ip->i_entries < 2) {
-		if (gfs2_consist_inode(ip))
-			gfs2_dinode_print(ip);
-		error = -EIO;
-		goto out_gunlock;
-	}
-	if (ip->i_entries > 2) {
-		error = -ENOTEMPTY;
-		goto out_gunlock;
-	}
-
-	error = gfs2_trans_begin(sdp, 2 * RES_DINODE + 3 * RES_LEAF + RES_RG_BIT, 0);
-	if (error)
-		goto out_gunlock;
-
-	error = gfs2_rmdiri(dip, &dentry->d_name, ip);
-
-	gfs2_trans_end(sdp);
-
-out_gunlock:
-	gfs2_glock_dq(ghs + 2);
-out_rgrp:
-	gfs2_holder_uninit(ghs + 2);
-	gfs2_glock_dq(ghs + 1);
-out_child:
-	gfs2_holder_uninit(ghs + 1);
-	gfs2_glock_dq(ghs);
-out_parent:
-	gfs2_holder_uninit(ghs);
-	gfs2_glock_dq_uninit(&ri_gh);
-	return error;
-}
-
-/**
- * gfs2_mknod - Make a special file
- * @dir: The directory in which the special file will reside
- * @dentry: The dentry of the special file
- * @mode: The mode of the special file
- * @rdev: The device specification of the special file
- *
- */
-
-static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode,
-		      dev_t dev)
-{
-	struct gfs2_inode *dip = GFS2_I(dir);
-	struct gfs2_sbd *sdp = GFS2_SB(dir);
-	struct gfs2_holder ghs[2];
-	struct inode *inode;
-
-	gfs2_holder_init(dip->i_gl, 0, 0, ghs);
-
-	inode = gfs2_createi(ghs, &dentry->d_name, mode, dev);
-	if (IS_ERR(inode)) {
-		gfs2_holder_uninit(ghs);
-		return PTR_ERR(inode);
-	}
-
-	gfs2_trans_end(sdp);
-	if (dip->i_alloc->al_rgd)
-		gfs2_inplace_release(dip);
-	gfs2_quota_unlock(dip);
-	gfs2_alloc_put(dip);
-
-	gfs2_glock_dq_uninit_m(2, ghs);
-
-	d_instantiate(dentry, inode);
-	mark_inode_dirty(inode);
-
-	return 0;
-}
-
-/*
- * gfs2_ok_to_move - check if it's ok to move a directory to another directory
- * @this: move this
- * @to: to here
- *
- * Follow @to back to the root and make sure we don't encounter @this
- * Assumes we already hold the rename lock.
- *
- * Returns: errno
- */
-
-static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
-{
-	struct inode *dir = &to->i_inode;
-	struct super_block *sb = dir->i_sb;
-	struct inode *tmp;
-	int error = 0;
-
-	igrab(dir);
-
-	for (;;) {
-		if (dir == &this->i_inode) {
-			error = -EINVAL;
-			break;
-		}
-		if (dir == sb->s_root->d_inode) {
-			error = 0;
-			break;
-		}
-
-		tmp = gfs2_lookupi(dir, &gfs2_qdotdot, 1);
-		if (IS_ERR(tmp)) {
-			error = PTR_ERR(tmp);
-			break;
-		}
-
-		iput(dir);
-		dir = tmp;
-	}
-
-	iput(dir);
-
-	return error;
-}
-
-/**
- * gfs2_rename - Rename a file
- * @odir: Parent directory of old file name
- * @odentry: The old dentry of the file
- * @ndir: Parent directory of new file name
- * @ndentry: The new dentry of the file
- *
- * Returns: errno
- */
-
-static int gfs2_rename(struct inode *odir, struct dentry *odentry,
-		       struct inode *ndir, struct dentry *ndentry)
-{
-	struct gfs2_inode *odip = GFS2_I(odir);
-	struct gfs2_inode *ndip = GFS2_I(ndir);
-	struct gfs2_inode *ip = GFS2_I(odentry->d_inode);
-	struct gfs2_inode *nip = NULL;
-	struct gfs2_sbd *sdp = GFS2_SB(odir);
-	struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }, ri_gh;
-	struct gfs2_rgrpd *nrgd;
-	unsigned int num_gh;
-	int dir_rename = 0;
-	int alloc_required = 0;
-	unsigned int x;
-	int error;
-
-	if (ndentry->d_inode) {
-		nip = GFS2_I(ndentry->d_inode);
-		if (ip == nip)
-			return 0;
-	}
-
-	error = gfs2_rindex_hold(sdp, &ri_gh);
-	if (error)
-		return error;
-
-	if (odip != ndip) {
-		error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE,
-					   0, &r_gh);
-		if (error)
-			goto out;
-
-		if (S_ISDIR(ip->i_inode.i_mode)) {
-			dir_rename = 1;
-			/* don't move a dirctory into it's subdir */
-			error = gfs2_ok_to_move(ip, ndip);
-			if (error)
-				goto out_gunlock_r;
-		}
-	}
-
-	num_gh = 1;
-	gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
-	if (odip != ndip) {
-		gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
-		num_gh++;
-	}
-	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
-	num_gh++;
-
-	if (nip) {
-		gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
-		num_gh++;
-		/* grab the resource lock for unlink flag twiddling 
-		 * this is the case of the target file already existing
-		 * so we unlink before doing the rename
-		 */
-		nrgd = gfs2_blk2rgrpd(sdp, nip->i_no_addr);
-		if (nrgd)
-			gfs2_holder_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh++);
-	}
-
-	for (x = 0; x < num_gh; x++) {
-		error = gfs2_glock_nq(ghs + x);
-		if (error)
-			goto out_gunlock;
-	}
-
-	/* Check out the old directory */
-
-	error = gfs2_unlink_ok(odip, &odentry->d_name, ip);
-	if (error)
-		goto out_gunlock;
-
-	/* Check out the new directory */
-
-	if (nip) {
-		error = gfs2_unlink_ok(ndip, &ndentry->d_name, nip);
-		if (error)
-			goto out_gunlock;
-
-		if (S_ISDIR(nip->i_inode.i_mode)) {
-			if (nip->i_entries < 2) {
-				if (gfs2_consist_inode(nip))
-					gfs2_dinode_print(nip);
-				error = -EIO;
-				goto out_gunlock;
-			}
-			if (nip->i_entries > 2) {
-				error = -ENOTEMPTY;
-				goto out_gunlock;
-			}
-		}
-	} else {
-		error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC, 0);
-		if (error)
-			goto out_gunlock;
-
-		error = gfs2_dir_check(ndir, &ndentry->d_name, NULL);
-		switch (error) {
-		case -ENOENT:
-			error = 0;
-			break;
-		case 0:
-			error = -EEXIST;
-		default:
-			goto out_gunlock;
-		};
-
-		if (odip != ndip) {
-			if (!ndip->i_inode.i_nlink) {
-				error = -EINVAL;
-				goto out_gunlock;
-			}
-			if (ndip->i_entries == (u32)-1) {
-				error = -EFBIG;
-				goto out_gunlock;
-			}
-			if (S_ISDIR(ip->i_inode.i_mode) &&
-			    ndip->i_inode.i_nlink == (u32)-1) {
-				error = -EMLINK;
-				goto out_gunlock;
-			}
-		}
-	}
-
-	/* Check out the dir to be renamed */
-
-	if (dir_rename) {
-		error = gfs2_permission(odentry->d_inode, MAY_WRITE, 0);
-		if (error)
-			goto out_gunlock;
-	}
-
-	if (nip == NULL)
-		alloc_required = gfs2_diradd_alloc_required(ndir, &ndentry->d_name);
-	error = alloc_required;
-	if (error < 0)
-		goto out_gunlock;
-	error = 0;
-
-	if (alloc_required) {
-		struct gfs2_alloc *al = gfs2_alloc_get(ndip);
-		if (!al) {
-			error = -ENOMEM;
-			goto out_gunlock;
-		}
-
-		error = gfs2_quota_lock_check(ndip);
-		if (error)
-			goto out_alloc;
-
-		al->al_requested = sdp->sd_max_dirres;
-
-		error = gfs2_inplace_reserve_ri(ndip);
-		if (error)
-			goto out_gunlock_q;
-
-		error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
-					 gfs2_rg_blocks(al) +
-					 4 * RES_DINODE + 4 * RES_LEAF +
-					 RES_STATFS + RES_QUOTA + 4, 0);
-		if (error)
-			goto out_ipreserv;
-	} else {
-		error = gfs2_trans_begin(sdp, 4 * RES_DINODE +
-					 5 * RES_LEAF + 4, 0);
-		if (error)
-			goto out_gunlock;
-	}
-
-	/* Remove the target file, if it exists */
-
-	if (nip) {
-		if (S_ISDIR(nip->i_inode.i_mode))
-			error = gfs2_rmdiri(ndip, &ndentry->d_name, nip);
-		else {
-			error = gfs2_dir_del(ndip, &ndentry->d_name);
-			if (error)
-				goto out_end_trans;
-			error = gfs2_change_nlink(nip, -1);
-		}
-		if (error)
-			goto out_end_trans;
-	}
-
-	if (dir_rename) {
-		error = gfs2_change_nlink(ndip, +1);
-		if (error)
-			goto out_end_trans;
-		error = gfs2_change_nlink(odip, -1);
-		if (error)
-			goto out_end_trans;
-
-		error = gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR);
-		if (error)
-			goto out_end_trans;
-	} else {
-		struct buffer_head *dibh;
-		error = gfs2_meta_inode_buffer(ip, &dibh);
-		if (error)
-			goto out_end_trans;
-		ip->i_inode.i_ctime = CURRENT_TIME;
-		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-		gfs2_dinode_out(ip, dibh->b_data);
-		brelse(dibh);
-	}
-
-	error = gfs2_dir_del(odip, &odentry->d_name);
-	if (error)
-		goto out_end_trans;
-
-	error = gfs2_dir_add(ndir, &ndentry->d_name, ip, IF2DT(ip->i_inode.i_mode));
-	if (error)
-		goto out_end_trans;
-
-out_end_trans:
-	gfs2_trans_end(sdp);
-out_ipreserv:
-	if (alloc_required)
-		gfs2_inplace_release(ndip);
-out_gunlock_q:
-	if (alloc_required)
-		gfs2_quota_unlock(ndip);
-out_alloc:
-	if (alloc_required)
-		gfs2_alloc_put(ndip);
-out_gunlock:
-	while (x--) {
-		gfs2_glock_dq(ghs + x);
-		gfs2_holder_uninit(ghs + x);
-	}
-out_gunlock_r:
-	if (r_gh.gh_gl)
-		gfs2_glock_dq_uninit(&r_gh);
-out:
-	gfs2_glock_dq_uninit(&ri_gh);
-	return error;
-}
-
-/**
- * gfs2_follow_link - Follow a symbolic link
- * @dentry: The dentry of the link
- * @nd: Data that we pass to vfs_follow_link()
- *
- * This can handle symlinks of any size.
- *
- * Returns: 0 on success or error code
- */
-
-static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-	struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
-	struct gfs2_holder i_gh;
-	struct buffer_head *dibh;
-	unsigned int x, size;
-	char *buf;
-	int error;
-
-	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
-	error = gfs2_glock_nq(&i_gh);
-	if (error) {
-		gfs2_holder_uninit(&i_gh);
-		nd_set_link(nd, ERR_PTR(error));
-		return NULL;
-	}
-
-	size = (unsigned int)i_size_read(&ip->i_inode);
-	if (size == 0) {
-		gfs2_consist_inode(ip);
-		buf = ERR_PTR(-EIO);
-		goto out;
-	}
-
-	error = gfs2_meta_inode_buffer(ip, &dibh);
-	if (error) {
-		buf = ERR_PTR(error);
-		goto out;
-	}
-
-	x = size + 1;
-	buf = kmalloc(x, GFP_NOFS);
-	if (!buf)
-		buf = ERR_PTR(-ENOMEM);
-	else
-		memcpy(buf, dibh->b_data + sizeof(struct gfs2_dinode), x);
-	brelse(dibh);
-out:
-	gfs2_glock_dq_uninit(&i_gh);
-	nd_set_link(nd, buf);
-	return NULL;
-}
-
-static void gfs2_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
-{
-	char *s = nd_get_link(nd);
-	if (!IS_ERR(s))
-		kfree(s);
-}
-
-/**
- * gfs2_permission -
- * @inode: The inode
- * @mask: The mask to be tested
- * @flags: Indicates whether this is an RCU path walk or not
- *
- * This may be called from the VFS directly, or from within GFS2 with the
- * inode locked, so we look to see if the glock is already locked and only
- * lock the glock if its not already been done.
- *
- * Returns: errno
- */
-
-int gfs2_permission(struct inode *inode, int mask, unsigned int flags)
-{
-	struct gfs2_inode *ip;
-	struct gfs2_holder i_gh;
-	int error;
-	int unlock = 0;
-
-
-	ip = GFS2_I(inode);
-	if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
-		if (flags & IPERM_FLAG_RCU)
-			return -ECHILD;
-		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
-		if (error)
-			return error;
-		unlock = 1;
-	}
-
-	if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
-		error = -EACCES;
-	else
-		error = generic_permission(inode, mask, flags, gfs2_check_acl);
-	if (unlock)
-		gfs2_glock_dq_uninit(&i_gh);
-
-	return error;
-}
-
-static int setattr_chown(struct inode *inode, struct iattr *attr)
-{
-	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_sbd *sdp = GFS2_SB(inode);
-	u32 ouid, ogid, nuid, ngid;
-	int error;
-
-	ouid = inode->i_uid;
-	ogid = inode->i_gid;
-	nuid = attr->ia_uid;
-	ngid = attr->ia_gid;
-
-	if (!(attr->ia_valid & ATTR_UID) || ouid == nuid)
-		ouid = nuid = NO_QUOTA_CHANGE;
-	if (!(attr->ia_valid & ATTR_GID) || ogid == ngid)
-		ogid = ngid = NO_QUOTA_CHANGE;
-
-	if (!gfs2_alloc_get(ip))
-		return -ENOMEM;
-
-	error = gfs2_quota_lock(ip, nuid, ngid);
-	if (error)
-		goto out_alloc;
-
-	if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
-		error = gfs2_quota_check(ip, nuid, ngid);
-		if (error)
-			goto out_gunlock_q;
-	}
-
-	error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_QUOTA, 0);
-	if (error)
-		goto out_gunlock_q;
-
-	error = gfs2_setattr_simple(ip, attr);
-	if (error)
-		goto out_end_trans;
-
-	if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
-		u64 blocks = gfs2_get_inode_blocks(&ip->i_inode);
-		gfs2_quota_change(ip, -blocks, ouid, ogid);
-		gfs2_quota_change(ip, blocks, nuid, ngid);
-	}
-
-out_end_trans:
-	gfs2_trans_end(sdp);
-out_gunlock_q:
-	gfs2_quota_unlock(ip);
-out_alloc:
-	gfs2_alloc_put(ip);
-	return error;
-}
-
-/**
- * gfs2_setattr - Change attributes on an inode
- * @dentry: The dentry which is changing
- * @attr: The structure describing the change
- *
- * The VFS layer wants to change one or more of an inodes attributes.  Write
- * that change out to disk.
- *
- * Returns: errno
- */
-
-static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
-{
-	struct inode *inode = dentry->d_inode;
-	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_holder i_gh;
-	int error;
-
-	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
-	if (error)
-		return error;
-
-	error = -EPERM;
-	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
-		goto out;
-
-	error = inode_change_ok(inode, attr);
-	if (error)
-		goto out;
-
-	if (attr->ia_valid & ATTR_SIZE)
-		error = gfs2_setattr_size(inode, attr->ia_size);
-	else if (attr->ia_valid & (ATTR_UID | ATTR_GID))
-		error = setattr_chown(inode, attr);
-	else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode))
-		error = gfs2_acl_chmod(ip, attr);
-	else
-		error = gfs2_setattr_simple(ip, attr);
-
-out:
-	gfs2_glock_dq_uninit(&i_gh);
-	if (!error)
-		mark_inode_dirty(inode);
-	return error;
-}
-
-/**
- * gfs2_getattr - Read out an inode's attributes
- * @mnt: The vfsmount the inode is being accessed from
- * @dentry: The dentry to stat
- * @stat: The inode's stats
- *
- * This may be called from the VFS directly, or from within GFS2 with the
- * inode locked, so we look to see if the glock is already locked and only
- * lock the glock if its not already been done. Note that its the NFS
- * readdirplus operation which causes this to be called (from filldir)
- * with the glock already held.
- *
- * Returns: errno
- */
-
-static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
-			struct kstat *stat)
-{
-	struct inode *inode = dentry->d_inode;
-	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_holder gh;
-	int error;
-	int unlock = 0;
-
-	if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
-		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
-		if (error)
-			return error;
-		unlock = 1;
-	}
-
-	generic_fillattr(inode, stat);
-	if (unlock)
-		gfs2_glock_dq_uninit(&gh);
-
-	return 0;
-}
-
-static int gfs2_setxattr(struct dentry *dentry, const char *name,
-			 const void *data, size_t size, int flags)
-{
-	struct inode *inode = dentry->d_inode;
-	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_holder gh;
-	int ret;
-
-	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
-	ret = gfs2_glock_nq(&gh);
-	if (ret == 0) {
-		ret = generic_setxattr(dentry, name, data, size, flags);
-		gfs2_glock_dq(&gh);
-	}
-	gfs2_holder_uninit(&gh);
-	return ret;
-}
-
-static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name,
-			     void *data, size_t size)
-{
-	struct inode *inode = dentry->d_inode;
-	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_holder gh;
-	int ret;
-
-	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
-	ret = gfs2_glock_nq(&gh);
-	if (ret == 0) {
-		ret = generic_getxattr(dentry, name, data, size);
-		gfs2_glock_dq(&gh);
-	}
-	gfs2_holder_uninit(&gh);
-	return ret;
-}
-
-static int gfs2_removexattr(struct dentry *dentry, const char *name)
-{
-	struct inode *inode = dentry->d_inode;
-	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_holder gh;
-	int ret;
-
-	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
-	ret = gfs2_glock_nq(&gh);
-	if (ret == 0) {
-		ret = generic_removexattr(dentry, name);
-		gfs2_glock_dq(&gh);
-	}
-	gfs2_holder_uninit(&gh);
-	return ret;
-}
-
-static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
-		       u64 start, u64 len)
-{
-	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_holder gh;
-	int ret;
-
-	ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
-	if (ret)
-		return ret;
-
-	mutex_lock(&inode->i_mutex);
-
-	ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
-	if (ret)
-		goto out;
-
-	if (gfs2_is_stuffed(ip)) {
-		u64 phys = ip->i_no_addr << inode->i_blkbits;
-		u64 size = i_size_read(inode);
-		u32 flags = FIEMAP_EXTENT_LAST|FIEMAP_EXTENT_NOT_ALIGNED|
-			    FIEMAP_EXTENT_DATA_INLINE;
-		phys += sizeof(struct gfs2_dinode);
-		phys += start;
-		if (start + len > size)
-			len = size - start;
-		if (start < size)
-			ret = fiemap_fill_next_extent(fieinfo, start, phys,
-						      len, flags);
-		if (ret == 1)
-			ret = 0;
-	} else {
-		ret = __generic_block_fiemap(inode, fieinfo, start, len,
-					     gfs2_block_map);
-	}
-
-	gfs2_glock_dq_uninit(&gh);
-out:
-	mutex_unlock(&inode->i_mutex);
-	return ret;
-}
-
-const struct inode_operations gfs2_file_iops = {
-	.permission = gfs2_permission,
-	.setattr = gfs2_setattr,
-	.getattr = gfs2_getattr,
-	.setxattr = gfs2_setxattr,
-	.getxattr = gfs2_getxattr,
-	.listxattr = gfs2_listxattr,
-	.removexattr = gfs2_removexattr,
-	.fiemap = gfs2_fiemap,
-};
-
-const struct inode_operations gfs2_dir_iops = {
-	.create = gfs2_create,
-	.lookup = gfs2_lookup,
-	.link = gfs2_link,
-	.unlink = gfs2_unlink,
-	.symlink = gfs2_symlink,
-	.mkdir = gfs2_mkdir,
-	.rmdir = gfs2_rmdir,
-	.mknod = gfs2_mknod,
-	.rename = gfs2_rename,
-	.permission = gfs2_permission,
-	.setattr = gfs2_setattr,
-	.getattr = gfs2_getattr,
-	.setxattr = gfs2_setxattr,
-	.getxattr = gfs2_getxattr,
-	.listxattr = gfs2_listxattr,
-	.removexattr = gfs2_removexattr,
-	.fiemap = gfs2_fiemap,
-};
-
-const struct inode_operations gfs2_symlink_iops = {
-	.readlink = generic_readlink,
-	.follow_link = gfs2_follow_link,
-	.put_link = gfs2_put_link,
-	.permission = gfs2_permission,
-	.setattr = gfs2_setattr,
-	.getattr = gfs2_getattr,
-	.setxattr = gfs2_setxattr,
-	.getxattr = gfs2_getxattr,
-	.listxattr = gfs2_listxattr,
-	.removexattr = gfs2_removexattr,
-	.fiemap = gfs2_fiemap,
-};
-
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index e23d986..42e8d23 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -38,6 +38,7 @@
 
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/mm.h>
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
@@ -77,19 +78,20 @@ static LIST_HEAD(qd_lru_list);
 static atomic_t qd_lru_count = ATOMIC_INIT(0);
 static DEFINE_SPINLOCK(qd_lru_lock);
 
-int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
+int gfs2_shrink_qd_memory(struct shrinker *shrink, struct shrink_control *sc)
 {
 	struct gfs2_quota_data *qd;
 	struct gfs2_sbd *sdp;
+	int nr_to_scan = sc->nr_to_scan;
 
-	if (nr == 0)
+	if (nr_to_scan == 0)
 		goto out;
 
-	if (!(gfp_mask & __GFP_FS))
+	if (!(sc->gfp_mask & __GFP_FS))
 		return -1;
 
 	spin_lock(&qd_lru_lock);
-	while (nr && !list_empty(&qd_lru_list)) {
+	while (nr_to_scan && !list_empty(&qd_lru_list)) {
 		qd = list_entry(qd_lru_list.next,
 				struct gfs2_quota_data, qd_reclaim);
 		sdp = qd->qd_gl->gl_sbd;
@@ -110,7 +112,7 @@ int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 		spin_unlock(&qd_lru_lock);
 		kmem_cache_free(gfs2_quotad_cachep, qd);
 		spin_lock(&qd_lru_lock);
-		nr--;
+		nr_to_scan--;
 	}
 	spin_unlock(&qd_lru_lock);
 
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index e7d236c..90bf1c3 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -12,6 +12,7 @@
 
 struct gfs2_inode;
 struct gfs2_sbd;
+struct shrink_control;
 
 #define NO_QUOTA_CHANGE ((u32)-1)
 
@@ -51,7 +52,8 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
 	return ret;
 }
 
-extern int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask);
+extern int gfs2_shrink_qd_memory(struct shrinker *shrink,
+				 struct shrink_control *sc);
 extern const struct quotactl_ops gfs2_quotactl_ops;
 
 #endif /* __QUOTA_DOT_H__ */
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 6fcae84..9b780df 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -78,10 +78,11 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
 
 static inline void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buf1,
 			       unsigned char *buf2, unsigned int offset,
-			       unsigned int buflen, u32 block,
+			       struct gfs2_bitmap *bi, u32 block,
 			       unsigned char new_state)
 {
 	unsigned char *byte1, *byte2, *end, cur_state;
+	unsigned int buflen = bi->bi_len;
 	const unsigned int bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE;
 
 	byte1 = buf1 + offset + (block / GFS2_NBBY);
@@ -92,6 +93,16 @@ static inline void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buf1,
 	cur_state = (*byte1 >> bit) & GFS2_BIT_MASK;
 
 	if (unlikely(!valid_change[new_state * 4 + cur_state])) {
+		printk(KERN_WARNING "GFS2: buf_blk = 0x%llx old_state=%d, "
+		       "new_state=%d\n",
+		       (unsigned long long)block, cur_state, new_state);
+		printk(KERN_WARNING "GFS2: rgrp=0x%llx bi_start=0x%lx\n",
+		       (unsigned long long)rgd->rd_addr,
+		       (unsigned long)bi->bi_start);
+		printk(KERN_WARNING "GFS2: bi_offset=0x%lx bi_len=0x%lx\n",
+		       (unsigned long)bi->bi_offset,
+		       (unsigned long)bi->bi_len);
+		dump_stack();
 		gfs2_consist_rgrpd(rgd);
 		return;
 	}
@@ -381,6 +392,7 @@ static void clear_rgrpdi(struct gfs2_sbd *sdp)
 
 		if (gl) {
 			gl->gl_object = NULL;
+			gfs2_glock_add_to_lru(gl);
 			gfs2_glock_put(gl);
 		}
 
@@ -1365,7 +1377,7 @@ skip:
 
 	gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
 	gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset,
-		    bi->bi_len, blk, new_state);
+		    bi, blk, new_state);
 	goal = blk;
 	while (*n < elen) {
 		goal++;
@@ -1375,7 +1387,7 @@ skip:
 		    GFS2_BLKST_FREE)
 			break;
 		gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset,
-			    bi->bi_len, goal, new_state);
+			    bi, goal, new_state);
 		(*n)++;
 	}
 out:
@@ -1432,7 +1444,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
 		}
 		gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
 		gfs2_setbit(rgd, bi->bi_bh->b_data, NULL, bi->bi_offset,
-			    bi->bi_len, buf_blk, new_state);
+			    bi, buf_blk, new_state);
 	}
 
 	return rgd;
@@ -1617,6 +1629,10 @@ void __gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
 
 	gfs2_trans_add_rg(rgd);
+
+	/* Directories keep their data in the metadata address space */
+	if (ip->i_depth)
+		gfs2_meta_wipe(ip, bstart, blen);
 }
 
 /**
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index b9f28e6..ed540e7 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -23,6 +23,7 @@
 #include <linux/time.h>
 #include <linux/wait.h>
 #include <linux/writeback.h>
+#include <linux/backing-dev.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -700,11 +701,47 @@ void gfs2_unfreeze_fs(struct gfs2_sbd *sdp)
 	mutex_unlock(&sdp->sd_freeze_lock);
 }
 
+void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
+{
+	struct gfs2_dinode *str = buf;
+
+	str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
+	str->di_header.mh_type = cpu_to_be32(GFS2_METATYPE_DI);
+	str->di_header.mh_format = cpu_to_be32(GFS2_FORMAT_DI);
+	str->di_num.no_addr = cpu_to_be64(ip->i_no_addr);
+	str->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino);
+	str->di_mode = cpu_to_be32(ip->i_inode.i_mode);
+	str->di_uid = cpu_to_be32(ip->i_inode.i_uid);
+	str->di_gid = cpu_to_be32(ip->i_inode.i_gid);
+	str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink);
+	str->di_size = cpu_to_be64(i_size_read(&ip->i_inode));
+	str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
+	str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
+	str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec);
+	str->di_ctime = cpu_to_be64(ip->i_inode.i_ctime.tv_sec);
+
+	str->di_goal_meta = cpu_to_be64(ip->i_goal);
+	str->di_goal_data = cpu_to_be64(ip->i_goal);
+	str->di_generation = cpu_to_be64(ip->i_generation);
+
+	str->di_flags = cpu_to_be32(ip->i_diskflags);
+	str->di_height = cpu_to_be16(ip->i_height);
+	str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) &&
+					     !(ip->i_diskflags & GFS2_DIF_EXHASH) ?
+					     GFS2_FORMAT_DE : 0);
+	str->di_depth = cpu_to_be16(ip->i_depth);
+	str->di_entries = cpu_to_be32(ip->i_entries);
+
+	str->di_eattr = cpu_to_be64(ip->i_eattr);
+	str->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec);
+	str->di_mtime_nsec = cpu_to_be32(ip->i_inode.i_mtime.tv_nsec);
+	str->di_ctime_nsec = cpu_to_be32(ip->i_inode.i_ctime.tv_nsec);
+}
 
 /**
  * gfs2_write_inode - Make sure the inode is stable on the disk
  * @inode: The inode
- * @sync: synchronous write flag
+ * @wbc: The writeback control structure
  *
  * Returns: errno
  */
@@ -713,15 +750,17 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl);
+	struct backing_dev_info *bdi = metamapping->backing_dev_info;
 	struct gfs2_holder gh;
 	struct buffer_head *bh;
 	struct timespec atime;
 	struct gfs2_dinode *di;
-	int ret = 0;
+	int ret = -EAGAIN;
 
-	/* Check this is a "normal" inode, etc */
+	/* Skip timestamp update, if this is from a memalloc */
 	if (current->flags & PF_MEMALLOC)
-		return 0;
+		goto do_flush;
 	ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
 	if (ret)
 		goto do_flush;
@@ -745,6 +784,13 @@ do_unlock:
 do_flush:
 	if (wbc->sync_mode == WB_SYNC_ALL)
 		gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
+	filemap_fdatawrite(metamapping);
+	if (bdi->dirty_exceeded)
+		gfs2_ail1_flush(sdp, wbc);
+	if (!ret && (wbc->sync_mode == WB_SYNC_ALL))
+		ret = filemap_fdatawait(metamapping);
+	if (ret)
+		mark_inode_dirty_sync(inode);
 	return ret;
 }
 
@@ -874,8 +920,9 @@ restart:
 
 static int gfs2_sync_fs(struct super_block *sb, int wait)
 {
-	if (wait && sb->s_fs_info)
-		gfs2_log_flush(sb->s_fs_info, NULL);
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+	if (wait && sdp)
+		gfs2_log_flush(sdp, NULL);
 	return 0;
 }
 
@@ -1308,6 +1355,78 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 	return 0;
 }
 
+static void gfs2_final_release_pages(struct gfs2_inode *ip)
+{
+	struct inode *inode = &ip->i_inode;
+	struct gfs2_glock *gl = ip->i_gl;
+
+	truncate_inode_pages(gfs2_glock2aspace(ip->i_gl), 0);
+	truncate_inode_pages(&inode->i_data, 0);
+
+	if (atomic_read(&gl->gl_revokes) == 0) {
+		clear_bit(GLF_LFLUSH, &gl->gl_flags);
+		clear_bit(GLF_DIRTY, &gl->gl_flags);
+	}
+}
+
+static int gfs2_dinode_dealloc(struct gfs2_inode *ip)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_alloc *al;
+	struct gfs2_rgrpd *rgd;
+	int error;
+
+	if (gfs2_get_inode_blocks(&ip->i_inode) != 1) {
+		gfs2_consist_inode(ip);
+		return -EIO;
+	}
+
+	al = gfs2_alloc_get(ip);
+	if (!al)
+		return -ENOMEM;
+
+	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+	if (error)
+		goto out;
+
+	error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
+	if (error)
+		goto out_qs;
+
+	rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr);
+	if (!rgd) {
+		gfs2_consist_inode(ip);
+		error = -EIO;
+		goto out_rindex_relse;
+	}
+
+	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
+				   &al->al_rgd_gh);
+	if (error)
+		goto out_rindex_relse;
+
+	error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA,
+				 sdp->sd_jdesc->jd_blocks);
+	if (error)
+		goto out_rg_gunlock;
+
+	gfs2_free_di(rgd, ip);
+
+	gfs2_final_release_pages(ip);
+
+	gfs2_trans_end(sdp);
+
+out_rg_gunlock:
+	gfs2_glock_dq_uninit(&al->al_rgd_gh);
+out_rindex_relse:
+	gfs2_glock_dq_uninit(&al->al_ri_gh);
+out_qs:
+	gfs2_quota_unhold(ip);
+out:
+	gfs2_alloc_put(ip);
+	return error;
+}
+
 /*
  * We have to (at the moment) hold the inodes main lock to cover
  * the gap between unlocking the shared lock on the iopen lock and
@@ -1371,15 +1490,13 @@ static void gfs2_evict_inode(struct inode *inode)
 	}
 
 	error = gfs2_dinode_dealloc(ip);
-	if (error)
-		goto out_unlock;
+	goto out_unlock;
 
 out_truncate:
 	error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
 	if (error)
 		goto out_unlock;
-	/* Needs to be done before glock release & also in a transaction */
-	truncate_inode_pages(&inode->i_data, 0);
+	gfs2_final_release_pages(ip);
 	gfs2_trans_end(sdp);
 
 out_unlock:
@@ -1394,6 +1511,7 @@ out:
 	end_writeback(inode);
 
 	ip->i_gl->gl_object = NULL;
+	gfs2_glock_add_to_lru(ip->i_gl);
 	gfs2_glock_put(ip->i_gl);
 	ip->i_gl = NULL;
 	if (ip->i_iopen_gh.gh_gl) {
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 748ccb5..e20eab3 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -81,7 +81,8 @@ static int gfs2_uuid_valid(const u8 *uuid)
 
 static ssize_t uuid_show(struct gfs2_sbd *sdp, char *buf)
 {
-	const u8 *uuid = sdp->sd_sb.sb_uuid;
+	struct super_block *s = sdp->sd_vfs;
+	const u8 *uuid = s->s_uuid;
 	buf[0] = '\0';
 	if (!gfs2_uuid_valid(uuid))
 		return 0;
@@ -616,7 +617,8 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
 		       struct kobj_uevent_env *env)
 {
 	struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
-	const u8 *uuid = sdp->sd_sb.sb_uuid;
+	struct super_block *s = sdp->sd_vfs;
+	const u8 *uuid = s->s_uuid;
 
 	add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name);
 	add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index cedb0bb..5d07609 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -10,6 +10,7 @@
 #include <linux/buffer_head.h>
 #include <linux/dlmconstants.h>
 #include <linux/gfs2_ondisk.h>
+#include <linux/writeback.h>
 #include "incore.h"
 #include "glock.h"
 
@@ -40,7 +41,9 @@
 	{(1UL << GLF_REPLY_PENDING),		"r" },		\
 	{(1UL << GLF_INITIAL),			"I" },		\
 	{(1UL << GLF_FROZEN),			"F" },		\
-	{(1UL << GLF_QUEUED),			"q" })
+	{(1UL << GLF_QUEUED),			"q" },		\
+	{(1UL << GLF_LRU),			"L" },		\
+	{(1UL << GLF_OBJECT),			"o" })
 
 #ifndef NUMPTY
 #define NUMPTY
@@ -94,7 +97,7 @@ TRACE_EVENT(gfs2_glock_state_change,
 		__entry->new_state	= glock_trace_state(new_state);
 		__entry->tgt_state	= glock_trace_state(gl->gl_target);
 		__entry->dmt_state	= glock_trace_state(gl->gl_demote_state);
-		__entry->flags		= gl->gl_flags;
+		__entry->flags		= gl->gl_flags | (gl->gl_object ? (1UL<<GLF_OBJECT) : 0);
 	),
 
 	TP_printk("%u,%u glock %d:%lld state %s to %s tgt:%s dmt:%s flags:%s",
@@ -127,7 +130,7 @@ TRACE_EVENT(gfs2_glock_put,
 		__entry->gltype		= gl->gl_name.ln_type;
 		__entry->glnum		= gl->gl_name.ln_number;
 		__entry->cur_state	= glock_trace_state(gl->gl_state);
-		__entry->flags		= gl->gl_flags;
+		__entry->flags		= gl->gl_flags  | (gl->gl_object ? (1UL<<GLF_OBJECT) : 0);
 	),
 
 	TP_printk("%u,%u glock %d:%lld state %s => %s flags:%s",
@@ -161,7 +164,7 @@ TRACE_EVENT(gfs2_demote_rq,
 		__entry->glnum		= gl->gl_name.ln_number;
 		__entry->cur_state	= glock_trace_state(gl->gl_state);
 		__entry->dmt_state	= glock_trace_state(gl->gl_demote_state);
-		__entry->flags		= gl->gl_flags;
+		__entry->flags		= gl->gl_flags  | (gl->gl_object ? (1UL<<GLF_OBJECT) : 0);
 	),
 
 	TP_printk("%u,%u glock %d:%lld demote %s to %s flags:%s",
@@ -318,6 +321,33 @@ TRACE_EVENT(gfs2_log_blocks,
 		  MINOR(__entry->dev), __entry->blocks)
 );
 
+/* Writing back the AIL */
+TRACE_EVENT(gfs2_ail_flush,
+
+	TP_PROTO(const struct gfs2_sbd *sdp, const struct writeback_control *wbc, int start),
+
+	TP_ARGS(sdp, wbc, start),
+
+	TP_STRUCT__entry(
+		__field(	dev_t,	dev			)
+		__field(	int, start			)
+		__field(	int, sync_mode			)
+		__field(	long, nr_to_write		)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= sdp->sd_vfs->s_dev;
+		__entry->start		= start;
+		__entry->sync_mode	= wbc->sync_mode;
+		__entry->nr_to_write	= wbc->nr_to_write;
+	),
+
+	TP_printk("%u,%u ail flush %s %s %ld", MAJOR(__entry->dev),
+		  MINOR(__entry->dev), __entry->start ? "start" : "end",
+		  __entry->sync_mode == WB_SYNC_ALL ? "all" : "none",
+		  __entry->nr_to_write)
+);
+
 /* Section 3 - bmap
  *
  * Objectives:
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index b9eeb1c..e7a0357 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -412,10 +412,10 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
 	pgoff = offset >> PAGE_SHIFT;
 
 	i_size_write(inode, offset);
-	spin_lock(&mapping->i_mmap_lock);
+	mutex_lock(&mapping->i_mmap_mutex);
 	if (!prio_tree_empty(&mapping->i_mmap))
 		hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
-	spin_unlock(&mapping->i_mmap_lock);
+	mutex_unlock(&mapping->i_mmap_mutex);
 	truncate_hugepages(inode, offset);
 	return 0;
 }
diff --git a/fs/inode.c b/fs/inode.c
index 33c963d..990d284 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -24,6 +24,7 @@
 #include <linux/mount.h>
 #include <linux/async.h>
 #include <linux/posix_acl.h>
+#include <linux/prefetch.h>
 #include <linux/ima.h>
 #include <linux/cred.h>
 #include "internal.h"
@@ -325,12 +326,11 @@ void address_space_init_once(struct address_space *mapping)
 	memset(mapping, 0, sizeof(*mapping));
 	INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
 	spin_lock_init(&mapping->tree_lock);
-	spin_lock_init(&mapping->i_mmap_lock);
+	mutex_init(&mapping->i_mmap_mutex);
 	INIT_LIST_HEAD(&mapping->private_list);
 	spin_lock_init(&mapping->private_lock);
 	INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
 	INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
-	mutex_init(&mapping->unmap_mutex);
 }
 EXPORT_SYMBOL(address_space_init_once);
 
@@ -751,8 +751,12 @@ static void prune_icache(int nr_to_scan)
  * This function is passed the number of inodes to scan, and it returns the
  * total number of remaining possibly-reclaimable inodes.
  */
-static int shrink_icache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
+static int shrink_icache_memory(struct shrinker *shrink,
+				struct shrink_control *sc)
 {
+	int nr = sc->nr_to_scan;
+	gfp_t gfp_mask = sc->gfp_mask;
+
 	if (nr) {
 		/*
 		 * Nasty deadlock avoidance.  We may hold various FS locks,
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 69b1804..72ffa97 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -302,12 +302,6 @@ void journal_commit_transaction(journal_t *journal)
 	 * all outstanding updates to complete.
 	 */
 
-#ifdef COMMIT_STATS
-	spin_lock(&journal->j_list_lock);
-	summarise_journal_usage(journal);
-	spin_unlock(&journal->j_list_lock);
-#endif
-
 	/* Do we need to erase the effects of a prior journal_flush? */
 	if (journal->j_flags & JFS_FLUSHED) {
 		jbd_debug(3, "super block updated\n");
@@ -722,8 +716,13 @@ wait_for_iobuf:
                    required. */
 		JBUFFER_TRACE(jh, "file as BJ_Forget");
 		journal_file_buffer(jh, commit_transaction, BJ_Forget);
-		/* Wake up any transactions which were waiting for this
-		   IO to complete */
+		/*
+		 * Wake up any transactions which were waiting for this
+		 * IO to complete. The barrier must be here so that changes
+		 * by journal_file_buffer() take effect before wake_up_bit()
+		 * does the waitqueue check.
+		 */
+		smp_mb();
 		wake_up_bit(&bh->b_state, BH_Unshadow);
 		JBUFFER_TRACE(jh, "brelse shadowed buffer");
 		__brelse(bh);
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index b3713af..e2d4285 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -437,9 +437,12 @@ int __log_space_left(journal_t *journal)
 int __log_start_commit(journal_t *journal, tid_t target)
 {
 	/*
-	 * Are we already doing a recent enough commit?
+	 * The only transaction we can possibly wait upon is the
+	 * currently running transaction (if it exists).  Otherwise,
+	 * the target tid must be an old one.
 	 */
-	if (!tid_geq(journal->j_commit_request, target)) {
+	if (journal->j_running_transaction &&
+	    journal->j_running_transaction->t_tid == target) {
 		/*
 		 * We want a new commit: OK, mark the request and wakeup the
 		 * commit thread.  We do _not_ do the commit ourselves.
@@ -451,7 +454,14 @@ int __log_start_commit(journal_t *journal, tid_t target)
 			  journal->j_commit_sequence);
 		wake_up(&journal->j_wait_commit);
 		return 1;
-	}
+	} else if (!tid_geq(journal->j_commit_request, target))
+		/* This should never happen, but if it does, preserve
+		   the evidence before kjournald goes into a loop and
+		   increments j_commit_sequence beyond all recognition. */
+		WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n",
+		    journal->j_commit_request, journal->j_commit_sequence,
+		    target, journal->j_running_transaction ?
+		    journal->j_running_transaction->t_tid : 0);
 	return 0;
 }
 
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 60d2319..f7ee81a 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -266,7 +266,8 @@ static handle_t *new_handle(int nblocks)
  * This function is visible to journal users (like ext3fs), so is not
  * called with the journal already locked.
  *
- * Return a pointer to a newly allocated handle, or NULL on failure
+ * Return a pointer to a newly allocated handle, or an ERR_PTR() value
+ * on failure.
  */
 handle_t *journal_start(journal_t *journal, int nblocks)
 {
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 6e28000..29148a8 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -338,12 +338,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	 * all outstanding updates to complete.
 	 */
 
-#ifdef COMMIT_STATS
-	spin_lock(&journal->j_list_lock);
-	summarise_journal_usage(journal);
-	spin_unlock(&journal->j_list_lock);
-#endif
-
 	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
 	if (journal->j_flags & JBD2_FLUSHED) {
 		jbd_debug(3, "super block updated\n");
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 1adc8d4..df0de27 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -10,6 +10,7 @@
 #include <linux/blkdev.h>
 #include <linux/buffer_head.h>
 #include <linux/gfp.h>
+#include <linux/prefetch.h>
 
 #define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
 
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 9e22085..d8d0938 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -481,7 +481,7 @@ static int inode_write_alias(struct super_block *sb,
 			val = inode_val0(inode);
 			break;
 		case INODE_USED_OFS:
-			val = cpu_to_be64(li->li_used_bytes);;
+			val = cpu_to_be64(li->li_used_bytes);
 			break;
 		case INODE_SIZE_OFS:
 			val = cpu_to_be64(i_size_read(inode));
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 2f174be..8c32ef3 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -90,7 +90,8 @@ static DEFINE_SPINLOCK(mb_cache_spinlock);
  * What the mbcache registers as to get shrunk dynamically.
  */
 
-static int mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask);
+static int mb_cache_shrink_fn(struct shrinker *shrink,
+			      struct shrink_control *sc);
 
 static struct shrinker mb_cache_shrinker = {
 	.shrink = mb_cache_shrink_fn,
@@ -156,18 +157,19 @@ forget:
  * gets low.
  *
  * @shrink: (ignored)
- * @nr_to_scan: Number of objects to scan
- * @gfp_mask: (ignored)
+ * @sc: shrink_control passed from reclaim
  *
  * Returns the number of objects which are present in the cache.
  */
 static int
-mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
+mb_cache_shrink_fn(struct shrinker *shrink, struct shrink_control *sc)
 {
 	LIST_HEAD(free_list);
 	struct mb_cache *cache;
 	struct mb_cache_entry *entry, *tmp;
 	int count = 0;
+	int nr_to_scan = sc->nr_to_scan;
+	gfp_t gfp_mask = sc->gfp_mask;
 
 	mb_debug("trying to free %d entries", nr_to_scan);
 	spin_lock(&mb_cache_spinlock);
diff --git a/fs/namei.c b/fs/namei.c
index e3c4f11..6ff858c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1378,12 +1378,12 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd)
 {
 	int res;
 
-	BUG_ON(nd->depth >= MAX_NESTED_LINKS);
 	if (unlikely(current->link_count >= MAX_NESTED_LINKS)) {
 		path_put_conditional(path, nd);
 		path_put(&nd->path);
 		return -ELOOP;
 	}
+	BUG_ON(nd->depth >= MAX_NESTED_LINKS);
 
 	nd->depth++;
 	current->link_count++;
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 0250e4c..202f370 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -461,7 +461,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 #endif
 	struct ncp_entry_info finfo;
 
-	data.wdog_pid = NULL;
+	memset(&data, 0, sizeof(data));
 	server = kzalloc(sizeof(struct ncp_server), GFP_KERNEL);
 	if (!server)
 		return -ENOMEM;
@@ -496,7 +496,6 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 				struct ncp_mount_data_v4* md = (struct ncp_mount_data_v4*)raw_data;
 
 				data.flags = md->flags;
-				data.int_flags = 0;
 				data.mounted_uid = md->mounted_uid;
 				data.wdog_pid = find_get_pid(md->wdog_pid);
 				data.ncp_fd = md->ncp_fd;
@@ -507,7 +506,6 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 				data.file_mode = md->file_mode;
 				data.dir_mode = md->dir_mode;
 				data.info_fd = -1;
-				data.mounted_vol[0] = 0;
 			}
 			break;
 		default:
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 7237672..424e477 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2042,11 +2042,14 @@ static void nfs_access_free_list(struct list_head *head)
 	}
 }
 
-int nfs_access_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
+int nfs_access_cache_shrinker(struct shrinker *shrink,
+			      struct shrink_control *sc)
 {
 	LIST_HEAD(head);
 	struct nfs_inode *nfsi, *next;
 	struct nfs_access_entry *cache;
+	int nr_to_scan = sc->nr_to_scan;
+	gfp_t gfp_mask = sc->gfp_mask;
 
 	if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
 		return (nr_to_scan == 0) ? 0 : -1;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index ce118ce..2df6ca7 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -234,7 +234,7 @@ extern int nfs_init_client(struct nfs_client *clp,
 
 /* dir.c */
 extern int nfs_access_cache_shrinker(struct shrinker *shrink,
-					int nr_to_scan, gfp_t gfp_mask);
+					struct shrink_control *sc);
 
 /* inode.c */
 extern struct workqueue_struct *nfsiod_workqueue;
diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c
index 5232d3e..a2e2402 100644
--- a/fs/nfsd/stats.c
+++ b/fs/nfsd/stats.c
@@ -8,7 +8,7 @@
  *			Statistsics for the reply cache
  *	fh <stale> <total-lookups> <anonlookups> <dir-not-in-dcache> <nondir-not-in-dcache>
  *			statistics for filehandle lookup
- *	io <bytes-read> <bytes-writtten>
+ *	io <bytes-read> <bytes-written>
  *			statistics for IO throughput
  *	th <threads> <fullcnt> <10%-20%> <20%-30%> ... <90%-100%> <100%> 
  *			time (seconds) when nfsd thread usage above thresholds
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index f768448..eed4d7b 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -489,8 +489,8 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
 void nilfs_palloc_commit_alloc_entry(struct inode *inode,
 				     struct nilfs_palloc_req *req)
 {
-	nilfs_mdt_mark_buffer_dirty(req->pr_bitmap_bh);
-	nilfs_mdt_mark_buffer_dirty(req->pr_desc_bh);
+	mark_buffer_dirty(req->pr_bitmap_bh);
+	mark_buffer_dirty(req->pr_desc_bh);
 	nilfs_mdt_mark_dirty(inode);
 
 	brelse(req->pr_bitmap_bh);
@@ -527,8 +527,8 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
 	kunmap(req->pr_bitmap_bh->b_page);
 	kunmap(req->pr_desc_bh->b_page);
 
-	nilfs_mdt_mark_buffer_dirty(req->pr_desc_bh);
-	nilfs_mdt_mark_buffer_dirty(req->pr_bitmap_bh);
+	mark_buffer_dirty(req->pr_desc_bh);
+	mark_buffer_dirty(req->pr_bitmap_bh);
 	nilfs_mdt_mark_dirty(inode);
 
 	brelse(req->pr_bitmap_bh);
@@ -683,8 +683,8 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
 		kunmap(bitmap_bh->b_page);
 		kunmap(desc_bh->b_page);
 
-		nilfs_mdt_mark_buffer_dirty(desc_bh);
-		nilfs_mdt_mark_buffer_dirty(bitmap_bh);
+		mark_buffer_dirty(desc_bh);
+		mark_buffer_dirty(bitmap_bh);
 		nilfs_mdt_mark_dirty(inode);
 
 		brelse(bitmap_bh);
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 4723f04..aadbd0b 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -34,7 +34,9 @@
 
 struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap)
 {
-	return NILFS_I_NILFS(bmap->b_inode)->ns_dat;
+	struct the_nilfs *nilfs = bmap->b_inode->i_sb->s_fs_info;
+
+	return nilfs->ns_dat;
 }
 
 static int nilfs_bmap_convert_error(struct nilfs_bmap *bmap,
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 609cd22..a35ae35 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -34,12 +34,6 @@
 #include "page.h"
 #include "btnode.h"
 
-void nilfs_btnode_cache_init(struct address_space *btnc,
-			     struct backing_dev_info *bdi)
-{
-	nilfs_mapping_init(btnc, bdi);
-}
-
 void nilfs_btnode_cache_clear(struct address_space *btnc)
 {
 	invalidate_mapping_pages(btnc, 0, -1);
@@ -62,7 +56,7 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr)
 		BUG();
 	}
 	memset(bh->b_data, 0, 1 << inode->i_blkbits);
-	bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
+	bh->b_bdev = inode->i_sb->s_bdev;
 	bh->b_blocknr = blocknr;
 	set_buffer_mapped(bh);
 	set_buffer_uptodate(bh);
@@ -94,10 +88,11 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
 	if (pblocknr == 0) {
 		pblocknr = blocknr;
 		if (inode->i_ino != NILFS_DAT_INO) {
-			struct inode *dat = NILFS_I_NILFS(inode)->ns_dat;
+			struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
 
 			/* blocknr is a virtual block number */
-			err = nilfs_dat_translate(dat, blocknr, &pblocknr);
+			err = nilfs_dat_translate(nilfs->ns_dat, blocknr,
+						  &pblocknr);
 			if (unlikely(err)) {
 				brelse(bh);
 				goto out_locked;
@@ -120,7 +115,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
 		goto found;
 	}
 	set_buffer_mapped(bh);
-	bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
+	bh->b_bdev = inode->i_sb->s_bdev;
 	bh->b_blocknr = pblocknr; /* set block address for read */
 	bh->b_end_io = end_buffer_read_sync;
 	get_bh(bh);
@@ -259,7 +254,7 @@ void nilfs_btnode_commit_change_key(struct address_space *btnc,
 				       "invalid oldkey %lld (newkey=%lld)",
 				       (unsigned long long)oldkey,
 				       (unsigned long long)newkey);
-		nilfs_btnode_mark_dirty(obh);
+		mark_buffer_dirty(obh);
 
 		spin_lock_irq(&btnc->tree_lock);
 		radix_tree_delete(&btnc->page_tree, oldkey);
@@ -271,7 +266,7 @@ void nilfs_btnode_commit_change_key(struct address_space *btnc,
 		unlock_page(opage);
 	} else {
 		nilfs_copy_buffer(nbh, obh);
-		nilfs_btnode_mark_dirty(nbh);
+		mark_buffer_dirty(nbh);
 
 		nbh->b_blocknr = newkey;
 		ctxt->bh = nbh;
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
index 1b8ebd8..3a4dd2d 100644
--- a/fs/nilfs2/btnode.h
+++ b/fs/nilfs2/btnode.h
@@ -37,7 +37,6 @@ struct nilfs_btnode_chkey_ctxt {
 	struct buffer_head *newbh;
 };
 
-void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *);
 void nilfs_btnode_cache_clear(struct address_space *);
 struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc,
 					      __u64 blocknr);
@@ -51,7 +50,4 @@ void nilfs_btnode_commit_change_key(struct address_space *,
 void nilfs_btnode_abort_change_key(struct address_space *,
 				   struct nilfs_btnode_chkey_ctxt *);
 
-#define nilfs_btnode_mark_dirty(bh)	nilfs_mark_buffer_dirty(bh)
-
-
 #endif	/* _NILFS_BTNODE_H */
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index d451ae0..7eafe46 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -714,7 +714,7 @@ static void nilfs_btree_promote_key(struct nilfs_bmap *btree,
 				nilfs_btree_get_nonroot_node(path, level),
 				path[level].bp_index, key);
 			if (!buffer_dirty(path[level].bp_bh))
-				nilfs_btnode_mark_dirty(path[level].bp_bh);
+				mark_buffer_dirty(path[level].bp_bh);
 		} while ((path[level].bp_index == 0) &&
 			 (++level < nilfs_btree_height(btree) - 1));
 	}
@@ -739,7 +739,7 @@ static void nilfs_btree_do_insert(struct nilfs_bmap *btree,
 		nilfs_btree_node_insert(node, path[level].bp_index,
 					*keyp, *ptrp, ncblk);
 		if (!buffer_dirty(path[level].bp_bh))
-			nilfs_btnode_mark_dirty(path[level].bp_bh);
+			mark_buffer_dirty(path[level].bp_bh);
 
 		if (path[level].bp_index == 0)
 			nilfs_btree_promote_key(btree, path, level + 1,
@@ -777,9 +777,9 @@ static void nilfs_btree_carry_left(struct nilfs_bmap *btree,
 	nilfs_btree_node_move_left(left, node, n, ncblk, ncblk);
 
 	if (!buffer_dirty(path[level].bp_bh))
-		nilfs_btnode_mark_dirty(path[level].bp_bh);
+		mark_buffer_dirty(path[level].bp_bh);
 	if (!buffer_dirty(path[level].bp_sib_bh))
-		nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
+		mark_buffer_dirty(path[level].bp_sib_bh);
 
 	nilfs_btree_promote_key(btree, path, level + 1,
 				nilfs_btree_node_get_key(node, 0));
@@ -823,9 +823,9 @@ static void nilfs_btree_carry_right(struct nilfs_bmap *btree,
 	nilfs_btree_node_move_right(node, right, n, ncblk, ncblk);
 
 	if (!buffer_dirty(path[level].bp_bh))
-		nilfs_btnode_mark_dirty(path[level].bp_bh);
+		mark_buffer_dirty(path[level].bp_bh);
 	if (!buffer_dirty(path[level].bp_sib_bh))
-		nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
+		mark_buffer_dirty(path[level].bp_sib_bh);
 
 	path[level + 1].bp_index++;
 	nilfs_btree_promote_key(btree, path, level + 1,
@@ -870,9 +870,9 @@ static void nilfs_btree_split(struct nilfs_bmap *btree,
 	nilfs_btree_node_move_right(node, right, n, ncblk, ncblk);
 
 	if (!buffer_dirty(path[level].bp_bh))
-		nilfs_btnode_mark_dirty(path[level].bp_bh);
+		mark_buffer_dirty(path[level].bp_bh);
 	if (!buffer_dirty(path[level].bp_sib_bh))
-		nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
+		mark_buffer_dirty(path[level].bp_sib_bh);
 
 	newkey = nilfs_btree_node_get_key(right, 0);
 	newptr = path[level].bp_newreq.bpr_ptr;
@@ -919,7 +919,7 @@ static void nilfs_btree_grow(struct nilfs_bmap *btree,
 	nilfs_btree_node_set_level(root, level + 1);
 
 	if (!buffer_dirty(path[level].bp_sib_bh))
-		nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
+		mark_buffer_dirty(path[level].bp_sib_bh);
 
 	path[level].bp_bh = path[level].bp_sib_bh;
 	path[level].bp_sib_bh = NULL;
@@ -1194,7 +1194,7 @@ static void nilfs_btree_do_delete(struct nilfs_bmap *btree,
 		nilfs_btree_node_delete(node, path[level].bp_index,
 					keyp, ptrp, ncblk);
 		if (!buffer_dirty(path[level].bp_bh))
-			nilfs_btnode_mark_dirty(path[level].bp_bh);
+			mark_buffer_dirty(path[level].bp_bh);
 		if (path[level].bp_index == 0)
 			nilfs_btree_promote_key(btree, path, level + 1,
 				nilfs_btree_node_get_key(node, 0));
@@ -1226,9 +1226,9 @@ static void nilfs_btree_borrow_left(struct nilfs_bmap *btree,
 	nilfs_btree_node_move_right(left, node, n, ncblk, ncblk);
 
 	if (!buffer_dirty(path[level].bp_bh))
-		nilfs_btnode_mark_dirty(path[level].bp_bh);
+		mark_buffer_dirty(path[level].bp_bh);
 	if (!buffer_dirty(path[level].bp_sib_bh))
-		nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
+		mark_buffer_dirty(path[level].bp_sib_bh);
 
 	nilfs_btree_promote_key(btree, path, level + 1,
 				nilfs_btree_node_get_key(node, 0));
@@ -1258,9 +1258,9 @@ static void nilfs_btree_borrow_right(struct nilfs_bmap *btree,
 	nilfs_btree_node_move_left(node, right, n, ncblk, ncblk);
 
 	if (!buffer_dirty(path[level].bp_bh))
-		nilfs_btnode_mark_dirty(path[level].bp_bh);
+		mark_buffer_dirty(path[level].bp_bh);
 	if (!buffer_dirty(path[level].bp_sib_bh))
-		nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
+		mark_buffer_dirty(path[level].bp_sib_bh);
 
 	path[level + 1].bp_index++;
 	nilfs_btree_promote_key(btree, path, level + 1,
@@ -1289,7 +1289,7 @@ static void nilfs_btree_concat_left(struct nilfs_bmap *btree,
 	nilfs_btree_node_move_left(left, node, n, ncblk, ncblk);
 
 	if (!buffer_dirty(path[level].bp_sib_bh))
-		nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
+		mark_buffer_dirty(path[level].bp_sib_bh);
 
 	nilfs_btnode_delete(path[level].bp_bh);
 	path[level].bp_bh = path[level].bp_sib_bh;
@@ -1315,7 +1315,7 @@ static void nilfs_btree_concat_right(struct nilfs_bmap *btree,
 	nilfs_btree_node_move_left(node, right, n, ncblk, ncblk);
 
 	if (!buffer_dirty(path[level].bp_bh))
-		nilfs_btnode_mark_dirty(path[level].bp_bh);
+		mark_buffer_dirty(path[level].bp_bh);
 
 	nilfs_btnode_delete(path[level].bp_sib_bh);
 	path[level].bp_sib_bh = NULL;
@@ -1709,7 +1709,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *btree,
 		nilfs_btree_node_init(node, 0, 1, n, ncblk, keys, ptrs);
 		nilfs_btree_node_insert(node, n, key, dreq->bpr_ptr, ncblk);
 		if (!buffer_dirty(bh))
-			nilfs_btnode_mark_dirty(bh);
+			mark_buffer_dirty(bh);
 		if (!nilfs_bmap_dirty(btree))
 			nilfs_bmap_set_dirty(btree);
 
@@ -1787,7 +1787,7 @@ static int nilfs_btree_propagate_p(struct nilfs_bmap *btree,
 {
 	while ((++level < nilfs_btree_height(btree) - 1) &&
 	       !buffer_dirty(path[level].bp_bh))
-		nilfs_btnode_mark_dirty(path[level].bp_bh);
+		mark_buffer_dirty(path[level].bp_bh);
 
 	return 0;
 }
@@ -2229,7 +2229,7 @@ static int nilfs_btree_mark(struct nilfs_bmap *btree, __u64 key, int level)
 	}
 
 	if (!buffer_dirty(bh))
-		nilfs_btnode_mark_dirty(bh);
+		mark_buffer_dirty(bh);
 	brelse(bh);
 	if (!nilfs_bmap_dirty(btree))
 		nilfs_bmap_set_dirty(btree);
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 5ff15a8..c9b342c 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -216,14 +216,14 @@ int nilfs_cpfile_get_checkpoint(struct inode *cpfile,
 		if (!nilfs_cpfile_is_in_first(cpfile, cno))
 			nilfs_cpfile_block_add_valid_checkpoints(cpfile, cp_bh,
 								 kaddr, 1);
-		nilfs_mdt_mark_buffer_dirty(cp_bh);
+		mark_buffer_dirty(cp_bh);
 
 		kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
 		header = nilfs_cpfile_block_get_header(cpfile, header_bh,
 						       kaddr);
 		le64_add_cpu(&header->ch_ncheckpoints, 1);
 		kunmap_atomic(kaddr, KM_USER0);
-		nilfs_mdt_mark_buffer_dirty(header_bh);
+		mark_buffer_dirty(header_bh);
 		nilfs_mdt_mark_dirty(cpfile);
 	}
 
@@ -326,7 +326,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
 		}
 		if (nicps > 0) {
 			tnicps += nicps;
-			nilfs_mdt_mark_buffer_dirty(cp_bh);
+			mark_buffer_dirty(cp_bh);
 			nilfs_mdt_mark_dirty(cpfile);
 			if (!nilfs_cpfile_is_in_first(cpfile, cno)) {
 				count =
@@ -358,7 +358,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
 		header = nilfs_cpfile_block_get_header(cpfile, header_bh,
 						       kaddr);
 		le64_add_cpu(&header->ch_ncheckpoints, -(u64)tnicps);
-		nilfs_mdt_mark_buffer_dirty(header_bh);
+		mark_buffer_dirty(header_bh);
 		nilfs_mdt_mark_dirty(cpfile);
 		kunmap_atomic(kaddr, KM_USER0);
 	}
@@ -671,10 +671,10 @@ static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno)
 	le64_add_cpu(&header->ch_nsnapshots, 1);
 	kunmap_atomic(kaddr, KM_USER0);
 
-	nilfs_mdt_mark_buffer_dirty(prev_bh);
-	nilfs_mdt_mark_buffer_dirty(curr_bh);
-	nilfs_mdt_mark_buffer_dirty(cp_bh);
-	nilfs_mdt_mark_buffer_dirty(header_bh);
+	mark_buffer_dirty(prev_bh);
+	mark_buffer_dirty(curr_bh);
+	mark_buffer_dirty(cp_bh);
+	mark_buffer_dirty(header_bh);
 	nilfs_mdt_mark_dirty(cpfile);
 
 	brelse(prev_bh);
@@ -774,10 +774,10 @@ static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno)
 	le64_add_cpu(&header->ch_nsnapshots, -1);
 	kunmap_atomic(kaddr, KM_USER0);
 
-	nilfs_mdt_mark_buffer_dirty(next_bh);
-	nilfs_mdt_mark_buffer_dirty(prev_bh);
-	nilfs_mdt_mark_buffer_dirty(cp_bh);
-	nilfs_mdt_mark_buffer_dirty(header_bh);
+	mark_buffer_dirty(next_bh);
+	mark_buffer_dirty(prev_bh);
+	mark_buffer_dirty(cp_bh);
+	mark_buffer_dirty(header_bh);
 	nilfs_mdt_mark_dirty(cpfile);
 
 	brelse(prev_bh);
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 59e5fe7..fcc2f86 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -54,7 +54,7 @@ static int nilfs_dat_prepare_entry(struct inode *dat,
 static void nilfs_dat_commit_entry(struct inode *dat,
 				   struct nilfs_palloc_req *req)
 {
-	nilfs_mdt_mark_buffer_dirty(req->pr_entry_bh);
+	mark_buffer_dirty(req->pr_entry_bh);
 	nilfs_mdt_mark_dirty(dat);
 	brelse(req->pr_entry_bh);
 }
@@ -361,7 +361,7 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
 	entry->de_blocknr = cpu_to_le64(blocknr);
 	kunmap_atomic(kaddr, KM_USER0);
 
-	nilfs_mdt_mark_buffer_dirty(entry_bh);
+	mark_buffer_dirty(entry_bh);
 	nilfs_mdt_mark_dirty(dat);
 
 	brelse(entry_bh);
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 397e732..d7eeca6 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -111,7 +111,6 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	nilfs_transaction_commit(inode->i_sb);
 
  mapped:
-	SetPageChecked(page);
 	wait_on_page_writeback(page);
 	return VM_FAULT_LOCKED;
 }
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 1c2a3e2..08a07a2 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -48,9 +48,6 @@
 #include "dat.h"
 #include "ifile.h"
 
-static const struct address_space_operations def_gcinode_aops = {
-};
-
 /*
  * nilfs_gccache_submit_read_data() - add data buffer and submit read request
  * @inode - gc inode
@@ -87,9 +84,9 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff,
 		goto out;
 
 	if (pbn == 0) {
-		struct inode *dat_inode = NILFS_I_NILFS(inode)->ns_dat;
-					  /* use original dat, not gc dat. */
-		err = nilfs_dat_translate(dat_inode, vbn, &pbn);
+		struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
+
+		err = nilfs_dat_translate(nilfs->ns_dat, vbn, &pbn);
 		if (unlikely(err)) { /* -EIO, -ENOMEM, -ENOENT */
 			brelse(bh);
 			goto failed;
@@ -103,7 +100,7 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff,
 	}
 
 	if (!buffer_mapped(bh)) {
-		bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
+		bh->b_bdev = inode->i_sb->s_bdev;
 		set_buffer_mapped(bh);
 	}
 	bh->b_blocknr = pbn;
@@ -160,15 +157,11 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh)
 	if (buffer_dirty(bh))
 		return -EEXIST;
 
-	if (buffer_nilfs_node(bh)) {
-		if (nilfs_btree_broken_node_block(bh)) {
-			clear_buffer_uptodate(bh);
-			return -EIO;
-		}
-		nilfs_btnode_mark_dirty(bh);
-	} else {
-		nilfs_mark_buffer_dirty(bh);
+	if (buffer_nilfs_node(bh) && nilfs_btree_broken_node_block(bh)) {
+		clear_buffer_uptodate(bh);
+		return -EIO;
 	}
+	mark_buffer_dirty(bh);
 	return 0;
 }
 
@@ -178,7 +171,7 @@ int nilfs_init_gcinode(struct inode *inode)
 
 	inode->i_mode = S_IFREG;
 	mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
-	inode->i_mapping->a_ops = &def_gcinode_aops;
+	inode->i_mapping->a_ops = &empty_aops;
 	inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi;
 
 	ii->i_flags = 0;
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index bfc73d3..684d763 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -80,7 +80,7 @@ int nilfs_ifile_create_inode(struct inode *ifile, ino_t *out_ino,
 		return ret;
 	}
 	nilfs_palloc_commit_alloc_entry(ifile, &req);
-	nilfs_mdt_mark_buffer_dirty(req.pr_entry_bh);
+	mark_buffer_dirty(req.pr_entry_bh);
 	nilfs_mdt_mark_dirty(ifile);
 	*out_ino = (ino_t)req.pr_entry_nr;
 	*out_bh = req.pr_entry_bh;
@@ -128,7 +128,7 @@ int nilfs_ifile_delete_inode(struct inode *ifile, ino_t ino)
 	raw_inode->i_flags = 0;
 	kunmap_atomic(kaddr, KM_USER0);
 
-	nilfs_mdt_mark_buffer_dirty(req.pr_entry_bh);
+	mark_buffer_dirty(req.pr_entry_bh);
 	brelse(req.pr_entry_bh);
 
 	nilfs_palloc_commit_free_entry(ifile, &req);
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index c0aa274..587f184 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -74,14 +74,14 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
 		    struct buffer_head *bh_result, int create)
 {
 	struct nilfs_inode_info *ii = NILFS_I(inode);
+	struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
 	__u64 blknum = 0;
 	int err = 0, ret;
-	struct inode *dat = NILFS_I_NILFS(inode)->ns_dat;
 	unsigned maxblocks = bh_result->b_size >> inode->i_blkbits;
 
-	down_read(&NILFS_MDT(dat)->mi_sem);
+	down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
 	ret = nilfs_bmap_lookup_contig(ii->i_bmap, blkoff, &blknum, maxblocks);
-	up_read(&NILFS_MDT(dat)->mi_sem);
+	up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
 	if (ret >= 0) {	/* found */
 		map_bh(bh_result, inode->i_sb, blknum);
 		if (ret > 0)
@@ -596,6 +596,16 @@ void nilfs_write_inode_common(struct inode *inode,
 	raw_inode->i_flags = cpu_to_le32(ii->i_flags);
 	raw_inode->i_generation = cpu_to_le32(inode->i_generation);
 
+	if (NILFS_ROOT_METADATA_FILE(inode->i_ino)) {
+		struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
+
+		/* zero-fill unused portion in the case of super root block */
+		raw_inode->i_xattr = 0;
+		raw_inode->i_pad = 0;
+		memset((void *)raw_inode + sizeof(*raw_inode), 0,
+		       nilfs->ns_inode_size - sizeof(*raw_inode));
+	}
+
 	if (has_bmap)
 		nilfs_bmap_write(ii->i_bmap, raw_inode);
 	else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
@@ -872,8 +882,7 @@ int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty)
 			return -EINVAL; /* NILFS_I_DIRTY may remain for
 					   freeing inode */
 		}
-		list_del(&ii->i_dirty);
-		list_add_tail(&ii->i_dirty, &nilfs->ns_dirty_files);
+		list_move_tail(&ii->i_dirty, &nilfs->ns_dirty_files);
 		set_bit(NILFS_I_QUEUED, &ii->i_state);
 	}
 	spin_unlock(&nilfs->ns_inode_lock);
@@ -892,7 +901,7 @@ int nilfs_mark_inode_dirty(struct inode *inode)
 		return err;
 	}
 	nilfs_update_inode(inode, ibh);
-	nilfs_mdt_mark_buffer_dirty(ibh);
+	mark_buffer_dirty(ibh);
 	nilfs_mdt_mark_dirty(NILFS_I(inode)->i_root->ifile);
 	brelse(ibh);
 	return 0;
@@ -931,7 +940,7 @@ void nilfs_dirty_inode(struct inode *inode)
 int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		 __u64 start, __u64 len)
 {
-	struct the_nilfs *nilfs = NILFS_I_NILFS(inode);
+	struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
 	__u64 logical = 0, phys = 0, size = 0;
 	__u32 flags = 0;
 	loff_t isize;
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index f2469ba..41d6743 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -698,6 +698,63 @@ static int nilfs_ioctl_sync(struct inode *inode, struct file *filp,
 	return 0;
 }
 
+static int nilfs_ioctl_resize(struct inode *inode, struct file *filp,
+			      void __user *argp)
+{
+	__u64 newsize;
+	int ret = -EPERM;
+
+	if (!capable(CAP_SYS_ADMIN))
+		goto out;
+
+	ret = mnt_want_write(filp->f_path.mnt);
+	if (ret)
+		goto out;
+
+	ret = -EFAULT;
+	if (copy_from_user(&newsize, argp, sizeof(newsize)))
+		goto out_drop_write;
+
+	ret = nilfs_resize_fs(inode->i_sb, newsize);
+
+out_drop_write:
+	mnt_drop_write(filp->f_path.mnt);
+out:
+	return ret;
+}
+
+static int nilfs_ioctl_set_alloc_range(struct inode *inode, void __user *argp)
+{
+	struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
+	__u64 range[2];
+	__u64 minseg, maxseg;
+	unsigned long segbytes;
+	int ret = -EPERM;
+
+	if (!capable(CAP_SYS_ADMIN))
+		goto out;
+
+	ret = -EFAULT;
+	if (copy_from_user(range, argp, sizeof(__u64[2])))
+		goto out;
+
+	ret = -ERANGE;
+	if (range[1] > i_size_read(inode->i_sb->s_bdev->bd_inode))
+		goto out;
+
+	segbytes = nilfs->ns_blocks_per_segment * nilfs->ns_blocksize;
+
+	minseg = range[0] + segbytes - 1;
+	do_div(minseg, segbytes);
+	maxseg = NILFS_SB2_OFFSET_BYTES(range[1]);
+	do_div(maxseg, segbytes);
+	maxseg--;
+
+	ret = nilfs_sufile_set_alloc_range(nilfs->ns_sufile, minseg, maxseg);
+out:
+	return ret;
+}
+
 static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp,
 				unsigned int cmd, void __user *argp,
 				size_t membsz,
@@ -763,6 +820,10 @@ long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		return nilfs_ioctl_clean_segments(inode, filp, cmd, argp);
 	case NILFS_IOCTL_SYNC:
 		return nilfs_ioctl_sync(inode, filp, cmd, argp);
+	case NILFS_IOCTL_RESIZE:
+		return nilfs_ioctl_resize(inode, filp, argp);
+	case NILFS_IOCTL_SET_ALLOC_RANGE:
+		return nilfs_ioctl_set_alloc_range(inode, argp);
 	default:
 		return -ENOTTY;
 	}
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index a649b05..800e8d7 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -66,7 +66,7 @@ nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block,
 	kunmap_atomic(kaddr, KM_USER0);
 
 	set_buffer_uptodate(bh);
-	nilfs_mark_buffer_dirty(bh);
+	mark_buffer_dirty(bh);
 	nilfs_mdt_mark_dirty(inode);
 	return 0;
 }
@@ -355,7 +355,7 @@ int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block)
 	err = nilfs_mdt_read_block(inode, block, 0, &bh);
 	if (unlikely(err))
 		return err;
-	nilfs_mark_buffer_dirty(bh);
+	mark_buffer_dirty(bh);
 	nilfs_mdt_mark_dirty(inode);
 	brelse(bh);
 	return 0;
@@ -450,9 +450,9 @@ int nilfs_mdt_setup_shadow_map(struct inode *inode,
 
 	INIT_LIST_HEAD(&shadow->frozen_buffers);
 	address_space_init_once(&shadow->frozen_data);
-	nilfs_mapping_init(&shadow->frozen_data, bdi);
+	nilfs_mapping_init(&shadow->frozen_data, inode, bdi);
 	address_space_init_once(&shadow->frozen_btnodes);
-	nilfs_mapping_init(&shadow->frozen_btnodes, bdi);
+	nilfs_mapping_init(&shadow->frozen_btnodes, inode, bdi);
 	mi->mi_shadow = shadow;
 	return 0;
 }
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index ed68563..ab20a4b 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -64,11 +64,6 @@ static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode)
 	return inode->i_private;
 }
 
-static inline struct the_nilfs *NILFS_I_NILFS(struct inode *inode)
-{
-	return inode->i_sb->s_fs_info;
-}
-
 /* Default GFP flags using highmem */
 #define NILFS_MDT_GFP      (__GFP_WAIT | __GFP_IO | __GFP_HIGHMEM)
 
@@ -93,8 +88,6 @@ int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh);
 struct buffer_head *nilfs_mdt_get_frozen_buffer(struct inode *inode,
 						struct buffer_head *bh);
 
-#define nilfs_mdt_mark_buffer_dirty(bh)	nilfs_mark_buffer_dirty(bh)
-
 static inline void nilfs_mdt_mark_dirty(struct inode *inode)
 {
 	if (!test_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state))
@@ -108,7 +101,7 @@ static inline void nilfs_mdt_clear_dirty(struct inode *inode)
 
 static inline __u64 nilfs_mdt_cno(struct inode *inode)
 {
-	return NILFS_I_NILFS(inode)->ns_cno;
+	return ((struct the_nilfs *)inode->i_sb->s_fs_info)->ns_cno;
 }
 
 #define nilfs_mdt_bgl_lock(inode, bg) \
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index a8dd344..a9c6a53 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -80,12 +80,6 @@ static inline struct inode *NILFS_BTNC_I(struct address_space *btnc)
 	return &ii->vfs_inode;
 }
 
-static inline struct inode *NILFS_AS_I(struct address_space *mapping)
-{
-	return (mapping->host) ? :
-		container_of(mapping, struct inode, i_data);
-}
-
 /*
  * Dynamic state flags of NILFS on-memory inode (i_state)
  */
@@ -298,6 +292,7 @@ struct nilfs_super_block **nilfs_prepare_super(struct super_block *sb,
 					       int flip);
 int nilfs_commit_super(struct super_block *sb, int flag);
 int nilfs_cleanup_super(struct super_block *sb);
+int nilfs_resize_fs(struct super_block *sb, __u64 newsize);
 int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt,
 			    struct nilfs_root **root);
 int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno);
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 1168059..65221a0 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -37,8 +37,7 @@
 
 #define NILFS_BUFFER_INHERENT_BITS  \
 	((1UL << BH_Uptodate) | (1UL << BH_Mapped) | (1UL << BH_NILFS_Node) | \
-	 (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Allocated) | \
-	 (1UL << BH_NILFS_Checked))
+	 (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Checked))
 
 static struct buffer_head *
 __nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index,
@@ -59,19 +58,6 @@ __nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index,
 	return bh;
 }
 
-/*
- * Since the page cache of B-tree node pages or data page cache of pseudo
- * inodes does not have a valid mapping->host pointer, calling
- * mark_buffer_dirty() for their buffers causes a NULL pointer dereference;
- * it calls __mark_inode_dirty(NULL) through __set_page_dirty().
- * To avoid this problem, the old style mark_buffer_dirty() is used instead.
- */
-void nilfs_mark_buffer_dirty(struct buffer_head *bh)
-{
-	if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh))
-		__set_page_dirty_nobuffers(bh->b_page);
-}
-
 struct buffer_head *nilfs_grab_buffer(struct inode *inode,
 				      struct address_space *mapping,
 				      unsigned long blkoff,
@@ -183,7 +169,7 @@ int nilfs_page_buffers_clean(struct page *page)
 void nilfs_page_bug(struct page *page)
 {
 	struct address_space *m;
-	unsigned long ino = 0;
+	unsigned long ino;
 
 	if (unlikely(!page)) {
 		printk(KERN_CRIT "NILFS_PAGE_BUG(NULL)\n");
@@ -191,11 +177,8 @@ void nilfs_page_bug(struct page *page)
 	}
 
 	m = page->mapping;
-	if (m) {
-		struct inode *inode = NILFS_AS_I(m);
-		if (inode != NULL)
-			ino = inode->i_ino;
-	}
+	ino = m ? m->host->i_ino : 0;
+
 	printk(KERN_CRIT "NILFS_PAGE_BUG(%p): cnt=%d index#=%llu flags=0x%lx "
 	       "mapping=%p ino=%lu\n",
 	       page, atomic_read(&page->_count),
@@ -217,56 +200,6 @@ void nilfs_page_bug(struct page *page)
 }
 
 /**
- * nilfs_alloc_private_page - allocate a private page with buffer heads
- *
- * Return Value: On success, a pointer to the allocated page is returned.
- * On error, NULL is returned.
- */
-struct page *nilfs_alloc_private_page(struct block_device *bdev, int size,
-				      unsigned long state)
-{
-	struct buffer_head *bh, *head, *tail;
-	struct page *page;
-
-	page = alloc_page(GFP_NOFS); /* page_count of the returned page is 1 */
-	if (unlikely(!page))
-		return NULL;
-
-	lock_page(page);
-	head = alloc_page_buffers(page, size, 0);
-	if (unlikely(!head)) {
-		unlock_page(page);
-		__free_page(page);
-		return NULL;
-	}
-
-	bh = head;
-	do {
-		bh->b_state = (1UL << BH_NILFS_Allocated) | state;
-		tail = bh;
-		bh->b_bdev = bdev;
-		bh = bh->b_this_page;
-	} while (bh);
-
-	tail->b_this_page = head;
-	attach_page_buffers(page, head);
-
-	return page;
-}
-
-void nilfs_free_private_page(struct page *page)
-{
-	BUG_ON(!PageLocked(page));
-	BUG_ON(page->mapping);
-
-	if (page_has_buffers(page) && !try_to_free_buffers(page))
-		NILFS_PAGE_BUG(page, "failed to free page");
-
-	unlock_page(page);
-	__free_page(page);
-}
-
-/**
  * nilfs_copy_page -- copy the page with buffers
  * @dst: destination page
  * @src: source page
@@ -492,10 +425,10 @@ unsigned nilfs_page_count_clean_buffers(struct page *page,
 	return nc;
 }
 
-void nilfs_mapping_init(struct address_space *mapping,
+void nilfs_mapping_init(struct address_space *mapping, struct inode *inode,
 			struct backing_dev_info *bdi)
 {
-	mapping->host = NULL;
+	mapping->host = inode;
 	mapping->flags = 0;
 	mapping_set_gfp_mask(mapping, GFP_NOFS);
 	mapping->assoc_mapping = NULL;
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index f06b79a..fb7de71 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -38,14 +38,12 @@ enum {
 	BH_NILFS_Redirected,
 };
 
-BUFFER_FNS(NILFS_Allocated, nilfs_allocated)	/* nilfs private buffers */
 BUFFER_FNS(NILFS_Node, nilfs_node)		/* nilfs node buffers */
 BUFFER_FNS(NILFS_Volatile, nilfs_volatile)
 BUFFER_FNS(NILFS_Checked, nilfs_checked)	/* buffer is verified */
 BUFFER_FNS(NILFS_Redirected, nilfs_redirected)	/* redirected to a copy */
 
 
-void nilfs_mark_buffer_dirty(struct buffer_head *bh);
 int __nilfs_clear_page_dirty(struct page *);
 
 struct buffer_head *nilfs_grab_buffer(struct inode *, struct address_space *,
@@ -54,14 +52,11 @@ void nilfs_forget_buffer(struct buffer_head *);
 void nilfs_copy_buffer(struct buffer_head *, struct buffer_head *);
 int nilfs_page_buffers_clean(struct page *);
 void nilfs_page_bug(struct page *);
-struct page *nilfs_alloc_private_page(struct block_device *, int,
-				      unsigned long);
-void nilfs_free_private_page(struct page *);
 
 int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
 void nilfs_copy_back_pages(struct address_space *, struct address_space *);
 void nilfs_clear_dirty_pages(struct address_space *);
-void nilfs_mapping_init(struct address_space *mapping,
+void nilfs_mapping_init(struct address_space *mapping, struct inode *inode,
 			struct backing_dev_info *bdi);
 unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
 unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index ba4a645..a604ac0 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -387,9 +387,9 @@ static int nilfs_scan_dsync_log(struct the_nilfs *nilfs, sector_t start_blocknr,
 static void dispose_recovery_list(struct list_head *head)
 {
 	while (!list_empty(head)) {
-		struct nilfs_recovery_block *rb
-			= list_entry(head->next,
-				     struct nilfs_recovery_block, list);
+		struct nilfs_recovery_block *rb;
+
+		rb = list_first_entry(head, struct nilfs_recovery_block, list);
 		list_del(&rb->list);
 		kfree(rb);
 	}
@@ -416,9 +416,9 @@ static int nilfs_segment_list_add(struct list_head *head, __u64 segnum)
 void nilfs_dispose_segment_list(struct list_head *head)
 {
 	while (!list_empty(head)) {
-		struct nilfs_segment_entry *ent
-			= list_entry(head->next,
-				     struct nilfs_segment_entry, list);
+		struct nilfs_segment_entry *ent;
+
+		ent = list_first_entry(head, struct nilfs_segment_entry, list);
 		list_del(&ent->list);
 		kfree(ent);
 	}
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 2853ff2..850a7c0 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -239,12 +239,15 @@ nilfs_segbuf_fill_in_super_root_crc(struct nilfs_segment_buffer *segbuf,
 				    u32 seed)
 {
 	struct nilfs_super_root *raw_sr;
+	struct the_nilfs *nilfs = segbuf->sb_super->s_fs_info;
+	unsigned srsize;
 	u32 crc;
 
 	raw_sr = (struct nilfs_super_root *)segbuf->sb_super_root->b_data;
+	srsize = NILFS_SR_BYTES(nilfs->ns_inode_size);
 	crc = crc32_le(seed,
 		       (unsigned char *)raw_sr + sizeof(raw_sr->sr_sum),
-		       NILFS_SR_BYTES - sizeof(raw_sr->sr_sum));
+		       srsize - sizeof(raw_sr->sr_sum));
 	raw_sr->sr_sum = cpu_to_le32(crc);
 }
 
@@ -254,18 +257,6 @@ static void nilfs_release_buffers(struct list_head *list)
 
 	list_for_each_entry_safe(bh, n, list, b_assoc_buffers) {
 		list_del_init(&bh->b_assoc_buffers);
-		if (buffer_nilfs_allocated(bh)) {
-			struct page *clone_page = bh->b_page;
-
-			/* remove clone page */
-			brelse(bh);
-			page_cache_release(clone_page); /* for each bh */
-			if (page_count(clone_page) <= 2) {
-				lock_page(clone_page);
-				nilfs_free_private_page(clone_page);
-			}
-			continue;
-		}
 		brelse(bh);
 	}
 }
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index afe4f21..141646e 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -655,13 +655,10 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode,
 		if (unlikely(page->index > last))
 			break;
 
-		if (mapping->host) {
-			lock_page(page);
-			if (!page_has_buffers(page))
-				create_empty_buffers(page,
-						     1 << inode->i_blkbits, 0);
-			unlock_page(page);
-		}
+		lock_page(page);
+		if (!page_has_buffers(page))
+			create_empty_buffers(page, 1 << inode->i_blkbits, 0);
+		unlock_page(page);
 
 		bh = head = page_buffers(page);
 		do {
@@ -809,7 +806,7 @@ static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci)
 		/* The following code is duplicated with cpfile.  But, it is
 		   needed to collect the checkpoint even if it was not newly
 		   created */
-		nilfs_mdt_mark_buffer_dirty(bh_cp);
+		mark_buffer_dirty(bh_cp);
 		nilfs_mdt_mark_dirty(nilfs->ns_cpfile);
 		nilfs_cpfile_put_checkpoint(
 			nilfs->ns_cpfile, nilfs->ns_cno, bh_cp);
@@ -889,12 +886,14 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
 {
 	struct buffer_head *bh_sr;
 	struct nilfs_super_root *raw_sr;
-	unsigned isz = nilfs->ns_inode_size;
+	unsigned isz, srsz;
 
 	bh_sr = NILFS_LAST_SEGBUF(&sci->sc_segbufs)->sb_super_root;
 	raw_sr = (struct nilfs_super_root *)bh_sr->b_data;
+	isz = nilfs->ns_inode_size;
+	srsz = NILFS_SR_BYTES(isz);
 
-	raw_sr->sr_bytes = cpu_to_le16(NILFS_SR_BYTES);
+	raw_sr->sr_bytes = cpu_to_le16(srsz);
 	raw_sr->sr_nongc_ctime
 		= cpu_to_le64(nilfs_doing_gc() ?
 			      nilfs->ns_nongc_ctime : sci->sc_seg_ctime);
@@ -906,6 +905,7 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
 				 NILFS_SR_CPFILE_OFFSET(isz), 1);
 	nilfs_write_inode_common(nilfs->ns_sufile, (void *)raw_sr +
 				 NILFS_SR_SUFILE_OFFSET(isz), 1);
+	memset((void *)raw_sr + srsz, 0, nilfs->ns_blocksize - srsz);
 }
 
 static void nilfs_redirty_inodes(struct list_head *head)
@@ -954,8 +954,8 @@ static int nilfs_segctor_apply_buffers(struct nilfs_sc_info *sci,
 
  dispose_buffers:
 	while (!list_empty(listp)) {
-		bh = list_entry(listp->next, struct buffer_head,
-				b_assoc_buffers);
+		bh = list_first_entry(listp, struct buffer_head,
+				      b_assoc_buffers);
 		list_del_init(&bh->b_assoc_buffers);
 		brelse(bh);
 	}
@@ -1500,10 +1500,7 @@ nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci,
 			nblocks = le32_to_cpu(finfo->fi_nblocks);
 			ndatablk = le32_to_cpu(finfo->fi_ndatablk);
 
-			if (buffer_nilfs_node(bh))
-				inode = NILFS_BTNC_I(bh->b_page->mapping);
-			else
-				inode = NILFS_AS_I(bh->b_page->mapping);
+			inode = bh->b_page->mapping->host;
 
 			if (mode == SC_LSEG_DSYNC)
 				sc_op = &nilfs_sc_dsync_ops;
@@ -1556,83 +1553,24 @@ static int nilfs_segctor_assign(struct nilfs_sc_info *sci, int mode)
 	return 0;
 }
 
-static int
-nilfs_copy_replace_page_buffers(struct page *page, struct list_head *out)
-{
-	struct page *clone_page;
-	struct buffer_head *bh, *head, *bh2;
-	void *kaddr;
-
-	bh = head = page_buffers(page);
-
-	clone_page = nilfs_alloc_private_page(bh->b_bdev, bh->b_size, 0);
-	if (unlikely(!clone_page))
-		return -ENOMEM;
-
-	bh2 = page_buffers(clone_page);
-	kaddr = kmap_atomic(page, KM_USER0);
-	do {
-		if (list_empty(&bh->b_assoc_buffers))
-			continue;
-		get_bh(bh2);
-		page_cache_get(clone_page); /* for each bh */
-		memcpy(bh2->b_data, kaddr + bh_offset(bh), bh2->b_size);
-		bh2->b_blocknr = bh->b_blocknr;
-		list_replace(&bh->b_assoc_buffers, &bh2->b_assoc_buffers);
-		list_add_tail(&bh->b_assoc_buffers, out);
-	} while (bh = bh->b_this_page, bh2 = bh2->b_this_page, bh != head);
-	kunmap_atomic(kaddr, KM_USER0);
-
-	if (!TestSetPageWriteback(clone_page))
-		account_page_writeback(clone_page);
-	unlock_page(clone_page);
-
-	return 0;
-}
-
-static int nilfs_test_page_to_be_frozen(struct page *page)
-{
-	struct address_space *mapping = page->mapping;
-
-	if (!mapping || !mapping->host || S_ISDIR(mapping->host->i_mode))
-		return 0;
-
-	if (page_mapped(page)) {
-		ClearPageChecked(page);
-		return 1;
-	}
-	return PageChecked(page);
-}
-
-static int nilfs_begin_page_io(struct page *page, struct list_head *out)
+static void nilfs_begin_page_io(struct page *page)
 {
 	if (!page || PageWriteback(page))
 		/* For split b-tree node pages, this function may be called
 		   twice.  We ignore the 2nd or later calls by this check. */
-		return 0;
+		return;
 
 	lock_page(page);
 	clear_page_dirty_for_io(page);
 	set_page_writeback(page);
 	unlock_page(page);
-
-	if (nilfs_test_page_to_be_frozen(page)) {
-		int err = nilfs_copy_replace_page_buffers(page, out);
-		if (unlikely(err))
-			return err;
-	}
-	return 0;
 }
 
-static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci,
-				       struct page **failed_page)
+static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
 {
 	struct nilfs_segment_buffer *segbuf;
 	struct page *bd_page = NULL, *fs_page = NULL;
-	struct list_head *list = &sci->sc_copied_buffers;
-	int err;
 
-	*failed_page = NULL;
 	list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
 		struct buffer_head *bh;
 
@@ -1662,11 +1600,7 @@ static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci,
 				break;
 			}
 			if (bh->b_page != fs_page) {
-				err = nilfs_begin_page_io(fs_page, list);
-				if (unlikely(err)) {
-					*failed_page = fs_page;
-					goto out;
-				}
+				nilfs_begin_page_io(fs_page);
 				fs_page = bh->b_page;
 			}
 		}
@@ -1677,11 +1611,7 @@ static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci,
 		set_page_writeback(bd_page);
 		unlock_page(bd_page);
 	}
-	err = nilfs_begin_page_io(fs_page, list);
-	if (unlikely(err))
-		*failed_page = fs_page;
- out:
-	return err;
+	nilfs_begin_page_io(fs_page);
 }
 
 static int nilfs_segctor_write(struct nilfs_sc_info *sci,
@@ -1694,24 +1624,6 @@ static int nilfs_segctor_write(struct nilfs_sc_info *sci,
 	return ret;
 }
 
-static void __nilfs_end_page_io(struct page *page, int err)
-{
-	if (!err) {
-		if (!nilfs_page_buffers_clean(page))
-			__set_page_dirty_nobuffers(page);
-		ClearPageError(page);
-	} else {
-		__set_page_dirty_nobuffers(page);
-		SetPageError(page);
-	}
-
-	if (buffer_nilfs_allocated(page_buffers(page))) {
-		if (TestClearPageWriteback(page))
-			dec_zone_page_state(page, NR_WRITEBACK);
-	} else
-		end_page_writeback(page);
-}
-
 static void nilfs_end_page_io(struct page *page, int err)
 {
 	if (!page)
@@ -1738,40 +1650,19 @@ static void nilfs_end_page_io(struct page *page, int err)
 		return;
 	}
 
-	__nilfs_end_page_io(page, err);
-}
-
-static void nilfs_clear_copied_buffers(struct list_head *list, int err)
-{
-	struct buffer_head *bh, *head;
-	struct page *page;
-
-	while (!list_empty(list)) {
-		bh = list_entry(list->next, struct buffer_head,
-				b_assoc_buffers);
-		page = bh->b_page;
-		page_cache_get(page);
-		head = bh = page_buffers(page);
-		do {
-			if (!list_empty(&bh->b_assoc_buffers)) {
-				list_del_init(&bh->b_assoc_buffers);
-				if (!err) {
-					set_buffer_uptodate(bh);
-					clear_buffer_dirty(bh);
-					clear_buffer_delay(bh);
-					clear_buffer_nilfs_volatile(bh);
-				}
-				brelse(bh); /* for b_assoc_buffers */
-			}
-		} while ((bh = bh->b_this_page) != head);
-
-		__nilfs_end_page_io(page, err);
-		page_cache_release(page);
+	if (!err) {
+		if (!nilfs_page_buffers_clean(page))
+			__set_page_dirty_nobuffers(page);
+		ClearPageError(page);
+	} else {
+		__set_page_dirty_nobuffers(page);
+		SetPageError(page);
 	}
+
+	end_page_writeback(page);
 }
 
-static void nilfs_abort_logs(struct list_head *logs, struct page *failed_page,
-			     int err)
+static void nilfs_abort_logs(struct list_head *logs, int err)
 {
 	struct nilfs_segment_buffer *segbuf;
 	struct page *bd_page = NULL, *fs_page = NULL;
@@ -1801,8 +1692,6 @@ static void nilfs_abort_logs(struct list_head *logs, struct page *failed_page,
 			}
 			if (bh->b_page != fs_page) {
 				nilfs_end_page_io(fs_page, err);
-				if (fs_page && fs_page == failed_page)
-					return;
 				fs_page = bh->b_page;
 			}
 		}
@@ -1821,12 +1710,11 @@ static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci,
 
 	list_splice_tail_init(&sci->sc_write_logs, &logs);
 	ret = nilfs_wait_on_logs(&logs);
-	nilfs_abort_logs(&logs, NULL, ret ? : err);
+	nilfs_abort_logs(&logs, ret ? : err);
 
 	list_splice_tail_init(&sci->sc_segbufs, &logs);
 	nilfs_cancel_segusage(&logs, nilfs->ns_sufile);
 	nilfs_free_incomplete_logs(&logs, nilfs);
-	nilfs_clear_copied_buffers(&sci->sc_copied_buffers, err);
 
 	if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
 		ret = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
@@ -1920,8 +1808,6 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
 
 	nilfs_end_page_io(fs_page, 0);
 
-	nilfs_clear_copied_buffers(&sci->sc_copied_buffers, 0);
-
 	nilfs_drop_collected_inodes(&sci->sc_dirty_files);
 
 	if (nilfs_doing_gc())
@@ -1979,7 +1865,7 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci,
 					      "failed to get inode block.\n");
 				return err;
 			}
-			nilfs_mdt_mark_buffer_dirty(ibh);
+			mark_buffer_dirty(ibh);
 			nilfs_mdt_mark_dirty(ifile);
 			spin_lock(&nilfs->ns_inode_lock);
 			if (likely(!ii->i_bh))
@@ -1991,8 +1877,7 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci,
 
 		clear_bit(NILFS_I_QUEUED, &ii->i_state);
 		set_bit(NILFS_I_BUSY, &ii->i_state);
-		list_del(&ii->i_dirty);
-		list_add_tail(&ii->i_dirty, &sci->sc_dirty_files);
+		list_move_tail(&ii->i_dirty, &sci->sc_dirty_files);
 	}
 	spin_unlock(&nilfs->ns_inode_lock);
 
@@ -2014,8 +1899,7 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci,
 		clear_bit(NILFS_I_BUSY, &ii->i_state);
 		brelse(ii->i_bh);
 		ii->i_bh = NULL;
-		list_del(&ii->i_dirty);
-		list_add_tail(&ii->i_dirty, &ti->ti_garbage);
+		list_move_tail(&ii->i_dirty, &ti->ti_garbage);
 	}
 	spin_unlock(&nilfs->ns_inode_lock);
 }
@@ -2026,7 +1910,6 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci,
 static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 {
 	struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
-	struct page *failed_page;
 	int err;
 
 	sci->sc_stage.scnt = NILFS_ST_INIT;
@@ -2081,11 +1964,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 		nilfs_segctor_update_segusage(sci, nilfs->ns_sufile);
 
 		/* Write partial segments */
-		err = nilfs_segctor_prepare_write(sci, &failed_page);
-		if (err) {
-			nilfs_abort_logs(&sci->sc_segbufs, failed_page, err);
-			goto failed_to_write;
-		}
+		nilfs_segctor_prepare_write(sci);
 
 		nilfs_add_checksums_on_logs(&sci->sc_segbufs,
 					    nilfs->ns_crc_seed);
@@ -2687,7 +2566,6 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct super_block *sb,
 	INIT_LIST_HEAD(&sci->sc_segbufs);
 	INIT_LIST_HEAD(&sci->sc_write_logs);
 	INIT_LIST_HEAD(&sci->sc_gc_inodes);
-	INIT_LIST_HEAD(&sci->sc_copied_buffers);
 	init_timer(&sci->sc_timer);
 
 	sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT;
@@ -2741,8 +2619,6 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
 	if (flag || !nilfs_segctor_confirm(sci))
 		nilfs_segctor_write_out(sci);
 
-	WARN_ON(!list_empty(&sci->sc_copied_buffers));
-
 	if (!list_empty(&sci->sc_dirty_files)) {
 		nilfs_warning(sci->sc_super, __func__,
 			      "dirty file(s) after the final construction\n");
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 6c02a86..38a1d00 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -92,7 +92,6 @@ struct nilfs_segsum_pointer {
  * @sc_nblk_inc: Block count of current generation
  * @sc_dirty_files: List of files to be written
  * @sc_gc_inodes: List of GC inodes having blocks to be written
- * @sc_copied_buffers: List of copied buffers (buffer heads) to freeze data
  * @sc_freesegs: array of segment numbers to be freed
  * @sc_nfreesegs: number of segments on @sc_freesegs
  * @sc_dsync_inode: inode whose data pages are written for a sync operation
@@ -136,7 +135,6 @@ struct nilfs_sc_info {
 
 	struct list_head	sc_dirty_files;
 	struct list_head	sc_gc_inodes;
-	struct list_head	sc_copied_buffers;
 
 	__u64		       *sc_freesegs;
 	size_t			sc_nfreesegs;
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 1d6f488..0a0aba6 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -33,7 +33,9 @@
 
 struct nilfs_sufile_info {
 	struct nilfs_mdt_info mi;
-	unsigned long ncleansegs;
+	unsigned long ncleansegs;/* number of clean segments */
+	__u64 allocmin;		/* lower limit of allocatable segment range */
+	__u64 allocmax;		/* upper limit of allocatable segment range */
 };
 
 static inline struct nilfs_sufile_info *NILFS_SUI(struct inode *sufile)
@@ -96,6 +98,13 @@ nilfs_sufile_get_segment_usage_block(struct inode *sufile, __u64 segnum,
 				   create, NULL, bhp);
 }
 
+static int nilfs_sufile_delete_segment_usage_block(struct inode *sufile,
+						   __u64 segnum)
+{
+	return nilfs_mdt_delete_block(sufile,
+				      nilfs_sufile_get_blkoff(sufile, segnum));
+}
+
 static void nilfs_sufile_mod_counter(struct buffer_head *header_bh,
 				     u64 ncleanadd, u64 ndirtyadd)
 {
@@ -108,7 +117,7 @@ static void nilfs_sufile_mod_counter(struct buffer_head *header_bh,
 	le64_add_cpu(&header->sh_ndirtysegs, ndirtyadd);
 	kunmap_atomic(kaddr, KM_USER0);
 
-	nilfs_mdt_mark_buffer_dirty(header_bh);
+	mark_buffer_dirty(header_bh);
 }
 
 /**
@@ -248,6 +257,35 @@ int nilfs_sufile_update(struct inode *sufile, __u64 segnum, int create,
 }
 
 /**
+ * nilfs_sufile_set_alloc_range - limit range of segment to be allocated
+ * @sufile: inode of segment usage file
+ * @start: minimum segment number of allocatable region (inclusive)
+ * @end: maximum segment number of allocatable region (inclusive)
+ *
+ * Return Value: On success, 0 is returned.  On error, one of the
+ * following negative error codes is returned.
+ *
+ * %-ERANGE - invalid segment region
+ */
+int nilfs_sufile_set_alloc_range(struct inode *sufile, __u64 start, __u64 end)
+{
+	struct nilfs_sufile_info *sui = NILFS_SUI(sufile);
+	__u64 nsegs;
+	int ret = -ERANGE;
+
+	down_write(&NILFS_MDT(sufile)->mi_sem);
+	nsegs = nilfs_sufile_get_nsegments(sufile);
+
+	if (start <= end && end < nsegs) {
+		sui->allocmin = start;
+		sui->allocmax = end;
+		ret = 0;
+	}
+	up_write(&NILFS_MDT(sufile)->mi_sem);
+	return ret;
+}
+
+/**
  * nilfs_sufile_alloc - allocate a segment
  * @sufile: inode of segment usage file
  * @segnump: pointer to segment number
@@ -269,11 +307,12 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
 	struct buffer_head *header_bh, *su_bh;
 	struct nilfs_sufile_header *header;
 	struct nilfs_segment_usage *su;
+	struct nilfs_sufile_info *sui = NILFS_SUI(sufile);
 	size_t susz = NILFS_MDT(sufile)->mi_entry_size;
 	__u64 segnum, maxsegnum, last_alloc;
 	void *kaddr;
-	unsigned long nsegments, ncleansegs, nsus;
-	int ret, i, j;
+	unsigned long nsegments, ncleansegs, nsus, cnt;
+	int ret, j;
 
 	down_write(&NILFS_MDT(sufile)->mi_sem);
 
@@ -287,13 +326,31 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
 	kunmap_atomic(kaddr, KM_USER0);
 
 	nsegments = nilfs_sufile_get_nsegments(sufile);
+	maxsegnum = sui->allocmax;
 	segnum = last_alloc + 1;
-	maxsegnum = nsegments - 1;
-	for (i = 0; i < nsegments; i += nsus) {
-		if (segnum >= nsegments) {
-			/* wrap around */
-			segnum = 0;
-			maxsegnum = last_alloc;
+	if (segnum < sui->allocmin || segnum > sui->allocmax)
+		segnum = sui->allocmin;
+
+	for (cnt = 0; cnt < nsegments; cnt += nsus) {
+		if (segnum > maxsegnum) {
+			if (cnt < sui->allocmax - sui->allocmin + 1) {
+				/*
+				 * wrap around in the limited region.
+				 * if allocation started from
+				 * sui->allocmin, this never happens.
+				 */
+				segnum = sui->allocmin;
+				maxsegnum = last_alloc;
+			} else if (segnum > sui->allocmin &&
+				   sui->allocmax + 1 < nsegments) {
+				segnum = sui->allocmax + 1;
+				maxsegnum = nsegments - 1;
+			} else if (sui->allocmin > 0)  {
+				segnum = 0;
+				maxsegnum = sui->allocmin - 1;
+			} else {
+				break; /* never happens */
+			}
 		}
 		ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1,
 							   &su_bh);
@@ -319,9 +376,9 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
 			header->sh_last_alloc = cpu_to_le64(segnum);
 			kunmap_atomic(kaddr, KM_USER0);
 
-			NILFS_SUI(sufile)->ncleansegs--;
-			nilfs_mdt_mark_buffer_dirty(header_bh);
-			nilfs_mdt_mark_buffer_dirty(su_bh);
+			sui->ncleansegs--;
+			mark_buffer_dirty(header_bh);
+			mark_buffer_dirty(su_bh);
 			nilfs_mdt_mark_dirty(sufile);
 			brelse(su_bh);
 			*segnump = segnum;
@@ -364,7 +421,7 @@ void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum,
 	nilfs_sufile_mod_counter(header_bh, -1, 1);
 	NILFS_SUI(sufile)->ncleansegs--;
 
-	nilfs_mdt_mark_buffer_dirty(su_bh);
+	mark_buffer_dirty(su_bh);
 	nilfs_mdt_mark_dirty(sufile);
 }
 
@@ -395,7 +452,7 @@ void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum,
 	nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 0 : 1);
 	NILFS_SUI(sufile)->ncleansegs -= clean;
 
-	nilfs_mdt_mark_buffer_dirty(su_bh);
+	mark_buffer_dirty(su_bh);
 	nilfs_mdt_mark_dirty(sufile);
 }
 
@@ -421,7 +478,7 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum,
 	sudirty = nilfs_segment_usage_dirty(su);
 	nilfs_segment_usage_set_clean(su);
 	kunmap_atomic(kaddr, KM_USER0);
-	nilfs_mdt_mark_buffer_dirty(su_bh);
+	mark_buffer_dirty(su_bh);
 
 	nilfs_sufile_mod_counter(header_bh, 1, sudirty ? (u64)-1 : 0);
 	NILFS_SUI(sufile)->ncleansegs++;
@@ -441,7 +498,7 @@ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum)
 
 	ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh);
 	if (!ret) {
-		nilfs_mdt_mark_buffer_dirty(bh);
+		mark_buffer_dirty(bh);
 		nilfs_mdt_mark_dirty(sufile);
 		brelse(bh);
 	}
@@ -476,7 +533,7 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,
 	su->su_nblocks = cpu_to_le32(nblocks);
 	kunmap_atomic(kaddr, KM_USER0);
 
-	nilfs_mdt_mark_buffer_dirty(bh);
+	mark_buffer_dirty(bh);
 	nilfs_mdt_mark_dirty(sufile);
 	brelse(bh);
 
@@ -505,7 +562,7 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat)
 {
 	struct buffer_head *header_bh;
 	struct nilfs_sufile_header *header;
-	struct the_nilfs *nilfs = NILFS_I_NILFS(sufile);
+	struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
 	void *kaddr;
 	int ret;
 
@@ -555,11 +612,183 @@ void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum,
 		nilfs_sufile_mod_counter(header_bh, -1, 0);
 		NILFS_SUI(sufile)->ncleansegs--;
 	}
-	nilfs_mdt_mark_buffer_dirty(su_bh);
+	mark_buffer_dirty(su_bh);
 	nilfs_mdt_mark_dirty(sufile);
 }
 
 /**
+  * nilfs_sufile_truncate_range - truncate range of segment array
+  * @sufile: inode of segment usage file
+  * @start: start segment number (inclusive)
+  * @end: end segment number (inclusive)
+  *
+  * Return Value: On success, 0 is returned.  On error, one of the
+  * following negative error codes is returned.
+  *
+  * %-EIO - I/O error.
+  *
+  * %-ENOMEM - Insufficient amount of memory available.
+  *
+  * %-EINVAL - Invalid number of segments specified
+  *
+  * %-EBUSY - Dirty or active segments are present in the range
+  */
+static int nilfs_sufile_truncate_range(struct inode *sufile,
+				       __u64 start, __u64 end)
+{
+	struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
+	struct buffer_head *header_bh;
+	struct buffer_head *su_bh;
+	struct nilfs_segment_usage *su, *su2;
+	size_t susz = NILFS_MDT(sufile)->mi_entry_size;
+	unsigned long segusages_per_block;
+	unsigned long nsegs, ncleaned;
+	__u64 segnum;
+	void *kaddr;
+	ssize_t n, nc;
+	int ret;
+	int j;
+
+	nsegs = nilfs_sufile_get_nsegments(sufile);
+
+	ret = -EINVAL;
+	if (start > end || start >= nsegs)
+		goto out;
+
+	ret = nilfs_sufile_get_header_block(sufile, &header_bh);
+	if (ret < 0)
+		goto out;
+
+	segusages_per_block = nilfs_sufile_segment_usages_per_block(sufile);
+	ncleaned = 0;
+
+	for (segnum = start; segnum <= end; segnum += n) {
+		n = min_t(unsigned long,
+			  segusages_per_block -
+				  nilfs_sufile_get_offset(sufile, segnum),
+			  end - segnum + 1);
+		ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0,
+							   &su_bh);
+		if (ret < 0) {
+			if (ret != -ENOENT)
+				goto out_header;
+			/* hole */
+			continue;
+		}
+		kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
+		su = nilfs_sufile_block_get_segment_usage(
+			sufile, segnum, su_bh, kaddr);
+		su2 = su;
+		for (j = 0; j < n; j++, su = (void *)su + susz) {
+			if ((le32_to_cpu(su->su_flags) &
+			     ~(1UL << NILFS_SEGMENT_USAGE_ERROR)) ||
+			    nilfs_segment_is_active(nilfs, segnum + j)) {
+				ret = -EBUSY;
+				kunmap_atomic(kaddr, KM_USER0);
+				brelse(su_bh);
+				goto out_header;
+			}
+		}
+		nc = 0;
+		for (su = su2, j = 0; j < n; j++, su = (void *)su + susz) {
+			if (nilfs_segment_usage_error(su)) {
+				nilfs_segment_usage_set_clean(su);
+				nc++;
+			}
+		}
+		kunmap_atomic(kaddr, KM_USER0);
+		if (nc > 0) {
+			mark_buffer_dirty(su_bh);
+			ncleaned += nc;
+		}
+		brelse(su_bh);
+
+		if (n == segusages_per_block) {
+			/* make hole */
+			nilfs_sufile_delete_segment_usage_block(sufile, segnum);
+		}
+	}
+	ret = 0;
+
+out_header:
+	if (ncleaned > 0) {
+		NILFS_SUI(sufile)->ncleansegs += ncleaned;
+		nilfs_sufile_mod_counter(header_bh, ncleaned, 0);
+		nilfs_mdt_mark_dirty(sufile);
+	}
+	brelse(header_bh);
+out:
+	return ret;
+}
+
+/**
+ * nilfs_sufile_resize - resize segment array
+ * @sufile: inode of segment usage file
+ * @newnsegs: new number of segments
+ *
+ * Return Value: On success, 0 is returned.  On error, one of the
+ * following negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOSPC - Enough free space is not left for shrinking
+ *
+ * %-EBUSY - Dirty or active segments exist in the region to be truncated
+ */
+int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs)
+{
+	struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
+	struct buffer_head *header_bh;
+	struct nilfs_sufile_header *header;
+	struct nilfs_sufile_info *sui = NILFS_SUI(sufile);
+	void *kaddr;
+	unsigned long nsegs, nrsvsegs;
+	int ret = 0;
+
+	down_write(&NILFS_MDT(sufile)->mi_sem);
+
+	nsegs = nilfs_sufile_get_nsegments(sufile);
+	if (nsegs == newnsegs)
+		goto out;
+
+	ret = -ENOSPC;
+	nrsvsegs = nilfs_nrsvsegs(nilfs, newnsegs);
+	if (newnsegs < nsegs && nsegs - newnsegs + nrsvsegs > sui->ncleansegs)
+		goto out;
+
+	ret = nilfs_sufile_get_header_block(sufile, &header_bh);
+	if (ret < 0)
+		goto out;
+
+	if (newnsegs > nsegs) {
+		sui->ncleansegs += newnsegs - nsegs;
+	} else /* newnsegs < nsegs */ {
+		ret = nilfs_sufile_truncate_range(sufile, newnsegs, nsegs - 1);
+		if (ret < 0)
+			goto out_header;
+
+		sui->ncleansegs -= nsegs - newnsegs;
+	}
+
+	kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
+	header = kaddr + bh_offset(header_bh);
+	header->sh_ncleansegs = cpu_to_le64(sui->ncleansegs);
+	kunmap_atomic(kaddr, KM_USER0);
+
+	mark_buffer_dirty(header_bh);
+	nilfs_mdt_mark_dirty(sufile);
+	nilfs_set_nsegments(nilfs, newnsegs);
+
+out_header:
+	brelse(header_bh);
+out:
+	up_write(&NILFS_MDT(sufile)->mi_sem);
+	return ret;
+}
+
+/**
  * nilfs_sufile_get_suinfo -
  * @sufile: inode of segment usage file
  * @segnum: segment number to start looking
@@ -583,7 +812,7 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
 	struct nilfs_segment_usage *su;
 	struct nilfs_suinfo *si = buf;
 	size_t susz = NILFS_MDT(sufile)->mi_entry_size;
-	struct the_nilfs *nilfs = NILFS_I_NILFS(sufile);
+	struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
 	void *kaddr;
 	unsigned long nsegs, segusages_per_block;
 	ssize_t n;
@@ -679,6 +908,9 @@ int nilfs_sufile_read(struct super_block *sb, size_t susize,
 	kunmap_atomic(kaddr, KM_USER0);
 	brelse(header_bh);
 
+	sui->allocmax = nilfs_sufile_get_nsegments(sufile) - 1;
+	sui->allocmin = 0;
+
 	unlock_new_inode(sufile);
  out:
 	*inodep = sufile;
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index a943fba..e84bc5b 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -31,11 +31,12 @@
 
 static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile)
 {
-	return NILFS_I_NILFS(sufile)->ns_nsegments;
+	return ((struct the_nilfs *)sufile->i_sb->s_fs_info)->ns_nsegments;
 }
 
 unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile);
 
+int nilfs_sufile_set_alloc_range(struct inode *sufile, __u64 start, __u64 end);
 int nilfs_sufile_alloc(struct inode *, __u64 *);
 int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum);
 int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,
@@ -61,6 +62,7 @@ void nilfs_sufile_do_cancel_free(struct inode *, __u64, struct buffer_head *,
 void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *,
 			       struct buffer_head *);
 
+int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs);
 int nilfs_sufile_read(struct super_block *sb, size_t susize,
 		      struct nilfs_inode *raw_inode, struct inode **inodep);
 
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 062cca0..8351c44 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -56,6 +56,7 @@
 #include "btnode.h"
 #include "page.h"
 #include "cpfile.h"
+#include "sufile.h" /* nilfs_sufile_resize(), nilfs_sufile_set_alloc_range() */
 #include "ifile.h"
 #include "dat.h"
 #include "segment.h"
@@ -165,7 +166,7 @@ struct inode *nilfs_alloc_inode(struct super_block *sb)
 	ii->i_state = 0;
 	ii->i_cno = 0;
 	ii->vfs_inode.i_version = 1;
-	nilfs_btnode_cache_init(&ii->i_btnode_cache, sb->s_bdi);
+	nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode, sb->s_bdi);
 	return &ii->vfs_inode;
 }
 
@@ -347,6 +348,134 @@ int nilfs_cleanup_super(struct super_block *sb)
 	return ret;
 }
 
+/**
+ * nilfs_move_2nd_super - relocate secondary super block
+ * @sb: super block instance
+ * @sb2off: new offset of the secondary super block (in bytes)
+ */
+static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off)
+{
+	struct the_nilfs *nilfs = sb->s_fs_info;
+	struct buffer_head *nsbh;
+	struct nilfs_super_block *nsbp;
+	sector_t blocknr, newblocknr;
+	unsigned long offset;
+	int sb2i = -1;  /* array index of the secondary superblock */
+	int ret = 0;
+
+	/* nilfs->ns_sem must be locked by the caller. */
+	if (nilfs->ns_sbh[1] &&
+	    nilfs->ns_sbh[1]->b_blocknr > nilfs->ns_first_data_block) {
+		sb2i = 1;
+		blocknr = nilfs->ns_sbh[1]->b_blocknr;
+	} else if (nilfs->ns_sbh[0]->b_blocknr > nilfs->ns_first_data_block) {
+		sb2i = 0;
+		blocknr = nilfs->ns_sbh[0]->b_blocknr;
+	}
+	if (sb2i >= 0 && (u64)blocknr << nilfs->ns_blocksize_bits == sb2off)
+		goto out;  /* super block location is unchanged */
+
+	/* Get new super block buffer */
+	newblocknr = sb2off >> nilfs->ns_blocksize_bits;
+	offset = sb2off & (nilfs->ns_blocksize - 1);
+	nsbh = sb_getblk(sb, newblocknr);
+	if (!nsbh) {
+		printk(KERN_WARNING
+		       "NILFS warning: unable to move secondary superblock "
+		       "to block %llu\n", (unsigned long long)newblocknr);
+		ret = -EIO;
+		goto out;
+	}
+	nsbp = (void *)nsbh->b_data + offset;
+	memset(nsbp, 0, nilfs->ns_blocksize);
+
+	if (sb2i >= 0) {
+		memcpy(nsbp, nilfs->ns_sbp[sb2i], nilfs->ns_sbsize);
+		brelse(nilfs->ns_sbh[sb2i]);
+		nilfs->ns_sbh[sb2i] = nsbh;
+		nilfs->ns_sbp[sb2i] = nsbp;
+	} else if (nilfs->ns_sbh[0]->b_blocknr < nilfs->ns_first_data_block) {
+		/* secondary super block will be restored to index 1 */
+		nilfs->ns_sbh[1] = nsbh;
+		nilfs->ns_sbp[1] = nsbp;
+	} else {
+		brelse(nsbh);
+	}
+out:
+	return ret;
+}
+
+/**
+ * nilfs_resize_fs - resize the filesystem
+ * @sb: super block instance
+ * @newsize: new size of the filesystem (in bytes)
+ */
+int nilfs_resize_fs(struct super_block *sb, __u64 newsize)
+{
+	struct the_nilfs *nilfs = sb->s_fs_info;
+	struct nilfs_super_block **sbp;
+	__u64 devsize, newnsegs;
+	loff_t sb2off;
+	int ret;
+
+	ret = -ERANGE;
+	devsize = i_size_read(sb->s_bdev->bd_inode);
+	if (newsize > devsize)
+		goto out;
+
+	/*
+	 * Write lock is required to protect some functions depending
+	 * on the number of segments, the number of reserved segments,
+	 * and so forth.
+	 */
+	down_write(&nilfs->ns_segctor_sem);
+
+	sb2off = NILFS_SB2_OFFSET_BYTES(newsize);
+	newnsegs = sb2off >> nilfs->ns_blocksize_bits;
+	do_div(newnsegs, nilfs->ns_blocks_per_segment);
+
+	ret = nilfs_sufile_resize(nilfs->ns_sufile, newnsegs);
+	up_write(&nilfs->ns_segctor_sem);
+	if (ret < 0)
+		goto out;
+
+	ret = nilfs_construct_segment(sb);
+	if (ret < 0)
+		goto out;
+
+	down_write(&nilfs->ns_sem);
+	nilfs_move_2nd_super(sb, sb2off);
+	ret = -EIO;
+	sbp = nilfs_prepare_super(sb, 0);
+	if (likely(sbp)) {
+		nilfs_set_log_cursor(sbp[0], nilfs);
+		/*
+		 * Drop NILFS_RESIZE_FS flag for compatibility with
+		 * mount-time resize which may be implemented in a
+		 * future release.
+		 */
+		sbp[0]->s_state = cpu_to_le16(le16_to_cpu(sbp[0]->s_state) &
+					      ~NILFS_RESIZE_FS);
+		sbp[0]->s_dev_size = cpu_to_le64(newsize);
+		sbp[0]->s_nsegments = cpu_to_le64(nilfs->ns_nsegments);
+		if (sbp[1])
+			memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
+		ret = nilfs_commit_super(sb, NILFS_SB_COMMIT_ALL);
+	}
+	up_write(&nilfs->ns_sem);
+
+	/*
+	 * Reset the range of allocatable segments last.  This order
+	 * is important in the case of expansion because the secondary
+	 * superblock must be protected from log write until migration
+	 * completes.
+	 */
+	if (!ret)
+		nilfs_sufile_set_alloc_range(nilfs->ns_sufile, 0, newnsegs - 1);
+out:
+	return ret;
+}
+
 static void nilfs_put_super(struct super_block *sb)
 {
 	struct the_nilfs *nilfs = sb->s_fs_info;
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index d2acd1a..d327140 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -363,6 +363,24 @@ static unsigned long long nilfs_max_size(unsigned int blkbits)
 	return res;
 }
 
+/**
+ * nilfs_nrsvsegs - calculate the number of reserved segments
+ * @nilfs: nilfs object
+ * @nsegs: total number of segments
+ */
+unsigned long nilfs_nrsvsegs(struct the_nilfs *nilfs, unsigned long nsegs)
+{
+	return max_t(unsigned long, NILFS_MIN_NRSVSEGS,
+		     DIV_ROUND_UP(nsegs * nilfs->ns_r_segments_percentage,
+				  100));
+}
+
+void nilfs_set_nsegments(struct the_nilfs *nilfs, unsigned long nsegs)
+{
+	nilfs->ns_nsegments = nsegs;
+	nilfs->ns_nrsvsegs = nilfs_nrsvsegs(nilfs, nsegs);
+}
+
 static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
 				   struct nilfs_super_block *sbp)
 {
@@ -389,13 +407,9 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
 	}
 
 	nilfs->ns_first_data_block = le64_to_cpu(sbp->s_first_data_block);
-	nilfs->ns_nsegments = le64_to_cpu(sbp->s_nsegments);
 	nilfs->ns_r_segments_percentage =
 		le32_to_cpu(sbp->s_r_segments_percentage);
-	nilfs->ns_nrsvsegs =
-		max_t(unsigned long, NILFS_MIN_NRSVSEGS,
-		      DIV_ROUND_UP(nilfs->ns_nsegments *
-				   nilfs->ns_r_segments_percentage, 100));
+	nilfs_set_nsegments(nilfs, le64_to_cpu(sbp->s_nsegments));
 	nilfs->ns_crc_seed = le32_to_cpu(sbp->s_crc_seed);
 	return 0;
 }
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index f496814..9992b11 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -268,6 +268,8 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev);
 void destroy_nilfs(struct the_nilfs *nilfs);
 int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data);
 int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb);
+unsigned long nilfs_nrsvsegs(struct the_nilfs *nilfs, unsigned long nsegs);
+void nilfs_set_nsegments(struct the_nilfs *nilfs, unsigned long nsegs);
 int nilfs_discard_segments(struct the_nilfs *, __u64 *, size_t);
 int nilfs_count_free_blocks(struct the_nilfs *, sector_t *);
 struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno);
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 5d32749..3c7606c 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -3706,7 +3706,7 @@ int ocfs2_refcount_cow_xattr(struct inode *inode,
 	context->cow_start = cow_start;
 	context->cow_len = cow_len;
 	context->ref_tree = ref_tree;
-	context->ref_root_bh = ref_root_bh;;
+	context->ref_root_bh = ref_root_bh;
 	context->cow_object = xv;
 
 	context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_jbd;
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index ce4f624..af9fdf0 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -565,7 +565,7 @@ static bool ldm_validate_partition_table(struct parsed_partitions *state)
 
 	data = read_part_sector(state, 0, &sect);
 	if (!data) {
-		ldm_crit ("Disk read failed.");
+		ldm_info ("Disk read failed.");
 		return false;
 	}
 
@@ -1335,6 +1335,11 @@ static bool ldm_frag_add (const u8 *data, int size, struct list_head *frags)
 
 	list_add_tail (&f->list, frags);
 found:
+	if (rec >= f->num) {
+		ldm_error("REC value (%d) exceeds NUM value (%d)", rec, f->num);
+		return false;
+	}
+
 	if (f->map & (1 << rec)) {
 		ldm_error ("Duplicate VBLK, part %d.", rec);
 		f->map &= 0x7F;			/* Mark the group as broken */
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index c03e8d3..3763b43 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -61,6 +61,14 @@ extern const struct file_operations proc_pagemap_operations;
 extern const struct file_operations proc_net_operations;
 extern const struct inode_operations proc_net_inode_operations;
 
+struct proc_maps_private {
+	struct pid *pid;
+	struct task_struct *task;
+#ifdef CONFIG_MMU
+	struct vm_area_struct *tail_vma;
+#endif
+};
+
 void proc_init_inodecache(void);
 
 static inline struct pid *proc_pid(struct inode *inode)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 318d865..2c9db29 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -858,7 +858,192 @@ const struct file_operations proc_pagemap_operations = {
 #endif /* CONFIG_PROC_PAGE_MONITOR */
 
 #ifdef CONFIG_NUMA
-extern int show_numa_map(struct seq_file *m, void *v);
+
+struct numa_maps {
+	struct vm_area_struct *vma;
+	unsigned long pages;
+	unsigned long anon;
+	unsigned long active;
+	unsigned long writeback;
+	unsigned long mapcount_max;
+	unsigned long dirty;
+	unsigned long swapcache;
+	unsigned long node[MAX_NUMNODES];
+};
+
+struct numa_maps_private {
+	struct proc_maps_private proc_maps;
+	struct numa_maps md;
+};
+
+static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty)
+{
+	int count = page_mapcount(page);
+
+	md->pages++;
+	if (pte_dirty || PageDirty(page))
+		md->dirty++;
+
+	if (PageSwapCache(page))
+		md->swapcache++;
+
+	if (PageActive(page) || PageUnevictable(page))
+		md->active++;
+
+	if (PageWriteback(page))
+		md->writeback++;
+
+	if (PageAnon(page))
+		md->anon++;
+
+	if (count > md->mapcount_max)
+		md->mapcount_max = count;
+
+	md->node[page_to_nid(page)]++;
+}
+
+static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
+		unsigned long end, struct mm_walk *walk)
+{
+	struct numa_maps *md;
+	spinlock_t *ptl;
+	pte_t *orig_pte;
+	pte_t *pte;
+
+	md = walk->private;
+	orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+	do {
+		struct page *page;
+		int nid;
+
+		if (!pte_present(*pte))
+			continue;
+
+		page = vm_normal_page(md->vma, addr, *pte);
+		if (!page)
+			continue;
+
+		if (PageReserved(page))
+			continue;
+
+		nid = page_to_nid(page);
+		if (!node_isset(nid, node_states[N_HIGH_MEMORY]))
+			continue;
+
+		gather_stats(page, md, pte_dirty(*pte));
+
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+	pte_unmap_unlock(orig_pte, ptl);
+	return 0;
+}
+#ifdef CONFIG_HUGETLB_PAGE
+static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
+		unsigned long addr, unsigned long end, struct mm_walk *walk)
+{
+	struct numa_maps *md;
+	struct page *page;
+
+	if (pte_none(*pte))
+		return 0;
+
+	page = pte_page(*pte);
+	if (!page)
+		return 0;
+
+	md = walk->private;
+	gather_stats(page, md, pte_dirty(*pte));
+	return 0;
+}
+
+#else
+static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
+		unsigned long addr, unsigned long end, struct mm_walk *walk)
+{
+	return 0;
+}
+#endif
+
+/*
+ * Display pages allocated per node and memory policy via /proc.
+ */
+static int show_numa_map(struct seq_file *m, void *v)
+{
+	struct numa_maps_private *numa_priv = m->private;
+	struct proc_maps_private *proc_priv = &numa_priv->proc_maps;
+	struct vm_area_struct *vma = v;
+	struct numa_maps *md = &numa_priv->md;
+	struct file *file = vma->vm_file;
+	struct mm_struct *mm = vma->vm_mm;
+	struct mm_walk walk = {};
+	struct mempolicy *pol;
+	int n;
+	char buffer[50];
+
+	if (!mm)
+		return 0;
+
+	/* Ensure we start with an empty set of numa_maps statistics. */
+	memset(md, 0, sizeof(*md));
+
+	md->vma = vma;
+
+	walk.hugetlb_entry = gather_hugetbl_stats;
+	walk.pmd_entry = gather_pte_stats;
+	walk.private = md;
+	walk.mm = mm;
+
+	pol = get_vma_policy(proc_priv->task, vma, vma->vm_start);
+	mpol_to_str(buffer, sizeof(buffer), pol, 0);
+	mpol_cond_put(pol);
+
+	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
+
+	if (file) {
+		seq_printf(m, " file=");
+		seq_path(m, &file->f_path, "\n\t= ");
+	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
+		seq_printf(m, " heap");
+	} else if (vma->vm_start <= mm->start_stack &&
+			vma->vm_end >= mm->start_stack) {
+		seq_printf(m, " stack");
+	}
+
+	walk_page_range(vma->vm_start, vma->vm_end, &walk);
+
+	if (!md->pages)
+		goto out;
+
+	if (md->anon)
+		seq_printf(m, " anon=%lu", md->anon);
+
+	if (md->dirty)
+		seq_printf(m, " dirty=%lu", md->dirty);
+
+	if (md->pages != md->anon && md->pages != md->dirty)
+		seq_printf(m, " mapped=%lu", md->pages);
+
+	if (md->mapcount_max > 1)
+		seq_printf(m, " mapmax=%lu", md->mapcount_max);
+
+	if (md->swapcache)
+		seq_printf(m, " swapcache=%lu", md->swapcache);
+
+	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
+		seq_printf(m, " active=%lu", md->active);
+
+	if (md->writeback)
+		seq_printf(m, " writeback=%lu", md->writeback);
+
+	for_each_node_state(n, N_HIGH_MEMORY)
+		if (md->node[n])
+			seq_printf(m, " N%d=%lu", n, md->node[n]);
+out:
+	seq_putc(m, '\n');
+
+	if (m->count < m->size)
+		m->version = (vma != proc_priv->tail_vma) ? vma->vm_start : 0;
+	return 0;
+}
 
 static const struct seq_operations proc_pid_numa_maps_op = {
         .start  = m_start,
@@ -869,7 +1054,20 @@ static const struct seq_operations proc_pid_numa_maps_op = {
 
 static int numa_maps_open(struct inode *inode, struct file *file)
 {
-	return do_maps_open(inode, file, &proc_pid_numa_maps_op);
+	struct numa_maps_private *priv;
+	int ret = -ENOMEM;
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (priv) {
+		priv->proc_maps.pid = proc_pid(inode);
+		ret = seq_open(file, &proc_pid_numa_maps_op);
+		if (!ret) {
+			struct seq_file *m = file->private_data;
+			m->private = priv;
+		} else {
+			kfree(priv);
+		}
+	}
+	return ret;
 }
 
 const struct file_operations proc_numa_maps_operations = {
@@ -878,4 +1076,4 @@ const struct file_operations proc_numa_maps_operations = {
 	.llseek		= seq_lseek,
 	.release	= seq_release_private,
 };
-#endif
+#endif /* CONFIG_NUMA */
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index f835a25..f2c3ff2 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -152,21 +152,27 @@ EXPORT_SYMBOL_GPL(pstore_register);
 void pstore_get_records(void)
 {
 	struct pstore_info *psi = psinfo;
-	size_t			size;
+	ssize_t			size;
 	u64			id;
 	enum pstore_type_id	type;
 	struct timespec		time;
-	int			failed = 0;
+	int			failed = 0, rc;
 
 	if (!psi)
 		return;
 
 	mutex_lock(&psinfo->buf_mutex);
+	rc = psi->open(psi);
+	if (rc)
+		goto out;
+
 	while ((size = psi->read(&id, &type, &time)) > 0) {
-		if (pstore_mkfile(type, psi->name, id, psi->buf, size,
+		if (pstore_mkfile(type, psi->name, id, psi->buf, (size_t)size,
 				  time, psi->erase))
 			failed++;
 	}
+	psi->close(psi);
+out:
 	mutex_unlock(&psinfo->buf_mutex);
 
 	if (failed)
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index d3c032f..5b572c8 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -691,8 +691,11 @@ static void prune_dqcache(int count)
  * This is called from kswapd when we think we need some
  * more memory
  */
-static int shrink_dqcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
+static int shrink_dqcache_memory(struct shrinker *shrink,
+				 struct shrink_control *sc)
 {
+	int nr = sc->nr_to_scan;
+
 	if (nr) {
 		spin_lock(&dq_list_lock);
 		prune_dqcache(nr);
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index efc309f..7797218 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -42,7 +42,7 @@ config SQUASHFS_LZO
 	select LZO_DECOMPRESS
 	help
 	  Saying Y here includes support for reading Squashfs file systems
-	  compressed with LZO compresssion.  LZO compression is mainly
+	  compressed with LZO compression.  LZO compression is mainly
 	  aimed at embedded systems with slower CPUs where the overheads
 	  of zlib are too high.
 
@@ -57,7 +57,7 @@ config SQUASHFS_XZ
 	select XZ_DEC
 	help
 	  Saying Y here includes support for reading Squashfs file systems
-	  compressed with XZ compresssion.  XZ gives better compression than
+	  compressed with XZ compression.  XZ gives better compression than
 	  the default zlib compression, at the expense of greater CPU and
 	  memory overhead.
 
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index c37b520..4b5a3fb 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -29,7 +29,7 @@
  * plus functions layered ontop of the generic cache implementation to
  * access the metadata and fragment caches.
  *
- * To avoid out of memory and fragmentation isssues with vmalloc the cache
+ * To avoid out of memory and fragmentation issues with vmalloc the cache
  * uses sequences of kmalloced PAGE_CACHE_SIZE buffers.
  *
  * It should be noted that the cache is not used for file datablocks, these
diff --git a/fs/super.c b/fs/super.c
index 8a06881..c04f7e0 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -948,8 +948,7 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
 	 * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE
 	 * but s_maxbytes was an unsigned long long for many releases. Throw
 	 * this warning for a little while to try and catch filesystems that
-	 * violate this rule. This warning should be either removed or
-	 * converted to a BUG() in 2.6.34.
+	 * violate this rule.
 	 */
 	WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to "
 		"negative value (%lld)\n", type->name, sb->s_maxbytes);
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index da3fefe..1ad8c93 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -24,13 +24,6 @@
 
 #include "sysfs.h"
 
-/* used in crash dumps to help with debugging */
-static char last_sysfs_file[PATH_MAX];
-void sysfs_printk_last_file(void)
-{
-	printk(KERN_EMERG "last sysfs file: %s\n", last_sysfs_file);
-}
-
 /*
  * There's one sysfs_buffer for each open file and one
  * sysfs_open_dirent for each sysfs_dirent with one or more open
@@ -337,11 +330,6 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 	struct sysfs_buffer *buffer;
 	const struct sysfs_ops *ops;
 	int error = -EACCES;
-	char *p;
-
-	p = d_path(&file->f_path, last_sysfs_file, sizeof(last_sysfs_file));
-	if (!IS_ERR(p))
-		memmove(last_sysfs_file, p, strlen(p) + 1);
 
 	/* need attr_sd for attr and ops, its parent for kobj */
 	if (!sysfs_get_active(attr_sd))
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index c8769dc2..194414f 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -101,9 +101,9 @@ int sysfs_create_group(struct kobject *kobj,
 }
 
 /**
- * sysfs_update_group - given a directory kobject, create an attribute group
- * @kobj:	The kobject to create the group on
- * @grp:	The attribute group to create
+ * sysfs_update_group - given a directory kobject, update an attribute group
+ * @kobj:	The kobject to update the group on
+ * @grp:	The attribute group to update
  *
  * This function updates an attribute group.  Unlike
  * sysfs_create_group(), it will explicitly not warn or error if any
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 8c4fc14..f67acbd 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -22,16 +22,24 @@
 #include <linux/anon_inodes.h>
 #include <linux/timerfd.h>
 #include <linux/syscalls.h>
+#include <linux/rcupdate.h>
 
 struct timerfd_ctx {
 	struct hrtimer tmr;
 	ktime_t tintv;
+	ktime_t moffs;
 	wait_queue_head_t wqh;
 	u64 ticks;
 	int expired;
 	int clockid;
+	struct rcu_head rcu;
+	struct list_head clist;
+	bool might_cancel;
 };
 
+static LIST_HEAD(cancel_list);
+static DEFINE_SPINLOCK(cancel_lock);
+
 /*
  * This gets called when the timer event triggers. We set the "expired"
  * flag, but we do not re-arm the timer (in case it's necessary,
@@ -51,6 +59,63 @@ static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr)
 	return HRTIMER_NORESTART;
 }
 
+/*
+ * Called when the clock was set to cancel the timers in the cancel
+ * list.
+ */
+void timerfd_clock_was_set(void)
+{
+	ktime_t moffs = ktime_get_monotonic_offset();
+	struct timerfd_ctx *ctx;
+	unsigned long flags;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ctx, &cancel_list, clist) {
+		if (!ctx->might_cancel)
+			continue;
+		spin_lock_irqsave(&ctx->wqh.lock, flags);
+		if (ctx->moffs.tv64 != moffs.tv64) {
+			ctx->moffs.tv64 = KTIME_MAX;
+			wake_up_locked(&ctx->wqh);
+		}
+		spin_unlock_irqrestore(&ctx->wqh.lock, flags);
+	}
+	rcu_read_unlock();
+}
+
+static void timerfd_remove_cancel(struct timerfd_ctx *ctx)
+{
+	if (ctx->might_cancel) {
+		ctx->might_cancel = false;
+		spin_lock(&cancel_lock);
+		list_del_rcu(&ctx->clist);
+		spin_unlock(&cancel_lock);
+	}
+}
+
+static bool timerfd_canceled(struct timerfd_ctx *ctx)
+{
+	if (!ctx->might_cancel || ctx->moffs.tv64 != KTIME_MAX)
+		return false;
+	ctx->moffs = ktime_get_monotonic_offset();
+	return true;
+}
+
+static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int flags)
+{
+	if (ctx->clockid == CLOCK_REALTIME && (flags & TFD_TIMER_ABSTIME) &&
+	    (flags & TFD_TIMER_CANCEL_ON_SET)) {
+		if (!ctx->might_cancel) {
+			ctx->might_cancel = true;
+			spin_lock(&cancel_lock);
+			list_add_rcu(&ctx->clist, &cancel_list);
+			spin_unlock(&cancel_lock);
+		}
+	} else if (ctx->might_cancel) {
+		timerfd_remove_cancel(ctx);
+	}
+}
+
 static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx)
 {
 	ktime_t remaining;
@@ -59,11 +124,12 @@ static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx)
 	return remaining.tv64 < 0 ? ktime_set(0, 0): remaining;
 }
 
-static void timerfd_setup(struct timerfd_ctx *ctx, int flags,
-			  const struct itimerspec *ktmr)
+static int timerfd_setup(struct timerfd_ctx *ctx, int flags,
+			 const struct itimerspec *ktmr)
 {
 	enum hrtimer_mode htmode;
 	ktime_t texp;
+	int clockid = ctx->clockid;
 
 	htmode = (flags & TFD_TIMER_ABSTIME) ?
 		HRTIMER_MODE_ABS: HRTIMER_MODE_REL;
@@ -72,19 +138,24 @@ static void timerfd_setup(struct timerfd_ctx *ctx, int flags,
 	ctx->expired = 0;
 	ctx->ticks = 0;
 	ctx->tintv = timespec_to_ktime(ktmr->it_interval);
-	hrtimer_init(&ctx->tmr, ctx->clockid, htmode);
+	hrtimer_init(&ctx->tmr, clockid, htmode);
 	hrtimer_set_expires(&ctx->tmr, texp);
 	ctx->tmr.function = timerfd_tmrproc;
-	if (texp.tv64 != 0)
+	if (texp.tv64 != 0) {
 		hrtimer_start(&ctx->tmr, texp, htmode);
+		if (timerfd_canceled(ctx))
+			return -ECANCELED;
+	}
+	return 0;
 }
 
 static int timerfd_release(struct inode *inode, struct file *file)
 {
 	struct timerfd_ctx *ctx = file->private_data;
 
+	timerfd_remove_cancel(ctx);
 	hrtimer_cancel(&ctx->tmr);
-	kfree(ctx);
+	kfree_rcu(ctx, rcu);
 	return 0;
 }
 
@@ -118,8 +189,21 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count,
 		res = -EAGAIN;
 	else
 		res = wait_event_interruptible_locked_irq(ctx->wqh, ctx->ticks);
+
+	/*
+	 * If clock has changed, we do not care about the
+	 * ticks and we do not rearm the timer. Userspace must
+	 * reevaluate anyway.
+	 */
+	if (timerfd_canceled(ctx)) {
+		ctx->ticks = 0;
+		ctx->expired = 0;
+		res = -ECANCELED;
+	}
+
 	if (ctx->ticks) {
 		ticks = ctx->ticks;
+
 		if (ctx->expired && ctx->tintv.tv64) {
 			/*
 			 * If tintv.tv64 != 0, this is a periodic timer that
@@ -183,6 +267,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
 	init_waitqueue_head(&ctx->wqh);
 	ctx->clockid = clockid;
 	hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS);
+	ctx->moffs = ktime_get_monotonic_offset();
 
 	ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx,
 			       O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS));
@@ -199,6 +284,7 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
 	struct file *file;
 	struct timerfd_ctx *ctx;
 	struct itimerspec ktmr, kotmr;
+	int ret;
 
 	if (copy_from_user(&ktmr, utmr, sizeof(ktmr)))
 		return -EFAULT;
@@ -213,6 +299,8 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
 		return PTR_ERR(file);
 	ctx = file->private_data;
 
+	timerfd_setup_cancel(ctx, flags);
+
 	/*
 	 * We need to stop the existing timer before reprogramming
 	 * it to the new values.
@@ -240,14 +328,14 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
 	/*
 	 * Re-program the timer to the new value ...
 	 */
-	timerfd_setup(ctx, flags, &ktmr);
+	ret = timerfd_setup(ctx, flags, &ktmr);
 
 	spin_unlock_irq(&ctx->wqh.lock);
 	fput(file);
 	if (otmr && copy_to_user(otmr, &kotmr, sizeof(kotmr)))
 		return -EFAULT;
 
-	return 0;
+	return ret;
 }
 
 SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr)
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 8b3a7da..315de66 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -106,7 +106,7 @@ static long long get_liability(struct ubifs_info *c)
 	long long liab;
 
 	spin_lock(&c->space_lock);
-	liab = c->budg_idx_growth + c->budg_data_growth + c->budg_dd_growth;
+	liab = c->bi.idx_growth + c->bi.data_growth + c->bi.dd_growth;
 	spin_unlock(&c->space_lock);
 	return liab;
 }
@@ -180,7 +180,7 @@ int ubifs_calc_min_idx_lebs(struct ubifs_info *c)
 	int idx_lebs;
 	long long idx_size;
 
-	idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx;
+	idx_size = c->bi.old_idx_sz + c->bi.idx_growth + c->bi.uncommitted_idx;
 	/* And make sure we have thrice the index size of space reserved */
 	idx_size += idx_size << 1;
 	/*
@@ -292,13 +292,13 @@ static int can_use_rp(struct ubifs_info *c)
  * budgeted index space to the size of the current index, multiplies this by 3,
  * and makes sure this does not exceed the amount of free LEBs.
  *
- * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables:
+ * Notes about @c->bi.min_idx_lebs and @c->lst.idx_lebs variables:
  * o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might
  *    be large, because UBIFS does not do any index consolidation as long as
  *    there is free space. IOW, the index may take a lot of LEBs, but the LEBs
  *    will contain a lot of dirt.
- * o @c->min_idx_lebs is the number of LEBS the index presumably takes. IOW,
- *    the index may be consolidated to take up to @c->min_idx_lebs LEBs.
+ * o @c->bi.min_idx_lebs is the number of LEBS the index presumably takes. IOW,
+ *    the index may be consolidated to take up to @c->bi.min_idx_lebs LEBs.
  *
  * This function returns zero in case of success, and %-ENOSPC in case of
  * failure.
@@ -343,13 +343,13 @@ static int do_budget_space(struct ubifs_info *c)
 	       c->lst.taken_empty_lebs;
 	if (unlikely(rsvd_idx_lebs > lebs)) {
 		dbg_budg("out of indexing space: min_idx_lebs %d (old %d), "
-			 "rsvd_idx_lebs %d", min_idx_lebs, c->min_idx_lebs,
+			 "rsvd_idx_lebs %d", min_idx_lebs, c->bi.min_idx_lebs,
 			 rsvd_idx_lebs);
 		return -ENOSPC;
 	}
 
 	available = ubifs_calc_available(c, min_idx_lebs);
-	outstanding = c->budg_data_growth + c->budg_dd_growth;
+	outstanding = c->bi.data_growth + c->bi.dd_growth;
 
 	if (unlikely(available < outstanding)) {
 		dbg_budg("out of data space: available %lld, outstanding %lld",
@@ -360,7 +360,7 @@ static int do_budget_space(struct ubifs_info *c)
 	if (available - outstanding <= c->rp_size && !can_use_rp(c))
 		return -ENOSPC;
 
-	c->min_idx_lebs = min_idx_lebs;
+	c->bi.min_idx_lebs = min_idx_lebs;
 	return 0;
 }
 
@@ -393,11 +393,11 @@ static int calc_data_growth(const struct ubifs_info *c,
 {
 	int data_growth;
 
-	data_growth = req->new_ino  ? c->inode_budget : 0;
+	data_growth = req->new_ino  ? c->bi.inode_budget : 0;
 	if (req->new_page)
-		data_growth += c->page_budget;
+		data_growth += c->bi.page_budget;
 	if (req->new_dent)
-		data_growth += c->dent_budget;
+		data_growth += c->bi.dent_budget;
 	data_growth += req->new_ino_d;
 	return data_growth;
 }
@@ -413,12 +413,12 @@ static int calc_dd_growth(const struct ubifs_info *c,
 {
 	int dd_growth;
 
-	dd_growth = req->dirtied_page ? c->page_budget : 0;
+	dd_growth = req->dirtied_page ? c->bi.page_budget : 0;
 
 	if (req->dirtied_ino)
-		dd_growth += c->inode_budget << (req->dirtied_ino - 1);
+		dd_growth += c->bi.inode_budget << (req->dirtied_ino - 1);
 	if (req->mod_dent)
-		dd_growth += c->dent_budget;
+		dd_growth += c->bi.dent_budget;
 	dd_growth += req->dirtied_ino_d;
 	return dd_growth;
 }
@@ -460,19 +460,19 @@ int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req)
 
 again:
 	spin_lock(&c->space_lock);
-	ubifs_assert(c->budg_idx_growth >= 0);
-	ubifs_assert(c->budg_data_growth >= 0);
-	ubifs_assert(c->budg_dd_growth >= 0);
+	ubifs_assert(c->bi.idx_growth >= 0);
+	ubifs_assert(c->bi.data_growth >= 0);
+	ubifs_assert(c->bi.dd_growth >= 0);
 
-	if (unlikely(c->nospace) && (c->nospace_rp || !can_use_rp(c))) {
+	if (unlikely(c->bi.nospace) && (c->bi.nospace_rp || !can_use_rp(c))) {
 		dbg_budg("no space");
 		spin_unlock(&c->space_lock);
 		return -ENOSPC;
 	}
 
-	c->budg_idx_growth += idx_growth;
-	c->budg_data_growth += data_growth;
-	c->budg_dd_growth += dd_growth;
+	c->bi.idx_growth += idx_growth;
+	c->bi.data_growth += data_growth;
+	c->bi.dd_growth += dd_growth;
 
 	err = do_budget_space(c);
 	if (likely(!err)) {
@@ -484,9 +484,9 @@ again:
 	}
 
 	/* Restore the old values */
-	c->budg_idx_growth -= idx_growth;
-	c->budg_data_growth -= data_growth;
-	c->budg_dd_growth -= dd_growth;
+	c->bi.idx_growth -= idx_growth;
+	c->bi.data_growth -= data_growth;
+	c->bi.dd_growth -= dd_growth;
 	spin_unlock(&c->space_lock);
 
 	if (req->fast) {
@@ -506,9 +506,9 @@ again:
 			goto again;
 		}
 		dbg_budg("FS is full, -ENOSPC");
-		c->nospace = 1;
+		c->bi.nospace = 1;
 		if (can_use_rp(c) || c->rp_size == 0)
-			c->nospace_rp = 1;
+			c->bi.nospace_rp = 1;
 		smp_wmb();
 	} else
 		ubifs_err("cannot budget space, error %d", err);
@@ -523,8 +523,8 @@ again:
  * This function releases the space budgeted by 'ubifs_budget_space()'. Note,
  * since the index changes (which were budgeted for in @req->idx_growth) will
  * only be written to the media on commit, this function moves the index budget
- * from @c->budg_idx_growth to @c->budg_uncommitted_idx. The latter will be
- * zeroed by the commit operation.
+ * from @c->bi.idx_growth to @c->bi.uncommitted_idx. The latter will be zeroed
+ * by the commit operation.
  */
 void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req)
 {
@@ -553,23 +553,23 @@ void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req)
 	if (!req->data_growth && !req->dd_growth)
 		return;
 
-	c->nospace = c->nospace_rp = 0;
+	c->bi.nospace = c->bi.nospace_rp = 0;
 	smp_wmb();
 
 	spin_lock(&c->space_lock);
-	c->budg_idx_growth -= req->idx_growth;
-	c->budg_uncommitted_idx += req->idx_growth;
-	c->budg_data_growth -= req->data_growth;
-	c->budg_dd_growth -= req->dd_growth;
-	c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
-
-	ubifs_assert(c->budg_idx_growth >= 0);
-	ubifs_assert(c->budg_data_growth >= 0);
-	ubifs_assert(c->budg_dd_growth >= 0);
-	ubifs_assert(c->min_idx_lebs < c->main_lebs);
-	ubifs_assert(!(c->budg_idx_growth & 7));
-	ubifs_assert(!(c->budg_data_growth & 7));
-	ubifs_assert(!(c->budg_dd_growth & 7));
+	c->bi.idx_growth -= req->idx_growth;
+	c->bi.uncommitted_idx += req->idx_growth;
+	c->bi.data_growth -= req->data_growth;
+	c->bi.dd_growth -= req->dd_growth;
+	c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+
+	ubifs_assert(c->bi.idx_growth >= 0);
+	ubifs_assert(c->bi.data_growth >= 0);
+	ubifs_assert(c->bi.dd_growth >= 0);
+	ubifs_assert(c->bi.min_idx_lebs < c->main_lebs);
+	ubifs_assert(!(c->bi.idx_growth & 7));
+	ubifs_assert(!(c->bi.data_growth & 7));
+	ubifs_assert(!(c->bi.dd_growth & 7));
 	spin_unlock(&c->space_lock);
 }
 
@@ -586,13 +586,13 @@ void ubifs_convert_page_budget(struct ubifs_info *c)
 {
 	spin_lock(&c->space_lock);
 	/* Release the index growth reservation */
-	c->budg_idx_growth -= c->max_idx_node_sz << UBIFS_BLOCKS_PER_PAGE_SHIFT;
+	c->bi.idx_growth -= c->max_idx_node_sz << UBIFS_BLOCKS_PER_PAGE_SHIFT;
 	/* Release the data growth reservation */
-	c->budg_data_growth -= c->page_budget;
+	c->bi.data_growth -= c->bi.page_budget;
 	/* Increase the dirty data growth reservation instead */
-	c->budg_dd_growth += c->page_budget;
+	c->bi.dd_growth += c->bi.page_budget;
 	/* And re-calculate the indexing space reservation */
-	c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+	c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c);
 	spin_unlock(&c->space_lock);
 }
 
@@ -612,7 +612,7 @@ void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
 
 	memset(&req, 0, sizeof(struct ubifs_budget_req));
 	/* The "no space" flags will be cleared because dd_growth is > 0 */
-	req.dd_growth = c->inode_budget + ALIGN(ui->data_len, 8);
+	req.dd_growth = c->bi.inode_budget + ALIGN(ui->data_len, 8);
 	ubifs_release_budget(c, &req);
 }
 
@@ -682,9 +682,9 @@ long long ubifs_get_free_space_nolock(struct ubifs_info *c)
 	int rsvd_idx_lebs, lebs;
 	long long available, outstanding, free;
 
-	ubifs_assert(c->min_idx_lebs == ubifs_calc_min_idx_lebs(c));
-	outstanding = c->budg_data_growth + c->budg_dd_growth;
-	available = ubifs_calc_available(c, c->min_idx_lebs);
+	ubifs_assert(c->bi.min_idx_lebs == ubifs_calc_min_idx_lebs(c));
+	outstanding = c->bi.data_growth + c->bi.dd_growth;
+	available = ubifs_calc_available(c, c->bi.min_idx_lebs);
 
 	/*
 	 * When reporting free space to user-space, UBIFS guarantees that it is
@@ -697,8 +697,8 @@ long long ubifs_get_free_space_nolock(struct ubifs_info *c)
 	 * Note, the calculations below are similar to what we have in
 	 * 'do_budget_space()', so refer there for comments.
 	 */
-	if (c->min_idx_lebs > c->lst.idx_lebs)
-		rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs;
+	if (c->bi.min_idx_lebs > c->lst.idx_lebs)
+		rsvd_idx_lebs = c->bi.min_idx_lebs - c->lst.idx_lebs;
 	else
 		rsvd_idx_lebs = 0;
 	lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index 1bd01de..87cd0ea 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -182,7 +182,7 @@ static int do_commit(struct ubifs_info *c)
 	c->mst_node->root_len    = cpu_to_le32(zroot.len);
 	c->mst_node->ihead_lnum  = cpu_to_le32(c->ihead_lnum);
 	c->mst_node->ihead_offs  = cpu_to_le32(c->ihead_offs);
-	c->mst_node->index_size  = cpu_to_le64(c->old_idx_sz);
+	c->mst_node->index_size  = cpu_to_le64(c->bi.old_idx_sz);
 	c->mst_node->lpt_lnum    = cpu_to_le32(c->lpt_lnum);
 	c->mst_node->lpt_offs    = cpu_to_le32(c->lpt_offs);
 	c->mst_node->nhead_lnum  = cpu_to_le32(c->nhead_lnum);
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 004d374..0bb2bce 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -34,7 +34,6 @@
 #include <linux/moduleparam.h>
 #include <linux/debugfs.h>
 #include <linux/math64.h>
-#include <linux/slab.h>
 
 #ifdef CONFIG_UBIFS_FS_DEBUG
 
@@ -43,15 +42,12 @@ DEFINE_SPINLOCK(dbg_lock);
 static char dbg_key_buf0[128];
 static char dbg_key_buf1[128];
 
-unsigned int ubifs_msg_flags;
 unsigned int ubifs_chk_flags;
 unsigned int ubifs_tst_flags;
 
-module_param_named(debug_msgs, ubifs_msg_flags, uint, S_IRUGO | S_IWUSR);
 module_param_named(debug_chks, ubifs_chk_flags, uint, S_IRUGO | S_IWUSR);
 module_param_named(debug_tsts, ubifs_tst_flags, uint, S_IRUGO | S_IWUSR);
 
-MODULE_PARM_DESC(debug_msgs, "Debug message type flags");
 MODULE_PARM_DESC(debug_chks, "Debug check flags");
 MODULE_PARM_DESC(debug_tsts, "Debug special test flags");
 
@@ -317,6 +313,8 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
 		printk(KERN_DEBUG "\tflags          %#x\n", sup_flags);
 		printk(KERN_DEBUG "\t  big_lpt      %u\n",
 		       !!(sup_flags & UBIFS_FLG_BIGLPT));
+		printk(KERN_DEBUG "\t  space_fixup  %u\n",
+		       !!(sup_flags & UBIFS_FLG_SPACE_FIXUP));
 		printk(KERN_DEBUG "\tmin_io_size    %u\n",
 		       le32_to_cpu(sup->min_io_size));
 		printk(KERN_DEBUG "\tleb_size       %u\n",
@@ -602,7 +600,7 @@ void dbg_dump_lstats(const struct ubifs_lp_stats *lst)
 	spin_unlock(&dbg_lock);
 }
 
-void dbg_dump_budg(struct ubifs_info *c)
+void dbg_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi)
 {
 	int i;
 	struct rb_node *rb;
@@ -610,26 +608,42 @@ void dbg_dump_budg(struct ubifs_info *c)
 	struct ubifs_gced_idx_leb *idx_gc;
 	long long available, outstanding, free;
 
-	ubifs_assert(spin_is_locked(&c->space_lock));
+	spin_lock(&c->space_lock);
 	spin_lock(&dbg_lock);
-	printk(KERN_DEBUG "(pid %d) Budgeting info: budg_data_growth %lld, "
-	       "budg_dd_growth %lld, budg_idx_growth %lld\n", current->pid,
-	       c->budg_data_growth, c->budg_dd_growth, c->budg_idx_growth);
-	printk(KERN_DEBUG "\tdata budget sum %lld, total budget sum %lld, "
-	       "freeable_cnt %d\n", c->budg_data_growth + c->budg_dd_growth,
-	       c->budg_data_growth + c->budg_dd_growth + c->budg_idx_growth,
-	       c->freeable_cnt);
-	printk(KERN_DEBUG "\tmin_idx_lebs %d, old_idx_sz %lld, "
-	       "calc_idx_sz %lld, idx_gc_cnt %d\n", c->min_idx_lebs,
-	       c->old_idx_sz, c->calc_idx_sz, c->idx_gc_cnt);
+	printk(KERN_DEBUG "(pid %d) Budgeting info: data budget sum %lld, "
+	       "total budget sum %lld\n", current->pid,
+	       bi->data_growth + bi->dd_growth,
+	       bi->data_growth + bi->dd_growth + bi->idx_growth);
+	printk(KERN_DEBUG "\tbudg_data_growth %lld, budg_dd_growth %lld, "
+	       "budg_idx_growth %lld\n", bi->data_growth, bi->dd_growth,
+	       bi->idx_growth);
+	printk(KERN_DEBUG "\tmin_idx_lebs %d, old_idx_sz %llu, "
+	       "uncommitted_idx %lld\n", bi->min_idx_lebs, bi->old_idx_sz,
+	       bi->uncommitted_idx);
+	printk(KERN_DEBUG "\tpage_budget %d, inode_budget %d, dent_budget %d\n",
+	       bi->page_budget, bi->inode_budget, bi->dent_budget);
+	printk(KERN_DEBUG "\tnospace %u, nospace_rp %u\n",
+	       bi->nospace, bi->nospace_rp);
+	printk(KERN_DEBUG "\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n",
+	       c->dark_wm, c->dead_wm, c->max_idx_node_sz);
+
+	if (bi != &c->bi)
+		/*
+		 * If we are dumping saved budgeting data, do not print
+		 * additional information which is about the current state, not
+		 * the old one which corresponded to the saved budgeting data.
+		 */
+		goto out_unlock;
+
+	printk(KERN_DEBUG "\tfreeable_cnt %d, calc_idx_sz %lld, idx_gc_cnt %d\n",
+	       c->freeable_cnt, c->calc_idx_sz, c->idx_gc_cnt);
 	printk(KERN_DEBUG "\tdirty_pg_cnt %ld, dirty_zn_cnt %ld, "
 	       "clean_zn_cnt %ld\n", atomic_long_read(&c->dirty_pg_cnt),
 	       atomic_long_read(&c->dirty_zn_cnt),
 	       atomic_long_read(&c->clean_zn_cnt));
-	printk(KERN_DEBUG "\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n",
-	       c->dark_wm, c->dead_wm, c->max_idx_node_sz);
 	printk(KERN_DEBUG "\tgc_lnum %d, ihead_lnum %d\n",
 	       c->gc_lnum, c->ihead_lnum);
+
 	/* If we are in R/O mode, journal heads do not exist */
 	if (c->jheads)
 		for (i = 0; i < c->jhead_cnt; i++)
@@ -648,13 +662,15 @@ void dbg_dump_budg(struct ubifs_info *c)
 	printk(KERN_DEBUG "\tcommit state %d\n", c->cmt_state);
 
 	/* Print budgeting predictions */
-	available = ubifs_calc_available(c, c->min_idx_lebs);
-	outstanding = c->budg_data_growth + c->budg_dd_growth;
+	available = ubifs_calc_available(c, c->bi.min_idx_lebs);
+	outstanding = c->bi.data_growth + c->bi.dd_growth;
 	free = ubifs_get_free_space_nolock(c);
 	printk(KERN_DEBUG "Budgeting predictions:\n");
 	printk(KERN_DEBUG "\tavailable: %lld, outstanding %lld, free %lld\n",
 	       available, outstanding, free);
+out_unlock:
 	spin_unlock(&dbg_lock);
+	spin_unlock(&c->space_lock);
 }
 
 void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp)
@@ -729,7 +745,13 @@ void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp)
 		if (bud->lnum == lp->lnum) {
 			int head = 0;
 			for (i = 0; i < c->jhead_cnt; i++) {
-				if (lp->lnum == c->jheads[i].wbuf.lnum) {
+				/*
+				 * Note, if we are in R/O mode or in the middle
+				 * of mounting/re-mounting, the write-buffers do
+				 * not exist.
+				 */
+				if (c->jheads &&
+				    lp->lnum == c->jheads[i].wbuf.lnum) {
 					printk(KERN_CONT ", jhead %s",
 					       dbg_jhead(i));
 					head = 1;
@@ -976,6 +998,8 @@ void dbg_save_space_info(struct ubifs_info *c)
 
 	spin_lock(&c->space_lock);
 	memcpy(&d->saved_lst, &c->lst, sizeof(struct ubifs_lp_stats));
+	memcpy(&d->saved_bi, &c->bi, sizeof(struct ubifs_budg_info));
+	d->saved_idx_gc_cnt = c->idx_gc_cnt;
 
 	/*
 	 * We use a dirty hack here and zero out @c->freeable_cnt, because it
@@ -1042,14 +1066,14 @@ int dbg_check_space_info(struct ubifs_info *c)
 out:
 	ubifs_msg("saved lprops statistics dump");
 	dbg_dump_lstats(&d->saved_lst);
-	ubifs_get_lp_stats(c, &lst);
-
+	ubifs_msg("saved budgeting info dump");
+	dbg_dump_budg(c, &d->saved_bi);
+	ubifs_msg("saved idx_gc_cnt %d", d->saved_idx_gc_cnt);
 	ubifs_msg("current lprops statistics dump");
+	ubifs_get_lp_stats(c, &lst);
 	dbg_dump_lstats(&lst);
-
-	spin_lock(&c->space_lock);
-	dbg_dump_budg(c);
-	spin_unlock(&c->space_lock);
+	ubifs_msg("current budgeting info dump");
+	dbg_dump_budg(c, &c->bi);
 	dump_stack();
 	return -EINVAL;
 }
@@ -1793,6 +1817,8 @@ static struct fsck_inode *add_inode(struct ubifs_info *c,
 	struct rb_node **p, *parent = NULL;
 	struct fsck_inode *fscki;
 	ino_t inum = key_inum_flash(c, &ino->key);
+	struct inode *inode;
+	struct ubifs_inode *ui;
 
 	p = &fsckd->inodes.rb_node;
 	while (*p) {
@@ -1816,19 +1842,46 @@ static struct fsck_inode *add_inode(struct ubifs_info *c,
 	if (!fscki)
 		return ERR_PTR(-ENOMEM);
 
+	inode = ilookup(c->vfs_sb, inum);
+
 	fscki->inum = inum;
-	fscki->nlink = le32_to_cpu(ino->nlink);
-	fscki->size = le64_to_cpu(ino->size);
-	fscki->xattr_cnt = le32_to_cpu(ino->xattr_cnt);
-	fscki->xattr_sz = le32_to_cpu(ino->xattr_size);
-	fscki->xattr_nms = le32_to_cpu(ino->xattr_names);
-	fscki->mode = le32_to_cpu(ino->mode);
+	/*
+	 * If the inode is present in the VFS inode cache, use it instead of
+	 * the on-flash inode which might be out-of-date. E.g., the size might
+	 * be out-of-date. If we do not do this, the following may happen, for
+	 * example:
+	 *   1. A power cut happens
+	 *   2. We mount the file-system R/O, the replay process fixes up the
+	 *      inode size in the VFS cache, but on on-flash.
+	 *   3. 'check_leaf()' fails because it hits a data node beyond inode
+	 *      size.
+	 */
+	if (!inode) {
+		fscki->nlink = le32_to_cpu(ino->nlink);
+		fscki->size = le64_to_cpu(ino->size);
+		fscki->xattr_cnt = le32_to_cpu(ino->xattr_cnt);
+		fscki->xattr_sz = le32_to_cpu(ino->xattr_size);
+		fscki->xattr_nms = le32_to_cpu(ino->xattr_names);
+		fscki->mode = le32_to_cpu(ino->mode);
+	} else {
+		ui = ubifs_inode(inode);
+		fscki->nlink = inode->i_nlink;
+		fscki->size = inode->i_size;
+		fscki->xattr_cnt = ui->xattr_cnt;
+		fscki->xattr_sz = ui->xattr_size;
+		fscki->xattr_nms = ui->xattr_names;
+		fscki->mode = inode->i_mode;
+		iput(inode);
+	}
+
 	if (S_ISDIR(fscki->mode)) {
 		fscki->calc_sz = UBIFS_INO_NODE_SZ;
 		fscki->calc_cnt = 2;
 	}
+
 	rb_link_node(&fscki->rb, parent, p);
 	rb_insert_color(&fscki->rb, &fsckd->inodes);
+
 	return fscki;
 }
 
@@ -2421,7 +2474,8 @@ int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head)
 		hashb = key_block(c, &sb->key);
 
 		if (hasha > hashb) {
-			ubifs_err("larger hash %u goes before %u", hasha, hashb);
+			ubifs_err("larger hash %u goes before %u",
+				  hasha, hashb);
 			goto error_dump;
 		}
 	}
@@ -2437,14 +2491,12 @@ error_dump:
 	return 0;
 }
 
-static int invocation_cnt;
-
 int dbg_force_in_the_gaps(void)
 {
-	if (!dbg_force_in_the_gaps_enabled)
+	if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
 		return 0;
-	/* Force in-the-gaps every 8th commit */
-	return !((invocation_cnt++) & 0x7);
+
+	return !(random32() & 7);
 }
 
 /* Failure mode for recovery testing */
@@ -2632,7 +2684,7 @@ int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset,
 		 int len, int check)
 {
 	if (in_failure_mode(desc))
-		return -EIO;
+		return -EROFS;
 	return ubi_leb_read(desc, lnum, buf, offset, len, check);
 }
 
@@ -2642,7 +2694,7 @@ int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf,
 	int err, failing;
 
 	if (in_failure_mode(desc))
-		return -EIO;
+		return -EROFS;
 	failing = do_fail(desc, lnum, 1);
 	if (failing)
 		cut_data(buf, len);
@@ -2650,7 +2702,7 @@ int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf,
 	if (err)
 		return err;
 	if (failing)
-		return -EIO;
+		return -EROFS;
 	return 0;
 }
 
@@ -2660,12 +2712,12 @@ int dbg_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf,
 	int err;
 
 	if (do_fail(desc, lnum, 1))
-		return -EIO;
+		return -EROFS;
 	err = ubi_leb_change(desc, lnum, buf, len, dtype);
 	if (err)
 		return err;
 	if (do_fail(desc, lnum, 1))
-		return -EIO;
+		return -EROFS;
 	return 0;
 }
 
@@ -2674,12 +2726,12 @@ int dbg_leb_erase(struct ubi_volume_desc *desc, int lnum)
 	int err;
 
 	if (do_fail(desc, lnum, 0))
-		return -EIO;
+		return -EROFS;
 	err = ubi_leb_erase(desc, lnum);
 	if (err)
 		return err;
 	if (do_fail(desc, lnum, 0))
-		return -EIO;
+		return -EROFS;
 	return 0;
 }
 
@@ -2688,19 +2740,19 @@ int dbg_leb_unmap(struct ubi_volume_desc *desc, int lnum)
 	int err;
 
 	if (do_fail(desc, lnum, 0))
-		return -EIO;
+		return -EROFS;
 	err = ubi_leb_unmap(desc, lnum);
 	if (err)
 		return err;
 	if (do_fail(desc, lnum, 0))
-		return -EIO;
+		return -EROFS;
 	return 0;
 }
 
 int dbg_is_mapped(struct ubi_volume_desc *desc, int lnum)
 {
 	if (in_failure_mode(desc))
-		return -EIO;
+		return -EROFS;
 	return ubi_is_mapped(desc, lnum);
 }
 
@@ -2709,12 +2761,12 @@ int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype)
 	int err;
 
 	if (do_fail(desc, lnum, 0))
-		return -EIO;
+		return -EROFS;
 	err = ubi_leb_map(desc, lnum, dtype);
 	if (err)
 		return err;
 	if (do_fail(desc, lnum, 0))
-		return -EIO;
+		return -EROFS;
 	return 0;
 }
 
@@ -2784,7 +2836,7 @@ void dbg_debugfs_exit(void)
 static int open_debugfs_file(struct inode *inode, struct file *file)
 {
 	file->private_data = inode->i_private;
-	return 0;
+	return nonseekable_open(inode, file);
 }
 
 static ssize_t write_debugfs_file(struct file *file, const char __user *buf,
@@ -2795,18 +2847,15 @@ static ssize_t write_debugfs_file(struct file *file, const char __user *buf,
 
 	if (file->f_path.dentry == d->dfs_dump_lprops)
 		dbg_dump_lprops(c);
-	else if (file->f_path.dentry == d->dfs_dump_budg) {
-		spin_lock(&c->space_lock);
-		dbg_dump_budg(c);
-		spin_unlock(&c->space_lock);
-	} else if (file->f_path.dentry == d->dfs_dump_tnc) {
+	else if (file->f_path.dentry == d->dfs_dump_budg)
+		dbg_dump_budg(c, &c->bi);
+	else if (file->f_path.dentry == d->dfs_dump_tnc) {
 		mutex_lock(&c->tnc_mutex);
 		dbg_dump_tnc(c);
 		mutex_unlock(&c->tnc_mutex);
 	} else
 		return -EINVAL;
 
-	*ppos += count;
 	return count;
 }
 
@@ -2814,7 +2863,7 @@ static const struct file_operations dfs_fops = {
 	.open = open_debugfs_file,
 	.write = write_debugfs_file,
 	.owner = THIS_MODULE,
-	.llseek = default_llseek,
+	.llseek = no_llseek,
 };
 
 /**
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index e6493ca..a811ac4 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -31,6 +31,8 @@ typedef int (*dbg_znode_callback)(struct ubifs_info *c,
 
 #ifdef CONFIG_UBIFS_FS_DEBUG
 
+#include <linux/random.h>
+
 /**
  * ubifs_debug_info - per-FS debugging information.
  * @old_zroot: old index root - used by 'dbg_check_old_index()'
@@ -50,13 +52,15 @@ typedef int (*dbg_znode_callback)(struct ubifs_info *c,
  * @new_ihead_offs: used by debugging to check @c->ihead_offs
  *
  * @saved_lst: saved lprops statistics (used by 'dbg_save_space_info()')
- * @saved_free: saved free space (used by 'dbg_save_space_info()')
+ * @saved_bi: saved budgeting information
+ * @saved_free: saved amount of free space
+ * @saved_idx_gc_cnt: saved value of @c->idx_gc_cnt
  *
- * dfs_dir_name: name of debugfs directory containing this file-system's files
- * dfs_dir: direntry object of the file-system debugfs directory
- * dfs_dump_lprops: "dump lprops" debugfs knob
- * dfs_dump_budg: "dump budgeting information" debugfs knob
- * dfs_dump_tnc: "dump TNC" debugfs knob
+ * @dfs_dir_name: name of debugfs directory containing this file-system's files
+ * @dfs_dir: direntry object of the file-system debugfs directory
+ * @dfs_dump_lprops: "dump lprops" debugfs knob
+ * @dfs_dump_budg: "dump budgeting information" debugfs knob
+ * @dfs_dump_tnc: "dump TNC" debugfs knob
  */
 struct ubifs_debug_info {
 	struct ubifs_zbranch old_zroot;
@@ -76,7 +80,9 @@ struct ubifs_debug_info {
 	int new_ihead_offs;
 
 	struct ubifs_lp_stats saved_lst;
+	struct ubifs_budg_info saved_bi;
 	long long saved_free;
+	int saved_idx_gc_cnt;
 
 	char dfs_dir_name[100];
 	struct dentry *dfs_dir;
@@ -101,23 +107,7 @@ struct ubifs_debug_info {
 	}                                                                      \
 } while (0)
 
-#define dbg_dump_stack() do {                                                  \
-	if (!dbg_failure_mode)                                                 \
-		dump_stack();                                                  \
-} while (0)
-
-/* Generic debugging messages */
-#define dbg_msg(fmt, ...) do {                                                 \
-	spin_lock(&dbg_lock);                                                  \
-	printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", current->pid,   \
-	       __func__, ##__VA_ARGS__);                                       \
-	spin_unlock(&dbg_lock);                                                \
-} while (0)
-
-#define dbg_do_msg(typ, fmt, ...) do {                                         \
-	if (ubifs_msg_flags & typ)                                             \
-		dbg_msg(fmt, ##__VA_ARGS__);                                   \
-} while (0)
+#define dbg_dump_stack() dump_stack()
 
 #define dbg_err(fmt, ...) do {                                                 \
 	spin_lock(&dbg_lock);                                                  \
@@ -137,77 +127,40 @@ const char *dbg_key_str1(const struct ubifs_info *c,
 #define DBGKEY(key) dbg_key_str0(c, (key))
 #define DBGKEY1(key) dbg_key_str1(c, (key))
 
-/* General messages */
-#define dbg_gen(fmt, ...)   dbg_do_msg(UBIFS_MSG_GEN, fmt, ##__VA_ARGS__)
+#define ubifs_dbg_msg(type, fmt, ...) do {                        \
+	spin_lock(&dbg_lock);                                     \
+	pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__); \
+	spin_unlock(&dbg_lock);                                   \
+} while (0)
 
+/* Just a debugging messages not related to any specific UBIFS subsystem */
+#define dbg_msg(fmt, ...)   ubifs_dbg_msg("msg", fmt, ##__VA_ARGS__)
+/* General messages */
+#define dbg_gen(fmt, ...)   ubifs_dbg_msg("gen", fmt, ##__VA_ARGS__)
 /* Additional journal messages */
-#define dbg_jnl(fmt, ...)   dbg_do_msg(UBIFS_MSG_JNL, fmt, ##__VA_ARGS__)
-
+#define dbg_jnl(fmt, ...)   ubifs_dbg_msg("jnl", fmt, ##__VA_ARGS__)
 /* Additional TNC messages */
-#define dbg_tnc(fmt, ...)   dbg_do_msg(UBIFS_MSG_TNC, fmt, ##__VA_ARGS__)
-
+#define dbg_tnc(fmt, ...)   ubifs_dbg_msg("tnc", fmt, ##__VA_ARGS__)
 /* Additional lprops messages */
-#define dbg_lp(fmt, ...)    dbg_do_msg(UBIFS_MSG_LP, fmt, ##__VA_ARGS__)
-
+#define dbg_lp(fmt, ...)    ubifs_dbg_msg("lp", fmt, ##__VA_ARGS__)
 /* Additional LEB find messages */
-#define dbg_find(fmt, ...)  dbg_do_msg(UBIFS_MSG_FIND, fmt, ##__VA_ARGS__)
-
+#define dbg_find(fmt, ...)  ubifs_dbg_msg("find", fmt, ##__VA_ARGS__)
 /* Additional mount messages */
-#define dbg_mnt(fmt, ...)   dbg_do_msg(UBIFS_MSG_MNT, fmt, ##__VA_ARGS__)
-
+#define dbg_mnt(fmt, ...)   ubifs_dbg_msg("mnt", fmt, ##__VA_ARGS__)
 /* Additional I/O messages */
-#define dbg_io(fmt, ...)    dbg_do_msg(UBIFS_MSG_IO, fmt, ##__VA_ARGS__)
-
+#define dbg_io(fmt, ...)    ubifs_dbg_msg("io", fmt, ##__VA_ARGS__)
 /* Additional commit messages */
-#define dbg_cmt(fmt, ...)   dbg_do_msg(UBIFS_MSG_CMT, fmt, ##__VA_ARGS__)
-
+#define dbg_cmt(fmt, ...)   ubifs_dbg_msg("cmt", fmt, ##__VA_ARGS__)
 /* Additional budgeting messages */
-#define dbg_budg(fmt, ...)  dbg_do_msg(UBIFS_MSG_BUDG, fmt, ##__VA_ARGS__)
-
+#define dbg_budg(fmt, ...)  ubifs_dbg_msg("budg", fmt, ##__VA_ARGS__)
 /* Additional log messages */
-#define dbg_log(fmt, ...)   dbg_do_msg(UBIFS_MSG_LOG, fmt, ##__VA_ARGS__)
-
+#define dbg_log(fmt, ...)   ubifs_dbg_msg("log", fmt, ##__VA_ARGS__)
 /* Additional gc messages */
-#define dbg_gc(fmt, ...)    dbg_do_msg(UBIFS_MSG_GC, fmt, ##__VA_ARGS__)
-
+#define dbg_gc(fmt, ...)    ubifs_dbg_msg("gc", fmt, ##__VA_ARGS__)
 /* Additional scan messages */
-#define dbg_scan(fmt, ...)  dbg_do_msg(UBIFS_MSG_SCAN, fmt, ##__VA_ARGS__)
-
+#define dbg_scan(fmt, ...)  ubifs_dbg_msg("scan", fmt, ##__VA_ARGS__)
 /* Additional recovery messages */
-#define dbg_rcvry(fmt, ...) dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__)
-
-/*
- * Debugging message type flags.
- *
- * UBIFS_MSG_GEN: general messages
- * UBIFS_MSG_JNL: journal messages
- * UBIFS_MSG_MNT: mount messages
- * UBIFS_MSG_CMT: commit messages
- * UBIFS_MSG_FIND: LEB find messages
- * UBIFS_MSG_BUDG: budgeting messages
- * UBIFS_MSG_GC: garbage collection messages
- * UBIFS_MSG_TNC: TNC messages
- * UBIFS_MSG_LP: lprops messages
- * UBIFS_MSG_IO: I/O messages
- * UBIFS_MSG_LOG: log messages
- * UBIFS_MSG_SCAN: scan messages
- * UBIFS_MSG_RCVRY: recovery messages
- */
-enum {
-	UBIFS_MSG_GEN   = 0x1,
-	UBIFS_MSG_JNL   = 0x2,
-	UBIFS_MSG_MNT   = 0x4,
-	UBIFS_MSG_CMT   = 0x8,
-	UBIFS_MSG_FIND  = 0x10,
-	UBIFS_MSG_BUDG  = 0x20,
-	UBIFS_MSG_GC    = 0x40,
-	UBIFS_MSG_TNC   = 0x80,
-	UBIFS_MSG_LP    = 0x100,
-	UBIFS_MSG_IO    = 0x200,
-	UBIFS_MSG_LOG   = 0x400,
-	UBIFS_MSG_SCAN  = 0x800,
-	UBIFS_MSG_RCVRY = 0x1000,
-};
+#define dbg_rcvry(fmt, ...) ubifs_dbg_msg("rcvry", fmt, ##__VA_ARGS__)
 
 /*
  * Debugging check flags.
@@ -233,11 +186,9 @@ enum {
 /*
  * Special testing flags.
  *
- * UBIFS_TST_FORCE_IN_THE_GAPS: force the use of in-the-gaps method
  * UBIFS_TST_RCVRY: failure mode for recovery testing
  */
 enum {
-	UBIFS_TST_FORCE_IN_THE_GAPS = 0x2,
 	UBIFS_TST_RCVRY             = 0x4,
 };
 
@@ -262,7 +213,7 @@ void dbg_dump_lpt_node(const struct ubifs_info *c, void *node, int lnum,
 		       int offs);
 void dbg_dump_budget_req(const struct ubifs_budget_req *req);
 void dbg_dump_lstats(const struct ubifs_lp_stats *lst);
-void dbg_dump_budg(struct ubifs_info *c);
+void dbg_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi);
 void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp);
 void dbg_dump_lprops(struct ubifs_info *c);
 void dbg_dump_lpt_info(struct ubifs_info *c);
@@ -304,18 +255,16 @@ int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head);
 int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head);
 
 /* Force the use of in-the-gaps method for testing */
-
-#define dbg_force_in_the_gaps_enabled \
-	(ubifs_tst_flags & UBIFS_TST_FORCE_IN_THE_GAPS)
-
+static inline int dbg_force_in_the_gaps_enabled(void)
+{
+	return ubifs_chk_flags & UBIFS_CHK_GEN;
+}
 int dbg_force_in_the_gaps(void);
 
 /* Failure mode for recovery testing */
-
 #define dbg_failure_mode (ubifs_tst_flags & UBIFS_TST_RCVRY)
 
 #ifndef UBIFS_DBG_PRESERVE_UBI
-
 #define ubi_leb_read   dbg_leb_read
 #define ubi_leb_write  dbg_leb_write
 #define ubi_leb_change dbg_leb_change
@@ -323,7 +272,6 @@ int dbg_force_in_the_gaps(void);
 #define ubi_leb_unmap  dbg_leb_unmap
 #define ubi_is_mapped  dbg_is_mapped
 #define ubi_leb_map    dbg_leb_map
-
 #endif
 
 int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset,
@@ -370,33 +318,33 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
 		       __func__, __LINE__, current->pid);                      \
 } while (0)
 
-#define dbg_err(fmt, ...)   do {                                               \
-	if (0)                                                                 \
-		ubifs_err(fmt, ##__VA_ARGS__);                                 \
+#define dbg_err(fmt, ...)   do {                   \
+	if (0)                                     \
+		ubifs_err(fmt, ##__VA_ARGS__);     \
 } while (0)
 
-#define dbg_msg(fmt, ...) do {                                                 \
-	if (0)                                                                 \
-		printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n",         \
-		       current->pid, __func__, ##__VA_ARGS__);                 \
+#define ubifs_dbg_msg(fmt, ...) do {               \
+	if (0)                                     \
+		pr_debug(fmt "\n", ##__VA_ARGS__); \
 } while (0)
 
 #define dbg_dump_stack()
 #define ubifs_assert_cmt_locked(c)
 
-#define dbg_gen(fmt, ...)   dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_jnl(fmt, ...)   dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_tnc(fmt, ...)   dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_lp(fmt, ...)    dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_find(fmt, ...)  dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_mnt(fmt, ...)   dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_io(fmt, ...)    dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_cmt(fmt, ...)   dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_budg(fmt, ...)  dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_log(fmt, ...)   dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_gc(fmt, ...)    dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_scan(fmt, ...)  dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_rcvry(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_msg(fmt, ...)   ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_gen(fmt, ...)   ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_jnl(fmt, ...)   ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_tnc(fmt, ...)   ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_lp(fmt, ...)    ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_find(fmt, ...)  ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_mnt(fmt, ...)   ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_io(fmt, ...)    ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_cmt(fmt, ...)   ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_budg(fmt, ...)  ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_log(fmt, ...)   ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_gc(fmt, ...)    ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_scan(fmt, ...)  ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_rcvry(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
 
 #define DBGKEY(key)  ((char *)(key))
 #define DBGKEY1(key) ((char *)(key))
@@ -420,7 +368,9 @@ static inline void
 dbg_dump_budget_req(const struct ubifs_budget_req *req)           { return; }
 static inline void
 dbg_dump_lstats(const struct ubifs_lp_stats *lst)                 { return; }
-static inline void dbg_dump_budg(struct ubifs_info *c)            { return; }
+static inline void
+dbg_dump_budg(struct ubifs_info *c,
+	      const struct ubifs_budg_info *bi)                   { return; }
 static inline void dbg_dump_lprop(const struct ubifs_info *c,
 				  const struct ubifs_lprops *lp)  { return; }
 static inline void dbg_dump_lprops(struct ubifs_info *c)          { return; }
@@ -482,8 +432,8 @@ dbg_check_nondata_nodes_order(struct ubifs_info *c,
 			      struct list_head *head)             { return 0; }
 
 static inline int dbg_force_in_the_gaps(void)                     { return 0; }
-#define dbg_force_in_the_gaps_enabled 0
-#define dbg_failure_mode              0
+#define dbg_force_in_the_gaps_enabled() 0
+#define dbg_failure_mode                0
 
 static inline int dbg_debugfs_init(void)                          { return 0; }
 static inline void dbg_debugfs_exit(void)                         { return; }
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 7217d67..ef5abd3 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -603,7 +603,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
 		ubifs_release_budget(c, &req);
 	else {
 		/* We've deleted something - clean the "no space" flags */
-		c->nospace = c->nospace_rp = 0;
+		c->bi.nospace = c->bi.nospace_rp = 0;
 		smp_wmb();
 	}
 	return 0;
@@ -693,7 +693,7 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
 		ubifs_release_budget(c, &req);
 	else {
 		/* We've deleted something - clean the "no space" flags */
-		c->nospace = c->nospace_rp = 0;
+		c->bi.nospace = c->bi.nospace_rp = 0;
 		smp_wmb();
 	}
 	return 0;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index b286db7..5e7fccf 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -212,7 +212,7 @@ static void release_new_page_budget(struct ubifs_info *c)
  */
 static void release_existing_page_budget(struct ubifs_info *c)
 {
-	struct ubifs_budget_req req = { .dd_growth = c->page_budget};
+	struct ubifs_budget_req req = { .dd_growth = c->bi.page_budget};
 
 	ubifs_release_budget(c, &req);
 }
@@ -971,11 +971,11 @@ static int do_writepage(struct page *page, int len)
  * the page locked, and it locks @ui_mutex. However, write-back does take inode
  * @i_mutex, which means other VFS operations may be run on this inode at the
  * same time. And the problematic one is truncation to smaller size, from where
- * we have to call 'truncate_setsize()', which first changes @inode->i_size, then
- * drops the truncated pages. And while dropping the pages, it takes the page
- * lock. This means that 'do_truncation()' cannot call 'truncate_setsize()' with
- * @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. This
- * means that @inode->i_size is changed while @ui_mutex is unlocked.
+ * we have to call 'truncate_setsize()', which first changes @inode->i_size,
+ * then drops the truncated pages. And while dropping the pages, it takes the
+ * page lock. This means that 'do_truncation()' cannot call 'truncate_setsize()'
+ * with @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'.
+ * This means that @inode->i_size is changed while @ui_mutex is unlocked.
  *
  * XXX(truncate): with the new truncate sequence this is not true anymore,
  * and the calls to truncate_setsize can be move around freely.  They should
@@ -1189,7 +1189,7 @@ out_budg:
 	if (budgeted)
 		ubifs_release_budget(c, &req);
 	else {
-		c->nospace = c->nospace_rp = 0;
+		c->bi.nospace = c->bi.nospace_rp = 0;
 		smp_wmb();
 	}
 	return err;
@@ -1312,7 +1312,11 @@ int ubifs_fsync(struct file *file, int datasync)
 
 	dbg_gen("syncing inode %lu", inode->i_ino);
 
-	if (inode->i_sb->s_flags & MS_RDONLY)
+	if (c->ro_mount)
+		/*
+		 * For some really strange reasons VFS does not filter out
+		 * 'fsync()' for R/O mounted file-systems as per 2.6.39.
+		 */
 		return 0;
 
 	/*
@@ -1432,10 +1436,11 @@ static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags)
 }
 
 /*
- * mmap()d file has taken write protection fault and is being made
- * writable. UBIFS must ensure page is budgeted for.
+ * mmap()d file has taken write protection fault and is being made writable.
+ * UBIFS must ensure page is budgeted for.
  */
-static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma,
+				 struct vm_fault *vmf)
 {
 	struct page *page = vmf->page;
 	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
@@ -1536,7 +1541,6 @@ static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	int err;
 
-	/* 'generic_file_mmap()' takes care of NOMMU case */
 	err = generic_file_mmap(file, vma);
 	if (err)
 		return err;
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
index 1d54383..2559d17 100644
--- a/fs/ubifs/find.c
+++ b/fs/ubifs/find.c
@@ -252,8 +252,8 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
 		 * But if the index takes fewer LEBs than it is reserved for it,
 		 * this function must avoid picking those reserved LEBs.
 		 */
-		if (c->min_idx_lebs >= c->lst.idx_lebs) {
-			rsvd_idx_lebs = c->min_idx_lebs -  c->lst.idx_lebs;
+		if (c->bi.min_idx_lebs >= c->lst.idx_lebs) {
+			rsvd_idx_lebs = c->bi.min_idx_lebs -  c->lst.idx_lebs;
 			exclude_index = 1;
 		}
 		spin_unlock(&c->space_lock);
@@ -276,7 +276,7 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
 			pick_free = 0;
 	} else {
 		spin_lock(&c->space_lock);
-		exclude_index = (c->min_idx_lebs >= c->lst.idx_lebs);
+		exclude_index = (c->bi.min_idx_lebs >= c->lst.idx_lebs);
 		spin_unlock(&c->space_lock);
 	}
 
@@ -501,8 +501,8 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs,
 
 	/* Check if there are enough empty LEBs for commit */
 	spin_lock(&c->space_lock);
-	if (c->min_idx_lebs > c->lst.idx_lebs)
-		rsvd_idx_lebs = c->min_idx_lebs -  c->lst.idx_lebs;
+	if (c->bi.min_idx_lebs > c->lst.idx_lebs)
+		rsvd_idx_lebs = c->bi.min_idx_lebs -  c->lst.idx_lebs;
 	else
 		rsvd_idx_lebs = 0;
 	lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 151f108..ded29f6 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -100,6 +100,10 @@ static int switch_gc_head(struct ubifs_info *c)
 	if (err)
 		return err;
 
+	err = ubifs_wbuf_sync_nolock(wbuf);
+	if (err)
+		return err;
+
 	err = ubifs_add_bud_to_log(c, GCHD, gc_lnum, 0);
 	if (err)
 		return err;
@@ -118,7 +122,7 @@ static int switch_gc_head(struct ubifs_info *c)
  * This function compares data nodes @a and @b. Returns %1 if @a has greater
  * inode or block number, and %-1 otherwise.
  */
-int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
 {
 	ino_t inuma, inumb;
 	struct ubifs_info *c = priv;
@@ -161,7 +165,8 @@ int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
  * first and sorted by length in descending order. Directory entry nodes go
  * after inode nodes and are sorted in ascending hash valuer order.
  */
-int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int nondata_nodes_cmp(void *priv, struct list_head *a,
+			     struct list_head *b)
 {
 	ino_t inuma, inumb;
 	struct ubifs_info *c = priv;
@@ -473,6 +478,37 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)
 	ubifs_assert(c->gc_lnum != lnum);
 	ubifs_assert(wbuf->lnum != lnum);
 
+	if (lp->free + lp->dirty == c->leb_size) {
+		/* Special case - a free LEB  */
+		dbg_gc("LEB %d is free, return it", lp->lnum);
+		ubifs_assert(!(lp->flags & LPROPS_INDEX));
+
+		if (lp->free != c->leb_size) {
+			/*
+			 * Write buffers must be sync'd before unmapping
+			 * freeable LEBs, because one of them may contain data
+			 * which obsoletes something in 'lp->pnum'.
+			 */
+			err = gc_sync_wbufs(c);
+			if (err)
+				return err;
+			err = ubifs_change_one_lp(c, lp->lnum, c->leb_size,
+						  0, 0, 0, 0);
+			if (err)
+				return err;
+		}
+		err = ubifs_leb_unmap(c, lp->lnum);
+		if (err)
+			return err;
+
+		if (c->gc_lnum == -1) {
+			c->gc_lnum = lnum;
+			return LEB_RETAINED;
+		}
+
+		return LEB_FREED;
+	}
+
 	/*
 	 * We scan the entire LEB even though we only really need to scan up to
 	 * (c->leb_size - lp->free).
@@ -682,37 +718,6 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway)
 		       "(min. space %d)", lp.lnum, lp.free, lp.dirty,
 		       lp.free + lp.dirty, min_space);
 
-		if (lp.free + lp.dirty == c->leb_size) {
-			/* An empty LEB was returned */
-			dbg_gc("LEB %d is free, return it", lp.lnum);
-			/*
-			 * ubifs_find_dirty_leb() doesn't return freeable index
-			 * LEBs.
-			 */
-			ubifs_assert(!(lp.flags & LPROPS_INDEX));
-			if (lp.free != c->leb_size) {
-				/*
-				 * Write buffers must be sync'd before
-				 * unmapping freeable LEBs, because one of them
-				 * may contain data which obsoletes something
-				 * in 'lp.pnum'.
-				 */
-				ret = gc_sync_wbufs(c);
-				if (ret)
-					goto out;
-				ret = ubifs_change_one_lp(c, lp.lnum,
-							  c->leb_size, 0, 0, 0,
-							  0);
-				if (ret)
-					goto out;
-			}
-			ret = ubifs_leb_unmap(c, lp.lnum);
-			if (ret)
-				goto out;
-			ret = lp.lnum;
-			break;
-		}
-
 		space_before = c->leb_size - wbuf->offs - wbuf->used;
 		if (wbuf->lnum == -1)
 			space_before = 0;
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index dfd168b..166951e 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -393,7 +393,7 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
 	ubifs_assert(wbuf->size % c->min_io_size == 0);
 	ubifs_assert(!c->ro_media && !c->ro_mount);
 	if (c->leb_size - wbuf->offs >= c->max_write_size)
-		ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size ));
+		ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size));
 
 	if (c->ro_error)
 		return -EROFS;
@@ -452,8 +452,8 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
  * @dtype: data type
  *
  * This function targets the write-buffer to logical eraseblock @lnum:@offs.
- * The write-buffer is synchronized if it is not empty. Returns zero in case of
- * success and a negative error code in case of failure.
+ * The write-buffer has to be empty. Returns zero in case of success and a
+ * negative error code in case of failure.
  */
 int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
 			   int dtype)
@@ -465,13 +465,7 @@ int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
 	ubifs_assert(offs >= 0 && offs <= c->leb_size);
 	ubifs_assert(offs % c->min_io_size == 0 && !(offs & 7));
 	ubifs_assert(lnum != wbuf->lnum);
-
-	if (wbuf->used > 0) {
-		int err = ubifs_wbuf_sync_nolock(wbuf);
-
-		if (err)
-			return err;
-	}
+	ubifs_assert(wbuf->used == 0);
 
 	spin_lock(&wbuf->lock);
 	wbuf->lnum = lnum;
@@ -573,7 +567,7 @@ out_timers:
 int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
 {
 	struct ubifs_info *c = wbuf->c;
-	int err, written, n, aligned_len = ALIGN(len, 8), offs;
+	int err, written, n, aligned_len = ALIGN(len, 8);
 
 	dbg_io("%d bytes (%s) to jhead %s wbuf at LEB %d:%d", len,
 	       dbg_ntype(((struct ubifs_ch *)buf)->node_type),
@@ -588,7 +582,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
 	ubifs_assert(mutex_is_locked(&wbuf->io_mutex));
 	ubifs_assert(!c->ro_media && !c->ro_mount);
 	if (c->leb_size - wbuf->offs >= c->max_write_size)
-		ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size ));
+		ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size));
 
 	if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) {
 		err = -ENOSPC;
@@ -636,7 +630,6 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
 		goto exit;
 	}
 
-	offs = wbuf->offs;
 	written = 0;
 
 	if (wbuf->used) {
@@ -653,7 +646,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
 		if (err)
 			goto out;
 
-		offs += wbuf->size;
+		wbuf->offs += wbuf->size;
 		len -= wbuf->avail;
 		aligned_len -= wbuf->avail;
 		written += wbuf->avail;
@@ -672,7 +665,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
 		if (err)
 			goto out;
 
-		offs += wbuf->size;
+		wbuf->offs += wbuf->size;
 		len -= wbuf->size;
 		aligned_len -= wbuf->size;
 		written += wbuf->size;
@@ -687,12 +680,13 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
 	n = aligned_len >> c->max_write_shift;
 	if (n) {
 		n <<= c->max_write_shift;
-		dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, offs);
-		err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written, offs, n,
-				    wbuf->dtype);
+		dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum,
+		       wbuf->offs);
+		err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written,
+				    wbuf->offs, n, wbuf->dtype);
 		if (err)
 			goto out;
-		offs += n;
+		wbuf->offs += n;
 		aligned_len -= n;
 		len -= n;
 		written += n;
@@ -707,7 +701,6 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
 		 */
 		memcpy(wbuf->buf, buf + written, len);
 
-	wbuf->offs = offs;
 	if (c->leb_size - wbuf->offs >= c->max_write_size)
 		wbuf->size = c->max_write_size;
 	else
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index aed25e8..34b1679 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -141,14 +141,8 @@ again:
 	 * LEB with some empty space.
 	 */
 	lnum = ubifs_find_free_space(c, len, &offs, squeeze);
-	if (lnum >= 0) {
-		/* Found an LEB, add it to the journal head */
-		err = ubifs_add_bud_to_log(c, jhead, lnum, offs);
-		if (err)
-			goto out_return;
-		/* A new bud was successfully allocated and added to the log */
+	if (lnum >= 0)
 		goto out;
-	}
 
 	err = lnum;
 	if (err != -ENOSPC)
@@ -203,12 +197,23 @@ again:
 		return 0;
 	}
 
-	err = ubifs_add_bud_to_log(c, jhead, lnum, 0);
-	if (err)
-		goto out_return;
 	offs = 0;
 
 out:
+	/*
+	 * Make sure we synchronize the write-buffer before we add the new bud
+	 * to the log. Otherwise we may have a power cut after the log
+	 * reference node for the last bud (@lnum) is written but before the
+	 * write-buffer data are written to the next-to-last bud
+	 * (@wbuf->lnum). And the effect would be that the recovery would see
+	 * that there is corruption in the next-to-last bud.
+	 */
+	err = ubifs_wbuf_sync_nolock(wbuf);
+	if (err)
+		goto out_return;
+	err = ubifs_add_bud_to_log(c, jhead, lnum, offs);
+	if (err)
+		goto out_return;
 	err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, wbuf->dtype);
 	if (err)
 		goto out_unlock;
@@ -380,10 +385,8 @@ out:
 	if (err == -ENOSPC) {
 		/* This are some budgeting problems, print useful information */
 		down_write(&c->commit_sem);
-		spin_lock(&c->space_lock);
 		dbg_dump_stack();
-		dbg_dump_budg(c);
-		spin_unlock(&c->space_lock);
+		dbg_dump_budg(c, &c->bi);
 		dbg_dump_lprops(c);
 		cmt_retries = dbg_check_lprops(c);
 		up_write(&c->commit_sem);
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index 40fa780..affea94 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -100,20 +100,6 @@ struct ubifs_wbuf *ubifs_get_wbuf(struct ubifs_info *c, int lnum)
 }
 
 /**
- * next_log_lnum - switch to the next log LEB.
- * @c: UBIFS file-system description object
- * @lnum: current log LEB
- */
-static inline int next_log_lnum(const struct ubifs_info *c, int lnum)
-{
-	lnum += 1;
-	if (lnum > c->log_last)
-		lnum = UBIFS_LOG_LNUM;
-
-	return lnum;
-}
-
-/**
  * empty_log_bytes - calculate amount of empty space in the log.
  * @c: UBIFS file-system description object
  */
@@ -257,7 +243,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
 	ref->jhead = cpu_to_le32(jhead);
 
 	if (c->lhead_offs > c->leb_size - c->ref_node_alsz) {
-		c->lhead_lnum = next_log_lnum(c, c->lhead_lnum);
+		c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum);
 		c->lhead_offs = 0;
 	}
 
@@ -425,7 +411,7 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum)
 
 	/* Switch to the next log LEB */
 	if (c->lhead_offs) {
-		c->lhead_lnum = next_log_lnum(c, c->lhead_lnum);
+		c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum);
 		c->lhead_offs = 0;
 	}
 
@@ -446,7 +432,7 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum)
 
 	c->lhead_offs += len;
 	if (c->lhead_offs == c->leb_size) {
-		c->lhead_lnum = next_log_lnum(c, c->lhead_lnum);
+		c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum);
 		c->lhead_offs = 0;
 	}
 
@@ -533,7 +519,7 @@ int ubifs_log_post_commit(struct ubifs_info *c, int old_ltail_lnum)
 	}
 	mutex_lock(&c->log_mutex);
 	for (lnum = old_ltail_lnum; lnum != c->ltail_lnum;
-	     lnum = next_log_lnum(c, lnum)) {
+	     lnum = ubifs_next_log_lnum(c, lnum)) {
 		dbg_log("unmap log LEB %d", lnum);
 		err = ubifs_leb_unmap(c, lnum);
 		if (err)
@@ -642,7 +628,7 @@ static int add_node(struct ubifs_info *c, void *buf, int *lnum, int *offs,
 		err = ubifs_leb_change(c, *lnum, buf, sz, UBI_SHORTTERM);
 		if (err)
 			return err;
-		*lnum = next_log_lnum(c, *lnum);
+		*lnum = ubifs_next_log_lnum(c, *lnum);
 		*offs = 0;
 	}
 	memcpy(buf + *offs, node, len);
@@ -712,7 +698,7 @@ int ubifs_consolidate_log(struct ubifs_info *c)
 		ubifs_scan_destroy(sleb);
 		if (lnum == c->lhead_lnum)
 			break;
-		lnum = next_log_lnum(c, lnum);
+		lnum = ubifs_next_log_lnum(c, lnum);
 	}
 	if (offs) {
 		int sz = ALIGN(offs, c->min_io_size);
@@ -732,7 +718,7 @@ int ubifs_consolidate_log(struct ubifs_info *c)
 	/* Unmap remaining LEBs */
 	lnum = write_lnum;
 	do {
-		lnum = next_log_lnum(c, lnum);
+		lnum = ubifs_next_log_lnum(c, lnum);
 		err = ubifs_leb_unmap(c, lnum);
 		if (err)
 			return err;
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index 0ee0847..667884f 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -1007,21 +1007,11 @@ out:
 }
 
 /**
- * struct scan_check_data - data provided to scan callback function.
- * @lst: LEB properties statistics
- * @err: error code
- */
-struct scan_check_data {
-	struct ubifs_lp_stats lst;
-	int err;
-};
-
-/**
  * scan_check_cb - scan callback.
  * @c: the UBIFS file-system description object
  * @lp: LEB properties to scan
  * @in_tree: whether the LEB properties are in main memory
- * @data: information passed to and from the caller of the scan
+ * @lst: lprops statistics to update
  *
  * This function returns a code that indicates whether the scan should continue
  * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree
@@ -1030,11 +1020,10 @@ struct scan_check_data {
  */
 static int scan_check_cb(struct ubifs_info *c,
 			 const struct ubifs_lprops *lp, int in_tree,
-			 struct scan_check_data *data)
+			 struct ubifs_lp_stats *lst)
 {
 	struct ubifs_scan_leb *sleb;
 	struct ubifs_scan_node *snod;
-	struct ubifs_lp_stats *lst = &data->lst;
 	int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty, ret;
 	void *buf = NULL;
 
@@ -1044,7 +1033,7 @@ static int scan_check_cb(struct ubifs_info *c,
 		if (cat != (lp->flags & LPROPS_CAT_MASK)) {
 			ubifs_err("bad LEB category %d expected %d",
 				  (lp->flags & LPROPS_CAT_MASK), cat);
-			goto out;
+			return -EINVAL;
 		}
 	}
 
@@ -1078,7 +1067,7 @@ static int scan_check_cb(struct ubifs_info *c,
 			}
 			if (!found) {
 				ubifs_err("bad LPT list (category %d)", cat);
-				goto out;
+				return -EINVAL;
 			}
 		}
 	}
@@ -1090,45 +1079,40 @@ static int scan_check_cb(struct ubifs_info *c,
 		if ((lp->hpos != -1 && heap->arr[lp->hpos]->lnum != lnum) ||
 		    lp != heap->arr[lp->hpos]) {
 			ubifs_err("bad LPT heap (category %d)", cat);
-			goto out;
+			return -EINVAL;
 		}
 	}
 
 	buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
-	if (!buf) {
-		ubifs_err("cannot allocate memory to scan LEB %d", lnum);
-		goto out;
+	if (!buf)
+		return -ENOMEM;
+
+	/*
+	 * After an unclean unmount, empty and freeable LEBs
+	 * may contain garbage - do not scan them.
+	 */
+	if (lp->free == c->leb_size) {
+		lst->empty_lebs += 1;
+		lst->total_free += c->leb_size;
+		lst->total_dark += ubifs_calc_dark(c, c->leb_size);
+		return LPT_SCAN_CONTINUE;
+	}
+	if (lp->free + lp->dirty == c->leb_size &&
+	    !(lp->flags & LPROPS_INDEX)) {
+		lst->total_free  += lp->free;
+		lst->total_dirty += lp->dirty;
+		lst->total_dark  +=  ubifs_calc_dark(c, c->leb_size);
+		return LPT_SCAN_CONTINUE;
 	}
 
 	sleb = ubifs_scan(c, lnum, 0, buf, 0);
 	if (IS_ERR(sleb)) {
-		/*
-		 * After an unclean unmount, empty and freeable LEBs
-		 * may contain garbage.
-		 */
-		if (lp->free == c->leb_size) {
-			ubifs_err("scan errors were in empty LEB "
-				  "- continuing checking");
-			lst->empty_lebs += 1;
-			lst->total_free += c->leb_size;
-			lst->total_dark += ubifs_calc_dark(c, c->leb_size);
-			ret = LPT_SCAN_CONTINUE;
-			goto exit;
-		}
-
-		if (lp->free + lp->dirty == c->leb_size &&
-		    !(lp->flags & LPROPS_INDEX)) {
-			ubifs_err("scan errors were in freeable LEB "
-				  "- continuing checking");
-			lst->total_free  += lp->free;
-			lst->total_dirty += lp->dirty;
-			lst->total_dark  +=  ubifs_calc_dark(c, c->leb_size);
-			ret = LPT_SCAN_CONTINUE;
-			goto exit;
+		ret = PTR_ERR(sleb);
+		if (ret == -EUCLEAN) {
+			dbg_dump_lprops(c);
+			dbg_dump_budg(c, &c->bi);
 		}
-		data->err = PTR_ERR(sleb);
-		ret = LPT_SCAN_STOP;
-		goto exit;
+		goto out;
 	}
 
 	is_idx = -1;
@@ -1246,10 +1230,8 @@ static int scan_check_cb(struct ubifs_info *c,
 	}
 
 	ubifs_scan_destroy(sleb);
-	ret = LPT_SCAN_CONTINUE;
-exit:
 	vfree(buf);
-	return ret;
+	return LPT_SCAN_CONTINUE;
 
 out_print:
 	ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, "
@@ -1258,10 +1240,10 @@ out_print:
 	dbg_dump_leb(c, lnum);
 out_destroy:
 	ubifs_scan_destroy(sleb);
+	ret = -EINVAL;
 out:
 	vfree(buf);
-	data->err = -EINVAL;
-	return LPT_SCAN_STOP;
+	return ret;
 }
 
 /**
@@ -1278,8 +1260,7 @@ out:
 int dbg_check_lprops(struct ubifs_info *c)
 {
 	int i, err;
-	struct scan_check_data data;
-	struct ubifs_lp_stats *lst = &data.lst;
+	struct ubifs_lp_stats lst;
 
 	if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
 		return 0;
@@ -1294,29 +1275,23 @@ int dbg_check_lprops(struct ubifs_info *c)
 			return err;
 	}
 
-	memset(lst, 0, sizeof(struct ubifs_lp_stats));
-
-	data.err = 0;
+	memset(&lst, 0, sizeof(struct ubifs_lp_stats));
 	err = ubifs_lpt_scan_nolock(c, c->main_first, c->leb_cnt - 1,
 				    (ubifs_lpt_scan_callback)scan_check_cb,
-				    &data);
+				    &lst);
 	if (err && err != -ENOSPC)
 		goto out;
-	if (data.err) {
-		err = data.err;
-		goto out;
-	}
 
-	if (lst->empty_lebs != c->lst.empty_lebs ||
-	    lst->idx_lebs != c->lst.idx_lebs ||
-	    lst->total_free != c->lst.total_free ||
-	    lst->total_dirty != c->lst.total_dirty ||
-	    lst->total_used != c->lst.total_used) {
+	if (lst.empty_lebs != c->lst.empty_lebs ||
+	    lst.idx_lebs != c->lst.idx_lebs ||
+	    lst.total_free != c->lst.total_free ||
+	    lst.total_dirty != c->lst.total_dirty ||
+	    lst.total_used != c->lst.total_used) {
 		ubifs_err("bad overall accounting");
 		ubifs_err("calculated: empty_lebs %d, idx_lebs %d, "
 			  "total_free %lld, total_dirty %lld, total_used %lld",
-			  lst->empty_lebs, lst->idx_lebs, lst->total_free,
-			  lst->total_dirty, lst->total_used);
+			  lst.empty_lebs, lst.idx_lebs, lst.total_free,
+			  lst.total_dirty, lst.total_used);
 		ubifs_err("read from lprops: empty_lebs %d, idx_lebs %d, "
 			  "total_free %lld, total_dirty %lld, total_used %lld",
 			  c->lst.empty_lebs, c->lst.idx_lebs, c->lst.total_free,
@@ -1325,11 +1300,11 @@ int dbg_check_lprops(struct ubifs_info *c)
 		goto out;
 	}
 
-	if (lst->total_dead != c->lst.total_dead ||
-	    lst->total_dark != c->lst.total_dark) {
+	if (lst.total_dead != c->lst.total_dead ||
+	    lst.total_dark != c->lst.total_dark) {
 		ubifs_err("bad dead/dark space accounting");
 		ubifs_err("calculated: total_dead %lld, total_dark %lld",
-			  lst->total_dead, lst->total_dark);
+			  lst.total_dead, lst.total_dark);
 		ubifs_err("read from lprops: total_dead %lld, total_dark %lld",
 			  c->lst.total_dead, c->lst.total_dark);
 		err = -EINVAL;
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 0c9c69b..dfcb574 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -29,6 +29,12 @@
 #include <linux/slab.h>
 #include "ubifs.h"
 
+#ifdef CONFIG_UBIFS_FS_DEBUG
+static int dbg_populate_lsave(struct ubifs_info *c);
+#else
+#define dbg_populate_lsave(c) 0
+#endif
+
 /**
  * first_dirty_cnode - find first dirty cnode.
  * @c: UBIFS file-system description object
@@ -586,7 +592,7 @@ static struct ubifs_pnode *next_pnode_to_dirty(struct ubifs_info *c,
 			if (nnode->nbranch[iip].lnum)
 				break;
 		}
-       } while (iip >= UBIFS_LPT_FANOUT);
+	} while (iip >= UBIFS_LPT_FANOUT);
 
 	/* Go right */
 	nnode = ubifs_get_nnode(c, nnode, iip);
@@ -815,6 +821,10 @@ static void populate_lsave(struct ubifs_info *c)
 		c->lpt_drty_flgs |= LSAVE_DIRTY;
 		ubifs_add_lpt_dirt(c, c->lsave_lnum, c->lsave_sz);
 	}
+
+	if (dbg_populate_lsave(c))
+		return;
+
 	list_for_each_entry(lprops, &c->empty_list, list) {
 		c->lsave[cnt++] = lprops->lnum;
 		if (cnt >= c->lsave_cnt)
@@ -1994,4 +2004,47 @@ void dbg_dump_lpt_lebs(const struct ubifs_info *c)
 	       current->pid);
 }
 
+/**
+ * dbg_populate_lsave - debugging version of 'populate_lsave()'
+ * @c: UBIFS file-system description object
+ *
+ * This is a debugging version for 'populate_lsave()' which populates lsave
+ * with random LEBs instead of useful LEBs, which is good for test coverage.
+ * Returns zero if lsave has not been populated (this debugging feature is
+ * disabled) an non-zero if lsave has been populated.
+ */
+static int dbg_populate_lsave(struct ubifs_info *c)
+{
+	struct ubifs_lprops *lprops;
+	struct ubifs_lpt_heap *heap;
+	int i;
+
+	if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
+		return 0;
+	if (random32() & 3)
+		return 0;
+
+	for (i = 0; i < c->lsave_cnt; i++)
+		c->lsave[i] = c->main_first;
+
+	list_for_each_entry(lprops, &c->empty_list, list)
+		c->lsave[random32() % c->lsave_cnt] = lprops->lnum;
+	list_for_each_entry(lprops, &c->freeable_list, list)
+		c->lsave[random32() % c->lsave_cnt] = lprops->lnum;
+	list_for_each_entry(lprops, &c->frdi_idx_list, list)
+		c->lsave[random32() % c->lsave_cnt] = lprops->lnum;
+
+	heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1];
+	for (i = 0; i < heap->cnt; i++)
+		c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum;
+	heap = &c->lpt_heap[LPROPS_DIRTY - 1];
+	for (i = 0; i < heap->cnt; i++)
+		c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum;
+	heap = &c->lpt_heap[LPROPS_FREE - 1];
+	for (i = 0; i < heap->cnt; i++)
+		c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum;
+
+	return 1;
+}
+
 #endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
index 21f47af..278c238 100644
--- a/fs/ubifs/master.c
+++ b/fs/ubifs/master.c
@@ -148,7 +148,7 @@ static int validate_master(const struct ubifs_info *c)
 	}
 
 	main_sz = (long long)c->main_lebs * c->leb_size;
-	if (c->old_idx_sz & 7 || c->old_idx_sz >= main_sz) {
+	if (c->bi.old_idx_sz & 7 || c->bi.old_idx_sz >= main_sz) {
 		err = 9;
 		goto out;
 	}
@@ -218,7 +218,7 @@ static int validate_master(const struct ubifs_info *c)
 	}
 
 	if (c->lst.total_dead + c->lst.total_dark +
-	    c->lst.total_used + c->old_idx_sz > main_sz) {
+	    c->lst.total_used + c->bi.old_idx_sz > main_sz) {
 		err = 21;
 		goto out;
 	}
@@ -286,7 +286,7 @@ int ubifs_read_master(struct ubifs_info *c)
 	c->gc_lnum         = le32_to_cpu(c->mst_node->gc_lnum);
 	c->ihead_lnum      = le32_to_cpu(c->mst_node->ihead_lnum);
 	c->ihead_offs      = le32_to_cpu(c->mst_node->ihead_offs);
-	c->old_idx_sz      = le64_to_cpu(c->mst_node->index_size);
+	c->bi.old_idx_sz   = le64_to_cpu(c->mst_node->index_size);
 	c->lpt_lnum        = le32_to_cpu(c->mst_node->lpt_lnum);
 	c->lpt_offs        = le32_to_cpu(c->mst_node->lpt_offs);
 	c->nhead_lnum      = le32_to_cpu(c->mst_node->nhead_lnum);
@@ -305,7 +305,7 @@ int ubifs_read_master(struct ubifs_info *c)
 	c->lst.total_dead  = le64_to_cpu(c->mst_node->total_dead);
 	c->lst.total_dark  = le64_to_cpu(c->mst_node->total_dark);
 
-	c->calc_idx_sz = c->old_idx_sz;
+	c->calc_idx_sz = c->bi.old_idx_sz;
 
 	if (c->mst_node->flags & cpu_to_le32(UBIFS_MST_NO_ORPHS))
 		c->no_orphs = 1;
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h
index c3de04d..0b5296a 100644
--- a/fs/ubifs/misc.h
+++ b/fs/ubifs/misc.h
@@ -340,4 +340,21 @@ static inline void ubifs_release_lprops(struct ubifs_info *c)
 	mutex_unlock(&c->lp_mutex);
 }
 
+/**
+ * ubifs_next_log_lnum - switch to the next log LEB.
+ * @c: UBIFS file-system description object
+ * @lnum: current log LEB
+ *
+ * This helper function returns the log LEB number which goes next after LEB
+ * 'lnum'.
+ */
+static inline int ubifs_next_log_lnum(const struct ubifs_info *c, int lnum)
+{
+	lnum += 1;
+	if (lnum > c->log_last)
+		lnum = UBIFS_LOG_LNUM;
+
+	return lnum;
+}
+
 #endif /* __UBIFS_MISC_H__ */
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 09df318..bd644bf 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -673,7 +673,8 @@ static int kill_orphans(struct ubifs_info *c)
 		sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1);
 		if (IS_ERR(sleb)) {
 			if (PTR_ERR(sleb) == -EUCLEAN)
-				sleb = ubifs_recover_leb(c, lnum, 0, c->sbuf, 0);
+				sleb = ubifs_recover_leb(c, lnum, 0,
+							 c->sbuf, 0);
 			if (IS_ERR(sleb)) {
 				err = PTR_ERR(sleb);
 				break;
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 3dbad6f..731d9e2 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -564,13 +564,16 @@ static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
 }
 
 /**
- * drop_incomplete_group - drop nodes from an incomplete group.
+ * drop_last_node - drop the last node or group of nodes.
  * @sleb: scanned LEB information
  * @offs: offset of dropped nodes is returned here
+ * @grouped: non-zero if whole group of nodes have to be dropped
  *
- * This function returns %1 if nodes are dropped and %0 otherwise.
+ * This is a helper function for 'ubifs_recover_leb()' which drops the last
+ * node of the scanned LEB or the last group of nodes if @grouped is not zero.
+ * This function returns %1 if a node was dropped and %0 otherwise.
  */
-static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs)
+static int drop_last_node(struct ubifs_scan_leb *sleb, int *offs, int grouped)
 {
 	int dropped = 0;
 
@@ -589,6 +592,8 @@ static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs)
 		kfree(snod);
 		sleb->nodes_cnt -= 1;
 		dropped = 1;
+		if (!grouped)
+			break;
 	}
 	return dropped;
 }
@@ -609,8 +614,7 @@ static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs)
 struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
 					 int offs, void *sbuf, int grouped)
 {
-	int err, len = c->leb_size - offs, need_clean = 0, quiet = 1;
-	int empty_chkd = 0, start = offs;
+	int ret = 0, err, len = c->leb_size - offs, start = offs, min_io_unit;
 	struct ubifs_scan_leb *sleb;
 	void *buf = sbuf + offs;
 
@@ -620,12 +624,8 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
 	if (IS_ERR(sleb))
 		return sleb;
 
-	if (sleb->ecc)
-		need_clean = 1;
-
+	ubifs_assert(len >= 8);
 	while (len >= 8) {
-		int ret;
-
 		dbg_scan("look at LEB %d:%d (%d bytes left)",
 			 lnum, offs, len);
 
@@ -635,8 +635,7 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
 		 * Scan quietly until there is an error from which we cannot
 		 * recover
 		 */
-		ret = ubifs_scan_a_node(c, buf, len, lnum, offs, quiet);
-
+		ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 0);
 		if (ret == SCANNED_A_NODE) {
 			/* A valid node, and not a padding node */
 			struct ubifs_ch *ch = buf;
@@ -649,70 +648,32 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
 			offs += node_len;
 			buf += node_len;
 			len -= node_len;
-			continue;
-		}
-
-		if (ret > 0) {
+		} else if (ret > 0) {
 			/* Padding bytes or a valid padding node */
 			offs += ret;
 			buf += ret;
 			len -= ret;
-			continue;
-		}
-
-		if (ret == SCANNED_EMPTY_SPACE) {
-			if (!is_empty(buf, len)) {
-				if (!is_last_write(c, buf, offs))
-					break;
-				clean_buf(c, &buf, lnum, &offs, &len);
-				need_clean = 1;
-			}
-			empty_chkd = 1;
+		} else if (ret == SCANNED_EMPTY_SPACE ||
+			   ret == SCANNED_GARBAGE     ||
+			   ret == SCANNED_A_BAD_PAD_NODE ||
+			   ret == SCANNED_A_CORRUPT_NODE) {
+			dbg_rcvry("found corruption - %d", ret);
 			break;
-		}
-
-		if (ret == SCANNED_GARBAGE || ret == SCANNED_A_BAD_PAD_NODE)
-			if (is_last_write(c, buf, offs)) {
-				clean_buf(c, &buf, lnum, &offs, &len);
-				need_clean = 1;
-				empty_chkd = 1;
-				break;
-			}
-
-		if (ret == SCANNED_A_CORRUPT_NODE)
-			if (no_more_nodes(c, buf, len, lnum, offs)) {
-				clean_buf(c, &buf, lnum, &offs, &len);
-				need_clean = 1;
-				empty_chkd = 1;
-				break;
-			}
-
-		if (quiet) {
-			/* Redo the last scan but noisily */
-			quiet = 0;
-			continue;
-		}
-
-		switch (ret) {
-		case SCANNED_GARBAGE:
-			dbg_err("garbage");
-			goto corrupted;
-		case SCANNED_A_CORRUPT_NODE:
-		case SCANNED_A_BAD_PAD_NODE:
-			dbg_err("bad node");
-			goto corrupted;
-		default:
-			dbg_err("unknown");
+		} else {
+			dbg_err("unexpected return value %d", ret);
 			err = -EINVAL;
 			goto error;
 		}
 	}
 
-	if (!empty_chkd && !is_empty(buf, len)) {
-		if (is_last_write(c, buf, offs)) {
-			clean_buf(c, &buf, lnum, &offs, &len);
-			need_clean = 1;
-		} else {
+	if (ret == SCANNED_GARBAGE || ret == SCANNED_A_BAD_PAD_NODE) {
+		if (!is_last_write(c, buf, offs))
+			goto corrupted_rescan;
+	} else if (ret == SCANNED_A_CORRUPT_NODE) {
+		if (!no_more_nodes(c, buf, len, lnum, offs))
+			goto corrupted_rescan;
+	} else if (!is_empty(buf, len)) {
+		if (!is_last_write(c, buf, offs)) {
 			int corruption = first_non_ff(buf, len);
 
 			/*
@@ -728,29 +689,82 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
 		}
 	}
 
-	/* Drop nodes from incomplete group */
-	if (grouped && drop_incomplete_group(sleb, &offs)) {
-		buf = sbuf + offs;
-		len = c->leb_size - offs;
-		clean_buf(c, &buf, lnum, &offs, &len);
-		need_clean = 1;
-	}
+	min_io_unit = round_down(offs, c->min_io_size);
+	if (grouped)
+		/*
+		 * If nodes are grouped, always drop the incomplete group at
+		 * the end.
+		 */
+		drop_last_node(sleb, &offs, 1);
 
-	if (offs % c->min_io_size) {
-		clean_buf(c, &buf, lnum, &offs, &len);
-		need_clean = 1;
-	}
+	/*
+	 * While we are in the middle of the same min. I/O unit keep dropping
+	 * nodes. So basically, what we want is to make sure that the last min.
+	 * I/O unit where we saw the corruption is dropped completely with all
+	 * the uncorrupted node which may possibly sit there.
+	 *
+	 * In other words, let's name the min. I/O unit where the corruption
+	 * starts B, and the previous min. I/O unit A. The below code tries to
+	 * deal with a situation when half of B contains valid nodes or the end
+	 * of a valid node, and the second half of B contains corrupted data or
+	 * garbage. This means that UBIFS had been writing to B just before the
+	 * power cut happened. I do not know how realistic is this scenario
+	 * that half of the min. I/O unit had been written successfully and the
+	 * other half not, but this is possible in our 'failure mode emulation'
+	 * infrastructure at least.
+	 *
+	 * So what is the problem, why we need to drop those nodes? Whey can't
+	 * we just clean-up the second half of B by putting a padding node
+	 * there? We can, and this works fine with one exception which was
+	 * reproduced with power cut emulation testing and happens extremely
+	 * rarely. The description follows, but it is worth noting that that is
+	 * only about the GC head, so we could do this trick only if the bud
+	 * belongs to the GC head, but it does not seem to be worth an
+	 * additional "if" statement.
+	 *
+	 * So, imagine the file-system is full, we run GC which is moving valid
+	 * nodes from LEB X to LEB Y (obviously, LEB Y is the current GC head
+	 * LEB). The @c->gc_lnum is -1, which means that GC will retain LEB X
+	 * and will try to continue. Imagine that LEB X is currently the
+	 * dirtiest LEB, and the amount of used space in LEB Y is exactly the
+	 * same as amount of free space in LEB X.
+	 *
+	 * And a power cut happens when nodes are moved from LEB X to LEB Y. We
+	 * are here trying to recover LEB Y which is the GC head LEB. We find
+	 * the min. I/O unit B as described above. Then we clean-up LEB Y by
+	 * padding min. I/O unit. And later 'ubifs_rcvry_gc_commit()' function
+	 * fails, because it cannot find a dirty LEB which could be GC'd into
+	 * LEB Y! Even LEB X does not match because the amount of valid nodes
+	 * there does not fit the free space in LEB Y any more! And this is
+	 * because of the padding node which we added to LEB Y. The
+	 * user-visible effect of this which I once observed and analysed is
+	 * that we cannot mount the file-system with -ENOSPC error.
+	 *
+	 * So obviously, to make sure that situation does not happen we should
+	 * free min. I/O unit B in LEB Y completely and the last used min. I/O
+	 * unit in LEB Y should be A. This is basically what the below code
+	 * tries to do.
+	 */
+	while (min_io_unit == round_down(offs, c->min_io_size) &&
+	       min_io_unit != offs &&
+	       drop_last_node(sleb, &offs, grouped));
+
+	buf = sbuf + offs;
+	len = c->leb_size - offs;
 
+	clean_buf(c, &buf, lnum, &offs, &len);
 	ubifs_end_scan(c, sleb, lnum, offs);
 
-	if (need_clean) {
-		err = fix_unclean_leb(c, sleb, start);
-		if (err)
-			goto error;
-	}
+	err = fix_unclean_leb(c, sleb, start);
+	if (err)
+		goto error;
 
 	return sleb;
 
+corrupted_rescan:
+	/* Re-scan the corrupted data with verbose messages */
+	dbg_err("corruptio %d", ret);
+	ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
 corrupted:
 	ubifs_scanned_corruption(c, lnum, offs, buf);
 	err = -EUCLEAN;
@@ -1070,6 +1084,53 @@ int ubifs_clean_lebs(const struct ubifs_info *c, void *sbuf)
 }
 
 /**
+ * grab_empty_leb - grab an empty LEB to use as GC LEB and run commit.
+ * @c: UBIFS file-system description object
+ *
+ * This is a helper function for 'ubifs_rcvry_gc_commit()' which grabs an empty
+ * LEB to be used as GC LEB (@c->gc_lnum), and then runs the commit. Returns
+ * zero in case of success and a negative error code in case of failure.
+ */
+static int grab_empty_leb(struct ubifs_info *c)
+{
+	int lnum, err;
+
+	/*
+	 * Note, it is very important to first search for an empty LEB and then
+	 * run the commit, not vice-versa. The reason is that there might be
+	 * only one empty LEB at the moment, the one which has been the
+	 * @c->gc_lnum just before the power cut happened. During the regular
+	 * UBIFS operation (not now) @c->gc_lnum is marked as "taken", so no
+	 * one but GC can grab it. But at this moment this single empty LEB is
+	 * not marked as taken, so if we run commit - what happens? Right, the
+	 * commit will grab it and write the index there. Remember that the
+	 * index always expands as long as there is free space, and it only
+	 * starts consolidating when we run out of space.
+	 *
+	 * IOW, if we run commit now, we might not be able to find a free LEB
+	 * after this.
+	 */
+	lnum = ubifs_find_free_leb_for_idx(c);
+	if (lnum < 0) {
+		dbg_err("could not find an empty LEB");
+		dbg_dump_lprops(c);
+		dbg_dump_budg(c, &c->bi);
+		return lnum;
+	}
+
+	/* Reset the index flag */
+	err = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0,
+				  LPROPS_INDEX, 0);
+	if (err)
+		return err;
+
+	c->gc_lnum = lnum;
+	dbg_rcvry("found empty LEB %d, run commit", lnum);
+
+	return ubifs_run_commit(c);
+}
+
+/**
  * ubifs_rcvry_gc_commit - recover the GC LEB number and run the commit.
  * @c: UBIFS file-system description object
  *
@@ -1091,71 +1152,26 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c)
 {
 	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
 	struct ubifs_lprops lp;
-	int lnum, err;
+	int err;
+
+	dbg_rcvry("GC head LEB %d, offs %d", wbuf->lnum, wbuf->offs);
 
 	c->gc_lnum = -1;
-	if (wbuf->lnum == -1) {
-		dbg_rcvry("no GC head LEB");
-		goto find_free;
-	}
-	/*
-	 * See whether the used space in the dirtiest LEB fits in the GC head
-	 * LEB.
-	 */
-	if (wbuf->offs == c->leb_size) {
-		dbg_rcvry("no room in GC head LEB");
-		goto find_free;
-	}
+	if (wbuf->lnum == -1 || wbuf->offs == c->leb_size)
+		return grab_empty_leb(c);
+
 	err = ubifs_find_dirty_leb(c, &lp, wbuf->offs, 2);
 	if (err) {
-		/*
-		 * There are no dirty or empty LEBs subject to here being
-		 * enough for the index. Try to use
-		 * 'ubifs_find_free_leb_for_idx()', which will return any empty
-		 * LEBs (ignoring index requirements). If the index then
-		 * doesn't have enough LEBs the recovery commit will fail -
-		 * which is the  same result anyway i.e. recovery fails. So
-		 * there is no problem ignoring index  requirements and just
-		 * grabbing a free LEB since we have already established there
-		 * is not a dirty LEB we could have used instead.
-		 */
-		if (err == -ENOSPC) {
-			dbg_rcvry("could not find a dirty LEB");
-			goto find_free;
-		}
-		return err;
-	}
-	ubifs_assert(!(lp.flags & LPROPS_INDEX));
-	lnum = lp.lnum;
-	if (lp.free + lp.dirty == c->leb_size) {
-		/* An empty LEB was returned */
-		if (lp.free != c->leb_size) {
-			err = ubifs_change_one_lp(c, lnum, c->leb_size,
-						  0, 0, 0, 0);
-			if (err)
-				return err;
-		}
-		err = ubifs_leb_unmap(c, lnum);
-		if (err)
+		if (err != -ENOSPC)
 			return err;
-		c->gc_lnum = lnum;
-		dbg_rcvry("allocated LEB %d for GC", lnum);
-		/* Run the commit */
-		dbg_rcvry("committing");
-		return ubifs_run_commit(c);
-	}
-	/*
-	 * There was no empty LEB so the used space in the dirtiest LEB must fit
-	 * in the GC head LEB.
-	 */
-	if (lp.free + lp.dirty < wbuf->offs) {
-		dbg_rcvry("LEB %d doesn't fit in GC head LEB %d:%d",
-			  lnum, wbuf->lnum, wbuf->offs);
-		err = ubifs_return_leb(c, lnum);
-		if (err)
-			return err;
-		goto find_free;
+
+		dbg_rcvry("could not find a dirty LEB");
+		return grab_empty_leb(c);
 	}
+
+	ubifs_assert(!(lp.flags & LPROPS_INDEX));
+	ubifs_assert(lp.free + lp.dirty >= wbuf->offs);
+
 	/*
 	 * We run the commit before garbage collection otherwise subsequent
 	 * mounts will see the GC and orphan deletion in a different order.
@@ -1164,11 +1180,8 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c)
 	err = ubifs_run_commit(c);
 	if (err)
 		return err;
-	/*
-	 * The data in the dirtiest LEB fits in the GC head LEB, so do the GC
-	 * - use locking to keep 'ubifs_assert()' happy.
-	 */
-	dbg_rcvry("GC'ing LEB %d", lnum);
+
+	dbg_rcvry("GC'ing LEB %d", lp.lnum);
 	mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
 	err = ubifs_garbage_collect_leb(c, &lp);
 	if (err >= 0) {
@@ -1184,37 +1197,17 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c)
 			err = -EINVAL;
 		return err;
 	}
-	if (err != LEB_RETAINED) {
-		dbg_err("GC returned %d", err);
+
+	ubifs_assert(err == LEB_RETAINED);
+	if (err != LEB_RETAINED)
 		return -EINVAL;
-	}
+
 	err = ubifs_leb_unmap(c, c->gc_lnum);
 	if (err)
 		return err;
-	dbg_rcvry("allocated LEB %d for GC", lnum);
-	return 0;
 
-find_free:
-	/*
-	 * There is no GC head LEB or the free space in the GC head LEB is too
-	 * small, or there are not dirty LEBs. Allocate gc_lnum by calling
-	 * 'ubifs_find_free_leb_for_idx()' so GC is not run.
-	 */
-	lnum = ubifs_find_free_leb_for_idx(c);
-	if (lnum < 0) {
-		dbg_err("could not find an empty LEB");
-		return lnum;
-	}
-	/* And reset the index flag */
-	err = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0,
-				  LPROPS_INDEX, 0);
-	if (err)
-		return err;
-	c->gc_lnum = lnum;
-	dbg_rcvry("allocated LEB %d for GC", lnum);
-	/* Run the commit */
-	dbg_rcvry("committing");
-	return ubifs_run_commit(c);
+	dbg_rcvry("allocated LEB %d for GC", lp.lnum);
+	return 0;
 }
 
 /**
@@ -1456,7 +1449,7 @@ static int fix_size_in_place(struct ubifs_info *c, struct size_entry *e)
 	err = ubi_leb_change(c->ubi, lnum, c->sbuf, len, UBI_UNKNOWN);
 	if (err)
 		goto out;
-	dbg_rcvry("inode %lu at %d:%d size %lld -> %lld ",
+	dbg_rcvry("inode %lu at %d:%d size %lld -> %lld",
 		  (unsigned long)e->inum, lnum, offs, i_size, e->d_size);
 	return 0;
 
@@ -1505,20 +1498,27 @@ int ubifs_recover_size(struct ubifs_info *c)
 				e->i_size = le64_to_cpu(ino->size);
 			}
 		}
+
 		if (e->exists && e->i_size < e->d_size) {
-			if (!e->inode && c->ro_mount) {
+			if (c->ro_mount) {
 				/* Fix the inode size and pin it in memory */
 				struct inode *inode;
+				struct ubifs_inode *ui;
+
+				ubifs_assert(!e->inode);
 
 				inode = ubifs_iget(c->vfs_sb, e->inum);
 				if (IS_ERR(inode))
 					return PTR_ERR(inode);
+
+				ui = ubifs_inode(inode);
 				if (inode->i_size < e->d_size) {
 					dbg_rcvry("ino %lu size %lld -> %lld",
 						  (unsigned long)e->inum,
-						  e->d_size, inode->i_size);
+						  inode->i_size, e->d_size);
 					inode->i_size = e->d_size;
-					ubifs_inode(inode)->ui_size = e->d_size;
+					ui->ui_size = e->d_size;
+					ui->synced_i_size = e->d_size;
 					e->inode = inode;
 					this = rb_next(this);
 					continue;
@@ -1533,9 +1533,11 @@ int ubifs_recover_size(struct ubifs_info *c)
 					iput(e->inode);
 			}
 		}
+
 		this = rb_next(this);
 		rb_erase(&e->rb, &c->size_tree);
 		kfree(e);
 	}
+
 	return 0;
 }
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index d3d6d36..6617280 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -33,44 +33,32 @@
  */
 
 #include "ubifs.h"
-
-/*
- * Replay flags.
- *
- * REPLAY_DELETION: node was deleted
- * REPLAY_REF: node is a reference node
- */
-enum {
-	REPLAY_DELETION = 1,
-	REPLAY_REF = 2,
-};
+#include <linux/list_sort.h>
 
 /**
- * struct replay_entry - replay tree entry.
+ * struct replay_entry - replay list entry.
  * @lnum: logical eraseblock number of the node
  * @offs: node offset
  * @len: node length
+ * @deletion: non-zero if this entry corresponds to a node deletion
  * @sqnum: node sequence number
- * @flags: replay flags
- * @rb: links the replay tree
+ * @list: links the replay list
  * @key: node key
  * @nm: directory entry name
  * @old_size: truncation old size
  * @new_size: truncation new size
- * @free: amount of free space in a bud
- * @dirty: amount of dirty space in a bud from padding and deletion nodes
- * @jhead: journal head number of the bud
  *
- * UBIFS journal replay must compare node sequence numbers, which means it must
- * build a tree of node information to insert into the TNC.
+ * The replay process first scans all buds and builds the replay list, then
+ * sorts the replay list in nodes sequence number order, and then inserts all
+ * the replay entries to the TNC.
  */
 struct replay_entry {
 	int lnum;
 	int offs;
 	int len;
+	unsigned int deletion:1;
 	unsigned long long sqnum;
-	int flags;
-	struct rb_node rb;
+	struct list_head list;
 	union ubifs_key key;
 	union {
 		struct qstr nm;
@@ -78,11 +66,6 @@ struct replay_entry {
 			loff_t old_size;
 			loff_t new_size;
 		};
-		struct {
-			int free;
-			int dirty;
-			int jhead;
-		};
 	};
 };
 
@@ -90,57 +73,64 @@ struct replay_entry {
  * struct bud_entry - entry in the list of buds to replay.
  * @list: next bud in the list
  * @bud: bud description object
- * @free: free bytes in the bud
  * @sqnum: reference node sequence number
+ * @free: free bytes in the bud
+ * @dirty: dirty bytes in the bud
  */
 struct bud_entry {
 	struct list_head list;
 	struct ubifs_bud *bud;
-	int free;
 	unsigned long long sqnum;
+	int free;
+	int dirty;
 };
 
 /**
  * set_bud_lprops - set free and dirty space used by a bud.
  * @c: UBIFS file-system description object
- * @r: replay entry of bud
+ * @b: bud entry which describes the bud
+ *
+ * This function makes sure the LEB properties of bud @b are set correctly
+ * after the replay. Returns zero in case of success and a negative error code
+ * in case of failure.
  */
-static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r)
+static int set_bud_lprops(struct ubifs_info *c, struct bud_entry *b)
 {
 	const struct ubifs_lprops *lp;
 	int err = 0, dirty;
 
 	ubifs_get_lprops(c);
 
-	lp = ubifs_lpt_lookup_dirty(c, r->lnum);
+	lp = ubifs_lpt_lookup_dirty(c, b->bud->lnum);
 	if (IS_ERR(lp)) {
 		err = PTR_ERR(lp);
 		goto out;
 	}
 
 	dirty = lp->dirty;
-	if (r->offs == 0 && (lp->free != c->leb_size || lp->dirty != 0)) {
+	if (b->bud->start == 0 && (lp->free != c->leb_size || lp->dirty != 0)) {
 		/*
 		 * The LEB was added to the journal with a starting offset of
 		 * zero which means the LEB must have been empty. The LEB
-		 * property values should be lp->free == c->leb_size and
-		 * lp->dirty == 0, but that is not the case. The reason is that
-		 * the LEB was garbage collected. The garbage collector resets
-		 * the free and dirty space without recording it anywhere except
-		 * lprops, so if there is not a commit then lprops does not have
-		 * that information next time the file system is mounted.
+		 * property values should be @lp->free == @c->leb_size and
+		 * @lp->dirty == 0, but that is not the case. The reason is that
+		 * the LEB had been garbage collected before it became the bud,
+		 * and there was not commit inbetween. The garbage collector
+		 * resets the free and dirty space without recording it
+		 * anywhere except lprops, so if there was no commit then
+		 * lprops does not have that information.
 		 *
 		 * We do not need to adjust free space because the scan has told
 		 * us the exact value which is recorded in the replay entry as
-		 * r->free.
+		 * @b->free.
 		 *
 		 * However we do need to subtract from the dirty space the
 		 * amount of space that the garbage collector reclaimed, which
 		 * is the whole LEB minus the amount of space that was free.
 		 */
-		dbg_mnt("bud LEB %d was GC'd (%d free, %d dirty)", r->lnum,
+		dbg_mnt("bud LEB %d was GC'd (%d free, %d dirty)", b->bud->lnum,
 			lp->free, lp->dirty);
-		dbg_gc("bud LEB %d was GC'd (%d free, %d dirty)", r->lnum,
+		dbg_gc("bud LEB %d was GC'd (%d free, %d dirty)", b->bud->lnum,
 			lp->free, lp->dirty);
 		dirty -= c->leb_size - lp->free;
 		/*
@@ -152,10 +142,10 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r)
 		 */
 		if (dirty != 0)
 			dbg_msg("LEB %d lp: %d free %d dirty "
-				"replay: %d free %d dirty", r->lnum, lp->free,
-				lp->dirty, r->free, r->dirty);
+				"replay: %d free %d dirty", b->bud->lnum,
+				lp->free, lp->dirty, b->free, b->dirty);
 	}
-	lp = ubifs_change_lp(c, lp, r->free, dirty + r->dirty,
+	lp = ubifs_change_lp(c, lp, b->free, dirty + b->dirty,
 			     lp->flags | LPROPS_TAKEN, 0);
 	if (IS_ERR(lp)) {
 		err = PTR_ERR(lp);
@@ -163,8 +153,9 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r)
 	}
 
 	/* Make sure the journal head points to the latest bud */
-	err = ubifs_wbuf_seek_nolock(&c->jheads[r->jhead].wbuf, r->lnum,
-				     c->leb_size - r->free, UBI_SHORTTERM);
+	err = ubifs_wbuf_seek_nolock(&c->jheads[b->bud->jhead].wbuf,
+				     b->bud->lnum, c->leb_size - b->free,
+				     UBI_SHORTTERM);
 
 out:
 	ubifs_release_lprops(c);
@@ -172,6 +163,27 @@ out:
 }
 
 /**
+ * set_buds_lprops - set free and dirty space for all replayed buds.
+ * @c: UBIFS file-system description object
+ *
+ * This function sets LEB properties for all replayed buds. Returns zero in
+ * case of success and a negative error code in case of failure.
+ */
+static int set_buds_lprops(struct ubifs_info *c)
+{
+	struct bud_entry *b;
+	int err;
+
+	list_for_each_entry(b, &c->replay_buds, list) {
+		err = set_bud_lprops(c, b);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+/**
  * trun_remove_range - apply a replay entry for a truncation to the TNC.
  * @c: UBIFS file-system description object
  * @r: replay entry of truncation
@@ -207,24 +219,22 @@ static int trun_remove_range(struct ubifs_info *c, struct replay_entry *r)
  */
 static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r)
 {
-	int err, deletion = ((r->flags & REPLAY_DELETION) != 0);
+	int err;
 
-	dbg_mnt("LEB %d:%d len %d flgs %d sqnum %llu %s", r->lnum,
-		r->offs, r->len, r->flags, r->sqnum, DBGKEY(&r->key));
+	dbg_mnt("LEB %d:%d len %d deletion %d sqnum %llu %s", r->lnum,
+		r->offs, r->len, r->deletion, r->sqnum, DBGKEY(&r->key));
 
 	/* Set c->replay_sqnum to help deal with dangling branches. */
 	c->replay_sqnum = r->sqnum;
 
-	if (r->flags & REPLAY_REF)
-		err = set_bud_lprops(c, r);
-	else if (is_hash_key(c, &r->key)) {
-		if (deletion)
+	if (is_hash_key(c, &r->key)) {
+		if (r->deletion)
 			err = ubifs_tnc_remove_nm(c, &r->key, &r->nm);
 		else
 			err = ubifs_tnc_add_nm(c, &r->key, r->lnum, r->offs,
 					       r->len, &r->nm);
 	} else {
-		if (deletion)
+		if (r->deletion)
 			switch (key_type(c, &r->key)) {
 			case UBIFS_INO_KEY:
 			{
@@ -247,7 +257,7 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r)
 			return err;
 
 		if (c->need_recovery)
-			err = ubifs_recover_size_accum(c, &r->key, deletion,
+			err = ubifs_recover_size_accum(c, &r->key, r->deletion,
 						       r->new_size);
 	}
 
@@ -255,68 +265,77 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r)
 }
 
 /**
- * destroy_replay_tree - destroy the replay.
- * @c: UBIFS file-system description object
+ * replay_entries_cmp - compare 2 replay entries.
+ * @priv: UBIFS file-system description object
+ * @a: first replay entry
+ * @a: second replay entry
  *
- * Destroy the replay tree.
+ * This is a comparios function for 'list_sort()' which compares 2 replay
+ * entries @a and @b by comparing their sequence numer.  Returns %1 if @a has
+ * greater sequence number and %-1 otherwise.
  */
-static void destroy_replay_tree(struct ubifs_info *c)
+static int replay_entries_cmp(void *priv, struct list_head *a,
+			      struct list_head *b)
 {
-	struct rb_node *this = c->replay_tree.rb_node;
-	struct replay_entry *r;
-
-	while (this) {
-		if (this->rb_left) {
-			this = this->rb_left;
-			continue;
-		} else if (this->rb_right) {
-			this = this->rb_right;
-			continue;
-		}
-		r = rb_entry(this, struct replay_entry, rb);
-		this = rb_parent(this);
-		if (this) {
-			if (this->rb_left == &r->rb)
-				this->rb_left = NULL;
-			else
-				this->rb_right = NULL;
-		}
-		if (is_hash_key(c, &r->key))
-			kfree(r->nm.name);
-		kfree(r);
-	}
-	c->replay_tree = RB_ROOT;
+	struct replay_entry *ra, *rb;
+
+	cond_resched();
+	if (a == b)
+		return 0;
+
+	ra = list_entry(a, struct replay_entry, list);
+	rb = list_entry(b, struct replay_entry, list);
+	ubifs_assert(ra->sqnum != rb->sqnum);
+	if (ra->sqnum > rb->sqnum)
+		return 1;
+	return -1;
 }
 
 /**
- * apply_replay_tree - apply the replay tree to the TNC.
+ * apply_replay_list - apply the replay list to the TNC.
  * @c: UBIFS file-system description object
  *
- * Apply the replay tree.
- * Returns zero in case of success and a negative error code in case of
- * failure.
+ * Apply all entries in the replay list to the TNC. Returns zero in case of
+ * success and a negative error code in case of failure.
  */
-static int apply_replay_tree(struct ubifs_info *c)
+static int apply_replay_list(struct ubifs_info *c)
 {
-	struct rb_node *this = rb_first(&c->replay_tree);
+	struct replay_entry *r;
+	int err;
 
-	while (this) {
-		struct replay_entry *r;
-		int err;
+	list_sort(c, &c->replay_list, &replay_entries_cmp);
 
+	list_for_each_entry(r, &c->replay_list, list) {
 		cond_resched();
 
-		r = rb_entry(this, struct replay_entry, rb);
 		err = apply_replay_entry(c, r);
 		if (err)
 			return err;
-		this = rb_next(this);
 	}
+
 	return 0;
 }
 
 /**
- * insert_node - insert a node to the replay tree.
+ * destroy_replay_list - destroy the replay.
+ * @c: UBIFS file-system description object
+ *
+ * Destroy the replay list.
+ */
+static void destroy_replay_list(struct ubifs_info *c)
+{
+	struct replay_entry *r, *tmp;
+
+	list_for_each_entry_safe(r, tmp, &c->replay_list, list) {
+		if (is_hash_key(c, &r->key))
+			kfree(r->nm.name);
+		list_del(&r->list);
+		kfree(r);
+	}
+}
+
+/**
+ * insert_node - insert a node to the replay list
  * @c: UBIFS file-system description object
  * @lnum: node logical eraseblock number
  * @offs: node offset
@@ -328,39 +347,25 @@ static int apply_replay_tree(struct ubifs_info *c)
  * @old_size: truncation old size
  * @new_size: truncation new size
  *
- * This function inserts a scanned non-direntry node to the replay tree. The
- * replay tree is an RB-tree containing @struct replay_entry elements which are
- * indexed by the sequence number. The replay tree is applied at the very end
- * of the replay process. Since the tree is sorted in sequence number order,
- * the older modifications are applied first. This function returns zero in
- * case of success and a negative error code in case of failure.
+ * This function inserts a scanned non-direntry node to the replay list. The
+ * replay list contains @struct replay_entry elements, and we sort this list in
+ * sequence number order before applying it. The replay list is applied at the
+ * very end of the replay process. Since the list is sorted in sequence number
+ * order, the older modifications are applied first. This function returns zero
+ * in case of success and a negative error code in case of failure.
  */
 static int insert_node(struct ubifs_info *c, int lnum, int offs, int len,
 		       union ubifs_key *key, unsigned long long sqnum,
 		       int deletion, int *used, loff_t old_size,
 		       loff_t new_size)
 {
-	struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL;
 	struct replay_entry *r;
 
+	dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
+
 	if (key_inum(c, key) >= c->highest_inum)
 		c->highest_inum = key_inum(c, key);
 
-	dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
-	while (*p) {
-		parent = *p;
-		r = rb_entry(parent, struct replay_entry, rb);
-		if (sqnum < r->sqnum) {
-			p = &(*p)->rb_left;
-			continue;
-		} else if (sqnum > r->sqnum) {
-			p = &(*p)->rb_right;
-			continue;
-		}
-		ubifs_err("duplicate sqnum in replay");
-		return -EINVAL;
-	}
-
 	r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL);
 	if (!r)
 		return -ENOMEM;
@@ -370,19 +375,18 @@ static int insert_node(struct ubifs_info *c, int lnum, int offs, int len,
 	r->lnum = lnum;
 	r->offs = offs;
 	r->len = len;
+	r->deletion = !!deletion;
 	r->sqnum = sqnum;
-	r->flags = (deletion ? REPLAY_DELETION : 0);
+	key_copy(c, key, &r->key);
 	r->old_size = old_size;
 	r->new_size = new_size;
-	key_copy(c, key, &r->key);
 
-	rb_link_node(&r->rb, parent, p);
-	rb_insert_color(&r->rb, &c->replay_tree);
+	list_add_tail(&r->list, &c->replay_list);
 	return 0;
 }
 
 /**
- * insert_dent - insert a directory entry node into the replay tree.
+ * insert_dent - insert a directory entry node into the replay list.
  * @c: UBIFS file-system description object
  * @lnum: node logical eraseblock number
  * @offs: node offset
@@ -394,43 +398,25 @@ static int insert_node(struct ubifs_info *c, int lnum, int offs, int len,
  * @deletion: non-zero if this is a deletion
  * @used: number of bytes in use in a LEB
  *
- * This function inserts a scanned directory entry node to the replay tree.
- * Returns zero in case of success and a negative error code in case of
- * failure.
- *
- * This function is also used for extended attribute entries because they are
- * implemented as directory entry nodes.
+ * This function inserts a scanned directory entry node or an extended
+ * attribute entry to the replay list. Returns zero in case of success and a
+ * negative error code in case of failure.
  */
 static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len,
 		       union ubifs_key *key, const char *name, int nlen,
 		       unsigned long long sqnum, int deletion, int *used)
 {
-	struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL;
 	struct replay_entry *r;
 	char *nbuf;
 
+	dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
 	if (key_inum(c, key) >= c->highest_inum)
 		c->highest_inum = key_inum(c, key);
 
-	dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
-	while (*p) {
-		parent = *p;
-		r = rb_entry(parent, struct replay_entry, rb);
-		if (sqnum < r->sqnum) {
-			p = &(*p)->rb_left;
-			continue;
-		}
-		if (sqnum > r->sqnum) {
-			p = &(*p)->rb_right;
-			continue;
-		}
-		ubifs_err("duplicate sqnum in replay");
-		return -EINVAL;
-	}
-
 	r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL);
 	if (!r)
 		return -ENOMEM;
+
 	nbuf = kmalloc(nlen + 1, GFP_KERNEL);
 	if (!nbuf) {
 		kfree(r);
@@ -442,17 +428,15 @@ static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len,
 	r->lnum = lnum;
 	r->offs = offs;
 	r->len = len;
+	r->deletion = !!deletion;
 	r->sqnum = sqnum;
+	key_copy(c, key, &r->key);
 	r->nm.len = nlen;
 	memcpy(nbuf, name, nlen);
 	nbuf[nlen] = '\0';
 	r->nm.name = nbuf;
-	r->flags = (deletion ? REPLAY_DELETION : 0);
-	key_copy(c, key, &r->key);
 
-	ubifs_assert(!*p);
-	rb_link_node(&r->rb, parent, p);
-	rb_insert_color(&r->rb, &c->replay_tree);
+	list_add_tail(&r->list, &c->replay_list);
 	return 0;
 }
 
@@ -489,29 +473,92 @@ int ubifs_validate_entry(struct ubifs_info *c,
 }
 
 /**
+ * is_last_bud - check if the bud is the last in the journal head.
+ * @c: UBIFS file-system description object
+ * @bud: bud description object
+ *
+ * This function checks if bud @bud is the last bud in its journal head. This
+ * information is then used by 'replay_bud()' to decide whether the bud can
+ * have corruptions or not. Indeed, only last buds can be corrupted by power
+ * cuts. Returns %1 if this is the last bud, and %0 if not.
+ */
+static int is_last_bud(struct ubifs_info *c, struct ubifs_bud *bud)
+{
+	struct ubifs_jhead *jh = &c->jheads[bud->jhead];
+	struct ubifs_bud *next;
+	uint32_t data;
+	int err;
+
+	if (list_is_last(&bud->list, &jh->buds_list))
+		return 1;
+
+	/*
+	 * The following is a quirk to make sure we work correctly with UBIFS
+	 * images used with older UBIFS.
+	 *
+	 * Normally, the last bud will be the last in the journal head's list
+	 * of bud. However, there is one exception if the UBIFS image belongs
+	 * to older UBIFS. This is fairly unlikely: one would need to use old
+	 * UBIFS, then have a power cut exactly at the right point, and then
+	 * try to mount this image with new UBIFS.
+	 *
+	 * The exception is: it is possible to have 2 buds A and B, A goes
+	 * before B, and B is the last, bud B is contains no data, and bud A is
+	 * corrupted at the end. The reason is that in older versions when the
+	 * journal code switched the next bud (from A to B), it first added a
+	 * log reference node for the new bud (B), and only after this it
+	 * synchronized the write-buffer of current bud (A). But later this was
+	 * changed and UBIFS started to always synchronize the write-buffer of
+	 * the bud (A) before writing the log reference for the new bud (B).
+	 *
+	 * But because older UBIFS always synchronized A's write-buffer before
+	 * writing to B, we can recognize this exceptional situation but
+	 * checking the contents of bud B - if it is empty, then A can be
+	 * treated as the last and we can recover it.
+	 *
+	 * TODO: remove this piece of code in a couple of years (today it is
+	 * 16.05.2011).
+	 */
+	next = list_entry(bud->list.next, struct ubifs_bud, list);
+	if (!list_is_last(&next->list, &jh->buds_list))
+		return 0;
+
+	err = ubi_read(c->ubi, next->lnum, (char *)&data,
+		       next->start, 4);
+	if (err)
+		return 0;
+
+	return data == 0xFFFFFFFF;
+}
+
+/**
  * replay_bud - replay a bud logical eraseblock.
  * @c: UBIFS file-system description object
- * @lnum: bud logical eraseblock number to replay
- * @offs: bud start offset
- * @jhead: journal head to which this bud belongs
- * @free: amount of free space in the bud is returned here
- * @dirty: amount of dirty space from padding and deletion nodes is returned
- * here
+ * @b: bud entry which describes the bud
  *
- * This function returns zero in case of success and a negative error code in
- * case of failure.
+ * This function replays bud @bud, recovers it if needed, and adds all nodes
+ * from this bud to the replay list. Returns zero in case of success and a
+ * negative error code in case of failure.
  */
-static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead,
-		      int *free, int *dirty)
+static int replay_bud(struct ubifs_info *c, struct bud_entry *b)
 {
-	int err = 0, used = 0;
+	int is_last = is_last_bud(c, b->bud);
+	int err = 0, used = 0, lnum = b->bud->lnum, offs = b->bud->start;
 	struct ubifs_scan_leb *sleb;
 	struct ubifs_scan_node *snod;
-	struct ubifs_bud *bud;
 
-	dbg_mnt("replay bud LEB %d, head %d", lnum, jhead);
-	if (c->need_recovery)
-		sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, jhead != GCHD);
+	dbg_mnt("replay bud LEB %d, head %d, offs %d, is_last %d",
+		lnum, b->bud->jhead, offs, is_last);
+
+	if (c->need_recovery && is_last)
+		/*
+		 * Recover only last LEBs in the journal heads, because power
+		 * cuts may cause corruptions only in these LEBs, because only
+		 * these LEBs could possibly be written to at the power cut
+		 * time.
+		 */
+		sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf,
+					 b->bud->jhead != GCHD);
 	else
 		sleb = ubifs_scan(c, lnum, offs, c->sbuf, 0);
 	if (IS_ERR(sleb))
@@ -627,15 +674,13 @@ static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead,
 			goto out;
 	}
 
-	bud = ubifs_search_bud(c, lnum);
-	if (!bud)
-		BUG();
-
+	ubifs_assert(ubifs_search_bud(c, lnum));
 	ubifs_assert(sleb->endpt - offs >= used);
 	ubifs_assert(sleb->endpt % c->min_io_size == 0);
 
-	*dirty = sleb->endpt - offs - used;
-	*free = c->leb_size - sleb->endpt;
+	b->dirty = sleb->endpt - offs - used;
+	b->free = c->leb_size - sleb->endpt;
+	dbg_mnt("bud LEB %d replied: dirty %d, free %d", lnum, b->dirty, b->free);
 
 out:
 	ubifs_scan_destroy(sleb);
@@ -649,58 +694,6 @@ out_dump:
 }
 
 /**
- * insert_ref_node - insert a reference node to the replay tree.
- * @c: UBIFS file-system description object
- * @lnum: node logical eraseblock number
- * @offs: node offset
- * @sqnum: sequence number
- * @free: amount of free space in bud
- * @dirty: amount of dirty space from padding and deletion nodes
- * @jhead: journal head number for the bud
- *
- * This function inserts a reference node to the replay tree and returns zero
- * in case of success or a negative error code in case of failure.
- */
-static int insert_ref_node(struct ubifs_info *c, int lnum, int offs,
-			   unsigned long long sqnum, int free, int dirty,
-			   int jhead)
-{
-	struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL;
-	struct replay_entry *r;
-
-	dbg_mnt("add ref LEB %d:%d", lnum, offs);
-	while (*p) {
-		parent = *p;
-		r = rb_entry(parent, struct replay_entry, rb);
-		if (sqnum < r->sqnum) {
-			p = &(*p)->rb_left;
-			continue;
-		} else if (sqnum > r->sqnum) {
-			p = &(*p)->rb_right;
-			continue;
-		}
-		ubifs_err("duplicate sqnum in replay tree");
-		return -EINVAL;
-	}
-
-	r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL);
-	if (!r)
-		return -ENOMEM;
-
-	r->lnum = lnum;
-	r->offs = offs;
-	r->sqnum = sqnum;
-	r->flags = REPLAY_REF;
-	r->free = free;
-	r->dirty = dirty;
-	r->jhead = jhead;
-
-	rb_link_node(&r->rb, parent, p);
-	rb_insert_color(&r->rb, &c->replay_tree);
-	return 0;
-}
-
-/**
  * replay_buds - replay all buds.
  * @c: UBIFS file-system description object
  *
@@ -710,17 +703,16 @@ static int insert_ref_node(struct ubifs_info *c, int lnum, int offs,
 static int replay_buds(struct ubifs_info *c)
 {
 	struct bud_entry *b;
-	int err, uninitialized_var(free), uninitialized_var(dirty);
+	int err;
+	unsigned long long prev_sqnum = 0;
 
 	list_for_each_entry(b, &c->replay_buds, list) {
-		err = replay_bud(c, b->bud->lnum, b->bud->start, b->bud->jhead,
-				 &free, &dirty);
-		if (err)
-			return err;
-		err = insert_ref_node(c, b->bud->lnum, b->bud->start, b->sqnum,
-				      free, dirty, b->bud->jhead);
+		err = replay_bud(c, b);
 		if (err)
 			return err;
+
+		ubifs_assert(b->sqnum > prev_sqnum);
+		prev_sqnum = b->sqnum;
 	}
 
 	return 0;
@@ -1060,25 +1052,29 @@ int ubifs_replay_journal(struct ubifs_info *c)
 	if (err)
 		goto out;
 
-	err = apply_replay_tree(c);
+	err = apply_replay_list(c);
+	if (err)
+		goto out;
+
+	err = set_buds_lprops(c);
 	if (err)
 		goto out;
 
 	/*
-	 * UBIFS budgeting calculations use @c->budg_uncommitted_idx variable
-	 * to roughly estimate index growth. Things like @c->min_idx_lebs
+	 * UBIFS budgeting calculations use @c->bi.uncommitted_idx variable
+	 * to roughly estimate index growth. Things like @c->bi.min_idx_lebs
 	 * depend on it. This means we have to initialize it to make sure
 	 * budgeting works properly.
 	 */
-	c->budg_uncommitted_idx = atomic_long_read(&c->dirty_zn_cnt);
-	c->budg_uncommitted_idx *= c->max_idx_node_sz;
+	c->bi.uncommitted_idx = atomic_long_read(&c->dirty_zn_cnt);
+	c->bi.uncommitted_idx *= c->max_idx_node_sz;
 
 	ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery);
 	dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, "
 		"highest_inum %lu", c->lhead_lnum, c->lhead_offs, c->max_sqnum,
 		(unsigned long)c->highest_inum);
 out:
-	destroy_replay_tree(c);
+	destroy_replay_list(c);
 	destroy_bud_list(c);
 	c->replaying = 0;
 	return err;
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index bf31b47..c606f01 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -475,7 +475,8 @@ failed:
  * @c: UBIFS file-system description object
  *
  * This function returns a pointer to the superblock node or a negative error
- * code.
+ * code. Note, the user of this function is responsible of kfree()'ing the
+ * returned superblock buffer.
  */
 struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c)
 {
@@ -616,6 +617,7 @@ int ubifs_read_superblock(struct ubifs_info *c)
 	c->vfs_sb->s_time_gran = le32_to_cpu(sup->time_gran);
 	memcpy(&c->uuid, &sup->uuid, 16);
 	c->big_lpt = !!(sup_flags & UBIFS_FLG_BIGLPT);
+	c->space_fixup = !!(sup_flags & UBIFS_FLG_SPACE_FIXUP);
 
 	/* Automatically increase file system size to the maximum size */
 	c->old_leb_cnt = c->leb_cnt;
@@ -650,3 +652,152 @@ out:
 	kfree(sup);
 	return err;
 }
+
+/**
+ * fixup_leb - fixup/unmap an LEB containing free space.
+ * @c: UBIFS file-system description object
+ * @lnum: the LEB number to fix up
+ * @len: number of used bytes in LEB (starting at offset 0)
+ *
+ * This function reads the contents of the given LEB number @lnum, then fixes
+ * it up, so that empty min. I/O units in the end of LEB are actually erased on
+ * flash (rather than being just all-0xff real data). If the LEB is completely
+ * empty, it is simply unmapped.
+ */
+static int fixup_leb(struct ubifs_info *c, int lnum, int len)
+{
+	int err;
+
+	ubifs_assert(len >= 0);
+	ubifs_assert(len % c->min_io_size == 0);
+	ubifs_assert(len < c->leb_size);
+
+	if (len == 0) {
+		dbg_mnt("unmap empty LEB %d", lnum);
+		return ubi_leb_unmap(c->ubi, lnum);
+	}
+
+	dbg_mnt("fixup LEB %d, data len %d", lnum, len);
+	err = ubi_read(c->ubi, lnum, c->sbuf, 0, len);
+	if (err)
+		return err;
+
+	return ubi_leb_change(c->ubi, lnum, c->sbuf, len, UBI_UNKNOWN);
+}
+
+/**
+ * fixup_free_space - find & remap all LEBs containing free space.
+ * @c: UBIFS file-system description object
+ *
+ * This function walks through all LEBs in the filesystem and fiexes up those
+ * containing free/empty space.
+ */
+static int fixup_free_space(struct ubifs_info *c)
+{
+	int lnum, err = 0;
+	struct ubifs_lprops *lprops;
+
+	ubifs_get_lprops(c);
+
+	/* Fixup LEBs in the master area */
+	for (lnum = UBIFS_MST_LNUM; lnum < UBIFS_LOG_LNUM; lnum++) {
+		err = fixup_leb(c, lnum, c->mst_offs + c->mst_node_alsz);
+		if (err)
+			goto out;
+	}
+
+	/* Unmap unused log LEBs */
+	lnum = ubifs_next_log_lnum(c, c->lhead_lnum);
+	while (lnum != c->ltail_lnum) {
+		err = fixup_leb(c, lnum, 0);
+		if (err)
+			goto out;
+		lnum = ubifs_next_log_lnum(c, lnum);
+	}
+
+	/* Fixup the current log head */
+	err = fixup_leb(c, c->lhead_lnum, c->lhead_offs);
+	if (err)
+		goto out;
+
+	/* Fixup LEBs in the LPT area */
+	for (lnum = c->lpt_first; lnum <= c->lpt_last; lnum++) {
+		int free = c->ltab[lnum - c->lpt_first].free;
+
+		if (free > 0) {
+			err = fixup_leb(c, lnum, c->leb_size - free);
+			if (err)
+				goto out;
+		}
+	}
+
+	/* Unmap LEBs in the orphans area */
+	for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) {
+		err = fixup_leb(c, lnum, 0);
+		if (err)
+			goto out;
+	}
+
+	/* Fixup LEBs in the main area */
+	for (lnum = c->main_first; lnum < c->leb_cnt; lnum++) {
+		lprops = ubifs_lpt_lookup(c, lnum);
+		if (IS_ERR(lprops)) {
+			err = PTR_ERR(lprops);
+			goto out;
+		}
+
+		if (lprops->free > 0) {
+			err = fixup_leb(c, lnum, c->leb_size - lprops->free);
+			if (err)
+				goto out;
+		}
+	}
+
+out:
+	ubifs_release_lprops(c);
+	return err;
+}
+
+/**
+ * ubifs_fixup_free_space - find & fix all LEBs with free space.
+ * @c: UBIFS file-system description object
+ *
+ * This function fixes up LEBs containing free space on first mount, if the
+ * appropriate flag was set when the FS was created. Each LEB with one or more
+ * empty min. I/O unit (i.e. free-space-count > 0) is re-written, to make sure
+ * the free space is actually erased. E.g., this is necessary for some NAND
+ * chips, since the free space may have been programmed like real "0xff" data
+ * (generating a non-0xff ECC), causing future writes to the not-really-erased
+ * NAND pages to behave badly. After the space is fixed up, the superblock flag
+ * is cleared, so that this is skipped for all future mounts.
+ */
+int ubifs_fixup_free_space(struct ubifs_info *c)
+{
+	int err;
+	struct ubifs_sb_node *sup;
+
+	ubifs_assert(c->space_fixup);
+	ubifs_assert(!c->ro_mount);
+
+	ubifs_msg("start fixing up free space");
+
+	err = fixup_free_space(c);
+	if (err)
+		return err;
+
+	sup = ubifs_read_sb_node(c);
+	if (IS_ERR(sup))
+		return PTR_ERR(sup);
+
+	/* Free-space fixup is no longer required */
+	c->space_fixup = 0;
+	sup->flags &= cpu_to_le32(~UBIFS_FLG_SPACE_FIXUP);
+
+	err = ubifs_write_sb_node(c, sup);
+	kfree(sup);
+	if (err)
+		return err;
+
+	ubifs_msg("free space fixup complete");
+	return err;
+}
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 04ad07f..6db0bdaa 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -375,7 +375,7 @@ out:
 		ubifs_release_dirty_inode_budget(c, ui);
 	else {
 		/* We've deleted something - clean the "no space" flags */
-		c->nospace = c->nospace_rp = 0;
+		c->bi.nospace = c->bi.nospace_rp = 0;
 		smp_wmb();
 	}
 done:
@@ -694,11 +694,11 @@ static int init_constants_sb(struct ubifs_info *c)
 	 * be compressed and direntries are of the maximum size.
 	 *
 	 * Note, data, which may be stored in inodes is budgeted separately, so
-	 * it is not included into 'c->inode_budget'.
+	 * it is not included into 'c->bi.inode_budget'.
 	 */
-	c->page_budget = UBIFS_MAX_DATA_NODE_SZ * UBIFS_BLOCKS_PER_PAGE;
-	c->inode_budget = UBIFS_INO_NODE_SZ;
-	c->dent_budget = UBIFS_MAX_DENT_NODE_SZ;
+	c->bi.page_budget = UBIFS_MAX_DATA_NODE_SZ * UBIFS_BLOCKS_PER_PAGE;
+	c->bi.inode_budget = UBIFS_INO_NODE_SZ;
+	c->bi.dent_budget = UBIFS_MAX_DENT_NODE_SZ;
 
 	/*
 	 * When the amount of flash space used by buds becomes
@@ -742,7 +742,7 @@ static void init_constants_master(struct ubifs_info *c)
 {
 	long long tmp64;
 
-	c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+	c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c);
 	c->report_rp_size = ubifs_reported_space(c, c->rp_size);
 
 	/*
@@ -1144,8 +1144,8 @@ static int check_free_space(struct ubifs_info *c)
 {
 	ubifs_assert(c->dark_wm > 0);
 	if (c->lst.total_free + c->lst.total_dirty < c->dark_wm) {
-		ubifs_err("insufficient free space to mount in read/write mode");
-		dbg_dump_budg(c);
+		ubifs_err("insufficient free space to mount in R/W mode");
+		dbg_dump_budg(c, &c->bi);
 		dbg_dump_lprops(c);
 		return -ENOSPC;
 	}
@@ -1304,7 +1304,7 @@ static int mount_ubifs(struct ubifs_info *c)
 	if (err)
 		goto out_lpt;
 
-	err = dbg_check_idx_size(c, c->old_idx_sz);
+	err = dbg_check_idx_size(c, c->bi.old_idx_sz);
 	if (err)
 		goto out_lpt;
 
@@ -1313,7 +1313,7 @@ static int mount_ubifs(struct ubifs_info *c)
 		goto out_journal;
 
 	/* Calculate 'min_idx_lebs' after journal replay */
-	c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+	c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c);
 
 	err = ubifs_mount_orphans(c, c->need_recovery, c->ro_mount);
 	if (err)
@@ -1396,6 +1396,12 @@ static int mount_ubifs(struct ubifs_info *c)
 	} else
 		ubifs_assert(c->lst.taken_empty_lebs > 0);
 
+	if (!c->ro_mount && c->space_fixup) {
+		err = ubifs_fixup_free_space(c);
+		if (err)
+			goto out_infos;
+	}
+
 	err = dbg_check_filesystem(c);
 	if (err)
 		goto out_infos;
@@ -1442,7 +1448,8 @@ static int mount_ubifs(struct ubifs_info *c)
 		c->main_lebs, c->main_first, c->leb_cnt - 1);
 	dbg_msg("index LEBs:          %d", c->lst.idx_lebs);
 	dbg_msg("total index bytes:   %lld (%lld KiB, %lld MiB)",
-		c->old_idx_sz, c->old_idx_sz >> 10, c->old_idx_sz >> 20);
+		c->bi.old_idx_sz, c->bi.old_idx_sz >> 10,
+		c->bi.old_idx_sz >> 20);
 	dbg_msg("key hash type:       %d", c->key_hash_type);
 	dbg_msg("tree fanout:         %d", c->fanout);
 	dbg_msg("reserved GC LEB:     %d", c->gc_lnum);
@@ -1456,7 +1463,7 @@ static int mount_ubifs(struct ubifs_info *c)
 	dbg_msg("node sizes:          ref %zu, cmt. start %zu, orph %zu",
 		UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ);
 	dbg_msg("max. node sizes:     data %zu, inode %zu dentry %zu, idx %d",
-	        UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ,
+		UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ,
 		UBIFS_MAX_DENT_NODE_SZ, ubifs_idx_node_sz(c, c->fanout));
 	dbg_msg("dead watermark:      %d", c->dead_wm);
 	dbg_msg("dark watermark:      %d", c->dark_wm);
@@ -1584,6 +1591,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
 		}
 		sup->leb_cnt = cpu_to_le32(c->leb_cnt);
 		err = ubifs_write_sb_node(c, sup);
+		kfree(sup);
 		if (err)
 			goto out;
 	}
@@ -1684,6 +1692,13 @@ static int ubifs_remount_rw(struct ubifs_info *c)
 		 */
 		err = dbg_check_space_info(c);
 	}
+
+	if (c->space_fixup) {
+		err = ubifs_fixup_free_space(c);
+		if (err)
+			goto out;
+	}
+
 	mutex_unlock(&c->umount_mutex);
 	return err;
 
@@ -1766,10 +1781,9 @@ static void ubifs_put_super(struct super_block *sb)
 	 * to write them back because of I/O errors.
 	 */
 	if (!c->ro_error) {
-		ubifs_assert(atomic_long_read(&c->dirty_pg_cnt) == 0);
-		ubifs_assert(c->budg_idx_growth == 0);
-		ubifs_assert(c->budg_dd_growth == 0);
-		ubifs_assert(c->budg_data_growth == 0);
+		ubifs_assert(c->bi.idx_growth == 0);
+		ubifs_assert(c->bi.dd_growth == 0);
+		ubifs_assert(c->bi.data_growth == 0);
 	}
 
 	/*
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index de48597..8119b1f 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -2557,11 +2557,11 @@ int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key,
 		if (err) {
 			/* Ensure the znode is dirtied */
 			if (znode->cnext || !ubifs_zn_dirty(znode)) {
-				    znode = dirty_cow_bottom_up(c, znode);
-				    if (IS_ERR(znode)) {
-					    err = PTR_ERR(znode);
-					    goto out_unlock;
-				    }
+				znode = dirty_cow_bottom_up(c, znode);
+				if (IS_ERR(znode)) {
+					err = PTR_ERR(znode);
+					goto out_unlock;
+				}
 			}
 			err = tnc_delete(c, znode, n);
 		}
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index 53288e5..41920f3 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -377,15 +377,13 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt)
 				c->gap_lebs = NULL;
 				return err;
 			}
-			if (!dbg_force_in_the_gaps_enabled) {
+			if (dbg_force_in_the_gaps_enabled()) {
 				/*
 				 * Do not print scary warnings if the debugging
 				 * option which forces in-the-gaps is enabled.
 				 */
-				ubifs_err("out of space");
-				spin_lock(&c->space_lock);
-				dbg_dump_budg(c);
-				spin_unlock(&c->space_lock);
+				ubifs_warn("out of space");
+				dbg_dump_budg(c, &c->bi);
 				dbg_dump_lprops(c);
 			}
 			/* Try to commit anyway */
@@ -796,16 +794,16 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot)
 	spin_lock(&c->space_lock);
 	/*
 	 * Although we have not finished committing yet, update size of the
-	 * committed index ('c->old_idx_sz') and zero out the index growth
+	 * committed index ('c->bi.old_idx_sz') and zero out the index growth
 	 * budget. It is OK to do this now, because we've reserved all the
 	 * space which is needed to commit the index, and it is save for the
 	 * budgeting subsystem to assume the index is already committed,
 	 * even though it is not.
 	 */
-	ubifs_assert(c->min_idx_lebs == ubifs_calc_min_idx_lebs(c));
-	c->old_idx_sz = c->calc_idx_sz;
-	c->budg_uncommitted_idx = 0;
-	c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+	ubifs_assert(c->bi.min_idx_lebs == ubifs_calc_min_idx_lebs(c));
+	c->bi.old_idx_sz = c->calc_idx_sz;
+	c->bi.uncommitted_idx = 0;
+	c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c);
 	spin_unlock(&c->space_lock);
 	mutex_unlock(&c->tnc_mutex);
 
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index 191ca78..e24380c 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -408,9 +408,11 @@ enum {
  * Superblock flags.
  *
  * UBIFS_FLG_BIGLPT: if "big" LPT model is used if set
+ * UBIFS_FLG_SPACE_FIXUP: first-mount "fixup" of free space within LEBs needed
  */
 enum {
 	UBIFS_FLG_BIGLPT = 0x02,
+	UBIFS_FLG_SPACE_FIXUP = 0x04,
 };
 
 /**
@@ -434,7 +436,7 @@ struct ubifs_ch {
 	__u8 node_type;
 	__u8 group_type;
 	__u8 padding[2];
-} __attribute__ ((packed));
+} __packed;
 
 /**
  * union ubifs_dev_desc - device node descriptor.
@@ -448,7 +450,7 @@ struct ubifs_ch {
 union ubifs_dev_desc {
 	__le32 new;
 	__le64 huge;
-} __attribute__ ((packed));
+} __packed;
 
 /**
  * struct ubifs_ino_node - inode node.
@@ -509,7 +511,7 @@ struct ubifs_ino_node {
 	__le16 compr_type;
 	__u8 padding2[26]; /* Watch 'zero_ino_node_unused()' if changing! */
 	__u8 data[];
-} __attribute__ ((packed));
+} __packed;
 
 /**
  * struct ubifs_dent_node - directory entry node.
@@ -534,7 +536,7 @@ struct ubifs_dent_node {
 	__le16 nlen;
 	__u8 padding2[4]; /* Watch 'zero_dent_node_unused()' if changing! */
 	__u8 name[];
-} __attribute__ ((packed));
+} __packed;
 
 /**
  * struct ubifs_data_node - data node.
@@ -555,7 +557,7 @@ struct ubifs_data_node {
 	__le16 compr_type;
 	__u8 padding[2]; /* Watch 'zero_data_node_unused()' if changing! */
 	__u8 data[];
-} __attribute__ ((packed));
+} __packed;
 
 /**
  * struct ubifs_trun_node - truncation node.
@@ -575,7 +577,7 @@ struct ubifs_trun_node {
 	__u8 padding[12]; /* Watch 'zero_trun_node_unused()' if changing! */
 	__le64 old_size;
 	__le64 new_size;
-} __attribute__ ((packed));
+} __packed;
 
 /**
  * struct ubifs_pad_node - padding node.
@@ -586,7 +588,7 @@ struct ubifs_trun_node {
 struct ubifs_pad_node {
 	struct ubifs_ch ch;
 	__le32 pad_len;
-} __attribute__ ((packed));
+} __packed;
 
 /**
  * struct ubifs_sb_node - superblock node.
@@ -644,7 +646,7 @@ struct ubifs_sb_node {
 	__u8 uuid[16];
 	__le32 ro_compat_version;
 	__u8 padding2[3968];
-} __attribute__ ((packed));
+} __packed;
 
 /**
  * struct ubifs_mst_node - master node.
@@ -711,7 +713,7 @@ struct ubifs_mst_node {
 	__le32 idx_lebs;
 	__le32 leb_cnt;
 	__u8 padding[344];
-} __attribute__ ((packed));
+} __packed;
 
 /**
  * struct ubifs_ref_node - logical eraseblock reference node.
@@ -727,7 +729,7 @@ struct ubifs_ref_node {
 	__le32 offs;
 	__le32 jhead;
 	__u8 padding[28];
-} __attribute__ ((packed));
+} __packed;
 
 /**
  * struct ubifs_branch - key/reference/length branch
@@ -741,7 +743,7 @@ struct ubifs_branch {
 	__le32 offs;
 	__le32 len;
 	__u8 key[];
-} __attribute__ ((packed));
+} __packed;
 
 /**
  * struct ubifs_idx_node - indexing node.
@@ -755,7 +757,7 @@ struct ubifs_idx_node {
 	__le16 child_cnt;
 	__le16 level;
 	__u8 branches[];
-} __attribute__ ((packed));
+} __packed;
 
 /**
  * struct ubifs_cs_node - commit start node.
@@ -765,7 +767,7 @@ struct ubifs_idx_node {
 struct ubifs_cs_node {
 	struct ubifs_ch ch;
 	__le64 cmt_no;
-} __attribute__ ((packed));
+} __packed;
 
 /**
  * struct ubifs_orph_node - orphan node.
@@ -777,6 +779,6 @@ struct ubifs_orph_node {
 	struct ubifs_ch ch;
 	__le64 cmt_no;
 	__le64 inos[];
-} __attribute__ ((packed));
+} __packed;
 
 #endif /* __UBIFS_MEDIA_H__ */
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 8c40ad3..93d1412 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -389,9 +389,9 @@ struct ubifs_gced_idx_leb {
  * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses
  * @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot
  * make sure @inode->i_size is always changed under @ui_mutex, because it
- * cannot call 'truncate_setsize()' with @ui_mutex locked, because it would deadlock
- * with 'ubifs_writepage()' (see file.c). All the other inode fields are
- * changed under @ui_mutex, so they do not need "shadow" fields. Note, one
+ * cannot call 'truncate_setsize()' with @ui_mutex locked, because it would
+ * deadlock with 'ubifs_writepage()' (see file.c). All the other inode fields
+ * are changed under @ui_mutex, so they do not need "shadow" fields. Note, one
  * could consider to rework locking and base it on "shadow" fields.
  */
 struct ubifs_inode {
@@ -937,6 +937,40 @@ struct ubifs_mount_opts {
 	unsigned int compr_type:2;
 };
 
+/**
+ * struct ubifs_budg_info - UBIFS budgeting information.
+ * @idx_growth: amount of bytes budgeted for index growth
+ * @data_growth: amount of bytes budgeted for cached data
+ * @dd_growth: amount of bytes budgeted for cached data that will make
+ *             other data dirty
+ * @uncommitted_idx: amount of bytes were budgeted for growth of the index, but
+ *                   which still have to be taken into account because the index
+ *                   has not been committed so far
+ * @old_idx_sz: size of index on flash
+ * @min_idx_lebs: minimum number of LEBs required for the index
+ * @nospace: non-zero if the file-system does not have flash space (used as
+ *           optimization)
+ * @nospace_rp: the same as @nospace, but additionally means that even reserved
+ *              pool is full
+ * @page_budget: budget for a page (constant, nenver changed after mount)
+ * @inode_budget: budget for an inode (constant, nenver changed after mount)
+ * @dent_budget: budget for a directory entry (constant, nenver changed after
+ *               mount)
+ */
+struct ubifs_budg_info {
+	long long idx_growth;
+	long long data_growth;
+	long long dd_growth;
+	long long uncommitted_idx;
+	unsigned long long old_idx_sz;
+	int min_idx_lebs;
+	unsigned int nospace:1;
+	unsigned int nospace_rp:1;
+	int page_budget;
+	int inode_budget;
+	int dent_budget;
+};
+
 struct ubifs_debug_info;
 
 /**
@@ -980,6 +1014,7 @@ struct ubifs_debug_info;
  * @cmt_wq: wait queue to sleep on if the log is full and a commit is running
  *
  * @big_lpt: flag that LPT is too big to write whole during commit
+ * @space_fixup: flag indicating that free space in LEBs needs to be cleaned up
  * @no_chk_data_crc: do not check CRCs when reading data nodes (except during
  *                   recovery)
  * @bulk_read: enable bulk-reads
@@ -1057,32 +1092,14 @@ struct ubifs_debug_info;
  * @dirty_zn_cnt: number of dirty znodes
  * @clean_zn_cnt: number of clean znodes
  *
- * @budg_idx_growth: amount of bytes budgeted for index growth
- * @budg_data_growth: amount of bytes budgeted for cached data
- * @budg_dd_growth: amount of bytes budgeted for cached data that will make
- *                  other data dirty
- * @budg_uncommitted_idx: amount of bytes were budgeted for growth of the index,
- *                        but which still have to be taken into account because
- *                        the index has not been committed so far
- * @space_lock: protects @budg_idx_growth, @budg_data_growth, @budg_dd_growth,
- *              @budg_uncommited_idx, @min_idx_lebs, @old_idx_sz, @lst,
- *              @nospace, and @nospace_rp;
- * @min_idx_lebs: minimum number of LEBs required for the index
- * @old_idx_sz: size of index on flash
+ * @space_lock: protects @bi and @lst
+ * @lst: lprops statistics
+ * @bi: budgeting information
  * @calc_idx_sz: temporary variable which is used to calculate new index size
  *               (contains accurate new index size at end of TNC commit start)
- * @lst: lprops statistics
- * @nospace: non-zero if the file-system does not have flash space (used as
- *           optimization)
- * @nospace_rp: the same as @nospace, but additionally means that even reserved
- *              pool is full
- *
- * @page_budget: budget for a page
- * @inode_budget: budget for an inode
- * @dent_budget: budget for a directory entry
  *
  * @ref_node_alsz: size of the LEB reference node aligned to the min. flash
- * I/O unit
+ *                 I/O unit
  * @mst_node_alsz: master node aligned size
  * @min_idx_node_sz: minimum indexing node aligned on 8-bytes boundary
  * @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary
@@ -1189,7 +1206,6 @@ struct ubifs_debug_info;
  * @replaying: %1 during journal replay
  * @mounting: %1 while mounting
  * @remounting_rw: %1 while re-mounting from R/O mode to R/W mode
- * @replay_tree: temporary tree used during journal replay
  * @replay_list: temporary list used during journal replay
  * @replay_buds: list of buds to replay
  * @cs_sqnum: sequence number of first node in the log (commit start node)
@@ -1238,6 +1254,7 @@ struct ubifs_info {
 	wait_queue_head_t cmt_wq;
 
 	unsigned int big_lpt:1;
+	unsigned int space_fixup:1;
 	unsigned int no_chk_data_crc:1;
 	unsigned int bulk_read:1;
 	unsigned int default_compr:2;
@@ -1308,21 +1325,10 @@ struct ubifs_info {
 	atomic_long_t dirty_zn_cnt;
 	atomic_long_t clean_zn_cnt;
 
-	long long budg_idx_growth;
-	long long budg_data_growth;
-	long long budg_dd_growth;
-	long long budg_uncommitted_idx;
 	spinlock_t space_lock;
-	int min_idx_lebs;
-	unsigned long long old_idx_sz;
-	unsigned long long calc_idx_sz;
 	struct ubifs_lp_stats lst;
-	unsigned int nospace:1;
-	unsigned int nospace_rp:1;
-
-	int page_budget;
-	int inode_budget;
-	int dent_budget;
+	struct ubifs_budg_info bi;
+	unsigned long long calc_idx_sz;
 
 	int ref_node_alsz;
 	int mst_node_alsz;
@@ -1430,7 +1436,6 @@ struct ubifs_info {
 	unsigned int replaying:1;
 	unsigned int mounting:1;
 	unsigned int remounting_rw:1;
-	struct rb_root replay_tree;
 	struct list_head replay_list;
 	struct list_head replay_buds;
 	unsigned long long cs_sqnum;
@@ -1628,6 +1633,7 @@ int ubifs_write_master(struct ubifs_info *c);
 int ubifs_read_superblock(struct ubifs_info *c);
 struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c);
 int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup);
+int ubifs_fixup_free_space(struct ubifs_info *c);
 
 /* replay.c */
 int ubifs_validate_entry(struct ubifs_info *c,
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 3299f46..16f19f5 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -80,8 +80,8 @@ enum {
 	SECURITY_XATTR,
 };
 
-static const struct inode_operations none_inode_operations;
-static const struct file_operations none_file_operations;
+static const struct inode_operations empty_iops;
+static const struct file_operations empty_fops;
 
 /**
  * create_xattr - create an extended attribute.
@@ -131,8 +131,8 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
 
 	/* Re-define all operations to be "nothing" */
 	inode->i_mapping->a_ops = &empty_aops;
-	inode->i_op = &none_inode_operations;
-	inode->i_fop = &none_file_operations;
+	inode->i_op = &empty_iops;
+	inode->i_fop = &empty_fops;
 
 	inode->i_flags |= S_SYNC | S_NOATIME | S_NOCMTIME | S_NOQUOTA;
 	ui = ubifs_inode(inode);
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index e765743..b4d791a 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -409,7 +409,7 @@ out:
 }
 
 /**
- * ufs_getfrag_bloc() - `get_block_t' function, interface between UFS and
+ * ufs_getfrag_block() - `get_block_t' function, interface between UFS and
  * readpage, writepage and so on
  */
 
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 9ef9ed2..5e68099 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -33,7 +33,6 @@
 #include <linux/migrate.h>
 #include <linux/backing-dev.h>
 #include <linux/freezer.h>
-#include <linux/list_sort.h>
 
 #include "xfs_sb.h"
 #include "xfs_inum.h"
@@ -709,6 +708,27 @@ xfs_buf_get_empty(
 	return bp;
 }
 
+/*
+ * Return a buffer allocated as an empty buffer and associated to external
+ * memory via xfs_buf_associate_memory() back to it's empty state.
+ */
+void
+xfs_buf_set_empty(
+	struct xfs_buf		*bp,
+	size_t			len)
+{
+	if (bp->b_pages)
+		_xfs_buf_free_pages(bp);
+
+	bp->b_pages = NULL;
+	bp->b_page_count = 0;
+	bp->b_addr = NULL;
+	bp->b_file_offset = 0;
+	bp->b_buffer_length = bp->b_count_desired = len;
+	bp->b_bn = XFS_BUF_DADDR_NULL;
+	bp->b_flags &= ~XBF_MAPPED;
+}
+
 static inline struct page *
 mem_to_page(
 	void			*addr)
@@ -1402,12 +1422,12 @@ restart:
 int
 xfs_buftarg_shrink(
 	struct shrinker		*shrink,
-	int			nr_to_scan,
-	gfp_t			mask)
+	struct shrink_control	*sc)
 {
 	struct xfs_buftarg	*btp = container_of(shrink,
 					struct xfs_buftarg, bt_shrinker);
 	struct xfs_buf		*bp;
+	int nr_to_scan = sc->nr_to_scan;
 	LIST_HEAD(dispose);
 
 	if (!nr_to_scan)
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index a9a1c45..50a7d5f 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -178,6 +178,7 @@ extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t,
 				xfs_buf_flags_t);
 
 extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *);
+extern void xfs_buf_set_empty(struct xfs_buf *bp, size_t len);
 extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int);
 extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t);
 extern void xfs_buf_hold(xfs_buf_t *);
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index b3486df..54e623b 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -586,7 +586,8 @@ xfs_file_compat_ioctl(
 	case XFS_IOC_RESVSP_32:
 	case XFS_IOC_UNRESVSP_32:
 	case XFS_IOC_RESVSP64_32:
-	case XFS_IOC_UNRESVSP64_32: {
+	case XFS_IOC_UNRESVSP64_32:
+	case XFS_IOC_ZERO_RANGE_32: {
 		struct xfs_flock64	bf;
 
 		if (xfs_compat_flock64_copyin(&bf, arg))
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.h b/fs/xfs/linux-2.6/xfs_ioctl32.h
index 08b6057..80f4060 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.h
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.h
@@ -184,6 +184,7 @@ typedef struct compat_xfs_flock64 {
 #define XFS_IOC_UNRESVSP_32	_IOW('X', 41, struct compat_xfs_flock64)
 #define XFS_IOC_RESVSP64_32	_IOW('X', 42, struct compat_xfs_flock64)
 #define XFS_IOC_UNRESVSP64_32	_IOW('X', 43, struct compat_xfs_flock64)
+#define XFS_IOC_ZERO_RANGE_32	_IOW('X', 57, struct compat_xfs_flock64)
 
 typedef struct compat_xfs_fsop_geom_v1 {
 	__u32		blocksize;	/* filesystem (data) block size */
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 244be9c..8633521 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -70,6 +70,7 @@
 #include <linux/ctype.h>
 #include <linux/writeback.h>
 #include <linux/capability.h>
+#include <linux/list_sort.h>
 
 #include <asm/page.h>
 #include <asm/div64.h>
diff --git a/fs/xfs/linux-2.6/xfs_message.c b/fs/xfs/linux-2.6/xfs_message.c
index 9f76cce..bd672de 100644
--- a/fs/xfs/linux-2.6/xfs_message.c
+++ b/fs/xfs/linux-2.6/xfs_message.c
@@ -41,23 +41,6 @@ __xfs_printk(
 	printk("%sXFS: %pV\n", level, vaf);
 }
 
-void xfs_printk(
-	const char		*level,
-	const struct xfs_mount	*mp,
-	const char		*fmt, ...)
-{
-	struct va_format	vaf;
-	va_list			args;
-
-	va_start(args, fmt);
-
-	vaf.fmt = fmt;
-	vaf.va = &args;
-
-	__xfs_printk(level, mp, &vaf);
-	va_end(args);
-}
-
 #define define_xfs_printk_level(func, kern_level)		\
 void func(const struct xfs_mount *mp, const char *fmt, ...)	\
 {								\
@@ -95,8 +78,7 @@ xfs_alert_tag(
 	int			do_panic = 0;
 
 	if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) {
-		xfs_printk(KERN_ALERT, mp,
-			"XFS: Transforming an alert into a BUG.");
+		xfs_alert(mp, "Transforming an alert into a BUG.");
 		do_panic = 1;
 	}
 
diff --git a/fs/xfs/linux-2.6/xfs_message.h b/fs/xfs/linux-2.6/xfs_message.h
index f1b3fc1..7fb7ea0 100644
--- a/fs/xfs/linux-2.6/xfs_message.h
+++ b/fs/xfs/linux-2.6/xfs_message.h
@@ -3,9 +3,6 @@
 
 struct xfs_mount;
 
-extern void xfs_printk(const char *level, const struct xfs_mount *mp,
-                      const char *fmt, ...)
-        __attribute__ ((format (printf, 3, 4)));
 extern void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...)
         __attribute__ ((format (printf, 2, 3)));
 extern void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...)
@@ -28,7 +25,9 @@ extern void xfs_info(const struct xfs_mount *mp, const char *fmt, ...)
 extern void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
         __attribute__ ((format (printf, 2, 3)));
 #else
-static inline void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
+static inline void
+__attribute__ ((format (printf, 2, 3)))
+xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
 {
 }
 #endif
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index b38e58d..b0aa59e 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1787,10 +1787,6 @@ init_xfs_fs(void)
 	if (error)
 		goto out_cleanup_procfs;
 
-	error = xfs_init_workqueues();
-	if (error)
-		goto out_sysctl_unregister;
-
 	vfs_initquota();
 
 	error = register_filesystem(&xfs_fs_type);
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 3e898a4..8ecad5f 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -267,6 +267,16 @@ xfs_sync_inode_attr(
 
 	error = xfs_iflush(ip, flags);
 
+	/*
+	 * We don't want to try again on non-blocking flushes that can't run
+	 * again immediately. If an inode really must be written, then that's
+	 * what the SYNC_WAIT flag is for.
+	 */
+	if (error == EAGAIN) {
+		ASSERT(!(flags & SYNC_WAIT));
+		error = 0;
+	}
+
  out_unlock:
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 	return error;
@@ -1022,13 +1032,14 @@ xfs_reclaim_inodes(
 static int
 xfs_reclaim_inode_shrink(
 	struct shrinker	*shrink,
-	int		nr_to_scan,
-	gfp_t		gfp_mask)
+	struct shrink_control *sc)
 {
 	struct xfs_mount *mp;
 	struct xfs_perag *pag;
 	xfs_agnumber_t	ag;
 	int		reclaimable;
+	int nr_to_scan = sc->nr_to_scan;
+	gfp_t gfp_mask = sc->gfp_mask;
 
 	mp = container_of(shrink, struct xfs_mount, m_inode_shrink);
 	if (nr_to_scan) {
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index 2d0bcb4..d48b7a5 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -1151,44 +1151,7 @@ TRACE_EVENT(xfs_bunmap,
 
 );
 
-#define XFS_BUSY_SYNC \
-	{ 0,	"async" }, \
-	{ 1,	"sync" }
-
-TRACE_EVENT(xfs_alloc_busy,
-	TP_PROTO(struct xfs_trans *trans, xfs_agnumber_t agno,
-		 xfs_agblock_t agbno, xfs_extlen_t len, int sync),
-	TP_ARGS(trans, agno, agbno, len, sync),
-	TP_STRUCT__entry(
-		__field(dev_t, dev)
-		__field(struct xfs_trans *, tp)
-		__field(int, tid)
-		__field(xfs_agnumber_t, agno)
-		__field(xfs_agblock_t, agbno)
-		__field(xfs_extlen_t, len)
-		__field(int, sync)
-	),
-	TP_fast_assign(
-		__entry->dev = trans->t_mountp->m_super->s_dev;
-		__entry->tp = trans;
-		__entry->tid = trans->t_ticket->t_tid;
-		__entry->agno = agno;
-		__entry->agbno = agbno;
-		__entry->len = len;
-		__entry->sync = sync;
-	),
-	TP_printk("dev %d:%d trans 0x%p tid 0x%x agno %u agbno %u len %u %s",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->tp,
-		  __entry->tid,
-		  __entry->agno,
-		  __entry->agbno,
-		  __entry->len,
-		  __print_symbolic(__entry->sync, XFS_BUSY_SYNC))
-
-);
-
-TRACE_EVENT(xfs_alloc_unbusy,
+DECLARE_EVENT_CLASS(xfs_busy_class,
 	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
 		 xfs_agblock_t agbno, xfs_extlen_t len),
 	TP_ARGS(mp, agno, agbno, len),
@@ -1210,35 +1173,45 @@ TRACE_EVENT(xfs_alloc_unbusy,
 		  __entry->agbno,
 		  __entry->len)
 );
+#define DEFINE_BUSY_EVENT(name) \
+DEFINE_EVENT(xfs_busy_class, name, \
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+		 xfs_agblock_t agbno, xfs_extlen_t len), \
+	TP_ARGS(mp, agno, agbno, len))
+DEFINE_BUSY_EVENT(xfs_alloc_busy);
+DEFINE_BUSY_EVENT(xfs_alloc_busy_enomem);
+DEFINE_BUSY_EVENT(xfs_alloc_busy_force);
+DEFINE_BUSY_EVENT(xfs_alloc_busy_reuse);
+DEFINE_BUSY_EVENT(xfs_alloc_busy_clear);
 
-#define XFS_BUSY_STATES \
-	{ 0,	"missing" }, \
-	{ 1,	"found" }
-
-TRACE_EVENT(xfs_alloc_busysearch,
+TRACE_EVENT(xfs_alloc_busy_trim,
 	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
-		 xfs_agblock_t agbno, xfs_extlen_t len, int found),
-	TP_ARGS(mp, agno, agbno, len, found),
+		 xfs_agblock_t agbno, xfs_extlen_t len,
+		 xfs_agblock_t tbno, xfs_extlen_t tlen),
+	TP_ARGS(mp, agno, agbno, len, tbno, tlen),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(xfs_agnumber_t, agno)
 		__field(xfs_agblock_t, agbno)
 		__field(xfs_extlen_t, len)
-		__field(int, found)
+		__field(xfs_agblock_t, tbno)
+		__field(xfs_extlen_t, tlen)
 	),
 	TP_fast_assign(
 		__entry->dev = mp->m_super->s_dev;
 		__entry->agno = agno;
 		__entry->agbno = agbno;
 		__entry->len = len;
-		__entry->found = found;
+		__entry->tbno = tbno;
+		__entry->tlen = tlen;
 	),
-	TP_printk("dev %d:%d agno %u agbno %u len %u %s",
+	TP_printk("dev %d:%d agno %u agbno %u len %u tbno %u tlen %u",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->agno,
 		  __entry->agbno,
 		  __entry->len,
-		  __print_symbolic(__entry->found, XFS_BUSY_STATES))
+		  __entry->tbno,
+		  __entry->tlen)
 );
 
 TRACE_EVENT(xfs_trans_commit_lsn,
@@ -1418,7 +1391,7 @@ DECLARE_EVENT_CLASS(xfs_alloc_class,
 		  __entry->wasfromfl,
 		  __entry->isfl,
 		  __entry->userdata,
-		  __entry->firstblock)
+		  (unsigned long long)__entry->firstblock)
 )
 
 #define DEFINE_ALLOC_EVENT(name) \
@@ -1433,11 +1406,14 @@ DEFINE_ALLOC_EVENT(xfs_alloc_near_first);
 DEFINE_ALLOC_EVENT(xfs_alloc_near_greater);
 DEFINE_ALLOC_EVENT(xfs_alloc_near_lesser);
 DEFINE_ALLOC_EVENT(xfs_alloc_near_error);
+DEFINE_ALLOC_EVENT(xfs_alloc_near_noentry);
+DEFINE_ALLOC_EVENT(xfs_alloc_near_busy);
 DEFINE_ALLOC_EVENT(xfs_alloc_size_neither);
 DEFINE_ALLOC_EVENT(xfs_alloc_size_noentry);
 DEFINE_ALLOC_EVENT(xfs_alloc_size_nominleft);
 DEFINE_ALLOC_EVENT(xfs_alloc_size_done);
 DEFINE_ALLOC_EVENT(xfs_alloc_size_error);
+DEFINE_ALLOC_EVENT(xfs_alloc_size_busy);
 DEFINE_ALLOC_EVENT(xfs_alloc_small_freelist);
 DEFINE_ALLOC_EVENT(xfs_alloc_small_notenough);
 DEFINE_ALLOC_EVENT(xfs_alloc_small_done);
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 69228aa..b94dace 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -60,7 +60,7 @@ STATIC void	xfs_qm_list_destroy(xfs_dqlist_t *);
 
 STATIC int	xfs_qm_init_quotainos(xfs_mount_t *);
 STATIC int	xfs_qm_init_quotainfo(xfs_mount_t *);
-STATIC int	xfs_qm_shake(struct shrinker *, int, gfp_t);
+STATIC int	xfs_qm_shake(struct shrinker *, struct shrink_control *);
 
 static struct shrinker xfs_qm_shaker = {
 	.shrink = xfs_qm_shake,
@@ -2009,10 +2009,10 @@ xfs_qm_shake_freelist(
 STATIC int
 xfs_qm_shake(
 	struct shrinker	*shrink,
-	int		nr_to_scan,
-	gfp_t		gfp_mask)
+	struct shrink_control *sc)
 {
 	int	ndqused, nfree, n;
+	gfp_t gfp_mask = sc->gfp_mask;
 
 	if (!kmem_shake_allow(gfp_mask))
 		return 0;
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 58632cc..da0a561 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -187,7 +187,6 @@ struct xfs_busy_extent {
 	xfs_agnumber_t	agno;
 	xfs_agblock_t	bno;
 	xfs_extlen_t	length;
-	xlog_tid_t	tid;		/* transaction that created this */
 };
 
 /*
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 27d64d7..acdced8 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -41,19 +41,13 @@
 #define	XFSA_FIXUP_BNO_OK	1
 #define	XFSA_FIXUP_CNT_OK	2
 
-/*
- * Prototypes for per-ag allocation routines
- */
-
 STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
 STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
 STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
 STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
-	xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
-
-/*
- * Internal functions.
- */
+		xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
+STATIC void xfs_alloc_busy_trim(struct xfs_alloc_arg *,
+		xfs_agblock_t, xfs_extlen_t, xfs_agblock_t *, xfs_extlen_t *);
 
 /*
  * Lookup the record equal to [bno, len] in the btree given by cur.
@@ -154,19 +148,21 @@ xfs_alloc_compute_aligned(
 	xfs_extlen_t	*reslen)	/* result length */
 {
 	xfs_agblock_t	bno;
-	xfs_extlen_t	diff;
 	xfs_extlen_t	len;
 
-	if (args->alignment > 1 && foundlen >= args->minlen) {
-		bno = roundup(foundbno, args->alignment);
-		diff = bno - foundbno;
-		len = diff >= foundlen ? 0 : foundlen - diff;
+	/* Trim busy sections out of found extent */
+	xfs_alloc_busy_trim(args, foundbno, foundlen, &bno, &len);
+
+	if (args->alignment > 1 && len >= args->minlen) {
+		xfs_agblock_t	aligned_bno = roundup(bno, args->alignment);
+		xfs_extlen_t	diff = aligned_bno - bno;
+
+		*resbno = aligned_bno;
+		*reslen = diff >= len ? 0 : len - diff;
 	} else {
-		bno = foundbno;
-		len = foundlen;
+		*resbno = bno;
+		*reslen = len;
 	}
-	*resbno = bno;
-	*reslen = len;
 }
 
 /*
@@ -280,7 +276,6 @@ xfs_alloc_fix_minleft(
 		return 1;
 	agf = XFS_BUF_TO_AGF(args->agbp);
 	diff = be32_to_cpu(agf->agf_freeblks)
-		+ be32_to_cpu(agf->agf_flcount)
 		- args->len - args->minleft;
 	if (diff >= 0)
 		return 1;
@@ -541,16 +536,8 @@ xfs_alloc_ag_vextent(
 		if (error)
 			return error;
 
-		/*
-		 * Search the busylist for these blocks and mark the
-		 * transaction as synchronous if blocks are found. This
-		 * avoids the need to block due to a synchronous log
-		 * force to ensure correct ordering as the synchronous
-		 * transaction will guarantee that for us.
-		 */
-		if (xfs_alloc_busy_search(args->mp, args->agno,
-					args->agbno, args->len))
-			xfs_trans_set_sync(args->tp);
+		ASSERT(!xfs_alloc_busy_search(args->mp, args->agno,
+					      args->agbno, args->len));
 	}
 
 	if (!args->isfl) {
@@ -577,14 +564,14 @@ xfs_alloc_ag_vextent_exact(
 {
 	xfs_btree_cur_t	*bno_cur;/* by block-number btree cursor */
 	xfs_btree_cur_t	*cnt_cur;/* by count btree cursor */
-	xfs_agblock_t	end;	/* end of allocated extent */
 	int		error;
 	xfs_agblock_t	fbno;	/* start block of found extent */
-	xfs_agblock_t	fend;	/* end block of found extent */
 	xfs_extlen_t	flen;	/* length of found extent */
+	xfs_agblock_t	tbno;	/* start block of trimmed extent */
+	xfs_extlen_t	tlen;	/* length of trimmed extent */
+	xfs_agblock_t	tend;	/* end block of trimmed extent */
+	xfs_agblock_t	end;	/* end of allocated extent */
 	int		i;	/* success/failure of operation */
-	xfs_agblock_t	maxend;	/* end of maximal extent */
-	xfs_agblock_t	minend;	/* end of minimal extent */
 	xfs_extlen_t	rlen;	/* length of returned extent */
 
 	ASSERT(args->alignment == 1);
@@ -614,14 +601,22 @@ xfs_alloc_ag_vextent_exact(
 		goto error0;
 	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
 	ASSERT(fbno <= args->agbno);
-	minend = args->agbno + args->minlen;
-	maxend = args->agbno + args->maxlen;
-	fend = fbno + flen;
 
 	/*
-	 * Give up if the freespace isn't long enough for the minimum request.
+	 * Check for overlapping busy extents.
+	 */
+	xfs_alloc_busy_trim(args, fbno, flen, &tbno, &tlen);
+
+	/*
+	 * Give up if the start of the extent is busy, or the freespace isn't
+	 * long enough for the minimum request.
 	 */
-	if (fend < minend)
+	if (tbno > args->agbno)
+		goto not_found;
+	if (tlen < args->minlen)
+		goto not_found;
+	tend = tbno + tlen;
+	if (tend < args->agbno + args->minlen)
 		goto not_found;
 
 	/*
@@ -630,14 +625,14 @@ xfs_alloc_ag_vextent_exact(
 	 *
 	 * Fix the length according to mod and prod if given.
 	 */
-	end = XFS_AGBLOCK_MIN(fend, maxend);
+	end = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen);
 	args->len = end - args->agbno;
 	xfs_alloc_fix_len(args);
 	if (!xfs_alloc_fix_minleft(args))
 		goto not_found;
 
 	rlen = args->len;
-	ASSERT(args->agbno + rlen <= fend);
+	ASSERT(args->agbno + rlen <= tend);
 	end = args->agbno + rlen;
 
 	/*
@@ -686,11 +681,11 @@ xfs_alloc_find_best_extent(
 	struct xfs_btree_cur	**scur,	/* searching cursor */
 	xfs_agblock_t		gdiff,	/* difference for search comparison */
 	xfs_agblock_t		*sbno,	/* extent found by search */
-	xfs_extlen_t		*slen,
-	xfs_extlen_t		*slena,	/* aligned length */
+	xfs_extlen_t		*slen,	/* extent length */
+	xfs_agblock_t		*sbnoa,	/* aligned extent found by search */
+	xfs_extlen_t		*slena,	/* aligned extent length */
 	int			dir)	/* 0 = search right, 1 = search left */
 {
-	xfs_agblock_t		bno;
 	xfs_agblock_t		new;
 	xfs_agblock_t		sdiff;
 	int			error;
@@ -708,16 +703,16 @@ xfs_alloc_find_best_extent(
 		if (error)
 			goto error0;
 		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-		xfs_alloc_compute_aligned(args, *sbno, *slen, &bno, slena);
+		xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena);
 
 		/*
 		 * The good extent is closer than this one.
 		 */
 		if (!dir) {
-			if (bno >= args->agbno + gdiff)
+			if (*sbnoa >= args->agbno + gdiff)
 				goto out_use_good;
 		} else {
-			if (bno <= args->agbno - gdiff)
+			if (*sbnoa <= args->agbno - gdiff)
 				goto out_use_good;
 		}
 
@@ -729,8 +724,8 @@ xfs_alloc_find_best_extent(
 			xfs_alloc_fix_len(args);
 
 			sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-						       args->alignment, *sbno,
-						       *slen, &new);
+						       args->alignment, *sbnoa,
+						       *slena, &new);
 
 			/*
 			 * Choose closer size and invalidate other cursor.
@@ -780,7 +775,7 @@ xfs_alloc_ag_vextent_near(
 	xfs_agblock_t	gtbnoa;		/* aligned ... */
 	xfs_extlen_t	gtdiff;		/* difference to right side entry */
 	xfs_extlen_t	gtlen;		/* length of right side entry */
-	xfs_extlen_t	gtlena = 0;	/* aligned ... */
+	xfs_extlen_t	gtlena;		/* aligned ... */
 	xfs_agblock_t	gtnew;		/* useful start bno of right side */
 	int		error;		/* error code */
 	int		i;		/* result code, temporary */
@@ -789,9 +784,10 @@ xfs_alloc_ag_vextent_near(
 	xfs_agblock_t	ltbnoa;		/* aligned ... */
 	xfs_extlen_t	ltdiff;		/* difference to left side entry */
 	xfs_extlen_t	ltlen;		/* length of left side entry */
-	xfs_extlen_t	ltlena = 0;	/* aligned ... */
+	xfs_extlen_t	ltlena;		/* aligned ... */
 	xfs_agblock_t	ltnew;		/* useful start bno of left side */
 	xfs_extlen_t	rlen;		/* length of returned extent */
+	int		forced = 0;
 #if defined(DEBUG) && defined(__KERNEL__)
 	/*
 	 * Randomly don't execute the first algorithm.
@@ -800,13 +796,20 @@ xfs_alloc_ag_vextent_near(
 
 	dofirst = random32() & 1;
 #endif
+
+restart:
+	bno_cur_lt = NULL;
+	bno_cur_gt = NULL;
+	ltlen = 0;
+	gtlena = 0;
+	ltlena = 0;
+
 	/*
 	 * Get a cursor for the by-size btree.
 	 */
 	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
 		args->agno, XFS_BTNUM_CNT);
-	ltlen = 0;
-	bno_cur_lt = bno_cur_gt = NULL;
+
 	/*
 	 * See if there are any free extents as big as maxlen.
 	 */
@@ -822,11 +825,13 @@ xfs_alloc_ag_vextent_near(
 			goto error0;
 		if (i == 0 || ltlen == 0) {
 			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+			trace_xfs_alloc_near_noentry(args);
 			return 0;
 		}
 		ASSERT(i == 1);
 	}
 	args->wasfromfl = 0;
+
 	/*
 	 * First algorithm.
 	 * If the requested extent is large wrt the freespaces available
@@ -890,7 +895,7 @@ xfs_alloc_ag_vextent_near(
 			if (args->len < blen)
 				continue;
 			ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-				args->alignment, ltbno, ltlen, &ltnew);
+				args->alignment, ltbnoa, ltlena, &ltnew);
 			if (ltnew != NULLAGBLOCK &&
 			    (args->len > blen || ltdiff < bdiff)) {
 				bdiff = ltdiff;
@@ -1042,11 +1047,12 @@ xfs_alloc_ag_vextent_near(
 			args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
 			xfs_alloc_fix_len(args);
 			ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-				args->alignment, ltbno, ltlen, &ltnew);
+				args->alignment, ltbnoa, ltlena, &ltnew);
 
 			error = xfs_alloc_find_best_extent(args,
 						&bno_cur_lt, &bno_cur_gt,
-						ltdiff, &gtbno, &gtlen, &gtlena,
+						ltdiff, &gtbno, &gtlen,
+						&gtbnoa, &gtlena,
 						0 /* search right */);
 		} else {
 			ASSERT(gtlena >= args->minlen);
@@ -1057,11 +1063,12 @@ xfs_alloc_ag_vextent_near(
 			args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
 			xfs_alloc_fix_len(args);
 			gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-				args->alignment, gtbno, gtlen, &gtnew);
+				args->alignment, gtbnoa, gtlena, &gtnew);
 
 			error = xfs_alloc_find_best_extent(args,
 						&bno_cur_gt, &bno_cur_lt,
-						gtdiff, &ltbno, &ltlen, &ltlena,
+						gtdiff, &ltbno, &ltlen,
+						&ltbnoa, &ltlena,
 						1 /* search left */);
 		}
 
@@ -1073,6 +1080,12 @@ xfs_alloc_ag_vextent_near(
 	 * If we couldn't get anything, give up.
 	 */
 	if (bno_cur_lt == NULL && bno_cur_gt == NULL) {
+		if (!forced++) {
+			trace_xfs_alloc_near_busy(args);
+			xfs_log_force(args->mp, XFS_LOG_SYNC);
+			goto restart;
+		}
+
 		trace_xfs_alloc_size_neither(args);
 		args->agbno = NULLAGBLOCK;
 		return 0;
@@ -1107,12 +1120,13 @@ xfs_alloc_ag_vextent_near(
 		return 0;
 	}
 	rlen = args->len;
-	(void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, ltbno,
-		ltlen, &ltnew);
+	(void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment,
+				     ltbnoa, ltlena, &ltnew);
 	ASSERT(ltnew >= ltbno);
-	ASSERT(ltnew + rlen <= ltbno + ltlen);
+	ASSERT(ltnew + rlen <= ltbnoa + ltlena);
 	ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
 	args->agbno = ltnew;
+
 	if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,
 			ltnew, rlen, XFSA_FIXUP_BNO_OK)))
 		goto error0;
@@ -1155,26 +1169,35 @@ xfs_alloc_ag_vextent_size(
 	int		i;		/* temp status variable */
 	xfs_agblock_t	rbno;		/* returned block number */
 	xfs_extlen_t	rlen;		/* length of returned extent */
+	int		forced = 0;
 
+restart:
 	/*
 	 * Allocate and initialize a cursor for the by-size btree.
 	 */
 	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
 		args->agno, XFS_BTNUM_CNT);
 	bno_cur = NULL;
+
 	/*
 	 * Look for an entry >= maxlen+alignment-1 blocks.
 	 */
 	if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
 			args->maxlen + args->alignment - 1, &i)))
 		goto error0;
+
 	/*
-	 * If none, then pick up the last entry in the tree unless the
-	 * tree is empty.
+	 * If none or we have busy extents that we cannot allocate from, then
+	 * we have to settle for a smaller extent. In the case that there are
+	 * no large extents, this will return the last entry in the tree unless
+	 * the tree is empty. In the case that there are only busy large
+	 * extents, this will return the largest small extent unless there
+	 * are no smaller extents available.
 	 */
-	if (!i) {
-		if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, &fbno,
-				&flen, &i)))
+	if (!i || forced > 1) {
+		error = xfs_alloc_ag_vextent_small(args, cnt_cur,
+						   &fbno, &flen, &i);
+		if (error)
 			goto error0;
 		if (i == 0 || flen == 0) {
 			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
@@ -1182,22 +1205,56 @@ xfs_alloc_ag_vextent_size(
 			return 0;
 		}
 		ASSERT(i == 1);
+		xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen);
+	} else {
+		/*
+		 * Search for a non-busy extent that is large enough.
+		 * If we are at low space, don't check, or if we fall of
+		 * the end of the btree, turn off the busy check and
+		 * restart.
+		 */
+		for (;;) {
+			error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
+			if (error)
+				goto error0;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+			xfs_alloc_compute_aligned(args, fbno, flen,
+						  &rbno, &rlen);
+
+			if (rlen >= args->maxlen)
+				break;
+
+			error = xfs_btree_increment(cnt_cur, 0, &i);
+			if (error)
+				goto error0;
+			if (i == 0) {
+				/*
+				 * Our only valid extents must have been busy.
+				 * Make it unbusy by forcing the log out and
+				 * retrying. If we've been here before, forcing
+				 * the log isn't making the extents available,
+				 * which means they have probably been freed in
+				 * this transaction.  In that case, we have to
+				 * give up on them and we'll attempt a minlen
+				 * allocation the next time around.
+				 */
+				xfs_btree_del_cursor(cnt_cur,
+						     XFS_BTREE_NOERROR);
+				trace_xfs_alloc_size_busy(args);
+				if (!forced++)
+					xfs_log_force(args->mp, XFS_LOG_SYNC);
+				goto restart;
+			}
+		}
 	}
-	/*
-	 * There's a freespace as big as maxlen+alignment-1, get it.
-	 */
-	else {
-		if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i)))
-			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-	}
+
 	/*
 	 * In the first case above, we got the last entry in the
 	 * by-size btree.  Now we check to see if the space hits maxlen
 	 * once aligned; if not, we search left for something better.
 	 * This can't happen in the second case above.
 	 */
-	xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen);
 	rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
 	XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
 			(rlen <= flen && rbno + rlen <= fbno + flen), error0);
@@ -1251,13 +1308,19 @@ xfs_alloc_ag_vextent_size(
 	 * Fix up the length.
 	 */
 	args->len = rlen;
-	xfs_alloc_fix_len(args);
-	if (rlen < args->minlen || !xfs_alloc_fix_minleft(args)) {
-		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-		trace_xfs_alloc_size_nominleft(args);
-		args->agbno = NULLAGBLOCK;
-		return 0;
+	if (rlen < args->minlen) {
+		if (!forced++) {
+			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+			trace_xfs_alloc_size_busy(args);
+			xfs_log_force(args->mp, XFS_LOG_SYNC);
+			goto restart;
+		}
+		goto out_nominleft;
 	}
+	xfs_alloc_fix_len(args);
+
+	if (!xfs_alloc_fix_minleft(args))
+		goto out_nominleft;
 	rlen = args->len;
 	XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0);
 	/*
@@ -1287,6 +1350,12 @@ error0:
 	if (bno_cur)
 		xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
 	return error;
+
+out_nominleft:
+	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+	trace_xfs_alloc_size_nominleft(args);
+	args->agbno = NULLAGBLOCK;
+	return 0;
 }
 
 /*
@@ -1326,6 +1395,9 @@ xfs_alloc_ag_vextent_small(
 		if (error)
 			goto error0;
 		if (fbno != NULLAGBLOCK) {
+			xfs_alloc_busy_reuse(args->mp, args->agno, fbno, 1,
+					     args->userdata);
+
 			if (args->userdata) {
 				xfs_buf_t	*bp;
 
@@ -1617,18 +1689,6 @@ xfs_free_ag_extent(
 
 	trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright);
 
-	/*
-	 * Since blocks move to the free list without the coordination
-	 * used in xfs_bmap_finish, we can't allow block to be available
-	 * for reallocation and non-transaction writing (user data)
-	 * until we know that the transaction that moved it to the free
-	 * list is permanently on disk.  We track the blocks by declaring
-	 * these blocks as "busy"; the busy list is maintained on a per-ag
-	 * basis and each transaction records which entries should be removed
-	 * when the iclog commits to disk.  If a busy block is allocated,
-	 * the iclog is pushed up to the LSN that freed the block.
-	 */
-	xfs_alloc_busy_insert(tp, agno, bno, len);
 	return 0;
 
  error0:
@@ -1923,21 +1983,6 @@ xfs_alloc_get_freelist(
 	xfs_alloc_log_agf(tp, agbp, logflags);
 	*bnop = bno;
 
-	/*
-	 * As blocks are freed, they are added to the per-ag busy list and
-	 * remain there until the freeing transaction is committed to disk.
-	 * Now that we have allocated blocks, this list must be searched to see
-	 * if a block is being reused.  If one is, then the freeing transaction
-	 * must be pushed to disk before this transaction.
-	 *
-	 * We do this by setting the current transaction to a sync transaction
-	 * which guarantees that the freeing transaction is on disk before this
-	 * transaction. This is done instead of a synchronous log force here so
-	 * that we don't sit and wait with the AGF locked in the transaction
-	 * during the log force.
-	 */
-	if (xfs_alloc_busy_search(mp, be32_to_cpu(agf->agf_seqno), bno, 1))
-		xfs_trans_set_sync(tp);
 	return 0;
 }
 
@@ -2423,105 +2468,13 @@ xfs_free_extent(
 	}
 
 	error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
+	if (!error)
+		xfs_alloc_busy_insert(tp, args.agno, args.agbno, len);
 error0:
 	xfs_perag_put(args.pag);
 	return error;
 }
 
-
-/*
- * AG Busy list management
- * The busy list contains block ranges that have been freed but whose
- * transactions have not yet hit disk.  If any block listed in a busy
- * list is reused, the transaction that freed it must be forced to disk
- * before continuing to use the block.
- *
- * xfs_alloc_busy_insert - add to the per-ag busy list
- * xfs_alloc_busy_clear - remove an item from the per-ag busy list
- * xfs_alloc_busy_search - search for a busy extent
- */
-
-/*
- * Insert a new extent into the busy tree.
- *
- * The busy extent tree is indexed by the start block of the busy extent.
- * there can be multiple overlapping ranges in the busy extent tree but only
- * ever one entry at a given start block. The reason for this is that
- * multi-block extents can be freed, then smaller chunks of that extent
- * allocated and freed again before the first transaction commit is on disk.
- * If the exact same start block is freed a second time, we have to wait for
- * that busy extent to pass out of the tree before the new extent is inserted.
- * There are two main cases we have to handle here.
- *
- * The first case is a transaction that triggers a "free - allocate - free"
- * cycle. This can occur during btree manipulations as a btree block is freed
- * to the freelist, then allocated from the free list, then freed again. In
- * this case, the second extxpnet free is what triggers the duplicate and as
- * such the transaction IDs should match. Because the extent was allocated in
- * this transaction, the transaction must be marked as synchronous. This is
- * true for all cases where the free/alloc/free occurs in the one transaction,
- * hence the addition of the ASSERT(tp->t_flags & XFS_TRANS_SYNC) to this case.
- * This serves to catch violations of the second case quite effectively.
- *
- * The second case is where the free/alloc/free occur in different
- * transactions. In this case, the thread freeing the extent the second time
- * can't mark the extent busy immediately because it is already tracked in a
- * transaction that may be committing.  When the log commit for the existing
- * busy extent completes, the busy extent will be removed from the tree. If we
- * allow the second busy insert to continue using that busy extent structure,
- * it can be freed before this transaction is safely in the log.  Hence our
- * only option in this case is to force the log to remove the existing busy
- * extent from the list before we insert the new one with the current
- * transaction ID.
- *
- * The problem we are trying to avoid in the free-alloc-free in separate
- * transactions is most easily described with a timeline:
- *
- *      Thread 1	Thread 2	Thread 3	xfslogd
- *	xact alloc
- *	free X
- *	mark busy
- *	commit xact
- *	free xact
- *			xact alloc
- *			alloc X
- *			busy search
- *			mark xact sync
- *			commit xact
- *			free xact
- *			force log
- *			checkpoint starts
- *			....
- *					xact alloc
- *					free X
- *					mark busy
- *					finds match
- *					*** KABOOM! ***
- *					....
- *							log IO completes
- *							unbusy X
- *			checkpoint completes
- *
- * By issuing a log force in thread 3 @ "KABOOM", the thread will block until
- * the checkpoint completes, and the busy extent it matched will have been
- * removed from the tree when it is woken. Hence it can then continue safely.
- *
- * However, to ensure this matching process is robust, we need to use the
- * transaction ID for identifying transaction, as delayed logging results in
- * the busy extent and transaction lifecycles being different. i.e. the busy
- * extent is active for a lot longer than the transaction.  Hence the
- * transaction structure can be freed and reallocated, then mark the same
- * extent busy again in the new transaction. In this case the new transaction
- * will have a different tid but can have the same address, and hence we need
- * to check against the tid.
- *
- * Future: for delayed logging, we could avoid the log force if the extent was
- * first freed in the current checkpoint sequence. This, however, requires the
- * ability to pin the current checkpoint in memory until this transaction
- * commits to ensure that both the original free and the current one combine
- * logically into the one checkpoint. If the checkpoint sequences are
- * different, however, we still need to wait on a log force.
- */
 void
 xfs_alloc_busy_insert(
 	struct xfs_trans	*tp,
@@ -2533,9 +2486,7 @@ xfs_alloc_busy_insert(
 	struct xfs_busy_extent	*busyp;
 	struct xfs_perag	*pag;
 	struct rb_node		**rbp;
-	struct rb_node		*parent;
-	int			match;
-
+	struct rb_node		*parent = NULL;
 
 	new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL);
 	if (!new) {
@@ -2544,7 +2495,7 @@ xfs_alloc_busy_insert(
 		 * block, make this a synchronous transaction to insure that
 		 * the block is not reused before this transaction commits.
 		 */
-		trace_xfs_alloc_busy(tp, agno, bno, len, 1);
+		trace_xfs_alloc_busy_enomem(tp->t_mountp, agno, bno, len);
 		xfs_trans_set_sync(tp);
 		return;
 	}
@@ -2552,66 +2503,28 @@ xfs_alloc_busy_insert(
 	new->agno = agno;
 	new->bno = bno;
 	new->length = len;
-	new->tid = xfs_log_get_trans_ident(tp);
-
 	INIT_LIST_HEAD(&new->list);
 
 	/* trace before insert to be able to see failed inserts */
-	trace_xfs_alloc_busy(tp, agno, bno, len, 0);
+	trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len);
 
 	pag = xfs_perag_get(tp->t_mountp, new->agno);
-restart:
 	spin_lock(&pag->pagb_lock);
 	rbp = &pag->pagb_tree.rb_node;
-	parent = NULL;
-	busyp = NULL;
-	match = 0;
-	while (*rbp && match >= 0) {
+	while (*rbp) {
 		parent = *rbp;
 		busyp = rb_entry(parent, struct xfs_busy_extent, rb_node);
 
 		if (new->bno < busyp->bno) {
-			/* may overlap, but exact start block is lower */
 			rbp = &(*rbp)->rb_left;
-			if (new->bno + new->length > busyp->bno)
-				match = busyp->tid == new->tid ? 1 : -1;
+			ASSERT(new->bno + new->length <= busyp->bno);
 		} else if (new->bno > busyp->bno) {
-			/* may overlap, but exact start block is higher */
 			rbp = &(*rbp)->rb_right;
-			if (bno < busyp->bno + busyp->length)
-				match = busyp->tid == new->tid ? 1 : -1;
+			ASSERT(bno >= busyp->bno + busyp->length);
 		} else {
-			match = busyp->tid == new->tid ? 1 : -1;
-			break;
+			ASSERT(0);
 		}
 	}
-	if (match < 0) {
-		/* overlap marked busy in different transaction */
-		spin_unlock(&pag->pagb_lock);
-		xfs_log_force(tp->t_mountp, XFS_LOG_SYNC);
-		goto restart;
-	}
-	if (match > 0) {
-		/*
-		 * overlap marked busy in same transaction. Update if exact
-		 * start block match, otherwise combine the busy extents into
-		 * a single range.
-		 */
-		if (busyp->bno == new->bno) {
-			busyp->length = max(busyp->length, new->length);
-			spin_unlock(&pag->pagb_lock);
-			ASSERT(tp->t_flags & XFS_TRANS_SYNC);
-			xfs_perag_put(pag);
-			kmem_free(new);
-			return;
-		}
-		rb_erase(&busyp->rb_node, &pag->pagb_tree);
-		new->length = max(busyp->bno + busyp->length,
-					new->bno + new->length) -
-				min(busyp->bno, new->bno);
-		new->bno = min(busyp->bno, new->bno);
-	} else
-		busyp = NULL;
 
 	rb_link_node(&new->rb_node, parent, rbp);
 	rb_insert_color(&new->rb_node, &pag->pagb_tree);
@@ -2619,7 +2532,6 @@ restart:
 	list_add(&new->list, &tp->t_busy);
 	spin_unlock(&pag->pagb_lock);
 	xfs_perag_put(pag);
-	kmem_free(busyp);
 }
 
 /*
@@ -2668,31 +2580,443 @@ xfs_alloc_busy_search(
 		}
 	}
 	spin_unlock(&pag->pagb_lock);
-	trace_xfs_alloc_busysearch(mp, agno, bno, len, !!match);
 	xfs_perag_put(pag);
 	return match;
 }
 
+/*
+ * The found free extent [fbno, fend] overlaps part or all of the given busy
+ * extent.  If the overlap covers the beginning, the end, or all of the busy
+ * extent, the overlapping portion can be made unbusy and used for the
+ * allocation.  We can't split a busy extent because we can't modify a
+ * transaction/CIL context busy list, but we can update an entries block
+ * number or length.
+ *
+ * Returns true if the extent can safely be reused, or false if the search
+ * needs to be restarted.
+ */
+STATIC bool
+xfs_alloc_busy_update_extent(
+	struct xfs_mount	*mp,
+	struct xfs_perag	*pag,
+	struct xfs_busy_extent	*busyp,
+	xfs_agblock_t		fbno,
+	xfs_extlen_t		flen,
+	bool			userdata)
+{
+	xfs_agblock_t		fend = fbno + flen;
+	xfs_agblock_t		bbno = busyp->bno;
+	xfs_agblock_t		bend = bbno + busyp->length;
+
+	/*
+	 * If there is a busy extent overlapping a user allocation, we have
+	 * no choice but to force the log and retry the search.
+	 *
+	 * Fortunately this does not happen during normal operation, but
+	 * only if the filesystem is very low on space and has to dip into
+	 * the AGFL for normal allocations.
+	 */
+	if (userdata)
+		goto out_force_log;
+
+	if (bbno < fbno && bend > fend) {
+		/*
+		 * Case 1:
+		 *    bbno           bend
+		 *    +BBBBBBBBBBBBBBBBB+
+		 *        +---------+
+		 *        fbno   fend
+		 */
+
+		/*
+		 * We would have to split the busy extent to be able to track
+		 * it correct, which we cannot do because we would have to
+		 * modify the list of busy extents attached to the transaction
+		 * or CIL context, which is immutable.
+		 *
+		 * Force out the log to clear the busy extent and retry the
+		 * search.
+		 */
+		goto out_force_log;
+	} else if (bbno >= fbno && bend <= fend) {
+		/*
+		 * Case 2:
+		 *    bbno           bend
+		 *    +BBBBBBBBBBBBBBBBB+
+		 *    +-----------------+
+		 *    fbno           fend
+		 *
+		 * Case 3:
+		 *    bbno           bend
+		 *    +BBBBBBBBBBBBBBBBB+
+		 *    +--------------------------+
+		 *    fbno                    fend
+		 *
+		 * Case 4:
+		 *             bbno           bend
+		 *             +BBBBBBBBBBBBBBBBB+
+		 *    +--------------------------+
+		 *    fbno                    fend
+		 *
+		 * Case 5:
+		 *             bbno           bend
+		 *             +BBBBBBBBBBBBBBBBB+
+		 *    +-----------------------------------+
+		 *    fbno                             fend
+		 *
+		 */
+
+		/*
+		 * The busy extent is fully covered by the extent we are
+		 * allocating, and can simply be removed from the rbtree.
+		 * However we cannot remove it from the immutable list
+		 * tracking busy extents in the transaction or CIL context,
+		 * so set the length to zero to mark it invalid.
+		 *
+		 * We also need to restart the busy extent search from the
+		 * tree root, because erasing the node can rearrange the
+		 * tree topology.
+		 */
+		rb_erase(&busyp->rb_node, &pag->pagb_tree);
+		busyp->length = 0;
+		return false;
+	} else if (fend < bend) {
+		/*
+		 * Case 6:
+		 *              bbno           bend
+		 *             +BBBBBBBBBBBBBBBBB+
+		 *             +---------+
+		 *             fbno   fend
+		 *
+		 * Case 7:
+		 *             bbno           bend
+		 *             +BBBBBBBBBBBBBBBBB+
+		 *    +------------------+
+		 *    fbno            fend
+		 *
+		 */
+		busyp->bno = fend;
+	} else if (bbno < fbno) {
+		/*
+		 * Case 8:
+		 *    bbno           bend
+		 *    +BBBBBBBBBBBBBBBBB+
+		 *        +-------------+
+		 *        fbno       fend
+		 *
+		 * Case 9:
+		 *    bbno           bend
+		 *    +BBBBBBBBBBBBBBBBB+
+		 *        +----------------------+
+		 *        fbno                fend
+		 */
+		busyp->length = fbno - busyp->bno;
+	} else {
+		ASSERT(0);
+	}
+
+	trace_xfs_alloc_busy_reuse(mp, pag->pag_agno, fbno, flen);
+	return true;
+
+out_force_log:
+	spin_unlock(&pag->pagb_lock);
+	xfs_log_force(mp, XFS_LOG_SYNC);
+	trace_xfs_alloc_busy_force(mp, pag->pag_agno, fbno, flen);
+	spin_lock(&pag->pagb_lock);
+	return false;
+}
+
+
+/*
+ * For a given extent [fbno, flen], make sure we can reuse it safely.
+ */
 void
-xfs_alloc_busy_clear(
+xfs_alloc_busy_reuse(
 	struct xfs_mount	*mp,
-	struct xfs_busy_extent	*busyp)
+	xfs_agnumber_t		agno,
+	xfs_agblock_t		fbno,
+	xfs_extlen_t		flen,
+	bool			userdata)
 {
 	struct xfs_perag	*pag;
+	struct rb_node		*rbp;
 
-	trace_xfs_alloc_unbusy(mp, busyp->agno, busyp->bno,
-						busyp->length);
+	ASSERT(flen > 0);
 
-	ASSERT(xfs_alloc_busy_search(mp, busyp->agno, busyp->bno,
-						busyp->length) == 1);
+	pag = xfs_perag_get(mp, agno);
+	spin_lock(&pag->pagb_lock);
+restart:
+	rbp = pag->pagb_tree.rb_node;
+	while (rbp) {
+		struct xfs_busy_extent *busyp =
+			rb_entry(rbp, struct xfs_busy_extent, rb_node);
+		xfs_agblock_t	bbno = busyp->bno;
+		xfs_agblock_t	bend = bbno + busyp->length;
 
-	list_del_init(&busyp->list);
+		if (fbno + flen <= bbno) {
+			rbp = rbp->rb_left;
+			continue;
+		} else if (fbno >= bend) {
+			rbp = rbp->rb_right;
+			continue;
+		}
 
-	pag = xfs_perag_get(mp, busyp->agno);
-	spin_lock(&pag->pagb_lock);
-	rb_erase(&busyp->rb_node, &pag->pagb_tree);
+		if (!xfs_alloc_busy_update_extent(mp, pag, busyp, fbno, flen,
+						  userdata))
+			goto restart;
+	}
 	spin_unlock(&pag->pagb_lock);
 	xfs_perag_put(pag);
+}
+
+/*
+ * For a given extent [fbno, flen], search the busy extent list to find a
+ * subset of the extent that is not busy.  If *rlen is smaller than
+ * args->minlen no suitable extent could be found, and the higher level
+ * code needs to force out the log and retry the allocation.
+ */
+STATIC void
+xfs_alloc_busy_trim(
+	struct xfs_alloc_arg	*args,
+	xfs_agblock_t		bno,
+	xfs_extlen_t		len,
+	xfs_agblock_t		*rbno,
+	xfs_extlen_t		*rlen)
+{
+	xfs_agblock_t		fbno;
+	xfs_extlen_t		flen;
+	struct rb_node		*rbp;
+
+	ASSERT(len > 0);
 
+	spin_lock(&args->pag->pagb_lock);
+restart:
+	fbno = bno;
+	flen = len;
+	rbp = args->pag->pagb_tree.rb_node;
+	while (rbp && flen >= args->minlen) {
+		struct xfs_busy_extent *busyp =
+			rb_entry(rbp, struct xfs_busy_extent, rb_node);
+		xfs_agblock_t	fend = fbno + flen;
+		xfs_agblock_t	bbno = busyp->bno;
+		xfs_agblock_t	bend = bbno + busyp->length;
+
+		if (fend <= bbno) {
+			rbp = rbp->rb_left;
+			continue;
+		} else if (fbno >= bend) {
+			rbp = rbp->rb_right;
+			continue;
+		}
+
+		/*
+		 * If this is a metadata allocation, try to reuse the busy
+		 * extent instead of trimming the allocation.
+		 */
+		if (!args->userdata) {
+			if (!xfs_alloc_busy_update_extent(args->mp, args->pag,
+							  busyp, fbno, flen,
+							  false))
+				goto restart;
+			continue;
+		}
+
+		if (bbno <= fbno) {
+			/* start overlap */
+
+			/*
+			 * Case 1:
+			 *    bbno           bend
+			 *    +BBBBBBBBBBBBBBBBB+
+			 *        +---------+
+			 *        fbno   fend
+			 *
+			 * Case 2:
+			 *    bbno           bend
+			 *    +BBBBBBBBBBBBBBBBB+
+			 *    +-------------+
+			 *    fbno       fend
+			 *
+			 * Case 3:
+			 *    bbno           bend
+			 *    +BBBBBBBBBBBBBBBBB+
+			 *        +-------------+
+			 *        fbno       fend
+			 *
+			 * Case 4:
+			 *    bbno           bend
+			 *    +BBBBBBBBBBBBBBBBB+
+			 *    +-----------------+
+			 *    fbno           fend
+			 *
+			 * No unbusy region in extent, return failure.
+			 */
+			if (fend <= bend)
+				goto fail;
+
+			/*
+			 * Case 5:
+			 *    bbno           bend
+			 *    +BBBBBBBBBBBBBBBBB+
+			 *        +----------------------+
+			 *        fbno                fend
+			 *
+			 * Case 6:
+			 *    bbno           bend
+			 *    +BBBBBBBBBBBBBBBBB+
+			 *    +--------------------------+
+			 *    fbno                    fend
+			 *
+			 * Needs to be trimmed to:
+			 *                       +-------+
+			 *                       fbno fend
+			 */
+			fbno = bend;
+		} else if (bend >= fend) {
+			/* end overlap */
+
+			/*
+			 * Case 7:
+			 *             bbno           bend
+			 *             +BBBBBBBBBBBBBBBBB+
+			 *    +------------------+
+			 *    fbno            fend
+			 *
+			 * Case 8:
+			 *             bbno           bend
+			 *             +BBBBBBBBBBBBBBBBB+
+			 *    +--------------------------+
+			 *    fbno                    fend
+			 *
+			 * Needs to be trimmed to:
+			 *    +-------+
+			 *    fbno fend
+			 */
+			fend = bbno;
+		} else {
+			/* middle overlap */
+
+			/*
+			 * Case 9:
+			 *             bbno           bend
+			 *             +BBBBBBBBBBBBBBBBB+
+			 *    +-----------------------------------+
+			 *    fbno                             fend
+			 *
+			 * Can be trimmed to:
+			 *    +-------+        OR         +-------+
+			 *    fbno fend                   fbno fend
+			 *
+			 * Backward allocation leads to significant
+			 * fragmentation of directories, which degrades
+			 * directory performance, therefore we always want to
+			 * choose the option that produces forward allocation
+			 * patterns.
+			 * Preferring the lower bno extent will make the next
+			 * request use "fend" as the start of the next
+			 * allocation;  if the segment is no longer busy at
+			 * that point, we'll get a contiguous allocation, but
+			 * even if it is still busy, we will get a forward
+			 * allocation.
+			 * We try to avoid choosing the segment at "bend",
+			 * because that can lead to the next allocation
+			 * taking the segment at "fbno", which would be a
+			 * backward allocation.  We only use the segment at
+			 * "fbno" if it is much larger than the current
+			 * requested size, because in that case there's a
+			 * good chance subsequent allocations will be
+			 * contiguous.
+			 */
+			if (bbno - fbno >= args->maxlen) {
+				/* left candidate fits perfect */
+				fend = bbno;
+			} else if (fend - bend >= args->maxlen * 4) {
+				/* right candidate has enough free space */
+				fbno = bend;
+			} else if (bbno - fbno >= args->minlen) {
+				/* left candidate fits minimum requirement */
+				fend = bbno;
+			} else {
+				goto fail;
+			}
+		}
+
+		flen = fend - fbno;
+	}
+	spin_unlock(&args->pag->pagb_lock);
+
+	if (fbno != bno || flen != len) {
+		trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len,
+					  fbno, flen);
+	}
+	*rbno = fbno;
+	*rlen = flen;
+	return;
+fail:
+	/*
+	 * Return a zero extent length as failure indications.  All callers
+	 * re-check if the trimmed extent satisfies the minlen requirement.
+	 */
+	spin_unlock(&args->pag->pagb_lock);
+	trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len, fbno, 0);
+	*rbno = fbno;
+	*rlen = 0;
+}
+
+static void
+xfs_alloc_busy_clear_one(
+	struct xfs_mount	*mp,
+	struct xfs_perag	*pag,
+	struct xfs_busy_extent	*busyp)
+{
+	if (busyp->length) {
+		trace_xfs_alloc_busy_clear(mp, busyp->agno, busyp->bno,
+						busyp->length);
+		rb_erase(&busyp->rb_node, &pag->pagb_tree);
+	}
+
+	list_del_init(&busyp->list);
 	kmem_free(busyp);
 }
+
+void
+xfs_alloc_busy_clear(
+	struct xfs_mount	*mp,
+	struct list_head	*list)
+{
+	struct xfs_busy_extent	*busyp, *n;
+	struct xfs_perag	*pag = NULL;
+	xfs_agnumber_t		agno = NULLAGNUMBER;
+
+	list_for_each_entry_safe(busyp, n, list, list) {
+		if (busyp->agno != agno) {
+			if (pag) {
+				spin_unlock(&pag->pagb_lock);
+				xfs_perag_put(pag);
+			}
+			pag = xfs_perag_get(mp, busyp->agno);
+			spin_lock(&pag->pagb_lock);
+			agno = busyp->agno;
+		}
+
+		xfs_alloc_busy_clear_one(mp, pag, busyp);
+	}
+
+	if (pag) {
+		spin_unlock(&pag->pagb_lock);
+		xfs_perag_put(pag);
+	}
+}
+
+/*
+ * Callback for list_sort to sort busy extents by the AG they reside in.
+ */
+int
+xfs_busy_extent_ag_cmp(
+	void			*priv,
+	struct list_head	*a,
+	struct list_head	*b)
+{
+	return container_of(a, struct xfs_busy_extent, list)->agno -
+		container_of(b, struct xfs_busy_extent, list)->agno;
+}
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index d0b3bc7..240ad28 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -140,11 +140,24 @@ xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno,
 	xfs_agblock_t bno, xfs_extlen_t len);
 
 void
-xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp);
+xfs_alloc_busy_clear(struct xfs_mount *mp, struct list_head *list);
 
 int
 xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
 	xfs_agblock_t bno, xfs_extlen_t len);
+
+void
+xfs_alloc_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno,
+	xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata);
+
+int
+xfs_busy_extent_ag_cmp(void *priv, struct list_head *a, struct list_head *b);
+
+static inline void xfs_alloc_busy_sort(struct list_head *list)
+{
+	list_sort(NULL, list, xfs_busy_extent_ag_cmp);
+}
+
 #endif	/* __KERNEL__ */
 
 /*
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index 3916925..8b469d5 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -95,6 +95,8 @@ xfs_allocbt_alloc_block(
 		return 0;
 	}
 
+	xfs_alloc_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false);
+
 	xfs_trans_agbtree_delta(cur->bc_tp, 1);
 	new->s = cpu_to_be32(bno);
 
@@ -118,17 +120,6 @@ xfs_allocbt_free_block(
 	if (error)
 		return error;
 
-	/*
-	 * Since blocks move to the free list without the coordination used in
-	 * xfs_bmap_finish, we can't allow block to be available for
-	 * reallocation and non-transaction writing (user data) until we know
-	 * that the transaction that moved it to the free list is permanently
-	 * on disk. We track the blocks by declaring these blocks as "busy";
-	 * the busy list is maintained on a per-ag basis and each transaction
-	 * records which entries should be removed when the iclog commits to
-	 * disk. If a busy block is allocated, the iclog is pushed up to the
-	 * LSN that freed the block.
-	 */
 	xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
 	xfs_trans_agbtree_delta(cur->bc_tp, -1);
 	return 0;
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index be628677..9a84a85 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -202,7 +202,7 @@ xfs_swap_extents(
 	xfs_inode_t	*tip,	/* tmp inode */
 	xfs_swapext_t	*sxp)
 {
-	xfs_mount_t	*mp;
+	xfs_mount_t	*mp = ip->i_mount;
 	xfs_trans_t	*tp;
 	xfs_bstat_t	*sbp = &sxp->sx_stat;
 	xfs_ifork_t	*tempifp, *ifp, *tifp;
@@ -212,16 +212,12 @@ xfs_swap_extents(
 	int		taforkblks = 0;
 	__uint64_t	tmp;
 
-	mp = ip->i_mount;
-
 	tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
 	if (!tempifp) {
 		error = XFS_ERROR(ENOMEM);
 		goto out;
 	}
 
-	sbp = &sxp->sx_stat;
-
 	/*
 	 * we have to do two separate lock calls here to keep lockdep
 	 * happy. If we try to get all the locks in one call, lock will
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index a37480a..c8e3349 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1354,7 +1354,7 @@ xfs_itruncate_start(
 		return 0;
 	}
 	last_byte = xfs_file_last_byte(ip);
-	trace_xfs_itruncate_start(ip, flags, new_size, toss_start, last_byte);
+	trace_xfs_itruncate_start(ip, new_size, flags, toss_start, last_byte);
 	if (last_byte > toss_start) {
 		if (flags & XFS_ITRUNC_DEFINITE) {
 			xfs_tosspages(ip, toss_start,
@@ -1470,7 +1470,7 @@ xfs_itruncate_finish(
 	 * file but the log buffers containing the free and reallocation
 	 * don't, then we'd end up with garbage in the blocks being freed.
 	 * As long as we make the new_size permanent before actually
-	 * freeing any blocks it doesn't matter if they get writtten to.
+	 * freeing any blocks it doesn't matter if they get written to.
 	 *
 	 * The callers must signal into us whether or not the size
 	 * setting here must be synchronous.  There are a few cases
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 576fdfe..09983a3 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -970,7 +970,6 @@ xfs_iflush_abort(
 {
 	xfs_inode_log_item_t	*iip = ip->i_itemp;
 
-	iip = ip->i_itemp;
 	if (iip) {
 		struct xfs_ail	*ailp = iip->ili_item.li_ailp;
 		if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index b612ce4..2119302 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1449,6 +1449,13 @@ xlog_dealloc_log(xlog_t *log)
 
 	xlog_cil_destroy(log);
 
+	/*
+	 * always need to ensure that the extra buffer does not point to memory
+	 * owned by another log buffer before we free it.
+	 */
+	xfs_buf_set_empty(log->l_xbuf, log->l_iclog_size);
+	xfs_buf_free(log->l_xbuf);
+
 	iclog = log->l_iclog;
 	for (i=0; i<log->l_iclog_bufs; i++) {
 		xfs_buf_free(iclog->ic_bp);
@@ -1458,7 +1465,6 @@ xlog_dealloc_log(xlog_t *log)
 	}
 	spinlock_destroy(&log->l_icloglock);
 
-	xfs_buf_free(log->l_xbuf);
 	log->l_mp->m_log = NULL;
 	kmem_free(log);
 }	/* xlog_dealloc_log */
@@ -3248,13 +3254,6 @@ xfs_log_ticket_get(
 	return ticket;
 }
 
-xlog_tid_t
-xfs_log_get_trans_ident(
-	struct xfs_trans	*tp)
-{
-	return tp->t_ticket->t_tid;
-}
-
 /*
  * Allocate and initialise a new log ticket.
  */
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 3bd3291..78c9039 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -189,8 +189,6 @@ void	  xlog_iodone(struct xfs_buf *);
 struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
 void	  xfs_log_ticket_put(struct xlog_ticket *ticket);
 
-xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp);
-
 void	xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
 				struct xfs_log_vec *log_vector,
 				xfs_lsn_t *commit_lsn, int flags);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 9ca59be..7d56e88 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -361,13 +361,12 @@ xlog_cil_committed(
 	int	abort)
 {
 	struct xfs_cil_ctx	*ctx = args;
-	struct xfs_busy_extent	*busyp, *n;
 
 	xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
 					ctx->start_lsn, abort);
 
-	list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list)
-		xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp);
+	xfs_alloc_busy_sort(&ctx->busy_extents);
+	xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, &ctx->busy_extents);
 
 	spin_lock(&ctx->cil->xc_cil_lock);
 	list_del(&ctx->committing);
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 5864850..2d3b6a4 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -146,6 +146,8 @@ static inline uint xlog_get_client_id(__be32 i)
 					   shutdown */
 #define XLOG_TAIL_WARN		0x10	/* log tail verify warning issued */
 
+typedef __uint32_t xlog_tid_t;
+
 #ifdef __KERNEL__
 /*
  * Below are states for covering allocation transactions.
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 5cc464a..04142ca 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -205,6 +205,35 @@ xlog_bread(
 }
 
 /*
+ * Read at an offset into the buffer. Returns with the buffer in it's original
+ * state regardless of the result of the read.
+ */
+STATIC int
+xlog_bread_offset(
+	xlog_t		*log,
+	xfs_daddr_t	blk_no,		/* block to read from */
+	int		nbblks,		/* blocks to read */
+	xfs_buf_t	*bp,
+	xfs_caddr_t	offset)
+{
+	xfs_caddr_t	orig_offset = XFS_BUF_PTR(bp);
+	int		orig_len = bp->b_buffer_length;
+	int		error, error2;
+
+	error = XFS_BUF_SET_PTR(bp, offset, BBTOB(nbblks));
+	if (error)
+		return error;
+
+	error = xlog_bread_noalign(log, blk_no, nbblks, bp);
+
+	/* must reset buffer pointer even on error */
+	error2 = XFS_BUF_SET_PTR(bp, orig_offset, orig_len);
+	if (error)
+		return error;
+	return error2;
+}
+
+/*
  * Write out the buffer at the given block for the given number of blocks.
  * The buffer is kept locked across the write and is returned locked.
  * This can only be used for synchronous log writes.
@@ -1229,20 +1258,12 @@ xlog_write_log_records(
 		 */
 		ealign = round_down(end_block, sectbb);
 		if (j == 0 && (start_block + endcount > ealign)) {
-			offset = XFS_BUF_PTR(bp);
-			balign = BBTOB(ealign - start_block);
-			error = XFS_BUF_SET_PTR(bp, offset + balign,
-						BBTOB(sectbb));
+			offset = XFS_BUF_PTR(bp) + BBTOB(ealign - start_block);
+			error = xlog_bread_offset(log, ealign, sectbb,
+							bp, offset);
 			if (error)
 				break;
 
-			error = xlog_bread_noalign(log, ealign, sectbb, bp);
-			if (error)
-				break;
-
-			error = XFS_BUF_SET_PTR(bp, offset, bufblks);
-			if (error)
-				break;
 		}
 
 		offset = xlog_align(log, start_block, endcount, bp);
@@ -3448,19 +3469,9 @@ xlog_do_recovery_pass(
 				 *   - order is important.
 				 */
 				wrapped_hblks = hblks - split_hblks;
-				error = XFS_BUF_SET_PTR(hbp,
-						offset + BBTOB(split_hblks),
-						BBTOB(hblks - split_hblks));
-				if (error)
-					goto bread_err2;
-
-				error = xlog_bread_noalign(log, 0,
-							   wrapped_hblks, hbp);
-				if (error)
-					goto bread_err2;
-
-				error = XFS_BUF_SET_PTR(hbp, offset,
-							BBTOB(hblks));
+				error = xlog_bread_offset(log, 0,
+						wrapped_hblks, hbp,
+						offset + BBTOB(split_hblks));
 				if (error)
 					goto bread_err2;
 			}
@@ -3511,19 +3522,9 @@ xlog_do_recovery_pass(
 				 *   _first_, then the log start (LR header end)
 				 *   - order is important.
 				 */
-				error = XFS_BUF_SET_PTR(dbp,
-						offset + BBTOB(split_bblks),
-						BBTOB(bblks - split_bblks));
-				if (error)
-					goto bread_err2;
-
-				error = xlog_bread_noalign(log, wrapped_hblks,
-						bblks - split_bblks,
-						dbp);
-				if (error)
-					goto bread_err2;
-
-				error = XFS_BUF_SET_PTR(dbp, offset, h_size);
+				error = xlog_bread_offset(log, 0,
+						bblks - split_bblks, hbp,
+						offset + BBTOB(split_bblks));
 				if (error)
 					goto bread_err2;
 			}
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index bb3f9a7..b49b823 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1900,7 +1900,7 @@ xfs_mod_incore_sb_batch(
 	uint			nmsb,
 	int			rsvd)
 {
-	xfs_mod_sb_t		*msbp = &msb[0];
+	xfs_mod_sb_t		*msbp;
 	int			error = 0;
 
 	/*
@@ -1910,7 +1910,7 @@ xfs_mod_incore_sb_batch(
 	 * changes will be atomic.
 	 */
 	spin_lock(&mp->m_sb_lock);
-	for (msbp = &msbp[0]; msbp < (msb + nmsb); msbp++) {
+	for (msbp = msb; msbp < (msb + nmsb); msbp++) {
 		ASSERT(msbp->msb_field < XFS_SBS_ICOUNT ||
 		       msbp->msb_field > XFS_SBS_FDBLOCKS);
 
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 7692279..d1f2485 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -608,10 +608,8 @@ STATIC void
 xfs_trans_free(
 	struct xfs_trans	*tp)
 {
-	struct xfs_busy_extent	*busyp, *n;
-
-	list_for_each_entry_safe(busyp, n, &tp->t_busy, list)
-		xfs_alloc_busy_clear(tp->t_mountp, busyp);
+	xfs_alloc_busy_sort(&tp->t_busy);
+	xfs_alloc_busy_clear(tp->t_mountp, &tp->t_busy);
 
 	atomic_dec(&tp->t_mountp->m_active_trans);
 	xfs_trans_free_dqinfo(tp);
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index 26d1867..65584b5 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -73,8 +73,6 @@ typedef	__int32_t	xfs_tid_t;	/* transaction identifier */
 typedef	__uint32_t	xfs_dablk_t;	/* dir/attr block number (in file) */
 typedef	__uint32_t	xfs_dahash_t;	/* dir/attr hash value */
 
-typedef __uint32_t	xlog_tid_t;	/* transaction ID type */
-
 /*
  * These types are 64 bits on disk but are either 32 or 64 bits in memory.
  * Disk based types: