Diffstat (limited to 'sys/gnu/fs/xfs/xfs_mount.c')
-rw-r--r--  sys/gnu/fs/xfs/xfs_mount.c | 1056
1 file changed, 802 insertions(+), 254 deletions(-)
diff --git a/sys/gnu/fs/xfs/xfs_mount.c b/sys/gnu/fs/xfs/xfs_mount.c
index 7c24dbb..7c6e478 100644
--- a/sys/gnu/fs/xfs/xfs_mount.c
+++ b/sys/gnu/fs/xfs/xfs_mount.c
@@ -1,40 +1,26 @@
/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
*
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation.
*
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
*
- * Further, this software is distributed without any warranty that it is
- * free of the rightful claim of any third person regarding infringement
- * or the like. Any license provided herein, whether implied or
- * otherwise, applies only to this software file. Patent licenses, if
- * any, provided herein do not apply to combinations of this program with
- * other software, or any other product whatsoever.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston MA 02111-1307, USA.
- *
- * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
- * Mountain View, CA 94043, or:
- *
- * http://www.sgi.com
- *
- * For further information regarding this notice, see:
- *
- * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
-
#include "xfs.h"
-#include "xfs_macros.h"
+#include "xfs_fs.h"
#include "xfs_types.h"
-#include "xfs_inum.h"
+#include "xfs_bit.h"
#include "xfs_log.h"
+#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
@@ -42,21 +28,20 @@
#include "xfs_dir2.h"
#include "xfs_dmapi.h"
#include "xfs_mount.h"
-#include "xfs_alloc_btree.h"
#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_attr_sf.h"
#include "xfs_dir_sf.h"
#include "xfs_dir2_sf.h"
+#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_ialloc.h"
#include "xfs_alloc.h"
#include "xfs_rtalloc.h"
#include "xfs_bmap.h"
#include "xfs_error.h"
-#include "xfs_bit.h"
#include "xfs_rw.h"
#include "xfs_quota.h"
#include "xfs_fsops.h"
@@ -64,14 +49,34 @@
STATIC void xfs_mount_log_sbunit(xfs_mount_t *, __int64_t);
STATIC int xfs_uuid_mount(xfs_mount_t *);
STATIC void xfs_uuid_unmount(xfs_mount_t *mp);
+STATIC void xfs_unmountfs_wait(xfs_mount_t *);
+
+#ifdef HAVE_PERCPU_SB
+STATIC void xfs_icsb_destroy_counters(xfs_mount_t *);
+STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t, int);
+STATIC void xfs_icsb_sync_counters(xfs_mount_t *);
+STATIC int xfs_icsb_modify_counters(xfs_mount_t *, xfs_sb_field_t,
+ int, int);
+STATIC int xfs_icsb_modify_counters_locked(xfs_mount_t *, xfs_sb_field_t,
+ int, int);
+STATIC int xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t);
+#else
+#define xfs_icsb_destroy_counters(mp) do { } while (0)
+#define xfs_icsb_balance_counter(mp, a, b) do { } while (0)
+#define xfs_icsb_sync_counters(mp) do { } while (0)
+#define xfs_icsb_modify_counters(mp, a, b, c) do { } while (0)
+#define xfs_icsb_modify_counters_locked(mp, a, b, c) do { } while (0)
+
+#endif
+
-static struct {
- short offset;
- short type; /* 0 = integer
- * 1 = binary / string (no translation)
- */
+static const struct {
+ short offset;
+ short type; /* 0 = integer
+ * 1 = binary / string (no translation)
+ */
} xfs_sb_info[] = {
{ offsetof(xfs_sb_t, sb_magicnum), 0 },
{ offsetof(xfs_sb_t, sb_blocksize), 0 },
@@ -117,6 +122,7 @@ static struct {
{ offsetof(xfs_sb_t, sb_logsectlog), 0 },
{ offsetof(xfs_sb_t, sb_logsectsize),0 },
{ offsetof(xfs_sb_t, sb_logsunit), 0 },
+ { offsetof(xfs_sb_t, sb_features2), 0 },
{ sizeof(xfs_sb_t), 0 }
};
@@ -128,30 +134,24 @@ xfs_mount_init(void)
{
xfs_mount_t *mp;
- mp = kmem_zalloc(sizeof(*mp), KM_SLEEP);
+ mp = kmem_zalloc(sizeof(xfs_mount_t), KM_SLEEP);
+
+ if (xfs_icsb_init_counters(mp)) {
+ mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB;
+ }
AIL_LOCKINIT(&mp->m_ail_lock, "xfs_ail");
spinlock_init(&mp->m_sb_lock, "xfs_sb");
- mutex_init(&mp->m_ilock, MUTEX_DEFAULT, "xfs_ilock");
+ /* FreeBSD specific */
+ sx_init(&mp->m_ilock, "xfs_mnt");
initnsema(&mp->m_growlock, 1, "xfs_grow");
/*
* Initialize the AIL.
*/
xfs_trans_ail_init(mp);
- /* Init freeze sync structures */
- spinlock_init(&mp->m_freeze_lock, "xfs_freeze");
- init_sv(&mp->m_wait_unfreeze, SV_DEFAULT, "xfs_freeze", 0);
atomic_set(&mp->m_active_trans, 0);
- /*
- * Moved from elsewhere. Linux initializes those 'on demand' part
- * way through mount code. This spells trouble for FreeBSD and
- * WITNESS because the mount can fail and we hit lock destruction
- * code for locks that might not even been initialized yet.
- */
- spinlock_init(&mp->m_agirotor_lock, "m_agirotor_lock");
-
return mp;
}
@@ -162,8 +162,8 @@ xfs_mount_init(void)
*/
void
xfs_mount_free(
- xfs_mount_t *mp,
- int remove_bhv)
+ xfs_mount_t *mp,
+ int remove_bhv)
{
if (mp->m_ihash)
xfs_ihash_free(mp);
@@ -173,40 +173,38 @@ xfs_mount_free(
if (mp->m_perag) {
int agno;
- for (agno = 0; agno < mp->m_maxagi; agno++) {
- if (mp->m_perag[agno].pagf_init)
- spinlock_destroy(&mp->m_perag[agno].pagb_lock);
+ for (agno = 0; agno < mp->m_maxagi; agno++)
if (mp->m_perag[agno].pagb_list)
kmem_free(mp->m_perag[agno].pagb_list,
sizeof(xfs_perag_busy_t) *
XFS_PAGB_NUM_SLOTS);
- }
kmem_free(mp->m_perag,
sizeof(xfs_perag_t) * mp->m_sb.sb_agcount);
-
- free_rwsem(&mp->m_peraglock);
}
AIL_LOCK_DESTROY(&mp->m_ail_lock);
spinlock_destroy(&mp->m_sb_lock);
- spinlock_destroy(&mp->m_agirotor_lock);
- mutex_destroy(&mp->m_ilock);
+ /* FreeBSD specific */
+ sx_destroy(&mp->m_ilock);
freesema(&mp->m_growlock);
if (mp->m_quotainfo)
XFS_QM_DONE(mp);
if (mp->m_fsname != NULL)
kmem_free(mp->m_fsname, mp->m_fsname_len);
+ if (mp->m_rtname != NULL)
+ kmem_free(mp->m_rtname, strlen(mp->m_rtname) + 1);
+ if (mp->m_logname != NULL)
+ kmem_free(mp->m_logname, strlen(mp->m_logname) + 1);
if (remove_bhv) {
- struct xfs_vfs *vfsp = XFS_MTOVFS(mp);
+ xfs_vfs_t *vfsp = XFS_MTOVFS(mp);
bhv_remove_all_vfsops(vfsp, 0);
VFS_REMOVEBHV(vfsp, &mp->m_bhv);
}
- spinlock_destroy(&mp->m_freeze_lock);
- sv_destroy(&mp->m_wait_unfreeze);
+ xfs_icsb_destroy_counters(mp);
kmem_free(mp, sizeof(xfs_mount_t));
}
@@ -217,7 +215,8 @@ xfs_mount_free(
STATIC int
xfs_mount_validate_sb(
xfs_mount_t *mp,
- xfs_sb_t *sbp)
+ xfs_sb_t *sbp,
+ int flags)
{
/*
* If the log device and data device have the
@@ -227,33 +226,29 @@ xfs_mount_validate_sb(
* a volume filesystem in a non-volume manner.
*/
if (sbp->sb_magicnum != XFS_SB_MAGIC) {
- cmn_err(CE_WARN, "XFS: bad magic number");
+ xfs_fs_mount_cmn_err(flags, "bad magic number");
return XFS_ERROR(EWRONGFS);
}
if (!XFS_SB_GOOD_VERSION(sbp)) {
- cmn_err(CE_WARN, "XFS: bad version");
+ xfs_fs_mount_cmn_err(flags, "bad version");
return XFS_ERROR(EWRONGFS);
}
if (unlikely(
sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
- cmn_err(CE_WARN,
- "XFS: filesystem is marked as having an external log; "
- "specify logdev on the\nmount command line.");
- XFS_CORRUPTION_ERROR("xfs_mount_validate_sb(1)",
- XFS_ERRLEVEL_HIGH, mp, sbp);
- return XFS_ERROR(EFSCORRUPTED);
+ xfs_fs_mount_cmn_err(flags,
+ "filesystem is marked as having an external log; "
+ "specify logdev on the\nmount command line.");
+ return XFS_ERROR(EINVAL);
}
if (unlikely(
sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) {
- cmn_err(CE_WARN,
- "XFS: filesystem is marked as having an internal log; "
- "don't specify logdev on\nthe mount command line.");
- XFS_CORRUPTION_ERROR("xfs_mount_validate_sb(2)",
- XFS_ERRLEVEL_HIGH, mp, sbp);
- return XFS_ERROR(EFSCORRUPTED);
+ xfs_fs_mount_cmn_err(flags,
+ "filesystem is marked as having an internal log; "
+ "do not specify logdev on\nthe mount command line.");
+ return XFS_ERROR(EINVAL);
}
/*
@@ -272,12 +267,13 @@ xfs_mount_validate_sb(
sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG ||
sbp->sb_inodesize < XFS_DINODE_MIN_SIZE ||
sbp->sb_inodesize > XFS_DINODE_MAX_SIZE ||
+ sbp->sb_inodelog < XFS_DINODE_MIN_LOG ||
+ sbp->sb_inodelog > XFS_DINODE_MAX_LOG ||
+ (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) ||
(sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) ||
(sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) ||
- sbp->sb_imax_pct > 100)) {
- cmn_err(CE_WARN, "XFS: SB sanity check 1 failed");
- XFS_CORRUPTION_ERROR("xfs_mount_validate_sb(3)",
- XFS_ERRLEVEL_LOW, mp, sbp);
+ (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */))) {
+ xfs_fs_mount_cmn_err(flags, "SB sanity check 1 failed");
return XFS_ERROR(EFSCORRUPTED);
}
@@ -290,40 +286,50 @@ xfs_mount_validate_sb(
(xfs_drfsbno_t)sbp->sb_agcount * sbp->sb_agblocks ||
sbp->sb_dblocks < (xfs_drfsbno_t)(sbp->sb_agcount - 1) *
sbp->sb_agblocks + XFS_MIN_AG_BLOCKS)) {
- cmn_err(CE_WARN, "XFS: SB sanity check 2 failed");
- XFS_ERROR_REPORT("xfs_mount_validate_sb(4)",
- XFS_ERRLEVEL_LOW, mp);
+ xfs_fs_mount_cmn_err(flags, "SB sanity check 2 failed");
return XFS_ERROR(EFSCORRUPTED);
}
-#if !XFS_BIG_BLKNOS
+ ASSERT(PAGE_SHIFT >= sbp->sb_blocklog);
+ ASSERT(sbp->sb_blocklog >= BBSHIFT);
+
+#if XFS_BIG_BLKNOS /* Limited by ULONG_MAX of page cache index */
if (unlikely(
- (sbp->sb_dblocks << (__uint64_t)(sbp->sb_blocklog - BBSHIFT))
- > UINT_MAX ||
- (sbp->sb_rblocks << (__uint64_t)(sbp->sb_blocklog - BBSHIFT))
- > UINT_MAX)) {
- cmn_err(CE_WARN,
- "XFS: File system is too large to be mounted on this system.");
+ (sbp->sb_dblocks >> (PAGE_SHIFT - sbp->sb_blocklog)) > ULONG_MAX ||
+ (sbp->sb_rblocks >> (PAGE_SHIFT - sbp->sb_blocklog)) > ULONG_MAX)) {
+#else /* Limited by UINT_MAX of sectors */
+ if (unlikely(
+ (sbp->sb_dblocks << (sbp->sb_blocklog - BBSHIFT)) > UINT_MAX ||
+ (sbp->sb_rblocks << (sbp->sb_blocklog - BBSHIFT)) > UINT_MAX)) {
+#endif
+ xfs_fs_mount_cmn_err(flags,
+ "file system too large to be mounted on this system.");
return XFS_ERROR(E2BIG);
}
-#endif
if (unlikely(sbp->sb_inprogress)) {
- cmn_err(CE_WARN, "XFS: file system busy");
- XFS_ERROR_REPORT("xfs_mount_validate_sb(5)",
- XFS_ERRLEVEL_LOW, mp);
+ xfs_fs_mount_cmn_err(flags, "file system busy");
return XFS_ERROR(EFSCORRUPTED);
}
/*
+ * Version 1 directory format has never worked on Linux.
+ */
+ if (unlikely(!XFS_SB_VERSION_HASDIRV2(sbp))) {
+ xfs_fs_mount_cmn_err(flags,
+ "file system using version 1 directory format");
+ return XFS_ERROR(ENOSYS);
+ }
+
+ /*
* Until this is fixed only page-sized or smaller data blocks work.
*/
if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
- cmn_err(CE_WARN,
- "XFS: Attempted to mount file system with blocksize %d bytes",
+ xfs_fs_mount_cmn_err(flags,
+ "file system with blocksize %d bytes",
sbp->sb_blocksize);
- cmn_err(CE_WARN,
- "XFS: Only page-sized (%d) or less blocksizes currently work.",
+ xfs_fs_mount_cmn_err(flags,
+ "only pagesize (%ld) or less will currently work.",
PAGE_SIZE);
return XFS_ERROR(ENOSYS);
}
@@ -331,10 +337,13 @@ xfs_mount_validate_sb(
return 0;
}
-void
-xfs_initialize_perag(xfs_mount_t *mp, int agcount)
+xfs_agnumber_t
+xfs_initialize_perag(
+ struct xfs_vfs *vfs,
+ xfs_mount_t *mp,
+ xfs_agnumber_t agcount)
{
- int index, max_metadata;
+ xfs_agnumber_t index, max_metadata;
xfs_perag_t *pag;
xfs_agino_t agino;
xfs_ino_t ino;
@@ -348,7 +357,7 @@ xfs_initialize_perag(xfs_mount_t *mp, int agcount)
/* Clear the mount flag if no inode can overflow 32 bits
* on this filesystem, or if specifically requested..
*/
- if ((mp->m_flags & XFS_MOUNT_32BITINOOPT) && ino > max_inum) {
+ if ((vfs->vfs_flag & VFS_32BITINODES) && ino > max_inum) {
mp->m_flags |= XFS_MOUNT_32BITINODES;
} else {
mp->m_flags &= ~XFS_MOUNT_32BITINODES;
@@ -365,7 +374,7 @@ xfs_initialize_perag(xfs_mount_t *mp, int agcount)
icount = sbp->sb_dblocks * sbp->sb_imax_pct;
do_div(icount, 100);
icount += sbp->sb_agblocks - 1;
- do_div(icount, mp->m_ialloc_blks);
+ do_div(icount, sbp->sb_agblocks);
max_metadata = icount;
} else {
max_metadata = agcount;
@@ -377,7 +386,7 @@ xfs_initialize_perag(xfs_mount_t *mp, int agcount)
break;
}
- /* This ag is prefered for inodes */
+ /* This ag is preferred for inodes */
pag = &mp->m_perag[index];
pag->pagi_inodeok = 1;
if (index < max_metadata)
@@ -390,7 +399,7 @@ xfs_initialize_perag(xfs_mount_t *mp, int agcount)
pag->pagi_inodeok = 1;
}
}
- mp->m_maxagi = index;
+ return index;
}
/*
@@ -400,7 +409,6 @@ xfs_initialize_perag(xfs_mount_t *mp, int agcount)
* sb - a superblock
* dir - conversion direction: <0 - convert sb to buf
* >0 - convert buf to sb
- * arch - architecture to read/write from/to buf
* fields - which fields to copy (bitmask)
*/
void
@@ -408,7 +416,6 @@ xfs_xlatesb(
void *data,
xfs_sb_t *sb,
int dir,
- xfs_arch_t arch,
__int64_t fields)
{
xfs_caddr_t buf_ptr;
@@ -433,9 +440,7 @@ xfs_xlatesb(
ASSERT(xfs_sb_info[f].type == 0 || xfs_sb_info[f].type == 1);
- if (arch == ARCH_NOCONVERT ||
- size == 1 ||
- xfs_sb_info[f].type == 1) {
+ if (size == 1 || xfs_sb_info[f].type == 1) {
if (dir > 0) {
memcpy(mem_ptr + first, buf_ptr + first, size);
} else {
@@ -446,16 +451,16 @@ xfs_xlatesb(
case 2:
INT_XLATE(*(__uint16_t*)(buf_ptr+first),
*(__uint16_t*)(mem_ptr+first),
- dir, arch);
+ dir, ARCH_CONVERT);
break;
case 4:
INT_XLATE(*(__uint32_t*)(buf_ptr+first),
*(__uint32_t*)(mem_ptr+first),
- dir, arch);
+ dir, ARCH_CONVERT);
break;
case 8:
INT_XLATE(*(__uint64_t*)(buf_ptr+first),
- *(__uint64_t*)(mem_ptr+first), dir, arch);
+ *(__uint64_t*)(mem_ptr+first), dir, ARCH_CONVERT);
break;
default:
ASSERT(0);
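
The xfs_sb_info[] table shown earlier drives this translation: each entry records a field's byte offset, the next entry's offset gives the field's size, and the type flag selects byte-swapping versus a raw copy (the trailing sizeof(xfs_sb_t) entry is a sentinel). A minimal user-space sketch of the same technique, with hypothetical names (demo_sb, demo_sb_info, demo_xlate) and an unconditional byte swap standing in for INT_XLATE, which only swaps when host and on-disk byte order differ; the sketch assumes a little-endian host against big-endian on-disk data:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    struct demo_sb {
        uint32_t magicnum;
        uint32_t blocksize;
        uint16_t sectsize;
        uint16_t inodesize;
        uint8_t  uuid[16];          /* binary: copied, never byte-swapped */
    };

    static const struct { short offset; short type; } demo_sb_info[] = {
        { offsetof(struct demo_sb, magicnum),  0 },
        { offsetof(struct demo_sb, blocksize), 0 },
        { offsetof(struct demo_sb, sectsize),  0 },
        { offsetof(struct demo_sb, inodesize), 0 },
        { offsetof(struct demo_sb, uuid),      1 },
        { sizeof(struct demo_sb),              0 },  /* sentinel: end offset */
    };

    /* dir > 0: disk buffer to in-core sb; dir < 0: in-core sb to disk buffer */
    static void demo_xlate(void *buf, struct demo_sb *sb, int dir)
    {
        char *bp = buf, *mp = (char *)sb;
        size_t nfields = sizeof(demo_sb_info) / sizeof(demo_sb_info[0]) - 1;

        for (size_t f = 0; f < nfields; f++) {
            int first = demo_sb_info[f].offset;
            int size  = demo_sb_info[f + 1].offset - first;

            if (demo_sb_info[f].type == 1 || size == 1) {
                /* binary/string field: straight copy, no translation */
                memcpy(dir > 0 ? mp + first : bp + first,
                       dir > 0 ? bp + first : mp + first, size);
            } else if (size == 2) {
                uint16_t v;
                memcpy(&v, (dir > 0 ? bp : mp) + first, 2);
                v = (uint16_t)__builtin_bswap16(v);
                memcpy((dir > 0 ? mp : bp) + first, &v, 2);
            } else if (size == 4) {
                uint32_t v;
                memcpy(&v, (dir > 0 ? bp : mp) + first, 4);
                v = __builtin_bswap32(v);
                memcpy((dir > 0 ? mp : bp) + first, &v, 4);
            }
        }
    }

The appeal of the table is that adding a superblock field (as this patch does with sb_features2) only requires a new table entry; the translation loop itself never changes.
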
@@ -472,7 +477,7 @@ xfs_xlatesb(
* Does the initial read of the superblock.
*/
int
-xfs_readsb(xfs_mount_t *mp)
+xfs_readsb(xfs_mount_t *mp, int flags)
{
unsigned int sector_size;
unsigned int extra_flags;
@@ -489,12 +494,12 @@ xfs_readsb(xfs_mount_t *mp)
* access to the superblock.
*/
sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
- extra_flags = XFS_BUF_LOCK | XFS_BUF_MANAGE | XFS_BUF_MAPPED;
+ extra_flags = XFS_BUF_LOCK | XFS_BUF_MANAGE | XFS_BUF_MAPPED;
+
+ bp = xfs_getsb(mp, 0);
- bp = xfs_buf_read_flags(mp->m_ddev_targp, XFS_SB_DADDR,
- BTOBB(sector_size), extra_flags);
if (!bp || XFS_BUF_ISERROR(bp)) {
- cmn_err(CE_WARN, "XFS: SB read failed");
+ xfs_fs_mount_cmn_err(flags, "SB read failed");
error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM;
goto fail;
}
@@ -506,12 +511,11 @@ xfs_readsb(xfs_mount_t *mp)
* But first do some basic consistency checking.
*/
sbp = XFS_BUF_TO_SBP(bp);
- xfs_xlatesb(XFS_BUF_PTR(bp), &(mp->m_sb), 1,
- ARCH_CONVERT, XFS_SB_ALL_BITS);
+ xfs_xlatesb(XFS_BUF_PTR(bp), &(mp->m_sb), 1, XFS_SB_ALL_BITS);
- error = xfs_mount_validate_sb(mp, &(mp->m_sb));
+ error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags);
if (error) {
- cmn_err(CE_WARN, "XFS: SB validate failed");
+ xfs_fs_mount_cmn_err(flags, "SB validate failed");
goto fail;
}
@@ -519,8 +523,8 @@ xfs_readsb(xfs_mount_t *mp)
* We must be able to do sector-sized and sector-aligned IO.
*/
if (sector_size > mp->m_sb.sb_sectsize) {
- cmn_err(CE_WARN,
- "XFS: device supports only %u byte sectors (not %u)",
+ xfs_fs_mount_cmn_err(flags,
+ "device supports only %u byte sectors (not %u)",
sector_size, mp->m_sb.sb_sectsize);
error = ENOSYS;
goto fail;
@@ -537,7 +541,7 @@ xfs_readsb(xfs_mount_t *mp)
bp = xfs_buf_read_flags(mp->m_ddev_targp, XFS_SB_DADDR,
BTOBB(sector_size), extra_flags);
if (!bp || XFS_BUF_ISERROR(bp)) {
- cmn_err(CE_WARN, "XFS: SB re-read failed");
+ xfs_fs_mount_cmn_err(flags, "SB re-read failed");
error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM;
goto fail;
}
@@ -545,6 +549,10 @@ xfs_readsb(xfs_mount_t *mp)
ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
}
+ xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0);
+ xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0);
+ xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0);
+
mp->m_sb_bp = bp;
xfs_buf_relse(bp);
ASSERT(XFS_BUF_VALUSEMA(bp) > 0);
@@ -572,6 +580,7 @@ xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
int i;
mp->m_agfrotor = mp->m_agirotor = 0;
+ spinlock_init(&mp->m_agirotor_lock, "m_agirotor_lock");
mp->m_maxagi = mp->m_sb.sb_agcount;
mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG;
mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT;
@@ -583,7 +592,9 @@ xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
mp->m_blockmask = sbp->sb_blocksize - 1;
mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
mp->m_blockwmask = mp->m_blockwsize - 1;
-
+#ifdef RMC
+ INIT_LIST_HEAD(&mp->m_del_inodes);
+#endif
TAILQ_INIT(&mp->m_del_inodes);
/*
@@ -594,12 +605,13 @@ xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
ASSERT(sbp->sb_inodesize >= 256 && sbp->sb_inodesize <= 2048);
switch (sbp->sb_inodesize) {
case 256:
- mp->m_attroffset = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(2);
+ mp->m_attroffset = XFS_LITINO(mp) -
+ XFS_BMDR_SPACE_CALC(MINABTPTRS);
break;
case 512:
case 1024:
case 2048:
- mp->m_attroffset = XFS_BMDR_SPACE_CALC(12);
+ mp->m_attroffset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS);
break;
default:
ASSERT(0);
@@ -651,9 +663,8 @@ xfs_mountfs(
xfs_buf_t *bp;
xfs_sb_t *sbp = &(mp->m_sb);
xfs_inode_t *rip;
- xfs_vnode_t *rvp = 0;
+ xfs_vnode_t *rvp = NULL;
int readio_log, writeio_log;
- vmap_t vmap;
xfs_daddr_t d;
__uint64_t ret64;
__int64_t update_flags;
@@ -663,8 +674,8 @@ xfs_mountfs(
int error = 0;
if (mp->m_sb_bp == NULL) {
- if ((error = xfs_readsb(mp))) {
- return (error);
+ if ((error = xfs_readsb(mp, mfsi_flags))) {
+ return error;
}
}
xfs_mount_common(mp, sbp);
@@ -700,14 +711,21 @@ xfs_mountfs(
error = XFS_ERROR(EINVAL);
goto error1;
}
+ xfs_fs_cmn_err(CE_WARN, mp,
+"stripe alignment turned off: sunit(%d)/swidth(%d) incompatible with agsize(%d)",
+ mp->m_dalign, mp->m_swidth,
+ sbp->sb_agblocks);
+
mp->m_dalign = 0;
mp->m_swidth = 0;
} else if (mp->m_dalign) {
mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
} else {
if (mp->m_flags & XFS_MOUNT_RETERR) {
- cmn_err(CE_WARN,
- "XFS: alignment check 3 failed");
+ xfs_fs_cmn_err(CE_WARN, mp,
+"stripe alignment turned off: sunit(%d) less than bsize(%d)",
+ mp->m_dalign,
+ mp->m_blockmask + 1);
error = XFS_ERROR(EINVAL);
goto error1;
}
@@ -900,7 +918,7 @@ xfs_mountfs(
* For client case we are done now
*/
if (mfsi_flags & XFS_MFSI_CLIENT) {
- return(0);
+ return 0;
}
/*
@@ -958,7 +976,7 @@ xfs_mountfs(
mp->m_perag =
kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t), KM_SLEEP);
- xfs_initialize_perag(mp, sbp->sb_agcount);
+ mp->m_maxagi = xfs_initialize_perag(vfsp, mp, sbp->sb_agcount);
/*
* log's mount-time initialization. Perform 1st part recovery if needed
@@ -982,7 +1000,7 @@ xfs_mountfs(
* Get and sanity-check the root inode.
* Save the pointer to it in the mount structure.
*/
- error = xfs_iget(mp, NULL, sbp->sb_rootino, XFS_ILOCK_EXCL, &rip, 0);
+ error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip, 0);
if (error) {
cmn_err(CE_WARN, "XFS: failed to read root inode");
goto error3;
@@ -990,12 +1008,11 @@ xfs_mountfs(
ASSERT(rip != NULL);
rvp = XFS_ITOV(rip);
- VMAP(rvp, vmap);
if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) {
cmn_err(CE_WARN, "XFS: corrupted root inode");
- prdev("Root inode %llu is not a directory",
- mp->m_ddev_targp, (unsigned long long)rip->i_ino);
+ printf("Root inode %p is not a directory: %llu",
+ mp->m_ddev_targp, (unsigned long long)rip->i_ino);
xfs_iunlock(rip, XFS_ILOCK_EXCL);
XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW,
mp);
@@ -1044,7 +1061,7 @@ xfs_mountfs(
/*
* Complete the quota initialisation, post-log-replay component.
*/
- if ((error = XFS_QM_MOUNT(mp, quotamount, quotaflags)))
+ if ((error = XFS_QM_MOUNT(mp, quotamount, quotaflags, mfsi_flags)))
goto error4;
return 0;
@@ -1054,7 +1071,6 @@ xfs_mountfs(
* Free up the root inode.
*/
VN_RELE(rvp);
- vn_purge(rvp, &vmap);
error3:
xfs_log_unmount_dealloc(mp);
error2:
@@ -1088,10 +1104,9 @@ xfs_unmountfs(xfs_mount_t *mp, struct cred *cr)
int64_t fsid;
#endif
- xfs_iflush_all(mp, XFS_FLUSH_ALL);
+ xfs_iflush_all(mp);
- XFS_QM_DQPURGEALL(mp,
- XFS_QMOPT_UQUOTA | XFS_QMOPT_GQUOTA | XFS_QMOPT_UMOUNTING);
+ XFS_QM_DQPURGEALL(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING);
/*
* Flush out the log synchronously so that we know for sure
@@ -1107,6 +1122,8 @@ xfs_unmountfs(xfs_mount_t *mp, struct cred *cr)
xfs_unmountfs_writesb(mp);
+ xfs_unmountfs_wait(mp); /* wait for async bufs */
+
xfs_log_unmount(mp); /* Done! No more fs ops. */
xfs_freesb(mp);
@@ -1114,15 +1131,10 @@ xfs_unmountfs(xfs_mount_t *mp, struct cred *cr)
/*
* All inodes from this mount point should be freed.
*/
- ASSERT(mp->m_inodes == NULL);
-
- /*
- * We may have bufs that are in the process of getting written still.
- * We must wait for the I/O completion of those. The sync flag here
- * does a two pass iteration thru the bufcache.
- */
- if (XFS_FORCED_SHUTDOWN(mp)) {
- xfs_incore_relse(mp->m_ddev_targp, 0, 1); /* synchronous */
+ //ASSERT(mp->m_inodes == NULL);
+ if (mp->m_inodes != NULL) {
+ printf("WRONG: mp->m_ireclaims: %d\n", mp->m_ireclaims);
+ printf("WRONG: mp->m_inodes: %p\n", mp->m_inodes);
}
xfs_unmountfs_close(mp, cr);
@@ -1144,23 +1156,21 @@ xfs_unmountfs(xfs_mount_t *mp, struct cred *cr)
void
xfs_unmountfs_close(xfs_mount_t *mp, struct cred *cr)
{
- int have_logdev = (mp->m_logdev_targp != mp->m_ddev_targp);
+ if (mp->m_logdev_targp != mp->m_ddev_targp)
+ xfs_free_buftarg(mp->m_logdev_targp, 1);
+ if (mp->m_rtdev_targp)
+ xfs_free_buftarg(mp->m_rtdev_targp, 1);
+ xfs_free_buftarg(mp->m_ddev_targp, 0);
+}
- if (mp->m_ddev_targp) {
- xfs_blkdev_put(mp->m_ddev_targp->specvp); /* FreeBSD non-portable */
- xfs_free_buftarg(mp->m_ddev_targp);
- mp->m_ddev_targp = NULL;
- }
- if (mp->m_rtdev_targp) {
- xfs_blkdev_put(mp->m_rtdev_targp->specvp); /* FreeBSD non-portable */
- xfs_free_buftarg(mp->m_rtdev_targp);
- mp->m_rtdev_targp = NULL;
- }
- if (mp->m_logdev_targp && have_logdev) {
- xfs_blkdev_put(mp->m_logdev_targp->specvp); /* FreeBSD non-portable */
- xfs_free_buftarg(mp->m_logdev_targp);
- mp->m_logdev_targp = NULL;
- }
+STATIC void
+xfs_unmountfs_wait(xfs_mount_t *mp)
+{
+ if (mp->m_logdev_targp != mp->m_ddev_targp)
+ xfs_wait_buftarg(mp->m_logdev_targp);
+ if (mp->m_rtdev_targp)
+ xfs_wait_buftarg(mp->m_rtdev_targp);
+ xfs_wait_buftarg(mp->m_ddev_targp);
}
int
@@ -1177,6 +1187,9 @@ xfs_unmountfs_writesb(xfs_mount_t *mp)
sbp = xfs_getsb(mp, 0);
if (!(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY ||
XFS_FORCED_SHUTDOWN(mp))) {
+
+ xfs_icsb_sync_counters(mp);
+
/*
* mark shared-readonly if desired
*/
@@ -1189,7 +1202,7 @@ xfs_unmountfs_writesb(xfs_mount_t *mp)
xfs_fs_cmn_err(CE_NOTE, mp,
"Unmounting, marking shared read-only");
}
- /* Not on FreeBSD XFS_BUF_UNDONE(sbp); */
+ XFS_BUF_UNDONE(sbp);
XFS_BUF_UNREAD(sbp);
XFS_BUF_UNDELAYWRITE(sbp);
XFS_BUF_WRITE(sbp);
@@ -1205,7 +1218,7 @@ xfs_unmountfs_writesb(xfs_mount_t *mp)
xfs_fs_cmn_err(CE_ALERT, mp, "Superblock write error detected while unmounting. Filesystem may not be marked shared readonly");
}
xfs_buf_relse(sbp);
- return (error);
+ return error;
}
/*
@@ -1236,7 +1249,7 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
/* translate/copy */
- xfs_xlatesb(XFS_BUF_PTR(bp), &(mp->m_sb), -1, ARCH_CONVERT, fields);
+ xfs_xlatesb(XFS_BUF_PTR(bp), &(mp->m_sb), -1, fields);
/* find modified range */
@@ -1250,7 +1263,6 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
xfs_trans_log_buf(tp, bp, first, last);
}
-
/*
* xfs_mod_incore_sb_unlocked() is a utility routine commonly used to apply
* a delta to a specified field in the in-core superblock. Simply
@@ -1260,7 +1272,7 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
*
* The SB_LOCK must be held when this routine is called.
*/
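
Every case in the function below follows the same pattern: widen the superblock field to a signed 64-bit value, apply the delta, and reject the change if the counter would go negative. A standalone sketch of that pattern with a hypothetical helper name (the real function returns XFS_ERROR(EINVAL), or ENOSPC for the space counters; the sketch uses a plain errno value):

    #include <errno.h>
    #include <stdint.h>

    /* Hypothetical helper mirroring one switch case: returns 0 on
     * success, or EINVAL if the delta would drive the counter negative. */
    static int apply_delta(uint64_t *field, int delta)
    {
        long long lcounter = (long long)*field;

        lcounter += delta;
        if (lcounter < 0)
            return EINVAL;      /* counter must never go negative */
        *field = (uint64_t)lcounter;
        return 0;
    }
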
-STATIC int
+int
xfs_mod_incore_sb_unlocked(xfs_mount_t *mp, xfs_sb_field_t field,
int delta, int rsvd)
{
@@ -1280,19 +1292,19 @@ xfs_mod_incore_sb_unlocked(xfs_mount_t *mp, xfs_sb_field_t field,
lcounter += delta;
if (lcounter < 0) {
ASSERT(0);
- return (XFS_ERROR(EINVAL));
+ return XFS_ERROR(EINVAL);
}
mp->m_sb.sb_icount = lcounter;
- return (0);
+ return 0;
case XFS_SBS_IFREE:
lcounter = (long long)mp->m_sb.sb_ifree;
lcounter += delta;
if (lcounter < 0) {
ASSERT(0);
- return (XFS_ERROR(EINVAL));
+ return XFS_ERROR(EINVAL);
}
mp->m_sb.sb_ifree = lcounter;
- return (0);
+ return 0;
case XFS_SBS_FDBLOCKS:
lcounter = (long long)mp->m_sb.sb_fdblocks;
@@ -1319,101 +1331,101 @@ xfs_mod_incore_sb_unlocked(xfs_mount_t *mp, xfs_sb_field_t field,
if (rsvd) {
lcounter = (long long)mp->m_resblks_avail + delta;
if (lcounter < 0) {
- return (XFS_ERROR(ENOSPC));
+ return XFS_ERROR(ENOSPC);
}
mp->m_resblks_avail = lcounter;
- return (0);
+ return 0;
} else { /* not reserved */
- return (XFS_ERROR(ENOSPC));
+ return XFS_ERROR(ENOSPC);
}
}
}
mp->m_sb.sb_fdblocks = lcounter;
- return (0);
+ return 0;
case XFS_SBS_FREXTENTS:
lcounter = (long long)mp->m_sb.sb_frextents;
lcounter += delta;
if (lcounter < 0) {
- return (XFS_ERROR(ENOSPC));
+ return XFS_ERROR(ENOSPC);
}
mp->m_sb.sb_frextents = lcounter;
- return (0);
+ return 0;
case XFS_SBS_DBLOCKS:
lcounter = (long long)mp->m_sb.sb_dblocks;
lcounter += delta;
if (lcounter < 0) {
ASSERT(0);
- return (XFS_ERROR(EINVAL));
+ return XFS_ERROR(EINVAL);
}
mp->m_sb.sb_dblocks = lcounter;
- return (0);
+ return 0;
case XFS_SBS_AGCOUNT:
scounter = mp->m_sb.sb_agcount;
scounter += delta;
if (scounter < 0) {
ASSERT(0);
- return (XFS_ERROR(EINVAL));
+ return XFS_ERROR(EINVAL);
}
mp->m_sb.sb_agcount = scounter;
- return (0);
+ return 0;
case XFS_SBS_IMAX_PCT:
scounter = mp->m_sb.sb_imax_pct;
scounter += delta;
if (scounter < 0) {
ASSERT(0);
- return (XFS_ERROR(EINVAL));
+ return XFS_ERROR(EINVAL);
}
mp->m_sb.sb_imax_pct = scounter;
- return (0);
+ return 0;
case XFS_SBS_REXTSIZE:
scounter = mp->m_sb.sb_rextsize;
scounter += delta;
if (scounter < 0) {
ASSERT(0);
- return (XFS_ERROR(EINVAL));
+ return XFS_ERROR(EINVAL);
}
mp->m_sb.sb_rextsize = scounter;
- return (0);
+ return 0;
case XFS_SBS_RBMBLOCKS:
scounter = mp->m_sb.sb_rbmblocks;
scounter += delta;
if (scounter < 0) {
ASSERT(0);
- return (XFS_ERROR(EINVAL));
+ return XFS_ERROR(EINVAL);
}
mp->m_sb.sb_rbmblocks = scounter;
- return (0);
+ return 0;
case XFS_SBS_RBLOCKS:
lcounter = (long long)mp->m_sb.sb_rblocks;
lcounter += delta;
if (lcounter < 0) {
ASSERT(0);
- return (XFS_ERROR(EINVAL));
+ return XFS_ERROR(EINVAL);
}
mp->m_sb.sb_rblocks = lcounter;
- return (0);
+ return 0;
case XFS_SBS_REXTENTS:
lcounter = (long long)mp->m_sb.sb_rextents;
lcounter += delta;
if (lcounter < 0) {
ASSERT(0);
- return (XFS_ERROR(EINVAL));
+ return XFS_ERROR(EINVAL);
}
mp->m_sb.sb_rextents = lcounter;
- return (0);
+ return 0;
case XFS_SBS_REXTSLOG:
scounter = mp->m_sb.sb_rextslog;
scounter += delta;
if (scounter < 0) {
ASSERT(0);
- return (XFS_ERROR(EINVAL));
+ return XFS_ERROR(EINVAL);
}
mp->m_sb.sb_rextslog = scounter;
- return (0);
+ return 0;
default:
ASSERT(0);
- return (XFS_ERROR(EINVAL));
+ return XFS_ERROR(EINVAL);
}
}
@@ -1429,10 +1441,27 @@ xfs_mod_incore_sb(xfs_mount_t *mp, xfs_sb_field_t field, int delta, int rsvd)
unsigned long s;
int status;
- s = XFS_SB_LOCK(mp);
- status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
- XFS_SB_UNLOCK(mp, s);
- return (status);
+ /* check for per-cpu counters */
+ switch (field) {
+#ifdef HAVE_PERCPU_SB
+ case XFS_SBS_ICOUNT:
+ case XFS_SBS_IFREE:
+ case XFS_SBS_FDBLOCKS:
+ if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
+ status = xfs_icsb_modify_counters(mp, field,
+ delta, rsvd);
+ break;
+ }
+ /* FALLTHROUGH */
+#endif
+ default:
+ s = XFS_SB_LOCK(mp);
+ status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
+ XFS_SB_UNLOCK(mp, s);
+ break;
+ }
+
+ return status;
}
/*
@@ -1468,8 +1497,26 @@ xfs_mod_incore_sb_batch(xfs_mount_t *mp, xfs_mod_sb_t *msb, uint nmsb, int rsvd)
* from the loop so we'll fall into the undo loop
* below.
*/
- status = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field,
- msbp->msb_delta, rsvd);
+ switch (msbp->msb_field) {
+#ifdef HAVE_PERCPU_SB
+ case XFS_SBS_ICOUNT:
+ case XFS_SBS_IFREE:
+ case XFS_SBS_FDBLOCKS:
+ if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
+ status = xfs_icsb_modify_counters_locked(mp,
+ msbp->msb_field,
+ msbp->msb_delta, rsvd);
+ break;
+ }
+ /* FALLTHROUGH */
+#endif
+ default:
+ status = xfs_mod_incore_sb_unlocked(mp,
+ msbp->msb_field,
+ msbp->msb_delta, rsvd);
+ break;
+ }
+
if (status != 0) {
break;
}
@@ -1486,14 +1533,34 @@ xfs_mod_incore_sb_batch(xfs_mount_t *mp, xfs_mod_sb_t *msb, uint nmsb, int rsvd)
if (status != 0) {
msbp--;
while (msbp >= msb) {
- status = xfs_mod_incore_sb_unlocked(mp,
- msbp->msb_field, -(msbp->msb_delta), rsvd);
+ switch (msbp->msb_field) {
+#ifdef HAVE_PERCPU_SB
+ case XFS_SBS_ICOUNT:
+ case XFS_SBS_IFREE:
+ case XFS_SBS_FDBLOCKS:
+ if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
+ status =
+ xfs_icsb_modify_counters_locked(mp,
+ msbp->msb_field,
+ -(msbp->msb_delta),
+ rsvd);
+ break;
+ }
+ /* FALLTHROUGH */
+#endif
+ default:
+ status = xfs_mod_incore_sb_unlocked(mp,
+ msbp->msb_field,
+ -(msbp->msb_delta),
+ rsvd);
+ break;
+ }
ASSERT(status == 0);
msbp--;
}
}
XFS_SB_UNLOCK(mp, s);
- return (status);
+ return status;
}
/*
@@ -1511,19 +1578,35 @@ xfs_getsb(
int flags)
{
xfs_buf_t *bp;
+ int extra_flags = 0;
+ unsigned int sector_size;
+
- ASSERT(mp->m_sb_bp != NULL);
bp = mp->m_sb_bp;
- if (flags & XFS_BUF_TRYLOCK) {
- if (!XFS_BUF_CPSEMA(bp)) {
- return NULL;
- }
- } else {
- XFS_BUF_PSEMA(bp, PRIBIO);
- }
+ sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
+#ifdef NOT
+ /* MANAGED bufs appear broken in FreeBSD,
+ * but it is unclear if we need a persistent superblock,
+ * since we now translate the ondisk superblock to
+ * a separate structure and then translate that
+ * structure back when we want to write the superblock.
+ */
+ extra_flags = XFS_BUF_LOCK | XFS_BUF_MANAGE | XFS_BUF_MAPPED;
+ extra_flags = XFS_BUF_MANAGE;
+#endif
+
+ mp->m_sb_bp = bp = xfs_buf_read_flags(mp->m_ddev_targp,
+ XFS_SB_DADDR,
+ BTOBB(sector_size),
+ extra_flags);
+
XFS_BUF_HOLD(bp);
ASSERT(XFS_BUF_ISDONE(bp));
- return (bp);
+ if (!XFS_BUF_ISDONE(bp)) {
+ printf("xfs_getsb: %p bp flags 0x%x\n", bp, bp->b_flags);
+ }
+ return bp;
}
/*
@@ -1601,59 +1684,524 @@ xfs_mount_log_sbunit(
xfs_trans_commit(tp, 0, NULL);
}
-/* Functions to lock access out of the filesystem for forced
- * shutdown or snapshot.
+
+#ifdef HAVE_PERCPU_SB
+/*
+ * Per-cpu incore superblock counters
+ *
+ * Simple concept, difficult implementation
+ *
+ * Basically, replace the incore superblock counters with a distributed per cpu
+ * counter for contended fields (e.g. free block count).
+ *
+ * Difficulties arise in that the incore sb is used for ENOSPC checking, and
+ * hence needs to be accurately read when we are running low on space. Hence
+ * there is a method to enable and disable the per-cpu counters based on how
+ * much "stuff" is available in them.
+ *
+ * Basically, a counter is enabled if there is enough free resource to justify
+ * running a per-cpu fast-path. If the per-cpu counter runs out (i.e. a local
+ * ENOSPC), then we disable the counters to synchronise all callers and
+ * re-distribute the available resources.
+ *
+ * If, once we redistributed the available resources, we still get a failure,
+ * we disable the per-cpu counter and go through the slow path.
+ *
+ * The slow path is the current xfs_mod_incore_sb() function. This means that
+ * when we disable a per-cpu counter, we need to drain its resources back to
+ * the global superblock. We do this after disabling the counter to prevent
+ * more threads from queueing up on the counter.
+ *
+ * Essentially, this means that we still need a lock in the fast path to enable
+ * synchronisation between the global counters and the per-cpu counters. This
+ * is not a problem because the lock will be local to a CPU almost all the time
+ * and have little contention except when we get to ENOSPC conditions.
+ *
+ * Basically, this lock becomes a barrier that enables us to lock out the fast
+ * path while we do things like enabling and disabling counters and
+ * synchronising the counters.
+ *
+ * Locking rules:
+ *
+ * 1. XFS_SB_LOCK() before picking up per-cpu locks
+ * 2. per-cpu locks always picked up via for_each_online_cpu() order
+ * 3. accurate counter sync requires XFS_SB_LOCK + per cpu locks
+ * 4. modifying per-cpu counters requires holding per-cpu lock
+ * 5. modifying global counters requires holding XFS_SB_LOCK
+ * 6. enabling or disabling a counter requires holding the XFS_SB_LOCK
+ * and _none_ of the per-cpu locks.
+ *
+ * Disabled counters are only ever re-enabled by a balance operation
+ * that results in more free resources per CPU than a given threshold.
+ * To ensure counters don't remain disabled, they are rebalanced when
+ * the global resource goes above a higher threshold (i.e. some hysteresis
+ * is present to prevent thrashing).
*/
-void
-xfs_start_freeze(
+/*
+ * hot-plug CPU notifier support.
+ *
+ * We cannot use the hotcpu_register() function because it does
+ * not allow notifier instances. We need a notifier per filesystem
+ * as we need to be able to identify the filesystem to balance
+ * the counters out. This is achieved by having a notifier block
+ * embedded in the xfs_mount_t and doing pointer magic to get the
+ * mount pointer from the notifier block address.
+ */
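+
The "pointer magic" mentioned above is the standard container_of() construction: subtract the embedded member's offset from the member's address to recover the enclosing structure. A sketch with hypothetical struct names:

    #include <stddef.h>

    struct notifier_demo { int priority; };

    struct mount_demo {
        long                 flags;
        struct notifier_demo icsb_notifier; /* embedded, as in xfs_mount_t */
    };

    #define container_of_demo(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    /* Given only the notifier address the callback receives, recover
     * the mount structure it is embedded in. */
    static struct mount_demo *
    mount_from_notifier(struct notifier_demo *nb)
    {
        return container_of_demo(nb, struct mount_demo, icsb_notifier);
    }
+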
+STATIC int
+xfs_icsb_cpu_notify(
+ struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ xfs_icsb_cnts_t *cntp;
+ xfs_mount_t *mp;
+ int s;
+
+ mp = (xfs_mount_t *)container_of(nfb, xfs_mount_t, m_icsb_notifier);
+ cntp = (xfs_icsb_cnts_t *)
+ per_cpu_ptr(mp->m_sb_cnts, (unsigned long)hcpu);
+ switch (action) {
+ case CPU_UP_PREPARE:
+ /* Easy Case - initialize the area and locks, and
+ * then rebalance when online does everything else for us. */
+ memset(cntp, 0, sizeof(xfs_icsb_cnts_t));
+ break;
+ case CPU_ONLINE:
+ xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0);
+ xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0);
+ xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0);
+ break;
+ case CPU_DEAD:
+ /* Disable all the counters, then fold the dead cpu's
+ * count into the total on the global superblock and
+ * re-enable the counters. */
+ s = XFS_SB_LOCK(mp);
+ xfs_icsb_disable_counter(mp, XFS_SBS_ICOUNT);
+ xfs_icsb_disable_counter(mp, XFS_SBS_IFREE);
+ xfs_icsb_disable_counter(mp, XFS_SBS_FDBLOCKS);
+
+ mp->m_sb.sb_icount += cntp->icsb_icount;
+ mp->m_sb.sb_ifree += cntp->icsb_ifree;
+ mp->m_sb.sb_fdblocks += cntp->icsb_fdblocks;
+
+ memset(cntp, 0, sizeof(xfs_icsb_cnts_t));
+
+ xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, XFS_ICSB_SB_LOCKED);
+ xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, XFS_ICSB_SB_LOCKED);
+ xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, XFS_ICSB_SB_LOCKED);
+ XFS_SB_UNLOCK(mp, s);
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+int
+xfs_icsb_init_counters(
+ xfs_mount_t *mp)
+{
+ xfs_icsb_cnts_t *cntp;
+ int i;
+
+ mp->m_sb_cnts = alloc_percpu(xfs_icsb_cnts_t);
+ if (mp->m_sb_cnts == NULL)
+ return -ENOMEM;
+
+ mp->m_icsb_notifier.notifier_call = xfs_icsb_cpu_notify;
+ mp->m_icsb_notifier.priority = 0;
+ register_cpu_notifier(&mp->m_icsb_notifier);
+
+ for_each_online_cpu(i) {
+ cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
+ memset(cntp, 0, sizeof(xfs_icsb_cnts_t));
+ }
+ /*
+ * start with all counters disabled so that the
+ * initial balance kicks us off correctly
+ */
+ mp->m_icsb_counters = -1;
+ return 0;
+}
+
+STATIC void
+xfs_icsb_destroy_counters(
+ xfs_mount_t *mp)
+{
+ if (mp->m_sb_cnts) {
+ unregister_cpu_notifier(&mp->m_icsb_notifier);
+ free_percpu(mp->m_sb_cnts);
+ }
+}
+
+STATIC inline void
+xfs_icsb_lock_cntr(
+ xfs_icsb_cnts_t *icsbp)
+{
+ while (test_and_set_bit(XFS_ICSB_FLAG_LOCK, &icsbp->icsb_flags)) {
+ ndelay(1000);
+ }
+}
+
+STATIC inline void
+xfs_icsb_unlock_cntr(
+ xfs_icsb_cnts_t *icsbp)
+{
+ clear_bit(XFS_ICSB_FLAG_LOCK, &icsbp->icsb_flags);
+}
+
+
+STATIC inline void
+xfs_icsb_lock_all_counters(
+ xfs_mount_t *mp)
+{
+ xfs_icsb_cnts_t *cntp;
+ int i;
+
+ for_each_online_cpu(i) {
+ cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
+ xfs_icsb_lock_cntr(cntp);
+ }
+}
+
+STATIC inline void
+xfs_icsb_unlock_all_counters(
+ xfs_mount_t *mp)
+{
+ xfs_icsb_cnts_t *cntp;
+ int i;
+
+ for_each_online_cpu(i) {
+ cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
+ xfs_icsb_unlock_cntr(cntp);
+ }
+}
+
+STATIC void
+xfs_icsb_count(
xfs_mount_t *mp,
- int level)
+ xfs_icsb_cnts_t *cnt,
+ int flags)
{
- unsigned long s = mutex_spinlock(&mp->m_freeze_lock);
+ xfs_icsb_cnts_t *cntp;
+ int i;
+
+ memset(cnt, 0, sizeof(xfs_icsb_cnts_t));
- mp->m_frozen = level;
- mutex_spinunlock(&mp->m_freeze_lock, s);
+ if (!(flags & XFS_ICSB_LAZY_COUNT))
+ xfs_icsb_lock_all_counters(mp);
- if (level == XFS_FREEZE_TRANS) {
- while (atomic_read(&mp->m_active_trans) > 0)
- delay(100);
+ for_each_online_cpu(i) {
+ cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
+ cnt->icsb_icount += cntp->icsb_icount;
+ cnt->icsb_ifree += cntp->icsb_ifree;
+ cnt->icsb_fdblocks += cntp->icsb_fdblocks;
}
+
+ if (!(flags & XFS_ICSB_LAZY_COUNT))
+ xfs_icsb_unlock_all_counters(mp);
+}
+
+STATIC int
+xfs_icsb_counter_disabled(
+ xfs_mount_t *mp,
+ xfs_sb_field_t field)
+{
+ ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));
+ return test_bit(field, &mp->m_icsb_counters);
+}
+
+STATIC int
+xfs_icsb_disable_counter(
+ xfs_mount_t *mp,
+ xfs_sb_field_t field)
+{
+ xfs_icsb_cnts_t cnt;
+
+ ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));
+
+ xfs_icsb_lock_all_counters(mp);
+ if (!test_and_set_bit(field, &mp->m_icsb_counters)) {
+ /* drain back to superblock */
+
+ xfs_icsb_count(mp, &cnt, XFS_ICSB_SB_LOCKED|XFS_ICSB_LAZY_COUNT);
+ switch(field) {
+ case XFS_SBS_ICOUNT:
+ mp->m_sb.sb_icount = cnt.icsb_icount;
+ break;
+ case XFS_SBS_IFREE:
+ mp->m_sb.sb_ifree = cnt.icsb_ifree;
+ break;
+ case XFS_SBS_FDBLOCKS:
+ mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks;
+ break;
+ default:
+ BUG();
+ }
+ }
+
+ xfs_icsb_unlock_all_counters(mp);
+
+ return 0;
+}
+
+STATIC void
+xfs_icsb_enable_counter(
+ xfs_mount_t *mp,
+ xfs_sb_field_t field,
+ uint64_t count,
+ uint64_t resid)
+{
+ xfs_icsb_cnts_t *cntp;
+ int i;
+
+ ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));
+
+ xfs_icsb_lock_all_counters(mp);
+ for_each_online_cpu(i) {
+ cntp = per_cpu_ptr(mp->m_sb_cnts, i);
+ switch (field) {
+ case XFS_SBS_ICOUNT:
+ cntp->icsb_icount = count + resid;
+ break;
+ case XFS_SBS_IFREE:
+ cntp->icsb_ifree = count + resid;
+ break;
+ case XFS_SBS_FDBLOCKS:
+ cntp->icsb_fdblocks = count + resid;
+ break;
+ default:
+ BUG();
+ break;
+ }
+ resid = 0;
+ }
+ clear_bit(field, &mp->m_icsb_counters);
+ xfs_icsb_unlock_all_counters(mp);
+}
+
+STATIC void
+xfs_icsb_sync_counters_int(
+ xfs_mount_t *mp,
+ int flags)
+{
+ xfs_icsb_cnts_t cnt;
+ int s;
+
+ /* lock the superblock, unless the caller already holds it */
+ if ((flags & XFS_ICSB_SB_LOCKED) == 0)
+ s = XFS_SB_LOCK(mp);
+
+ xfs_icsb_count(mp, &cnt, flags);
+
+ /* update mp->m_sb fields from the summed per-cpu counts */
+ if (!xfs_icsb_counter_disabled(mp, XFS_SBS_ICOUNT))
+ mp->m_sb.sb_icount = cnt.icsb_icount;
+ if (!xfs_icsb_counter_disabled(mp, XFS_SBS_IFREE))
+ mp->m_sb.sb_ifree = cnt.icsb_ifree;
+ if (!xfs_icsb_counter_disabled(mp, XFS_SBS_FDBLOCKS))
+ mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks;
+
+ if ((flags & XFS_ICSB_SB_LOCKED) == 0)
+ XFS_SB_UNLOCK(mp, s);
+}
+
+/*
+ * Accurate update of per-cpu counters to incore superblock
+ */
+STATIC void
+xfs_icsb_sync_counters(
+ xfs_mount_t *mp)
+{
+ xfs_icsb_sync_counters_int(mp, 0);
}
+/*
+ * lazy addition used for things like df, background sb syncs, etc
+ */
void
-xfs_finish_freeze(
+xfs_icsb_sync_counters_lazy(
xfs_mount_t *mp)
{
- unsigned long s = mutex_spinlock(&mp->m_freeze_lock);
+ xfs_icsb_sync_counters_int(mp, XFS_ICSB_LAZY_COUNT);
+}
+
+/*
+ * Balance and enable/disable counters as necessary.
+ *
+ * Thresholds for re-enabling counters are somewhat magic.
+ * Inode counts are chosen to match a single on-disk
+ * allocation chunk per CPU, and the free block count is
+ * kept far enough from zero that we don't thrash
+ * when we get near ENOSPC.
+ */
+#define XFS_ICSB_INO_CNTR_REENABLE 64
+#define XFS_ICSB_FDBLK_CNTR_REENABLE 512
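+
A worked example of the split xfs_icsb_balance_counter() performs, using plain C division in place of the kernel's do_div() (which stores the quotient in place and returns the remainder):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t sb_ifree = 200;            /* global free-inode count */
        unsigned ncpus    = 4;              /* num_online_cpus() analogue */

        uint64_t count = sb_ifree / ncpus;  /* per-cpu share: 50 */
        uint64_t resid = sb_ifree % ncpus;  /* remainder goes to cpu 0 */

        if (count < 64)                     /* XFS_ICSB_INO_CNTR_REENABLE */
            printf("stays disabled: per-cpu share %llu too small\n",
                   (unsigned long long)count);
        else
            printf("re-enabled: cpu0 gets %llu, others %llu each\n",
                   (unsigned long long)(count + resid),
                   (unsigned long long)count);
        return 0;
    }

Here 200 free inodes across 4 CPUs leaves each with 50, below the re-enable threshold of 64, so the counter stays disabled; with 400 free inodes each CPU would get 100 and the counter would be re-enabled, the hysteresis the comment above describes.
+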
+STATIC void
+xfs_icsb_balance_counter(
+ xfs_mount_t *mp,
+ xfs_sb_field_t field,
+ int flags)
+{
+ uint64_t count, resid = 0;
+ int weight = num_online_cpus();
+ int s;
+
+ if (!(flags & XFS_ICSB_SB_LOCKED))
+ s = XFS_SB_LOCK(mp);
+
+ /* disable counter and sync counter */
+ xfs_icsb_disable_counter(mp, field);
- if (mp->m_frozen) {
- mp->m_frozen = 0;
- sv_broadcast(&mp->m_wait_unfreeze);
+ /* update counters - first CPU gets residual */
+ switch (field) {
+ case XFS_SBS_ICOUNT:
+ count = mp->m_sb.sb_icount;
+ resid = do_div(count, weight);
+ if (count < XFS_ICSB_INO_CNTR_REENABLE)
+ goto out;
+ break;
+ case XFS_SBS_IFREE:
+ count = mp->m_sb.sb_ifree;
+ resid = do_div(count, weight);
+ if (count < XFS_ICSB_INO_CNTR_REENABLE)
+ goto out;
+ break;
+ case XFS_SBS_FDBLOCKS:
+ count = mp->m_sb.sb_fdblocks;
+ resid = do_div(count, weight);
+ if (count < XFS_ICSB_FDBLK_CNTR_REENABLE)
+ goto out;
+ break;
+ default:
+ BUG();
+ break;
}
- mutex_spinunlock(&mp->m_freeze_lock, s);
+ xfs_icsb_enable_counter(mp, field, count, resid);
+out:
+ if (!(flags & XFS_ICSB_SB_LOCKED))
+ XFS_SB_UNLOCK(mp, s);
}
-void
-xfs_check_frozen(
+STATIC int
+xfs_icsb_modify_counters_int(
xfs_mount_t *mp,
- bhv_desc_t *bdp,
- int level)
+ xfs_sb_field_t field,
+ int delta,
+ int rsvd,
+ int flags)
{
- unsigned long s;
+ xfs_icsb_cnts_t *icsbp;
+ long long lcounter; /* long counter for 64 bit fields */
+ int cpu, s, locked = 0;
+ int ret = 0, balance_done = 0;
- if (mp->m_frozen) {
- s = mutex_spinlock(&mp->m_freeze_lock);
+again:
+ cpu = get_cpu();
+ icsbp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, cpu);
+ xfs_icsb_lock_cntr(icsbp);
+ if (unlikely(xfs_icsb_counter_disabled(mp, field)))
+ goto slow_path;
- if (mp->m_frozen < level) {
- mutex_spinunlock(&mp->m_freeze_lock, s);
- } else {
- sv_wait(&mp->m_wait_unfreeze, 0, &mp->m_freeze_lock, s);
- }
+ switch (field) {
+ case XFS_SBS_ICOUNT:
+ lcounter = icsbp->icsb_icount;
+ lcounter += delta;
+ if (unlikely(lcounter < 0))
+ goto slow_path;
+ icsbp->icsb_icount = lcounter;
+ break;
+
+ case XFS_SBS_IFREE:
+ lcounter = icsbp->icsb_ifree;
+ lcounter += delta;
+ if (unlikely(lcounter < 0))
+ goto slow_path;
+ icsbp->icsb_ifree = lcounter;
+ break;
+
+ case XFS_SBS_FDBLOCKS:
+ BUG_ON((mp->m_resblks - mp->m_resblks_avail) != 0);
+
+ lcounter = icsbp->icsb_fdblocks;
+ lcounter += delta;
+ if (unlikely(lcounter < 0))
+ goto slow_path;
+ icsbp->icsb_fdblocks = lcounter;
+ break;
+ default:
+ BUG();
+ break;
}
+ xfs_icsb_unlock_cntr(icsbp);
+ put_cpu();
+ if (locked)
+ XFS_SB_UNLOCK(mp, s);
+ return 0;
- if (level == XFS_FREEZE_TRANS)
- atomic_inc(&mp->m_active_trans);
+ /*
+ * The slow path needs to be run with the SB_LOCK
+ * held so that we prevent other threads from
+ * attempting to run this path at the same time.
+ * This provides exclusion for the balancing code,
+ * and an exclusive fallback if the balance does not
+ * provide enough resources to continue in an unlocked
+ * manner.
+ */
+slow_path:
+ xfs_icsb_unlock_cntr(icsbp);
+ put_cpu();
+
+ /* need to hold superblock in case we need
+ * to disable a counter */
+ if (!(flags & XFS_ICSB_SB_LOCKED)) {
+ s = XFS_SB_LOCK(mp);
+ locked = 1;
+ flags |= XFS_ICSB_SB_LOCKED;
+ }
+ if (!balance_done) {
+ xfs_icsb_balance_counter(mp, field, flags);
+ balance_done = 1;
+ goto again;
+ } else {
+ /*
+ * we might not have enough on this local
+ * cpu to allocate for a bulk request.
+ * We need to drain this field from all CPUs
+ * and disable the counter fastpath
+ */
+ xfs_icsb_disable_counter(mp, field);
+ }
+
+ ret = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
+
+ if (locked)
+ XFS_SB_UNLOCK(mp, s);
+ return ret;
+}
+
+STATIC int
+xfs_icsb_modify_counters(
+ xfs_mount_t *mp,
+ xfs_sb_field_t field,
+ int delta,
+ int rsvd)
+{
+ return xfs_icsb_modify_counters_int(mp, field, delta, rsvd, 0);
}
+/*
+ * Called when superblock is already locked
+ */
+STATIC int
+xfs_icsb_modify_counters_locked(
+ xfs_mount_t *mp,
+ xfs_sb_field_t field,
+ int delta,
+ int rsvd)
+{
+ return xfs_icsb_modify_counters_int(mp, field, delta,
+ rsvd, XFS_ICSB_SB_LOCKED);
+}
+#endif