1 files changed, 561 insertions, 235 deletions
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index 43f8669..0dea7bd 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -35,7 +35,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
+ *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
  * $FreeBSD$
  */
 
@@ -75,7 +75,9 @@
 #ifdef DDB
 extern void	printlockedvnodes __P((void));
 #endif
-extern void	vclean __P((struct vnode *vp, int flags));
+static void	vclean __P((struct vnode *vp, int flags, struct proc *p));
+extern void	vgonel __P((struct vnode *vp, struct proc *p));
+unsigned long	numvnodes;
 extern void	vfs_unmountroot __P((struct mount *rootfs));
 
 enum vtype iftovt_tab[16] = {
@@ -91,15 +93,19 @@ int vttoif_tab[9] = {
  * Insq/Remq for the vnode usage lists.
  */
 #define	bufinsvn(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_vnbufs)
-#define	bufremvn(bp) {  \
-	LIST_REMOVE(bp, b_vnbufs); \
-	(bp)->b_vnbufs.le_next = NOLIST; \
+#define	bufremvn(bp) {							\
+	LIST_REMOVE(bp, b_vnbufs);					\
+	(bp)->b_vnbufs.le_next = NOLIST;				\
 }
-
 TAILQ_HEAD(freelst, vnode) vnode_free_list;	/* vnode free list */
 static u_long freevnodes = 0;
 
 struct mntlist mountlist;	/* mounted filesystem list */
+struct simplelock mountlist_slock;
+static struct simplelock mntid_slock;
+struct simplelock mntvnode_slock;
+struct simplelock vnode_free_list_slock;
+static struct simplelock spechash_slock;
 
 int desiredvnodes;
 SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, &desiredvnodes, 0, "");
@@ -117,164 +123,153 @@ vntblinit()
 {
 
 	desiredvnodes = maxproc + vm_object_cache_max;
+	simple_lock_init(&mntvnode_slock);
+	simple_lock_init(&mntid_slock);
+	simple_lock_init(&spechash_slock);
 	TAILQ_INIT(&vnode_free_list);
+	simple_lock_init(&vnode_free_list_slock);
 	CIRCLEQ_INIT(&mountlist);
 }
 
 /*
- * Lock a filesystem.
- * Used to prevent access to it while mounting and unmounting.
+ * Mark a mount point as busy. Used to synchronize access and to delay
+ * unmounting. Interlock is not released on failure.
  */
 int
-vfs_lock(mp)
-	register struct mount *mp;
+vfs_busy(mp, flags, interlkp, p)
+	struct mount *mp;
+	int flags;
+	struct simplelock *interlkp;
+	struct proc *p;
 {
+	int lkflags;
 
-	while (mp->mnt_flag & MNT_MLOCK) {
+	if (mp->mnt_flag & MNT_UNMOUNT) {
+		if (flags & LK_NOWAIT)
+			return (ENOENT);
 		mp->mnt_flag |= MNT_MWAIT;
-		(void) tsleep((caddr_t) mp, PVFS, "vfslck", 0);
+		if (interlkp) {
+			simple_unlock(interlkp);
+		}
+		/*
+		 * Since all busy locks are shared except the exclusive
+		 * lock granted when unmounting, the only place that a
+		 * wakeup needs to be done is at the release of the
+		 * exclusive lock at the end of dounmount.
+		 */
+		tsleep((caddr_t)mp, PVFS, "vfs_busy", 0);
+		if (interlkp) {
+			simple_lock(interlkp);
+		}
+		return (ENOENT);
 	}
-	mp->mnt_flag |= MNT_MLOCK;
+	lkflags = LK_SHARED;
+	if (interlkp)
+		lkflags |= LK_INTERLOCK;
+	if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p))
+		panic("vfs_busy: unexpected lock failure");
 	return (0);
 }
 
 /*
- * Unlock a locked filesystem.
- * Panic if filesystem is not locked.
+ * Free a busy filesystem.
  */
 void
-vfs_unlock(mp)
-	register struct mount *mp;
+vfs_unbusy(mp, p)
+	struct mount *mp;
+	struct proc *p;
 {
 
-	if ((mp->mnt_flag & MNT_MLOCK) == 0)
-		panic("vfs_unlock: not locked");
-	mp->mnt_flag &= ~MNT_MLOCK;
-	if (mp->mnt_flag & MNT_MWAIT) {
-		mp->mnt_flag &= ~MNT_MWAIT;
-		wakeup((caddr_t) mp);
-	}
+	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p);
 }
 
 /*
- * Mark a mount point as busy.
- * Used to synchronize access and to delay unmounting.
+ * Lookup a filesystem type, and if found allocate and initialize
+ * a mount structure for it.
+ *
+ * Devname is usually updated by mount(8) after booting.
  */
 int
-vfs_busy(mp)
-	register struct mount *mp;
+vfs_rootmountalloc(fstypename, devname, mpp)
+	char *fstypename;
+	char *devname;
+	struct mount **mpp;
 {
+	struct proc *p = curproc;	/* XXX */
+	struct vfsconf *vfsp;
+	struct mount *mp;
 
-	while (mp->mnt_flag & MNT_MPBUSY) {
-		mp->mnt_flag |= MNT_MPWANT;
-		(void) tsleep((caddr_t) &mp->mnt_flag, PVFS, "vfsbsy", 0);
-	}
-	if (mp->mnt_flag & MNT_UNMOUNT)
-		return (1);
-	mp->mnt_flag |= MNT_MPBUSY;
+	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
+		if (!strcmp(vfsp->vfc_name, fstypename))
+			break;
+	if (vfsp == NULL)
+		return (ENODEV);
+	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
+	bzero((char *)mp, (u_long)sizeof(struct mount));
+	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0);
+	(void)vfs_busy(mp, LK_NOWAIT, 0, p);
+	LIST_INIT(&mp->mnt_vnodelist);
+	mp->mnt_vfc = vfsp;
+	mp->mnt_op = vfsp->vfc_vfsops;
+	mp->mnt_flag = MNT_RDONLY;
+	mp->mnt_vnodecovered = NULLVP;
+	vfsp->vfc_refcount++;
+	mp->mnt_stat.f_type = vfsp->vfc_typenum;
+	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
+	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
+	mp->mnt_stat.f_mntonname[0] = '/';
+	mp->mnt_stat.f_mntonname[1] = 0;
+	(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
+	*mpp = mp;
 	return (0);
 }
 
 /*
- * Free a busy filesystem.
- * Panic if filesystem is not busy.
- */
-void
-vfs_unbusy(mp)
-	register struct mount *mp;
-{
-
-	if ((mp->mnt_flag & MNT_MPBUSY) == 0)
-		panic("vfs_unbusy: not busy");
-	mp->mnt_flag &= ~MNT_MPBUSY;
-	if (mp->mnt_flag & MNT_MPWANT) {
-		mp->mnt_flag &= ~MNT_MPWANT;
-		wakeup((caddr_t) &mp->mnt_flag);
-	}
-}
-
-void
-vfs_unmountroot(struct mount *rootfs)
-{
-	struct mount *mp = rootfs;
-	int error;
-
-	if (vfs_busy(mp)) {
-		printf("failed to unmount root\n");
-		return;
-	}
-	mp->mnt_flag |= MNT_UNMOUNT;
-	if ((error = vfs_lock(mp))) {
-		printf("lock of root filesystem failed (%d)\n", error);
-		return;
-	}
-	vnode_pager_umount(mp);	/* release cached vnodes */
-	cache_purgevfs(mp);	/* remove cache entries for this file sys */
-
-	if ((error = VFS_SYNC(mp, MNT_WAIT, initproc->p_ucred, initproc)))
-		printf("sync of root filesystem failed (%d)\n", error);
-
-	if ((error = VFS_UNMOUNT(mp, MNT_FORCE, initproc))) {
-		printf("unmount of root filesystem failed (");
-		if (error == EBUSY)
-			printf("BUSY)\n");
-		else
-			printf("%d)\n", error);
-	}
-	mp->mnt_flag &= ~MNT_UNMOUNT;
-	vfs_unbusy(mp);
-}
-
-/*
- * Unmount all filesystems.  Should only be called by halt().
+ * Find an appropriate filesystem to use for the root. If a filesystem
+ * has not been preselected, walk through the list of known filesystems,
+ * trying those that have mountroot routines until one works or we
+ * have tried them all.
  */
-void
-vfs_unmountall()
+#ifdef notdef	/* XXX JH */
+int
+lite2_vfs_mountroot(void)
 {
-	struct mount *mp, *nmp, *rootfs = NULL;
+	struct vfsconf *vfsp;
+	extern int (*lite2_mountroot)(void);
 	int error;
 
-	/* unmount all but rootfs */
-	for (mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) {
-		nmp = mp->mnt_list.cqe_prev;
-
-		if (mp->mnt_flag & MNT_ROOTFS) {
-			rootfs = mp;
+	if (lite2_mountroot != NULL)
+		return ((*lite2_mountroot)());
+	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
+		if (vfsp->vfc_mountroot == NULL)
 			continue;
-		}
-		error = dounmount(mp, MNT_FORCE, initproc);
-		if (error) {
-			printf("unmount of %s failed (", mp->mnt_stat.f_mntonname);
-			if (error == EBUSY)
-				printf("BUSY)\n");
-			else
-				printf("%d)\n", error);
-		}
-	}
-
-	/* and finally... */
-	if (rootfs) {
-		vfs_unmountroot(rootfs);
-	} else {
-		printf("no root filesystem\n");
+		if ((error = (*vfsp->vfc_mountroot)()) == 0)
+			return (0);
+		printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
 	}
+	return (ENODEV);
 }
+#endif
 
 /*
  * Lookup a mount point by filesystem identifier.
  */
 struct mount *
-getvfs(fsid)
+vfs_getvfs(fsid)
 	fsid_t *fsid;
 {
 	register struct mount *mp;
 
+	simple_lock(&mountlist_slock);
 	for (mp = mountlist.cqh_first; mp != (void *)&mountlist;
 	    mp = mp->mnt_list.cqe_next) {
 		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
-		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1])
+		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
+			simple_unlock(&mountlist_slock);
 			return (mp);
+	    }
 	}
+	simple_unlock(&mountlist_slock);
 	return ((struct mount *) 0);
 }
 
@@ -282,14 +277,16 @@ getvfs(fsid)
  * Get a new unique fsid
  */
 void
-getnewfsid(mp, mtype)
+vfs_getnewfsid(mp)
 	struct mount *mp;
-	int mtype;
 {
 	static u_short xxxfs_mntid;
 
 	fsid_t tfsid;
+	int mtype;
 
+	simple_lock(&mntid_slock); 
+	mtype = mp->mnt_vfc->vfc_typenum;
 	mp->mnt_stat.f_fsid.val[0] = makedev(nblkdev + mtype, 0);
 	mp->mnt_stat.f_fsid.val[1] = mtype;
 	if (xxxfs_mntid == 0)
@@ -297,12 +294,13 @@ getnewfsid(mp, mtype)
 	tfsid.val[0] = makedev(nblkdev + mtype, xxxfs_mntid);
 	tfsid.val[1] = mtype;
 	if (mountlist.cqh_first != (void *)&mountlist) {
-		while (getvfs(&tfsid)) {
+		while (vfs_getvfs(&tfsid)) {
 			tfsid.val[0]++;
 			xxxfs_mntid++;
 		}
 	}
 	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
+	simple_unlock(&mntid_slock);
 }
 
 /*
@@ -326,6 +324,35 @@ vattr_null(vap)
 	vap->va_vaflags = 0;
 }
 
+void
+vfs_unmountroot(struct mount *rootfs)
+{
+	struct proc *p = curproc;	/* XXX */
+	struct mount *mp = rootfs;
+	int error;
+
+	if (vfs_busy(mp, LK_NOWAIT, NULL, p)) {	/* no interlock held here */
+		printf("failed to unmount root\n");
+		return;
+	}
+	mp->mnt_flag |= MNT_UNMOUNT;	/* later vfs_busy() callers get ENOENT */
+	vnode_pager_umount(mp);	/* release cached vnodes */
+	cache_purgevfs(mp);	/* remove cache entries for this file sys */
+
+	if ((error = VFS_SYNC(mp, MNT_WAIT, initproc->p_ucred, initproc)))
+		printf("sync of root filesystem failed (%d)\n", error);
+
+	if ((error = VFS_UNMOUNT(mp, MNT_FORCE, initproc))) {
+		printf("unmount of root filesystem failed (");
+		if (error == EBUSY)
+			printf("BUSY)\n");
+		else
+			printf("%d)\n", error);
+	}
+	mp->mnt_flag &= ~MNT_UNMOUNT;	/* NOTE(review): no wakeup for MNT_MWAIT */
+	vfs_unbusy(mp, p);
+}
+
 /*
  * Routines having to do with the management of the vnode table.
  */
@@ -341,10 +368,11 @@ getnewvnode(tag, mp, vops, vpp)
 	vop_t **vops;
 	struct vnode **vpp;
 {
-	register struct vnode *vp;
+	struct proc *p = curproc;	/* XXX */
+	struct vnode *vp;
 
+	simple_lock(&vnode_free_list_slock);
 retry:
-	vp = vnode_free_list.tqh_first;
 	/*
 	 * we allocate a new vnode if
 	 * 	1. we don't have any free
@@ -357,12 +385,31 @@ retry:
 	 */
 	if (freevnodes < (numvnodes >> 2) ||
 	    numvnodes < desiredvnodes ||
-	    vp == NULL) {
+	    vnode_free_list.tqh_first == NULL) {
+		simple_unlock(&vnode_free_list_slock);
 		vp = (struct vnode *) malloc((u_long) sizeof *vp,
 		    M_VNODE, M_WAITOK);
 		bzero((char *) vp, sizeof *vp);
 		numvnodes++;
 	} else {
+		for (vp = vnode_free_list.tqh_first;
+				vp != NULLVP; vp = vp->v_freelist.tqe_next) {
+			if (simple_lock_try(&vp->v_interlock))
+				break;
+		}
+		/*
+		 * Unless this is a bad time of the month, at most
+		 * the first NCPUS items on the free list are
+		 * locked, so this is close enough to being empty.
+		 */
+		if (vp == NULLVP) {
+			simple_unlock(&vnode_free_list_slock);
+			tablefull("vnode");
+			*vpp = 0;
+			return (ENFILE);
+		}
+		if (vp->v_usecount)
+			panic("free vnode isn't");
 		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
 		if (vp->v_usage > 0) {
 			--vp->v_usage;
@@ -370,14 +417,16 @@ retry:
 			goto retry;
 		}
 		freevnodes--;
-		if (vp->v_usecount)
-			panic("free vnode isn't");
 
 		/* see comment on why 0xdeadb is set at end of vgone (below) */
 		vp->v_freelist.tqe_prev = (struct vnode **) 0xdeadb;
+		simple_unlock(&vnode_free_list_slock);
 		vp->v_lease = NULL;
 		if (vp->v_type != VBAD)
-			vgone(vp);
+			vgonel(vp, p);
+		else {
+			simple_unlock(&vp->v_interlock);
+		}
 
 #ifdef DIAGNOSTIC
 		{
@@ -421,6 +470,7 @@ insmntque(vp, mp)
 	register struct mount *mp;
 {
 
+	simple_lock(&mntvnode_slock);
 	/*
 	 * Delete from old mount point vnode list, if on one.
 	 */
@@ -429,9 +479,12 @@ insmntque(vp, mp)
 	/*
 	 * Insert into list of vnodes for the new mount point, if available.
 	 */
-	if ((vp->v_mount = mp) == NULL)
+	if ((vp->v_mount = mp) == NULL) {
+		simple_unlock(&mntvnode_slock);
 		return;
+	}
 	LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
+	simple_unlock(&mntvnode_slock);
 }
 
 /*
@@ -723,7 +776,8 @@ checkalias(nvp, nvp_rdev, mp)
 	dev_t nvp_rdev;
 	struct mount *mp;
 {
-	register struct vnode *vp;
+	struct proc *p = curproc;	/* XXX */
+	struct vnode *vp;
 	struct vnode **vpp;
 
 	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
@@ -731,18 +785,24 @@ checkalias(nvp, nvp_rdev, mp)
 
 	vpp = &speclisth[SPECHASH(nvp_rdev)];
 loop:
+	simple_lock(&spechash_slock);
 	for (vp = *vpp; vp; vp = vp->v_specnext) {
 		if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type)
 			continue;
 		/*
 		 * Alias, but not in use, so flush it out.
 		 */
+		simple_lock(&vp->v_interlock);
 		if (vp->v_usecount == 0) {
-			vgone(vp);
+			simple_unlock(&spechash_slock);
+			vgonel(vp, p);
 			goto loop;
 		}
-		if (vget(vp, 1))
+		simple_unlock(&spechash_slock);
+		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p)) {
 			goto loop;
+		}
+		simple_lock(&spechash_slock);
 		break;
 	}
 
@@ -753,16 +813,19 @@ loop:
 		nvp->v_hashchain = vpp;
 		nvp->v_specnext = *vpp;
 		nvp->v_specflags = 0;
+		simple_unlock(&spechash_slock);
 		*vpp = nvp;
-		if (vp != NULL) {
+		if (vp != NULLVP) {
 			nvp->v_flag |= VALIASED;
 			vp->v_flag |= VALIASED;
 			vput(vp);
 		}
 		return (NULLVP);
 	}
-	VOP_UNLOCK(vp);
-	vclean(vp, 0);
+	simple_unlock(&spechash_slock);
+	VOP_UNLOCK(vp, 0, p);
+	simple_lock(&vp->v_interlock);
+	vclean(vp, 0, p);
 	vp->v_op = nvp->v_op;
 	vp->v_tag = nvp->v_tag;
 	nvp->v_type = VNON;
@@ -779,47 +842,162 @@ loop:
  * been changed to a new file system type).
  */
 int
-vget(vp, lockflag)
+vget(vp, flags, p)
 	register struct vnode *vp;
-	int lockflag;
+	int flags;
+	struct proc *p;
 {
+	int error;
 
 	/*
-	 * If the vnode is in the process of being cleaned out for another
-	 * use, we wait for the cleaning to finish and then return failure.
-	 * Cleaning is determined either by checking that the VXLOCK flag is
-	 * set, or that the use count is zero with the back pointer set to
-	 * show that it has been removed from the free list by getnewvnode.
-	 * The VXLOCK flag may not have been set yet because vclean is blocked
-	 * in the VOP_LOCK call waiting for the VOP_INACTIVE to complete.
+	 * If the vnode is in the process of being cleaned out for
+	 * another use, we wait for the cleaning to finish and then
+	 * return failure. Cleaning is determined by checking that
+	 * the VXLOCK flag is set.
 	 */
-	if ((vp->v_flag & VXLOCK) ||
-	    (vp->v_usecount == 0 &&
-		vp->v_freelist.tqe_prev == (struct vnode **) 0xdeadb)) {
+	if ((flags & LK_INTERLOCK) == 0) {
+		simple_lock(&vp->v_interlock);
+	}
+	if (vp->v_flag & VXLOCK) {
 		vp->v_flag |= VXWANT;
-		(void) tsleep((caddr_t) vp, PINOD, "vget", 0);
-		return (1);
+		simple_unlock(&vp->v_interlock);
+		tsleep((caddr_t)vp, PINOD, "vget", 0);
+		return (ENOENT);	/* vnode was being cleaned out */
 	}
 	if (vp->v_usecount == 0) {
+		simple_lock(&vnode_free_list_slock);
 		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
 		freevnodes--;
+		simple_unlock(&vnode_free_list_slock);	/* guards freevnodes too */
 	}
 	vp->v_usecount++;
-
 	/*
 	 * Create the VM object, if needed
 	 */
 	if ((vp->v_type == VREG) &&
 		((vp->v_object == NULL) ||
 			(vp->v_object->flags & OBJ_VFS_REF) == 0)) {
+		/*
+		 * XXX vfs_object_create probably needs the interlock.
+		 */
+		simple_unlock(&vp->v_interlock);
 		vfs_object_create(vp, curproc, curproc->p_ucred, 0);
+		simple_lock(&vp->v_interlock);
+	}
+	if (flags & LK_TYPE_MASK) {
+		if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0)
+			vrele(vp);
+		return (error);
 	}
-	if (lockflag)
-		VOP_LOCK(vp);
+	simple_unlock(&vp->v_interlock);
+	return (0);
+}
+
+/*
+ * Stubs to use when there is no locking to be done on the underlying object.
+ * A minimal shared lock is necessary to ensure that the underlying object
+ * is not revoked while an operation is in progress. So, an active shared
+ * count is maintained in an auxiliary vnode lock structure.
+ */
+int
+vop_nolock(ap)
+	struct vop_lock_args /* {
+		struct vnode *a_vp;
+		int a_flags;
+		struct proc *a_p;
+	} */ *ap;
+{
+#ifdef notyet
+	/*
+	 * This code cannot be used until all the non-locking filesystems
+	 * (notably NFS) are converted to properly lock and release nodes.
+	 * Also, certain vnode operations change the locking state within
+	 * the operation (create, mknod, remove, link, rename, mkdir, rmdir,
+	 * and symlink). Ideally these operations should not change the
+	 * lock state, but should be changed to let the caller of the
+	 * function unlock them. Otherwise all intermediate vnode layers
+	 * (such as union, umapfs, etc) must catch these functions to do
+	 * the necessary locking at their layer. Note that the inactive
+	 * and lookup operations also change their lock state, but this
+	 * cannot be avoided, so these two operations will always need
+	 * to be handled in intermediate layers.
+	 */
+	struct vnode *vp = ap->a_vp;
+	int vnflags, flags = ap->a_flags;
+
+	if (vp->v_vnlock == NULL) {
+		if ((flags & LK_TYPE_MASK) == LK_DRAIN)
+			return (0);	/* no lock allocated, nothing to drain */
+		MALLOC(vp->v_vnlock, struct lock *, sizeof(struct lock),
+		    M_VNODE, M_WAITOK);
+		lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0);
+	}
+	switch (flags & LK_TYPE_MASK) {
+	case LK_DRAIN:
+		vnflags = LK_DRAIN;
+		break;
+	case LK_EXCLUSIVE:	/* exclusive requests are taken as shared */
+	case LK_SHARED:
+		vnflags = LK_SHARED;
+		break;
+	case LK_UPGRADE:
+	case LK_EXCLUPGRADE:
+	case LK_DOWNGRADE:
+		return (0);	/* upgrade/downgrade requests are no-ops */
+	case LK_RELEASE:	/* not a lock request: falls into the panic */
+	default:
+		panic("vop_nolock: bad operation %d", flags & LK_TYPE_MASK);
+	}
+	if (flags & LK_INTERLOCK)
+		vnflags |= LK_INTERLOCK;
+	return(lockmgr(vp->v_vnlock, vnflags, &vp->v_interlock, ap->a_p));
+#else /* for now */
+	/*
+	 * Since we are not using the lock manager, we must clear
+	 * the interlock here.
+	 */
+	if (ap->a_flags & LK_INTERLOCK) {
+		simple_unlock(&ap->a_vp->v_interlock);
+	}
 	return (0);
+#endif
+}
+
+/*
+ * Decrement the active use count by releasing the auxiliary vnode lock.
+ */
+int
+vop_nounlock(ap)
+	struct vop_unlock_args /* {
+		struct vnode *a_vp;
+		int a_flags;
+		struct proc *a_p;
+	} */ *ap;
+{
+	struct vnode *vp = ap->a_vp;
+
+	if (vp->v_vnlock == NULL)
+		return (0);	/* no auxiliary lock exists for this vnode */
+	return (lockmgr(vp->v_vnlock, LK_RELEASE, NULL, ap->a_p));
+}
+
+/*
+ * Return the status of the auxiliary vnode lock, or 0 if there is none.
+ */
+int
+vop_noislocked(ap)
+	struct vop_islocked_args /* {
+		struct vnode *a_vp;
+	} */ *ap;
+{
+	struct vnode *vp = ap->a_vp;
+
+	if (vp->v_vnlock == NULL)
+		return (0);	/* no auxiliary lock exists for this vnode */
+	return (lockstatus(vp->v_vnlock));
 }
 
+/* #ifdef DIAGNOSTIC */
 /*
  * Vnode reference, just increment the count
  */
@@ -827,6 +1005,7 @@ void
 vref(vp)
 	struct vnode *vp;
 {
+	simple_lock(&vp->v_interlock);
 	if (vp->v_usecount <= 0)
 		panic("vref used where vget required");
 
@@ -840,8 +1019,11 @@ vref(vp)
 		 * the object is created.  This is necessary to
 		 * keep the system from re-entrantly doing it
 		 * multiple times.
+		 * XXX vfs_object_create probably needs the interlock?
 		 */
+		simple_unlock(&vp->v_interlock);
 		vfs_object_create(vp, curproc, curproc->p_ucred, 0);
+		simple_lock(&vp->v_interlock);
 	}
 }
 
@@ -850,9 +1032,9 @@ vref(vp)
  */
 void
 vput(vp)
-	register struct vnode *vp;
+	struct vnode *vp;
 {
-	VOP_UNLOCK(vp);
+	VOP_UNLOCK(vp, 0, curproc);
 	vrele(vp);
 }
 
@@ -862,33 +1044,38 @@ vput(vp)
  */
 void
 vrele(vp)
-	register struct vnode *vp;
+	struct vnode *vp;
 {
+	struct proc *p = curproc;	/* XXX */
 
 #ifdef DIAGNOSTIC
 	if (vp == NULL)
 		panic("vrele: null vp");
 #endif
-
+	simple_lock(&vp->v_interlock);
 	vp->v_usecount--;
 
 	if ((vp->v_usecount == 1) &&
 		vp->v_object &&
 		(vp->v_object->flags & OBJ_VFS_REF)) {
 		vp->v_object->flags &= ~OBJ_VFS_REF;
+		simple_unlock(&vp->v_interlock);
 		vm_object_deallocate(vp->v_object);
 		return;
 	}
 
-	if (vp->v_usecount > 0)
+	if (vp->v_usecount > 0) {
+		simple_unlock(&vp->v_interlock);
 		return;
+	}
 
 	if (vp->v_usecount < 0) {
 #ifdef DIAGNOSTIC
 		vprint("vrele: negative ref count", vp);
 #endif
-		panic("vrele: negative reference cnt");
+		panic("vrele: negative ref cnt");
 	}
+	simple_lock(&vnode_free_list_slock);
 	if (vp->v_flag & VAGE) {
 		if(vp->v_tag != VT_TFS)
 			TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
@@ -898,9 +1085,12 @@ vrele(vp)
 		if(vp->v_tag != VT_TFS)
 			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
 	}
+	simple_unlock(&vnode_free_list_slock);
+
 	freevnodes++;
 
-	VOP_INACTIVE(vp);
+	if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0)
+		VOP_INACTIVE(vp, p);
 }
 
 #ifdef DIAGNOSTIC
@@ -912,7 +1102,9 @@ vhold(vp)
 	register struct vnode *vp;
 {
 
+	simple_lock(&vp->v_interlock);
 	vp->v_holdcnt++;
+	simple_unlock(&vp->v_interlock);
 }
 
 /*
@@ -923,9 +1115,11 @@ holdrele(vp)
 	register struct vnode *vp;
 {
 
+	simple_lock(&vp->v_interlock);
 	if (vp->v_holdcnt <= 0)
 		panic("holdrele: holdcnt");
 	vp->v_holdcnt--;
+	simple_unlock(&vp->v_interlock);
 }
 #endif /* DIAGNOSTIC */
 
@@ -948,11 +1142,11 @@ vflush(mp, skipvp, flags)
 	struct vnode *skipvp;
 	int flags;
 {
-	register struct vnode *vp, *nvp;
+	struct proc *p = curproc;	/* XXX */
+	struct vnode *vp, *nvp;
 	int busy = 0;
 
-	if ((mp->mnt_flag & MNT_MPBUSY) == 0)
-		panic("vflush: not busy");
+	simple_lock(&mntvnode_slock);
 loop:
 	for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) {
 		/*
@@ -967,24 +1161,34 @@ loop:
 		 */
 		if (vp == skipvp)
 			continue;
+
+		simple_lock(&vp->v_interlock);
 		/*
 		 * Skip over a vnodes marked VSYSTEM.
 		 */
-		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM))
+		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
+			simple_unlock(&vp->v_interlock);
 			continue;
+		}
 		/*
 		 * If WRITECLOSE is set, only flush out regular file vnodes
 		 * open for writing.
 		 */
 		if ((flags & WRITECLOSE) &&
-		    (vp->v_writecount == 0 || vp->v_type != VREG))
+		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
+			simple_unlock(&vp->v_interlock);
 			continue;
+		}
 
 		if (vp->v_object && (vp->v_object->flags & OBJ_VFS_REF)) {
+			simple_unlock(&vp->v_interlock);
+			simple_unlock(&mntvnode_slock);
 			vm_object_reference(vp->v_object);
 			pager_cache(vp->v_object, FALSE);
 			vp->v_object->flags &= ~OBJ_VFS_REF;
 			vm_object_deallocate(vp->v_object);
+			simple_lock(&mntvnode_slock);
+			simple_lock(&vp->v_interlock);
 		}
 
 		/*
@@ -992,7 +1196,9 @@ loop:
 		 * vnode data structures and we are done.
 		 */
 		if (vp->v_usecount == 0) {
-			vgone(vp);
+			simple_unlock(&mntvnode_slock);
+			vgonel(vp, p);
+			simple_lock(&mntvnode_slock);
 			continue;
 		}
 
@@ -1002,21 +1208,25 @@ loop:
 		 * all other files, just kill them.
 		 */
 		if (flags & FORCECLOSE) {
+			simple_unlock(&mntvnode_slock);
 			if (vp->v_type != VBLK && vp->v_type != VCHR) {
-				vgone(vp);
+				vgonel(vp, p);
 			} else {
-				vclean(vp, 0);
+				vclean(vp, 0, p);
 				vp->v_op = spec_vnodeop_p;
 				insmntque(vp, (struct mount *) 0);
 			}
+			simple_lock(&mntvnode_slock);
 			continue;
 		}
 #ifdef DIAGNOSTIC
 		if (busyprt)
 			vprint("vflush: busy vnode", vp);
 #endif
+		simple_unlock(&vp->v_interlock);
 		busy++;
 	}
+	simple_unlock(&mntvnode_slock);
 	if (busy)
 		return (EBUSY);
 	return (0);
@@ -1025,8 +1235,8 @@ loop:
 /*
  * Disassociate the underlying file system from a vnode.
  */
-void
-vclean(struct vnode *vp, int flags)
+static void
+vclean(struct vnode *vp, int flags, struct proc *p)
 {
 	int active;
 
@@ -1036,15 +1246,7 @@ vclean(struct vnode *vp, int flags)
 	 * generate a race against ourselves to recycle it.
 	 */
 	if ((active = vp->v_usecount))
-		VREF(vp);
-	/*
-	 * Even if the count is zero, the VOP_INACTIVE routine may still have
-	 * the object locked while it cleans it out. The VOP_LOCK ensures that
-	 * the VOP_INACTIVE routine is done with its work. For active vnodes,
-	 * it ensures that no other activity can occur while the underlying
-	 * object is being cleaned out.
-	 */
-	VOP_LOCK(vp);
+		vp->v_usecount++;
 	/*
 	 * Prevent the vnode from being recycled or brought into use while we
 	 * clean it out.
@@ -1053,31 +1255,48 @@ vclean(struct vnode *vp, int flags)
 		panic("vclean: deadlock");
 	vp->v_flag |= VXLOCK;
 	/*
-	 * Clean out any buffers associated with the vnode.
+	 * Even if the count is zero, the VOP_INACTIVE routine may still
+	 * have the object locked while it cleans it out. The VOP_LOCK
+	 * ensures that the VOP_INACTIVE routine is done with its work.
+	 * For active vnodes, it ensures that no other activity can
+	 * occur while the underlying object is being cleaned out.
 	 */
-	if (flags & DOCLOSE)
-		vinvalbuf(vp, V_SAVE, NOCRED, NULL, 0, 0);
+	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);
 	/*
-	 * Any other processes trying to obtain this lock must first wait for
-	 * VXLOCK to clear, then call the new lock operation.
+	 * Clean out any buffers associated with the vnode.
 	 */
-	VOP_UNLOCK(vp);
+	if (flags & DOCLOSE)
+		vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
 	/*
-	 * If purging an active vnode, it must be closed and deactivated
-	 * before being reclaimed.
+	 * If purging an active vnode, it must be closed and
+	 * deactivated before being reclaimed. Note that the
+	 * VOP_INACTIVE will unlock the vnode.
 	 */
 	if (active) {
 		if (flags & DOCLOSE)
-			VOP_CLOSE(vp, FNONBLOCK, NOCRED, NULL);
-		VOP_INACTIVE(vp);
+			VOP_CLOSE(vp, IO_NDELAY, NOCRED, p);
+		VOP_INACTIVE(vp, p);
+	} else {
+		/*
+		 * Any other processes trying to obtain this lock must first
+		 * wait for VXLOCK to clear, then call the new lock operation.
+		 */
+		VOP_UNLOCK(vp, 0, p);
 	}
 	/*
 	 * Reclaim the vnode.
 	 */
-	if (VOP_RECLAIM(vp))
+	if (VOP_RECLAIM(vp, p))
 		panic("vclean: cannot reclaim");
 	if (active)
 		vrele(vp);
+	cache_purge(vp);
+	if (vp->v_vnlock) {
+		if ((vp->v_vnlock->lk_flags & LK_DRAINED) == 0)
+			vprint("vclean: lock not drained", vp);
+		FREE(vp->v_vnlock, M_VNODE);
+		vp->v_vnlock = NULL;
+	}
 
 	/*
 	 * Done with purge, notify sleepers of the grim news.
@@ -1092,46 +1311,91 @@ vclean(struct vnode *vp, int flags)
 }
 
 /*
- * Eliminate all activity associated with  the requested vnode
+ * Eliminate all activity associated with the requested vnode
  * and with all vnodes aliased to the requested vnode.
  */
-void
-vgoneall(vp)
-	register struct vnode *vp;
+int
+vop_revoke(ap)
+	struct vop_revoke_args /* {
+		struct vnode *a_vp;
+		int a_flags;
+	} */ *ap;
 {
-	register struct vnode *vq;
+	struct vnode *vp, *vq;
+	struct proc *p = curproc;	/* XXX */
+
+#ifdef DIAGNOSTIC
+	if ((ap->a_flags & REVOKEALL) == 0)
+		panic("vop_revoke");
+#endif
+
+	vp = ap->a_vp;
+	simple_lock(&vp->v_interlock);
 
 	if (vp->v_flag & VALIASED) {
 		/*
-		 * If a vgone (or vclean) is already in progress, wait until
-		 * it is done and return.
+		 * If a vgone (or vclean) is already in progress,
+		 * wait until it is done and return.
 		 */
 		if (vp->v_flag & VXLOCK) {
 			vp->v_flag |= VXWANT;
-			(void) tsleep((caddr_t) vp, PINOD, "vgall", 0);
-			return;
+			simple_unlock(&vp->v_interlock);
+			tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0);
+			return (0);
 		}
 		/*
-		 * Ensure that vp will not be vgone'd while we are eliminating
-		 * its aliases.
+		 * Ensure that vp will not be vgone'd while we
+		 * are eliminating its aliases.
 		 */
 		vp->v_flag |= VXLOCK;
+		simple_unlock(&vp->v_interlock);
 		while (vp->v_flag & VALIASED) {
+			simple_lock(&spechash_slock);
 			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
 				if (vq->v_rdev != vp->v_rdev ||
 				    vq->v_type != vp->v_type || vp == vq)
 					continue;
+				simple_unlock(&spechash_slock);
 				vgone(vq);
 				break;
 			}
+			if (vq == NULLVP) {
+				simple_unlock(&spechash_slock);
+			}
 		}
 		/*
-		 * Remove the lock so that vgone below will really eliminate
-		 * the vnode after which time vgone will awaken any sleepers.
+		 * Remove the lock so that vgone below will
+		 * really eliminate the vnode after which time
+		 * vgone will awaken any sleepers.
 		 */
+		simple_lock(&vp->v_interlock);
 		vp->v_flag &= ~VXLOCK;
 	}
-	vgone(vp);
+	vgonel(vp, p);
+	return (0);
+}
+
+/*
+ * Recycle an unused vnode to the front of the free list. Returns 1 and
+ * releases the passed interlock if the vnode was recycled, 0 otherwise.
+ */
+int
+vrecycle(vp, inter_lkp, p)
+	struct vnode *vp;
+	struct simplelock *inter_lkp;
+	struct proc *p;
+{
+
+	simple_lock(&vp->v_interlock);
+	if (vp->v_usecount == 0) {
+		if (inter_lkp) {
+			simple_unlock(inter_lkp);
+		}
+		vgonel(vp, p);	/* entered with vp's interlock held */
+		return (1);
+	}
+	simple_unlock(&vp->v_interlock);
+	return (0);
 }
 
 /*
@@ -1142,16 +1406,31 @@ void
 vgone(vp)
 	register struct vnode *vp;
 {
-	register struct vnode *vq;
+	struct proc *p = curproc;	/* XXX */
+
+	simple_lock(&vp->v_interlock);
+	vgonel(vp, p);
+}
+
+/*
+ * vgone, with the vp interlock held.
+ */
+void
+vgonel(vp, p)
+	struct vnode *vp;
+	struct proc *p;
+{
+	struct vnode *vq;
 	struct vnode *vx;
 
 	/*
-	 * If a vgone (or vclean) is already in progress, wait until it is
-	 * done and return.
+	 * If a vgone (or vclean) is already in progress,
+	 * wait until it is done and return.
 	 */
 	if (vp->v_flag & VXLOCK) {
 		vp->v_flag |= VXWANT;
-		(void) tsleep((caddr_t) vp, PINOD, "vgone", 0);
+		simple_unlock(&vp->v_interlock);
+		tsleep((caddr_t)vp, PINOD, "vgone", 0);
 		return;
 	}
 
@@ -1162,18 +1441,18 @@ vgone(vp)
 	/*
 	 * Clean out the filesystem specific data.
 	 */
-	vclean(vp, DOCLOSE);
+	vclean(vp, DOCLOSE, p);
 	/*
 	 * Delete from old mount point vnode list, if on one.
 	 */
-	if (vp->v_mount != NULL) {
-		LIST_REMOVE(vp, v_mntvnodes);
-		vp->v_mount = NULL;
-	}
+	if (vp->v_mount != NULL)
+		insmntque(vp, (struct mount *)0);
 	/*
-	 * If special device, remove it from special device alias list.
+	 * If special device, remove it from special device alias list
+	 * if it is on one.
 	 */
-	if (vp->v_type == VBLK || vp->v_type == VCHR) {
+	if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) {
+		simple_lock(&spechash_slock);
 		if (*vp->v_hashchain == vp) {
 			*vp->v_hashchain = vp->v_specnext;
 		} else {
@@ -1202,28 +1481,34 @@ vgone(vp)
 				vx->v_flag &= ~VALIASED;
 			vp->v_flag &= ~VALIASED;
 		}
+		simple_unlock(&spechash_slock);
 		FREE(vp->v_specinfo, M_VNODE);
 		vp->v_specinfo = NULL;
 	}
+
 	/*
-	 * If it is on the freelist and not already at the head, move it to
-	 * the head of the list. The test of the back pointer and the
-	 * reference count of zero is because it will be removed from the free
-	 * list by getnewvnode, but will not have its reference count
-	 * incremented until after calling vgone. If the reference count were
-	 * incremented first, vgone would (incorrectly) try to close the
-	 * previous instance of the underlying object. So, the back pointer is
-	 * explicitly set to `0xdeadb' in getnewvnode after removing it from
-	 * the freelist to ensure that we do not try to move it here.
+	 * If it is on the freelist and not already at the head,
+	 * move it to the head of the list. The test of the back
+	 * pointer and the reference count of zero is because
+	 * it will be removed from the free list by getnewvnode,
+	 * but will not have its reference count incremented until
+	 * after calling vgone. If the reference count were
+	 * incremented first, vgone would (incorrectly) try to
+	 * close the previous instance of the underlying object.
+	 * So, the back pointer is explicitly set to `0xdeadb' in
+	 * getnewvnode after removing it from the freelist to ensure
+	 * that we do not try to move it here.
 	 */
-	if (vp->v_usecount == 0 &&
-	    vp->v_freelist.tqe_prev != (struct vnode **) 0xdeadb &&
-	    vnode_free_list.tqh_first != vp) {
-		if(vp->v_tag != VT_TFS) {
+	if (vp->v_usecount == 0) {
+		simple_lock(&vnode_free_list_slock);
+		if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) &&
+			vnode_free_list.tqh_first != vp) {
 			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
 			TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
 		}
+		simple_unlock(&vnode_free_list_slock);
 	}
+
 	vp->v_type = VBAD;
 }
 
@@ -1254,7 +1539,7 @@ int
 vcount(vp)
 	register struct vnode *vp;
 {
-	register struct vnode *vq, *vnext;
+	struct vnode *vq, *vnext;
 	int count;
 
 loop:
@@ -1354,6 +1639,7 @@ int kinfo_vgetfailed;
 static int
 sysctl_vnode SYSCTL_HANDLER_ARGS
 {
+	struct proc *p = curproc;	/* XXX */
 	register struct mount *mp, *nmp;
 	struct vnode *vp;
 	int error;
@@ -1368,7 +1654,7 @@ sysctl_vnode SYSCTL_HANDLER_ARGS
 
 	for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
 		nmp = mp->mnt_list.cqe_next;
-		if (vfs_busy(mp))
+		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p))
 			continue;
 again:
 		for (vp = mp->mnt_vnodelist.lh_first;
@@ -1386,11 +1672,11 @@ again:
 			}
 			if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) ||
 			    (error = SYSCTL_OUT(req, vp, VNODESZ))) {
-				vfs_unbusy(mp);
+				vfs_unbusy(mp, p);
 				return (error);
 			}
 		}
-		vfs_unbusy(mp);
+		vfs_unbusy(mp, p);
 	}
 
 	return (0);
@@ -1404,22 +1690,63 @@ SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
  */
 int
 vfs_mountedon(vp)
-	register struct vnode *vp;
+	struct vnode *vp;
 {
-	register struct vnode *vq;
+	struct vnode *vq;
+	int error = 0;
 
 	if (vp->v_specflags & SI_MOUNTEDON)
 		return (EBUSY);
 	if (vp->v_flag & VALIASED) {
+		simple_lock(&spechash_slock);
 		for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
 			if (vq->v_rdev != vp->v_rdev ||
 			    vq->v_type != vp->v_type)
 				continue;
-			if (vq->v_specflags & SI_MOUNTEDON)
-				return (EBUSY);
+			if (vq->v_specflags & SI_MOUNTEDON) {
+				error = EBUSY;
+				break;
+			}
 		}
+		simple_unlock(&spechash_slock);
+	}
+	return (error);
+}
+
+/*
+ * Unmount all filesystems.  The list is traversed in reverse order
+ * of mounting to avoid dependencies.  Should only be called by halt().
+ */
+void
+vfs_unmountall()
+{
+	struct mount *mp, *nmp, *rootfs = NULL;
+	int error;
+
+	/* unmount all but rootfs */
+	for (mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) {
+		nmp = mp->mnt_list.cqe_prev;
+
+		if (mp->mnt_flag & MNT_ROOTFS) {
+			rootfs = mp;
+			continue;
+		}
+		error = dounmount(mp, MNT_FORCE, initproc);
+		if (error) {
+			printf("unmount of %s failed (", mp->mnt_stat.f_mntonname);
+			if (error == EBUSY)
+				printf("BUSY)\n");
+			else
+				printf("%d)\n", error);
+		}
+	}
+
+	/* and finally... */
+	if (rootfs) {
+		vfs_unmountroot(rootfs);
+	} else {
+		printf("no root filesystem\n");
 	}
-	return (0);
 }
 
 /*
@@ -1565,8 +1892,8 @@ vfs_export_lookup(mp, nep, nam)
 			rnh = nep->ne_rtable[saddr->sa_family];
 			if (rnh != NULL) {
 				np = (struct netcred *)
-				    (*rnh->rnh_matchaddr) ((caddr_t) saddr,
-				    rnh);
+					(*rnh->rnh_matchaddr)((caddr_t)saddr,
+							      rnh);
 				if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
 					np = NULL;
 			}
@@ -1580,7 +1907,6 @@ vfs_export_lookup(mp, nep, nam)
 	return (np);
 }
 
-
 /*
  * perform msync on all vnodes under a mount point
  * the mount point must be locked.
@@ -1639,10 +1965,10 @@ retry:
 	} else {
 		if (object->flags & OBJ_DEAD) {
 			if (waslocked)
-				VOP_UNLOCK(vp);
+				VOP_UNLOCK(vp, 0, p);
 			tsleep(object, PVM, "vodead", 0);
 			if (waslocked)
-				VOP_LOCK(vp);
+				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
 			goto retry;
 		}
 		if ((object->flags & OBJ_VFS_REF) == 0) {