summaryrefslogtreecommitdiffstats
path: root/sys/kern
diff options
context:
space:
mode:
authorattilio <attilio@FreeBSD.org>2008-11-02 10:15:42 +0000
committerattilio <attilio@FreeBSD.org>2008-11-02 10:15:42 +0000
commite1f493235eaeaf3c0cd0e029936db2b5b6e32bf7 (patch)
tree904da44fc29dcc675df2dfc69f644d6a90befc3c /sys/kern
parent92bf4795aa726561a195b11458e9f2fc6ee74454 (diff)
downloadFreeBSD-src-e1f493235eaeaf3c0cd0e029936db2b5b6e32bf7.zip
FreeBSD-src-e1f493235eaeaf3c0cd0e029936db2b5b6e32bf7.tar.gz
Improve VFS locking:
- Implement real draining for vfs consumers by not relying on the mnt_lock and instead using a refcount in order to keep track of lock requesters. - Due to the change above, remove the mnt_lock lockmgr because it is now useless. - Due to the change above, vfs_busy() is no longer linked to a lockmgr. Change its KPI by removing the interlock argument and defining 2 new flags for it: MBF_NOWAIT, which basically replaces the LK_NOWAIT of the old version (which was unlinked from the lockmgr already), and MBF_MNTLSTLOCK, which provides the ability to drop the mountlist_mtx once the mnt interlock is held (an ability still desired by most consumers). - The change above makes the stub used in vfs_mount_destroy(), which allowed overriding the mnt_ref if running for more than 3 seconds, totally useless. Remove it, as it was only thought to work in older versions. If a problem of "refcount held never going away" should appear, we will need to fix it properly rather than trust such a hackish solution. - Fix a bug where returning (with an error) from dounmount() still left the MNTK_MWAIT flag on even if the waiters were actually woken up. Only one place in vfs_mount_destroy() is left as-is, because it is going to recycle the structure in any case, so it doesn't matter. - Remove the markercnt refcount as it is useless. This patch modifies the VFS ABI and breaks the KPI for vfs_busy(), so manpages and __FreeBSD_version will be modified accordingly. Discussed with: kib Tested by: pho
Diffstat (limited to 'sys/kern')
-rw-r--r--sys/kern/vfs_lookup.c2
-rw-r--r--sys/kern/vfs_mount.c63
-rw-r--r--sys/kern/vfs_subr.c50
-rw-r--r--sys/kern/vfs_syscalls.c8
-rw-r--r--sys/kern/vfs_vnops.c17
5 files changed, 78 insertions, 62 deletions
diff --git a/sys/kern/vfs_lookup.c b/sys/kern/vfs_lookup.c
index aa9cfbf..f009dac 100644
--- a/sys/kern/vfs_lookup.c
+++ b/sys/kern/vfs_lookup.c
@@ -684,7 +684,7 @@ unionlookup:
*/
while (dp->v_type == VDIR && (mp = dp->v_mountedhere) &&
(cnp->cn_flags & NOCROSSMOUNT) == 0) {
- if (vfs_busy(mp, 0, 0))
+ if (vfs_busy(mp, 0))
continue;
vput(dp);
VFS_UNLOCK_GIANT(vfslocked);
diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c
index 9e4ed46..5a216c9 100644
--- a/sys/kern/vfs_mount.c
+++ b/sys/kern/vfs_mount.c
@@ -450,7 +450,6 @@ mount_init(void *mem, int size, int flags)
mp = (struct mount *)mem;
mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF);
- lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0);
lockinit(&mp->mnt_explock, PVFS, "explock", 0, 0);
return (0);
}
@@ -462,7 +461,6 @@ mount_fini(void *mem, int size)
mp = (struct mount *)mem;
lockdestroy(&mp->mnt_explock);
- lockdestroy(&mp->mnt_lock);
mtx_destroy(&mp->mnt_mtx);
}
@@ -481,7 +479,7 @@ vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp, const char *fspath,
TAILQ_INIT(&mp->mnt_nvnodelist);
mp->mnt_nvnodelistsize = 0;
mp->mnt_ref = 0;
- (void) vfs_busy(mp, LK_NOWAIT, 0);
+ (void) vfs_busy(mp, MBF_NOWAIT);
mp->mnt_op = vfsp->vfc_vfsops;
mp->mnt_vfc = vfsp;
vfsp->vfc_refcount++; /* XXX Unlocked */
@@ -507,19 +505,10 @@ vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp, const char *fspath,
void
vfs_mount_destroy(struct mount *mp)
{
- int i;
MNT_ILOCK(mp);
- for (i = 0; mp->mnt_ref && i < 3; i++)
- msleep(mp, MNT_MTX(mp), PVFS, "mntref", hz);
- /*
- * This will always cause a 3 second delay in rebooting due to
- * refs on the root mountpoint that never go away. Most of these
- * are held by init which never exits.
- */
- if (i == 3 && (!rebooting || bootverbose))
- printf("Mount point %s had %d dangling refs\n",
- mp->mnt_stat.f_mntonname, mp->mnt_ref);
+ while (mp->mnt_ref)
+ msleep(mp, MNT_MTX(mp), PVFS, "mntref", 0);
if (mp->mnt_holdcnt != 0) {
printf("Waiting for mount point to be unheld\n");
while (mp->mnt_holdcnt != 0) {
@@ -928,7 +917,7 @@ vfs_domount(
vput(vp);
return (error);
}
- if (vfs_busy(mp, LK_NOWAIT, 0)) {
+ if (vfs_busy(mp, MBF_NOWAIT)) {
vput(vp);
return (EBUSY);
}
@@ -1245,19 +1234,31 @@ dounmount(mp, flags, td)
/* Allow filesystems to detect that a forced unmount is in progress. */
if (flags & MNT_FORCE)
mp->mnt_kern_flag |= MNTK_UNMOUNTF;
- error = lockmgr(&mp->mnt_lock, LK_DRAIN | LK_INTERLOCK |
- ((flags & MNT_FORCE) ? 0 : LK_NOWAIT), MNT_MTX(mp));
- if (error) {
- MNT_ILOCK(mp);
- mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_NOINSMNTQ |
- MNTK_UNMOUNTF);
- if (mp->mnt_kern_flag & MNTK_MWAIT)
- wakeup(mp);
- MNT_IUNLOCK(mp);
- if (coveredvp)
- VOP_UNLOCK(coveredvp, 0);
- return (error);
+ error = 0;
+ if (mp->mnt_lockref) {
+ if (flags & MNT_FORCE) {
+ mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_NOINSMNTQ |
+ MNTK_UNMOUNTF);
+ if (mp->mnt_kern_flag & MNTK_MWAIT) {
+ mp->mnt_kern_flag &= ~MNTK_MWAIT;
+ wakeup(mp);
+ }
+ MNT_IUNLOCK(mp);
+ if (coveredvp)
+ VOP_UNLOCK(coveredvp, 0);
+ return (EBUSY);
+ }
+ mp->mnt_kern_flag |= MNTK_DRAINING;
+ error = msleep(&mp->mnt_lockref, MNT_MTX(mp), PVFS,
+ "mount drain", 0);
}
+ MNT_IUNLOCK(mp);
+ KASSERT(mp->mnt_lockref == 0,
+ ("%s: invalid lock refcount in the drain path @ %s:%d",
+ __func__, __FILE__, __LINE__));
+ KASSERT(error == 0,
+ ("%s: invalid return value for msleep in the drain path @ %s:%d",
+ __func__, __FILE__, __LINE__));
vn_start_write(NULL, &mp, V_WAIT);
if (mp->mnt_flag & MNT_EXPUBLIC)
@@ -1321,9 +1322,10 @@ dounmount(mp, flags, td)
mp->mnt_flag |= async_flag;
if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0)
mp->mnt_kern_flag |= MNTK_ASYNC;
- lockmgr(&mp->mnt_lock, LK_RELEASE, NULL);
- if (mp->mnt_kern_flag & MNTK_MWAIT)
+ if (mp->mnt_kern_flag & MNTK_MWAIT) {
+ mp->mnt_kern_flag &= ~MNTK_MWAIT;
wakeup(mp);
+ }
MNT_IUNLOCK(mp);
if (coveredvp)
VOP_UNLOCK(coveredvp, 0);
@@ -1337,7 +1339,6 @@ dounmount(mp, flags, td)
vput(coveredvp);
}
vfs_event_signal(NULL, VQ_UNMOUNT, 0);
- lockmgr(&mp->mnt_lock, LK_RELEASE, NULL);
vfs_mount_destroy(mp);
return (0);
}
@@ -2070,7 +2071,6 @@ __mnt_vnode_first(struct vnode **mvp, struct mount *mp)
wakeup(&mp->mnt_holdcnt);
return (NULL);
}
- mp->mnt_markercnt++;
(*mvp)->v_mount = mp;
TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
return (vp);
@@ -2093,7 +2093,6 @@ __mnt_vnode_markerfree(struct vnode **mvp, struct mount *mp)
MNT_ILOCK(mp);
*mvp = NULL;
- mp->mnt_markercnt--;
mp->mnt_holdcnt--;
if (mp->mnt_holdcnt == 0 && mp->mnt_holdcntwaiters != 0)
wakeup(&mp->mnt_holdcnt);
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index 5698532..30ee61a 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -335,42 +335,36 @@ SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
/*
* Mark a mount point as busy. Used to synchronize access and to delay
- * unmounting. Interlock is not released on failure.
+ * unmounting. Eventually, mountlist_mtx is not released on failure.
*/
int
-vfs_busy(struct mount *mp, int flags, struct mtx *interlkp)
+vfs_busy(struct mount *mp, int flags)
{
- int lkflags;
+
+ MPASS((flags & ~MBF_MASK) == 0);
MNT_ILOCK(mp);
MNT_REF(mp);
if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
- if (flags & LK_NOWAIT) {
+ if (flags & MBF_NOWAIT) {
MNT_REL(mp);
MNT_IUNLOCK(mp);
return (ENOENT);
}
- if (interlkp)
- mtx_unlock(interlkp);
+ if (flags & MBF_MNTLSTLOCK)
+ mtx_unlock(&mountlist_mtx);
mp->mnt_kern_flag |= MNTK_MWAIT;
- /*
- * Since all busy locks are shared except the exclusive
- * lock granted when unmounting, the only place that a
- * wakeup needs to be done is at the release of the
- * exclusive lock at the end of dounmount.
- */
msleep(mp, MNT_MTX(mp), PVFS, "vfs_busy", 0);
MNT_REL(mp);
MNT_IUNLOCK(mp);
- if (interlkp)
- mtx_lock(interlkp);
+ if (flags & MBF_MNTLSTLOCK)
+ mtx_lock(&mountlist_mtx);
return (ENOENT);
}
- if (interlkp)
- mtx_unlock(interlkp);
- lkflags = LK_SHARED | LK_INTERLOCK | LK_NOWAIT;
- if (lockmgr(&mp->mnt_lock, lkflags, MNT_MTX(mp)))
- panic("vfs_busy: unexpected lock failure");
+ if (flags & MBF_MNTLSTLOCK)
+ mtx_unlock(&mountlist_mtx);
+ mp->mnt_lockref++;
+ MNT_IUNLOCK(mp);
return (0);
}
@@ -381,8 +375,15 @@ void
vfs_unbusy(struct mount *mp)
{
- lockmgr(&mp->mnt_lock, LK_RELEASE, NULL);
- vfs_rel(mp);
+ MNT_ILOCK(mp);
+ MNT_REL(mp);
+ mp->mnt_lockref--;
+ if (mp->mnt_lockref == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
+ MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
+ mp->mnt_kern_flag &= ~MNTK_DRAINING;
+ wakeup(&mp->mnt_lockref);
+ }
+ MNT_IUNLOCK(mp);
}
/*
@@ -747,7 +748,7 @@ vnlru_proc(void)
mtx_lock(&mountlist_mtx);
for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
int vfsunlocked;
- if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx)) {
+ if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
nmp = TAILQ_NEXT(mp, mnt_list);
continue;
}
@@ -2837,7 +2838,6 @@ DB_SHOW_COMMAND(mount, db_show_mount)
db_printf(" mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen);
db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max);
db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed);
- db_printf(" mnt_markercnt = %d\n", mp->mnt_markercnt);
db_printf(" mnt_holdcnt = %d\n", mp->mnt_holdcnt);
db_printf(" mnt_holdcntwaiters = %d\n", mp->mnt_holdcntwaiters);
db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes);
@@ -2999,7 +2999,7 @@ sysctl_vnode(SYSCTL_HANDLER_ARGS)
n = 0;
mtx_lock(&mountlist_mtx);
TAILQ_FOREACH(mp, &mountlist, mnt_list) {
- if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx))
+ if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
continue;
MNT_ILOCK(mp);
TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
@@ -3361,7 +3361,7 @@ sync_fsync(struct vop_fsync_args *ap)
* not already on the sync list.
*/
mtx_lock(&mountlist_mtx);
- if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx) != 0) {
+ if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) != 0) {
mtx_unlock(&mountlist_mtx);
return (0);
}
diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c
index 4d40f35..32dc4d1 100644
--- a/sys/kern/vfs_syscalls.c
+++ b/sys/kern/vfs_syscalls.c
@@ -124,7 +124,7 @@ sync(td, uap)
mtx_lock(&mountlist_mtx);
for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
- if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx)) {
+ if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
nmp = TAILQ_NEXT(mp, mnt_list);
continue;
}
@@ -197,7 +197,7 @@ quotactl(td, uap)
vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
mp = nd.ni_vp->v_mount;
- if ((error = vfs_busy(mp, 0, NULL))) {
+ if ((error = vfs_busy(mp, 0))) {
vrele(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
@@ -479,7 +479,7 @@ kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize,
continue;
}
#endif
- if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx)) {
+ if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
nmp = TAILQ_NEXT(mp, mnt_list);
continue;
}
@@ -741,7 +741,7 @@ fchdir(td, uap)
error = change_dir(vp, td);
while (!error && (mp = vp->v_mountedhere) != NULL) {
int tvfslocked;
- if (vfs_busy(mp, 0, 0))
+ if (vfs_busy(mp, 0))
continue;
tvfslocked = VFS_LOCK_GIANT(mp);
error = VFS_ROOT(mp, LK_EXCLUSIVE, &tdp, td);
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
index ef22413..b272a94 100644
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -957,9 +957,18 @@ vn_start_write(vp, mpp, flags)
}
if ((mp = *mpp) == NULL)
return (0);
+
+ /*
+ * VOP_GETWRITEMOUNT() returns with the mp refcount held through
+ * a vfs_ref().
+ * As long as a vnode is not provided we need to acquire a
+ * refcount for the provided mountpoint too, in order to
+ * emulate a vfs_ref().
+ */
MNT_ILOCK(mp);
if (vp == NULL)
MNT_REF(mp);
+
/*
* Check on status of suspension.
*/
@@ -1021,6 +1030,14 @@ vn_start_secondary_write(vp, mpp, flags)
*/
if ((mp = *mpp) == NULL)
return (0);
+
+ /*
+ * VOP_GETWRITEMOUNT() returns with the mp refcount held through
+ * a vfs_ref().
+ * As long as a vnode is not provided we need to acquire a
+ * refcount for the provided mountpoint too, in order to
+ * emulate a vfs_ref().
+ */
MNT_ILOCK(mp);
if (vp == NULL)
MNT_REF(mp);
OpenPOWER on IntegriCloud