Asynchronously release vnodes to avoid blocking on range locks when calling back in to zfs.

This is based on a fix that went into OpenSolaris on March 9th. However, it uses a dedicated thread instead of a Solaris taskq to avoid doing a blocking memory allocation with the vnode interlock held. This fixes a long-standing deadlock in ZFS. This is not, strictly speaking, an LOR. The spa_zio thread releases a vnode; this calls in to vn_reclaim, which in turn needs to acquire range locks to sync dirty data out to disk. The range locks are already held by a user-level process that is waiting on a condition variable which a spa_zio thread is supposed to signal. The process could not be signalled because the spa_zio thread could not proceed. The nature of this problem was not apparent due to ZFS locks opting out of witness, which meant that DDB did not know about the locks that were held by ZFS. Reviewed by: pjd MFC after: 7 days
author: kmacy <kmacy@FreeBSD.org> 2009-05-07 20:28:06 +0000
committer: kmacy <kmacy@FreeBSD.org> 2009-05-07 20:28:06 +0000
commit: 54e76e600e5758a0c21cdeeca3790d960269bd4d (patch)
tree: 1b45eff9b57ba71565020f61aa08a49d943f2ad1 /sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c
parent: 35d0919f73b2c1f73aa5af18b5242d425502e00e (diff)
download: FreeBSD-src-54e76e600e5758a0c21cdeeca3790d960269bd4d.zip
FreeBSD-src-54e76e600e5758a0c21cdeeca3790d960269bd4d.tar.gz
1 files changed, 137 insertions, 0 deletions
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c
index 00a10aa..bf613e5 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c
@@ -41,6 +41,7 @@
 
 #include <sys/types.h>
 #include <sys/param.h>
+#include <sys/proc.h>
 #include <sys/vnode.h>
 
 /* Extensible attribute (xva) routines. */
@@ -72,3 +73,139 @@ xva_getxoptattr(xvattr_t *xvap)
 		xoap = &xvap->xva_xoptattrs;
 	return (xoap);
 }
+
+/* Queue of vnodes handed to the cleaner thread for a deferred vrele(). */
+static STAILQ_HEAD(, vnode) vn_rele_async_list;
+/* Held while the queue and vn_rele_list_length are modified. */
+static struct mtx vn_rele_async_lock;
+/*
+ * Signalled by producers on every 100th queued vnode and by
+ * vn_rele_async_fini(); broadcast by the cleaner just before it exits.
+ */
+static struct cv vn_rele_async_cv;
+/* Number of vnodes queued since the cleaner last emptied the queue. */
+static int vn_rele_list_length;
+/*
+ * Shutdown handshake flag: set to 1 by vn_rele_async_fini(), cleared back
+ * to 0 by the cleaner thread once it has left its main loop.
+ */
+static int vn_rele_async_thread_exit;
+
+/*
+ * Link overlay: the queue code casts the address of vp->v_cstart to this
+ * type and uses that storage as the STAILQ "next" pointer while the vnode
+ * is queued.
+ */
+typedef struct  {
+	struct vnode *stqe_next;
+} vnode_link_t;
+
+/*
+ * Like vn_rele() except that, when this is the last use reference (the
+ * case in which releasing would call VOP_INACTIVE()), the vnode is queued
+ * for the "vaclean" cleaner thread, which performs the vrele() later.
+ * This can avoid deadlocks caused by re-entering the file system as a
+ * result of releasing the vnode. Note, file systems already have to handle
+ * the race where the vnode is incremented before the inactive routine is
+ * called and does its locking.
+ *
+ * The taskq argument exists only for interface compatibility and is unused.
+ *
+ * Warning: the release is not prompt. The cleaner is only signalled on
+ * every 100th queued vnode and otherwise polls on a hz/10 timeout.
+ */
+void
+vn_rele_async(vnode_t *vp, taskq_t *taskq /* unused */)
+{
+
+	KASSERT(vp != NULL, ("vrele: null vp"));
+	VFS_ASSERT_GIANT(vp->v_mount);
+	VI_LOCK(vp);
+
+	/*
+	 * Not the last use reference, or inactivation is already in
+	 * progress: just drop the use count.  vdropl() is expected to
+	 * release the interlock (this path never unlocks it itself).
+	 */
+	if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
+	    vp->v_usecount == 1)) {
+		vp->v_usecount--;
+		vdropl(vp);
+		return;
+	}
+	if (vp->v_usecount != 1) {
+#ifdef DIAGNOSTIC
+		vprint("vrele: negative ref count", vp);
+#endif
+		VI_UNLOCK(vp);
+		panic("vrele: negative ref cnt");
+	}
+
+	mtx_lock(&vn_rele_async_lock);
+
+	/*
+	 * The cleaner is shutting down, so release synchronously.  The flag
+	 * is tested under vn_rele_async_lock, the lock under which it is
+	 * written.  Both locks must be dropped first: vrele() acquires the
+	 * vnode interlock itself, so calling it with the interlock held
+	 * would recurse on it.
+	 */
+	if (vn_rele_async_thread_exit != 0) {
+		mtx_unlock(&vn_rele_async_lock);
+		VI_UNLOCK(vp);
+		vrele(vp);
+		return;
+	}
+
+	/*
+	 * Open-coded STAILQ_INSERT_TAIL: the link lives in the storage at
+	 * vp->v_cstart (see vnode_link_t).
+	 */
+	(*(vnode_link_t *)&vp->v_cstart).stqe_next = NULL;
+	*vn_rele_async_list.stqh_last = vp;
+	vn_rele_async_list.stqh_last =
+	    &((vnode_link_t *)&vp->v_cstart)->stqe_next;
+
+	/* Wake the cleaner only on every 100th entry; it polls otherwise. */
+	vn_rele_list_length++;
+	if ((vn_rele_list_length % 100) == 0)
+		cv_signal(&vn_rele_async_cv);
+	mtx_unlock(&vn_rele_async_lock);
+	VI_UNLOCK(vp);
+}
+
+/*
+ * SYSINIT hook: set up the deferred-release queue together with the mutex
+ * and condition variable used by vn_rele_async() and the cleaner thread.
+ * The argument is ignored.
+ */
+static void
+vn_rele_async_init(void *arg)
+{
+
+	STAILQ_INIT(&vn_rele_async_list);
+	mtx_init(&vn_rele_async_lock, "valock", NULL, MTX_DEF);
+
+	/*
+	 * The condition variable is initialised field by field here instead
+	 * of through cv_init(&vn_rele_async_cv, "vacv").
+	 */
+	vn_rele_async_cv.cv_waiters = 0;
+	vn_rele_async_cv.cv_description = "vacv";
+}
+
+/*
+ * Ask the cleaner thread to exit, wait until it acknowledges, then destroy
+ * vn_rele_async_lock.
+ *
+ * The handshake uses vn_rele_async_thread_exit: this function sets it to 1
+ * and the cleaner clears it back to 0 (and broadcasts on the condition
+ * variable) once it has left its main loop.
+ */
+void
+vn_rele_async_fini(void)
+{
+
+	mtx_lock(&vn_rele_async_lock);
+	/* Request shutdown and wake the cleaner if it is sleeping. */
+	vn_rele_async_thread_exit = 1;
+	cv_signal(&vn_rele_async_cv);
+	/* Sleep until the cleaner has reset the flag to acknowledge. */
+	while (vn_rele_async_thread_exit != 0)
+		cv_wait(&vn_rele_async_cv, &vn_rele_async_lock);
+	mtx_unlock(&vn_rele_async_lock);
+	/* NOTE(review): assumes nobody calls vn_rele_async() after this point. */
+	mtx_destroy(&vn_rele_async_lock);
+}
+
+
+/*
+ * Body of the "vaclean" kernel process.
+ *
+ * Repeatedly moves everything queued by vn_rele_async() onto a private
+ * list, drops vn_rele_async_lock and calls vrele() on each vnode, so the
+ * file system is never re-entered with the queue lock held.  When there
+ * is nothing to do it sleeps on the condition variable with a hz/10
+ * timeout.
+ *
+ * On shutdown (vn_rele_async_thread_exit set by vn_rele_async_fini()) the
+ * queue is drained one last time before the request is acknowledged, so
+ * vnodes queued since the previous pass are not left unreleased.
+ */
+static void
+vn_rele_async_cleaner(void)
+{
+	STAILQ_HEAD(, vnode) vn_tmp_list;
+	struct vnode *curvnode;
+
+	STAILQ_INIT(&vn_tmp_list);
+	mtx_lock(&vn_rele_async_lock);
+	for (;;) {
+		/* Take over the whole queue in one step. */
+		STAILQ_CONCAT(&vn_tmp_list, &vn_rele_async_list);
+		vn_rele_list_length = 0;
+		mtx_unlock(&vn_rele_async_lock);
+
+		while (!STAILQ_EMPTY(&vn_tmp_list)) {
+			curvnode = STAILQ_FIRST(&vn_tmp_list);
+
+			/*
+			 * Open-coded STAILQ_REMOVE_HEAD: the link lives in
+			 * the storage at v_cstart (see vnode_link_t).
+			 */
+			STAILQ_FIRST(&vn_tmp_list) =
+			    ((vnode_link_t *)&curvnode->v_cstart)->stqe_next;
+			if (STAILQ_FIRST(&vn_tmp_list) == NULL)
+				vn_tmp_list.stqh_last =
+				    &STAILQ_FIRST(&vn_tmp_list);
+			vrele(curvnode);
+		}
+		mtx_lock(&vn_rele_async_lock);
+
+		/*
+		 * Shutdown requested: leave only once the shared queue is
+		 * empty, otherwise go round again to release what was
+		 * queued while the lock was dropped.  Testing the flag
+		 * before sleeping also means an exit request made during
+		 * the drain is honoured without waiting for the timeout.
+		 */
+		if (vn_rele_async_thread_exit != 0) {
+			if (STAILQ_EMPTY(&vn_rele_async_list))
+				break;
+			continue;
+		}
+		if (vn_rele_list_length == 0)
+			cv_timedwait(&vn_rele_async_cv, &vn_rele_async_lock,
+			    hz/10);
+	}
+
+	/* Acknowledge the shutdown request to vn_rele_async_fini(). */
+	vn_rele_async_thread_exit = 0;
+	cv_broadcast(&vn_rele_async_cv);
+	mtx_unlock(&vn_rele_async_lock);
+	thread_exit();
+}
+
+/* Filled in by kproc_start() with the process running the cleaner. */
+static struct proc *vn_rele_async_proc;
+/* Descriptor for the "vaclean" kernel process running vn_rele_async_cleaner(). */
+static struct kproc_desc up_kp = {
+	"vaclean",
+	vn_rele_async_cleaner,
+	&vn_rele_async_proc
+};
+/*
+ * Start the cleaner process and initialise the queue state at boot.
+ * NOTE(review): the cleaner uses the lock and queue set up by
+ * vn_rele_async_init(); confirm SI_SUB_VFS is ordered before
+ * SI_SUB_KTHREAD_UPDATE so the init hook runs first.
+ */
+SYSINIT(vaclean, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp);
+SYSINIT(vn_rele_async_setup, SI_SUB_VFS, SI_ORDER_FIRST, vn_rele_async_init, NULL);
author	kmacy <kmacy@FreeBSD.org>	2009-05-07 20:28:06 +0000
committer	kmacy <kmacy@FreeBSD.org>	2009-05-07 20:28:06 +0000
commit	54e76e600e5758a0c21cdeeca3790d960269bd4d (patch)
tree	1b45eff9b57ba71565020f61aa08a49d943f2ad1 /sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c
parent	35d0919f73b2c1f73aa5af18b5242d425502e00e (diff)
download	FreeBSD-src-54e76e600e5758a0c21cdeeca3790d960269bd4d.zip FreeBSD-src-54e76e600e5758a0c21cdeeca3790d960269bd4d.tar.gz