Diffstat (limited to 'sys/fs/nullfs/null_vnops.c')
-rw-r--r-- | sys/fs/nullfs/null_vnops.c | 293
1 file changed, 228 insertions, 65 deletions
diff --git a/sys/fs/nullfs/null_vnops.c b/sys/fs/nullfs/null_vnops.c
index 5692df6..0b7cb96 100644
--- a/sys/fs/nullfs/null_vnops.c
+++ b/sys/fs/nullfs/null_vnops.c
@@ -176,6 +176,8 @@
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
+#include <sys/conf.h>
+#include <sys/proc.h>
 #include <sys/sysctl.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
@@ -183,13 +185,22 @@
 #include <sys/malloc.h>
 #include <miscfs/nullfs/null.h>
 
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_object.h>
+#include <vm/vnode_pager.h>
+
 static int null_bug_bypass = 0;   /* for debugging: enables bypass printf'ing */
 SYSCTL_INT(_debug, OID_AUTO, nullfs_bug_bypass, CTLFLAG_RW,
 	&null_bug_bypass, 0, "");
 
 static int	null_access(struct vop_access_args *ap);
+static int	null_createvobject(struct vop_createvobject_args *ap);
+static int	null_destroyvobject(struct vop_destroyvobject_args *ap);
 static int	null_getattr(struct vop_getattr_args *ap);
+static int	null_getvobject(struct vop_getvobject_args *ap);
 static int	null_inactive(struct vop_inactive_args *ap);
+static int	null_islocked(struct vop_islocked_args *ap);
 static int	null_lock(struct vop_lock_args *ap);
 static int	null_lookup(struct vop_lookup_args *ap);
 static int	null_open(struct vop_open_args *ap);
@@ -277,7 +288,7 @@
 		 * of vrele'ing their vp's.  We must account for
 		 * that.  (This should go away in the future.)
 		 */
-		if (reles & 1)
+		if (reles & VDESC_VP0_WILLRELE)
 			VREF(*this_vp_p);
 	}
 
@@ -287,7 +298,12 @@
 	 * Call the operation on the lower layer
 	 * with the modified argument structure.
 	 */
-	error = VCALL(*(vps_p[0]), descp->vdesc_offset, ap);
+	if (vps_p[0] && *vps_p[0])
+		error = VCALL(*(vps_p[0]), descp->vdesc_offset, ap);
+	else {
+		printf("null_bypass: no map for %s\n", descp->vdesc_name);
+		error = EINVAL;
+	}
 
 	/*
 	 * Maintain the illusion of call-by-value
@@ -300,7 +316,11 @@
 			break;   /* bail out at end of list */
 		if (old_vps[i]) {
 			*(vps_p[i]) = old_vps[i];
-			if (reles & 1)
+#if 0
+			if (reles & VDESC_VP0_WILLUNLOCK)
+				VOP_UNLOCK(*(vps_p[i]), LK_THISLAYER, curproc);
+#endif
+			if (reles & VDESC_VP0_WILLRELE)
 				vrele(*(vps_p[i]));
 		}
 	}
@@ -345,44 +365,43 @@
 	} */ *ap;
 {
 	struct componentname *cnp = ap->a_cnp;
+	struct vnode *dvp = ap->a_dvp;
 	struct proc *p = cnp->cn_proc;
 	int flags = cnp->cn_flags;
-	struct vop_lock_args lockargs;
-	struct vop_unlock_args unlockargs;
-	struct vnode *dvp, *vp;
+	struct vnode *vp, *ldvp, *lvp;
 	int error;
 
-	if ((flags & ISLASTCN) && (ap->a_dvp->v_mount->mnt_flag & MNT_RDONLY) &&
+	if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
 		return (EROFS);
-	error = null_bypass((struct vop_generic_args *)ap);
+	/*
+	 * Although it is possible to call null_bypass(), we'll do
+	 * a direct call to reduce overhead
+	 */
+	ldvp = NULLVPTOLOWERVP(dvp);
+	vp = lvp = NULL;
+	error = VOP_LOOKUP(ldvp, &lvp, cnp);
 	if (error == EJUSTRETURN && (flags & ISLASTCN) &&
-	    (ap->a_dvp->v_mount->mnt_flag & MNT_RDONLY) &&
+	    (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
 	    (cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME))
 		error = EROFS;
+
 	/*
-	 * We must do the same locking and unlocking at this layer as
-	 * is done in the layers below us. We could figure this out
-	 * based on the error return and the LASTCN, LOCKPARENT, and
-	 * LOCKLEAF flags. However, it is more expidient to just find
-	 * out the state of the lower level vnodes and set ours to the
-	 * same state.
+	 * Rely only on the PDIRUNLOCK flag which should be carefully
+	 * tracked by underlying filesystem.
 	 */
-	dvp = ap->a_dvp;
-	vp = *ap->a_vpp;
-	if (dvp == vp)
-		return (error);
-	if (!VOP_ISLOCKED(dvp, NULL)) {
-		unlockargs.a_vp = dvp;
-		unlockargs.a_flags = 0;
-		unlockargs.a_p = p;
-		vop_nounlock(&unlockargs);
-	}
-	if (vp != NULLVP && VOP_ISLOCKED(vp, NULL)) {
-		lockargs.a_vp = vp;
-		lockargs.a_flags = LK_SHARED;
-		lockargs.a_p = p;
-		vop_nolock(&lockargs);
+	if (cnp->cn_flags & PDIRUNLOCK)
+		VOP_UNLOCK(dvp, LK_THISLAYER, p);
+	if ((error == 0 || error == EJUSTRETURN) && lvp != NULL) {
+		if (ldvp == lvp) {
+			*ap->a_vpp = dvp;
+			VREF(dvp);
+			vrele(lvp);
+		} else {
+			error = null_node_create(dvp->v_mount, lvp, &vp);
+			if (error == 0)
+				*ap->a_vpp = vp;
+		}
 	}
 	return (error);
 }
@@ -430,6 +449,7 @@
 			return (EROFS);
 		}
 	}
+
 	return (null_bypass((struct vop_generic_args *)ap));
 }
 
@@ -454,6 +474,9 @@
 	return (0);
 }
 
+/*
+ * Handle to disallow write access if mounted read-only.
+ */
 static int
 null_access(ap)
 	struct vop_access_args /* {
@@ -559,12 +582,62 @@
 	struct proc *a_p;
 	} */ *ap;
 {
+	struct vnode *vp = ap->a_vp;
+	int flags = ap->a_flags;
+	struct proc *p = ap->a_p;
+	struct vnode *lvp;
+	int error;
 
-	vop_nolock(ap);
-	if ((ap->a_flags & LK_TYPE_MASK) == LK_DRAIN)
-		return (0);
-	ap->a_flags &= ~LK_INTERLOCK;
-	return (null_bypass((struct vop_generic_args *)ap));
+	if (flags & LK_THISLAYER) {
+		if (vp->v_vnlock != NULL)
+			return 0;	/* lock is shared across layers */
+		error = lockmgr(&vp->v_lock, flags & ~LK_THISLAYER,
+		    &vp->v_interlock, p);
+		return (error);
+	}
+
+	if (vp->v_vnlock != NULL) {
+		/*
+		 * The lower level has exported a struct lock to us. Use
+		 * it so that all vnodes in the stack lock and unlock
+		 * simultaneously. Note: we don't DRAIN the lock as DRAIN
+		 * decommissions the lock - just because our vnode is
+		 * going away doesn't mean the struct lock below us is.
+		 * LK_EXCLUSIVE is fine.
+		 */
+		if ((flags & LK_TYPE_MASK) == LK_DRAIN) {
+			NULLFSDEBUG("null_lock: avoiding LK_DRAIN\n");
+			return(lockmgr(vp->v_vnlock,
+				(flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE,
+				&vp->v_interlock, p));
+		}
+		return(lockmgr(vp->v_vnlock, flags, &vp->v_interlock, p));
+	} else {
+		/*
+		 * To prevent race conditions involving doing a lookup
+		 * on "..", we have to lock the lower node, then lock our
+		 * node. Most of the time it won't matter that we lock our
+		 * node (as any locking would need the lower one locked
+		 * first). But we can LK_DRAIN the upper lock as a step
+		 * towards decommissioning it.
+		 */
+		lvp = NULLVPTOLOWERVP(vp);
+		if (flags & LK_INTERLOCK) {
+			simple_unlock(&vp->v_interlock);
+			flags &= ~LK_INTERLOCK;
+		}
+		if ((flags & LK_TYPE_MASK) == LK_DRAIN) {
+			error = VOP_LOCK(lvp,
+				(flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE, p);
+		} else
+			error = VOP_LOCK(lvp, flags, p);
+		if (error)
+			return (error);
+		error = lockmgr(&vp->v_lock, flags, &vp->v_interlock, p);
+		if (error)
+			VOP_UNLOCK(lvp, 0, p);
+		return (error);
+	}
 }
 
 /*
@@ -580,11 +653,46 @@
 	struct proc *a_p;
 	} */ *ap;
 {
-	vop_nounlock(ap);
-	ap->a_flags &= ~LK_INTERLOCK;
-	return (null_bypass((struct vop_generic_args *)ap));
+	struct vnode *vp = ap->a_vp;
+	int flags = ap->a_flags;
+	struct proc *p = ap->a_p;
+
+	if (vp->v_vnlock != NULL) {
+		if (flags & LK_THISLAYER)
+			return 0;	/* the lock is shared across layers */
+		flags &= ~LK_THISLAYER;
+		return (lockmgr(vp->v_vnlock, flags | LK_RELEASE,
+			&vp->v_interlock, p));
+	}
+	if ((flags & LK_THISLAYER) == 0) {
+		if (flags & LK_INTERLOCK)
+			simple_unlock(&vp->v_interlock);
+		VOP_UNLOCK(NULLVPTOLOWERVP(vp), flags & ~LK_INTERLOCK, p);
+	} else
+		flags &= ~LK_THISLAYER;
+	return (lockmgr(&vp->v_lock, flags | LK_RELEASE, &vp->v_interlock, p));
+}
+
+static int
+null_islocked(ap)
+	struct vop_islocked_args /* {
+		struct vnode *a_vp;
+		struct proc *a_p;
+	} */ *ap;
+{
+	struct vnode *vp = ap->a_vp;
+	struct proc *p = ap->a_p;
+
+	if (vp->v_vnlock != NULL)
+		return (lockstatus(vp->v_vnlock, p));
+	return (lockstatus(&vp->v_lock, p));
 }
 
+/*
+ * There is no way to tell that someone issued remove/rmdir operation
+ * on the underlying filesystem. For now we just have to release lowervp
+ * as soon as possible.
+ */
 static int
 null_inactive(ap)
 	struct vop_inactive_args /* {
@@ -593,27 +701,34 @@
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
+	struct proc *p = ap->a_p;
 	struct null_node *xp = VTONULL(vp);
 	struct vnode *lowervp = xp->null_lowervp;
+
+	lockmgr(&null_hashlock, LK_EXCLUSIVE, NULL, p);
+	LIST_REMOVE(xp, null_hash);
+	lockmgr(&null_hashlock, LK_RELEASE, NULL, p);
+
+	xp->null_lowervp = NULLVP;
+	if (vp->v_vnlock != NULL) {
+		vp->v_vnlock = &vp->v_lock;	/* we no longer share the lock */
+	} else
+		VOP_UNLOCK(vp, LK_THISLAYER, p);
+
+	vput(lowervp);
 	/*
-	 * Do nothing (and _don't_ bypass).
-	 * Wait to vrele lowervp until reclaim,
-	 * so that until then our null_node is in the
-	 * cache and reusable.
-	 * We still have to tell the lower layer the vnode
-	 * is now inactive though.
-	 *
-	 * NEEDSWORK: Someday, consider inactive'ing
-	 * the lowervp and then trying to reactivate it
-	 * with capabilities (v_id)
-	 * like they do in the name lookup cache code.
-	 * That's too much work for now.
+	 * Now it is safe to drop references to the lower vnode.
+	 * VOP_INACTIVE() will be called by vrele() if necessary.
 	 */
-	VOP_INACTIVE(lowervp, ap->a_p);
-	VOP_UNLOCK(ap->a_vp, 0, ap->a_p);
+	vrele (lowervp);
+
 	return (0);
 }
 
+/*
+ * We can free memory in null_inactive, but we do this
+ * here. (Possible to guard vp->v_data to point somewhere)
+ */
 static int
 null_reclaim(ap)
 	struct vop_reclaim_args /* {
@@ -622,21 +737,11 @@
 	} */ *ap;
{
 	struct vnode *vp = ap->a_vp;
-	struct null_node *xp = VTONULL(vp);
-	struct vnode *lowervp = xp->null_lowervp;
+	void *vdata = vp->v_data;
 
-	/*
-	 * Note: in vop_reclaim, vp->v_op == dead_vnodeop_p,
-	 * so we can't call VOPs on ourself.
-	 */
-	/* After this assignment, this node will not be re-used. */
-	xp->null_lowervp = NULLVP;
-	lockmgr(&null_hashlock, LK_EXCLUSIVE, NULL, ap->a_p);
-	LIST_REMOVE(xp, null_hash);
-	lockmgr(&null_hashlock, LK_RELEASE, NULL, ap->a_p);
-	FREE(vp->v_data, M_TEMP);
 	vp->v_data = NULL;
-	vrele (lowervp);
+	FREE(vdata, M_NULLFSNODE);
+
 	return (0);
 }
 
@@ -652,16 +757,74 @@
 }
 
 /*
+ * Let an underlying filesystem do the work
+ */
+static int
+null_createvobject(ap)
+	struct vop_createvobject_args /* {
+		struct vnode *vp;
+		struct ucred *cred;
+		struct proc *p;
+	} */ *ap;
+{
+	struct vnode *vp = ap->a_vp;
+	struct vnode *lowervp = VTONULL(vp) ? NULLVPTOLOWERVP(vp) : NULL;
+	int error;
+
+	if (vp->v_type == VNON || lowervp == NULL)
+		return 0;
+	error = VOP_CREATEVOBJECT(lowervp, ap->a_cred, ap->a_p);
+	if (error)
+		return (error);
+	vp->v_flag |= VOBJBUF;
+	return (0);
+}
+
+/*
+ * We have nothing to destroy and this operation shouldn't be bypassed.
+ */
+static int
+null_destroyvobject(ap)
+	struct vop_destroyvobject_args /* {
+		struct vnode *vp;
+	} */ *ap;
+{
+	struct vnode *vp = ap->a_vp;
+
+	vp->v_flag &= ~VOBJBUF;
+	return (0);
+}
+
+static int
+null_getvobject(ap)
+	struct vop_getvobject_args /* {
+		struct vnode *vp;
+		struct vm_object **objpp;
+	} */ *ap;
+{
+	struct vnode *lvp = NULLVPTOLOWERVP(ap->a_vp);
+
+	if (lvp == NULL)
+		return EINVAL;
+	return (VOP_GETVOBJECT(lvp, ap->a_objpp));
+}
+
+/*
  * Global vfs data structures
  */
 vop_t **null_vnodeop_p;
 static struct vnodeopv_entry_desc null_vnodeop_entries[] = {
 	{ &vop_default_desc,		(vop_t *) null_bypass },
+	{ &vop_access_desc,		(vop_t *) null_access },
 	{ &vop_bmap_desc,		(vop_t *) vop_eopnotsupp },
+	{ &vop_createvobject_desc,	(vop_t *) null_createvobject },
+	{ &vop_destroyvobject_desc,	(vop_t *) null_destroyvobject },
 	{ &vop_getattr_desc,		(vop_t *) null_getattr },
+	{ &vop_getvobject_desc,		(vop_t *) null_getvobject },
 	{ &vop_getwritemount_desc,	(vop_t *) vop_stdgetwritemount},
 	{ &vop_inactive_desc,		(vop_t *) null_inactive },
+	{ &vop_islocked_desc,		(vop_t *) null_islocked },
 	{ &vop_lock_desc,		(vop_t *) null_lock },
 	{ &vop_lookup_desc,		(vop_t *) null_lookup },
 	{ &vop_open_desc,		(vop_t *) null_open },
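The null_bypass() hunks above replace the magic constant 1 with VDESC_VP0_WILLRELE when walking a descriptor's vnode arguments. The sketch below isolates that scan pattern; it assumes the 4.x-era vnode_if descriptor layout (vdesc_vp_offsets[], VDESC_NO_OFFSET, VDESC_MAX_VPS and VOPARG_OFFSETTO() from <sys/vnode.h>) and the helper name is illustrative, not part of null_vnops.c.

/*
 * Sketch of the WILLRELE scan performed by a bypass routine;
 * illustrative only, not a drop-in replacement for null_bypass().
 */
static void
bypass_ref_willrele(struct vnodeop_desc *descp, struct vop_generic_args *ap)
{
	struct vnode **vp_p;
	int i, reles;

	reles = descp->vdesc_flags;
	for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) {
		if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET)
			break;	/* bail out at end of vnode list */
		vp_p = VOPARG_OFFSETTO(struct vnode **,
		    descp->vdesc_vp_offsets[i], ap);
		/*
		 * After i right-shifts, testing VDESC_VP0_WILLRELE
		 * (bit 0) asks whether argument slot i is WILLRELE:
		 * the called op will vrele() the vnode, so take an
		 * extra reference to keep it alive across the call.
		 */
		if (reles & VDESC_VP0_WILLRELE)
			VREF(*vp_p);
	}
}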
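The rewritten null_lock()/null_unlock() hinge on vp->v_vnlock: when the lower filesystem exports its lock, the null vnode points v_vnlock at it, and a single lockmgr lock then serializes every layer of the stack. A minimal userland sketch of that sharing idea, using pthreads in place of lockmgr (all names here are illustrative, not kernel API):

#include <pthread.h>

/*
 * Two stacked "vnodes"; the upper one shares the lower one's lock,
 * the way nullfs shares the lower vnode's v_vnlock after this diff.
 */
struct xlock { pthread_mutex_t mtx; };

struct xvnode {
	struct xlock	 v_lock;	/* this layer's private lock */
	struct xlock	*v_vnlock;	/* shared lock, or NULL if private */
};

static struct xlock *
xvn_lockof(struct xvnode *vp)
{
	/* Use the exported lock when one exists, else our own. */
	return (vp->v_vnlock != NULL ? vp->v_vnlock : &vp->v_lock);
}

int
main(void)
{
	struct xvnode lower, upper;

	pthread_mutex_init(&lower.v_lock.mtx, NULL);
	pthread_mutex_init(&upper.v_lock.mtx, NULL);
	lower.v_vnlock = NULL;			/* bottom of the stack */
	upper.v_vnlock = &lower.v_lock;		/* share the lower lock */

	pthread_mutex_lock(&xvn_lockof(&upper)->mtx);
	/* The lower vnode is now locked too: one lock, whole stack. */
	pthread_mutex_unlock(&xvn_lockof(&lower)->mtx);
	return (0);
}

Locking through the upper layer and unlocking through the lower one works precisely because both resolve to the same object, which is what lets the stacked vnodes "lock and unlock simultaneously", as the comment in null_lock() puts it.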
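To exercise these paths, the layer is mounted over an existing directory. A hedged userland sketch under 4.x-era assumptions: struct null_args is copied here to stay self-contained (it lives in <miscfs/nullfs/null.h>), and the "null" type string and both paths are illustrative.

#include <sys/param.h>
#include <sys/mount.h>
#include <err.h>

/* Mirrors struct null_args from <miscfs/nullfs/null.h>. */
struct null_args {
	char	*target;		/* lower (shadowed) directory */
};

int
main(void)
{
	struct null_args args;

	args.target = "/usr/src";	/* the tree to mirror */
	/* "null" is assumed to be the vfc_name nullfs registers. */
	if (mount("null", "/mnt/src", MNT_RDONLY, &args) == -1)
		err(1, "mount");
	return (0);
}

mount_null(8) does essentially this; a read-only mount is also a quick way to hit the EROFS cases in null_lookup() and null_setattr() shown above.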