summaryrefslogtreecommitdiffstats
path: root/sys/nfsclient
diff options
context:
space:
mode:
authorps <ps@FreeBSD.org>2004-12-06 21:11:15 +0000
committerps <ps@FreeBSD.org>2004-12-06 21:11:15 +0000
commiteeccf3813d92c16c33f1985fb7edf26c7c20c808 (patch)
tree83e874ad8aa3b9b3120558628b84b3010046974d /sys/nfsclient
parent2b5157cd0d38228760370d2fcb197f915ce3b93d (diff)
downloadFreeBSD-src-eeccf3813d92c16c33f1985fb7edf26c7c20c808.zip
FreeBSD-src-eeccf3813d92c16c33f1985fb7edf26c7c20c808.tar.gz
Rewrite of the NFS client's reply handling. We now have NFS socket
upcalls which do RPC header parsing and match up the reply with the request. NFS calls now sleep on the nfsreq structure. This enables us to eliminate the NFS recvlock. Submitted by: Mohan Srinivasan mohans at yahoo-inc dot com
Diffstat (limited to 'sys/nfsclient')
-rw-r--r--sys/nfsclient/nfs.h22
-rw-r--r--sys/nfsclient/nfs_bio.c10
-rw-r--r--sys/nfsclient/nfs_socket.c931
-rw-r--r--sys/nfsclient/nfs_subs.c4
-rw-r--r--sys/nfsclient/nfs_vfsops.c9
-rw-r--r--sys/nfsclient/nfs_vnops.c1
-rw-r--r--sys/nfsclient/nfsm_subs.h3
-rw-r--r--sys/nfsclient/nfsmount.h9
8 files changed, 571 insertions, 418 deletions
diff --git a/sys/nfsclient/nfs.h b/sys/nfsclient/nfs.h
index 648120b..4743ea2 100644
--- a/sys/nfsclient/nfs.h
+++ b/sys/nfsclient/nfs.h
@@ -90,8 +90,6 @@
#define NFSSTA_GOTFSINFO 0x00100000 /* Got the V3 fsinfo */
#define NFSSTA_SNDLOCK 0x01000000 /* Send socket lock */
#define NFSSTA_WANTSND 0x02000000 /* Want above */
-#define NFSSTA_RCVLOCK 0x04000000 /* Rcv socket lock */
-#define NFSSTA_WANTRCV 0x08000000 /* Want above */
#define NFSSTA_TIMEO 0x10000000 /* Experiencing a timeout */
@@ -151,19 +149,6 @@ struct vattr;
struct nameidata;
/*
- * The set of signals that interrupt an I/O in progress for NFSMNT_INT mounts.
- * What should be in this set is open to debate, but I believe that since
- * I/O system calls on ufs are never interrupted by signals the set should
- * be minimal. My reasoning is that many current programs that use signals
- * such as SIGALRM will not expect file I/O system calls to be interrupted
- * by them and break.
- */
-#define NFSINT_SIGMASK(set) \
- (SIGISMEMBER(set, SIGINT) || SIGISMEMBER(set, SIGTERM) || \
- SIGISMEMBER(set, SIGHUP) || SIGISMEMBER(set, SIGKILL) || \
- SIGISMEMBER(set, SIGQUIT))
-
-/*
* Socket errors ignored for connectionless sockets??
* For now, ignore them all
*/
@@ -321,6 +306,13 @@ int nfs_fsinfo(struct nfsmount *, struct vnode *, struct ucred *,
int nfs_meta_setsize (struct vnode *, struct ucred *,
struct thread *, u_quad_t);
+void nfs_set_sigmask __P((struct thread *td, sigset_t *oldset));
+void nfs_restore_sigmask __P((struct thread *td, sigset_t *set));
+int nfs_tsleep __P((struct thread *td, void *ident, int priority, char *wmesg,
+ int timo));
+int nfs_msleep __P((struct thread *td, void *ident, struct mtx *mtx, int priority,
+ char *wmesg, int timo));
+
#endif /* _KERNEL */
#endif
diff --git a/sys/nfsclient/nfs_bio.c b/sys/nfsclient/nfs_bio.c
index bc2cf53..9e9af49 100644
--- a/sys/nfsclient/nfs_bio.c
+++ b/sys/nfsclient/nfs_bio.c
@@ -1036,7 +1036,11 @@ nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size, struct thread *td)
nmp = VFSTONFS(mp);
if (nmp->nm_flag & NFSMNT_INT) {
+ sigset_t oldset;
+
+ nfs_set_sigmask(td, &oldset);
bp = getblk(vp, bn, size, PCATCH, 0, 0);
+ nfs_restore_sigmask(td, &oldset);
while (bp == NULL) {
if (nfs_sigintr(nmp, NULL, td))
return (NULL);
@@ -1208,8 +1212,8 @@ again:
NFS_DPF(ASYNCIO,
("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
nmp->nm_bufqwant = TRUE;
- error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
- "nfsaio", slptimeo);
+ error = nfs_tsleep(td, &nmp->nm_bufq, slpflag | PRIBIO,
+ "nfsaio", slptimeo);
if (error) {
error2 = nfs_sigintr(nmp, NULL, td);
if (error2)
@@ -1511,6 +1515,8 @@ nfs_meta_setsize(struct vnode *vp, struct ucred *cred, struct thread *td, u_quad
lbn = nsize / biosize;
bufsize = nsize & (biosize - 1);
bp = nfs_getcacheblk(vp, lbn, bufsize, td);
+ if (!bp)
+ return EINTR;
if (bp->b_dirtyoff > bp->b_bcount)
bp->b_dirtyoff = bp->b_bcount;
if (bp->b_dirtyend > bp->b_bcount)
diff --git a/sys/nfsclient/nfs_socket.c b/sys/nfsclient/nfs_socket.c
index 094a2e1..8002908 100644
--- a/sys/nfsclient/nfs_socket.c
+++ b/sys/nfsclient/nfs_socket.c
@@ -52,6 +52,7 @@ __FBSDID("$FreeBSD$");
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/signalvar.h>
+#include <sys/syscallsubr.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
@@ -142,14 +143,15 @@ static int nfs_backoff[NFS_NBACKOFF] = { 2, 4, 8, 16, 32, 64, 128, 256, };
struct callout nfs_callout;
static int nfs_msg(struct thread *, const char *, const char *, int);
-static int nfs_rcvlock(struct nfsreq *);
-static void nfs_rcvunlock(struct nfsreq *);
-static void nfs_realign(struct mbuf **pm, int hsiz);
-static int nfs_receive(struct nfsreq *rep, struct sockaddr **aname,
- struct mbuf **mp);
+static int nfs_realign(struct mbuf **pm, int hsiz);
static int nfs_reply(struct nfsreq *);
static void nfs_softterm(struct nfsreq *rep);
static int nfs_reconnect(struct nfsreq *rep);
+static void nfs_clnt_tcp_soupcall(struct socket *so, void *arg, int waitflag);
+static void nfs_clnt_udp_soupcall(struct socket *so, void *arg, int waitflag);
+
+extern struct mtx nfs_reqq_mtx;
+extern struct mtx nfs_reply_mtx;
/*
* Initialize sockets and congestion for a new NFS connection.
@@ -166,6 +168,12 @@ nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
NET_ASSERT_GIANT();
+ if (nmp->nm_sotype == SOCK_STREAM) {
+ mtx_lock(&nmp->nm_nfstcpstate.mtx);
+ nmp->nm_nfstcpstate.flags |= NFS_TCP_EXPECT_RPCMARKER;
+ nmp->nm_nfstcpstate.rpcresid = 0;
+ mtx_unlock(&nmp->nm_nfstcpstate.mtx);
+ }
nmp->nm_so = NULL;
saddr = nmp->nm_nam;
error = socreate(saddr->sa_family, &nmp->nm_so, nmp->nm_sotype,
@@ -324,6 +332,12 @@ nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
goto bad;
SOCKBUF_LOCK(&so->so_rcv);
so->so_rcv.sb_flags |= SB_NOINTR;
+ so->so_upcallarg = (caddr_t)nmp;
+ if (so->so_type == SOCK_STREAM)
+ so->so_upcall = nfs_clnt_tcp_soupcall;
+ else
+ so->so_upcall = nfs_clnt_udp_soupcall;
+ so->so_rcv.sb_flags |= SB_UPCALL;
SOCKBUF_UNLOCK(&so->so_rcv);
SOCKBUF_LOCK(&so->so_snd);
so->so_snd.sb_flags |= SB_NOINTR;
@@ -370,14 +384,29 @@ nfs_reconnect(struct nfsreq *rep)
(void) tsleep(&lbolt, PSOCK, "nfscon", 0);
}
+ /*
+ * Clear the FORCE_RECONNECT flag only after the connect
+ * succeeds. To prevent races between multiple processes
+ * waiting on the mountpoint where the connection is being
+ * torn down. The first one to acquire the sndlock will
+ * retry the connection. The others block on the sndlock
+ * until the connection is established successfully, and
+ * then re-transmit the request.
+ */
+ mtx_lock(&nmp->nm_nfstcpstate.mtx);
+ nmp->nm_nfstcpstate.flags &= ~NFS_TCP_FORCE_RECONNECT;
+ mtx_unlock(&nmp->nm_nfstcpstate.mtx);
+
/*
* Loop through outstanding request list and fix up all requests
* on old socket.
*/
+ mtx_lock(&nfs_reqq_mtx);
TAILQ_FOREACH(rp, &nfs_reqq, r_chain) {
if (rp->r_nmp == nmp)
rp->r_flags |= R_MUSTRESEND;
}
+ mtx_unlock(&nfs_reqq_mtx);
return (0);
}
@@ -394,7 +423,12 @@ nfs_disconnect(struct nfsmount *nmp)
if (nmp->nm_so) {
so = nmp->nm_so;
nmp->nm_so = NULL;
- soshutdown(so, SHUT_RDWR);
+ SOCKBUF_LOCK(&so->so_rcv);
+ so->so_upcallarg = NULL;
+ so->so_upcall = NULL;
+ so->so_rcv.sb_flags &= ~SB_UPCALL;
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ soshutdown(so, SHUT_WR);
soclose(so);
}
}
@@ -406,9 +440,7 @@ nfs_safedisconnect(struct nfsmount *nmp)
bzero(&dummyreq, sizeof(dummyreq));
dummyreq.r_nmp = nmp;
- nfs_rcvlock(&dummyreq);
nfs_disconnect(nmp);
- nfs_rcvunlock(&dummyreq);
}
/*
@@ -488,79 +520,54 @@ nfs_send(struct socket *so, struct sockaddr *nam, struct mbuf *top,
return (error);
}
-/*
- * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all
- * done by soreceive(), but for SOCK_STREAM we must deal with the Record
- * Mark and consolidate the data into a new mbuf list.
- * nb: Sometimes TCP passes the data up to soreceive() in long lists of
- * small mbufs.
- * For SOCK_STREAM we must be very careful to read an entire record once
- * we have read any of it, even if the system call has been interrupted.
- */
-static int
-nfs_receive(struct nfsreq *rep, struct sockaddr **aname, struct mbuf **mp)
+int nfs_mrep_before_tsleep = 0;
+
+int
+nfs_reply(struct nfsreq *rep)
{
- struct socket *so;
- struct uio auio;
- struct iovec aio;
- struct mbuf *m;
- struct mbuf *control;
- u_int32_t len;
- struct sockaddr **getnam;
- int error, error2, sotype, rcvflg;
- struct thread *td = curthread; /* XXX */
+ register struct socket *so;
+ register struct mbuf *m;
+ int error, sotype, slpflag;
NET_ASSERT_GIANT();
- /*
- * Set up arguments for soreceive()
- */
- *mp = NULL;
- *aname = NULL;
sotype = rep->r_nmp->nm_sotype;
-
/*
* For reliable protocols, lock against other senders/receivers
* in case a reconnect is necessary.
- * For SOCK_STREAM, first get the Record Mark to find out how much
- * more there is to get.
- * We must lock the socket against other receivers
- * until we have an entire rpc request/reply.
*/
if (sotype != SOCK_DGRAM) {
error = nfs_sndlock(rep);
if (error)
return (error);
tryagain:
- /*
- * Check for fatal errors and resending request.
- */
- /*
- * Ugh: If a reconnect attempt just happened, nm_so
- * would have changed. NULL indicates a failed
- * attempt that has essentially shut down this
- * mount point.
- */
- if (rep->r_mrep || (error = NFS_SIGREP(rep)) != 0) {
+ if (rep->r_mrep) {
nfs_sndunlock(rep);
- return (error == 0 ? EINTR : error);
+ return (0);
+ }
+ if (rep->r_flags & R_SOFTTERM) {
+ nfs_sndunlock(rep);
+ return (EINTR);
}
so = rep->r_nmp->nm_so;
- if (!so) {
+ mtx_lock(&rep->r_nmp->nm_nfstcpstate.mtx);
+ if (!so ||
+ (rep->r_nmp->nm_nfstcpstate.flags & NFS_TCP_FORCE_RECONNECT)) {
+ mtx_unlock(&rep->r_nmp->nm_nfstcpstate.mtx);
error = nfs_reconnect(rep);
if (error) {
nfs_sndunlock(rep);
return (error);
}
goto tryagain;
- }
+ } else
+ mtx_unlock(&rep->r_nmp->nm_nfstcpstate.mtx);
while (rep->r_flags & R_MUSTRESEND) {
- m = m_copym(rep->r_mreq, 0, M_COPYALL, M_TRYWAIT);
+ m = m_copym(rep->r_mreq, 0, M_COPYALL, M_WAIT);
nfsstats.rpcretries++;
error = nfs_send(so, rep->r_nmp->nm_nam, m, rep);
if (error) {
if (error == EINTR || error == ERESTART ||
- error == EIO ||
(error = nfs_reconnect(rep)) != 0) {
nfs_sndunlock(rep);
return (error);
@@ -569,325 +576,330 @@ tryagain:
}
}
nfs_sndunlock(rep);
- if (sotype == SOCK_STREAM) {
- aio.iov_base = (caddr_t) &len;
- aio.iov_len = sizeof(u_int32_t);
- auio.uio_iov = &aio;
- auio.uio_iovcnt = 1;
- auio.uio_segflg = UIO_SYSSPACE;
- auio.uio_rw = UIO_READ;
- auio.uio_offset = 0;
- auio.uio_resid = sizeof(u_int32_t);
- auio.uio_td = td;
- do {
- rcvflg = MSG_WAITALL;
- error = so->so_proto->pr_usrreqs->pru_soreceive
- (so, NULL, &auio, NULL, NULL, &rcvflg);
- if (error == EWOULDBLOCK) {
- error2 = NFS_SIGREP(rep);
- if (error2)
- return (error2);
- }
- } while (0);
- if (!error && auio.uio_resid > 0) {
- /*
- * Don't log a 0 byte receive; it means
- * that the socket has been closed, and
- * can happen during normal operation
- * (forcible unmount or Solaris server).
- */
- if (auio.uio_resid != sizeof (u_int32_t))
- log(LOG_INFO,
- "short receive (%d/%d) from nfs server %s\n",
- (int)(sizeof(u_int32_t) - auio.uio_resid),
- (int)sizeof(u_int32_t),
- rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
- error = EPIPE;
- }
- if (error)
- goto errout;
- len = ntohl(len) & ~0x80000000;
- /*
- * This is SERIOUS! We are out of sync with the sender
- * and forcing a disconnect/reconnect is all I can do.
- */
- if (len > NFS_MAXPACKET) {
- log(LOG_ERR, "%s (%d) from nfs server %s\n",
- "impossible packet length",
- len,
- rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
- error = EFBIG;
- goto errout;
- }
- auio.uio_resid = len;
- do {
- rcvflg = MSG_WAITALL;
- error = so->so_proto->pr_usrreqs->pru_soreceive
- (so, NULL,
- &auio, mp, NULL, &rcvflg);
- } while (0);
- if (!error && auio.uio_resid > 0) {
- if (len != auio.uio_resid)
- log(LOG_INFO,
- "short receive (%d/%d) from nfs server %s\n",
- len - auio.uio_resid, len,
- rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
- error = EPIPE;
- }
- } else {
- /*
- * NB: Since uio_resid is big, MSG_WAITALL is ignored
- * and soreceive() will return when it has either a
- * control msg or a data msg.
- * We have no use for control msg., but must grab them
- * and then throw them away so we know what is going
- * on.
- */
- auio.uio_resid = len = 100000000; /* Anything Big */
- auio.uio_td = td;
- do {
- rcvflg = 0;
- error = so->so_proto->pr_usrreqs->pru_soreceive
- (so, NULL,
- &auio, mp, &control, &rcvflg);
- if (control)
- m_freem(control);
- if (error == EWOULDBLOCK && rep) {
- error2 = NFS_SIGREP(rep);
- if (error2)
- return (error2);
- }
- } while (!error && *mp == NULL && control);
- if ((rcvflg & MSG_EOR) == 0)
- printf("Egad!!\n");
- if (!error && *mp == NULL)
- error = EPIPE;
- len -= auio.uio_resid;
- }
-errout:
- if (error && error != EINTR && error != EIO &&
- error != ERESTART) {
- m_freem(*mp);
- *mp = NULL;
- if (error != EPIPE && error != EWOULDBLOCK)
- log(LOG_INFO,
- "receive error %d from nfs server %s\n",
- error,
- rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
- error = nfs_sndlock(rep);
- if (!error) {
- error = nfs_reconnect(rep);
- if (!error)
- goto tryagain;
- else
- nfs_sndunlock(rep);
- }
- }
- } else {
+ }
+ slpflag = 0;
+ if (rep->r_nmp->nm_flag & NFSMNT_INT)
+ slpflag = PCATCH;
+ mtx_lock(&nfs_reply_mtx);
+ if (rep->r_mrep != NULL) {
/*
- * We may have failed while rebinding the datagram socket
- * so attempt a rebind here.
+ * This is a very rare race, but it does occur. The reply
+ * could come in and the wakeup could happen before the
+ * process tsleeps(). Blocking here without checking for
+ * this results in a missed wakeup(), blocking this request
+ * forever. The 2 reasons why this could happen are a context
+ * switch in the stack after the request is sent out, or heavy
+ * interrupt activity pinning down the process within that
+ * window (i.e. after the request is sent).
*/
- if ((so = rep->r_nmp->nm_so) == NULL) {
+ mtx_unlock(&nfs_reply_mtx);
+ nfs_mrep_before_tsleep++;
+ return (0);
+ }
+ error = msleep((caddr_t)rep, &nfs_reply_mtx,
+ slpflag | (PZERO - 1), "nfsreq", 0);
+ mtx_unlock(&nfs_reply_mtx);
+ if (error == EINTR || error == ERESTART)
+ /* NFS operations aren't restartable. Map ERESTART to EINTR */
+ return (EINTR);
+ if (rep->r_flags & R_SOFTTERM)
+ /* Request was terminated because we exceeded the retries (soft mount) */
+ return (ETIMEDOUT);
+ if (sotype == SOCK_STREAM) {
+ mtx_lock(&rep->r_nmp->nm_nfstcpstate.mtx);
+ if (((rep->r_nmp->nm_nfstcpstate.flags & NFS_TCP_FORCE_RECONNECT) ||
+ (rep->r_flags & R_MUSTRESEND))) {
+ mtx_unlock(&rep->r_nmp->nm_nfstcpstate.mtx);
error = nfs_sndlock(rep);
- if (!error) {
- error = nfs_reconnect(rep);
- nfs_sndunlock(rep);
- }
if (error)
return (error);
- so = rep->r_nmp->nm_so;
- }
- if (so->so_state & SS_ISCONNECTED)
- getnam = NULL;
- else
- getnam = aname;
- auio.uio_resid = len = 1000000;
- auio.uio_td = td;
- do {
- rcvflg = 0;
- error = so->so_proto->pr_usrreqs->pru_soreceive
- (so, getnam, &auio, mp,
- NULL, &rcvflg);
- if (error) {
- error2 = NFS_SIGREP(rep);
- if (error2) {
- error = error2;
- goto dgramout;
- }
- }
- if (error) {
- error2 = nfs_sndlock(rep);
- if (!error2) {
- error2 = nfs_reconnect(rep);
- if (error2)
- error = error2;
- else
- so = rep->r_nmp->nm_so;
- nfs_sndunlock(rep);
- } else {
- error = error2;
- }
- }
- } while (error == EWOULDBLOCK);
-dgramout:
- len -= auio.uio_resid;
- }
- if (error) {
- m_freem(*mp);
- *mp = NULL;
+ goto tryagain;
+ } else
+ mtx_unlock(&rep->r_nmp->nm_nfstcpstate.mtx);
}
- /*
- * Search for any mbufs that are not a multiple of 4 bytes long
- * or with m_data not longword aligned.
- * These could cause pointer alignment problems, so copy them to
- * well aligned mbufs.
- */
- nfs_realign(mp, 5 * NFSX_UNSIGNED);
return (error);
}
/*
- * Implement receipt of reply on a socket.
- * We must search through the list of received datagrams matching them
- * with outstanding requests using the xid, until ours is found.
+ * XXX TO DO
+ * Make nfs_realign() non-blocking. Also make nfsm_dissect() nonblocking.
*/
-/* ARGSUSED */
-static int
-nfs_reply(struct nfsreq *myrep)
+static void
+nfs_clnt_match_xid(struct socket *so,
+ struct nfsmount *nmp,
+ struct mbuf *mrep)
{
- struct nfsreq *rep;
- struct nfsmount *nmp = myrep->r_nmp;
- int32_t t1;
- struct mbuf *mrep, *md;
- struct sockaddr *nam;
- u_int32_t rxid, *tl;
+ struct mbuf *md;
caddr_t dpos;
+ u_int32_t rxid, *tl;
+ struct nfsreq *rep;
+ register int32_t t1;
int error;
-
+
/*
- * Loop around until we get our own reply
+ * Search for any mbufs that are not a multiple of 4 bytes long
+ * or with m_data not longword aligned.
+ * These could cause pointer alignment problems, so copy them to
+ * well aligned mbufs.
*/
- for (;;) {
- /*
- * Lock against other receivers so that I don't get stuck in
- * sbwait() after someone else has received my reply for me.
- * Also necessary for connection based protocols to avoid
- * race conditions during a reconnect.
- * If nfs_rcvlock() returns EALREADY, that means that
- * the reply has already been recieved by another
- * process and we can return immediately. In this
- * case, the lock is not taken to avoid races with
- * other processes.
- */
- error = nfs_rcvlock(myrep);
- if (error == EALREADY)
- return (0);
- if (error)
- return (error);
- /*
- * Get the next Rpc reply off the socket
- */
- error = nfs_receive(myrep, &nam, &mrep);
- nfs_rcvunlock(myrep);
- if (error) {
+ if (nfs_realign(&mrep, 5 * NFSX_UNSIGNED) == ENOMEM) {
+ m_freem(mrep);
+ nfsstats.rpcinvalid++;
+ return;
+ }
+
+ /*
+ * Get the xid and check that it is an rpc reply
+ */
+ md = mrep;
+ dpos = mtod(md, caddr_t);
+ tl = nfsm_dissect_nonblock(u_int32_t *, 2*NFSX_UNSIGNED);
+ rxid = *tl++;
+ if (*tl != rpc_reply) {
+ m_freem(mrep);
+nfsmout:
+ nfsstats.rpcinvalid++;
+ return;
+ }
+ mtx_lock(&nfs_reqq_mtx);
+ /*
+ * Loop through the request list to match up the reply
+ * Iff no match, just drop the datagram
+ */
+ TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
+ if (rep->r_mrep == NULL && rxid == rep->r_xid) {
+ /* Found it.. */
+ rep->r_mrep = mrep;
+ rep->r_md = md;
+ rep->r_dpos = dpos;
/*
- * Ignore routing errors on connectionless protocols??
+ * Update congestion window.
+ * Do the additive increase of
+ * one rpc/rtt.
*/
- if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) {
- nmp->nm_so->so_error = 0;
- if (myrep->r_flags & R_GETONEREP)
- return (0);
- continue;
+ if (nmp->nm_cwnd <= nmp->nm_sent) {
+ nmp->nm_cwnd +=
+ (NFS_CWNDSCALE * NFS_CWNDSCALE +
+ (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd;
+ if (nmp->nm_cwnd > NFS_MAXCWND)
+ nmp->nm_cwnd = NFS_MAXCWND;
+ }
+ if (rep->r_flags & R_SENT) {
+ rep->r_flags &= ~R_SENT;
+ nmp->nm_sent -= NFS_CWNDSCALE;
}
- return (error);
+ /*
+ * Update rtt using a gain of 0.125 on the mean
+ * and a gain of 0.25 on the deviation.
+ */
+ if (rep->r_flags & R_TIMING) {
+ /*
+ * Since the timer resolution of
+ * NFS_HZ is so course, it can often
+ * result in r_rtt == 0. Since
+ * r_rtt == N means that the actual
+ * rtt is between N+dt and N+2-dt ticks,
+ * add 1.
+ */
+ t1 = rep->r_rtt + 1;
+ t1 -= (NFS_SRTT(rep) >> 3);
+ NFS_SRTT(rep) += t1;
+ if (t1 < 0)
+ t1 = -t1;
+ t1 -= (NFS_SDRTT(rep) >> 2);
+ NFS_SDRTT(rep) += t1;
+ }
+ nmp->nm_timeouts = 0;
+ break;
}
- if (nam)
- FREE(nam, M_SONAME);
+ }
+ /*
+ * If not matched to a request, drop it.
+ * If it's mine, wake up requestor.
+ */
+ if (rep == 0) {
+ nfsstats.rpcunexpected++;
+ m_freem(mrep);
+ } else {
+ mtx_lock(&nfs_reply_mtx);
+ wakeup((caddr_t)rep);
+ mtx_unlock(&nfs_reply_mtx);
+ }
+ mtx_unlock(&nfs_reqq_mtx);
+}
- /*
- * Get the xid and check that it is an rpc reply
- */
- md = mrep;
- dpos = mtod(md, caddr_t);
- tl = nfsm_dissect(u_int32_t *, 2 * NFSX_UNSIGNED);
- rxid = *tl++;
- if (*tl != rpc_reply) {
- nfsstats.rpcinvalid++;
- m_freem(mrep);
-nfsmout:
- if (myrep->r_flags & R_GETONEREP)
- return (0);
- continue;
- }
+static void
+nfs_mark_for_reconnect(struct nfsmount *nmp)
+{
+ struct nfsreq *rp;
- /*
- * Loop through the request list to match up the reply
- * Iff no match, just drop the datagram
- */
- TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
- if (rep->r_mrep == NULL && rxid == rep->r_xid) {
- /* Found it.. */
- rep->r_mrep = mrep;
- rep->r_md = md;
- rep->r_dpos = dpos;
- /*
- * Update congestion window.
- * Do the additive increase of
- * one rpc/rtt.
- */
- if (nmp->nm_cwnd <= nmp->nm_sent) {
- nmp->nm_cwnd +=
- (NFS_CWNDSCALE * NFS_CWNDSCALE +
- (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd;
- if (nmp->nm_cwnd > NFS_MAXCWND)
- nmp->nm_cwnd = NFS_MAXCWND;
- }
- if (rep->r_flags & R_SENT) {
- rep->r_flags &= ~R_SENT;
- nmp->nm_sent -= NFS_CWNDSCALE;
- }
- /*
- * Update rtt using a gain of 0.125 on the mean
- * and a gain of 0.25 on the deviation.
- */
- if (rep->r_flags & R_TIMING) {
- /*
- * Since the timer resolution of
- * NFS_HZ is so course, it can often
- * result in r_rtt == 0. Since
- * r_rtt == N means that the actual
- * rtt is between N+dt and N+2-dt ticks,
- * add 1.
- */
- t1 = rep->r_rtt + 1;
- t1 -= (NFS_SRTT(rep) >> 3);
- NFS_SRTT(rep) += t1;
- if (t1 < 0)
- t1 = -t1;
- t1 -= (NFS_SDRTT(rep) >> 2);
- NFS_SDRTT(rep) += t1;
+ mtx_lock(&nmp->nm_nfstcpstate.mtx);
+ nmp->nm_nfstcpstate.flags |= NFS_TCP_FORCE_RECONNECT;
+ mtx_unlock(&nmp->nm_nfstcpstate.mtx);
+ /*
+ * Wakeup all processes that are waiting for replies
+ * on this mount point. One of them does the reconnect.
+ */
+ mtx_lock(&nfs_reqq_mtx);
+ TAILQ_FOREACH(rp, &nfs_reqq, r_chain) {
+ if (rp->r_nmp == nmp)
+ wakeup(rp);
+ }
+ mtx_unlock(&nfs_reqq_mtx);
+}
+
+static int
+nfstcp_readable(struct socket *so, int bytes)
+{
+ int retval;
+
+ SOCKBUF_LOCK(&so->so_rcv);
+ retval = (so->so_rcv.sb_cc >= (bytes) ||
+ (so->so_state & SBS_CANTRCVMORE) ||
+ so->so_error);
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ return (retval);
+}
+
+#define nfstcp_marker_readable(so) nfstcp_readable(so, sizeof(u_int32_t))
+
+static void
+nfs_clnt_tcp_soupcall(struct socket *so, void *arg, int waitflag)
+{
+ struct nfsmount *nmp = (struct nfsmount *)arg;
+ struct mbuf *mp = NULL;
+ struct uio auio;
+ int error;
+ u_int32_t len;
+ int rcvflg;
+
+ /*
+ * Don't pick any more data from the socket if we've marked the
+ * mountpoint for reconnect.
+ */
+ mtx_lock(&nmp->nm_nfstcpstate.mtx);
+ if (nmp->nm_nfstcpstate.flags & NFS_TCP_FORCE_RECONNECT) {
+ mtx_unlock(&nmp->nm_nfstcpstate.mtx);
+ return;
+ } else
+ mtx_unlock(&nmp->nm_nfstcpstate.mtx);
+ auio.uio_td = curthread;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_rw = UIO_READ;
+ for ( ; ; ) {
+ if (nmp->nm_nfstcpstate.flags & NFS_TCP_EXPECT_RPCMARKER) {
+ if (!nfstcp_marker_readable(so)) {
+ /* Marker is not readable */
+ return;
+ }
+ auio.uio_resid = sizeof(u_int32_t);
+ auio.uio_iov = NULL;
+ auio.uio_iovcnt = 0;
+ mp = NULL;
+ rcvflg = (MSG_DONTWAIT | MSG_SOCALLBCK);
+ error = so->so_proto->pr_usrreqs->pru_soreceive
+ (so, (struct sockaddr **)0,
+ &auio, &mp, (struct mbuf **)0, &rcvflg);
+ /*
+ * We've already tested that the socket is readable. 2 cases
+ * here, we either read 0 bytes (peer closed the connection),
+ * or got some other error. In both cases, we tear down the
+ * connection.
+ */
+ if (error || auio.uio_resid > 0) {
+ if (auio.uio_resid > 0) {
+ log(LOG_ERR,
+ "nfs/tcp clnt: Peer closed connection, tearing down TCP connection\n");
+ } else {
+ log(LOG_ERR,
+ "nfs/tcp clnt: Error %d reading socket, tearing down TCP connection\n",
+ error);
}
- nmp->nm_timeouts = 0;
- break;
+ goto mark_reconnect;
}
+ if (mp == NULL)
+ panic("nfs_clnt_tcp_soupcall: Got empty mbuf chain from sorecv\n");
+ len = ntohl(*mtod(mp, u_int32_t *)) & ~0x80000000;
+ m_freem(mp);
+ /*
+ * This is SERIOUS! We are out of sync with the sender
+ * and forcing a disconnect/reconnect is all I can do.
+ */
+ if (len > NFS_MAXPACKET) {
+ log(LOG_ERR, "%s (%d) from nfs server %s\n",
+ "impossible packet length",
+ len,
+ nmp->nm_mountp->mnt_stat.f_mntfromname);
+ goto mark_reconnect;
+ }
+ nmp->nm_nfstcpstate.rpcresid = len;
+ nmp->nm_nfstcpstate.flags &= ~(NFS_TCP_EXPECT_RPCMARKER);
}
- /*
- * If not matched to a request, drop it.
- * If it's mine, get out.
+ /*
+ * Processed RPC marker or no RPC marker to process.
+ * Pull in and process data.
*/
- if (rep == 0) {
- nfsstats.rpcunexpected++;
- m_freem(mrep);
- } else if (rep == myrep) {
- if (rep->r_mrep == NULL)
- panic("nfsreply nil");
- return (0);
+ if (nmp->nm_nfstcpstate.rpcresid > 0) {
+ if (!nfstcp_readable(so, nmp->nm_nfstcpstate.rpcresid)) {
+ /* All data not readable */
+ return;
+ }
+ auio.uio_resid = nmp->nm_nfstcpstate.rpcresid;
+ auio.uio_iov = NULL;
+ auio.uio_iovcnt = 0;
+ mp = NULL;
+ rcvflg = (MSG_DONTWAIT | MSG_SOCALLBCK);
+ error = so->so_proto->pr_usrreqs->pru_soreceive
+ (so, (struct sockaddr **)0,
+ &auio, &mp, (struct mbuf **)0, &rcvflg);
+ if (error || auio.uio_resid > 0) {
+ if (auio.uio_resid > 0) {
+ log(LOG_ERR,
+ "nfs/tcp clnt: Peer closed connection, tearing down TCP connection\n");
+ } else {
+ log(LOG_ERR,
+ "nfs/tcp clnt: Error %d reading socket, tearing down TCP connection\n",
+ error);
+ }
+ goto mark_reconnect;
+ }
+ if (mp == NULL)
+ panic("nfs_clnt_tcp_soupcall: Got empty mbuf chain from sorecv\n");
+ nmp->nm_nfstcpstate.rpcresid = 0;
+ nmp->nm_nfstcpstate.flags |= NFS_TCP_EXPECT_RPCMARKER;
+ /* We got the entire RPC reply. Match XIDs and wake up requestor */
+ nfs_clnt_match_xid(so, nmp, mp);
}
- if (myrep->r_flags & R_GETONEREP)
- return (0);
}
+
+mark_reconnect:
+ nfs_mark_for_reconnect(nmp);
+}
+
+static void
+nfs_clnt_udp_soupcall(struct socket *so, void *arg, int waitflag)
+{
+ struct nfsmount *nmp = (struct nfsmount *)arg;
+ struct uio auio;
+ struct mbuf *mp = NULL;
+ struct mbuf *control = NULL;
+ int error, rcvflag;
+
+ auio.uio_resid = 1000000;
+ auio.uio_td = curthread;
+ rcvflag = MSG_DONTWAIT;
+ auio.uio_resid = 1000000000;
+ do {
+ mp = control = NULL;
+ error = so->so_proto->pr_usrreqs->pru_soreceive(so,
+ NULL, &auio, &mp,
+ &control, &rcvflag);
+ if (control)
+ m_freem(control);
+ if (mp)
+ nfs_clnt_match_xid(so, nmp, mp);
+ } while (mp && !error);
}
/*
@@ -930,6 +942,7 @@ nfs_request(struct vnode *vp, struct mbuf *mrest, int procnum,
if ((nmp->nm_flag & NFSMNT_NFSV4) != 0)
return nfs4_request(vp, mrest, procnum, td, cred, mrp, mdp, dposp);
MALLOC(rep, struct nfsreq *, sizeof(struct nfsreq), M_NFSREQ, M_WAITOK);
+ rep->r_mrep = rep->r_md = NULL;
rep->r_nmp = nmp;
rep->r_vp = vp;
rep->r_td = td;
@@ -983,9 +996,11 @@ tryagain:
* to put it LAST so timer finds oldest requests first.
*/
s = splsoftclock();
+ mtx_lock(&nfs_reqq_mtx);
if (TAILQ_EMPTY(&nfs_reqq))
callout_reset(&nfs_callout, nfs_ticks, nfs_timer, NULL);
TAILQ_INSERT_TAIL(&nfs_reqq, rep, r_chain);
+ mtx_unlock(&nfs_reqq_mtx);
/*
* If backing off another request or avoiding congestion, don't
@@ -1021,9 +1036,11 @@ tryagain:
* RPC done, unlink the request.
*/
s = splsoftclock();
+ mtx_lock(&nfs_reqq_mtx);
TAILQ_REMOVE(&nfs_reqq, rep, r_chain);
if (TAILQ_EMPTY(&nfs_reqq))
callout_stop(&nfs_callout);
+ mtx_unlock(&nfs_reqq_mtx);
splx(s);
/*
@@ -1044,11 +1061,22 @@ tryagain:
md = rep->r_md;
dpos = rep->r_dpos;
if (error) {
+ /*
+ * If we got interrupted by a signal in nfs_reply(), there's
+ * a very small window where the reply could've come in before
+ * this process got scheduled in. To handle that case, we need
+ * to free the reply if it was delivered.
+ */
+ if (rep->r_mrep != NULL)
+ m_freem(rep->r_mrep);
m_freem(rep->r_mreq);
free((caddr_t)rep, M_NFSREQ);
return (error);
}
+ if (rep->r_mrep == NULL)
+ panic("nfs_request: rep->r_mrep shouldn't be NULL if no error\n");
+
/*
* break down the rpc header and check if ok
*/
@@ -1129,6 +1157,20 @@ nfsmout:
* Scan the nfsreq list and retranmit any requests that have timed out
* To avoid retransmission attempts on STREAM sockets (in the future) make
* sure to set the r_retry field to 0 (implies nm_retry == 0).
+ *
+ * XXX -
+ * For now, since we don't register MPSAFE callouts for the NFS client -
+ * softclock() acquires Giant before calling us. That prevents req entries
+ * from being removed from the list (from nfs_request()). But we still
+ * acquire the nfs reqq mutex to make sure the state of individual req
+ * entries is not modified from RPC reply handling (from socket callback)
+ * while nfs_timer is walking the list of reqs.
+ * The nfs reqq lock cannot be held while we do the pru_send() because of a
+ * lock ordering violation. The NFS client socket callback acquires
+ * inp_lock->nfsreq mutex and pru_send acquires inp_lock. So we drop the
+ * reqq mutex (and reacquire it after the pru_send()). This won't work
+ * when we move to fine grained locking for NFS. When we get to that point,
+ * a rewrite of nfs_timer() will be needed.
*/
void
nfs_timer(void *arg)
@@ -1143,6 +1185,7 @@ nfs_timer(void *arg)
getmicrouptime(&now);
s = splnet();
+ mtx_lock(&nfs_reqq_mtx);
TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
nmp = rep->r_nmp;
if (rep->r_mrep || (rep->r_flags & R_SOFTTERM))
@@ -1202,12 +1245,14 @@ nfs_timer(void *arg)
(rep->r_flags & R_SENT) ||
nmp->nm_sent < nmp->nm_cwnd) &&
(m = m_copym(rep->r_mreq, 0, M_COPYALL, M_DONTWAIT))){
+ mtx_unlock(&nfs_reqq_mtx);
if ((nmp->nm_flag & NFSMNT_NOCONN) == 0)
error = (*so->so_proto->pr_usrreqs->pru_send)
(so, 0, m, NULL, NULL, curthread);
else
error = (*so->so_proto->pr_usrreqs->pru_send)
(so, 0, m, nmp->nm_nam, NULL, curthread);
+ mtx_lock(&nfs_reqq_mtx);
if (error) {
if (NFSIGNORE_SOERROR(nmp->nm_soflags, error))
so->so_error = 0;
@@ -1235,6 +1280,7 @@ nfs_timer(void *arg)
}
}
}
+ mtx_unlock(&nfs_reqq_mtx);
splx(s);
callout_reset(&nfs_callout, nfs_ticks, nfs_timer, NULL);
}
@@ -1252,20 +1298,24 @@ nfs_nmcancelreqs(nmp)
int i, s;
s = splnet();
+ mtx_lock(&nfs_reqq_mtx);
TAILQ_FOREACH(req, &nfs_reqq, r_chain) {
if (nmp != req->r_nmp || req->r_mrep != NULL ||
(req->r_flags & R_SOFTTERM))
continue;
nfs_softterm(req);
}
+ mtx_unlock(&nfs_reqq_mtx);
splx(s);
for (i = 0; i < 30; i++) {
s = splnet();
+ mtx_lock(&nfs_reqq_mtx);
TAILQ_FOREACH(req, &nfs_reqq, r_chain) {
if (nmp == req->r_nmp)
break;
}
+ mtx_unlock(&nfs_reqq_mtx);
splx(s);
if (req == NULL)
return (0);
@@ -1289,6 +1339,125 @@ nfs_softterm(struct nfsreq *rep)
rep->r_nmp->nm_sent -= NFS_CWNDSCALE;
rep->r_flags &= ~R_SENT;
}
+ /*
+ * Request terminated, wakeup the blocked process, so that we
+ * can return EINTR back.
+ */
+ wakeup((caddr_t)rep);
+}
+
+/*
+ * Any signal that can interrupt an NFS operation in an intr mount
+ * should be added to this set.
+ */
+int nfs_sig_set[] = {
+ SIGINT,
+ SIGTERM,
+ SIGHUP,
+ SIGKILL,
+ SIGQUIT
+};
+
+/*
+ * Check to see if one of the signals in our subset is pending on
+ * the process (in an intr mount).
+ */
+static int
+nfs_sig_pending(sigset_t set)
+{
+ int i;
+
+ for (i = 0 ; i < sizeof(nfs_sig_set)/sizeof(int) ; i++)
+ if (SIGISMEMBER(set, nfs_sig_set[i]))
+ return (1);
+ return (0);
+}
+
+/*
+ * The set/restore sigmask functions are used to (temporarily) overwrite
+ * the process p_sigmask during an RPC call (for example). These are also
+ * used in other places in the NFS client that might tsleep().
+ */
+void
+nfs_set_sigmask(struct thread *td, sigset_t *oldset)
+{
+ sigset_t newset;
+ int i;
+ struct proc *p;
+
+ SIGFILLSET(newset);
+ if (td == NULL)
+ td = curthread; /* XXX */
+ p = td->td_proc;
+ /* Remove the NFS set of signals from newset */
+ PROC_LOCK(p);
+ mtx_lock(&p->p_sigacts->ps_mtx);
+ for (i = 0 ; i < sizeof(nfs_sig_set)/sizeof(int) ; i++) {
+ /*
+ * But make sure we leave the ones already masked
+ * by the process, ie. remove the signal from the
+ * temporary signalmask only if it wasn't already
+ * in p_sigmask.
+ */
+ if (!SIGISMEMBER(td->td_sigmask, nfs_sig_set[i]) &&
+ !SIGISMEMBER(p->p_sigacts->ps_sigignore, nfs_sig_set[i]))
+ SIGDELSET(newset, nfs_sig_set[i]);
+ }
+ mtx_unlock(&p->p_sigacts->ps_mtx);
+ PROC_UNLOCK(p);
+ kern_sigprocmask(td, SIG_SETMASK, &newset, oldset, 0);
+}
+
+void
+nfs_restore_sigmask(struct thread *td, sigset_t *set)
+{
+ if (td == NULL)
+ td = curthread; /* XXX */
+ kern_sigprocmask(td, SIG_SETMASK, set, NULL, 0);
+}
+
+/*
+ * NFS wrapper to msleep(), that shoves a new p_sigmask and restores the
+ * old one after msleep() returns.
+ */
+int
+nfs_msleep(struct thread *td, void *ident, struct mtx *mtx, int priority, char *wmesg, int timo)
+{
+ sigset_t oldset;
+ int error;
+ struct proc *p;
+
+ if ((priority & PCATCH) == 0)
+ return msleep(ident, mtx, priority, wmesg, timo);
+ if (td == NULL)
+ td = curthread; /* XXX */
+ nfs_set_sigmask(td, &oldset);
+ error = msleep(ident, mtx, priority, wmesg, timo);
+ nfs_restore_sigmask(td, &oldset);
+ p = td->td_proc;
+ return (error);
+}
+
+/*
+ * NFS wrapper to tsleep(), that shoves a new p_sigmask and restores the
+ * old one after tsleep() returns.
+ */
+int
+nfs_tsleep(struct thread *td, void *ident, int priority, char *wmesg, int timo)
+{
+ sigset_t oldset;
+ int error;
+ struct proc *p;
+
+ if ((priority & PCATCH) == 0)
+ return tsleep(ident, priority, wmesg, timo);
+ if (td == NULL)
+ td = curthread; /* XXX */
+ nfs_set_sigmask(td, &oldset);
+ error = tsleep(ident, priority, wmesg, timo);
+ nfs_restore_sigmask(td, &oldset);
+ p = td->td_proc;
+ return (error);
}
/*
@@ -1320,7 +1489,7 @@ nfs_sigintr(struct nfsmount *nmp, struct nfsreq *rep, struct thread *td)
mtx_lock(&p->p_sigacts->ps_mtx);
SIGSETNAND(tmpset, p->p_sigacts->ps_sigignore);
mtx_unlock(&p->p_sigacts->ps_mtx);
- if (SIGNOTEMPTY(p->p_siglist) && NFSINT_SIGMASK(tmpset)) {
+ if (SIGNOTEMPTY(p->p_siglist) && nfs_sig_pending(tmpset)) {
PROC_UNLOCK(p);
return (EINTR);
}
@@ -1378,60 +1547,6 @@ nfs_sndunlock(struct nfsreq *rep)
}
}
-static int
-nfs_rcvlock(struct nfsreq *rep)
-{
- int *statep = &rep->r_nmp->nm_state;
- int error, slpflag, slptimeo = 0;
-
- if (rep->r_nmp->nm_flag & NFSMNT_INT)
- slpflag = PCATCH;
- else
- slpflag = 0;
- while (*statep & NFSSTA_RCVLOCK) {
- error = nfs_sigintr(rep->r_nmp, rep, rep->r_td);
- if (error)
- return (error);
- *statep |= NFSSTA_WANTRCV;
- (void) tsleep(statep, slpflag | (PZERO - 1), "nfsrcvlk",
- slptimeo);
- /*
- * If our reply was received while we were sleeping,
- * then just return without taking the lock to avoid a
- * situation where a single iod could 'capture' the
- * receive lock.
- */
- if (rep->r_mrep != NULL)
- return (EALREADY);
- if (slpflag == PCATCH) {
- slpflag = 0;
- slptimeo = 2 * hz;
- }
- }
- /* Always fail if our request has been cancelled. */
- if (rep != NULL && (error = NFS_SIGREP(rep)) != 0)
- return (error);
- *statep |= NFSSTA_RCVLOCK;
- return (0);
-}
-
-/*
- * Unlock the stream socket for others.
- */
-static void
-nfs_rcvunlock(struct nfsreq *rep)
-{
- int *statep = &rep->r_nmp->nm_state;
-
- if ((*statep & NFSSTA_RCVLOCK) == 0)
- panic("nfs rcvunlock");
- *statep &= ~NFSSTA_RCVLOCK;
- if (*statep & NFSSTA_WANTRCV) {
- *statep &= ~NFSSTA_WANTRCV;
- wakeup(statep);
- }
-}
-
/*
* nfs_realign:
*
@@ -1446,8 +1561,15 @@ nfs_rcvunlock(struct nfsreq *rep)
* We would prefer to avoid this situation entirely. The situation does
 * not occur with NFS/UDP and is supposed to only occasionally occur
* with TCP. Use vfs.nfs.realign_count and realign_test to check this.
+ *
+ * XXX - This still looks buggy. If there are multiple mbufs in the mbuf chain
+ * passed in that are unaligned, the first loop will allocate multiple new
+ * mbufs. But then, it doesn't seem to chain these together. So, if there are
+ * multiple unaligned mbufs, we're looking at a pretty serious mbuf leak.
+ * However, this is long-standing behavior; perhaps misalignment only ever
+ * occurs at the head of the chain.
*/
-static void
+static int
nfs_realign(struct mbuf **pm, int hsiz)
{
struct mbuf *m;
@@ -1457,9 +1579,15 @@ nfs_realign(struct mbuf **pm, int hsiz)
++nfs_realign_test;
while ((m = *pm) != NULL) {
if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3)) {
- MGET(n, M_TRYWAIT, MT_DATA);
+ MGET(n, M_DONTWAIT, MT_DATA);
+ if (n == NULL)
+ return (ENOMEM);
if (m->m_len >= MINCLSIZE) {
- MCLGET(n, M_TRYWAIT);
+ MCLGET(n, M_DONTWAIT);
+ if (n->m_ext.ext_buf == NULL) {
+ m_freem(n);
+ return (ENOMEM);
+ }
}
n->m_len = 0;
break;
@@ -1480,6 +1608,7 @@ nfs_realign(struct mbuf **pm, int hsiz)
m_freem(*pm);
*pm = n;
}
+ return (0);
}
diff --git a/sys/nfsclient/nfs_subs.c b/sys/nfsclient/nfs_subs.c
index 8668a99..9552f95 100644
--- a/sys/nfsclient/nfs_subs.c
+++ b/sys/nfsclient/nfs_subs.c
@@ -94,6 +94,8 @@ int nfs_ticks;
int nfs_pbuf_freecnt = -1; /* start out unlimited */
struct nfs_reqq nfs_reqq;
+struct mtx nfs_reqq_mtx;
+struct mtx nfs_reply_mtx;
struct nfs_bufq nfs_bufq;
/*
@@ -412,6 +414,8 @@ nfs_init(struct vfsconf *vfsp)
*/
TAILQ_INIT(&nfs_reqq);
callout_init(&nfs_callout, 0);
+ mtx_init(&nfs_reqq_mtx, "NFS reqq lock", NULL, MTX_DEF);
+ mtx_init(&nfs_reply_mtx, "Synch NFS reply posting", NULL, MTX_DEF);
nfs_pbuf_freecnt = nswbuf / 2 + 1;
diff --git a/sys/nfsclient/nfs_vfsops.c b/sys/nfsclient/nfs_vfsops.c
index 801335b..ff15eb1 100644
--- a/sys/nfsclient/nfs_vfsops.c
+++ b/sys/nfsclient/nfs_vfsops.c
@@ -817,6 +817,10 @@ mountnfs(struct nfs_args *argp, struct mount *mp, struct sockaddr *nam,
nfs_decode_args(nmp, argp);
+ if (nmp->nm_sotype == SOCK_STREAM)
+ mtx_init(&nmp->nm_nfstcpstate.mtx, "NFS/TCP state lock",
+ NULL, MTX_DEF);
+
/*
* For Connection based sockets (TCP,...) defer the connect until
* the first request, in case the server is not responding.
@@ -862,6 +866,8 @@ mountnfs(struct nfs_args *argp, struct mount *mp, struct sockaddr *nam,
return (0);
bad:
+ if (nmp->nm_sotype == SOCK_STREAM)
+ mtx_destroy(&nmp->nm_nfstcpstate.mtx);
nfs_disconnect(nmp);
uma_zfree(nfsmount_zone, nmp);
FREE(nam, M_SONAME);
@@ -903,6 +909,9 @@ nfs_unmount(struct mount *mp, int mntflags, struct thread *td)
nfs_disconnect(nmp);
FREE(nmp->nm_nam, M_SONAME);
+ if (nmp->nm_sotype == SOCK_STREAM)
+ mtx_destroy(&nmp->nm_nfstcpstate.mtx);
+
uma_zfree(nfsmount_zone, nmp);
return (0);
}
diff --git a/sys/nfsclient/nfs_vnops.c b/sys/nfsclient/nfs_vnops.c
index a4dbe18..93d1853 100644
--- a/sys/nfsclient/nfs_vnops.c
+++ b/sys/nfsclient/nfs_vnops.c
@@ -59,6 +59,7 @@ __FBSDID("$FreeBSD$");
#include <sys/lockf.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
+#include <sys/signalvar.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
diff --git a/sys/nfsclient/nfsm_subs.h b/sys/nfsclient/nfsm_subs.h
index a91777e..9ab9b9e 100644
--- a/sys/nfsclient/nfsm_subs.h
+++ b/sys/nfsclient/nfsm_subs.h
@@ -134,7 +134,10 @@ do { \
#define nfsm_request(v, t, p, c) \
do { \
+ sigset_t oldset; \
+ nfs_set_sigmask(p, &oldset); \
error = nfs_request((v), mreq, (t), (p), (c), &mrep, &md, &dpos); \
+ nfs_restore_sigmask(p, &oldset); \
if (error != 0) { \
if (error & NFSERR_RETERR) \
error &= ~NFSERR_RETERR; \
diff --git a/sys/nfsclient/nfsmount.h b/sys/nfsclient/nfsmount.h
index 5064370..7bc593c 100644
--- a/sys/nfsclient/nfsmount.h
+++ b/sys/nfsclient/nfsmount.h
@@ -36,6 +36,14 @@
#ifndef _NFSCLIENT_NFSMOUNT_H_
#define _NFSCLIENT_NFSMOUNT_H_
+struct nfs_tcp_mountstate {
+ int rpcresid;
+#define NFS_TCP_EXPECT_RPCMARKER 0x0001 /* Expect to see a RPC/TCP marker next */
+#define NFS_TCP_FORCE_RECONNECT 0x0002 /* Force a TCP reconnect */
+ int flags;
+ struct mtx mtx;
+};
+
/*
* Mount structure.
* One allocated on every NFS mount.
@@ -79,6 +87,7 @@ struct nfsmount {
struct nfs_rpcops *nm_rpcops;
int nm_tprintf_initial_delay; /* initial delay */
int nm_tprintf_delay; /* interval for messages */
+ struct nfs_tcp_mountstate nm_nfstcpstate;
/* NFSv4 */
uint64_t nm_clientid;
OpenPOWER on IntegriCloud