This fixes a problem with the nfs socket handling code which happens

if a single process is performing a large number of requests (in this case writing a large file). The writing process could monopolise the receive lock and prevent any other processes from receiving their replies. It also adds a new sysctl variable 'vfs.nfs.dwrite' which controls the behaviour that originally pointed out the problem. When a process writes to a file over NFS, it usually arranges for another process (the 'iod') to perform the request. If no iods are available, then it turns the write into a 'delayed write' which is later picked up by the next iod to do a write request for that file. This can cause that particular iod to do a disproportionate number of requests from a single process, which can harm performance on some NFS servers. The alternative is to perform the write synchronously in the context of the original writing process if no iod is available for asynchronous writing. The 'delayed write' behaviour is selected when vfs.nfs.dwrite=1 and the non-delayed behaviour is selected when vfs.nfs.dwrite=0. The default is vfs.nfs.dwrite=1; if many people tell me that performance is better if vfs.nfs.dwrite=0 then I will change the default. Submitted by: Hidetoshi Shimokawa <simokawa@sat.t.u-tokyo.ac.jp>
author: dfr <dfr@FreeBSD.org> 1996-10-11 10:15:33 +0000
committer: dfr <dfr@FreeBSD.org> 1996-10-11 10:15:33 +0000
commit: de60fb9205631ede6a176edd5d010140cfa266d3 (patch)
tree: 7d089c2d16ff47871a314141686d876a406a3741
parent: 2d340b514c77a95e6ab924510db5f79b7d3f448a (diff)
download: FreeBSD-src-de60fb9205631ede6a176edd5d010140cfa266d3.zip
FreeBSD-src-de60fb9205631ede6a176edd5d010140cfa266d3.tar.gz
5 files changed, 74 insertions, 20 deletions
diff --git a/sys/nfs/nfs_bio.c b/sys/nfs/nfs_bio.c
index 61e6d00..ce4b896 100644
--- a/sys/nfs/nfs_bio.c
+++ b/sys/nfs/nfs_bio.c
@@ -34,7 +34,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)nfs_bio.c	8.5 (Berkeley) 1/4/94
- * $Id: nfs_bio.c,v 1.24 1996/07/16 10:19:43 dfr Exp $
+ * $Id: nfs_bio.c,v 1.25 1996/09/19 18:20:54 nate Exp $
  */
 
 #include <sys/param.h>
@@ -46,6 +46,7 @@
 #include <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/kernel.h>
+#include <sys/sysctl.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
@@ -65,6 +66,9 @@ extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON];
 extern int nfs_numasync;
 extern struct nfsstats nfsstats;
 
+int nfs_dwrite = 1;
+SYSCTL_INT(_vfs_nfs, OID_AUTO, dwrite, CTLFLAG_RW, &nfs_dwrite, 0, "");
+
 /*
  * Ifdefs for FreeBSD-current's merged VM/buffer cache. It is unfortunate
  * that this isn't done inside getblk() and brelse() so these calls
@@ -753,6 +757,14 @@ nfs_asyncio(bp, cred)
 		return (EIO);
 
 	/*
+	 * Allow the administrator to override the choice of using a delayed
+	 * write since it is a pessimization for some servers, notably some
+	 * Solaris servers.
+	 */
+	if (!nfs_dwrite)
+		return (EIO);
+
+	/*
 	 * Just turn the async write into a delayed write, instead of
 	 * doing in synchronously. Hopefully, at least one of the nfsiods
 	 * is currently doing a write for this file and will pick up the
diff --git a/sys/nfs/nfs_socket.c b/sys/nfs/nfs_socket.c
index fb29864..6cafa8a 100644
--- a/sys/nfs/nfs_socket.c
+++ b/sys/nfs/nfs_socket.c
@@ -34,7 +34,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)nfs_socket.c	8.3 (Berkeley) 1/12/94
- * $Id: nfs_socket.c,v 1.16 1996/06/14 11:13:18 phk Exp $
+ * $Id: nfs_socket.c,v 1.17 1996/07/11 16:32:45 wollman Exp $
  */
 
 /*
@@ -681,15 +681,17 @@ nfs_reply(myrep)
 		 * sbwait() after someone else has received my reply for me.
 		 * Also necessary for connection based protocols to avoid
 		 * race conditions during a reconnect.
+		 * If nfs_rcvlock() returns EALREADY, that means that
+		 * the reply has already been recieved by another
+		 * process and we can return immediately.  In this
+		 * case, the lock is not taken to avoid races with
+		 * other processes.
 		 */
 		error = nfs_rcvlock(myrep);
+		if (error == EALREADY)
+			return (0);
 		if (error)
 			return (error);
-		/* Already received, bye bye */
-		if (myrep->r_mrep != NULL) {
-			nfs_rcvunlock(&nmp->nm_flag);
-			return (0);
-		}
 		/*
 		 * Get the next Rpc reply off the socket
 		 */
@@ -1494,6 +1496,14 @@ nfs_rcvlock(rep)
 		*flagp |= NFSMNT_WANTRCV;
 		(void) tsleep((caddr_t)flagp, slpflag | (PZERO - 1), "nfsrcvlk",
 			slptimeo);
+		/*
+		 * If our reply was recieved while we were sleeping,
+		 * then just return without taking the lock to avoid a
+		 * situation where a single iod could 'capture' the
+		 * recieve lock.
+		 */
+		if (rep->r_mrep != NULL)
+			return (EALREADY);
 		if (slpflag == PCATCH) {
 			slpflag = 0;
 			slptimeo = 2 * hz;
diff --git a/sys/nfsclient/nfs_bio.c b/sys/nfsclient/nfs_bio.c
index 61e6d00..ce4b896 100644
--- a/sys/nfsclient/nfs_bio.c
+++ b/sys/nfsclient/nfs_bio.c
@@ -34,7 +34,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)nfs_bio.c	8.5 (Berkeley) 1/4/94
- * $Id: nfs_bio.c,v 1.24 1996/07/16 10:19:43 dfr Exp $
+ * $Id: nfs_bio.c,v 1.25 1996/09/19 18:20:54 nate Exp $
  */
 
 #include <sys/param.h>
@@ -46,6 +46,7 @@
 #include <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/kernel.h>
+#include <sys/sysctl.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
@@ -65,6 +66,9 @@ extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON];
 extern int nfs_numasync;
 extern struct nfsstats nfsstats;
 
+int nfs_dwrite = 1;
+SYSCTL_INT(_vfs_nfs, OID_AUTO, dwrite, CTLFLAG_RW, &nfs_dwrite, 0, "");
+
 /*
  * Ifdefs for FreeBSD-current's merged VM/buffer cache. It is unfortunate
  * that this isn't done inside getblk() and brelse() so these calls
@@ -753,6 +757,14 @@ nfs_asyncio(bp, cred)
 		return (EIO);
 
 	/*
+	 * Allow the administrator to override the choice of using a delayed
+	 * write since it is a pessimization for some servers, notably some
+	 * Solaris servers.
+	 */
+	if (!nfs_dwrite)
+		return (EIO);
+
+	/*
 	 * Just turn the async write into a delayed write, instead of
 	 * doing in synchronously. Hopefully, at least one of the nfsiods
 	 * is currently doing a write for this file and will pick up the
diff --git a/sys/nfsclient/nfs_socket.c b/sys/nfsclient/nfs_socket.c
index fb29864..6cafa8a 100644
--- a/sys/nfsclient/nfs_socket.c
+++ b/sys/nfsclient/nfs_socket.c
@@ -34,7 +34,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)nfs_socket.c	8.3 (Berkeley) 1/12/94
- * $Id: nfs_socket.c,v 1.16 1996/06/14 11:13:18 phk Exp $
+ * $Id: nfs_socket.c,v 1.17 1996/07/11 16:32:45 wollman Exp $
  */
 
 /*
@@ -681,15 +681,17 @@ nfs_reply(myrep)
 		 * sbwait() after someone else has received my reply for me.
 		 * Also necessary for connection based protocols to avoid
 		 * race conditions during a reconnect.
+		 * If nfs_rcvlock() returns EALREADY, that means that
+		 * the reply has already been recieved by another
+		 * process and we can return immediately.  In this
+		 * case, the lock is not taken to avoid races with
+		 * other processes.
 		 */
 		error = nfs_rcvlock(myrep);
+		if (error == EALREADY)
+			return (0);
 		if (error)
 			return (error);
-		/* Already received, bye bye */
-		if (myrep->r_mrep != NULL) {
-			nfs_rcvunlock(&nmp->nm_flag);
-			return (0);
-		}
 		/*
 		 * Get the next Rpc reply off the socket
 		 */
@@ -1494,6 +1496,14 @@ nfs_rcvlock(rep)
 		*flagp |= NFSMNT_WANTRCV;
 		(void) tsleep((caddr_t)flagp, slpflag | (PZERO - 1), "nfsrcvlk",
 			slptimeo);
+		/*
+		 * If our reply was recieved while we were sleeping,
+		 * then just return without taking the lock to avoid a
+		 * situation where a single iod could 'capture' the
+		 * recieve lock.
+		 */
+		if (rep->r_mrep != NULL)
+			return (EALREADY);
 		if (slpflag == PCATCH) {
 			slpflag = 0;
 			slptimeo = 2 * hz;
diff --git a/sys/nfsserver/nfs_srvsock.c b/sys/nfsserver/nfs_srvsock.c
index fb29864..6cafa8a 100644
--- a/sys/nfsserver/nfs_srvsock.c
+++ b/sys/nfsserver/nfs_srvsock.c
@@ -34,7 +34,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)nfs_socket.c	8.3 (Berkeley) 1/12/94
- * $Id: nfs_socket.c,v 1.16 1996/06/14 11:13:18 phk Exp $
+ * $Id: nfs_socket.c,v 1.17 1996/07/11 16:32:45 wollman Exp $
  */
 
 /*
@@ -681,15 +681,17 @@ nfs_reply(myrep)
 		 * sbwait() after someone else has received my reply for me.
 		 * Also necessary for connection based protocols to avoid
 		 * race conditions during a reconnect.
+		 * If nfs_rcvlock() returns EALREADY, that means that
+		 * the reply has already been recieved by another
+		 * process and we can return immediately.  In this
+		 * case, the lock is not taken to avoid races with
+		 * other processes.
 		 */
 		error = nfs_rcvlock(myrep);
+		if (error == EALREADY)
+			return (0);
 		if (error)
 			return (error);
-		/* Already received, bye bye */
-		if (myrep->r_mrep != NULL) {
-			nfs_rcvunlock(&nmp->nm_flag);
-			return (0);
-		}
 		/*
 		 * Get the next Rpc reply off the socket
 		 */
@@ -1494,6 +1496,14 @@ nfs_rcvlock(rep)
 		*flagp |= NFSMNT_WANTRCV;
 		(void) tsleep((caddr_t)flagp, slpflag | (PZERO - 1), "nfsrcvlk",
 			slptimeo);
+		/*
+		 * If our reply was recieved while we were sleeping,
+		 * then just return without taking the lock to avoid a
+		 * situation where a single iod could 'capture' the
+		 * recieve lock.
+		 */
+		if (rep->r_mrep != NULL)
+			return (EALREADY);
 		if (slpflag == PCATCH) {
 			slpflag = 0;
 			slptimeo = 2 * hz;
author	dfr <dfr@FreeBSD.org>	1996-10-11 10:15:33 +0000
committer	dfr <dfr@FreeBSD.org>	1996-10-11 10:15:33 +0000
commit	de60fb9205631ede6a176edd5d010140cfa266d3 (patch)
tree	7d089c2d16ff47871a314141686d876a406a3741
parent	2d340b514c77a95e6ab924510db5f79b7d3f448a (diff)
download	FreeBSD-src-de60fb9205631ede6a176edd5d010140cfa266d3.zip FreeBSD-src-de60fb9205631ede6a176edd5d010140cfa266d3.tar.gz