diff options
author | dfr <dfr@FreeBSD.org> | 1996-10-11 10:15:33 +0000 |
---|---|---|
committer | dfr <dfr@FreeBSD.org> | 1996-10-11 10:15:33 +0000 |
commit | de60fb9205631ede6a176edd5d010140cfa266d3 (patch) | |
tree | 7d089c2d16ff47871a314141686d876a406a3741 | |
parent | 2d340b514c77a95e6ab924510db5f79b7d3f448a (diff) | |
download | FreeBSD-src-de60fb9205631ede6a176edd5d010140cfa266d3.zip FreeBSD-src-de60fb9205631ede6a176edd5d010140cfa266d3.tar.gz |
This fixes a problem with the nfs socket handling code which happens
if a single process is performing a large number of requests (in this
case writing a large file). The writing process could monopolise the
receive lock and prevent any other processes from receiving their
replies.
It also adds a new sysctl variable 'vfs.nfs.dwrite' which controls the
behaviour which originally pointed out the problem. When a process
writes to a file over NFS, it usually arranges for another process
(the 'iod') to perform the request. If no iods are available, then it
turns the write into a 'delayed write' which is later picked up by the
next iod to do a write request for that file. This can cause that
particular iod to do a disproportionate number of requests from a
single process which can harm performance on some NFS servers. The
alternative is to perform the write synchronously in the context of
the original writing process if no iod is available for asynchronous
writing.
The 'delayed write' behaviour is selected when vfs.nfs.dwrite=1 and
the non-delayed behaviour is selected when vfs.nfs.dwrite=0. The
default is vfs.nfs.dwrite=1; if many people tell me that performance
is better if vfs.nfs.dwrite=0 then I will change the default.
Submitted by: Hidetoshi Shimokawa <simokawa@sat.t.u-tokyo.ac.jp>
-rw-r--r-- | sys/nfs/nfs_bio.c | 14 | ||||
-rw-r--r-- | sys/nfs/nfs_socket.c | 22 | ||||
-rw-r--r-- | sys/nfsclient/nfs_bio.c | 14 | ||||
-rw-r--r-- | sys/nfsclient/nfs_socket.c | 22 | ||||
-rw-r--r-- | sys/nfsserver/nfs_srvsock.c | 22 |
5 files changed, 74 insertions, 20 deletions
diff --git a/sys/nfs/nfs_bio.c b/sys/nfs/nfs_bio.c index 61e6d00..ce4b896 100644 --- a/sys/nfs/nfs_bio.c +++ b/sys/nfs/nfs_bio.c @@ -34,7 +34,7 @@ * SUCH DAMAGE. * * @(#)nfs_bio.c 8.5 (Berkeley) 1/4/94 - * $Id: nfs_bio.c,v 1.24 1996/07/16 10:19:43 dfr Exp $ + * $Id: nfs_bio.c,v 1.25 1996/09/19 18:20:54 nate Exp $ */ #include <sys/param.h> @@ -46,6 +46,7 @@ #include <sys/vnode.h> #include <sys/mount.h> #include <sys/kernel.h> +#include <sys/sysctl.h> #include <vm/vm.h> #include <vm/vm_param.h> @@ -65,6 +66,9 @@ extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON]; extern int nfs_numasync; extern struct nfsstats nfsstats; +int nfs_dwrite = 1; +SYSCTL_INT(_vfs_nfs, OID_AUTO, dwrite, CTLFLAG_RW, &nfs_dwrite, 0, ""); + /* * Ifdefs for FreeBSD-current's merged VM/buffer cache. It is unfortunate * that this isn't done inside getblk() and brelse() so these calls @@ -753,6 +757,14 @@ nfs_asyncio(bp, cred) return (EIO); /* + * Allow the administrator to override the choice of using a delayed + * write since it is a pessimization for some servers, notably some + * Solaris servers. + */ + if (!nfs_dwrite) + return (EIO); + + /* * Just turn the async write into a delayed write, instead of * doing in synchronously. Hopefully, at least one of the nfsiods * is currently doing a write for this file and will pick up the diff --git a/sys/nfs/nfs_socket.c b/sys/nfs/nfs_socket.c index fb29864..6cafa8a 100644 --- a/sys/nfs/nfs_socket.c +++ b/sys/nfs/nfs_socket.c @@ -34,7 +34,7 @@ * SUCH DAMAGE. * * @(#)nfs_socket.c 8.3 (Berkeley) 1/12/94 - * $Id: nfs_socket.c,v 1.16 1996/06/14 11:13:18 phk Exp $ + * $Id: nfs_socket.c,v 1.17 1996/07/11 16:32:45 wollman Exp $ */ /* @@ -681,15 +681,17 @@ nfs_reply(myrep) * sbwait() after someone else has received my reply for me. * Also necessary for connection based protocols to avoid * race conditions during a reconnect. 
+ * If nfs_rcvlock() returns EALREADY, that means that + * the reply has already been recieved by another + * process and we can return immediately. In this + * case, the lock is not taken to avoid races with + * other processes. */ error = nfs_rcvlock(myrep); + if (error == EALREADY) + return (0); if (error) return (error); - /* Already received, bye bye */ - if (myrep->r_mrep != NULL) { - nfs_rcvunlock(&nmp->nm_flag); - return (0); - } /* * Get the next Rpc reply off the socket */ @@ -1494,6 +1496,14 @@ nfs_rcvlock(rep) *flagp |= NFSMNT_WANTRCV; (void) tsleep((caddr_t)flagp, slpflag | (PZERO - 1), "nfsrcvlk", slptimeo); + /* + * If our reply was recieved while we were sleeping, + * then just return without taking the lock to avoid a + * situation where a single iod could 'capture' the + * recieve lock. + */ + if (rep->r_mrep != NULL) + return (EALREADY); if (slpflag == PCATCH) { slpflag = 0; slptimeo = 2 * hz; diff --git a/sys/nfsclient/nfs_bio.c b/sys/nfsclient/nfs_bio.c index 61e6d00..ce4b896 100644 --- a/sys/nfsclient/nfs_bio.c +++ b/sys/nfsclient/nfs_bio.c @@ -34,7 +34,7 @@ * SUCH DAMAGE. * * @(#)nfs_bio.c 8.5 (Berkeley) 1/4/94 - * $Id: nfs_bio.c,v 1.24 1996/07/16 10:19:43 dfr Exp $ + * $Id: nfs_bio.c,v 1.25 1996/09/19 18:20:54 nate Exp $ */ #include <sys/param.h> @@ -46,6 +46,7 @@ #include <sys/vnode.h> #include <sys/mount.h> #include <sys/kernel.h> +#include <sys/sysctl.h> #include <vm/vm.h> #include <vm/vm_param.h> @@ -65,6 +66,9 @@ extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON]; extern int nfs_numasync; extern struct nfsstats nfsstats; +int nfs_dwrite = 1; +SYSCTL_INT(_vfs_nfs, OID_AUTO, dwrite, CTLFLAG_RW, &nfs_dwrite, 0, ""); + /* * Ifdefs for FreeBSD-current's merged VM/buffer cache. 
It is unfortunate * that this isn't done inside getblk() and brelse() so these calls @@ -753,6 +757,14 @@ nfs_asyncio(bp, cred) return (EIO); /* + * Allow the administrator to override the choice of using a delayed + * write since it is a pessimization for some servers, notably some + * Solaris servers. + */ + if (!nfs_dwrite) + return (EIO); + + /* * Just turn the async write into a delayed write, instead of * doing in synchronously. Hopefully, at least one of the nfsiods * is currently doing a write for this file and will pick up the diff --git a/sys/nfsclient/nfs_socket.c b/sys/nfsclient/nfs_socket.c index fb29864..6cafa8a 100644 --- a/sys/nfsclient/nfs_socket.c +++ b/sys/nfsclient/nfs_socket.c @@ -34,7 +34,7 @@ * SUCH DAMAGE. * * @(#)nfs_socket.c 8.3 (Berkeley) 1/12/94 - * $Id: nfs_socket.c,v 1.16 1996/06/14 11:13:18 phk Exp $ + * $Id: nfs_socket.c,v 1.17 1996/07/11 16:32:45 wollman Exp $ */ /* @@ -681,15 +681,17 @@ nfs_reply(myrep) * sbwait() after someone else has received my reply for me. * Also necessary for connection based protocols to avoid * race conditions during a reconnect. + * If nfs_rcvlock() returns EALREADY, that means that + * the reply has already been recieved by another + * process and we can return immediately. In this + * case, the lock is not taken to avoid races with + * other processes. */ error = nfs_rcvlock(myrep); + if (error == EALREADY) + return (0); if (error) return (error); - /* Already received, bye bye */ - if (myrep->r_mrep != NULL) { - nfs_rcvunlock(&nmp->nm_flag); - return (0); - } /* * Get the next Rpc reply off the socket */ @@ -1494,6 +1496,14 @@ nfs_rcvlock(rep) *flagp |= NFSMNT_WANTRCV; (void) tsleep((caddr_t)flagp, slpflag | (PZERO - 1), "nfsrcvlk", slptimeo); + /* + * If our reply was recieved while we were sleeping, + * then just return without taking the lock to avoid a + * situation where a single iod could 'capture' the + * recieve lock. 
+ */ + if (rep->r_mrep != NULL) + return (EALREADY); if (slpflag == PCATCH) { slpflag = 0; slptimeo = 2 * hz; diff --git a/sys/nfsserver/nfs_srvsock.c b/sys/nfsserver/nfs_srvsock.c index fb29864..6cafa8a 100644 --- a/sys/nfsserver/nfs_srvsock.c +++ b/sys/nfsserver/nfs_srvsock.c @@ -34,7 +34,7 @@ * SUCH DAMAGE. * * @(#)nfs_socket.c 8.3 (Berkeley) 1/12/94 - * $Id: nfs_socket.c,v 1.16 1996/06/14 11:13:18 phk Exp $ + * $Id: nfs_socket.c,v 1.17 1996/07/11 16:32:45 wollman Exp $ */ /* @@ -681,15 +681,17 @@ nfs_reply(myrep) * sbwait() after someone else has received my reply for me. * Also necessary for connection based protocols to avoid * race conditions during a reconnect. + * If nfs_rcvlock() returns EALREADY, that means that + * the reply has already been recieved by another + * process and we can return immediately. In this + * case, the lock is not taken to avoid races with + * other processes. */ error = nfs_rcvlock(myrep); + if (error == EALREADY) + return (0); if (error) return (error); - /* Already received, bye bye */ - if (myrep->r_mrep != NULL) { - nfs_rcvunlock(&nmp->nm_flag); - return (0); - } /* * Get the next Rpc reply off the socket */ @@ -1494,6 +1496,14 @@ nfs_rcvlock(rep) *flagp |= NFSMNT_WANTRCV; (void) tsleep((caddr_t)flagp, slpflag | (PZERO - 1), "nfsrcvlk", slptimeo); + /* + * If our reply was recieved while we were sleeping, + * then just return without taking the lock to avoid a + * situation where a single iod could 'capture' the + * recieve lock. + */ + if (rep->r_mrep != NULL) + return (EALREADY); if (slpflag == PCATCH) { slpflag = 0; slptimeo = 2 * hz; |