summaryrefslogtreecommitdiffstats
path: root/sys/nfsclient
diff options
context:
space:
mode:
authorgreen <green@FreeBSD.org>2005-06-10 23:50:41 +0000
committergreen <green@FreeBSD.org>2005-06-10 23:50:41 +0000
commitff904ffb643a4538cfbfba4658052e1b36e2bfe5 (patch)
tree4eeddcd51bbe2b1f2ab59415946715327f429a80 /sys/nfsclient
parent613f0c4df2e951f81d91d8550f1ce05c8aa55f2d (diff)
downloadFreeBSD-src-ff904ffb643a4538cfbfba4658052e1b36e2bfe5.zip
FreeBSD-src-ff904ffb643a4538cfbfba4658052e1b36e2bfe5.tar.gz
Fix a serious deadlock with the NFS client. Given a large enough
atomic write request, it can fill the buffer cache with the entirety
of that write in order to handle retries. However, it never drops the
vnode lock, or else it wouldn't be atomic, so it ends up waiting
indefinitely for more buf memory that cannot be obtained because the
write itself already holds all of it, and it waits in an uncancellable
state.

To fix this, hibufspace is exported and scaled to a reasonable
fraction. This is used as the limit of how much of an atomic write
request by the NFS client will be handled asynchronously. If the
request is larger than this, it will be turned into a synchronous
request which won't deadlock the system. It's possible this value is
far off from what is required by some, so it shall be tunable as soon
as mount_nfs(8) learns of the new field.

The slowdown between an asynchronous and a synchronous write on NFS
appears to be on the order of 2x-4x.

General nod by: gad
MFC after: 2 weeks
More testing: wes
PR: kern/79208
Diffstat (limited to 'sys/nfsclient')
-rw-r--r--sys/nfsclient/nfs_bio.c65
-rw-r--r--sys/nfsclient/nfs_vfsops.c9
-rw-r--r--sys/nfsclient/nfsargs.h4
-rw-r--r--sys/nfsclient/nfsmount.h1
4 files changed, 77 insertions, 2 deletions
diff --git a/sys/nfsclient/nfs_bio.c b/sys/nfsclient/nfs_bio.c
index 17c3c3d..4eb461d 100644
--- a/sys/nfsclient/nfs_bio.c
+++ b/sys/nfsclient/nfs_bio.c
@@ -873,6 +873,14 @@ nfs_write(struct vop_write_args *ap)
*/
if (ioflag & (IO_APPEND | IO_SYNC)) {
if (np->n_flag & NMODIFIED) {
+ /*
+ * Require non-blocking, synchronous writes to
+ * dirty files to inform the program it needs
+ * to fsync(2) explicitly.
+ */
+ if (ioflag & IO_NDELAY)
+ return (EAGAIN);
+flush_and_restart:
np->n_attrstamp = 0;
error = nfs_vinvalbuf(vp, V_SAVE, td, 1);
if (error)
@@ -953,6 +961,63 @@ restart:
}
biosize = vp->v_mount->mnt_stat.f_iosize;
+ /*
+ * Find all of this file's B_NEEDCOMMIT buffers. If our writes
+ * would exceed the local maximum per-file write commit size when
+ * combined with those, we must decide whether to flush,
+ * go synchronous, or return error. We don't bother checking
+ * IO_UNIT -- we just make all writes atomic anyway, as there's
+ * no point optimizing for something that really won't ever happen.
+ */
+ if (!(ioflag & IO_SYNC)) {
+ int needrestart = 0;
+ if (nmp->nm_wcommitsize < uio->uio_resid) {
+ /*
+ * If this request could not possibly be completed
+ * without exceeding the maximum outstanding write
+ * commit size, see if we can convert it into a
+ * synchronous write operation.
+ */
+ if (ioflag & IO_NDELAY)
+ return (EAGAIN);
+ ioflag |= IO_SYNC;
+ if (np->n_flag & NMODIFIED)
+ needrestart = 1;
+ } else if (np->n_flag & NMODIFIED) {
+ int wouldcommit = 0;
+ BO_LOCK(&vp->v_bufobj);
+ if (vp->v_bufobj.bo_dirty.bv_cnt != 0) {
+ TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd,
+ b_bobufs) {
+ if (bp->b_flags & B_NEEDCOMMIT)
+ wouldcommit += bp->b_bcount;
+ }
+ }
+ BO_UNLOCK(&vp->v_bufobj);
+ /*
+ * Since we're not operating synchronously and
+ * bypassing the buffer cache, we are in a commit
+ * and holding all of these buffers whether
+ * transmitted or not. If not limited, this
+ * will lead to the buffer cache deadlocking,
+ * as no one else can flush our uncommitted buffers.
+ */
+ wouldcommit += uio->uio_resid;
+ /*
+ * If we would initially exceed the maximum
+ * outstanding write commit size, flush and restart.
+ */
+ if (wouldcommit > nmp->nm_wcommitsize)
+ needrestart = 1;
+ }
+ if (needrestart) {
+ if (haverslock) {
+ nfs_rsunlock(np, td);
+ haverslock = 0;
+ }
+ goto flush_and_restart;
+ }
+ }
do {
nfsstats.biocache_writes++;
diff --git a/sys/nfsclient/nfs_vfsops.c b/sys/nfsclient/nfs_vfsops.c
index e7a3a1a..9cf9b2b 100644
--- a/sys/nfsclient/nfs_vfsops.c
+++ b/sys/nfsclient/nfs_vfsops.c
@@ -41,6 +41,8 @@ __FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
@@ -633,6 +635,12 @@ nfs_decode_args(struct mount *mp, struct nfsmount *nmp, struct nfs_args *argp)
else
nmp->nm_readahead = NFS_MAXRAHEAD;
}
+ if ((argp->flags & NFSMNT_WCOMMITSIZE) && argp->wcommitsize >= 0) {
+ if (argp->wcommitsize < nmp->nm_wsize)
+ nmp->nm_wcommitsize = nmp->nm_wsize;
+ else
+ nmp->nm_wcommitsize = argp->wcommitsize;
+ }
if ((argp->flags & NFSMNT_DEADTHRESH) && argp->deadthresh >= 0) {
if (argp->deadthresh <= NFS_MAXDEADTHRESH)
nmp->nm_deadthresh = argp->deadthresh;
@@ -815,6 +823,7 @@ mountnfs(struct nfs_args *argp, struct mount *mp, struct sockaddr *nam,
nmp->nm_wsize = NFS_WSIZE;
nmp->nm_rsize = NFS_RSIZE;
}
+ nmp->nm_wcommitsize = hibufspace / (desiredvnodes / 1000);
nmp->nm_readdirsize = NFS_READDIRSIZE;
nmp->nm_numgrps = NFS_MAXGRPS;
nmp->nm_readahead = NFS_DEFRAHEAD;
diff --git a/sys/nfsclient/nfsargs.h b/sys/nfsclient/nfsargs.h
index 8aa2392..7ebf1a0 100644
--- a/sys/nfsclient/nfsargs.h
+++ b/sys/nfsclient/nfsargs.h
@@ -56,7 +56,7 @@ struct nfs_args {
int retrans; /* times to retry send */
int maxgrouplist; /* Max. size of group list */
int readahead; /* # of blocks to readahead */
- int __pad1; /* was "leaseterm" */
+ int wcommitsize; /* Max. write commit size in bytes */
int deadthresh; /* Retrans threshold */
char *hostname; /* server's name */
int acregmin; /* cache attrs for reg files min time */
@@ -80,7 +80,7 @@ struct nfs_args {
#define NFSMNT_NFSV3 0x00000200 /* Use NFS Version 3 protocol */
/* 0x400 free, was NFSMNT_KERB */
#define NFSMNT_DUMBTIMR 0x00000800 /* Don't estimate rtt dynamically */
-/* 0x1000 free, was NFSMNT_LEASETERM */
+#define NFSMNT_WCOMMITSIZE 0x00001000 /* set max write commit size */
#define NFSMNT_READAHEAD 0x00002000 /* set read ahead */
#define NFSMNT_DEADTHRESH 0x00004000 /* set dead server retry thresh */
#define NFSMNT_RESVPORT 0x00008000 /* Allocate a reserved port */
diff --git a/sys/nfsclient/nfsmount.h b/sys/nfsclient/nfsmount.h
index 7f23aaa..a8bcc14 100644
--- a/sys/nfsclient/nfsmount.h
+++ b/sys/nfsclient/nfsmount.h
@@ -74,6 +74,7 @@ struct nfsmount {
int nm_wsize; /* Max size of write rpc */
int nm_readdirsize; /* Size of a readdir rpc */
int nm_readahead; /* Num. of blocks to readahead */
+ int nm_wcommitsize; /* Max size of commit for write */
int nm_acdirmin; /* Directory attr cache min lifetime */
int nm_acdirmax; /* Directory attr cache max lifetime */
int nm_acregmin; /* Reg file attr cache min lifetime */
OpenPOWER on IntegriCloud