diff options
author | dfr <dfr@FreeBSD.org> | 1996-11-06 10:53:16 +0000 |
---|---|---|
committer | dfr <dfr@FreeBSD.org> | 1996-11-06 10:53:16 +0000 |
commit | 4c8f7388e578751aab31ce0b8a982373a002aa7e (patch) | |
tree | b99fa8ba7a7ef13dafb400913ede1cef59f59743 /sys/nfs | |
parent | d7d2cfcbf0e341b9b9672dcde6e1be024ed506b9 (diff) | |
download | FreeBSD-src-4c8f7388e578751aab31ce0b8a982373a002aa7e.zip FreeBSD-src-4c8f7388e578751aab31ce0b8a982373a002aa7e.tar.gz |
Improve the queueing algorithms used by NFS's asynchronous i/o. The
existing mechanism uses a global queue for some buffers and the
vp->b_dirtyblkhd queue for others. This turns sequential writes into
randomly ordered writes to the server, affecting both read and write
performance. The existing mechanism also copes badly with hung
servers, tending to block accesses to other servers when all the iods
are waiting for a hung server.
The new mechanism uses a queue for each mount point. All asynchronous
i/o goes through this queue, which preserves the ordering of requests.
A simple mechanism ensures that the iods are shared out fairly between
active mount points. This removes the sysctl variable vfs.nfs.dwrite
since the new queueing mechanism removes the old delayed write code
completely.
This should go into the 2.2 branch.
Diffstat (limited to 'sys/nfs')
-rw-r--r-- | sys/nfs/nfs.h | 4 | ||||
-rw-r--r-- | sys/nfs/nfs_bio.c | 120 | ||||
-rw-r--r-- | sys/nfs/nfs_common.c | 8 | ||||
-rw-r--r-- | sys/nfs/nfs_subs.c | 8 | ||||
-rw-r--r-- | sys/nfs/nfs_syscalls.c | 80 | ||||
-rw-r--r-- | sys/nfs/nfs_vfsops.c | 7 | ||||
-rw-r--r-- | sys/nfs/nfs_vnops.c | 3 | ||||
-rw-r--r-- | sys/nfs/nfsmount.h | 23 | ||||
-rw-r--r-- | sys/nfs/nfsnode.h | 4 |
9 files changed, 168 insertions, 89 deletions
diff --git a/sys/nfs/nfs.h b/sys/nfs/nfs.h index 3d9d801..bc2edeb 100644 --- a/sys/nfs/nfs.h +++ b/sys/nfs/nfs.h @@ -34,7 +34,7 @@ * SUCH DAMAGE. * * @(#)nfs.h 8.1 (Berkeley) 6/10/93 - * $Id: nfs.h,v 1.17 1996/01/30 22:59:39 mpp Exp $ + * $Id: nfs.h,v 1.18 1996/08/21 21:55:44 dyson Exp $ */ #ifndef _NFS_NFS_H_ @@ -335,7 +335,7 @@ extern TAILQ_HEAD(nfs_reqq, nfsreq) nfs_reqq; #define NWDELAYHASH(sock, f) \ (&(sock)->ns_wdelayhashtbl[(*((u_long *)(f))) % NFS_WDELAYHASHSIZ]) #ifndef NFS_MUIDHASHSIZ -#define NFS_MUIDHASHSIZ 67 /* Tune the size of nfsmount with this */ +#define NFS_MUIDHASHSIZ 63 /* Tune the size of nfsmount with this */ #endif #define NMUIDHASH(nmp, uid) \ (&(nmp)->nm_uidhashtbl[(uid) % NFS_MUIDHASHSIZ]) diff --git a/sys/nfs/nfs_bio.c b/sys/nfs/nfs_bio.c index c57db67..da03a9d 100644 --- a/sys/nfs/nfs_bio.c +++ b/sys/nfs/nfs_bio.c @@ -34,7 +34,7 @@ * SUCH DAMAGE. * * @(#)nfs_bio.c 8.5 (Berkeley) 1/4/94 - * $Id: nfs_bio.c,v 1.27 1996/10/12 17:39:39 bde Exp $ + * $Id: nfs_bio.c,v 1.28 1996/10/21 10:07:48 dfr Exp $ */ #include <sys/param.h> @@ -62,13 +62,9 @@ static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size, struct proc *p)); -extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON]; extern int nfs_numasync; extern struct nfsstats nfsstats; -static int nfs_dwrite = 1; -SYSCTL_INT(_vfs_nfs, OID_AUTO, dwrite, CTLFLAG_RW, &nfs_dwrite, 0, ""); - /* * Ifdefs for FreeBSD-current's merged VM/buffer cache. It is unfortunate * that this isn't done inside getblk() and brelse() so these calls @@ -727,12 +723,87 @@ nfs_asyncio(bp, cred) register struct buf *bp; struct ucred *cred; { - register int i; + struct nfsmount *nmp; + int i; + int gotiod; + int slpflag = 0; + int slptimeo = 0; + int error; if (nfs_numasync == 0) return (EIO); + + nmp = VFSTONFS(bp->b_vp->v_mount); +again: + if (nmp->nm_flag & NFSMNT_INT) + slpflag = PCATCH; + gotiod = FALSE; + + /* + * Find a free iod to process this request. 
+ */ for (i = 0; i < NFS_MAXASYNCDAEMON; i++) - if (nfs_iodwant[i]) { + if (nfs_iodwant[i]) { + /* + * Found one, so wake it up and tell it which + * mount to process. + */ + NFS_DPF(ASYNCIO, + ("nfs_asyncio: waking iod %d for mount %p\n", + i, nmp)); + nfs_iodwant[i] = (struct proc *)0; + nfs_iodmount[i] = nmp; + nmp->nm_bufqiods++; + wakeup((caddr_t)&nfs_iodwant[i]); + gotiod = TRUE; + } + + /* + * If none are free, we may already have an iod working on this mount + * point. If so, it will process our request. + */ + if (!gotiod) { + if (nmp->nm_bufqiods > 0) { + NFS_DPF(ASYNCIO, + ("nfs_asyncio: %d iods are already processing mount %p\n", + nmp->nm_bufqiods, nmp)); + gotiod = TRUE; + } + } + + /* + * If we have an iod which can process the request, then queue + * the buffer. + */ + if (gotiod) { + /* + * Ensure that the queue never grows too large. + */ + while (nmp->nm_bufqlen >= 2*nfs_numasync) { + NFS_DPF(ASYNCIO, + ("nfs_asyncio: waiting for mount %p queue to drain\n", nmp)); + nmp->nm_bufqwant = TRUE; + error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO, + "nfsaio", slptimeo); + if (error) { + if (nfs_sigintr(nmp, NULL, bp->b_proc)) + return (EINTR); + if (slpflag == PCATCH) { + slpflag = 0; + slptimeo = 2 * hz; + } + } + /* + * We might have lost our iod while sleeping, + * so check and loop if nescessary. + */ + if (nmp->nm_bufqiods == 0) { + NFS_DPF(ASYNCIO, + ("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp)); + goto again; + } + } + if (bp->b_flags & B_READ) { if (bp->b_rcred == NOCRED && cred != NOCRED) { crhold(cred); @@ -746,38 +817,17 @@ nfs_asyncio(bp, cred) } } - TAILQ_INSERT_TAIL(&nfs_bufq, bp, b_freelist); - nfs_iodwant[i] = (struct proc *)0; - wakeup((caddr_t)&nfs_iodwant[i]); + TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist); + nmp->nm_bufqlen++; return (0); - } - - /* - * If it is a read or a write already marked B_WRITEINPROG or B_NOCACHE - * return EIO so the process will call nfs_doio() and do it - * synchronously. 
- */ - if (bp->b_flags & (B_READ | B_WRITEINPROG | B_NOCACHE)) - return (EIO); - - /* - * Allow the administrator to override the choice of using a delayed - * write since it is a pessimization for some servers, notably some - * Solaris servers. - */ - if (!nfs_dwrite) - return (EIO); + } /* - * Just turn the async write into a delayed write, instead of - * doing in synchronously. Hopefully, at least one of the nfsiods - * is currently doing a write for this file and will pick up the - * delayed writes before going back to sleep. + * All the iods are busy on other mounts, so return EIO to + * force the caller to process the i/o synchronously. */ - bp->b_flags |= B_DELWRI; - reassignbuf(bp, bp->b_vp); - biodone(bp); - return (0); + NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n")); + return (EIO); } /* diff --git a/sys/nfs/nfs_common.c b/sys/nfs/nfs_common.c index 7d97f09..2566fe6 100644 --- a/sys/nfs/nfs_common.c +++ b/sys/nfs/nfs_common.c @@ -34,7 +34,7 @@ * SUCH DAMAGE. * * @(#)nfs_subs.c 8.3 (Berkeley) 1/4/94 - * $Id: nfs_subs.c,v 1.32 1996/08/21 21:55:51 dyson Exp $ + * $Id: nfs_subs.c,v 1.33 1996/09/19 18:20:59 nate Exp $ */ /* @@ -538,7 +538,6 @@ static short *nfsrv_v3errmap[] = { #endif /* NFS_NOSERVER */ -extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON]; extern struct nfsrtt nfsrtt; extern time_t nqnfsstarttime; extern int nqsrv_clockskew; @@ -1136,9 +1135,10 @@ nfs_init() if (nfs_ticks < 1) nfs_ticks = 1; /* Ensure async daemons disabled */ - for (i = 0; i < NFS_MAXASYNCDAEMON; i++) + for (i = 0; i < NFS_MAXASYNCDAEMON; i++) { nfs_iodwant[i] = (struct proc *)0; - TAILQ_INIT(&nfs_bufq); + nfs_iodmount[i] = (struct nfsmount *)0; + } nfs_nhinit(); /* Init the nfsnode table */ #ifndef NFS_NOSERVER nfsrv_init(0); /* Init server data structures */ diff --git a/sys/nfs/nfs_subs.c b/sys/nfs/nfs_subs.c index 7d97f09..2566fe6 100644 --- a/sys/nfs/nfs_subs.c +++ b/sys/nfs/nfs_subs.c @@ -34,7 +34,7 @@ * SUCH DAMAGE. 
* * @(#)nfs_subs.c 8.3 (Berkeley) 1/4/94 - * $Id: nfs_subs.c,v 1.32 1996/08/21 21:55:51 dyson Exp $ + * $Id: nfs_subs.c,v 1.33 1996/09/19 18:20:59 nate Exp $ */ /* @@ -538,7 +538,6 @@ static short *nfsrv_v3errmap[] = { #endif /* NFS_NOSERVER */ -extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON]; extern struct nfsrtt nfsrtt; extern time_t nqnfsstarttime; extern int nqsrv_clockskew; @@ -1136,9 +1135,10 @@ nfs_init() if (nfs_ticks < 1) nfs_ticks = 1; /* Ensure async daemons disabled */ - for (i = 0; i < NFS_MAXASYNCDAEMON; i++) + for (i = 0; i < NFS_MAXASYNCDAEMON; i++) { nfs_iodwant[i] = (struct proc *)0; - TAILQ_INIT(&nfs_bufq); + nfs_iodmount[i] = (struct nfsmount *)0; + } nfs_nhinit(); /* Init the nfsnode table */ #ifndef NFS_NOSERVER nfsrv_init(0); /* Init server data structures */ diff --git a/sys/nfs/nfs_syscalls.c b/sys/nfs/nfs_syscalls.c index d94b7e6..e2f3435 100644 --- a/sys/nfs/nfs_syscalls.c +++ b/sys/nfs/nfs_syscalls.c @@ -34,13 +34,14 @@ * SUCH DAMAGE. * * @(#)nfs_syscalls.c 8.3 (Berkeley) 1/4/94 - * $Id: nfs_syscalls.c,v 1.13 1996/01/13 23:27:58 phk Exp $ + * $Id: nfs_syscalls.c,v 1.14 1996/04/30 23:26:52 bde Exp $ */ #include <sys/param.h> #include <sys/systm.h> #include <sys/sysproto.h> #include <sys/kernel.h> +#include <sys/sysctl.h> #include <sys/file.h> #include <sys/filedesc.h> #include <sys/stat.h> @@ -79,7 +80,6 @@ extern int (*nfsrv3_procs[NFS_NPROCS]) __P((struct nfsrv_descript *nd, struct nfssvc_sock *slp, struct proc *procp, struct mbuf **mreqp)); -extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON]; extern int nfs_numasync; extern time_t nqnfsstarttime; extern int nqsrv_writeslack; @@ -715,6 +715,9 @@ done: } #endif /* NFS_NOSERVER */ +int nfs_defect = 0; +SYSCTL_INT(_vfs_nfs, OID_AUTO, defect, CTLFLAG_RW, &nfs_defect, 0, ""); + /* * Asynchronous I/O daemons for client nfs. * They do read-ahead and write-behind operations on the block I/O cache. 
@@ -727,6 +730,7 @@ nfssvc_iod(p) register struct buf *bp, *nbp; register int i, myiod; struct vnode *vp; + struct nfsmount *nmp; int error = 0, s; /* @@ -746,53 +750,49 @@ nfssvc_iod(p) * Just loop around doin our stuff until SIGKILL */ for (;;) { - while (nfs_bufq.tqh_first == NULL && error == 0) { + while (((nmp = nfs_iodmount[myiod]) == NULL + || nmp->nm_bufq.tqh_first == NULL) + && error == 0) { + if (nmp) + nmp->nm_bufqiods--; nfs_iodwant[myiod] = p; + nfs_iodmount[myiod] = NULL; error = tsleep((caddr_t)&nfs_iodwant[myiod], PWAIT | PCATCH, "nfsidl", 0); } - while ((bp = nfs_bufq.tqh_first) != NULL) { - /* Take one off the front of the list */ - TAILQ_REMOVE(&nfs_bufq, bp, b_freelist); - if (bp->b_flags & B_READ) - (void) nfs_doio(bp, bp->b_rcred, (struct proc *)0); - else do { - /* - * Look for a delayed write for the same vnode, so I can do - * it now. We must grab it before calling nfs_doio() to - * avoid any risk of the vnode getting vclean()'d while - * we are doing the write rpc. - */ - vp = bp->b_vp; - s = splbio(); - for (nbp = vp->v_dirtyblkhd.lh_first; nbp; - nbp = nbp->b_vnbufs.le_next) { - if ((nbp->b_flags & - (B_BUSY|B_DELWRI|B_NEEDCOMMIT|B_NOCACHE))!=B_DELWRI) - continue; - bremfree(nbp); - vfs_busy_pages(nbp, 1); - nbp->b_flags |= (B_BUSY|B_ASYNC); - break; - } - splx(s); - /* - * For the delayed write, do the first part of nfs_bwrite() - * up to, but not including nfs_strategy(). 
- */ - if (nbp) { - nbp->b_flags &= ~(B_READ|B_DONE|B_ERROR|B_DELWRI); - reassignbuf(nbp, nbp->b_vp); - nbp->b_vp->v_numoutput++; - } - (void) nfs_doio(bp, bp->b_wcred, (struct proc *)0); - } while (bp = nbp); - } if (error) { nfs_asyncdaemon[myiod] = 0; + if (nmp) nmp->nm_bufqiods--; + nfs_iodmount[myiod] = NULL; nfs_numasync--; return (error); } + while ((bp = nmp->nm_bufq.tqh_first) != NULL) { + /* Take one off the front of the list */ + TAILQ_REMOVE(&nmp->nm_bufq, bp, b_freelist); + nmp->nm_bufqlen--; + if (nmp->nm_bufqwant && nmp->nm_bufqlen < 2 * nfs_numasync) { + nmp->nm_bufqwant = FALSE; + wakeup(&nmp->nm_bufq); + } + if (bp->b_flags & B_READ) + (void) nfs_doio(bp, bp->b_rcred, (struct proc *)0); + else + (void) nfs_doio(bp, bp->b_wcred, (struct proc *)0); + + /* + * If there are more than one iod on this mount, then defect + * so that the iods can be shared out fairly between the mounts + */ + if (nfs_defect && nmp->nm_bufqiods > 1) { + NFS_DPF(ASYNCIO, + ("nfssvc_iod: iod %d defecting from mount %p\n", + myiod, nmp)); + nfs_iodmount[myiod] = NULL; + nmp->nm_bufqiods--; + break; + } + } } } diff --git a/sys/nfs/nfs_vfsops.c b/sys/nfs/nfs_vfsops.c index af0b0c6..0e9e1e2 100644 --- a/sys/nfs/nfs_vfsops.c +++ b/sys/nfs/nfs_vfsops.c @@ -34,7 +34,7 @@ * SUCH DAMAGE. 
* * @(#)nfs_vfsops.c 8.3 (Berkeley) 1/4/94 - * $Id: nfs_vfsops.c,v 1.29 1996/05/02 14:20:40 phk Exp $ + * $Id: nfs_vfsops.c,v 1.30 1996/10/20 15:01:58 phk Exp $ */ #include <sys/param.h> @@ -79,6 +79,10 @@ struct nfsstats nfsstats; SYSCTL_NODE(_vfs, MOUNT_NFS, nfs, CTLFLAG_RW, 0, "NFS filesystem"); SYSCTL_STRUCT(_vfs_nfs, NFS_NFSSTATS, nfsstats, CTLFLAG_RD, &nfsstats, nfsstats, ""); +#ifdef NFS_DEBUG +int nfs_debug; +SYSCTL_INT(_vfs_nfs, OID_AUTO, debug, CTLFLAG_RW, &nfs_debug, 0, ""); +#endif static int nfs_iosize __P((struct nfsmount *nmp)); static int mountnfs __P((struct nfs_args *,struct mount *, @@ -588,6 +592,7 @@ mountnfs(argp, mp, nam, pth, hst, vpp) M_NFSMNT, M_WAITOK); bzero((caddr_t)nmp, sizeof (struct nfsmount)); TAILQ_INIT(&nmp->nm_uidlruhead); + TAILQ_INIT(&nmp->nm_bufq); mp->mnt_data = (qaddr_t)nmp; } getnewfsid(mp, MOUNT_NFS); diff --git a/sys/nfs/nfs_vnops.c b/sys/nfs/nfs_vnops.c index bb0a94a..8a6bb0d 100644 --- a/sys/nfs/nfs_vnops.c +++ b/sys/nfs/nfs_vnops.c @@ -34,7 +34,7 @@ * SUCH DAMAGE. * * @(#)nfs_vnops.c 8.5 (Berkeley) 2/13/94 - * $Id: nfs_vnops.c,v 1.35 1996/09/19 18:21:01 nate Exp $ + * $Id: nfs_vnops.c,v 1.36 1996/10/21 10:07:52 dfr Exp $ */ /* @@ -337,6 +337,7 @@ extern u_long nfs_true, nfs_false; extern struct nfsstats nfsstats; extern nfstype nfsv3_type[9]; struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON]; +struct nfsmount *nfs_iodmount[NFS_MAXASYNCDAEMON]; int nfs_numasync = 0; #define DIRHDSIZ (sizeof (struct dirent) - (MAXNAMLEN + 1)) diff --git a/sys/nfs/nfsmount.h b/sys/nfs/nfsmount.h index 1bd71c5..a1c3b38 100644 --- a/sys/nfs/nfsmount.h +++ b/sys/nfs/nfsmount.h @@ -34,7 +34,7 @@ * SUCH DAMAGE. 
* * @(#)nfsmount.h 8.1 (Berkeley) 6/10/93 - * $Id: nfsmount.h,v 1.6 1995/11/21 12:54:40 bde Exp $ + * $Id: nfsmount.h,v 1.7 1995/12/17 21:12:36 phk Exp $ */ #ifndef _NFS_NFSMOUNT_H_ @@ -82,6 +82,10 @@ struct nfsmount { int nm_numuids; /* Number of nfsuid mappings */ TAILQ_HEAD(, nfsuid) nm_uidlruhead; /* Lists of nfsuid mappings */ LIST_HEAD(, nfsuid) nm_uidhashtbl[NFS_MUIDHASHSIZ]; + TAILQ_HEAD(, buf) nm_bufq; /* async io buffer queue */ + short nm_bufqlen; /* number of buffers in queue */ + short nm_bufqwant; /* process wants to add to the queue */ + int nm_bufqiods; /* number of iods processing queue */ }; #if defined(KERNEL) || defined(_KERNEL) @@ -89,6 +93,23 @@ struct nfsmount { * Convert mount ptr to nfsmount ptr. */ #define VFSTONFS(mp) ((struct nfsmount *)((mp)->mnt_data)) + +#ifdef NFS_DEBUG + +extern int nfs_debug; +#define NFS_DEBUG_ASYNCIO 1 + +#define NFS_DPF(cat, args) \ + do { \ + if (nfs_debug & NFS_DEBUG_##cat) printf args; \ + } while (0) + +#else + +#define NFS_DPF(cat, args) + +#endif + #endif /* KERNEL */ #endif diff --git a/sys/nfs/nfsnode.h b/sys/nfs/nfsnode.h index 49e96da..265154c 100644 --- a/sys/nfs/nfsnode.h +++ b/sys/nfs/nfsnode.h @@ -34,7 +34,7 @@ * SUCH DAMAGE. * * @(#)nfsnode.h 8.4 (Berkeley) 2/13/94 - * $Id: nfsnode.h,v 1.14 1995/11/09 08:16:59 bde Exp $ + * $Id: nfsnode.h,v 1.15 1995/12/17 21:12:37 phk Exp $ */ #ifndef _NFS_NFSNODE_H_ @@ -149,6 +149,8 @@ struct nfsnode { * Queue head for nfsiod's */ extern TAILQ_HEAD(nfs_bufq, buf) nfs_bufq; +extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON]; +extern struct nfsmount *nfs_iodmount[NFS_MAXASYNCDAEMON]; #if defined(KERNEL) || defined(_KERNEL) extern vop_t **fifo_nfsv2nodeop_p; |