From 7c0944d56c2504d71b846fe79d625028c80cdfd8 Mon Sep 17 00:00:00 2001 From: ps Date: Wed, 15 Dec 2004 22:20:22 +0000 Subject: First cut of NFS direct IO support. - NFS direct IO completely bypasses the buffer and page caches. If a file is open for direct IO all caching is disabled. - Direct IO for Directories will be addressed later. - 2 new NFS directio related sysctls are added. One is a knob to disable NFS direct IO completely (direct IO is enabled by default). The other is to disallow mmaped IO on a file that has at least one O_DIRECT open (see the comment in nfs_vnops.c for more details). The default is to allow mmaps on a file that has O_DIRECT opens. Submitted by: Mohan Srinivasan mohans at yahoo-inc dot com Obtained from: Yahoo! --- sys/nfsclient/nfs.h | 2 + sys/nfsclient/nfs_bio.c | 174 +++++++++++++++++++++++++++++++++++++++++++++ sys/nfsclient/nfs_nfsiod.c | 14 ++-- sys/nfsclient/nfs_vfsops.c | 1 + sys/nfsclient/nfs_vnops.c | 39 ++++++++++ sys/nfsclient/nfsnode.h | 2 + 6 files changed, 228 insertions(+), 4 deletions(-) diff --git a/sys/nfsclient/nfs.h b/sys/nfsclient/nfs.h index 4743ea2..92ba7de 100644 --- a/sys/nfsclient/nfs.h +++ b/sys/nfsclient/nfs.h @@ -124,6 +124,7 @@ MALLOC_DECLARE(M_NFSREQ); MALLOC_DECLARE(M_NFSDIROFF); MALLOC_DECLARE(M_NFSBIGFH); MALLOC_DECLARE(M_NFSHASH); +MALLOC_DECLARE(M_NFSDIRECTIO); #endif extern struct uma_zone *nfsmount_zone; @@ -275,6 +276,7 @@ int nfs_readdirrpc(struct vnode *, struct uio *, struct ucred *); int nfs_nfsiodnew(void); int nfs_asyncio(struct nfsmount *, struct buf *, struct ucred *, struct thread *); int nfs_doio(struct vnode *, struct buf *, struct ucred *, struct thread *); +void nfs_doio_directwrite (struct buf *); void nfs_up(struct nfsreq *, struct nfsmount *, struct thread *, const char *, int); void nfs_down(struct nfsreq *, struct nfsmount *, struct thread *, diff --git a/sys/nfsclient/nfs_bio.c b/sys/nfsclient/nfs_bio.c index c6eea93..74316b8 100644 --- a/sys/nfsclient/nfs_bio.c +++ b/sys/nfsclient/nfs_bio.c @@ -66,7 +66,11 @@ __FBSDID("$FreeBSD$"); static struct buf *nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size, struct thread *td); +static int nfs_directio_write(struct vnode *vp, struct uio *uiop, + struct ucred *cred, int ioflag); +extern int nfs_directio_enable; +extern int nfs_directio_allow_mmap; /* * Vnode op for VM getpages. */ @@ -84,10 +88,12 @@ nfs_getpages(struct vop_getpages_args *ap) struct nfsmount *nmp; vm_object_t object; vm_page_t *pages; + struct nfsnode *np; GIANT_REQUIRED; vp = ap->a_vp; + np = VTONFS(vp); td = curthread; /* XXX */ cred = curthread->td_ucred; /* XXX */ nmp = VFSTONFS(vp->v_mount); @@ -99,6 +105,12 @@ nfs_getpages(struct vop_getpages_args *ap) return VM_PAGER_ERROR; } + if (!nfs_directio_allow_mmap && (np->n_flag & NNONCACHE) && + (vp->v_type == VREG)) { + printf("nfs_getpages: called on non-cacheable vnode??\n"); + return VM_PAGER_ERROR; + } + if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) { /* We'll never get here for v4, because we always have fsinfo */ @@ -275,6 +287,10 @@ nfs_putpages(struct vop_putpages_args *ap) (void)nfs_fsinfo(nmp, vp, cred, td); } + if (!nfs_directio_allow_mmap && (np->n_flag & NNONCACHE) && + (vp->v_type == VREG)) + printf("nfs_putpages: called on noncache-able vnode??\n"); + for (i = 0; i < npages; i++) rtvals[i] = VM_PAGER_AGAIN; @@ -365,6 +381,11 @@ nfs_bioread(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred) if (vp->v_type != VDIR && (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize) return (EFBIG); + + if (nfs_directio_enable && (ioflag & IO_DIRECT) && (vp->v_type == VREG)) + /* No caching/ no readaheads. Just read data into the user buffer */ + return nfs_readrpc(vp, uio, cred); + biosize = vp->v_mount->mnt_stat.f_iosize; seqcount = (int)((off_t)(ioflag >> IO_SEQSHIFT) * biosize / BKVASIZE); /* @@ -684,6 +705,136 @@ again: } /* + * The NFS write path cannot handle iovecs with len > 1. So we need to + * break up iovecs accordingly (restricting them to wsize). + * For the SYNC case, we can do this with 1 copy (user buffer -> mbuf). + * For the ASYNC case, 2 copies are needed. The first a copy from the + * user buffer to a staging buffer and then a second copy from the staging + * buffer to mbufs. This can be optimized by copying from the user buffer + * directly into mbufs and passing the chain down, but that requires a + * fair amount of re-working of the relevant codepaths (and can be done + * later). + */ +static int +nfs_directio_write(vp, uiop, cred, ioflag) + struct vnode *vp; + struct uio *uiop; + struct ucred *cred; + int ioflag; +{ + int error; + struct nfsmount *nmp = VFSTONFS(vp->v_mount); + struct thread *td = uiop->uio_td; + int size; + + if (ioflag & IO_SYNC) { + int iomode, must_commit; + struct uio uio; + struct iovec iov; +do_sync: + while (uiop->uio_resid > 0) { + size = min(uiop->uio_resid, nmp->nm_wsize); + size = min(uiop->uio_iov->iov_len, size); + iov.iov_base = uiop->uio_iov->iov_base; + iov.iov_len = size; + uio.uio_iov = &iov; + uio.uio_iovcnt = 1; + uio.uio_offset = uiop->uio_offset; + uio.uio_resid = size; + uio.uio_segflg = UIO_USERSPACE; + uio.uio_rw = UIO_WRITE; + uio.uio_td = td; + iomode = NFSV3WRITE_FILESYNC; + error = (nmp->nm_rpcops->nr_writerpc)(vp, &uio, cred, + &iomode, &must_commit); + KASSERT((must_commit == 0), + ("nfs_directio_write: Did not commit write")); + if (error) + return (error); + uiop->uio_offset += size; + uiop->uio_resid -= size; + if (uiop->uio_iov->iov_len <= size) { + uiop->uio_iovcnt--; + uiop->uio_iov++; + } else { + uiop->uio_iov->iov_base = + (char *)uiop->uio_iov->iov_base + size; + uiop->uio_iov->iov_len -= size; + } + } + } else { + struct uio *t_uio; + struct iovec *t_iov; + struct buf *bp; + + /* + * Break up the write into blocksize chunks and hand these + * over to nfsiod's for write back. + * Unfortunately, this incurs a copy of the data. Since + * the user could modify the buffer before the write is + * initiated. + * + * The obvious optimization here is that one of the 2 copies + * in the async write path can be eliminated by copying the + * data here directly into mbufs and passing the mbuf chain + * down. But that will require a fair amount of re-working + * of the code and can be done if there's enough interest + * in NFS directio access. + */ + while (uiop->uio_resid > 0) { + size = min(uiop->uio_resid, nmp->nm_wsize); + size = min(uiop->uio_iov->iov_len, size); + bp = getpbuf(&nfs_pbuf_freecnt); + t_uio = malloc(sizeof(struct uio), M_NFSDIRECTIO, M_WAITOK); + t_iov = malloc(sizeof(struct iovec), M_NFSDIRECTIO, M_WAITOK); + t_iov->iov_base = malloc(size, M_NFSDIRECTIO, M_WAITOK); + t_iov->iov_len = size; + t_uio->uio_iov = t_iov; + t_uio->uio_iovcnt = 1; + t_uio->uio_offset = uiop->uio_offset; + t_uio->uio_resid = size; + t_uio->uio_segflg = UIO_SYSSPACE; + t_uio->uio_rw = UIO_WRITE; + t_uio->uio_td = td; + bcopy(uiop->uio_iov->iov_base, t_iov->iov_base, size); + bp->b_flags |= B_DIRECT; + bp->b_iocmd = BIO_WRITE; + if (cred != NOCRED) { + crhold(cred); + bp->b_wcred = cred; + } else + bp->b_wcred = NOCRED; + bp->b_caller1 = (void *)t_uio; + bp->b_vp = vp; + vhold(vp); + error = nfs_asyncio(nmp, bp, NOCRED, td); + if (error) { + free(t_iov->iov_base, M_NFSDIRECTIO); + free(t_iov, M_NFSDIRECTIO); + free(t_uio, M_NFSDIRECTIO); + vdrop(bp->b_vp); + bp->b_vp = NULL; + relpbuf(bp, &nfs_pbuf_freecnt); + if (error == EINTR) + return (error); + goto do_sync; + } + uiop->uio_offset += size; + uiop->uio_resid -= size; + if (uiop->uio_iov->iov_len <= size) { + uiop->uio_iovcnt--; + uiop->uio_iov++; + } else { + uiop->uio_iov->iov_base = + (char *)uiop->uio_iov->iov_base + size; + uiop->uio_iov->iov_len -= size; + } + } + } + return (0); +} + +/* * Vnode op for write using bio */ int @@ -756,6 +907,9 @@ restart: if (uio->uio_resid == 0) return (0); + if (nfs_directio_enable && (ioflag & IO_DIRECT) && vp->v_type == VREG) + return nfs_directio_write(vp, uio, cred, ioflag); + /* * We need to obtain the rslock if we intend to modify np->n_size * in order to guarentee the append point with multiple contending @@ -1261,6 +1415,26 @@ again: return (EIO); } +void +nfs_doio_directwrite(struct buf *bp) +{ + int iomode, must_commit; + struct uio *uiop = (struct uio *)bp->b_caller1; + char *iov_base = uiop->uio_iov->iov_base; + struct nfsmount *nmp = VFSTONFS(bp->b_vp->v_mount); + + iomode = NFSV3WRITE_FILESYNC; + uiop->uio_td = NULL; /* NULL since we're in nfsiod */ + (nmp->nm_rpcops->nr_writerpc)(bp->b_vp, uiop, bp->b_wcred, &iomode, &must_commit); + KASSERT((must_commit == 0), ("nfs_doio_directwrite: Did not commit write")); + free(iov_base, M_NFSDIRECTIO); + free(uiop->uio_iov, M_NFSDIRECTIO); + free(uiop, M_NFSDIRECTIO); + vdrop(bp->b_vp); + bp->b_vp = NULL; + relpbuf(bp, &nfs_pbuf_freecnt); +} + /* * Do an I/O operation to/from a cache block. This may be called * synchronously or from an nfsiod. diff --git a/sys/nfsclient/nfs_nfsiod.c b/sys/nfsclient/nfs_nfsiod.c index d1ca197..8cbee10 100644 --- a/sys/nfsclient/nfs_nfsiod.c +++ b/sys/nfsclient/nfs_nfsiod.c @@ -243,10 +243,16 @@ nfssvc_iod(void *instance) nmp->nm_bufqwant = 0; wakeup(&nmp->nm_bufq); } - if (bp->b_iocmd == BIO_READ) - (void) nfs_doio(bp->b_vp, bp, bp->b_rcred, NULL); - else - (void) nfs_doio(bp->b_vp, bp, bp->b_wcred, NULL); + if (bp->b_flags & B_DIRECT) { + KASSERT((bp->b_iocmd == BIO_WRITE), ("nfscvs_iod: BIO_WRITE not set")); + (void)nfs_doio_directwrite(bp); + } else { + if (bp->b_iocmd == BIO_READ) + (void) nfs_doio(bp->b_vp, bp, bp->b_rcred, NULL); + else + (void) nfs_doio(bp->b_vp, bp, bp->b_wcred, NULL); + } + /* * If there are more than one iod on this mount, then defect * so that the iods can be shared out fairly between the mounts diff --git a/sys/nfsclient/nfs_vfsops.c b/sys/nfsclient/nfs_vfsops.c index 0f16db6..cb77e1b 100644 --- a/sys/nfsclient/nfs_vfsops.c +++ b/sys/nfsclient/nfs_vfsops.c @@ -78,6 +78,7 @@ MALLOC_DEFINE(M_NFSREQ, "NFS req", "NFS request header"); MALLOC_DEFINE(M_NFSBIGFH, "NFSV3 bigfh", "NFS version 3 file handle"); MALLOC_DEFINE(M_NFSDIROFF, "NFSV3 diroff", "NFS directory offset data"); MALLOC_DEFINE(M_NFSHASH, "NFS hash", "NFS hash tables"); +MALLOC_DEFINE(M_NFSDIRECTIO, "NFS DirectIO", "NFS Direct IO async write state"); uma_zone_t nfsmount_zone; diff --git a/sys/nfsclient/nfs_vnops.c b/sys/nfsclient/nfs_vnops.c index 294f9c3..dd88e42 100644 --- a/sys/nfsclient/nfs_vnops.c +++ b/sys/nfsclient/nfs_vnops.c @@ -211,6 +211,24 @@ static int nfs_clean_pages_on_close = 1; SYSCTL_INT(_vfs_nfs, OID_AUTO, clean_pages_on_close, CTLFLAG_RW, &nfs_clean_pages_on_close, 0, "NFS clean dirty pages on close"); +int nfs_directio_enable = 1; +SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs_directio_enable, CTLFLAG_RW, + &nfs_directio_enable, 0, "Enable NFS directio"); + +/* + * This sysctl allows other processes to mmap a file that has been opened O_DIRECT + * by a process. In general, having processes mmap the file while Direct IO is in + * progress can lead to Data Inconsistencies. But, we allow this by default to + * prevent DoS attacks - to prevent a malicious user from opening up files O_DIRECT + * preventing other users from mmap'ing these files. "Protected" environments where + * stricter consistency guarantees are required can disable this knob. + * The process that opened the file O_DIRECT cannot mmap() the file, because + * mmap'ed IO on an O_DIRECT open() is not meaningful. + */ +int nfs_directio_allow_mmap = 1; +SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs_directio_allow_mmap, CTLFLAG_RW, + &nfs_directio_allow_mmap, 0, "Enable mmaped IO on file with O_DIRECT opens"); + #if 0 SYSCTL_INT(_vfs_nfs, OID_AUTO, access_cache_hits, CTLFLAG_RD, &nfsstats.accesscache_hits, 0, "NFS ACCESS cache hit count"); @@ -401,6 +419,7 @@ nfs_open(struct vop_open_args *ap) struct nfsnode *np = VTONFS(vp); struct vattr vattr; int error; + int fmode = ap->a_mode; if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) return (EOPNOTSUPP); @@ -434,6 +453,18 @@ nfs_open(struct vop_open_args *ap) np->n_mtime = vattr.va_mtime; } } + /* + * If the object has >= 1 O_DIRECT active opens, we disable caching. + */ + if (nfs_directio_enable && (fmode & O_DIRECT) && (vp->v_type == VREG)) { + if (np->n_directio_opens == 0) { + error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_td, 1); + if (error) + return (error); + np->n_flag |= NNONCACHE; + } + np->n_directio_opens++; + } np->ra_expect_lbn = 0; return (0); } @@ -472,6 +503,7 @@ nfs_close(struct vop_close_args *ap) struct vnode *vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); int error = 0; + int fmode = ap->a_fflag; if (vp->v_type == VREG) { /* @@ -520,6 +552,13 @@ nfs_close(struct vop_close_args *ap) error = np->n_error; } } + if (nfs_directio_enable && (fmode & O_DIRECT) && (vp->v_type == VREG)) { + KASSERT((np->n_directio_opens > 0), + ("nfs_close: unexpectedly value (0) of n_directio_opens\n")); + np->n_directio_opens--; + if (np->n_directio_opens == 0) + np->n_flag &= ~NNONCACHE; + } return (error); } diff --git a/sys/nfsclient/nfsnode.h b/sys/nfsclient/nfsnode.h index 9889ea4..4c03183 100644 --- a/sys/nfsclient/nfsnode.h +++ b/sys/nfsclient/nfsnode.h @@ -127,6 +127,7 @@ struct nfsnode { u_char *n_name; /* leaf name, for v4 OPEN op */ uint32_t n_namelen; daddr_t ra_expect_lbn; + int n_directio_opens; }; #define n_atim n_un1.nf_atim @@ -149,6 +150,7 @@ struct nfsnode { #define NCREATED 0x0800 /* Opened by nfs_create() */ #define NTRUNCATE 0x1000 /* Opened by nfs_setattr() */ #define NSIZECHANGED 0x2000 /* File size has changed: need cache inval */ +#define NNONCACHE 0x4000 /* Node marked as noncacheable */ /* * Convert between nfsnode pointers and vnode pointers -- cgit v1.1