From 7c0944d56c2504d71b846fe79d625028c80cdfd8 Mon Sep 17 00:00:00 2001
From: ps <ps@FreeBSD.org>
Date: Wed, 15 Dec 2004 22:20:22 +0000
Subject: First cut of NFS direct IO support.

- NFS direct IO completely bypasses the buffer and page caches. If a
  file is open for direct IO, all caching is disabled.
- Direct IO for directories will be addressed later.
- Two new NFS direct IO sysctls are added. One is a knob to disable NFS
  direct IO completely (direct IO is enabled by default). The other
  disallows mmap'ed IO on a file that has at least one O_DIRECT open
  (see the comment in nfs_vnops.c for more details). The default is to
  allow mmaps on a file that has O_DIRECT opens.

Submitted by:	Mohan Srinivasan mohans at yahoo-inc dot com
Obtained from:	Yahoo!
---
 sys/nfsclient/nfs.h        |   2 +
 sys/nfsclient/nfs_bio.c    | 174 +++++++++++++++++++++++++++++++++++++++++++++
 sys/nfsclient/nfs_nfsiod.c |  14 ++--
 sys/nfsclient/nfs_vfsops.c |   1 +
 sys/nfsclient/nfs_vnops.c  |  39 ++++++++++
 sys/nfsclient/nfsnode.h    |   2 +
 6 files changed, 228 insertions(+), 4 deletions(-)

diff --git a/sys/nfsclient/nfs.h b/sys/nfsclient/nfs.h
index 4743ea2..92ba7de 100644
--- a/sys/nfsclient/nfs.h
+++ b/sys/nfsclient/nfs.h
@@ -124,6 +124,7 @@ MALLOC_DECLARE(M_NFSREQ);
 MALLOC_DECLARE(M_NFSDIROFF);
 MALLOC_DECLARE(M_NFSBIGFH);
 MALLOC_DECLARE(M_NFSHASH);
+MALLOC_DECLARE(M_NFSDIRECTIO);
 #endif
 
 extern struct uma_zone *nfsmount_zone;
@@ -275,6 +276,7 @@ int	nfs_readdirrpc(struct vnode *, struct uio *, struct ucred *);
 int	nfs_nfsiodnew(void);
 int	nfs_asyncio(struct nfsmount *, struct buf *, struct ucred *, struct thread *);
 int	nfs_doio(struct vnode *, struct buf *, struct ucred *, struct thread *);
+void    nfs_doio_directwrite (struct buf *);
 void    nfs_up(struct nfsreq *, struct nfsmount *, struct thread *,
 	    const char *, int);
 void	nfs_down(struct nfsreq *, struct nfsmount *, struct thread *,
diff --git a/sys/nfsclient/nfs_bio.c b/sys/nfsclient/nfs_bio.c
index c6eea93..74316b8 100644
--- a/sys/nfsclient/nfs_bio.c
+++ b/sys/nfsclient/nfs_bio.c
@@ -66,7 +66,11 @@ __FBSDID("$FreeBSD$");
 
 static struct buf *nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size,
 		    struct thread *td);
+static int nfs_directio_write(struct vnode *vp, struct uio *uiop, 
+			      struct ucred *cred, int ioflag);
 
+extern int nfs_directio_enable;
+extern int nfs_directio_allow_mmap;
 /*
  * Vnode op for VM getpages.
  */
@@ -84,10 +88,12 @@ nfs_getpages(struct vop_getpages_args *ap)
 	struct nfsmount *nmp;
 	vm_object_t object;
 	vm_page_t *pages;
+	struct nfsnode *np;
 
 	GIANT_REQUIRED;
 
 	vp = ap->a_vp;
+	np = VTONFS(vp);
 	td = curthread;				/* XXX */
 	cred = curthread->td_ucred;		/* XXX */
 	nmp = VFSTONFS(vp->v_mount);
@@ -99,6 +105,12 @@ nfs_getpages(struct vop_getpages_args *ap)
 		return VM_PAGER_ERROR;
 	}
 
+	if (!nfs_directio_allow_mmap && (np->n_flag & NNONCACHE) && 
+	    (vp->v_type == VREG)) {
+		printf("nfs_getpages: called on non-cacheable vnode??\n");
+		return VM_PAGER_ERROR;
+	}
+
 	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
 	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
 		/* We'll never get here for v4, because we always have fsinfo */
@@ -275,6 +287,10 @@ nfs_putpages(struct vop_putpages_args *ap)
 		(void)nfs_fsinfo(nmp, vp, cred, td);
 	}
 
+	if (!nfs_directio_allow_mmap && (np->n_flag & NNONCACHE) && 
+	    (vp->v_type == VREG))
+		printf("nfs_putpages: called on noncache-able vnode??\n");
+
 	for (i = 0; i < npages; i++)
 		rtvals[i] = VM_PAGER_AGAIN;
 
@@ -365,6 +381,11 @@ nfs_bioread(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred)
 	if (vp->v_type != VDIR &&
 	    (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
 		return (EFBIG);
+
+	if (nfs_directio_enable && (ioflag & IO_DIRECT) && (vp->v_type == VREG))
+		/* No caching/ no readaheads. Just read data into the user buffer */
+		return nfs_readrpc(vp, uio, cred);
+
 	biosize = vp->v_mount->mnt_stat.f_iosize;
 	seqcount = (int)((off_t)(ioflag >> IO_SEQSHIFT) * biosize / BKVASIZE);
 	/*
@@ -684,6 +705,136 @@ again:
 }
 
 /*
+ * Direct IO write: send the caller's data to the server, bypassing the
+ * buffer cache.  The write RPC takes a uio with a single iovec, so the
+ * request is split into pieces of at most wsize bytes that never span two
+ * iovecs; uiop is advanced past each piece once it has been issued.
+ * With IO_SYNC each piece is written FILESYNC straight from the caller's
+ * buffer (one copy, buffer -> mbufs).  Otherwise it is copied to a staging
+ * buffer and queued to an nfsiod (two copies; copying directly into mbufs
+ * would save one but needs a rework of the async codepaths).
+ * Returns 0 or an errno.  If queueing fails, EINTR is returned as is and
+ * any other failure sends the rest of the request out synchronously.
+ */
+static int
+nfs_directio_write(struct vnode *vp, struct uio *uiop, struct ucred *cred,
+    int ioflag)
+{
+	int error, size;
+	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
+	struct thread *td = uiop->uio_td;
+
+	if (ioflag & IO_SYNC) {
+		int iomode, must_commit;
+		struct uio uio;
+		struct iovec iov;
+do_sync:
+		while (uiop->uio_resid > 0) {
+			size = min(uiop->uio_resid, nmp->nm_wsize);
+			size = min(uiop->uio_iov->iov_len, size);
+			iov.iov_base = uiop->uio_iov->iov_base;
+			iov.iov_len = size;
+			uio.uio_iov = &iov;
+			uio.uio_iovcnt = 1;
+			uio.uio_offset = uiop->uio_offset;
+			uio.uio_resid = size;
+			uio.uio_segflg = UIO_USERSPACE;
+			uio.uio_rw = UIO_WRITE;
+			uio.uio_td = td;
+			iomode = NFSV3WRITE_FILESYNC;
+			error = (nmp->nm_rpcops->nr_writerpc)(vp, &uio, cred,
+			    &iomode, &must_commit);
+			KASSERT((must_commit == 0),
+				("nfs_directio_write: Did not commit write"));
+			if (error)
+				return (error);
+			uiop->uio_offset += size;
+			uiop->uio_resid -= size;
+			if (uiop->uio_iov->iov_len <= size) {
+				uiop->uio_iovcnt--;
+				uiop->uio_iov++;
+			} else {
+				uiop->uio_iov->iov_base =
+					(char *)uiop->uio_iov->iov_base + size;
+				uiop->uio_iov->iov_len -= size;
+			}
+		}
+	} else {
+		struct uio *t_uio;
+		struct iovec *t_iov;
+		struct buf *bp;
+
+		/*
+		 * Stage each chunk: the caller may modify its buffer later.
+		 */
+		while (uiop->uio_resid > 0) {
+			size = min(uiop->uio_resid, nmp->nm_wsize);
+			size = min(uiop->uio_iov->iov_len, size);
+			t_uio = malloc(sizeof(struct uio), M_NFSDIRECTIO, M_WAITOK);
+			t_iov = malloc(sizeof(struct iovec), M_NFSDIRECTIO, M_WAITOK);
+			t_iov->iov_base = malloc(size, M_NFSDIRECTIO, M_WAITOK);
+			t_iov->iov_len = size;
+			t_uio->uio_iov = t_iov;
+			t_uio->uio_iovcnt = 1;
+			t_uio->uio_offset = uiop->uio_offset;
+			t_uio->uio_resid = size;
+			t_uio->uio_segflg = UIO_SYSSPACE;
+			t_uio->uio_rw = UIO_WRITE;
+			t_uio->uio_td = td;
+			/*
+			 * User memory must be read with copyin(), which reports
+			 * a bad address as an error.  Copy before taking a pbuf.
+			 */
+			error = 0;
+			if (uiop->uio_segflg == UIO_USERSPACE)
+				error = copyin(uiop->uio_iov->iov_base,
+				    t_iov->iov_base, size);
+			else
+				bcopy(uiop->uio_iov->iov_base, t_iov->iov_base, size);
+			if (error) {
+				free(t_iov->iov_base, M_NFSDIRECTIO);
+				free(t_iov, M_NFSDIRECTIO);
+				free(t_uio, M_NFSDIRECTIO);
+				return (error);
+			}
+			bp = getpbuf(&nfs_pbuf_freecnt);
+			bp->b_flags |= B_DIRECT;
+			bp->b_iocmd = BIO_WRITE;
+			if (cred != NOCRED)
+				crhold(cred);
+			bp->b_wcred = cred;
+			bp->b_caller1 = (void *)t_uio;
+			bp->b_vp = vp;
+			vhold(vp);
+			error = nfs_asyncio(nmp, bp, NOCRED, td);
+			if (error) {
+				/* Not queued: undo, then fall back to sync writes. */
+				free(t_iov->iov_base, M_NFSDIRECTIO);
+				free(t_iov, M_NFSDIRECTIO);
+				free(t_uio, M_NFSDIRECTIO);
+				vdrop(bp->b_vp);
+				bp->b_vp = NULL;
+				relpbuf(bp, &nfs_pbuf_freecnt);
+				if (error == EINTR)
+					return (error);
+				goto do_sync;
+			}
+			uiop->uio_offset += size;
+			uiop->uio_resid -= size;
+			if (uiop->uio_iov->iov_len <= size) {
+				uiop->uio_iovcnt--;
+				uiop->uio_iov++;
+			} else {
+				uiop->uio_iov->iov_base =
+					(char *)uiop->uio_iov->iov_base + size;
+				uiop->uio_iov->iov_len -= size;
+			}
+		}
+	}
+	return (0);
+}
+
+/*
  * Vnode op for write using bio
  */
 int
@@ -756,6 +907,9 @@ restart:
 	if (uio->uio_resid == 0)
 		return (0);
 
+	if (nfs_directio_enable && (ioflag & IO_DIRECT) && vp->v_type == VREG)
+		return nfs_directio_write(vp, uio, cred, ioflag);
+
 	/*
 	 * We need to obtain the rslock if we intend to modify np->n_size
 	 * in order to guarentee the append point with multiple contending
@@ -1261,6 +1415,26 @@ again:
 	return (EIO);
 }
 
+/* nfsiod half of an async direct write; the write RPC's result is unused. */
+void
+nfs_doio_directwrite(struct buf *bp)
+{
+	struct uio *wuio = bp->b_caller1;
+	char *staging = wuio->uio_iov->iov_base;	/* kept for the free() below */
+	struct nfsmount *mnt = VFSTONFS(bp->b_vp->v_mount);
+	int must_commit, iomode = NFSV3WRITE_FILESYNC;
+
+	wuio->uio_td = NULL;	/* NULL since we're in nfsiod */
+	(mnt->nm_rpcops->nr_writerpc)(bp->b_vp, wuio, bp->b_wcred, &iomode, &must_commit);
+	KASSERT((must_commit == 0), ("nfs_doio_directwrite: Did not commit write"));
+	free(staging, M_NFSDIRECTIO);
+	free(wuio->uio_iov, M_NFSDIRECTIO);
+	free(wuio, M_NFSDIRECTIO);
+	vdrop(bp->b_vp);
+	bp->b_vp = NULL;
+	relpbuf(bp, &nfs_pbuf_freecnt);
+}
+
 /*
  * Do an I/O operation to/from a cache block. This may be called
  * synchronously or from an nfsiod.
diff --git a/sys/nfsclient/nfs_nfsiod.c b/sys/nfsclient/nfs_nfsiod.c
index d1ca197..8cbee10 100644
--- a/sys/nfsclient/nfs_nfsiod.c
+++ b/sys/nfsclient/nfs_nfsiod.c
@@ -243,10 +243,16 @@ nfssvc_iod(void *instance)
 		    nmp->nm_bufqwant = 0;
 		    wakeup(&nmp->nm_bufq);
 		}
-		if (bp->b_iocmd == BIO_READ)
-		    (void) nfs_doio(bp->b_vp, bp, bp->b_rcred, NULL);
-		else
-		    (void) nfs_doio(bp->b_vp, bp, bp->b_wcred, NULL);
+		if (bp->b_flags & B_DIRECT) {
+			KASSERT((bp->b_iocmd == BIO_WRITE), ("nfscvs_iod: BIO_WRITE not set"));
+			(void)nfs_doio_directwrite(bp);
+		} else {
+			if (bp->b_iocmd == BIO_READ)
+				(void) nfs_doio(bp->b_vp, bp, bp->b_rcred, NULL);
+			else
+				(void) nfs_doio(bp->b_vp, bp, bp->b_wcred, NULL);
+		}
+
 		/*
 		 * If there are more than one iod on this mount, then defect
 		 * so that the iods can be shared out fairly between the mounts
diff --git a/sys/nfsclient/nfs_vfsops.c b/sys/nfsclient/nfs_vfsops.c
index 0f16db6..cb77e1b 100644
--- a/sys/nfsclient/nfs_vfsops.c
+++ b/sys/nfsclient/nfs_vfsops.c
@@ -78,6 +78,7 @@ MALLOC_DEFINE(M_NFSREQ, "NFS req", "NFS request header");
 MALLOC_DEFINE(M_NFSBIGFH, "NFSV3 bigfh", "NFS version 3 file handle");
 MALLOC_DEFINE(M_NFSDIROFF, "NFSV3 diroff", "NFS directory offset data");
 MALLOC_DEFINE(M_NFSHASH, "NFS hash", "NFS hash tables");
+MALLOC_DEFINE(M_NFSDIRECTIO, "NFS DirectIO", "NFS Direct IO async write state");
 
 uma_zone_t nfsmount_zone;
 
diff --git a/sys/nfsclient/nfs_vnops.c b/sys/nfsclient/nfs_vnops.c
index 294f9c3..dd88e42 100644
--- a/sys/nfsclient/nfs_vnops.c
+++ b/sys/nfsclient/nfs_vnops.c
@@ -211,6 +211,24 @@ static int	nfs_clean_pages_on_close = 1;
 SYSCTL_INT(_vfs_nfs, OID_AUTO, clean_pages_on_close, CTLFLAG_RW,
 	   &nfs_clean_pages_on_close, 0, "NFS clean dirty pages on close");
 
+/* Master switch: when 0, the direct IO branches in open/read/write are skipped. */
+int nfs_directio_enable = 1;
+SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs_directio_enable, CTLFLAG_RW,
+	   &nfs_directio_enable, 0, "Enable NFS directio");
+
+/*
+ * Allow other processes to mmap a file that some process has open O_DIRECT.
+ * Mapping a file while direct IO is in progress can lead to data
+ * inconsistencies, but it is allowed by default so that a malicious user
+ * cannot deny mmap to others simply by opening files O_DIRECT.  "Protected"
+ * environments that need stricter consistency guarantees can turn this off.
+ * The process that opened the file O_DIRECT cannot mmap() it, because
+ * mmap'ed IO on an O_DIRECT open() is not meaningful.
+ */
+int nfs_directio_allow_mmap = 1;
+SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs_directio_allow_mmap, CTLFLAG_RW,
+	   &nfs_directio_allow_mmap, 0, "Enable mmaped IO on file with O_DIRECT opens");
+
 #if 0
 SYSCTL_INT(_vfs_nfs, OID_AUTO, access_cache_hits, CTLFLAG_RD,
 	   &nfsstats.accesscache_hits, 0, "NFS ACCESS cache hit count");
@@ -401,6 +419,7 @@ nfs_open(struct vop_open_args *ap)
 	struct nfsnode *np = VTONFS(vp);
 	struct vattr vattr;
 	int error;
+	int fmode = ap->a_mode;
 
 	if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK)
 		return (EOPNOTSUPP);
@@ -434,6 +453,18 @@ nfs_open(struct vop_open_args *ap)
 			np->n_mtime = vattr.va_mtime;
 		}
 	}
+	/*
+	 * If the object has >= 1 O_DIRECT active opens, we disable caching.
+	 */
+	if (nfs_directio_enable && (fmode & O_DIRECT) && (vp->v_type == VREG)) {
+		if (np->n_directio_opens == 0) {
+			error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_td, 1);
+			if (error)
+				return (error);
+			np->n_flag |= NNONCACHE;
+		}
+		np->n_directio_opens++;
+	}
 	np->ra_expect_lbn = 0;
 	return (0);
 }
@@ -472,6 +503,7 @@ nfs_close(struct vop_close_args *ap)
 	struct vnode *vp = ap->a_vp;
 	struct nfsnode *np = VTONFS(vp);
 	int error = 0;
+	int fmode = ap->a_fflag;
 
 	if (vp->v_type == VREG) {
 	    /*
@@ -520,6 +552,13 @@ nfs_close(struct vop_close_args *ap)
 		error = np->n_error;
 	    }
 	}
+	if (nfs_directio_enable && (fmode & O_DIRECT) && (vp->v_type == VREG)) {
+		KASSERT((np->n_directio_opens > 0), 
+			("nfs_close: unexpectedly value (0) of n_directio_opens\n"));		
+		np->n_directio_opens--;
+		if (np->n_directio_opens == 0)
+			np->n_flag &= ~NNONCACHE;
+	}
 	return (error);
 }
 
diff --git a/sys/nfsclient/nfsnode.h b/sys/nfsclient/nfsnode.h
index 9889ea4..4c03183 100644
--- a/sys/nfsclient/nfsnode.h
+++ b/sys/nfsclient/nfsnode.h
@@ -127,6 +127,7 @@ struct nfsnode {
 	u_char			*n_name;	/* leaf name, for v4 OPEN op */
 	uint32_t		n_namelen;
 	daddr_t			ra_expect_lbn;
+	int			n_directio_opens;
 };
 
 #define n_atim		n_un1.nf_atim
@@ -149,6 +150,7 @@ struct nfsnode {
 #define	NCREATED	0x0800	/* Opened by nfs_create() */
 #define	NTRUNCATE	0x1000	/* Opened by nfs_setattr() */
 #define	NSIZECHANGED	0x2000  /* File size has changed: need cache inval */
+#define NNONCACHE	0x4000  /* Node marked as noncacheable */
 
 /*
  * Convert between nfsnode pointers and vnode pointers
-- 
cgit v1.1