author     dillon <dillon@FreeBSD.org>    2001-05-24 07:22:27 +0000
committer  dillon <dillon@FreeBSD.org>    2001-05-24 07:22:27 +0000
commit     a179ee09ab9ca2d9d1d09dc4752c53a13609f5e9
tree       faca8401754525a67aa26f144230806cf238e370
parent     a26134411c10ba2364d3d85686667b8a87f0015f
This patch implements O_DIRECT about 80% of the way.  It takes a patchset
Tor created a while ago, removes the raw I/O piece (which has cache coherency
problems), and adds a buffer cache / VM freeing piece.

Essentially this patch causes O_DIRECT I/O not to be left in the cache, but
does not prevent it from going through the cache, hence the 80%.  For the
last 20% we need a method by which the I/O can be issued directly to the
buffer supplied by the user process, bypassing the buffer cache entirely
while still maintaining cache coherency.

I also have the code working under -stable, but the changes made to
sys/file.h may not be MFCable, so an MFC is not on the table yet.

Submitted by:	tegge, dillon
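As a quick illustration of what the new flag means for userland code, here is a
minimal sketch (not part of the commit) that reads a file with O_DIRECT so the
data is not left in the buffer cache afterwards.  It assumes headers and a
kernel with this patch applied; /tmp/bigfile is a hypothetical path.

/*
 * Editorial sketch: read a file with O_DIRECT.  With this patch the I/O
 * still passes through the buffer cache, so no special buffer alignment
 * is required; the buffers and pages are simply freed after the I/O.
 */
#include <sys/types.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	char buf[65536];
	ssize_t n;
	int fd;

	fd = open("/tmp/bigfile", O_RDONLY | O_DIRECT);
	if (fd < 0) {
		perror("open");
		exit(1);
	}
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		;	/* data is read normally, but not cached afterwards */
	if (n < 0)
		perror("read");
	close(fd);
	return (0);
}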
-rw-r--r--  lib/libc/sys/fcntl.2         |  5
-rw-r--r--  lib/libc/sys/open.2          |  7
-rw-r--r--  sys/kern/vfs_bio.c           |  9
-rw-r--r--  sys/kern/vfs_cluster.c       |  9
-rw-r--r--  sys/kern/vfs_vnops.c         |  4
-rw-r--r--  sys/sys/buf.h                | 10
-rw-r--r--  sys/sys/fcntl.h              |  7
-rw-r--r--  sys/sys/file.h               |  3
-rw-r--r--  sys/sys/vnode.h              |  1
-rw-r--r--  sys/ufs/ufs/ufs_readwrite.c  | 36
-rw-r--r--  sys/vm/vm_page.c             | 23
-rw-r--r--  sys/vm/vm_page.h             |  1
12 files changed, 100 insertions, 15 deletions
diff --git a/lib/libc/sys/fcntl.2 b/lib/libc/sys/fcntl.2
index b2911e5..479e097 100644
--- a/lib/libc/sys/fcntl.2
+++ b/lib/libc/sys/fcntl.2
@@ -147,6 +147,11 @@ corresponds to the
.Dv O_APPEND
flag of
.Xr open 2 .
+.It Dv O_DIRECT
+Minimize or eliminate the cache effects of reading and writing. The system
+will attempt to avoid caching the data you read or write. If it cannot
+avoid caching the data, it will minimize the impact the data has on the cache.
+Use of this flag can drastically reduce performance if not used with care.
.It Dv O_ASYNC
Enable the
.Dv SIGIO
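Because the patch also adds O_DIRECT to FCNTLFLAGS (see the sys/sys/fcntl.h
hunk below), the flag can be toggled on an already-open descriptor with
fcntl(F_SETFL).  A small sketch, not part of the commit; /tmp/somefile is a
hypothetical path:

#include <fcntl.h>
#include <stdio.h>

static int
set_direct(int fd)
{
	int flags;

	/* fetch current flags, then set them again with O_DIRECT added */
	if ((flags = fcntl(fd, F_GETFL, 0)) < 0)
		return (-1);
	return (fcntl(fd, F_SETFL, flags | O_DIRECT));
}

int
main(void)
{
	int fd = open("/tmp/somefile", O_RDONLY);

	if (fd < 0 || set_direct(fd) < 0) {
		perror("direct");
		return (1);
	}
	/* subsequent read(2) calls on fd now carry the O_DIRECT hint */
	return (0);
}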
diff --git a/lib/libc/sys/open.2 b/lib/libc/sys/open.2
index 769ac27..1a29a17 100644
--- a/lib/libc/sys/open.2
+++ b/lib/libc/sys/open.2
@@ -83,6 +83,7 @@ O_TRUNC truncate size to 0
O_EXCL error if create and file exists
O_SHLOCK atomically obtain a shared lock
O_EXLOCK atomically obtain an exclusive lock
+O_DIRECT eliminate or reduce cache effects
O_FSYNC synchronous writes
O_NOFOLLOW do not follow symlinks
.Ed
@@ -150,6 +151,12 @@ If creating a file with
the request for the lock will never fail
(provided that the underlying filesystem supports locking).
.Pp
+.Dv O_DIRECT may be used to
+minimize or eliminate the cache effects of reading and writing. The system
+will attempt to avoid caching the data you read or write. If it cannot
+avoid caching the data, it will minimize the impact the data has on the cache.
+Use of this flag can drastically reduce performance if not used with care.
+.Pp
If successful,
.Fn open
returns a non-negative integer, termed a file descriptor.
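Since the flag is new and, as the text above warns, can hurt performance when
misapplied, callers may want to request it only on demand and only where the
headers provide it.  A brief editorial sketch; "path" and "want_direct" are
hypothetical parameters:

#include <fcntl.h>

int
open_maybe_direct(const char *path, int want_direct)
{
	int flags = O_RDONLY;

#ifdef O_DIRECT
	/* only ask for uncached I/O when the caller explicitly wants it */
	if (want_direct)
		flags |= O_DIRECT;
#endif
	return (open(path, flags));
}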
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index b06625b..246fc4c 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -1249,7 +1249,7 @@ brelse(struct buf * bp)
/* unlock */
BUF_UNLOCK(bp);
- bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
+ bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT);
bp->b_ioflags &= ~BIO_ORDERED;
if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
panic("brelse: not dirty");
@@ -1264,6 +1264,8 @@ brelse(struct buf * bp)
* biodone() to requeue an async I/O on completion. It is also used when
* known good buffers need to be requeued but we think we may need the data
* again soon.
+ *
+ * XXX we should be able to leave the B_RELBUF hint set on completion.
*/
void
bqrelse(struct buf * bp)
@@ -1355,12 +1357,15 @@ vfs_vmio_release(bp)
vm_page_flag_clear(m, PG_ZERO);
/*
* Might as well free the page if we can and it has
- * no valid data.
+ * no valid data. We also free the page if the
+ * buffer was used for direct I/O
*/
if ((bp->b_flags & B_ASYNC) == 0 && !m->valid && m->hold_count == 0) {
vm_page_busy(m);
vm_page_protect(m, VM_PROT_NONE);
vm_page_free(m);
+ } else if (bp->b_flags & B_DIRECT) {
+ vm_page_try_to_free(m);
} else if (vm_page_count_severe()) {
vm_page_try_to_cache(m);
}
diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c
index 0eb47bd..c9c09cb 100644
--- a/sys/kern/vfs_cluster.c
+++ b/sys/kern/vfs_cluster.c
@@ -505,6 +505,15 @@ cluster_callback(bp)
tbp->b_dirtyoff = tbp->b_dirtyend = 0;
tbp->b_flags &= ~B_INVAL;
tbp->b_ioflags &= ~BIO_ERROR;
+ /*
+ * XXX the bdwrite()/bqrelse() issued during
+ * cluster building clears B_RELBUF (see bqrelse()
+ * comment). If direct I/O was specified, we have
+ * to restore it here to allow the buffer and VM
+ * to be freed.
+ */
+ if (tbp->b_flags & B_DIRECT)
+ tbp->b_flags |= B_RELBUF;
}
bufdone(tbp);
}
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
index fd13579..de7a7ce 100644
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -352,6 +352,8 @@ vn_read(fp, uio, cred, flags, p)
ioflag = 0;
if (fp->f_flag & FNONBLOCK)
ioflag |= IO_NDELAY;
+ if (fp->f_flag & O_DIRECT)
+ ioflag |= IO_DIRECT;
VOP_LEASE(vp, p, cred, LEASE_READ);
vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, p);
if ((flags & FOF_OFFSET) == 0)
@@ -393,6 +395,8 @@ vn_write(fp, uio, cred, flags, p)
ioflag |= IO_APPEND;
if (fp->f_flag & FNONBLOCK)
ioflag |= IO_NDELAY;
+ if (fp->f_flag & O_DIRECT)
+ ioflag |= IO_DIRECT;
if ((fp->f_flag & O_FSYNC) ||
(vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
ioflag |= IO_SYNC;
diff --git a/sys/sys/buf.h b/sys/sys/buf.h
index 5a449ac4..b285db2 100644
--- a/sys/sys/buf.h
+++ b/sys/sys/buf.h
@@ -187,13 +187,17 @@ struct buf {
* The buffer's data is always PAGE_SIZE aligned even
* if b_bufsize and b_bcount are not. ( b_bufsize is
* always at least DEV_BSIZE aligned, though ).
- *
+ *
+ * B_DIRECT Hint that we should attempt to completely free
+ * the pages underlying the buffer. B_DIRECT is
+ * sticky until the buffer is released and typically
+ * only has an effect when B_RELBUF is also set.
*/
#define B_AGE 0x00000001 /* Move to age queue when I/O done. */
#define B_NEEDCOMMIT 0x00000002 /* Append-write in progress. */
#define B_ASYNC 0x00000004 /* Start I/O, do not wait. */
-#define B_UNUSED0 0x00000008 /* Old B_BAD */
+#define B_DIRECT 0x00000008 /* direct I/O flag (pls free vmio) */
#define B_DEFERRED 0x00000010 /* Skipped over for cleaning */
#define B_CACHE 0x00000020 /* Bread found us in the cache. */
#define B_VALIDSUSPWRT 0x00000040 /* Valid write during suspension. */
@@ -225,7 +229,7 @@ struct buf {
"\33paging\32xxx\31writeinprog\30want\27relbuf\26dirty" \
"\25read\24raw\23phys\22clusterok\21malloc\20nocache" \
"\17locked\16inval\15scanned\14error\13eintr\12done\11freebuf" \
- "\10delwri\7call\6cache\4bad\3async\2needcommit\1age"
+ "\10delwri\7call\6cache\4direct\3async\2needcommit\1age"
/*
* These flags are kept in b_xflags.
diff --git a/sys/sys/fcntl.h b/sys/sys/fcntl.h
index 6bbf156..85a168c 100644
--- a/sys/sys/fcntl.h
+++ b/sys/sys/fcntl.h
@@ -98,15 +98,18 @@
/* Defined by POSIX 1003.1; BSD default, but must be distinct from O_RDONLY. */
#define O_NOCTTY 0x8000 /* don't assign controlling terminal */
+/* Attempt to bypass buffer cache */
+#define O_DIRECT 0x00010000
+
#ifdef _KERNEL
/* convert from open() flags to/from fflags; convert O_RD/WR to FREAD/FWRITE */
#define FFLAGS(oflags) ((oflags) + 1)
#define OFLAGS(fflags) ((fflags) - 1)
/* bits to save after open */
-#define FMASK (FREAD|FWRITE|FAPPEND|FASYNC|FFSYNC|FNONBLOCK)
+#define FMASK (FREAD|FWRITE|FAPPEND|FASYNC|FFSYNC|FNONBLOCK|O_DIRECT)
/* bits settable by fcntl(F_SETFL, ...) */
-#define FCNTLFLAGS (FAPPEND|FASYNC|FFSYNC|FNONBLOCK|FPOSIXSHM)
+#define FCNTLFLAGS (FAPPEND|FASYNC|FFSYNC|FNONBLOCK|FPOSIXSHM|O_DIRECT)
#endif
/*
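Worth noting for the fp->f_flag & O_DIRECT tests in vn_read()/vn_write()
above: FFLAGS()/OFLAGS() only adjust the low access-mode bits by one, so the
high O_DIRECT bit passes into f_flag unchanged, and the new FMASK keeps it
there after open.  A standalone sketch, not part of the patch, with the flag
values mirroring sys/fcntl.h:

#include <stdio.h>

#define O_RDWR		0x0002		/* value as in sys/fcntl.h */
#define O_DIRECT	0x00010000	/* as defined in this patch */

#define FFLAGS(oflags)	((oflags) + 1)	/* open(2) flags -> f_flag */
#define OFLAGS(fflags)	((fflags) - 1)	/* f_flag -> open(2) flags */

int
main(void)
{
	unsigned int oflags = O_RDWR | O_DIRECT;
	unsigned int fflags = FFLAGS(oflags);

	printf("oflags %#x -> f_flag %#x, O_DIRECT %s\n", oflags, fflags,
	    (fflags & O_DIRECT) ? "preserved" : "lost");
	printf("round trip -> %#x\n", OFLAGS(fflags));
	return (0);
}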
diff --git a/sys/sys/file.h b/sys/sys/file.h
index 207f227..6e04352 100644
--- a/sys/sys/file.h
+++ b/sys/sys/file.h
@@ -56,7 +56,7 @@ struct knote;
*/
struct file {
LIST_ENTRY(file) f_list;/* list of active files */
- short f_flag; /* see fcntl.h */
+ short f_FILLER3; /* (old f_flag) */
#define DTYPE_VNODE 1 /* file */
#define DTYPE_SOCKET 2 /* communications endpoint */
#define DTYPE_PIPE 3 /* pipe */
@@ -93,6 +93,7 @@ struct file {
*/
off_t f_offset;
caddr_t f_data; /* vnode or socket */
+ u_int f_flag; /* see fcntl.h */
};
#ifdef MALLOC_DECLARE
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index e530d4a..f4650dc 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -220,6 +220,7 @@ struct vattr {
#define IO_VMIO 0x20 /* data already in VMIO space */
#define IO_INVAL 0x40 /* invalidate after I/O */
#define IO_ASYNC 0x80 /* bawrite rather then bdwrite */
+#define IO_DIRECT 0x100 /* attempt to bypass buffer cache */
/*
* Modes. Some values same as Ixxx entries from inode.h for now.
diff --git a/sys/ufs/ufs/ufs_readwrite.c b/sys/ufs/ufs/ufs_readwrite.c
index 6f56aee..c40d98f 100644
--- a/sys/ufs/ufs/ufs_readwrite.c
+++ b/sys/ufs/ufs/ufs_readwrite.c
@@ -287,6 +287,15 @@ READ(ap)
}
/*
+ * If IO_DIRECT then set B_DIRECT for the buffer. This
+ * will cause us to attempt to release the buffer later on
+ * and will cause the buffer cache to attempt to free the
+ * underlying pages.
+ */
+ if (ioflag & IO_DIRECT)
+ bp->b_flags |= B_DIRECT;
+
+ /*
* We should only get non-zero b_resid when an I/O error
* has occurred, which should cause us to break above.
* However, if the short read did not cause an error,
@@ -328,12 +337,12 @@ READ(ap)
if (error)
break;
- if ((ioflag & IO_VMIO) &&
+ if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
(LIST_FIRST(&bp->b_dep) == NULL)) {
/*
- * If there are no dependencies, and
- * it's VMIO, then we don't need the buf,
- * mark it available for freeing. The VM has the data.
+ * If there are no dependencies, and it's VMIO,
+ * then we don't need the buf, mark it available
+ * for freeing. The VM has the data.
*/
bp->b_flags |= B_RELBUF;
brelse(bp);
@@ -355,7 +364,7 @@ READ(ap)
* so it must have come from a 'break' statement
*/
if (bp != NULL) {
- if ((ioflag & IO_VMIO) &&
+ if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
(LIST_FIRST(&bp->b_dep) == NULL)) {
bp->b_flags |= B_RELBUF;
brelse(bp);
@@ -514,6 +523,8 @@ WRITE(ap)
ap->a_cred, flags, &bp);
if (error != 0)
break;
+ if (ioflag & IO_DIRECT)
+ bp->b_flags |= B_DIRECT;
if (uio->uio_offset + xfersize > ip->i_size) {
ip->i_size = uio->uio_offset + xfersize;
@@ -526,10 +537,18 @@ WRITE(ap)
error =
uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
- if ((ioflag & IO_VMIO) &&
- (LIST_FIRST(&bp->b_dep) == NULL))
+ if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
+ (LIST_FIRST(&bp->b_dep) == NULL)) {
bp->b_flags |= B_RELBUF;
+ }
+ /*
+ * If IO_SYNC each buffer is written synchronously. Otherwise
+ * if we have a severe page deficiency write the buffer
+ * asynchronously. Otherwise try to cluster, and if that
+ * doesn't do it then either do an async write (if O_DIRECT),
+ * or a delayed write (if not).
+ */
if (ioflag & IO_SYNC) {
(void)bwrite(bp);
} else if (vm_page_count_severe() ||
@@ -544,6 +563,9 @@ WRITE(ap)
} else {
bawrite(bp);
}
+ } else if (ioflag & IO_DIRECT) {
+ bp->b_flags |= B_CLUSTEROK;
+ bawrite(bp);
} else {
bp->b_flags |= B_CLUSTEROK;
bdwrite(bp);
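The write-strategy comment added above is spread across two hunks; as an
editorial aid, here is a simplified standalone sketch of the resulting
decision ladder with the buffer-cache primitives stubbed out as printfs.  The
IO_* values mirror sys/vnode.h, and the real code's clustering test (which
looks at full-block writes and mount options) is reduced to a single
parameter here:

#include <stdio.h>

#define IO_SYNC		0x04	/* do I/O synchronously */
#define IO_ASYNC	0x80	/* bawrite rather than bdwrite */
#define IO_DIRECT	0x100	/* attempt to bypass buffer cache */

static void bwrite(const char *bp)        { printf("%s: synchronous write\n", bp); }
static void bawrite(const char *bp)       { printf("%s: asynchronous write\n", bp); }
static void bdwrite(const char *bp)       { printf("%s: delayed write\n", bp); }
static void cluster_write(const char *bp) { printf("%s: clustered write\n", bp); }

static void
write_strategy(const char *bp, int ioflag, int page_shortage, int can_cluster)
{
	if (ioflag & IO_SYNC)
		bwrite(bp);		/* caller wants the data on disk now */
	else if (page_shortage || (ioflag & IO_ASYNC))
		bawrite(bp);		/* severe deficit: push it out immediately */
	else if (can_cluster)
		cluster_write(bp);	/* let the clustering code batch it */
	else if (ioflag & IO_DIRECT)
		bawrite(bp);		/* write now so the buf and pages can be freed */
	else
		bdwrite(bp);		/* normal case: delayed write */
}

int
main(void)
{
	write_strategy("sync buf", IO_SYNC, 0, 0);
	write_strategy("direct buf", IO_DIRECT, 0, 0);
	write_strategy("normal buf", 0, 0, 0);
	return (0);
}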
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index c1817d5..dc391cb 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -1304,6 +1304,29 @@ vm_page_try_to_cache(vm_page_t m)
}
/*
+ * vm_page_try_to_free()
+ *
+ * Attempt to free the page. If we cannot free it, we do nothing.
+ * 1 is returned on success, 0 on failure.
+ */
+int
+vm_page_try_to_free(m)
+ vm_page_t m;
+{
+ if (m->dirty || m->hold_count || m->busy || m->wire_count ||
+ (m->flags & (PG_BUSY|PG_UNMANAGED))) {
+ return(0);
+ }
+ vm_page_test_dirty(m);
+ if (m->dirty)
+ return(0);
+ vm_page_busy(m);
+ vm_page_protect(m, VM_PROT_NONE);
+ vm_page_free(m);
+ return(1);
+}
+
+/*
* vm_page_cache
*
* Put the specified page onto the page cache queue (if appropriate).
diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h
index e1c1cc4..6bc7266 100644
--- a/sys/vm/vm_page.h
+++ b/sys/vm/vm_page.h
@@ -421,6 +421,7 @@ vm_page_t vm_page_alloc __P((vm_object_t, vm_pindex_t, int));
vm_page_t vm_page_grab __P((vm_object_t, vm_pindex_t, int));
void vm_page_cache __P((register vm_page_t));
int vm_page_try_to_cache __P((vm_page_t));
+int vm_page_try_to_free __P((vm_page_t));
void vm_page_dontneed __P((register vm_page_t));
static __inline void vm_page_copy __P((vm_page_t, vm_page_t));
static __inline void vm_page_free __P((vm_page_t));