diff options
author | dillon <dillon@FreeBSD.org> | 2001-05-24 07:22:27 +0000 |
---|---|---|
committer | dillon <dillon@FreeBSD.org> | 2001-05-24 07:22:27 +0000 |
commit | a179ee09ab9ca2d9d1d09dc4752c53a13609f5e9 (patch) | |
tree | faca8401754525a67aa26f144230806cf238e370 | |
parent | a26134411c10ba2364d3d85686667b8a87f0015f (diff) | |
download | FreeBSD-src-a179ee09ab9ca2d9d1d09dc4752c53a13609f5e9.zip FreeBSD-src-a179ee09ab9ca2d9d1d09dc4752c53a13609f5e9.tar.gz |
This patch implements O_DIRECT about 80% of the way. It takes a patchset
Tor created a while ago, removes the raw I/O piece (which has cache coherency
problems), and adds a buffer cache / VM freeing piece.
Essentially this patch causes O_DIRECT I/O to not be left in the cache, but
does not prevent it from going through the cache, hence the 80%. For
the last 20% we need a method by which the I/O can be issued directly to
a buffer supplied by the user process and bypass the buffer cache entirely,
but still maintain cache coherency.
I also have the code working under -stable but the changes made to sys/file.h
may not be MFCable, so an MFC is not on the table yet.
Submitted by: tegge, dillon
-rw-r--r-- | lib/libc/sys/fcntl.2 | 5 | ||||
-rw-r--r-- | lib/libc/sys/open.2 | 7 | ||||
-rw-r--r-- | sys/kern/vfs_bio.c | 9 | ||||
-rw-r--r-- | sys/kern/vfs_cluster.c | 9 | ||||
-rw-r--r-- | sys/kern/vfs_vnops.c | 4 | ||||
-rw-r--r-- | sys/sys/buf.h | 10 | ||||
-rw-r--r-- | sys/sys/fcntl.h | 7 | ||||
-rw-r--r-- | sys/sys/file.h | 3 | ||||
-rw-r--r-- | sys/sys/vnode.h | 1 | ||||
-rw-r--r-- | sys/ufs/ufs/ufs_readwrite.c | 36 | ||||
-rw-r--r-- | sys/vm/vm_page.c | 23 | ||||
-rw-r--r-- | sys/vm/vm_page.h | 1 |
12 files changed, 100 insertions, 15 deletions
diff --git a/lib/libc/sys/fcntl.2 b/lib/libc/sys/fcntl.2 index b2911e5..479e097 100644 --- a/lib/libc/sys/fcntl.2 +++ b/lib/libc/sys/fcntl.2 @@ -147,6 +147,11 @@ corresponds to the .Dv O_APPEND flag of .Xr open 2 . +.It Dv O_DIRECT +Minimize or eliminate the cache effects of reading and writing. The system +will attempt to avoid caching the data you read or write. If it cannot +avoid caching the data, it will minimize the impact the data has on the cache. +Use of this flag can drastically reduce performance if not used with care. .It Dv O_ASYNC Enable the .Dv SIGIO diff --git a/lib/libc/sys/open.2 b/lib/libc/sys/open.2 index 769ac27..1a29a17 100644 --- a/lib/libc/sys/open.2 +++ b/lib/libc/sys/open.2 @@ -83,6 +83,7 @@ O_TRUNC truncate size to 0 O_EXCL error if create and file exists O_SHLOCK atomically obtain a shared lock O_EXLOCK atomically obtain an exclusive lock +O_DIRECT eliminate or reduce cache effects O_FSYNC synchronous writes O_NOFOLLOW do not follow symlinks .Ed @@ -150,6 +151,12 @@ If creating a file with the request for the lock will never fail (provided that the underlying filesystem supports locking). .Pp +.Dv O_DIRECT may be used to +minimize or eliminate the cache effects of reading and writing. The system +will attempt to avoid caching the data you read or write. If it cannot +avoid caching the data, it will minimize the impact the data has on the cache. +Use of this flag can drastically reduce performance if not used with care. +.Pp If successful, .Fn open returns a non-negative integer, termed a file descriptor. 
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index b06625b..246fc4c 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -1249,7 +1249,7 @@ brelse(struct buf * bp) /* unlock */ BUF_UNLOCK(bp); - bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); + bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT); bp->b_ioflags &= ~BIO_ORDERED; if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) panic("brelse: not dirty"); @@ -1264,6 +1264,8 @@ brelse(struct buf * bp) * biodone() to requeue an async I/O on completion. It is also used when * known good buffers need to be requeued but we think we may need the data * again soon. + * + * XXX we should be able to leave the B_RELBUF hint set on completion. */ void bqrelse(struct buf * bp) @@ -1355,12 +1357,15 @@ vfs_vmio_release(bp) vm_page_flag_clear(m, PG_ZERO); /* * Might as well free the page if we can and it has - * no valid data. + * no valid data. We also free the page if the + * buffer was used for direct I/O */ if ((bp->b_flags & B_ASYNC) == 0 && !m->valid && m->hold_count == 0) { vm_page_busy(m); vm_page_protect(m, VM_PROT_NONE); vm_page_free(m); + } else if (bp->b_flags & B_DIRECT) { + vm_page_try_to_free(m); } else if (vm_page_count_severe()) { vm_page_try_to_cache(m); } diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c index 0eb47bd..c9c09cb 100644 --- a/sys/kern/vfs_cluster.c +++ b/sys/kern/vfs_cluster.c @@ -505,6 +505,15 @@ cluster_callback(bp) tbp->b_dirtyoff = tbp->b_dirtyend = 0; tbp->b_flags &= ~B_INVAL; tbp->b_ioflags &= ~BIO_ERROR; + /* + * XXX the bdwrite()/bqrelse() issued during + * cluster building clears B_RELBUF (see bqrelse() + * comment). If direct I/O was specified, we have + * to restore it here to allow the buffer and VM + * to be freed. 
+ */ + if (tbp->b_flags & B_DIRECT) + tbp->b_flags |= B_RELBUF; } bufdone(tbp); } diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c index fd13579..de7a7ce 100644 --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -352,6 +352,8 @@ vn_read(fp, uio, cred, flags, p) ioflag = 0; if (fp->f_flag & FNONBLOCK) ioflag |= IO_NDELAY; + if (fp->f_flag & O_DIRECT) + ioflag |= IO_DIRECT; VOP_LEASE(vp, p, cred, LEASE_READ); vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, p); if ((flags & FOF_OFFSET) == 0) @@ -393,6 +395,8 @@ vn_write(fp, uio, cred, flags, p) ioflag |= IO_APPEND; if (fp->f_flag & FNONBLOCK) ioflag |= IO_NDELAY; + if (fp->f_flag & O_DIRECT) + ioflag |= IO_DIRECT; if ((fp->f_flag & O_FSYNC) || (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))) ioflag |= IO_SYNC; diff --git a/sys/sys/buf.h b/sys/sys/buf.h index 5a449ac4..b285db2 100644 --- a/sys/sys/buf.h +++ b/sys/sys/buf.h @@ -187,13 +187,17 @@ struct buf { * The buffer's data is always PAGE_SIZE aligned even * if b_bufsize and b_bcount are not. ( b_bufsize is * always at least DEV_BSIZE aligned, though ). - * + * + * B_DIRECT Hint that we should attempt to completely free + * the pages underlying the buffer. B_DIRECT is + * sticky until the buffer is released and typically + * only has an effect when B_RELBUF is also set. */ #define B_AGE 0x00000001 /* Move to age queue when I/O done. */ #define B_NEEDCOMMIT 0x00000002 /* Append-write in progress. */ #define B_ASYNC 0x00000004 /* Start I/O, do not wait. */ -#define B_UNUSED0 0x00000008 /* Old B_BAD */ +#define B_DIRECT 0x00000008 /* direct I/O flag (pls free vmio) */ #define B_DEFERRED 0x00000010 /* Skipped over for cleaning */ #define B_CACHE 0x00000020 /* Bread found us in the cache. */ #define B_VALIDSUSPWRT 0x00000040 /* Valid write during suspension. 
*/ @@ -225,7 +229,7 @@ struct buf { "\33paging\32xxx\31writeinprog\30want\27relbuf\26dirty" \ "\25read\24raw\23phys\22clusterok\21malloc\20nocache" \ "\17locked\16inval\15scanned\14error\13eintr\12done\11freebuf" \ - "\10delwri\7call\6cache\4bad\3async\2needcommit\1age" + "\10delwri\7call\6cache\4direct\3async\2needcommit\1age" /* * These flags are kept in b_xflags. diff --git a/sys/sys/fcntl.h b/sys/sys/fcntl.h index 6bbf156..85a168c 100644 --- a/sys/sys/fcntl.h +++ b/sys/sys/fcntl.h @@ -98,15 +98,18 @@ /* Defined by POSIX 1003.1; BSD default, but must be distinct from O_RDONLY. */ #define O_NOCTTY 0x8000 /* don't assign controlling terminal */ +/* Attempt to bypass buffer cache */ +#define O_DIRECT 0x00010000 + #ifdef _KERNEL /* convert from open() flags to/from fflags; convert O_RD/WR to FREAD/FWRITE */ #define FFLAGS(oflags) ((oflags) + 1) #define OFLAGS(fflags) ((fflags) - 1) /* bits to save after open */ -#define FMASK (FREAD|FWRITE|FAPPEND|FASYNC|FFSYNC|FNONBLOCK) +#define FMASK (FREAD|FWRITE|FAPPEND|FASYNC|FFSYNC|FNONBLOCK|O_DIRECT) /* bits settable by fcntl(F_SETFL, ...) 
*/ -#define FCNTLFLAGS (FAPPEND|FASYNC|FFSYNC|FNONBLOCK|FPOSIXSHM) +#define FCNTLFLAGS (FAPPEND|FASYNC|FFSYNC|FNONBLOCK|FPOSIXSHM|O_DIRECT) #endif /* diff --git a/sys/sys/file.h b/sys/sys/file.h index 207f227..6e04352 100644 --- a/sys/sys/file.h +++ b/sys/sys/file.h @@ -56,7 +56,7 @@ struct knote; */ struct file { LIST_ENTRY(file) f_list;/* list of active files */ - short f_flag; /* see fcntl.h */ + short f_FILLER3; /* (old f_flag) */ #define DTYPE_VNODE 1 /* file */ #define DTYPE_SOCKET 2 /* communications endpoint */ #define DTYPE_PIPE 3 /* pipe */ @@ -93,6 +93,7 @@ struct file { */ off_t f_offset; caddr_t f_data; /* vnode or socket */ + u_int f_flag; /* see fcntl.h */ }; #ifdef MALLOC_DECLARE diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h index e530d4a..f4650dc 100644 --- a/sys/sys/vnode.h +++ b/sys/sys/vnode.h @@ -220,6 +220,7 @@ struct vattr { #define IO_VMIO 0x20 /* data already in VMIO space */ #define IO_INVAL 0x40 /* invalidate after I/O */ #define IO_ASYNC 0x80 /* bawrite rather then bdwrite */ +#define IO_DIRECT 0x100 /* attempt to bypass buffer cache */ /* * Modes. Some values same as Ixxx entries from inode.h for now. diff --git a/sys/ufs/ufs/ufs_readwrite.c b/sys/ufs/ufs/ufs_readwrite.c index 6f56aee..c40d98f 100644 --- a/sys/ufs/ufs/ufs_readwrite.c +++ b/sys/ufs/ufs/ufs_readwrite.c @@ -287,6 +287,15 @@ READ(ap) } /* + * If IO_DIRECT then set B_DIRECT for the buffer. This + * will cause us to attempt to release the buffer later on + * and will cause the buffer cache to attempt to free the + * underlying pages. + */ + if (ioflag & IO_DIRECT) + bp->b_flags |= B_DIRECT; + + /* * We should only get non-zero b_resid when an I/O error * has occurred, which should cause us to break above. 
* However, if the short read did not cause an error, @@ -328,12 +337,12 @@ READ(ap) if (error) break; - if ((ioflag & IO_VMIO) && + if ((ioflag & (IO_VMIO|IO_DIRECT)) && (LIST_FIRST(&bp->b_dep) == NULL)) { /* - * If there are no dependencies, and - * it's VMIO, then we don't need the buf, - * mark it available for freeing. The VM has the data. + * If there are no dependencies, and it's VMIO, + * then we don't need the buf, mark it available + * for freeing. The VM has the data. */ bp->b_flags |= B_RELBUF; brelse(bp); @@ -355,7 +364,7 @@ READ(ap) * so it must have come from a 'break' statement */ if (bp != NULL) { - if ((ioflag & IO_VMIO) && + if ((ioflag & (IO_VMIO|IO_DIRECT)) && (LIST_FIRST(&bp->b_dep) == NULL)) { bp->b_flags |= B_RELBUF; brelse(bp); @@ -514,6 +523,8 @@ WRITE(ap) ap->a_cred, flags, &bp); if (error != 0) break; + if (ioflag & IO_DIRECT) + bp->b_flags |= B_DIRECT; if (uio->uio_offset + xfersize > ip->i_size) { ip->i_size = uio->uio_offset + xfersize; @@ -526,10 +537,18 @@ WRITE(ap) error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); - if ((ioflag & IO_VMIO) && - (LIST_FIRST(&bp->b_dep) == NULL)) + if ((ioflag & (IO_VMIO|IO_DIRECT)) && + (LIST_FIRST(&bp->b_dep) == NULL)) { bp->b_flags |= B_RELBUF; + } + /* + * If IO_SYNC each buffer is written synchronously. Otherwise + * if we have a severe page deficiency write the buffer + * asynchronously. Otherwise try to cluster, and if that + * doesn't do it then either do an async write (if O_DIRECT), + * or a delayed write (if not). 
+ */ if (ioflag & IO_SYNC) { (void)bwrite(bp); } else if (vm_page_count_severe() || @@ -544,6 +563,9 @@ WRITE(ap) } else { bawrite(bp); } + } else if (ioflag & IO_DIRECT) { + bp->b_flags |= B_CLUSTEROK; + bawrite(bp); } else { bp->b_flags |= B_CLUSTEROK; bdwrite(bp); diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index c1817d5..dc391cb 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -1304,6 +1304,29 @@ vm_page_try_to_cache(vm_page_t m) } /* + * vm_page_try_to_free() + * + * Attempt to free the page. If we cannot free it, we do nothing. + * 1 is returned on success, 0 on failure. + */ +int +vm_page_try_to_free(m) + vm_page_t m; +{ + if (m->dirty || m->hold_count || m->busy || m->wire_count || + (m->flags & (PG_BUSY|PG_UNMANAGED))) { + return(0); + } + vm_page_test_dirty(m); + if (m->dirty) + return(0); + vm_page_busy(m); + vm_page_protect(m, VM_PROT_NONE); + vm_page_free(m); + return(1); +} + +/* * vm_page_cache * * Put the specified page onto the page cache queue (if appropriate). diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h index e1c1cc4..6bc7266 100644 --- a/sys/vm/vm_page.h +++ b/sys/vm/vm_page.h @@ -421,6 +421,7 @@ vm_page_t vm_page_alloc __P((vm_object_t, vm_pindex_t, int)); vm_page_t vm_page_grab __P((vm_object_t, vm_pindex_t, int)); void vm_page_cache __P((register vm_page_t)); int vm_page_try_to_cache __P((vm_page_t)); +int vm_page_try_to_free __P((vm_page_t)); void vm_page_dontneed __P((register vm_page_t)); static __inline void vm_page_copy __P((vm_page_t, vm_page_t)); static __inline void vm_page_free __P((vm_page_t)); |