diff options
author | jhb <jhb@FreeBSD.org> | 2011-11-04 04:02:50 +0000 |
---|---|---|
committer | jhb <jhb@FreeBSD.org> | 2011-11-04 04:02:50 +0000 |
commit | 78c075174e74e727279365476d0d076d6c3e3075 (patch) | |
tree | 159ae25b13b965df34d0e93885cca08178c0b2a2 /sys/kern/vfs_vnops.c | |
parent | 1e2d8c9d67bc3fa3bf3a560b9b8eac1745104048 (diff) | |
download | FreeBSD-src-78c075174e74e727279365476d0d076d6c3e3075.zip FreeBSD-src-78c075174e74e727279365476d0d076d6c3e3075.tar.gz |
Add the posix_fadvise(2) system call. It is somewhat similar to
madvise(2) except that it operates on a file descriptor instead of a
memory region. It is currently only supported on regular files.
Just as with madvise(2), the advice given to posix_fadvise(2) can be
divided into two types. The first type provide hints about data access
patterns and are used in the file read and write routines to modify the
I/O flags passed down to VOP_READ() and VOP_WRITE(). These modes are
thus filesystem independent. Note that to ease implementation (and
since this API is only advisory anyway), only a single non-normal
range is allowed per file descriptor.
The second type of hints are used to hint to the OS that data will or
will not be used. These hints are implemented via a new VOP_ADVISE().
A default implementation is provided which does nothing for the WILLNEED
request and attempts to move any clean pages to the cache page queue for
the DONTNEED request. This latter case required two other changes.
First, a new V_CLEANONLY flag was added to vinvalbuf(). This requests
vinvalbuf() to only flush clean buffers for the vnode from the buffer
cache and to not remove any backing pages from the vnode. This is
used to ensure clean pages are not wired into the buffer cache before
attempting to move them to the cache page queue. The second change adds
a new vm_object_page_cache() method. This method is somewhat similar to
vm_object_page_remove() except that instead of freeing each page in the
specified range, it attempts to move clean pages to the cache queue if
possible.
To preserve the ABI of struct file, the f_cdevpriv pointer is now reused
in a union to point to the currently active advice region if one is
present for regular files.
Reviewed by: jilles, kib, arch@
Approved by: re (kib)
MFC after: 1 month
Diffstat (limited to 'sys/kern/vfs_vnops.c')
-rw-r--r-- | sys/kern/vfs_vnops.c | 76 |
1 files changed, 62 insertions, 14 deletions
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c index 17dc5e7..e33592a 100644 --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -518,7 +518,7 @@ vn_read(fp, uio, active_cred, flags, td) struct vnode *vp; int error, ioflag; struct mtx *mtxp; - int vfslocked; + int advice, vfslocked; KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); @@ -529,27 +529,48 @@ vn_read(fp, uio, active_cred, flags, td) ioflag |= IO_NDELAY; if (fp->f_flag & O_DIRECT) ioflag |= IO_DIRECT; + advice = POSIX_FADV_NORMAL; vfslocked = VFS_LOCK_GIANT(vp->v_mount); /* * According to McKusick the vn lock was protecting f_offset here. * It is now protected by the FOFFSET_LOCKED flag. */ - if ((flags & FOF_OFFSET) == 0) { + if ((flags & FOF_OFFSET) == 0 || fp->f_advice != NULL) { mtxp = mtx_pool_find(mtxpool_sleep, fp); mtx_lock(mtxp); - while(fp->f_vnread_flags & FOFFSET_LOCKED) { - fp->f_vnread_flags |= FOFFSET_LOCK_WAITING; - msleep(&fp->f_vnread_flags, mtxp, PUSER -1, - "vnread offlock", 0); + if ((flags & FOF_OFFSET) == 0) { + while (fp->f_vnread_flags & FOFFSET_LOCKED) { + fp->f_vnread_flags |= FOFFSET_LOCK_WAITING; + msleep(&fp->f_vnread_flags, mtxp, PUSER -1, + "vnread offlock", 0); + } + fp->f_vnread_flags |= FOFFSET_LOCKED; + uio->uio_offset = fp->f_offset; } - fp->f_vnread_flags |= FOFFSET_LOCKED; + if (fp->f_advice != NULL && + uio->uio_offset >= fp->f_advice->fa_start && + uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end) + advice = fp->f_advice->fa_advice; mtx_unlock(mtxp); - vn_lock(vp, LK_SHARED | LK_RETRY); - uio->uio_offset = fp->f_offset; - } else - vn_lock(vp, LK_SHARED | LK_RETRY); + } + vn_lock(vp, LK_SHARED | LK_RETRY); - ioflag |= sequential_heuristic(uio, fp); + switch (advice) { + case POSIX_FADV_NORMAL: + case POSIX_FADV_SEQUENTIAL: + ioflag |= sequential_heuristic(uio, fp); + break; + case POSIX_FADV_RANDOM: + /* Disable read-ahead for random I/O. */ + break; + case POSIX_FADV_NOREUSE: + /* + * Request the underlying FS to discard the buffers + * and pages after the I/O is complete. + */ + ioflag |= IO_DIRECT; + break; + } #ifdef MAC error = mac_vnode_check_read(active_cred, fp->f_cred, vp); @@ -584,7 +605,8 @@ vn_write(fp, uio, active_cred, flags, td) struct vnode *vp; struct mount *mp; int error, ioflag, lock_flags; - int vfslocked; + struct mtx *mtxp; + int advice, vfslocked; KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); @@ -618,7 +640,33 @@ vn_write(fp, uio, active_cred, flags, td) vn_lock(vp, lock_flags | LK_RETRY); if ((flags & FOF_OFFSET) == 0) uio->uio_offset = fp->f_offset; - ioflag |= sequential_heuristic(uio, fp); + advice = POSIX_FADV_NORMAL; + if (fp->f_advice != NULL) { + mtxp = mtx_pool_find(mtxpool_sleep, fp); + mtx_lock(mtxp); + if (fp->f_advice != NULL && + uio->uio_offset >= fp->f_advice->fa_start && + uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end) + advice = fp->f_advice->fa_advice; + mtx_unlock(mtxp); + } + switch (advice) { + case POSIX_FADV_NORMAL: + case POSIX_FADV_SEQUENTIAL: + ioflag |= sequential_heuristic(uio, fp); + break; + case POSIX_FADV_RANDOM: + /* XXX: Is this correct? */ + break; + case POSIX_FADV_NOREUSE: + /* + * Request the underlying FS to discard the buffers + * and pages after the I/O is complete. + */ + ioflag |= IO_DIRECT; + break; + } + #ifdef MAC error = mac_vnode_check_write(active_cred, fp->f_cred, vp); if (error == 0) |