path: root/sys/kern/vfs_vnops.c
author		jhb <jhb@FreeBSD.org>	2011-11-04 04:02:50 +0000
committer	jhb <jhb@FreeBSD.org>	2011-11-04 04:02:50 +0000
commit		78c075174e74e727279365476d0d076d6c3e3075 (patch)
tree		159ae25b13b965df34d0e93885cca08178c0b2a2 /sys/kern/vfs_vnops.c
parent		1e2d8c9d67bc3fa3bf3a560b9b8eac1745104048 (diff)
download	FreeBSD-src-78c075174e74e727279365476d0d076d6c3e3075.zip
		FreeBSD-src-78c075174e74e727279365476d0d076d6c3e3075.tar.gz
Add the posix_fadvise(2) system call.  It is somewhat similar to
madvise(2) except that it operates on a file descriptor instead of a
memory region.  It is currently only supported on regular files.

Just as with madvise(2), the advice given to posix_fadvise(2) can be
divided into two types.  The first type provides hints about data access
patterns and is used in the file read and write routines to modify the
I/O flags passed down to VOP_READ() and VOP_WRITE().  These modes are
thus filesystem independent.  Note that to ease implementation (and
since this API is only advisory anyway), only a single non-normal range
is allowed per file descriptor.

The second type of hints tells the OS that data will or will not be
used.  These hints are implemented via a new VOP_ADVISE().  A default
implementation is provided which does nothing for the WILLNEED request
and attempts to move any clean pages to the cache page queue for the
DONTNEED request.  This latter case required two other changes.  First,
a new V_CLEANONLY flag was added to vinvalbuf().  This requests
vinvalbuf() to only flush clean buffers for the vnode from the buffer
cache and to not remove any backing pages from the vnode.  This is used
to ensure clean pages are not wired into the buffer cache before
attempting to move them to the cache page queue.  The second change adds
a new vm_object_page_cache() method.  This method is somewhat similar to
vm_object_page_remove() except that instead of freeing each page in the
specified range, it attempts to move clean pages to the cache queue if
possible.

To preserve the ABI of struct file, the f_cdevpriv pointer is now reused
in a union to point to the currently active advice region, if one is
present for regular files.

Reviewed by:	jilles, kib, arch@
Approved by:	re (kib)
MFC after:	1 month
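For illustration, a minimal userland sketch of the first hint type
described above (the file name and range are arbitrary; note that
posix_fadvise(2) returns an error number rather than setting errno):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int error, fd;

	fd = open("/var/log/messages", O_RDONLY);
	if (fd < 0) {
		perror("open");
		return (1);
	}

	/*
	 * Hint that the first 1 MB will be accessed randomly.  Per the
	 * vn_read() change below, reads falling inside this range skip
	 * sequential_heuristic(), i.e. read-ahead is disabled.
	 */
	error = posix_fadvise(fd, 0, 1024 * 1024, POSIX_FADV_RANDOM);
	if (error != 0)
		fprintf(stderr, "posix_fadvise: error %d\n", error);

	close(fd);
	return (0);
}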
Diffstat (limited to 'sys/kern/vfs_vnops.c')
-rw-r--r--	sys/kern/vfs_vnops.c	76
1 file changed, 62 insertions, 14 deletions
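The hunks below dereference fp->f_advice->fa_advice, fa_start and
fa_end.  A sketch of the advice-region record those fields imply is
shown here; only the three field names are taken from the diff, and the
actual definition this commit adds (reached through the union that
reuses f_cdevpriv in struct file) may carry additional bookkeeping.

/*
 * Sketch of the per-descriptor advice region implied by the diff
 * below; field names come from the code, comments are assumptions.
 */
struct fadvise_info {
	int	fa_advice;	/* POSIX_FADV_* value for the range. */
	off_t	fa_start;	/* Byte range covered by the advice, */
	off_t	fa_end;		/* checked against uio_offset/uio_resid. */
};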
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
index 17dc5e7..e33592a 100644
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -518,7 +518,7 @@ vn_read(fp, uio, active_cred, flags, td)
 	struct vnode *vp;
 	int error, ioflag;
 	struct mtx *mtxp;
-	int vfslocked;
+	int advice, vfslocked;
 
 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
 	    uio->uio_td, td));
@@ -529,27 +529,48 @@ vn_read(fp, uio, active_cred, flags, td)
 		ioflag |= IO_NDELAY;
 	if (fp->f_flag & O_DIRECT)
 		ioflag |= IO_DIRECT;
+	advice = POSIX_FADV_NORMAL;
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	/*
 	 * According to McKusick the vn lock was protecting f_offset here.
 	 * It is now protected by the FOFFSET_LOCKED flag.
 	 */
-	if ((flags & FOF_OFFSET) == 0) {
+	if ((flags & FOF_OFFSET) == 0 || fp->f_advice != NULL) {
 		mtxp = mtx_pool_find(mtxpool_sleep, fp);
 		mtx_lock(mtxp);
-		while(fp->f_vnread_flags & FOFFSET_LOCKED) {
-			fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
-			msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
-			    "vnread offlock", 0);
+		if ((flags & FOF_OFFSET) == 0) {
+			while (fp->f_vnread_flags & FOFFSET_LOCKED) {
+				fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
+				msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
+				    "vnread offlock", 0);
+			}
+			fp->f_vnread_flags |= FOFFSET_LOCKED;
+			uio->uio_offset = fp->f_offset;
 		}
-		fp->f_vnread_flags |= FOFFSET_LOCKED;
+		if (fp->f_advice != NULL &&
+		    uio->uio_offset >= fp->f_advice->fa_start &&
+		    uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
+			advice = fp->f_advice->fa_advice;
 		mtx_unlock(mtxp);
-		vn_lock(vp, LK_SHARED | LK_RETRY);
-		uio->uio_offset = fp->f_offset;
-	} else
-		vn_lock(vp, LK_SHARED | LK_RETRY);
+	}
+	vn_lock(vp, LK_SHARED | LK_RETRY);
 
-	ioflag |= sequential_heuristic(uio, fp);
+	switch (advice) {
+	case POSIX_FADV_NORMAL:
+	case POSIX_FADV_SEQUENTIAL:
+		ioflag |= sequential_heuristic(uio, fp);
+		break;
+	case POSIX_FADV_RANDOM:
+		/* Disable read-ahead for random I/O. */
+		break;
+	case POSIX_FADV_NOREUSE:
+		/*
+		 * Request the underlying FS to discard the buffers
+		 * and pages after the I/O is complete.
+		 */
+		ioflag |= IO_DIRECT;
+		break;
+	}
 
 #ifdef MAC
 	error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
@@ -584,7 +605,8 @@ vn_write(fp, uio, active_cred, flags, td)
 	struct vnode *vp;
 	struct mount *mp;
 	int error, ioflag, lock_flags;
-	int vfslocked;
+	struct mtx *mtxp;
+	int advice, vfslocked;
 
 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
 	    uio->uio_td, td));
@@ -618,7 +640,33 @@ vn_write(fp, uio, active_cred, flags, td)
 	vn_lock(vp, lock_flags | LK_RETRY);
 	if ((flags & FOF_OFFSET) == 0)
 		uio->uio_offset = fp->f_offset;
-	ioflag |= sequential_heuristic(uio, fp);
+	advice = POSIX_FADV_NORMAL;
+	if (fp->f_advice != NULL) {
+		mtxp = mtx_pool_find(mtxpool_sleep, fp);
+		mtx_lock(mtxp);
+		if (fp->f_advice != NULL &&
+		    uio->uio_offset >= fp->f_advice->fa_start &&
+		    uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
+			advice = fp->f_advice->fa_advice;
+		mtx_unlock(mtxp);
+	}
+	switch (advice) {
+	case POSIX_FADV_NORMAL:
+	case POSIX_FADV_SEQUENTIAL:
+		ioflag |= sequential_heuristic(uio, fp);
+		break;
+	case POSIX_FADV_RANDOM:
+		/* XXX: Is this correct? */
+		break;
+	case POSIX_FADV_NOREUSE:
+		/*
+		 * Request the underlying FS to discard the buffers
+		 * and pages after the I/O is complete.
+		 */
+		ioflag |= IO_DIRECT;
+		break;
+	}
+
 #ifdef MAC
 	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
 	if (error == 0)
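The WILLNEED/DONTNEED hints take the separate VOP_ADVISE() path
described in the commit message rather than the read/write paths shown
above.  A minimal userland sketch of that second hint type (the file
name is arbitrary; per POSIX, a length of zero means "to end of file"):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	char buf[8192];
	int error, fd;

	fd = open("/tmp/scratch.dat", O_RDONLY);
	if (fd < 0) {
		perror("open");
		return (1);
	}

	/* Consume the file once. */
	while (read(fd, buf, sizeof(buf)) > 0)
		;

	/*
	 * Tell the kernel the data will not be reused.  With this
	 * commit, the default VOP_ADVISE() implementation tries to
	 * flush clean buffers (vinvalbuf() with V_CLEANONLY) and move
	 * clean pages to the cache queue (vm_object_page_cache()).
	 */
	error = posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
	if (error != 0)
		fprintf(stderr, "posix_fadvise: error %d\n", error);

	close(fd);
	return (0);
}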