-rw-r--r--  lib/libc/sys/Makefile.inc                 3
-rw-r--r--  lib/libc/sys/Symbol.map                   4
-rw-r--r--  lib/libc/sys/madvise.2                    3
-rw-r--r--  lib/libc/sys/posix_fadvise.2            139
-rw-r--r--  sys/compat/freebsd32/freebsd32_misc.c    13
-rw-r--r--  sys/compat/freebsd32/syscalls.master      5
-rw-r--r--  sys/kern/syscalls.master                  3
-rw-r--r--  sys/kern/vfs_default.c                   53
-rw-r--r--  sys/kern/vfs_subr.c                       7
-rw-r--r--  sys/kern/vfs_syscalls.c                 134
-rw-r--r--  sys/kern/vfs_vnops.c                     76
-rw-r--r--  sys/kern/vnode_if.src                     9
-rw-r--r--  sys/sys/fcntl.h                          11
-rw-r--r--  sys/sys/file.h                           15
-rw-r--r--  sys/sys/param.h                           2
-rw-r--r--  sys/sys/vnode.h                           2
-rw-r--r--  sys/vm/vm_object.c                       54
-rw-r--r--  sys/vm/vm_object.h                        2
18 files changed, 511 insertions, 24 deletions
diff --git a/lib/libc/sys/Makefile.inc b/lib/libc/sys/Makefile.inc
index fe5061d..6da6a00 100644
--- a/lib/libc/sys/Makefile.inc
+++ b/lib/libc/sys/Makefile.inc
@@ -96,7 +96,8 @@ MAN+= abort2.2 accept.2 access.2 acct.2 adjtime.2 \
mq_setattr.2 \
msgctl.2 msgget.2 msgrcv.2 msgsnd.2 \
msync.2 munmap.2 nanosleep.2 nfssvc.2 ntp_adjtime.2 open.2 \
- pathconf.2 pdfork.2 pipe.2 poll.2 posix_fallocate.2 posix_openpt.2 profil.2 \
+ pathconf.2 pdfork.2 pipe.2 poll.2 posix_fadvise.2 posix_fallocate.2 \
+ posix_openpt.2 profil.2 \
pselect.2 ptrace.2 quotactl.2 \
read.2 readlink.2 reboot.2 recv.2 rename.2 revoke.2 rfork.2 rmdir.2 \
rtprio.2
diff --git a/lib/libc/sys/Symbol.map b/lib/libc/sys/Symbol.map
index 095751a..d0c0c94 100644
--- a/lib/libc/sys/Symbol.map
+++ b/lib/libc/sys/Symbol.map
@@ -378,6 +378,10 @@ FBSD_1.2 {
setloginclass;
};
+FBSD_1.3 {
+ posix_fadvise;
+};
+
FBSDprivate_1.0 {
___acl_aclcheck_fd;
__sys___acl_aclcheck_fd;
diff --git a/lib/libc/sys/madvise.2 b/lib/libc/sys/madvise.2
index 48f0e5a..b5ea6b2 100644
--- a/lib/libc/sys/madvise.2
+++ b/lib/libc/sys/madvise.2
@@ -169,7 +169,8 @@ was specified and the process does not have superuser privileges.
.Xr mincore 2 ,
.Xr mprotect 2 ,
.Xr msync 2 ,
-.Xr munmap 2
+.Xr munmap 2 ,
+.Xr posix_fadvise 2
.Sh STANDARDS
The
.Fn posix_madvise
diff --git a/lib/libc/sys/posix_fadvise.2 b/lib/libc/sys/posix_fadvise.2
new file mode 100644
index 0000000..bdf321f
--- /dev/null
+++ b/lib/libc/sys/posix_fadvise.2
@@ -0,0 +1,139 @@
+.\" Copyright (c) 1991, 1993
+.\" The Regents of the University of California. All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" @(#)madvise.2 8.1 (Berkeley) 6/9/93
+.\" $FreeBSD$
+.\"
+.Dd October 26, 2011
+.Dt POSIX_FADVISE 2
+.Os
+.Sh NAME
+.Nm posix_fadvise
+.Nd give advice about use of file data
+.Sh LIBRARY
+.Lb libc
+.Sh SYNOPSIS
+.In fcntl.h
+.Ft int
+.Fn posix_fadvise "int fd" "off_t offset" "off_t len" "int advice"
+.Sh DESCRIPTION
+The
+.Fn posix_fadvise
+system call
+allows a process to describe to the system its data access behavior for an
+open file descriptor
+.Fa fd .
+The advice covers the data starting at offset
+.Fa offset
+and continuing for
+.Fa len
+bytes.
+If
+.Fa len
+is zero,
+all data from
+.Fa offset
+to the end of the file is covered.
+.Pp
+The behavior is specified by the
+.Fa advice
+parameter and may be one of:
+.Bl -tag -width POSIX_FADV_SEQUENTIAL
+.It Dv POSIX_FADV_NORMAL
+Tells the system to revert to the default data access behavior.
+.It Dv POSIX_FADV_RANDOM
+Is a hint that file data will be accessed randomly,
+and prefetching is likely not advantageous.
+.It Dv POSIX_FADV_SEQUENTIAL
+Tells the system that file data will be accessed sequentially.
+This currently does nothing as the default behavior uses heuristics to
+detect sequential behavior.
+.It Dv POSIX_FADV_WILLNEED
+Tells the system that the specified data will be accessed in the near future.
+The system may initiate an asynchronous read of the data if it is not already
+present in memory.
+.It Dv POSIX_FADV_DONTNEED
+Tells the system that the specified data will not be accessed in the near
+future.
+The system may decrease the in-memory priority of clean data within the
+specified range and future access to this data may require a read operation.
+.It Dv POSIX_FADV_NOREUSE
+Tells the system that the specified data will only be accessed once and
+then not reused.
+Accesses to data within the specified range are treated as if the file
+descriptor has the
+.Dv O_DIRECT
+flag enabled.
+.El
+.Pp
+.Sh RETURN VALUES
+.Rv -std posix_fadvise
+.Sh ERRORS
+The
+.Fn posix_fadvise
+system call will fail if:
+.Bl -tag -width Er
+.It Bq Er EBADF
+The
+.Fa fd
+argument is not a valid file descriptor.
+.It Bq Er EINVAL
+The
+.Fa advice
+argument is not valid.
+.It Bq Er EINVAL
+The
+.Fa offset
+or
+.Fa len
+arguments are negative,
+or
+.Fa offset
++
+.Fa len
+is greater than the maximum file size.
+.It Bq Er ENODEV
+The
+.Fa fd
+argument does not refer to a regular file.
+.It Bq Er ESPIPE
+The
+.Fa fd
+argument is associated with a pipe or FIFO.
+.El
+.Sh SEE ALSO
+.Xr madvise 2
+.Sh STANDARDS
+The
+.Fn posix_fadvise
+interface conforms to
+.St -p1003.1-2001 .
+.Sh HISTORY
+The
+.Fn posix_fadvise
+system call first appeared in
+.Fx 10.0 .
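
As an illustration of the interface documented above (not part of this change), a userland caller might hint sequential access before streaming through a file. The path below is hypothetical and error handling is minimal:

/*
 * Sketch only, not part of this commit: stream a file after hinting
 * sequential access.  The path is hypothetical.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	char buf[64 * 1024];
	ssize_t n;
	int fd;

	fd = open("/tmp/bigfile", O_RDONLY);
	if (fd == -1) {
		perror("open");
		return (1);
	}
	/* len == 0 covers everything from offset to end of file. */
	if (posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL) != 0)
		fprintf(stderr, "posix_fadvise failed (hint ignored)\n");
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		;	/* consume the data */
	close(fd);
	return (0);
}

Because the call is only advisory, a failure here does not prevent the subsequent reads from succeeding.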
diff --git a/sys/compat/freebsd32/freebsd32_misc.c b/sys/compat/freebsd32/freebsd32_misc.c
index fc2932b..83ee962 100644
--- a/sys/compat/freebsd32/freebsd32_misc.c
+++ b/sys/compat/freebsd32/freebsd32_misc.c
@@ -2835,3 +2835,16 @@ freebsd32_posix_fallocate(struct thread *td,
ap.len = PAIR32TO64(off_t, uap->len);
return (sys_posix_fallocate(td, &ap));
}
+
+int
+freebsd32_posix_fadvise(struct thread *td,
+ struct freebsd32_posix_fadvise_args *uap)
+{
+ struct posix_fadvise_args ap;
+
+ ap.fd = uap->fd;
+ ap.offset = PAIR32TO64(off_t, uap->offset);
+ ap.len = PAIR32TO64(off_t, uap->len);
+ ap.advice = uap->advice;
+ return (sys_posix_fadvise(td, &ap));
+}
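
For readers unfamiliar with the freebsd32 compat layer: a 32-bit process passes each 64-bit off_t as two 32-bit syscall arguments (offset1/offset2 and len1/len2 in syscalls.master), and PAIR32TO64 in the wrapper above reassembles them. A rough standalone sketch of that reassembly follows; the real macro lives in the freebsd32 headers and also handles big-endian half order, so this is illustration only:

/*
 * Sketch of rebuilding a 64-bit value from two 32-bit halves, as the
 * freebsd32 wrapper does via PAIR32TO64.  Little-endian half order is
 * assumed here purely for illustration.
 */
#include <sys/types.h>
#include <stdint.h>

static off_t
pair32to64_sketch(uint32_t lo, uint32_t hi)
{
	return ((off_t)((uint64_t)hi << 32 | lo));
}

Under that assumption, pair32to64_sketch(uap->offset1, uap->offset2) would recover the original off_t passed by the 32-bit caller.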
diff --git a/sys/compat/freebsd32/syscalls.master b/sys/compat/freebsd32/syscalls.master
index 9428b6c..cb22f8b 100644
--- a/sys/compat/freebsd32/syscalls.master
+++ b/sys/compat/freebsd32/syscalls.master
@@ -991,4 +991,7 @@
530 AUE_NULL STD { int freebsd32_posix_fallocate(int fd,\
uint32_t offset1, uint32_t offset2,\
uint32_t len1, uint32_t len2); }
-531 AUE_NULL UNIMPL posix_fadvise
+531 AUE_NULL STD { int freebsd32_posix_fadvise(int fd, \
+ uint32_t offset1, uint32_t offset2,\
+ uint32_t len1, uint32_t len2, \
+ int advice); }
diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master
index b79c6c7..8188ccd 100644
--- a/sys/kern/syscalls.master
+++ b/sys/kern/syscalls.master
@@ -947,6 +947,7 @@
size_t outbuflen); }
530 AUE_NULL STD { int posix_fallocate(int fd, \
off_t offset, off_t len); }
-531 AUE_NULL UNIMPL posix_fadvise
+531 AUE_NULL STD { int posix_fadvise(int fd, off_t offset, \
+ off_t len, int advice); }
; Please copy any additions and changes to the following compatibility tables:

; sys/compat/freebsd32/syscalls.master
diff --git a/sys/kern/vfs_default.c b/sys/kern/vfs_default.c
index b89d990..e9f8151 100644
--- a/sys/kern/vfs_default.c
+++ b/sys/kern/vfs_default.c
@@ -96,6 +96,7 @@ struct vop_vector default_vnodeops = {
.vop_access = vop_stdaccess,
.vop_accessx = vop_stdaccessx,
+ .vop_advise = vop_stdadvise,
.vop_advlock = vop_stdadvlock,
.vop_advlockasync = vop_stdadvlockasync,
.vop_advlockpurge = vop_stdadvlockpurge,
@@ -984,6 +985,58 @@ vop_stdallocate(struct vop_allocate_args *ap)
return (error);
}
+int
+vop_stdadvise(struct vop_advise_args *ap)
+{
+ struct vnode *vp;
+ off_t start, end;
+ int error, vfslocked;
+
+ vp = ap->a_vp;
+ switch (ap->a_advice) {
+ case POSIX_FADV_WILLNEED:
+ /*
+ * Do nothing for now. Filesystems should provide a
+ * custom method which starts an asynchronous read of
+ * the requested region.
+ */
+ error = 0;
+ break;
+ case POSIX_FADV_DONTNEED:
+ /*
+ * Flush any open FS buffers and then remove pages
+ * from the backing VM object. Using vinvalbuf() here
+ * is a bit heavy-handed as it flushes all buffers for
+ * the given vnode, not just the buffers covering the
+ * requested range.
+ */
+ error = 0;
+ vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ if (vp->v_iflag & VI_DOOMED) {
+ VOP_UNLOCK(vp, 0);
+ VFS_UNLOCK_GIANT(vfslocked);
+ break;
+ }
+ vinvalbuf(vp, V_CLEANONLY, 0, 0);
+ if (vp->v_object != NULL) {
+ start = trunc_page(ap->a_start);
+ end = round_page(ap->a_end);
+ VM_OBJECT_LOCK(vp->v_object);
+ vm_object_page_cache(vp->v_object, OFF_TO_IDX(start),
+ OFF_TO_IDX(end));
+ VM_OBJECT_UNLOCK(vp->v_object);
+ }
+ VOP_UNLOCK(vp, 0);
+ VFS_UNLOCK_GIANT(vfslocked);
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ return (error);
+}
+
/*
* vfs default ops
* used to fill the vfs function table to get reasonable default return values.
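
The POSIX_FADV_DONTNEED branch of vop_stdadvise() above converts the byte range to page indices before calling vm_object_page_cache(). A standalone sketch of that arithmetic, assuming the common 4 KiB page size (these helpers mimic trunc_page(), round_page() and OFF_TO_IDX() and are not the kernel macros):

/*
 * Byte range [5000, 20000) rounds out to pages [1, 5) with 4 KiB pages:
 * the start is truncated down and the end rounded up to page boundaries,
 * then divided by the page size to get page indices.
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE	4096ULL
#define PAGE_MASK	(PAGE_SIZE - 1)

int
main(void)
{
	uint64_t start = 5000, end = 20000;		/* arbitrary offsets */
	uint64_t s = start & ~PAGE_MASK;		/* trunc_page -> 4096 */
	uint64_t e = (end + PAGE_MASK) & ~PAGE_MASK;	/* round_page -> 20480 */

	printf("pages [%ju, %ju)\n", (uintmax_t)(s / PAGE_SIZE),
	    (uintmax_t)(e / PAGE_SIZE));		/* prints "pages [1, 5)" */
	return (0);
}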
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index 10d3ad2..2872f77 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -1191,7 +1191,7 @@ bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo)
do {
error = flushbuflist(&bo->bo_clean,
flags, bo, slpflag, slptimeo);
- if (error == 0)
+ if (error == 0 && !(flags & V_CLEANONLY))
error = flushbuflist(&bo->bo_dirty,
flags, bo, slpflag, slptimeo);
if (error != 0 && error != EAGAIN) {
@@ -1220,7 +1220,8 @@ bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo)
/*
* Destroy the copy in the VM cache, too.
*/
- if (bo->bo_object != NULL && (flags & (V_ALT | V_NORMAL)) == 0) {
+ if (bo->bo_object != NULL &&
+ (flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0) {
VM_OBJECT_LOCK(bo->bo_object);
vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ?
OBJPR_CLEANONLY : 0);
@@ -1229,7 +1230,7 @@ bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo)
#ifdef INVARIANTS
BO_LOCK(bo);
- if ((flags & (V_ALT | V_NORMAL)) == 0 &&
+ if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0 &&
(bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0))
panic("vinvalbuf: flush failed");
BO_UNLOCK(bo);
diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c
index ec5ad06..e886079 100644
--- a/sys/kern/vfs_syscalls.c
+++ b/sys/kern/vfs_syscalls.c
@@ -86,6 +86,8 @@ __FBSDID("$FreeBSD$");
#include <vm/vm_page.h>
#include <vm/uma.h>
+static MALLOC_DEFINE(M_FADVISE, "fadvise", "posix_fadvise(2) information");
+
SDT_PROVIDER_DEFINE(vfs);
SDT_PROBE_DEFINE(vfs, , stat, mode, mode);
SDT_PROBE_ARGTYPE(vfs, , stat, mode, 0, "char *");
@@ -4845,3 +4847,135 @@ sys_posix_fallocate(struct thread *td, struct posix_fallocate_args *uap)
return (kern_posix_fallocate(td, uap->fd, uap->offset, uap->len));
}
+
+/*
+ * Unlike madvise(2), we do not make a best effort to remember every
+ * possible caching hint. Instead, we remember the last setting with
+ * the exception that we will allow POSIX_FADV_NORMAL to adjust the
+ * region of any current setting.
+ */
+int
+sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap)
+{
+ struct fadvise_info *fa, *new;
+ struct file *fp;
+ struct vnode *vp;
+ off_t end;
+ int error;
+
+ if (uap->offset < 0 || uap->len < 0 ||
+ uap->offset > OFF_MAX - uap->len)
+ return (EINVAL);
+ switch (uap->advice) {
+ case POSIX_FADV_SEQUENTIAL:
+ case POSIX_FADV_RANDOM:
+ case POSIX_FADV_NOREUSE:
+ new = malloc(sizeof(*fa), M_FADVISE, M_WAITOK);
+ break;
+ case POSIX_FADV_NORMAL:
+ case POSIX_FADV_WILLNEED:
+ case POSIX_FADV_DONTNEED:
+ new = NULL;
+ break;
+ default:
+ return (EINVAL);
+ }
+ /* XXX: CAP_POSIX_FADVISE? */
+ error = fget(td, uap->fd, 0, &fp);
+ if (error != 0)
+ goto out;
+
+ switch (fp->f_type) {
+ case DTYPE_VNODE:
+ break;
+ case DTYPE_PIPE:
+ case DTYPE_FIFO:
+ error = ESPIPE;
+ goto out;
+ default:
+ error = ENODEV;
+ goto out;
+ }
+ vp = fp->f_vnode;
+ if (vp->v_type != VREG) {
+ error = ENODEV;
+ goto out;
+ }
+ if (uap->len == 0)
+ end = OFF_MAX;
+ else
+ end = uap->offset + uap->len - 1;
+ switch (uap->advice) {
+ case POSIX_FADV_SEQUENTIAL:
+ case POSIX_FADV_RANDOM:
+ case POSIX_FADV_NOREUSE:
+ /*
+ * Try to merge any existing non-standard region with
+ * this new region if possible, otherwise create a new
+ * non-standard region for this request.
+ */
+ mtx_pool_lock(mtxpool_sleep, fp);
+ fa = fp->f_advice;
+ if (fa != NULL && fa->fa_advice == uap->advice &&
+ ((fa->fa_start <= end && fa->fa_end >= uap->offset) ||
+ (end != OFF_MAX && fa->fa_start == end + 1) ||
+ (fa->fa_end != OFF_MAX && fa->fa_end + 1 == uap->offset))) {
+ if (uap->offset < fa->fa_start)
+ fa->fa_start = uap->offset;
+ if (end > fa->fa_end)
+ fa->fa_end = end;
+ } else {
+ new->fa_advice = uap->advice;
+ new->fa_start = uap->offset;
+ new->fa_end = end;
+ fp->f_advice = new;
+ new = fa;
+ }
+ mtx_pool_unlock(mtxpool_sleep, fp);
+ break;
+ case POSIX_FADV_NORMAL:
+ /*
+ * If the "normal" region overlaps with an existing
+ * non-standard region, trim or remove the
+ * non-standard region.
+ */
+ mtx_pool_lock(mtxpool_sleep, fp);
+ fa = fp->f_advice;
+ if (fa != NULL) {
+ if (uap->offset <= fa->fa_start &&
+ end >= fa->fa_end) {
+ new = fa;
+ fp->f_advice = NULL;
+ } else if (uap->offset <= fa->fa_start &&
+ end >= fa->fa_start)
+ fa->fa_start = end + 1;
+ else if (uap->offset <= fa->fa_end &&
+ end >= fa->fa_end)
+ fa->fa_end = uap->offset - 1;
+ else if (uap->offset >= fa->fa_start &&
+ end <= fa->fa_end) {
+ /*
+ * If the "normal" region is a middle
+ * portion of the existing
+ * non-standard region, just remove
+ * the whole thing rather than picking
+ * one side or the other to
+ * preserve.
+ */
+ new = fa;
+ fp->f_advice = NULL;
+ }
+ }
+ mtx_pool_unlock(mtxpool_sleep, fp);
+ break;
+ case POSIX_FADV_WILLNEED:
+ case POSIX_FADV_DONTNEED:
+ error = VOP_ADVISE(vp, uap->offset, end, uap->advice);
+ break;
+ }
+out:
+ if (fp != NULL)
+ fdrop(fp, td);
+ free(new, M_FADVISE);
+ return (error);
+}
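
To make the merge test in the POSIX_FADV_SEQUENTIAL/RANDOM/NOREUSE branch above concrete: two regions carrying the same advice are coalesced when they overlap or directly abut; the kernel code additionally guards the adjacency checks against OFF_MAX overflow. A standalone sketch, for illustration only:

/*
 * Sketch of the coalescing test: inclusive [start, end] regions merge
 * when they overlap or are adjacent.
 */
#include <stdbool.h>
#include <stdio.h>

struct region {
	long long start;
	long long end;		/* inclusive, like fa_end */
};

static bool
mergeable(struct region a, struct region b)
{
	return ((a.start <= b.end && a.end >= b.start) ||
	    a.end + 1 == b.start || b.end + 1 == a.start);
}

int
main(void)
{
	struct region cur = { 0, 4095 };	/* existing hint */
	struct region req = { 4096, 8191 };	/* new request: abuts cur */

	printf("%s\n", mergeable(cur, req) ? "merge" : "replace");
	return (0);
}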
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
index 17dc5e7..e33592a 100644
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -518,7 +518,7 @@ vn_read(fp, uio, active_cred, flags, td)
struct vnode *vp;
int error, ioflag;
struct mtx *mtxp;
- int vfslocked;
+ int advice, vfslocked;
KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
uio->uio_td, td));
@@ -529,27 +529,48 @@ vn_read(fp, uio, active_cred, flags, td)
ioflag |= IO_NDELAY;
if (fp->f_flag & O_DIRECT)
ioflag |= IO_DIRECT;
+ advice = POSIX_FADV_NORMAL;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
/*
* According to McKusick the vn lock was protecting f_offset here.
* It is now protected by the FOFFSET_LOCKED flag.
*/
- if ((flags & FOF_OFFSET) == 0) {
+ if ((flags & FOF_OFFSET) == 0 || fp->f_advice != NULL) {
mtxp = mtx_pool_find(mtxpool_sleep, fp);
mtx_lock(mtxp);
- while(fp->f_vnread_flags & FOFFSET_LOCKED) {
- fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
- msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
- "vnread offlock", 0);
+ if ((flags & FOF_OFFSET) == 0) {
+ while (fp->f_vnread_flags & FOFFSET_LOCKED) {
+ fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
+ msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
+ "vnread offlock", 0);
+ }
+ fp->f_vnread_flags |= FOFFSET_LOCKED;
+ uio->uio_offset = fp->f_offset;
}
- fp->f_vnread_flags |= FOFFSET_LOCKED;
+ if (fp->f_advice != NULL &&
+ uio->uio_offset >= fp->f_advice->fa_start &&
+ uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
+ advice = fp->f_advice->fa_advice;
mtx_unlock(mtxp);
- vn_lock(vp, LK_SHARED | LK_RETRY);
- uio->uio_offset = fp->f_offset;
- } else
- vn_lock(vp, LK_SHARED | LK_RETRY);
+ }
+ vn_lock(vp, LK_SHARED | LK_RETRY);
- ioflag |= sequential_heuristic(uio, fp);
+ switch (advice) {
+ case POSIX_FADV_NORMAL:
+ case POSIX_FADV_SEQUENTIAL:
+ ioflag |= sequential_heuristic(uio, fp);
+ break;
+ case POSIX_FADV_RANDOM:
+ /* Disable read-ahead for random I/O. */
+ break;
+ case POSIX_FADV_NOREUSE:
+ /*
+ * Request the underlying FS to discard the buffers
+ * and pages after the I/O is complete.
+ */
+ ioflag |= IO_DIRECT;
+ break;
+ }
#ifdef MAC
error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
@@ -584,7 +605,8 @@ vn_write(fp, uio, active_cred, flags, td)
struct vnode *vp;
struct mount *mp;
int error, ioflag, lock_flags;
- int vfslocked;
+ struct mtx *mtxp;
+ int advice, vfslocked;
KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
uio->uio_td, td));
@@ -618,7 +640,33 @@ vn_write(fp, uio, active_cred, flags, td)
vn_lock(vp, lock_flags | LK_RETRY);
if ((flags & FOF_OFFSET) == 0)
uio->uio_offset = fp->f_offset;
- ioflag |= sequential_heuristic(uio, fp);
+ advice = POSIX_FADV_NORMAL;
+ if (fp->f_advice != NULL) {
+ mtxp = mtx_pool_find(mtxpool_sleep, fp);
+ mtx_lock(mtxp);
+ if (fp->f_advice != NULL &&
+ uio->uio_offset >= fp->f_advice->fa_start &&
+ uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
+ advice = fp->f_advice->fa_advice;
+ mtx_unlock(mtxp);
+ }
+ switch (advice) {
+ case POSIX_FADV_NORMAL:
+ case POSIX_FADV_SEQUENTIAL:
+ ioflag |= sequential_heuristic(uio, fp);
+ break;
+ case POSIX_FADV_RANDOM:
+ /* XXX: Is this correct? */
+ break;
+ case POSIX_FADV_NOREUSE:
+ /*
+ * Request the underlying FS to discard the buffers
+ * and pages after the I/O is complete.
+ */
+ ioflag |= IO_DIRECT;
+ break;
+ }
+
#ifdef MAC
error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
if (error == 0)
diff --git a/sys/kern/vnode_if.src b/sys/kern/vnode_if.src
index f75e7df..6f24d17 100644
--- a/sys/kern/vnode_if.src
+++ b/sys/kern/vnode_if.src
@@ -628,3 +628,12 @@ vop_allocate {
INOUT off_t *offset;
INOUT off_t *len;
};
+
+%% advise vp U U U
+
+vop_advise {
+ IN struct vnode *vp;
+ IN off_t start;
+ IN off_t end;
+ IN int advice;
+};
diff --git a/sys/sys/fcntl.h b/sys/sys/fcntl.h
index 28a66d0..29b2a0c 100644
--- a/sys/sys/fcntl.h
+++ b/sys/sys/fcntl.h
@@ -277,9 +277,17 @@ struct oflock {
#define LOCK_UN 0x08 /* unlock file */
#endif
+#if __POSIX_VISIBLE >= 200112
/*
- * XXX missing posix_fadvise() and POSIX_FADV_* macros.
+ * Advice to posix_fadvise
*/
+#define POSIX_FADV_NORMAL 0 /* no special treatment */
+#define POSIX_FADV_RANDOM 1 /* expect random page references */
+#define POSIX_FADV_SEQUENTIAL 2 /* expect sequential page references */
+#define POSIX_FADV_WILLNEED 3 /* will need these pages */
+#define POSIX_FADV_DONTNEED 4 /* dont need these pages */
+#define POSIX_FADV_NOREUSE 5 /* access data only once */
+#endif
#ifndef _KERNEL
__BEGIN_DECLS
@@ -293,6 +301,7 @@ int flock(int, int);
int openat(int, const char *, int, ...);
#endif
#if __BSD_VISIBLE || __POSIX_VISIBLE >= 200112
+int posix_fadvise(int, off_t, off_t, int);
int posix_fallocate(int, off_t, off_t);
#endif
__END_DECLS
diff --git a/sys/sys/file.h b/sys/sys/file.h
index 57e7047..99889ba 100644
--- a/sys/sys/file.h
+++ b/sys/sys/file.h
@@ -122,6 +122,12 @@ struct fileops {
* none not locked
*/
+struct fadvise_info {
+ int fa_advice; /* (f) FADV_* type. */
+ off_t fa_start; /* (f) Region start. */
+ off_t fa_end; /* (f) Region end. */
+};
+
struct file {
void *f_data; /* file descriptor specific data */
struct fileops *f_ops; /* File operations */
@@ -136,7 +142,11 @@ struct file {
*/
int f_seqcount; /* Count of sequential accesses. */
off_t f_nextoff; /* next expected read/write offset. */
- struct cdev_privdata *f_cdevpriv; /* (d) Private data for the cdev. */
+ union {
+ struct cdev_privdata *fvn_cdevpriv;
+ /* (d) Private data for the cdev. */
+ struct fadvise_info *fvn_advice;
+ } f_vnun;
/*
* DFLAG_SEEKABLE specific fields
*/
@@ -147,6 +157,9 @@ struct file {
void *f_label; /* Place-holder for MAC label. */
};
+#define f_cdevpriv f_vnun.fvn_cdevpriv
+#define f_advice f_vnun.fvn_advice
+
#define FOFFSET_LOCKED 0x1
#define FOFFSET_LOCK_WAITING 0x2
diff --git a/sys/sys/param.h b/sys/sys/param.h
index 57ff3a3..893061a 100644
--- a/sys/sys/param.h
+++ b/sys/sys/param.h
@@ -58,7 +58,7 @@
* in the range 5 to 9.
*/
#undef __FreeBSD_version
-#define __FreeBSD_version 1000000 /* Master, propagated to newvers */
+#define __FreeBSD_version 1000001 /* Master, propagated to newvers */
#ifdef _KERNEL
#define P_OSREL_SIGWAIT 700000
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index 4cb6633..7382336 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -384,6 +384,7 @@ extern int vttoif_tab[];
#define V_SAVE 0x0001 /* vinvalbuf: sync file first */
#define V_ALT 0x0002 /* vinvalbuf: invalidate only alternate bufs */
#define V_NORMAL 0x0004 /* vinvalbuf: invalidate only regular bufs */
+#define V_CLEANONLY 0x0008 /* vinvalbuf: invalidate only clean bufs */
#define REVOKEALL 0x0001 /* vop_revoke: revoke all aliases */
#define V_WAIT 0x0001 /* vn_start_write: sleep for suspend */
#define V_NOWAIT 0x0002 /* vn_start_write: don't sleep for suspend */
@@ -685,6 +686,7 @@ int vop_stdunlock(struct vop_unlock_args *);
int vop_nopoll(struct vop_poll_args *);
int vop_stdaccess(struct vop_access_args *ap);
int vop_stdaccessx(struct vop_accessx_args *ap);
+int vop_stdadvise(struct vop_advise_args *ap);
int vop_stdadvlock(struct vop_advlock_args *ap);
int vop_stdadvlockasync(struct vop_advlockasync_args *ap);
int vop_stdadvlockpurge(struct vop_advlockpurge_args *ap);
diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c
index 3de793b..600dea8 100644
--- a/sys/vm/vm_object.c
+++ b/sys/vm/vm_object.c
@@ -1863,6 +1863,60 @@ skipmemq:
}
/*
+ * vm_object_page_cache:
+ *
+ * For the given object, attempt to move the specified clean
+ * pages to the cache queue. If a page is wired for any reason,
+ * then it will not be changed. Pages are specified by the given
+ * range ["start", "end"). As a special case, if "end" is zero,
+ * then the range extends from "start" to the end of the object.
+ * Any mappings to the specified pages are removed before the
+ * pages are moved to the cache queue.
+ *
+ * This operation should only be performed on objects that
+ * contain managed pages.
+ *
+ * The object must be locked.
+ */
+void
+vm_object_page_cache(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
+{
+ struct mtx *mtx, *new_mtx;
+ vm_page_t p, next;
+
+ VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+ KASSERT((object->type != OBJT_DEVICE && object->type != OBJT_SG &&
+ object->type != OBJT_PHYS),
+ ("vm_object_page_cache: illegal object %p", object));
+ if (object->resident_page_count == 0)
+ return;
+ p = vm_page_find_least(object, start);
+
+ /*
+ * Here, the variable "p" is either (1) the page with the least pindex
+ * greater than or equal to the parameter "start" or (2) NULL.
+ */
+ mtx = NULL;
+ for (; p != NULL && (p->pindex < end || end == 0); p = next) {
+ next = TAILQ_NEXT(p, listq);
+
+ /*
+ * Avoid releasing and reacquiring the same page lock.
+ */
+ new_mtx = vm_page_lockptr(p);
+ if (mtx != new_mtx) {
+ if (mtx != NULL)
+ mtx_unlock(mtx);
+ mtx = new_mtx;
+ mtx_lock(mtx);
+ }
+ vm_page_try_to_cache(p);
+ }
+ if (mtx != NULL)
+ mtx_unlock(mtx);
+}
+
+/*
* Populate the specified range of the object with valid pages. Returns
* TRUE if the range is successfully populated and FALSE otherwise.
*
diff --git a/sys/vm/vm_object.h b/sys/vm/vm_object.h
index a11f144..0c13786 100644
--- a/sys/vm/vm_object.h
+++ b/sys/vm/vm_object.h
@@ -223,6 +223,8 @@ void vm_object_destroy (vm_object_t);
void vm_object_terminate (vm_object_t);
void vm_object_set_writeable_dirty (vm_object_t);
void vm_object_init (void);
+void vm_object_page_cache(vm_object_t object, vm_pindex_t start,
+ vm_pindex_t end);
void vm_object_page_clean(vm_object_t object, vm_ooffset_t start,
vm_ooffset_t end, int flags);
void vm_object_page_remove(vm_object_t object, vm_pindex_t start,