summaryrefslogtreecommitdiffstats
path: root/sys/kern/vfs_syscalls.c
diff options
context:
space:
mode:
authorjhb <jhb@FreeBSD.org>2011-11-04 04:02:50 +0000
committerjhb <jhb@FreeBSD.org>2011-11-04 04:02:50 +0000
commit78c075174e74e727279365476d0d076d6c3e3075 (patch)
tree159ae25b13b965df34d0e93885cca08178c0b2a2 /sys/kern/vfs_syscalls.c
parent1e2d8c9d67bc3fa3bf3a560b9b8eac1745104048 (diff)
downloadFreeBSD-src-78c075174e74e727279365476d0d076d6c3e3075.zip
FreeBSD-src-78c075174e74e727279365476d0d076d6c3e3075.tar.gz
Add the posix_fadvise(2) system call. It is somewhat similar to
madvise(2) except that it operates on a file descriptor instead of a memory region. It is currently only supported on regular files. Just as with madvise(2), the advice given to posix_fadvise(2) can be divided into two types. The first type provide hints about data access patterns and are used in the file read and write routines to modify the I/O flags passed down to VOP_READ() and VOP_WRITE(). These modes are thus filesystem independent. Note that to ease implementation (and since this API is only advisory anyway), only a single non-normal range is allowed per file descriptor. The second type of hints are used to hint to the OS that data will or will not be used. These hints are implemented via a new VOP_ADVISE(). A default implementation is provided which does nothing for the WILLNEED request and attempts to move any clean pages to the cache page queue for the DONTNEED request. This latter case required two other changes. First, a new V_CLEANONLY flag was added to vinvalbuf(). This requests vinvalbuf() to only flush clean buffers for the vnode from the buffer cache and to not remove any backing pages from the vnode. This is used to ensure clean pages are not wired into the buffer cache before attempting to move them to the cache page queue. The second change adds a new vm_object_page_cache() method. This method is somewhat similar to vm_object_page_remove() except that instead of freeing each page in the specified range, it attempts to move clean pages to the cache queue if possible. To preserve the ABI of struct file, the f_cdevpriv pointer is now reused in a union to point to the currently active advice region if one is present for regular files. Reviewed by: jilles, kib, arch@ Approved by: re (kib) MFC after: 1 month
Diffstat (limited to 'sys/kern/vfs_syscalls.c')
-rw-r--r--sys/kern/vfs_syscalls.c134
1 files changed, 134 insertions, 0 deletions
diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c
index ec5ad06..e886079 100644
--- a/sys/kern/vfs_syscalls.c
+++ b/sys/kern/vfs_syscalls.c
@@ -86,6 +86,8 @@ __FBSDID("$FreeBSD$");
#include <vm/vm_page.h>
#include <vm/uma.h>
+static MALLOC_DEFINE(M_FADVISE, "fadvise", "posix_fadvise(2) information");
+
SDT_PROVIDER_DEFINE(vfs);
SDT_PROBE_DEFINE(vfs, , stat, mode, mode);
SDT_PROBE_ARGTYPE(vfs, , stat, mode, 0, "char *");
@@ -4845,3 +4847,135 @@ sys_posix_fallocate(struct thread *td, struct posix_fallocate_args *uap)
return (kern_posix_fallocate(td, uap->fd, uap->offset, uap->len));
}
+
+/*
+ * Unlike madvise(2), we do not make a best effort to remember every
+ * possible caching hint. Instead, we remember the last setting with
+ * the exception that we will allow POSIX_FADV_NORMAL to adjust the
+ * region of any current setting.
+ */
+int
+sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap)
+{
+ struct fadvise_info *fa, *new;
+ struct file *fp;
+ struct vnode *vp;
+ off_t end;
+ int error;
+
+ if (uap->offset < 0 || uap->len < 0 ||
+ uap->offset > OFF_MAX - uap->len)
+ return (EINVAL);
+ switch (uap->advice) {
+ case POSIX_FADV_SEQUENTIAL:
+ case POSIX_FADV_RANDOM:
+ case POSIX_FADV_NOREUSE:
+ new = malloc(sizeof(*fa), M_FADVISE, M_WAITOK);
+ break;
+ case POSIX_FADV_NORMAL:
+ case POSIX_FADV_WILLNEED:
+ case POSIX_FADV_DONTNEED:
+ new = NULL;
+ break;
+ default:
+ return (EINVAL);
+ }
+ /* XXX: CAP_POSIX_FADVISE? */
+ error = fget(td, uap->fd, 0, &fp);
+ if (error != 0)
+ goto out;
+
+ switch (fp->f_type) {
+ case DTYPE_VNODE:
+ break;
+ case DTYPE_PIPE:
+ case DTYPE_FIFO:
+ error = ESPIPE;
+ goto out;
+ default:
+ error = ENODEV;
+ goto out;
+ }
+ vp = fp->f_vnode;
+ if (vp->v_type != VREG) {
+ error = ENODEV;
+ goto out;
+ }
+ if (uap->len == 0)
+ end = OFF_MAX;
+ else
+ end = uap->offset + uap->len - 1;
+ switch (uap->advice) {
+ case POSIX_FADV_SEQUENTIAL:
+ case POSIX_FADV_RANDOM:
+ case POSIX_FADV_NOREUSE:
+ /*
+ * Try to merge any existing non-standard region with
+ * this new region if possible, otherwise create a new
+ * non-standard region for this request.
+ */
+ mtx_pool_lock(mtxpool_sleep, fp);
+ fa = fp->f_advice;
+ if (fa != NULL && fa->fa_advice == uap->advice &&
+ ((fa->fa_start <= end && fa->fa_end >= uap->offset) ||
+ (end != OFF_MAX && fa->fa_start == end + 1) ||
+ (fa->fa_end != OFF_MAX && fa->fa_end + 1 == uap->offset))) {
+ if (uap->offset < fa->fa_start)
+ fa->fa_start = uap->offset;
+ if (end > fa->fa_end)
+ fa->fa_end = end;
+ } else {
+ new->fa_advice = uap->advice;
+ new->fa_start = uap->offset;
+ new->fa_end = end;
+ fp->f_advice = new;
+ new = fa;
+ }
+ mtx_pool_unlock(mtxpool_sleep, fp);
+ break;
+ case POSIX_FADV_NORMAL:
+ /*
+ * If a the "normal" region overlaps with an existing
+ * non-standard region, trim or remove the
+ * non-standard region.
+ */
+ mtx_pool_lock(mtxpool_sleep, fp);
+ fa = fp->f_advice;
+ if (fa != NULL) {
+ if (uap->offset <= fa->fa_start &&
+ end >= fa->fa_end) {
+ new = fa;
+ fp->f_advice = NULL;
+ } else if (uap->offset <= fa->fa_start &&
+ end >= fa->fa_start)
+ fa->fa_start = end + 1;
+ else if (uap->offset <= fa->fa_end &&
+ end >= fa->fa_end)
+ fa->fa_end = uap->offset - 1;
+ else if (uap->offset >= fa->fa_start &&
+ end <= fa->fa_end) {
+ /*
+ * If the "normal" region is a middle
+ * portion of the existing
+ * non-standard region, just remove
+ * the whole thing rather than picking
+ * one side or the other to
+ * preserve.
+ */
+ new = fa;
+ fp->f_advice = NULL;
+ }
+ }
+ mtx_pool_unlock(mtxpool_sleep, fp);
+ break;
+ case POSIX_FADV_WILLNEED:
+ case POSIX_FADV_DONTNEED:
+ error = VOP_ADVISE(vp, uap->offset, end, uap->advice);
+ break;
+ }
+out:
+ if (fp != NULL)
+ fdrop(fp, td);
+ free(new, M_FADVISE);
+ return (error);
+}
OpenPOWER on IntegriCloud