-rw-r--r--  lib/libc/sys/madvise.2     27
-rw-r--r--  lib/libc/sys/mmap.2        29
-rw-r--r--  sys/kern/vfs_export.c       2
-rw-r--r--  sys/kern/vfs_extattr.c      3
-rw-r--r--  sys/kern/vfs_subr.c         2
-rw-r--r--  sys/kern/vfs_syscalls.c     3
-rw-r--r--  sys/sys/mman.h              3
-rw-r--r--  sys/vm/vm_fault.c          18
-rw-r--r--  sys/vm/vm_map.c            17
-rw-r--r--  sys/vm/vm_map.h             3
-rw-r--r--  sys/vm/vm_mmap.c            7
-rw-r--r--  sys/vm/vm_object.c         35
-rw-r--r--  sys/vm/vm_object.h          5
-rw-r--r--  sys/vm/vm_page.c           10
-rw-r--r--  sys/vm/vm_page.h            8
15 files changed, 147 insertions, 25 deletions
diff --git a/lib/libc/sys/madvise.2 b/lib/libc/sys/madvise.2
index 415af68..9a82782 100644
--- a/lib/libc/sys/madvise.2
+++ b/lib/libc/sys/madvise.2
@@ -58,6 +58,8 @@ The known behaviors are given in
#define MADV_WILLNEED 3 /* will need these pages */
#define MADV_DONTNEED 4 /* don't need these pages */
#define MADV_FREE 5 /* data is now unimportant */
+#define MADV_NOSYNC 6 /* no explicit commit to physical backing store */
+#define MADV_AUTOSYNC 7 /* default commit method to physical backing store */
.Ed
.Pp
.Bl -tag -width MADV_SEQUENTIAL
@@ -96,6 +98,31 @@ call.
References made to that address space range will not make the VM system
page the information back in from backing store until the page is
modified again.
+.It Dv MADV_NOSYNC
+Request that the system not flush the data associated with this map to
+physical backing store unless it needs to. Typically this prevents the
+filesystem update daemon from gratuitously writing pages dirtied
+by the VM system to physical disk. Note that VM/filesystem coherency is
+always maintained; this feature simply ensures that the mapped data is
+only flushed when it needs to be, usually by the system pager.
+.Pp
+This feature is typically used when you want to use a file-backed shared
+memory area to communicate between processes (IPC) and do not particularly
+need the data being stored in that area to be physically written to disk.
+With this feature you get performance with mmap equivalent to what you
+would expect from SysV shared memory calls, but in a more controllable
+and less restrictive manner. However, note that this feature is not portable
+across UNIX platforms (though some may do the right thing by default).
+For more information see the MAP_NOSYNC section of
+.Xr mmap 2 .
+.It Dv MADV_AUTOSYNC
+Undoes the effects of MADV_NOSYNC for any future pages dirtied within the
+address range. The effect on pages already dirtied is indeterminate; they
+may or may not be reverted. You can guarantee reversion by using the
+.Xr msync 2
+or
+.Xr fsync 2
+system calls.
.El
.Sh RETURN VALUES
Upon successful completion,
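To illustrate the advice values added above, here is a minimal userland sketch. It is not part of the patch, and the file path and region size are arbitrary examples: a file-backed shared region is marked MADV_NOSYNC while it is used for IPC, then reverted with MADV_AUTOSYNC and forced out with fsync(2) once the data actually matters.

#include <sys/types.h>
#include <sys/mman.h>
#include <err.h>
#include <fcntl.h>
#include <unistd.h>

int
main(void)
{
	size_t len = 4096;
	void *p;
	int fd;

	/* File-backed region used purely for IPC between processes. */
	fd = open("/tmp/ipc.shm", O_RDWR | O_CREAT, 0600);
	if (fd == -1)
		err(1, "open");
	if (ftruncate(fd, len) == -1)
		err(1, "ftruncate");
	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		err(1, "mmap");

	/* Ask the VM system not to flush pages dirtied here gratuitously. */
	if (madvise(p, len, MADV_NOSYNC) == -1)
		err(1, "madvise");

	/* ... exchange data through the mapping ... */

	/* Revert to the default strategy and guarantee the data is on disk. */
	if (madvise(p, len, MADV_AUTOSYNC) == -1)
		err(1, "madvise");
	if (fsync(fd) == -1)
		err(1, "fsync");

	munmap(p, len);
	close(fd);
	return (0);
}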
diff --git a/lib/libc/sys/mmap.2 b/lib/libc/sys/mmap.2
index 6e3619e..e165d32 100644
--- a/lib/libc/sys/mmap.2
+++ b/lib/libc/sys/mmap.2
@@ -150,6 +150,35 @@ stack top is the starting address returned by the call, plus
.Fa len
bytes. The bottom of the stack at maximum growth is the starting
address returned by the call.
+.It Dv MAP_NOSYNC
+Causes data dirtied via this VM map to be flushed to physical media
+only when necessary (usually by the pager) rather than gratuitously.
+Typically this prevents the update daemons from flushing pages dirtied
+through such maps and thus allows efficient sharing of memory across
+unassociated processes using a file-backed shared memory map. Without
+this option any VM pages you dirty may be flushed to disk every so often
+(usually every 30-60 seconds), which can create performance problems if you
+do not need that to occur (such as when you are using shared file-backed
+mmap regions for IPC purposes). Note that VM/filesystem coherency is
+maintained whether you use MAP_NOSYNC or not. This option is not portable
+across UNIX platforms (yet), though some may implement the same behavior
+by default.
+.Pp
+The
+.Xr fsync 2
+function will flush all dirty data and metadata associated with a file,
+including dirty NOSYNC VM data, to physical media. The
+.Xr sync 1
+command and
+.Xr sync 2
+system call generally do not flush dirty NOSYNC VM data.
+The
+.Xr msync 2
+system call is obsolete since
+.Bx
+implements a coherent filesystem buffer cache. However, it may be
+used to associate dirty VM pages with filesystem buffers and thus cause
+them to be flushed to physical media sooner rather than later.
.El
.Pp
The
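A companion sketch for the MAP_NOSYNC flag described above (again not part of the diff; the path, size, and contents are illustrative): the file is mapped with MAP_NOSYNC so the update daemon leaves its dirty pages alone, and msync(2) with MS_SYNC is used when the data must actually reach the disk.

#include <sys/types.h>
#include <sys/mman.h>
#include <err.h>
#include <fcntl.h>
#include <unistd.h>

int
main(void)
{
	size_t len = 65536;
	char *p;
	int fd;

	fd = open("/tmp/scoreboard", O_RDWR | O_CREAT, 0600);
	if (fd == -1)
		err(1, "open");
	if (ftruncate(fd, len) == -1)
		err(1, "ftruncate");

	/*
	 * MAP_NOSYNC: the update daemon leaves pages dirtied through
	 * this map alone; they reach the file only when the pager must
	 * write them, or when we sync explicitly below.
	 */
	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
	    MAP_SHARED | MAP_NOSYNC, fd, 0);
	if (p == MAP_FAILED)
		err(1, "mmap");

	p[0] = 1;	/* dirty a page; it is not flushed periodically */

	/* Push the dirty NOSYNC pages to physical media on demand. */
	if (msync(p, len, MS_SYNC) == -1)
		err(1, "msync");

	munmap(p, len);
	close(fd);
	return (0);
}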
diff --git a/sys/kern/vfs_export.c b/sys/kern/vfs_export.c
index 1f6b7a4..fd592943d 100644
--- a/sys/kern/vfs_export.c
+++ b/sys/kern/vfs_export.c
@@ -2460,7 +2460,7 @@ loop:
if (!vget(vp,
LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) {
if (vp->v_object) {
- vm_object_page_clean(vp->v_object, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : 0);
+ vm_object_page_clean(vp->v_object, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC);
anyio = 1;
}
vput(vp);
diff --git a/sys/kern/vfs_extattr.c b/sys/kern/vfs_extattr.c
index 4d5d394..a33a4de 100644
--- a/sys/kern/vfs_extattr.c
+++ b/sys/kern/vfs_extattr.c
@@ -1307,7 +1307,8 @@ symlink(p, uap)
vput(nd.ni_vp);
vput(nd.ni_dvp);
ASSERT_VOP_UNLOCKED(nd.ni_dvp, "symlink");
- ASSERT_VOP_UNLOCKED(nd.ni_vp, "symlink");
+ if (error == 0)
+ ASSERT_VOP_UNLOCKED(nd.ni_vp, "symlink");
out:
zfree(namei_zone, path);
return (error);
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index 1f6b7a4..fd592943d 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -2460,7 +2460,7 @@ loop:
if (!vget(vp,
LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) {
if (vp->v_object) {
- vm_object_page_clean(vp->v_object, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : 0);
+ vm_object_page_clean(vp->v_object, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC);
anyio = 1;
}
vput(vp);
diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c
index 4d5d394..a33a4de 100644
--- a/sys/kern/vfs_syscalls.c
+++ b/sys/kern/vfs_syscalls.c
@@ -1307,7 +1307,8 @@ symlink(p, uap)
vput(nd.ni_vp);
vput(nd.ni_dvp);
ASSERT_VOP_UNLOCKED(nd.ni_dvp, "symlink");
- ASSERT_VOP_UNLOCKED(nd.ni_vp, "symlink");
+ if (error == 0)
+ ASSERT_VOP_UNLOCKED(nd.ni_vp, "symlink");
out:
zfree(namei_zone, path);
return (error);
diff --git a/sys/sys/mman.h b/sys/sys/mman.h
index d5257ea..80ac51b 100644
--- a/sys/sys/mman.h
+++ b/sys/sys/mman.h
@@ -65,6 +65,7 @@
#define MAP_NOEXTEND 0x0100 /* for MAP_FILE, don't change file size */
#define MAP_HASSEMAPHORE 0x0200 /* region may contain semaphores */
#define MAP_STACK 0x0400 /* region grows down, like a stack */
+#define MAP_NOSYNC 0x0800 /* page to underlying file, but do not sync it */
#ifdef _P1003_1B_VISIBLE
/*
@@ -103,6 +104,8 @@
#define MADV_WILLNEED 3 /* will need these pages */
#define MADV_DONTNEED 4 /* dont need these pages */
#define MADV_FREE 5 /* dont need these pages, and junk contents */
+#define MADV_NOSYNC 6 /* try to avoid flushes to physical media */
+#define MADV_AUTOSYNC 7 /* revert to default flushing strategy */
/*
* Return bits from mincore
diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
index ca1ee1c..23e2ba0 100644
--- a/sys/vm/vm_fault.c
+++ b/sys/vm/vm_fault.c
@@ -779,15 +779,29 @@ readrest:
vm_page_flag_set(fs.m, PG_WRITEABLE);
vm_object_set_flag(fs.m->object,
OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
+
/*
* If the fault is a write, we know that this page is being
- * written NOW. This will save on the pmap_is_modified() calls
- * later.
+ * written NOW so dirty it explicitly to save on
+ * pmap_is_modified() calls later.
+ *
+ * If this is a NOSYNC mmap we do not want to set PG_NOSYNC
+ * if the page is already dirty, so that data written earlier
+ * with the expectation of being synced still gets synced.
+ * Likewise, if this entry does not request NOSYNC then make
+ * sure the page isn't marked NOSYNC. Applications sharing
+ * data should use the same flags to avoid ping-ponging.
*
* Also tell the backing pager, if any, that it should remove
* any swap backing since the page is now dirty.
*/
if (fault_flags & VM_FAULT_DIRTY) {
+ if (fs.entry->eflags & MAP_ENTRY_NOSYNC) {
+ if (fs.m->dirty == 0)
+ vm_page_flag_set(fs.m, PG_NOSYNC);
+ } else {
+ vm_page_flag_clear(fs.m, PG_NOSYNC);
+ }
vm_page_dirty(fs.m);
vm_pager_page_unswapped(fs.m);
}
diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c
index eaa65f7..a1f422d3 100644
--- a/sys/vm/vm_map.c
+++ b/sys/vm/vm_map.c
@@ -460,6 +460,9 @@ vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
KASSERT(object == NULL,
("vm_map_insert: paradoxical MAP_NOFAULT request"));
}
+ if (cow & MAP_DISABLE_SYNCER)
+ protoeflags |= MAP_ENTRY_NOSYNC;
+
if (object) {
/*
* When object is non-NULL, it could be shared with another
@@ -539,13 +542,15 @@ vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
* Update the free space hint
*/
if ((map->first_free == prev_entry) &&
- (prev_entry->end >= new_entry->start))
+ (prev_entry->end >= new_entry->start)) {
map->first_free = new_entry;
+ }
- if (cow & (MAP_PREFAULT|MAP_PREFAULT_PARTIAL))
+ if (cow & (MAP_PREFAULT|MAP_PREFAULT_PARTIAL)) {
pmap_object_init_pt(map->pmap, start,
object, OFF_TO_IDX(offset), end - start,
cow & MAP_PREFAULT_PARTIAL);
+ }
return (KERN_SUCCESS);
}
@@ -1026,6 +1031,8 @@ vm_map_madvise(map, start, end, behav)
case MADV_NORMAL:
case MADV_SEQUENTIAL:
case MADV_RANDOM:
+ case MADV_NOSYNC:
+ case MADV_AUTOSYNC:
modify_map = 1;
vm_map_lock(map);
break;
@@ -1077,6 +1084,12 @@ vm_map_madvise(map, start, end, behav)
case MADV_RANDOM:
vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM);
break;
+ case MADV_NOSYNC:
+ current->eflags |= MAP_ENTRY_NOSYNC;
+ break;
+ case MADV_AUTOSYNC:
+ current->eflags &= ~MAP_ENTRY_NOSYNC;
+ break;
default:
break;
}
diff --git a/sys/vm/vm_map.h b/sys/vm/vm_map.h
index b02f970..b02317b 100644
--- a/sys/vm/vm_map.h
+++ b/sys/vm/vm_map.h
@@ -112,7 +112,7 @@ struct vm_map_entry {
vm_pindex_t lastr; /* last read */
};
-#define MAP_ENTRY_UNUSED_01 0x1
+#define MAP_ENTRY_NOSYNC 0x1
#define MAP_ENTRY_IS_SUB_MAP 0x2
#define MAP_ENTRY_COW 0x4
#define MAP_ENTRY_NEEDS_COPY 0x8
@@ -329,6 +329,7 @@ vmspace_resident_count(struct vmspace *vmspace)
#define MAP_NOFAULT 0x4
#define MAP_PREFAULT 0x8
#define MAP_PREFAULT_PARTIAL 0x10
+#define MAP_DISABLE_SYNCER 0x20
/*
* vm_fault option flags
diff --git a/sys/vm/vm_mmap.c b/sys/vm/vm_mmap.c
index ce349acb..1721fd3 100644
--- a/sys/vm/vm_mmap.c
+++ b/sys/vm/vm_mmap.c
@@ -626,7 +626,7 @@ madvise(p, uap)
/*
* Check for illegal behavior
*/
- if (uap->behav < 0 || uap->behav > MADV_FREE)
+ if (uap->behav < 0 || uap->behav > MADV_AUTOSYNC)
return (EINVAL);
/*
* Check for illegal addresses. Watch out for address wrap... Note
@@ -1046,9 +1046,10 @@ vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
flags |= MAP_SHARED;
}
- if ((flags & (MAP_ANON|MAP_SHARED)) == 0) {
+ if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
docow |= MAP_COPY_ON_WRITE;
- }
+ if (flags & MAP_NOSYNC)
+ docow |= MAP_DISABLE_SYNCER;
#if defined(VM_PROT_READ_IS_EXEC)
if (prot & VM_PROT_READ)
diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c
index 1df9faf..6da6c9b 100644
--- a/sys/vm/vm_object.c
+++ b/sys/vm/vm_object.c
@@ -478,8 +478,10 @@ vm_object_terminate(object)
/*
* vm_object_page_clean
*
- * Clean all dirty pages in the specified range of object.
- * Leaves page on whatever queue it is currently on.
+ * Clean all dirty pages in the specified range of object. Leaves page
+ * on whatever queue it is currently on. If NOSYNC is set then do not
+ * write out pages with PG_NOSYNC set (which originally comes from
+ * MAP_NOSYNC mappings), leaving the object dirty.
*
* Odd semantics: if start == end, we clean everything.
*
@@ -503,6 +505,7 @@ vm_object_page_clean(object, start, end, flags)
int chkb;
int maxb;
int i;
+ int clearobjflags;
int pagerflags;
vm_page_t maf[vm_pageout_page_count];
vm_page_t mab[vm_pageout_page_count];
@@ -527,12 +530,26 @@ vm_object_page_clean(object, start, end, flags)
tend = end;
}
+ /*
+ * Generally set CLEANCHK interlock and make the page read-only so
+ * we can then clear the object flags.
+ *
+ * However, if this is a nosync mmap then the object is likely to
+ * stay dirty so do not mess with the page and do not clear the
+ * object flags.
+ */
+
+ clearobjflags = 1;
+
for(p = TAILQ_FIRST(&object->memq); p; p = TAILQ_NEXT(p, listq)) {
vm_page_flag_set(p, PG_CLEANCHK);
- vm_page_protect(p, VM_PROT_READ);
+ if ((flags & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC))
+ clearobjflags = 0;
+ else
+ vm_page_protect(p, VM_PROT_READ);
}
- if ((tstart == 0) && (tend == object->size)) {
+ if (clearobjflags && (tstart == 0) && (tend == object->size)) {
vm_object_clear_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
}
@@ -557,6 +574,16 @@ rescan:
continue;
}
+ /*
+ * If we have been asked to skip nosync pages and this is a
+ * nosync page, skip it. Note that the object flags were
+ * not cleared in this case so we do not have to set them.
+ */
+ if ((flags & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
+ vm_page_flag_clear(p, PG_CLEANCHK);
+ continue;
+ }
+
s = splvm();
while (vm_page_sleep_busy(p, TRUE, "vpcwai")) {
if (object->generation != curgeneration) {
diff --git a/sys/vm/vm_object.h b/sys/vm/vm_object.h
index 315cedb..165e0e1 100644
--- a/sys/vm/vm_object.h
+++ b/sys/vm/vm_object.h
@@ -153,8 +153,9 @@ struct vm_object {
#ifdef KERNEL
-#define OBJPC_SYNC 0x1 /* sync I/O */
-#define OBJPC_INVAL 0x2 /* invalidate */
+#define OBJPC_SYNC 0x1 /* sync I/O */
+#define OBJPC_INVAL 0x2 /* invalidate */
+#define OBJPC_NOSYNC 0x4 /* skip if PG_NOSYNC */
TAILQ_HEAD(object_q, vm_object);
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index 078743f..edde291 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -1522,15 +1522,19 @@ vm_page_set_validclean(m, base, size)
/*
* Set valid, clear dirty bits. If validating the entire
- * page we can safely clear the pmap modify bit.
+ * page we can safely clear the pmap modify bit. We also
+ * use this opportunity to clear the PG_NOSYNC flag. If a process
+ * takes a write fault on a MAP_NOSYNC memory area the flag will
+ * be set again.
*/
pagebits = vm_page_bits(base, size);
m->valid |= pagebits;
m->dirty &= ~pagebits;
-
- if (base == 0 && size == PAGE_SIZE)
+ if (base == 0 && size == PAGE_SIZE) {
pmap_clear_modify(VM_PAGE_TO_PHYS(m));
+ vm_page_flag_clear(m, PG_NOSYNC);
+ }
}
#if 0
diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h
index 0cfc618..c5d2827 100644
--- a/sys/vm/vm_page.h
+++ b/sys/vm/vm_page.h
@@ -234,6 +234,7 @@ extern struct vpgqueues vm_page_queues[PQ_COUNT];
#define PG_REFERENCED 0x0080 /* page has been referenced */
#define PG_CLEANCHK 0x0100 /* page will be checked for cleaning */
#define PG_SWAPINPROG 0x0200 /* swap I/O in progress on page */
+#define PG_NOSYNC 0x0400 /* do not collect for syncer */
/*
* Misc constants.
@@ -437,10 +438,9 @@ vm_page_unhold(vm_page_t mem)
/*
* vm_page_protect:
*
- * Reduce the protection of a page. This routine never
- * raises the protection and therefore can be safely
- * called if the page is already at VM_PROT_NONE ( it
- * will be a NOP effectively ).
+ * Reduce the protection of a page. This routine never raises the
+ * protection and therefore can be safely called if the page is already
+ * at VM_PROT_NONE (it will effectively be a NOP).
*/
static __inline void