diff options
-rw-r--r-- | lib/libc/sys/madvise.2 | 27 | ||||
-rw-r--r-- | lib/libc/sys/mmap.2 | 29 | ||||
-rw-r--r-- | sys/kern/vfs_export.c | 2 | ||||
-rw-r--r-- | sys/kern/vfs_extattr.c | 3 | ||||
-rw-r--r-- | sys/kern/vfs_subr.c | 2 | ||||
-rw-r--r-- | sys/kern/vfs_syscalls.c | 3 | ||||
-rw-r--r-- | sys/sys/mman.h | 3 | ||||
-rw-r--r-- | sys/vm/vm_fault.c | 18 | ||||
-rw-r--r-- | sys/vm/vm_map.c | 17 | ||||
-rw-r--r-- | sys/vm/vm_map.h | 3 | ||||
-rw-r--r-- | sys/vm/vm_mmap.c | 7 | ||||
-rw-r--r-- | sys/vm/vm_object.c | 35 | ||||
-rw-r--r-- | sys/vm/vm_object.h | 5 | ||||
-rw-r--r-- | sys/vm/vm_page.c | 10 | ||||
-rw-r--r-- | sys/vm/vm_page.h | 8 |
15 files changed, 147 insertions, 25 deletions
diff --git a/lib/libc/sys/madvise.2 b/lib/libc/sys/madvise.2 index 415af68..9a82782 100644 --- a/lib/libc/sys/madvise.2 +++ b/lib/libc/sys/madvise.2 @@ -58,6 +58,8 @@ The known behaviors are given in #define MADV_WILLNEED 3 /* will need these pages */ #define MADV_DONTNEED 4 /* don't need these pages */ #define MADV_FREE 5 /* data is now unimportant */ +#define MADV_NOSYNC 6 /* no explicit commit to physical backing store */ +#define MADV_AUTOSYNC 7 /* default commit method to physical backing store */ .Ed .Pp .Bl -tag -width MADV_SEQUENTIAL @@ -96,6 +98,31 @@ call. References made to that address space range will not make the VM system page the information back in from backing store until the page is modified again. +.It Dv MADV_NOSYNC +Request that the system not flush the data associated with this map to +physical backing store unless it needs to. Typically this prevents the +filesystem update daemon from gratuitously writing pages dirtied +by the VM system to physical disk. Note that VM/filesystem coherency is +always maintained; this feature simply ensures that the mapped data is +only flushed when it needs to be, usually by the system pager. +.Pp +This feature is typically used when you want to use a file-backed shared +memory area to communicate between processes (IPC) and do not particularly +need the data being stored in that area to be physically written to disk. +With this feature you get the equivalent performance with mmap that you +would expect to get with SysV shared memory calls, but in a more controllable +and less restrictive manner. However, note that this feature is not portable +across UNIX platforms (though some may do the right thing by default). +For more information see the MAP_NOSYNC section of +.Xr mmap 2 . +.It Dv MADV_AUTOSYNC +Undoes the effects of MADV_NOSYNC for any future pages dirtied within the +address range. The effect on pages already dirtied is indeterminate - they +may or may not be reverted. 
You can guarantee reversion by using the +.Xr msync 2 +or +.Xr fsync 2 +system calls. .El .Sh RETURN VALUES Upon successful completion, diff --git a/lib/libc/sys/mmap.2 b/lib/libc/sys/mmap.2 index 6e3619e..e165d32 100644 --- a/lib/libc/sys/mmap.2 +++ b/lib/libc/sys/mmap.2 @@ -150,6 +150,35 @@ stack top is the starting address returned by the call, plus .Fa len bytes. The bottom of the stack at maximum growth is the starting address returned by the call. +.It Dv MAP_NOSYNC +Causes data dirtied via this VM map to be flushed to physical media +only when necessary (usually by the pager) rather than gratuitously. +Typically this prevents the update daemons from flushing pages dirtied +through such maps and thus allows efficient sharing of memory across +unassociated processes using a file-backed shared memory map. Without +this option any VM pages you dirty may be flushed to disk every so often +(every 30-60 seconds usually) which can create performance problems if you +do not need that to occur (such as when you are using shared file-backed +mmap regions for IPC purposes). Note that VM/filesystem coherency is +maintained whether you use MAP_NOSYNC or not. This option is not portable +across UNIX platforms (yet), though some may implement the same behavior +by default. +.Pp +The +.Xr fsync 2 +function will flush all dirty data and metadata associated with a file, +including dirty NOSYNC VM data, to physical media. The +.Xr sync 8 +command and +.Xr sync 2 +system call generally do not flush dirty NOSYNC VM data. +The +.Xr msync 2 +system call is obsolete since +.Os BSD +implements a coherent filesystem buffer cache. However, it may be +used to associate dirty VM pages with filesystem buffers and thus cause +them to be flushed to physical media sooner rather than later. 
.El .Pp The diff --git a/sys/kern/vfs_export.c b/sys/kern/vfs_export.c index 1f6b7a4..fd592943d 100644 --- a/sys/kern/vfs_export.c +++ b/sys/kern/vfs_export.c @@ -2460,7 +2460,7 @@ loop: if (!vget(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) { if (vp->v_object) { - vm_object_page_clean(vp->v_object, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : 0); + vm_object_page_clean(vp->v_object, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC); anyio = 1; } vput(vp); diff --git a/sys/kern/vfs_extattr.c b/sys/kern/vfs_extattr.c index 4d5d394..a33a4de 100644 --- a/sys/kern/vfs_extattr.c +++ b/sys/kern/vfs_extattr.c @@ -1307,7 +1307,8 @@ symlink(p, uap) vput(nd.ni_vp); vput(nd.ni_dvp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "symlink"); - ASSERT_VOP_UNLOCKED(nd.ni_vp, "symlink"); + if (error == 0) + ASSERT_VOP_UNLOCKED(nd.ni_vp, "symlink"); out: zfree(namei_zone, path); return (error); diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index 1f6b7a4..fd592943d 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -2460,7 +2460,7 @@ loop: if (!vget(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) { if (vp->v_object) { - vm_object_page_clean(vp->v_object, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : 0); + vm_object_page_clean(vp->v_object, 0, 0, flags == MNT_WAIT ? 
OBJPC_SYNC : OBJPC_NOSYNC); anyio = 1; } vput(vp); diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c index 4d5d394..a33a4de 100644 --- a/sys/kern/vfs_syscalls.c +++ b/sys/kern/vfs_syscalls.c @@ -1307,7 +1307,8 @@ symlink(p, uap) vput(nd.ni_vp); vput(nd.ni_dvp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "symlink"); - ASSERT_VOP_UNLOCKED(nd.ni_vp, "symlink"); + if (error == 0) + ASSERT_VOP_UNLOCKED(nd.ni_vp, "symlink"); out: zfree(namei_zone, path); return (error); diff --git a/sys/sys/mman.h b/sys/sys/mman.h index d5257ea..80ac51b 100644 --- a/sys/sys/mman.h +++ b/sys/sys/mman.h @@ -65,6 +65,7 @@ #define MAP_NOEXTEND 0x0100 /* for MAP_FILE, don't change file size */ #define MAP_HASSEMAPHORE 0x0200 /* region may contain semaphores */ #define MAP_STACK 0x0400 /* region grows down, like a stack */ +#define MAP_NOSYNC 0x0800 /* page to but do not sync underlying file */ #ifdef _P1003_1B_VISIBLE /* @@ -103,6 +104,8 @@ #define MADV_WILLNEED 3 /* will need these pages */ #define MADV_DONTNEED 4 /* dont need these pages */ #define MADV_FREE 5 /* dont need these pages, and junk contents */ +#define MADV_NOSYNC 6 /* try to avoid flushes to physical media */ +#define MADV_AUTOSYNC 7 /* revert to default flushing strategy */ /* * Return bits from mincore diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c index ca1ee1c..23e2ba0 100644 --- a/sys/vm/vm_fault.c +++ b/sys/vm/vm_fault.c @@ -779,15 +779,29 @@ readrest: vm_page_flag_set(fs.m, PG_WRITEABLE); vm_object_set_flag(fs.m->object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY); + /* * If the fault is a write, we know that this page is being - * written NOW. This will save on the pmap_is_modified() calls - * later. + * written NOW so dirty it explicitly to save on + * pmap_is_modified() calls later. + * + * If this is a NOSYNC mmap we do not want to set PG_NOSYNC + * if the page is already dirty to prevent data written with + * the expectation of being synced from not being synced. 
+ * Likewise if this entry does not request NOSYNC then make + * sure the page isn't marked NOSYNC. Applications sharing + * data should use the same flags to avoid ping ponging. * * Also tell the backing pager, if any, that it should remove * any swap backing since the page is now dirty. */ if (fault_flags & VM_FAULT_DIRTY) { + if (fs.entry->eflags & MAP_ENTRY_NOSYNC) { + if (fs.m->dirty == 0) + vm_page_flag_set(fs.m, PG_NOSYNC); + } else { + vm_page_flag_clear(fs.m, PG_NOSYNC); + } vm_page_dirty(fs.m); vm_pager_page_unswapped(fs.m); } diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c index eaa65f7..a1f422d3 100644 --- a/sys/vm/vm_map.c +++ b/sys/vm/vm_map.c @@ -460,6 +460,9 @@ vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset, KASSERT(object == NULL, ("vm_map_insert: paradoxical MAP_NOFAULT request")); } + if (cow & MAP_DISABLE_SYNCER) + protoeflags |= MAP_ENTRY_NOSYNC; + if (object) { /* * When object is non-NULL, it could be shared with another @@ -539,13 +542,15 @@ vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset, * Update the free space hint */ if ((map->first_free == prev_entry) && - (prev_entry->end >= new_entry->start)) + (prev_entry->end >= new_entry->start)) { map->first_free = new_entry; + } - if (cow & (MAP_PREFAULT|MAP_PREFAULT_PARTIAL)) + if (cow & (MAP_PREFAULT|MAP_PREFAULT_PARTIAL)) { pmap_object_init_pt(map->pmap, start, object, OFF_TO_IDX(offset), end - start, cow & MAP_PREFAULT_PARTIAL); + } return (KERN_SUCCESS); } @@ -1026,6 +1031,8 @@ vm_map_madvise(map, start, end, behav) case MADV_NORMAL: case MADV_SEQUENTIAL: case MADV_RANDOM: + case MADV_NOSYNC: + case MADV_AUTOSYNC: modify_map = 1; vm_map_lock(map); break; @@ -1077,6 +1084,12 @@ vm_map_madvise(map, start, end, behav) case MADV_RANDOM: vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM); break; + case MADV_NOSYNC: + current->eflags |= MAP_ENTRY_NOSYNC; + break; + case MADV_AUTOSYNC: + current->eflags &= ~MAP_ENTRY_NOSYNC; + break; default: break; } 
diff --git a/sys/vm/vm_map.h b/sys/vm/vm_map.h index b02f970..b02317b 100644 --- a/sys/vm/vm_map.h +++ b/sys/vm/vm_map.h @@ -112,7 +112,7 @@ struct vm_map_entry { vm_pindex_t lastr; /* last read */ }; -#define MAP_ENTRY_UNUSED_01 0x1 +#define MAP_ENTRY_NOSYNC 0x1 #define MAP_ENTRY_IS_SUB_MAP 0x2 #define MAP_ENTRY_COW 0x4 #define MAP_ENTRY_NEEDS_COPY 0x8 @@ -329,6 +329,7 @@ vmspace_resident_count(struct vmspace *vmspace) #define MAP_NOFAULT 0x4 #define MAP_PREFAULT 0x8 #define MAP_PREFAULT_PARTIAL 0x10 +#define MAP_DISABLE_SYNCER 0x20 /* * vm_fault option flags diff --git a/sys/vm/vm_mmap.c b/sys/vm/vm_mmap.c index ce349acb..1721fd3 100644 --- a/sys/vm/vm_mmap.c +++ b/sys/vm/vm_mmap.c @@ -626,7 +626,7 @@ madvise(p, uap) /* * Check for illegal behavior */ - if (uap->behav < 0 || uap->behav > MADV_FREE) + if (uap->behav < 0 || uap->behav > MADV_AUTOSYNC) return (EINVAL); /* * Check for illegal addresses. Watch out for address wrap... Note @@ -1046,9 +1046,10 @@ vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, flags |= MAP_SHARED; } - if ((flags & (MAP_ANON|MAP_SHARED)) == 0) { + if ((flags & (MAP_ANON|MAP_SHARED)) == 0) docow |= MAP_COPY_ON_WRITE; - } + if (flags & MAP_NOSYNC) + docow |= MAP_DISABLE_SYNCER; #if defined(VM_PROT_READ_IS_EXEC) if (prot & VM_PROT_READ) diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c index 1df9faf..6da6c9b 100644 --- a/sys/vm/vm_object.c +++ b/sys/vm/vm_object.c @@ -478,8 +478,10 @@ vm_object_terminate(object) /* * vm_object_page_clean * - * Clean all dirty pages in the specified range of object. - * Leaves page on whatever queue it is currently on. + * Clean all dirty pages in the specified range of object. Leaves page + * on whatever queue it is currently on. If NOSYNC is set then do not + * write out pages with PG_NOSYNC set (originally comes from MAP_NOSYNC), + * leaving the object dirty. * * Odd semantics: if start == end, we clean everything. 
* @@ -503,6 +505,7 @@ vm_object_page_clean(object, start, end, flags) int chkb; int maxb; int i; + int clearobjflags; int pagerflags; vm_page_t maf[vm_pageout_page_count]; vm_page_t mab[vm_pageout_page_count]; @@ -527,12 +530,26 @@ vm_object_page_clean(object, start, end, flags) tend = end; } + /* + * Generally set CLEANCHK interlock and make the page read-only so + * we can then clear the object flags. + * + * However, if this is a nosync mmap then the object is likely to + * stay dirty so do not mess with the page and do not clear the + * object flags. + */ + + clearobjflags = 1; + for(p = TAILQ_FIRST(&object->memq); p; p = TAILQ_NEXT(p, listq)) { vm_page_flag_set(p, PG_CLEANCHK); - vm_page_protect(p, VM_PROT_READ); + if ((flags & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) + clearobjflags = 0; + else + vm_page_protect(p, VM_PROT_READ); } - if ((tstart == 0) && (tend == object->size)) { + if (clearobjflags && (tstart == 0) && (tend == object->size)) { vm_object_clear_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY); } @@ -557,6 +574,16 @@ rescan: continue; } + /* + * If we have been asked to skip nosync pages and this is a + * nosync page, skip it. Note that the object flags were + * not cleared in this case so we do not have to set them. 
+ */ + if ((flags & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) { + vm_page_flag_clear(p, PG_CLEANCHK); + continue; + } + s = splvm(); while (vm_page_sleep_busy(p, TRUE, "vpcwai")) { if (object->generation != curgeneration) { diff --git a/sys/vm/vm_object.h b/sys/vm/vm_object.h index 315cedb..165e0e1 100644 --- a/sys/vm/vm_object.h +++ b/sys/vm/vm_object.h @@ -153,8 +153,9 @@ struct vm_object { #ifdef KERNEL -#define OBJPC_SYNC 0x1 /* sync I/O */ -#define OBJPC_INVAL 0x2 /* invalidate */ +#define OBJPC_SYNC 0x1 /* sync I/O */ +#define OBJPC_INVAL 0x2 /* invalidate */ +#define OBJPC_NOSYNC 0x4 /* skip if PG_NOSYNC */ TAILQ_HEAD(object_q, vm_object); diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index 078743f..edde291 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -1522,15 +1522,19 @@ vm_page_set_validclean(m, base, size) /* * Set valid, clear dirty bits. If validating the entire - * page we can safely clear the pmap modify bit. + * page we can safely clear the pmap modify bit. We also + * use this opportunity to clear the PG_NOSYNC flag. If a process + * takes a write fault on a MAP_NOSYNC memory area the flag will + * be set again. */ pagebits = vm_page_bits(base, size); m->valid |= pagebits; m->dirty &= ~pagebits; - - if (base == 0 && size == PAGE_SIZE) + if (base == 0 && size == PAGE_SIZE) { pmap_clear_modify(VM_PAGE_TO_PHYS(m)); + vm_page_flag_clear(m, PG_NOSYNC); + } } #if 0 diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h index 0cfc618..c5d2827 100644 --- a/sys/vm/vm_page.h +++ b/sys/vm/vm_page.h @@ -234,6 +234,7 @@ extern struct vpgqueues vm_page_queues[PQ_COUNT]; #define PG_REFERENCED 0x0080 /* page has been referenced */ #define PG_CLEANCHK 0x0100 /* page will be checked for cleaning */ #define PG_SWAPINPROG 0x0200 /* swap I/O in progress on page */ +#define PG_NOSYNC 0x0400 /* do not collect for syncer */ /* * Misc constants. @@ -437,10 +438,9 @@ vm_page_unhold(vm_page_t mem) /* * vm_page_protect: * - * Reduce the protection of a page. 
This routine never - * raises the protection and therefore can be safely - * called if the page is already at VM_PROT_NONE ( it - * will be a NOP effectively ). + * Reduce the protection of a page. This routine never raises the + * protection and therefore can be safely called if the page is already + * at VM_PROT_NONE (it will be a NOP effectively ). */ static __inline void |