Diffstat (limited to 'sys/vm')
-rw-r--r--  sys/vm/memguard.c          |    7
-rw-r--r--  sys/vm/pmap.h              |   16
-rw-r--r--  sys/vm/swap_pager.c        |  659
-rw-r--r--  sys/vm/swap_pager.h        |    1
-rw-r--r--  sys/vm/uma.h               |    1
-rw-r--r--  sys/vm/uma_core.c          |   17
-rw-r--r--  sys/vm/uma_int.h           |    3
-rw-r--r--  sys/vm/vm_extern.h         |    1
-rw-r--r--  sys/vm/vm_fault.c          |   53
-rw-r--r--  sys/vm/vm_glue.c           |  517
-rw-r--r--  sys/vm/vm_kern.c           |   19
-rw-r--r--  sys/vm/vm_map.c            |   25
-rw-r--r--  sys/vm/vm_mmap.c           |    2
-rw-r--r--  sys/vm/vm_object.c         |  180
-rw-r--r--  sys/vm/vm_object.h         |   22
-rw-r--r--  sys/vm/vm_page.c           |  458
-rw-r--r--  sys/vm/vm_page.h           |   40
-rw-r--r--  sys/vm/vm_pageout.c        |  435
-rw-r--r--  sys/vm/vm_pageout.h        |    9
-rw-r--r--  sys/vm/vm_param.h          |    3
-rw-r--r--  sys/vm/vm_phys.c           |   51
-rw-r--r--  sys/vm/vm_phys.h           |    1
-rw-r--r--  sys/vm/vm_reserv.c         |   14
-rw-r--r--  sys/vm/vm_reserv.h         |    1
-rw-r--r--  sys/vm/vm_swapout.c        |  975
-rw-r--r--  sys/vm/vm_swapout_dummy.c  |  122
-rw-r--r--  sys/vm/vnode_pager.c       |  143
27 files changed, 2238 insertions(+), 1537 deletions(-)
diff --git a/sys/vm/memguard.c b/sys/vm/memguard.c
index d4efc2b..9e1d7a4 100644
--- a/sys/vm/memguard.c
+++ b/sys/vm/memguard.c
@@ -283,7 +283,7 @@ v2sizev(vm_offset_t va)
void *
memguard_alloc(unsigned long req_size, int flags)
{
- vm_offset_t addr;
+ vm_offset_t addr, origaddr;
u_long size_p, size_v;
int do_guard, rv;
@@ -327,7 +327,7 @@ memguard_alloc(unsigned long req_size, int flags)
for (;;) {
if (vmem_xalloc(memguard_arena, size_v, 0, 0, 0,
memguard_cursor, VMEM_ADDR_MAX,
- M_BESTFIT | M_NOWAIT, &addr) == 0)
+ M_BESTFIT | M_NOWAIT, &origaddr) == 0)
break;
/*
* The map has no space. This may be due to
@@ -342,11 +342,12 @@ memguard_alloc(unsigned long req_size, int flags)
memguard_wrap++;
memguard_cursor = memguard_base;
}
+ addr = origaddr;
if (do_guard)
addr += PAGE_SIZE;
rv = kmem_back(kmem_object, addr, size_p, flags);
if (rv != KERN_SUCCESS) {
- vmem_xfree(memguard_arena, addr, size_v);
+ vmem_xfree(memguard_arena, origaddr, size_v);
memguard_fail_pgs++;
addr = (vm_offset_t)NULL;
goto out;
diff --git a/sys/vm/pmap.h b/sys/vm/pmap.h
index 1d18823..6033f37 100644
--- a/sys/vm/pmap.h
+++ b/sys/vm/pmap.h
@@ -100,9 +100,21 @@ extern vm_offset_t kernel_vm_end;
/*
* Flags for pmap_enter(). The bits in the low-order byte are reserved
* for the protection code (vm_prot_t) that describes the fault type.
+ * Bits 24 through 31 are reserved for the pmap's internal use.
*/
-#define PMAP_ENTER_NOSLEEP 0x0100
-#define PMAP_ENTER_WIRED 0x0200
+#define PMAP_ENTER_NOSLEEP 0x00000100
+#define PMAP_ENTER_WIRED 0x00000200
+#define PMAP_ENTER_RESERVED 0xFF000000
+
+/*
+ * Define the maximum number of machine-dependent reference bits that are
+ * cleared by a call to pmap_ts_referenced(). This limit serves two purposes.
+ * First, it bounds the cost of reference bit maintenance on widely shared
+ * pages. Second, it prevents numeric overflow during maintenance of a
+ * widely shared page's "act_count" field. An overflow could result in the
+ * premature deactivation of the page.
+ */
+#define PMAP_TS_REFERENCED_MAX 5
void pmap_activate(struct thread *td);
void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
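
(Worked illustration, not part of the diff: the overflow that PMAP_TS_REFERENCED_MAX guards against, assuming the 8-bit act_count field and an ACT_MAX cap of 64 used elsewhere in the VM code.)

	/* Hypothetical arithmetic only; not code from this commit. */
	#include <stdint.h>
	#include <stdio.h>

	int
	main(void)
	{
		uint8_t act_count = 60;		/* already near the ACT_MAX cap */
		int refs = 200;			/* unbounded count for a widely shared page */

		act_count += refs;		/* 8-bit wrap: 260 becomes 4 */
		printf("unclamped act_count: %d\n", act_count);

		act_count = 60;
		refs = refs > 5 ? 5 : refs;	/* PMAP_TS_REFERENCED_MAX-style clamp */
		act_count += refs;		/* 65, later capped to ACT_MAX */
		printf("clamped act_count: %d\n", act_count);
		return (0);
	}

Without the clamp the widely shared page looks nearly idle and is deactivated prematurely; with it, the counter stays bounded.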
diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c
index f5a766b..0703312 100644
--- a/sys/vm/swap_pager.c
+++ b/sys/vm/swap_pager.c
@@ -86,10 +86,12 @@ __FBSDID("$FreeBSD$");
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
+#include <sys/pctrie.h>
#include <sys/racct.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
+#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/blist.h>
@@ -126,22 +128,17 @@ __FBSDID("$FreeBSD$");
#define SWB_NPAGES MAX_PAGEOUT_CLUSTER
#endif
+#define SWAP_META_PAGES PCTRIE_COUNT
+
/*
- * The swblock structure maps an object and a small, fixed-size range
- * of page indices to disk addresses within a swap area.
- * The collection of these mappings is implemented as a hash table.
- * Unused disk addresses within a swap area are allocated and managed
- * using a blist.
+ * A swblk structure maps each page index within a
+ * SWAP_META_PAGES-aligned and sized range to the address of an
+ * on-disk swap block (or SWAPBLK_NONE). The collection of these
+ * mappings for an entire vm object is implemented as a pc-trie.
*/
-#define SWAP_META_PAGES 32
-#define SWAP_META_MASK (SWAP_META_PAGES - 1)
-
-struct swblock {
- struct swblock *swb_hnext;
- vm_object_t swb_object;
- vm_pindex_t swb_index;
- int swb_count;
- daddr_t swb_pages[SWAP_META_PAGES];
+struct swblk {
+ vm_pindex_t p;
+ daddr_t d[SWAP_META_PAGES];
};
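
(Illustrative sketch, not part of the diff: how a page index resolves against a swblk, using the same rounddown/modulo split that the metadata routines later in this file rely on.)

	/* Minimal read-path sketch mirroring swp_pager_meta_ctl() below. */
	static daddr_t
	swblk_lookup_sketch(vm_object_t object, vm_pindex_t pindex)
	{
		struct swblk *sb;

		/* The trie is keyed on the SWAP_META_PAGES-aligned base index. */
		sb = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks,
		    rounddown(pindex, SWAP_META_PAGES));
		if (sb == NULL)
			return (SWAPBLK_NONE);
		/* The offset within the run selects the per-page disk address. */
		return (sb->d[pindex % SWAP_META_PAGES]);
	}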
static MALLOC_DEFINE(M_VMPGDATA, "vm_pgdata", "swap pager private data");
@@ -159,7 +156,7 @@ static vm_ooffset_t swap_reserved;
SYSCTL_QUAD(_vm, OID_AUTO, swap_reserved, CTLFLAG_RD, &swap_reserved, 0,
"Amount of swap storage needed to back all allocated anonymous memory.");
static int overcommit = 0;
-SYSCTL_INT(_vm, OID_AUTO, overcommit, CTLFLAG_RW, &overcommit, 0,
+SYSCTL_INT(_vm, VM_OVERCOMMIT, overcommit, CTLFLAG_RW, &overcommit, 0,
"Configure virtual memory overcommit behavior. See tuning(7) "
"for details.");
static unsigned long swzone;
@@ -314,7 +311,7 @@ swap_release_by_cred(vm_ooffset_t decr, struct ucred *cred)
#define SWM_FREE 0x02 /* free, period */
#define SWM_POP 0x04 /* pop out */
-int swap_pager_full = 2; /* swap space exhaustion (task killing) */
+static int swap_pager_full = 2; /* swap space exhaustion (task killing) */
static int swap_pager_almost_full = 1; /* swap space exhaustion (w/hysteresis)*/
static int nsw_rcount; /* free read buffers */
static int nsw_wcount_sync; /* limit write buffers / synchronous */
@@ -326,10 +323,10 @@ static int sysctl_swap_async_max(SYSCTL_HANDLER_ARGS);
SYSCTL_PROC(_vm, OID_AUTO, swap_async_max, CTLTYPE_INT | CTLFLAG_RW |
CTLFLAG_MPSAFE, NULL, 0, sysctl_swap_async_max, "I",
"Maximum running async swap ops");
-
-static struct swblock **swhash;
-static int swhash_mask;
-static struct mtx swhash_mtx;
+static int sysctl_swap_fragmentation(SYSCTL_HANDLER_ARGS);
+SYSCTL_PROC(_vm, OID_AUTO, swap_fragmentation, CTLTYPE_STRING | CTLFLAG_RD |
+ CTLFLAG_MPSAFE, NULL, 0, sysctl_swap_fragmentation, "A",
+ "Swap Fragmentation Info");
static struct sx sw_alloc_sx;
@@ -344,7 +341,8 @@ static struct sx sw_alloc_sx;
(&swap_pager_object_list[((int)(intptr_t)handle >> 4) & (NOBJLISTS-1)])
static struct pagerlst swap_pager_object_list[NOBJLISTS];
-static uma_zone_t swap_zone;
+static uma_zone_t swblk_zone;
+static uma_zone_t swpctrie_zone;
/*
* pagerops for OBJT_SWAP - "swap pager". Some ops are also global procedure
@@ -402,12 +400,28 @@ static daddr_t swp_pager_getswapspace(int npages);
/*
* Metadata functions
*/
-static struct swblock **swp_pager_hash(vm_object_t object, vm_pindex_t index);
static void swp_pager_meta_build(vm_object_t, vm_pindex_t, daddr_t);
static void swp_pager_meta_free(vm_object_t, vm_pindex_t, vm_pindex_t);
static void swp_pager_meta_free_all(vm_object_t);
static daddr_t swp_pager_meta_ctl(vm_object_t, vm_pindex_t, int);
+static void *
+swblk_trie_alloc(struct pctrie *ptree)
+{
+
+ return (uma_zalloc(swpctrie_zone, M_NOWAIT | (curproc == pageproc ?
+ M_USE_RESERVE : 0)));
+}
+
+static void
+swblk_trie_free(struct pctrie *ptree, void *node)
+{
+
+ uma_zfree(swpctrie_zone, node);
+}
+
+PCTRIE_DEFINE(SWAP, swblk, p, swblk_trie_alloc, swblk_trie_free);
+
/*
* SWP_SIZECHECK() - update swap_pager_full indication
*
@@ -436,33 +450,6 @@ swp_sizecheck(void)
}
/*
- * SWP_PAGER_HASH() - hash swap meta data
- *
- * This is an helper function which hashes the swapblk given
- * the object and page index. It returns a pointer to a pointer
- * to the object, or a pointer to a NULL pointer if it could not
- * find a swapblk.
- */
-static struct swblock **
-swp_pager_hash(vm_object_t object, vm_pindex_t index)
-{
- struct swblock **pswap;
- struct swblock *swap;
-
- index &= ~(vm_pindex_t)SWAP_META_MASK;
- pswap = &swhash[(index ^ (int)(intptr_t)object) & swhash_mask];
- while ((swap = *pswap) != NULL) {
- if (swap->swb_object == object &&
- swap->swb_index == index
- ) {
- break;
- }
- pswap = &swap->swb_hnext;
- }
- return (pswap);
-}
-
-/*
* SWAP_PAGER_INIT() - initialize the swap pager!
*
* Expected to be started from system init. NOTE: This code is run
@@ -527,21 +514,25 @@ swap_pager_swap_init(void)
mtx_unlock(&pbuf_mtx);
/*
- * Initialize our zone. Right now I'm just guessing on the number
- * we need based on the number of pages in the system. Each swblock
- * can hold 32 pages, so this is probably overkill. This reservation
- * is typically limited to around 32MB by default.
+ * Initialize our zone, guessing on the number we need based
+ * on the number of pages in the system.
*/
n = vm_cnt.v_page_count / 2;
- if (maxswzone && n > maxswzone / sizeof(struct swblock))
- n = maxswzone / sizeof(struct swblock);
+ if (maxswzone && n > maxswzone / sizeof(struct swblk))
+ n = maxswzone / sizeof(struct swblk);
+ swpctrie_zone = uma_zcreate("swpctrie", pctrie_node_size(), NULL, NULL,
+ pctrie_zone_init, NULL, UMA_ALIGN_PTR,
+ UMA_ZONE_NOFREE | UMA_ZONE_VM);
+ if (swpctrie_zone == NULL)
+ panic("failed to create swap pctrie zone.");
+ swblk_zone = uma_zcreate("swblk", sizeof(struct swblk), NULL, NULL,
+ NULL, NULL, _Alignof(struct swblk) - 1,
+ UMA_ZONE_NOFREE | UMA_ZONE_VM);
+ if (swblk_zone == NULL)
+ panic("failed to create swap blk zone.");
n2 = n;
- swap_zone = uma_zcreate("SWAPMETA", sizeof(struct swblock), NULL, NULL,
- NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM);
- if (swap_zone == NULL)
- panic("failed to create swap_zone.");
do {
- if (uma_zone_reserve_kva(swap_zone, n))
+ if (uma_zone_reserve_kva(swblk_zone, n))
break;
/*
* if the allocation failed, try a zone two thirds the
@@ -549,25 +540,22 @@ swap_pager_swap_init(void)
*/
n -= ((n + 2) / 3);
} while (n > 0);
- if (n2 != n)
- printf("Swap zone entries reduced from %lu to %lu.\n", n2, n);
- swap_maxpages = n * SWAP_META_PAGES;
- swzone = n * sizeof(struct swblock);
- n2 = n;
/*
- * Initialize our meta-data hash table. The swapper does not need to
- * be quite as efficient as the VM system, so we do not use an
- * oversized hash table.
- *
- * n: size of hash table, must be power of 2
- * swhash_mask: hash table index mask
+ * Often uma_zone_reserve_kva() cannot reserve exactly the
+ * requested size. Account for the difference when
+ * calculating swap_maxpages.
*/
- for (n = 1; n < n2 / 8; n *= 2)
- ;
- swhash = malloc(sizeof(struct swblock *) * n, M_VMPGDATA, M_WAITOK | M_ZERO);
- swhash_mask = n - 1;
- mtx_init(&swhash_mtx, "swap_pager swhash", NULL, MTX_DEF);
+ n = uma_zone_get_max(swblk_zone);
+
+ if (n < n2)
+ printf("Swap blk zone entries reduced from %lu to %lu.\n",
+ n2, n);
+ swap_maxpages = n * SWAP_META_PAGES;
+ swzone = n * sizeof(struct swblk);
+ if (!uma_zone_reserve_kva(swpctrie_zone, n))
+ printf("Cannot reserve swap pctrie zone, "
+ "reduce kern.maxswzone.\n");
}
static vm_object_t
@@ -581,14 +569,20 @@ swap_pager_alloc_init(void *handle, struct ucred *cred, vm_ooffset_t size,
return (NULL);
crhold(cred);
}
+
+ /*
+ * The un_pager.swp.swp_blks trie is initialized by
+ * vm_object_allocate() to ensure the correct order of
+ * visibility to other threads.
+ */
object = vm_object_allocate(OBJT_SWAP, OFF_TO_IDX(offset +
PAGE_MASK + size));
+
object->handle = handle;
if (cred != NULL) {
object->cred = cred;
object->charge = size;
}
- object->un_pager.swp.swp_bcount = 0;
return (object);
}
@@ -798,6 +792,36 @@ swp_pager_freeswapspace(daddr_t blk, int npages)
}
/*
+ * SYSCTL_SWAP_FRAGMENTATION() - produce raw swap space stats
+ */
+static int
+sysctl_swap_fragmentation(SYSCTL_HANDLER_ARGS)
+{
+ struct sbuf sbuf;
+ struct swdevt *sp;
+ const char *devname;
+ int error;
+
+ error = sysctl_wire_old_buffer(req, 0);
+ if (error != 0)
+ return (error);
+ sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
+ mtx_lock(&sw_dev_mtx);
+ TAILQ_FOREACH(sp, &swtailq, sw_list) {
+ if (vn_isdisk(sp->sw_vp, NULL))
+ devname = devtoname(sp->sw_vp->v_rdev);
+ else
+ devname = "[file]";
+ sbuf_printf(&sbuf, "\nFree space on device %s:\n", devname);
+ blist_stats(sp->sw_blist, &sbuf);
+ }
+ mtx_unlock(&sw_dev_mtx);
+ error = sbuf_finish(&sbuf);
+ sbuf_delete(&sbuf);
+ return (error);
+}
+
+/*
* SWAP_PAGER_FREESPACE() - frees swap blocks associated with a page
* range within an object.
*
@@ -1507,7 +1531,7 @@ swp_pager_async_iodone(struct buf *bp)
* so it doesn't clog the inactive list,
* then finish the I/O.
*/
- vm_page_dirty(m);
+ MPASS(m->dirty == VM_PAGE_BITS_ALL);
vm_page_lock(m);
vm_page_activate(m);
vm_page_unlock(m);
@@ -1643,50 +1667,56 @@ swp_pager_force_pagein(vm_object_t object, vm_pindex_t pindex)
static void
swap_pager_swapoff(struct swdevt *sp)
{
- struct swblock *swap;
- vm_object_t locked_obj, object;
- vm_pindex_t pindex;
- int i, j, retries;
+ struct swblk *sb;
+ vm_object_t object;
+ vm_pindex_t pi;
+ int i, retries;
sx_assert(&swdev_syscall_lock, SA_XLOCKED);
retries = 0;
- locked_obj = NULL;
full_rescan:
- mtx_lock(&swhash_mtx);
- for (i = 0; i <= swhash_mask; i++) { /* '<=' is correct here */
-restart:
- for (swap = swhash[i]; swap != NULL; swap = swap->swb_hnext) {
- object = swap->swb_object;
- pindex = swap->swb_index;
- for (j = 0; j < SWAP_META_PAGES; ++j) {
- if (!swp_pager_isondev(swap->swb_pages[j], sp))
+ mtx_lock(&vm_object_list_mtx);
+ TAILQ_FOREACH(object, &vm_object_list, object_list) {
+ if (object->type != OBJT_SWAP)
+ continue;
+ mtx_unlock(&vm_object_list_mtx);
+ /* Depends on type-stability. */
+ VM_OBJECT_WLOCK(object);
+
+ /*
+ * Dead objects are eventually terminated on their own.
+ */
+ if ((object->flags & OBJ_DEAD) != 0)
+ goto next_obj;
+
+ /*
+ * Sync with fences placed after pctrie
+ * initialization. We must not access pctrie below
+ * unless we checked that our object is swap and not
+ * dead.
+ */
+ atomic_thread_fence_acq();
+ if (object->type != OBJT_SWAP)
+ goto next_obj;
+
+ for (pi = 0; (sb = SWAP_PCTRIE_LOOKUP_GE(
+ &object->un_pager.swp.swp_blks, pi)) != NULL; ) {
+ pi = sb->p + SWAP_META_PAGES;
+ for (i = 0; i < SWAP_META_PAGES; i++) {
+ if (sb->d[i] == SWAPBLK_NONE)
continue;
- if (locked_obj != object) {
- if (locked_obj != NULL)
- VM_OBJECT_WUNLOCK(locked_obj);
- locked_obj = object;
- if (!VM_OBJECT_TRYWLOCK(object)) {
- mtx_unlock(&swhash_mtx);
- /* Depends on type-stability. */
- VM_OBJECT_WLOCK(object);
- mtx_lock(&swhash_mtx);
- goto restart;
- }
- }
- MPASS(locked_obj == object);
- mtx_unlock(&swhash_mtx);
- swp_pager_force_pagein(object, pindex + j);
- mtx_lock(&swhash_mtx);
- goto restart;
+ if (swp_pager_isondev(sb->d[i], sp))
+ swp_pager_force_pagein(object,
+ sb->p + i);
}
}
+next_obj:
+ VM_OBJECT_WUNLOCK(object);
+ mtx_lock(&vm_object_list_mtx);
}
- mtx_unlock(&swhash_mtx);
- if (locked_obj != NULL) {
- VM_OBJECT_WUNLOCK(locked_obj);
- locked_obj = NULL;
- }
+ mtx_unlock(&vm_object_list_mtx);
+
if (sp->sw_used) {
/*
* Objects may be locked or paging to the device being
@@ -1729,85 +1759,120 @@ restart:
static void
swp_pager_meta_build(vm_object_t object, vm_pindex_t pindex, daddr_t swapblk)
{
- static volatile int exhausted;
- struct swblock *swap;
- struct swblock **pswap;
- int idx;
+ static volatile int swblk_zone_exhausted, swpctrie_zone_exhausted;
+ struct swblk *sb, *sb1;
+ vm_pindex_t modpi, rdpi;
+ int error, i;
VM_OBJECT_ASSERT_WLOCKED(object);
+
/*
* Convert default object to swap object if necessary
*/
if (object->type != OBJT_SWAP) {
+ pctrie_init(&object->un_pager.swp.swp_blks);
+
+ /*
+ * Ensure that swap_pager_swapoff()'s iteration over
+ * object_list does not see a garbage pctrie.
+ */
+ atomic_thread_fence_rel();
+
object->type = OBJT_SWAP;
- object->un_pager.swp.swp_bcount = 0;
KASSERT(object->handle == NULL, ("default pager with handle"));
}
- /*
- * Locate hash entry. If not found create, but if we aren't adding
- * anything just return. If we run out of space in the map we wait
- * and, since the hash table may have changed, retry.
- */
-retry:
- mtx_lock(&swhash_mtx);
- pswap = swp_pager_hash(object, pindex);
-
- if ((swap = *pswap) == NULL) {
- int i;
-
+ rdpi = rounddown(pindex, SWAP_META_PAGES);
+ sb = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks, rdpi);
+ if (sb == NULL) {
if (swapblk == SWAPBLK_NONE)
- goto done;
-
- swap = *pswap = uma_zalloc(swap_zone, M_NOWAIT |
- (curproc == pageproc ? M_USE_RESERVE : 0));
- if (swap == NULL) {
- mtx_unlock(&swhash_mtx);
+ return;
+ for (;;) {
+ sb = uma_zalloc(swblk_zone, M_NOWAIT | (curproc ==
+ pageproc ? M_USE_RESERVE : 0));
+ if (sb != NULL) {
+ sb->p = rdpi;
+ for (i = 0; i < SWAP_META_PAGES; i++)
+ sb->d[i] = SWAPBLK_NONE;
+ if (atomic_cmpset_int(&swblk_zone_exhausted,
+ 1, 0))
+ printf("swblk zone ok\n");
+ break;
+ }
VM_OBJECT_WUNLOCK(object);
- if (uma_zone_exhausted(swap_zone)) {
- if (atomic_cmpset_int(&exhausted, 0, 1))
- printf("swap zone exhausted, "
+ if (uma_zone_exhausted(swblk_zone)) {
+ if (atomic_cmpset_int(&swblk_zone_exhausted,
+ 0, 1))
+ printf("swap blk zone exhausted, "
"increase kern.maxswzone\n");
vm_pageout_oom(VM_OOM_SWAPZ);
- pause("swzonex", 10);
+ pause("swzonxb", 10);
} else
VM_WAIT;
VM_OBJECT_WLOCK(object);
- goto retry;
+ sb = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks,
+ rdpi);
+ if (sb != NULL)
+ /*
+ * Somebody swapped out a nearby page,
+ * allocating swblk at the rdpi index,
+ * while we dropped the object lock.
+ */
+ goto allocated;
+ }
+ for (;;) {
+ error = SWAP_PCTRIE_INSERT(
+ &object->un_pager.swp.swp_blks, sb);
+ if (error == 0) {
+ if (atomic_cmpset_int(&swpctrie_zone_exhausted,
+ 1, 0))
+ printf("swpctrie zone ok\n");
+ break;
+ }
+ VM_OBJECT_WUNLOCK(object);
+ if (uma_zone_exhausted(swpctrie_zone)) {
+ if (atomic_cmpset_int(&swpctrie_zone_exhausted,
+ 0, 1))
+ printf("swap pctrie zone exhausted, "
+ "increase kern.maxswzone\n");
+ vm_pageout_oom(VM_OOM_SWAPZ);
+ pause("swzonxp", 10);
+ } else
+ VM_WAIT;
+ VM_OBJECT_WLOCK(object);
+ sb1 = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks,
+ rdpi);
+ if (sb1 != NULL) {
+ uma_zfree(swblk_zone, sb);
+ sb = sb1;
+ goto allocated;
+ }
}
-
- if (atomic_cmpset_int(&exhausted, 1, 0))
- printf("swap zone ok\n");
-
- swap->swb_hnext = NULL;
- swap->swb_object = object;
- swap->swb_index = pindex & ~(vm_pindex_t)SWAP_META_MASK;
- swap->swb_count = 0;
-
- ++object->un_pager.swp.swp_bcount;
-
- for (i = 0; i < SWAP_META_PAGES; ++i)
- swap->swb_pages[i] = SWAPBLK_NONE;
}
+allocated:
+ MPASS(sb->p == rdpi);
- /*
- * Delete prior contents of metadata
- */
- idx = pindex & SWAP_META_MASK;
-
- if (swap->swb_pages[idx] != SWAPBLK_NONE) {
- swp_pager_freeswapspace(swap->swb_pages[idx], 1);
- --swap->swb_count;
- }
+ modpi = pindex % SWAP_META_PAGES;
+ /* Delete prior contents of metadata. */
+ if (sb->d[modpi] != SWAPBLK_NONE)
+ swp_pager_freeswapspace(sb->d[modpi], 1);
+ /* Enter block into metadata. */
+ sb->d[modpi] = swapblk;
/*
- * Enter block into metadata
+ * Free the swblk if we end up with the empty page run.
*/
- swap->swb_pages[idx] = swapblk;
- if (swapblk != SWAPBLK_NONE)
- ++swap->swb_count;
-done:
- mtx_unlock(&swhash_mtx);
+ if (swapblk == SWAPBLK_NONE) {
+ for (i = 0; i < SWAP_META_PAGES; i++) {
+ if (sb->d[i] != SWAPBLK_NONE)
+ break;
+ }
+ if (i == SWAP_META_PAGES) {
+ SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks,
+ rdpi);
+ uma_zfree(swblk_zone, sb);
+ }
+ }
}
/*
@@ -1821,42 +1886,40 @@ done:
* with resident pages.
*/
static void
-swp_pager_meta_free(vm_object_t object, vm_pindex_t index, vm_pindex_t count)
+swp_pager_meta_free(vm_object_t object, vm_pindex_t pindex, vm_pindex_t count)
{
- struct swblock **pswap, *swap;
- vm_pindex_t c;
- daddr_t v;
- int n, sidx;
+ struct swblk *sb;
+ vm_pindex_t last;
+ int i;
+ bool empty;
- VM_OBJECT_ASSERT_LOCKED(object);
+ VM_OBJECT_ASSERT_WLOCKED(object);
if (object->type != OBJT_SWAP || count == 0)
return;
- mtx_lock(&swhash_mtx);
- for (c = 0; c < count;) {
- pswap = swp_pager_hash(object, index);
- sidx = index & SWAP_META_MASK;
- n = SWAP_META_PAGES - sidx;
- index += n;
- if ((swap = *pswap) == NULL) {
- c += n;
- continue;
- }
- for (; c < count && sidx < SWAP_META_PAGES; ++c, ++sidx) {
- if ((v = swap->swb_pages[sidx]) == SWAPBLK_NONE)
+ last = pindex + count - 1;
+ for (;;) {
+ sb = SWAP_PCTRIE_LOOKUP_GE(&object->un_pager.swp.swp_blks,
+ rounddown(pindex, SWAP_META_PAGES));
+ if (sb == NULL || sb->p > last)
+ break;
+ empty = true;
+ for (i = 0; i < SWAP_META_PAGES; i++) {
+ if (sb->d[i] == SWAPBLK_NONE)
continue;
- swp_pager_freeswapspace(v, 1);
- swap->swb_pages[sidx] = SWAPBLK_NONE;
- if (--swap->swb_count == 0) {
- *pswap = swap->swb_hnext;
- uma_zfree(swap_zone, swap);
- --object->un_pager.swp.swp_bcount;
- c += SWAP_META_PAGES - sidx;
- break;
- }
+ if (pindex <= sb->p + i && sb->p + i <= last) {
+ swp_pager_freeswapspace(sb->d[i], 1);
+ sb->d[i] = SWAPBLK_NONE;
+ } else
+ empty = false;
+ }
+ pindex = sb->p + SWAP_META_PAGES;
+ if (empty) {
+ SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks,
+ sb->p);
+ uma_zfree(swblk_zone, sb);
}
}
- mtx_unlock(&swhash_mtx);
}
/*
@@ -1868,36 +1931,23 @@ swp_pager_meta_free(vm_object_t object, vm_pindex_t index, vm_pindex_t count)
static void
swp_pager_meta_free_all(vm_object_t object)
{
- struct swblock **pswap, *swap;
- vm_pindex_t index;
- daddr_t v;
+ struct swblk *sb;
+ vm_pindex_t pindex;
int i;
VM_OBJECT_ASSERT_WLOCKED(object);
if (object->type != OBJT_SWAP)
return;
- index = 0;
- while (object->un_pager.swp.swp_bcount != 0) {
- mtx_lock(&swhash_mtx);
- pswap = swp_pager_hash(object, index);
- if ((swap = *pswap) != NULL) {
- for (i = 0; i < SWAP_META_PAGES; ++i) {
- v = swap->swb_pages[i];
- if (v != SWAPBLK_NONE) {
- --swap->swb_count;
- swp_pager_freeswapspace(v, 1);
- }
- }
- if (swap->swb_count != 0)
- panic(
- "swap_pager_meta_free_all: swb_count != 0");
- *pswap = swap->swb_hnext;
- uma_zfree(swap_zone, swap);
- --object->un_pager.swp.swp_bcount;
+ for (pindex = 0; (sb = SWAP_PCTRIE_LOOKUP_GE(
+ &object->un_pager.swp.swp_blks, pindex)) != NULL;) {
+ pindex = sb->p + SWAP_META_PAGES;
+ for (i = 0; i < SWAP_META_PAGES; i++) {
+ if (sb->d[i] != SWAPBLK_NONE)
+ swp_pager_freeswapspace(sb->d[i], 1);
}
- mtx_unlock(&swhash_mtx);
- index += SWAP_META_PAGES;
+ SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks, sb->p);
+ uma_zfree(swblk_zone, sb);
}
}
@@ -1911,9 +1961,6 @@ swp_pager_meta_free_all(vm_object_t object)
* was invalid. This routine will automatically free any invalid
* meta-data swapblks.
*
- * It is not possible to store invalid swapblks in the swap meta data
- * (other then a literal 'SWAPBLK_NONE'), so we don't bother checking.
- *
* When acting on a busy resident page and paging is in progress, we
* have to wait until paging is complete but otherwise can act on the
* busy page.
@@ -1924,43 +1971,45 @@ swp_pager_meta_free_all(vm_object_t object)
static daddr_t
swp_pager_meta_ctl(vm_object_t object, vm_pindex_t pindex, int flags)
{
- struct swblock **pswap;
- struct swblock *swap;
+ struct swblk *sb;
daddr_t r1;
- int idx;
+ int i;
+
+ if ((flags & (SWM_FREE | SWM_POP)) != 0)
+ VM_OBJECT_ASSERT_WLOCKED(object);
+ else
+ VM_OBJECT_ASSERT_LOCKED(object);
- VM_OBJECT_ASSERT_LOCKED(object);
/*
- * The meta data only exists of the object is OBJT_SWAP
+ * The meta data only exists if the object is OBJT_SWAP
* and even then might not be allocated yet.
*/
if (object->type != OBJT_SWAP)
return (SWAPBLK_NONE);
- r1 = SWAPBLK_NONE;
- mtx_lock(&swhash_mtx);
- pswap = swp_pager_hash(object, pindex);
-
- if ((swap = *pswap) != NULL) {
- idx = pindex & SWAP_META_MASK;
- r1 = swap->swb_pages[idx];
-
- if (r1 != SWAPBLK_NONE) {
- if (flags & SWM_FREE) {
- swp_pager_freeswapspace(r1, 1);
- r1 = SWAPBLK_NONE;
- }
- if (flags & (SWM_FREE|SWM_POP)) {
- swap->swb_pages[idx] = SWAPBLK_NONE;
- if (--swap->swb_count == 0) {
- *pswap = swap->swb_hnext;
- uma_zfree(swap_zone, swap);
- --object->un_pager.swp.swp_bcount;
- }
- }
+ sb = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks,
+ rounddown(pindex, SWAP_META_PAGES));
+ if (sb == NULL)
+ return (SWAPBLK_NONE);
+ r1 = sb->d[pindex % SWAP_META_PAGES];
+ if (r1 == SWAPBLK_NONE)
+ return (SWAPBLK_NONE);
+ if ((flags & (SWM_FREE | SWM_POP)) != 0) {
+ sb->d[pindex % SWAP_META_PAGES] = SWAPBLK_NONE;
+ for (i = 0; i < SWAP_META_PAGES; i++) {
+ if (sb->d[i] != SWAPBLK_NONE)
+ break;
+ }
+ if (i == SWAP_META_PAGES) {
+ SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks,
+ rounddown(pindex, SWAP_META_PAGES));
+ uma_zfree(swblk_zone, sb);
}
}
- mtx_unlock(&swhash_mtx);
+ if ((flags & SWM_FREE) != 0) {
+ swp_pager_freeswapspace(r1, 1);
+ r1 = SWAPBLK_NONE;
+ }
return (r1);
}
@@ -1974,32 +2023,38 @@ swp_pager_meta_ctl(vm_object_t object, vm_pindex_t pindex, int flags)
vm_pindex_t
swap_pager_find_least(vm_object_t object, vm_pindex_t pindex)
{
- struct swblock **pswap, *swap;
- vm_pindex_t i, j, lim;
- int idx;
+ struct swblk *sb;
+ int i;
VM_OBJECT_ASSERT_LOCKED(object);
- if (object->type != OBJT_SWAP || object->un_pager.swp.swp_bcount == 0)
+ if (object->type != OBJT_SWAP)
return (object->size);
- mtx_lock(&swhash_mtx);
- for (j = pindex; j < object->size; j = lim) {
- pswap = swp_pager_hash(object, j);
- lim = rounddown2(j + SWAP_META_PAGES, SWAP_META_PAGES);
- if (lim > object->size)
- lim = object->size;
- if ((swap = *pswap) != NULL) {
- for (idx = j & SWAP_META_MASK, i = j; i < lim;
- i++, idx++) {
- if (swap->swb_pages[idx] != SWAPBLK_NONE)
- goto found;
- }
+ sb = SWAP_PCTRIE_LOOKUP_GE(&object->un_pager.swp.swp_blks,
+ rounddown(pindex, SWAP_META_PAGES));
+ if (sb == NULL)
+ return (object->size);
+ if (sb->p < pindex) {
+ for (i = pindex % SWAP_META_PAGES; i < SWAP_META_PAGES; i++) {
+ if (sb->d[i] != SWAPBLK_NONE)
+ return (sb->p + i);
}
+ sb = SWAP_PCTRIE_LOOKUP_GE(&object->un_pager.swp.swp_blks,
+ roundup(pindex, SWAP_META_PAGES));
+ if (sb == NULL)
+ return (object->size);
}
- i = object->size;
-found:
- mtx_unlock(&swhash_mtx);
- return (i);
+ for (i = 0; i < SWAP_META_PAGES; i++) {
+ if (sb->d[i] != SWAPBLK_NONE)
+ return (sb->p + i);
+ }
+
+ /*
+ * We get here if a swblk is present in the trie but it
+ * doesn't map any blocks.
+ */
+ MPASS(0);
+ return (object->size);
}
/*
@@ -2035,7 +2090,7 @@ sys_swapon(struct thread *td, struct swapon_args *uap)
* Swap metadata may not fit in the KVM if we have physical
* memory of >1GB.
*/
- if (swap_zone == NULL) {
+ if (swblk_zone == NULL) {
error = ENOMEM;
goto done;
}
@@ -2071,15 +2126,16 @@ done:
/*
* Check that the total amount of swap currently configured does not
* exceed half the theoretical maximum. If it does, print a warning
- * message and return -1; otherwise, return 0.
+ * message.
*/
-static int
-swapon_check_swzone(unsigned long npages)
+static void
+swapon_check_swzone(void)
{
- unsigned long maxpages;
+ unsigned long maxpages, npages;
+ npages = swap_total / PAGE_SIZE;
/* absolute maximum we can handle assuming 100% efficiency */
- maxpages = uma_zone_get_max(swap_zone) * SWAP_META_PAGES;
+ maxpages = uma_zone_get_max(swblk_zone) * SWAP_META_PAGES;
/* recommend using no more than half that amount */
if (npages > maxpages / 2) {
@@ -2088,9 +2144,7 @@ swapon_check_swzone(unsigned long npages)
npages, maxpages / 2);
printf("warning: increase kern.maxswzone "
"or reduce amount of swap.\n");
- return (-1);
}
- return (0);
}
static void
@@ -2158,7 +2212,7 @@ swaponsomething(struct vnode *vp, void *id, u_long nblks,
nswapdev++;
swap_pager_avail += nblks - 2;
swap_total += (vm_ooffset_t)nblks * PAGE_SIZE;
- swapon_check_swzone(swap_total / PAGE_SIZE);
+ swapon_check_swzone();
swp_sizecheck();
mtx_unlock(&sw_dev_mtx);
}
@@ -2379,15 +2433,9 @@ SYSCTL_NODE(_vm, OID_AUTO, swap_info, CTLFLAG_RD | CTLFLAG_MPSAFE,
"Swap statistics by device");
/*
- * vmspace_swap_count() - count the approximate swap usage in pages for a
- * vmspace.
- *
- * The map must be locked.
- *
- * Swap usage is determined by taking the proportional swap used by
- * VM objects backing the VM map. To make up for fractional losses,
- * if the VM object has any swap use at all the associated map entries
- * count for at least 1 swap page.
+ * Count the approximate swap usage in pages for a vmspace. The
+ * shadowed or not yet copied on write swap blocks are not accounted.
+ * The map must be locked.
*/
long
vmspace_swap_count(struct vmspace *vmspace)
@@ -2395,23 +2443,38 @@ vmspace_swap_count(struct vmspace *vmspace)
vm_map_t map;
vm_map_entry_t cur;
vm_object_t object;
- long count, n;
+ struct swblk *sb;
+ vm_pindex_t e, pi;
+ long count;
+ int i;
map = &vmspace->vm_map;
count = 0;
for (cur = map->header.next; cur != &map->header; cur = cur->next) {
- if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 &&
- (object = cur->object.vm_object) != NULL) {
- VM_OBJECT_WLOCK(object);
- if (object->type == OBJT_SWAP &&
- object->un_pager.swp.swp_bcount != 0) {
- n = (cur->end - cur->start) / PAGE_SIZE;
- count += object->un_pager.swp.swp_bcount *
- SWAP_META_PAGES * n / object->size + 1;
+ if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
+ continue;
+ object = cur->object.vm_object;
+ if (object == NULL || object->type != OBJT_SWAP)
+ continue;
+ VM_OBJECT_RLOCK(object);
+ if (object->type != OBJT_SWAP)
+ goto unlock;
+ pi = OFF_TO_IDX(cur->offset);
+ e = pi + OFF_TO_IDX(cur->end - cur->start);
+ for (;; pi = sb->p + SWAP_META_PAGES) {
+ sb = SWAP_PCTRIE_LOOKUP_GE(
+ &object->un_pager.swp.swp_blks, pi);
+ if (sb == NULL || sb->p >= e)
+ break;
+ for (i = 0; i < SWAP_META_PAGES; i++) {
+ if (sb->p + i < e &&
+ sb->d[i] != SWAPBLK_NONE)
+ count++;
}
- VM_OBJECT_WUNLOCK(object);
}
+unlock:
+ VM_OBJECT_RUNLOCK(object);
}
return (count);
}
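
(Aside, not part of the diff: the release/acquire pairing that the comments in _vm_object_allocate(), swp_pager_meta_build() and swap_pager_swapoff() describe, condensed into one sketch.)

	/* Publisher: the trie must be initialized before the type is visible. */
	pctrie_init(&object->un_pager.swp.swp_blks);
	atomic_thread_fence_rel();
	object->type = OBJT_SWAP;

	/* Observer, in swap_pager_swapoff() with the object write-locked: */
	if ((object->flags & OBJ_DEAD) == 0) {
		atomic_thread_fence_acq();	/* pairs with the release fence */
		if (object->type == OBJT_SWAP) {
			/* swp_blks is now safe to traverse. */
		}
	}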
diff --git a/sys/vm/swap_pager.h b/sys/vm/swap_pager.h
index 0131297..1abded5 100644
--- a/sys/vm/swap_pager.h
+++ b/sys/vm/swap_pager.h
@@ -73,7 +73,6 @@ struct swdevt {
#ifdef _KERNEL
-extern int swap_pager_full;
extern int swap_pager_avail;
struct xswdev;
diff --git a/sys/vm/uma.h b/sys/vm/uma.h
index f4c2de8..55b9e61 100644
--- a/sys/vm/uma.h
+++ b/sys/vm/uma.h
@@ -296,6 +296,7 @@ uma_zone_t uma_zcache_create(char *name, int size, uma_ctor ctor, uma_dtor dtor,
#define UMA_ALIGN_SHORT (sizeof(short) - 1) /* "" short */
#define UMA_ALIGN_CHAR (sizeof(char) - 1) /* "" char */
#define UMA_ALIGN_CACHE (0 - 1) /* Cache line size align */
+#define UMA_ALIGNOF(type) (_Alignof(type) - 1) /* Alignment fit for 'type' */
/*
* Destroys an empty uma zone. If the zone is not empty uma complains loudly.
diff --git a/sys/vm/uma_core.c b/sys/vm/uma_core.c
index 26439dc..8504a72 100644
--- a/sys/vm/uma_core.c
+++ b/sys/vm/uma_core.c
@@ -1326,10 +1326,6 @@ keg_large_init(uma_keg_t keg)
keg->uk_ipers = 1;
keg->uk_rsize = keg->uk_size;
- /* We can't do OFFPAGE if we're internal, bail out here. */
- if (keg->uk_flags & UMA_ZFLAG_INTERNAL)
- return;
-
/* Check whether we have enough space to not do OFFPAGE. */
if ((keg->uk_flags & UMA_ZONE_OFFPAGE) == 0) {
shsize = sizeof(struct uma_slab);
@@ -1337,8 +1333,17 @@ keg_large_init(uma_keg_t keg)
shsize = (shsize & ~UMA_ALIGN_PTR) +
(UMA_ALIGN_PTR + 1);
- if ((PAGE_SIZE * keg->uk_ppera) - keg->uk_rsize < shsize)
- keg->uk_flags |= UMA_ZONE_OFFPAGE;
+ if (PAGE_SIZE * keg->uk_ppera - keg->uk_rsize < shsize) {
+ /*
+ * We can't do OFFPAGE if we're internal, in which case
+ * we need an extra page per allocation to contain the
+ * slab header.
+ */
+ if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) == 0)
+ keg->uk_flags |= UMA_ZONE_OFFPAGE;
+ else
+ keg->uk_ppera++;
+ }
}
if ((keg->uk_flags & UMA_ZONE_OFFPAGE) &&
diff --git a/sys/vm/uma_int.h b/sys/vm/uma_int.h
index 8423d1c..ee315ab 100644
--- a/sys/vm/uma_int.h
+++ b/sys/vm/uma_int.h
@@ -28,6 +28,7 @@
*
*/
+#include <sys/_bitset.h>
#include <sys/_task.h>
/*
@@ -210,7 +211,7 @@ struct uma_keg {
vm_offset_t uk_kva; /* Zone base KVA */
uma_zone_t uk_slabzone; /* Slab zone backing us, if OFFPAGE */
- uint16_t uk_pgoff; /* Offset to uma_slab struct */
+ uint32_t uk_pgoff; /* Offset to uma_slab struct */
uint16_t uk_ppera; /* pages per allocation from backend */
uint16_t uk_ipers; /* Items per slab */
uint32_t uk_flags; /* Internal flags */
diff --git a/sys/vm/vm_extern.h b/sys/vm/vm_extern.h
index 0489b6e..8e55273 100644
--- a/sys/vm/vm_extern.h
+++ b/sys/vm/vm_extern.h
@@ -71,7 +71,6 @@ void kmem_init(vm_offset_t, vm_offset_t);
void kmem_init_zero_region(void);
void kmeminit(void);
-void swapout_procs(int);
int kernacc(void *, int, int);
int useracc(void *, int, int);
int vm_fault(vm_map_t, vm_offset_t, vm_prot_t, int);
diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
index d2147f6..92c761b 100644
--- a/sys/vm/vm_fault.c
+++ b/sys/vm/vm_fault.c
@@ -236,14 +236,15 @@ vm_fault_dirty(vm_map_entry_t entry, vm_page_t m, vm_prot_t prot,
* written NOW so dirty it explicitly to save on
* pmap_is_modified() calls later.
*
- * Also tell the backing pager, if any, that it should remove
- * any swap backing since the page is now dirty.
+ * Also, since the page is now dirty, we can possibly tell
+ * the pager to release any swap backing the page. Calling
+ * the pager requires a write lock on the object.
*/
if (need_dirty)
vm_page_dirty(m);
if (!set_wd)
vm_page_unlock(m);
- if (need_dirty)
+ else if (need_dirty)
vm_pager_page_unswapped(m);
}
@@ -266,8 +267,12 @@ static int
vm_fault_soft_fast(struct faultstate *fs, vm_offset_t vaddr, vm_prot_t prot,
int fault_type, int fault_flags, boolean_t wired, vm_page_t *m_hold)
{
- vm_page_t m;
- int rv;
+ vm_page_t m, m_map;
+#if defined(__amd64__) && VM_NRESERVLEVEL > 0
+ vm_page_t m_super;
+ int flags;
+#endif
+ int psind, rv;
MPASS(fs->vp == NULL);
m = vm_page_lookup(fs->first_object, fs->first_pindex);
@@ -275,14 +280,46 @@ vm_fault_soft_fast(struct faultstate *fs, vm_offset_t vaddr, vm_prot_t prot,
if (m == NULL || ((prot & VM_PROT_WRITE) != 0 &&
vm_page_busied(m)) || m->valid != VM_PAGE_BITS_ALL)
return (KERN_FAILURE);
- rv = pmap_enter(fs->map->pmap, vaddr, m, prot, fault_type |
- PMAP_ENTER_NOSLEEP | (wired ? PMAP_ENTER_WIRED : 0), 0);
+ m_map = m;
+ psind = 0;
+#if defined(__amd64__) && VM_NRESERVLEVEL > 0
+ if ((m->flags & PG_FICTITIOUS) == 0 &&
+ (m_super = vm_reserv_to_superpage(m)) != NULL &&
+ rounddown2(vaddr, pagesizes[m_super->psind]) >= fs->entry->start &&
+ roundup2(vaddr + 1, pagesizes[m_super->psind]) <= fs->entry->end &&
+ (vaddr & (pagesizes[m_super->psind] - 1)) == (VM_PAGE_TO_PHYS(m) &
+ (pagesizes[m_super->psind] - 1)) &&
+ pmap_ps_enabled(fs->map->pmap)) {
+ flags = PS_ALL_VALID;
+ if ((prot & VM_PROT_WRITE) != 0) {
+ /*
+ * Create a superpage mapping allowing write access
+ * only if none of the constituent pages are busy and
+ * all of them are already dirty (except possibly for
+ * the page that was faulted on).
+ */
+ flags |= PS_NONE_BUSY;
+ if ((fs->first_object->flags & OBJ_UNMANAGED) == 0)
+ flags |= PS_ALL_DIRTY;
+ }
+ if (vm_page_ps_test(m_super, flags, m)) {
+ m_map = m_super;
+ psind = m_super->psind;
+ vaddr = rounddown2(vaddr, pagesizes[psind]);
+ /* Preset the modified bit for dirty superpages. */
+ if ((flags & PS_ALL_DIRTY) != 0)
+ fault_type |= VM_PROT_WRITE;
+ }
+ }
+#endif
+ rv = pmap_enter(fs->map->pmap, vaddr, m_map, prot, fault_type |
+ PMAP_ENTER_NOSLEEP | (wired ? PMAP_ENTER_WIRED : 0), psind);
if (rv != KERN_SUCCESS)
return (rv);
vm_fault_fill_hold(m_hold, m);
vm_fault_dirty(fs->entry, m, prot, fault_type, fault_flags, false);
VM_OBJECT_RUNLOCK(fs->first_object);
- if (!wired)
+ if (psind == 0 && !wired)
vm_fault_prefault(fs, vaddr, PFBAK, PFFOR);
vm_map_lookup_done(fs->map, fs->entry);
curthread->td_ru.ru_minflt++;
diff --git a/sys/vm/vm_glue.c b/sys/vm/vm_glue.c
index 60b822e..14ec78a 100644
--- a/sys/vm/vm_glue.c
+++ b/sys/vm/vm_glue.c
@@ -101,13 +101,6 @@ __FBSDID("$FreeBSD$");
#include <machine/cpu.h>
-#ifndef NO_SWAPPING
-static int swapout(struct proc *);
-static void swapclear(struct proc *);
-static void vm_thread_swapin(struct thread *td);
-static void vm_thread_swapout(struct thread *td);
-#endif
-
/*
* MPSAFE
*
@@ -308,10 +301,6 @@ SYSCTL_INT(_vm, OID_AUTO, kstack_cache_size, CTLFLAG_RW, &kstack_cache_size, 0,
SYSCTL_INT(_vm, OID_AUTO, kstacks, CTLFLAG_RD, &kstacks, 0,
"");
-#ifndef KSTACK_MAX_PAGES
-#define KSTACK_MAX_PAGES 32
-#endif
-
/*
* Create the kernel stack (including pcb for i386) for a new thread.
* This routine directly affects the fork perf for a process and
@@ -322,7 +311,7 @@ vm_thread_new(struct thread *td, int pages)
{
vm_object_t ksobj;
vm_offset_t ks;
- vm_page_t m, ma[KSTACK_MAX_PAGES];
+ vm_page_t ma[KSTACK_MAX_PAGES];
struct kstack_cache_entry *ks_ce;
int i;
@@ -391,15 +380,10 @@ vm_thread_new(struct thread *td, int pages)
* page of stack.
*/
VM_OBJECT_WLOCK(ksobj);
- for (i = 0; i < pages; i++) {
- /*
- * Get a kernel stack page.
- */
- m = vm_page_grab(ksobj, i, VM_ALLOC_NOBUSY |
- VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
- ma[i] = m;
- m->valid = VM_PAGE_BITS_ALL;
- }
+ (void)vm_page_grab_pages(ksobj, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY |
+ VM_ALLOC_WIRED, ma, pages);
+ for (i = 0; i < pages; i++)
+ ma[i]->valid = VM_PAGE_BITS_ALL;
VM_OBJECT_WUNLOCK(ksobj);
pmap_qenter(ks, ma, pages);
return (1);
@@ -532,80 +516,6 @@ intr_prof_stack_use(struct thread *td, struct trapframe *frame)
}
#endif /* KSTACK_USAGE_PROF */
-#ifndef NO_SWAPPING
-/*
- * Allow a thread's kernel stack to be paged out.
- */
-static void
-vm_thread_swapout(struct thread *td)
-{
- vm_object_t ksobj;
- vm_page_t m;
- int i, pages;
-
- cpu_thread_swapout(td);
- pages = td->td_kstack_pages;
- ksobj = td->td_kstack_obj;
- pmap_qremove(td->td_kstack, pages);
- VM_OBJECT_WLOCK(ksobj);
- for (i = 0; i < pages; i++) {
- m = vm_page_lookup(ksobj, i);
- if (m == NULL)
- panic("vm_thread_swapout: kstack already missing?");
- vm_page_dirty(m);
- vm_page_lock(m);
- vm_page_unwire(m, PQ_INACTIVE);
- vm_page_unlock(m);
- }
- VM_OBJECT_WUNLOCK(ksobj);
-}
-
-/*
- * Bring the kernel stack for a specified thread back in.
- */
-static void
-vm_thread_swapin(struct thread *td)
-{
- vm_object_t ksobj;
- vm_page_t ma[KSTACK_MAX_PAGES];
- int pages;
-
- pages = td->td_kstack_pages;
- ksobj = td->td_kstack_obj;
- VM_OBJECT_WLOCK(ksobj);
- for (int i = 0; i < pages; i++)
- ma[i] = vm_page_grab(ksobj, i, VM_ALLOC_NORMAL |
- VM_ALLOC_WIRED);
- for (int i = 0; i < pages;) {
- int j, a, count, rv;
-
- vm_page_assert_xbusied(ma[i]);
- if (ma[i]->valid == VM_PAGE_BITS_ALL) {
- vm_page_xunbusy(ma[i]);
- i++;
- continue;
- }
- vm_object_pip_add(ksobj, 1);
- for (j = i + 1; j < pages; j++)
- if (ma[j]->valid == VM_PAGE_BITS_ALL)
- break;
- rv = vm_pager_has_page(ksobj, ma[i]->pindex, NULL, &a);
- KASSERT(rv == 1, ("%s: missing page %p", __func__, ma[i]));
- count = min(a + 1, j - i);
- rv = vm_pager_get_pages(ksobj, ma + i, count, NULL, NULL);
- KASSERT(rv == VM_PAGER_OK, ("%s: cannot get kstack for proc %d",
- __func__, td->td_proc->p_pid));
- vm_object_pip_wakeup(ksobj);
- for (j = i; j < i + count; j++)
- vm_page_xunbusy(ma[j]);
- i += count;
- }
- VM_OBJECT_WUNLOCK(ksobj);
- pmap_qenter(td->td_kstack, ma, pages);
- cpu_thread_swapin(td);
-}
-#endif /* !NO_SWAPPING */
-
/*
* Implement fork's actions on an address space.
* Here we arrange for the address space to be copied or referenced,
@@ -679,425 +589,8 @@ vm_waitproc(p)
}
void
-faultin(p)
- struct proc *p;
-{
-#ifdef NO_SWAPPING
-
- PROC_LOCK_ASSERT(p, MA_OWNED);
- if ((p->p_flag & P_INMEM) == 0)
- panic("faultin: proc swapped out with NO_SWAPPING!");
-#else /* !NO_SWAPPING */
- struct thread *td;
-
- PROC_LOCK_ASSERT(p, MA_OWNED);
- /*
- * If another process is swapping in this process,
- * just wait until it finishes.
- */
- if (p->p_flag & P_SWAPPINGIN) {
- while (p->p_flag & P_SWAPPINGIN)
- msleep(&p->p_flag, &p->p_mtx, PVM, "faultin", 0);
- return;
- }
- if ((p->p_flag & P_INMEM) == 0) {
- /*
- * Don't let another thread swap process p out while we are
- * busy swapping it in.
- */
- ++p->p_lock;
- p->p_flag |= P_SWAPPINGIN;
- PROC_UNLOCK(p);
-
- /*
- * We hold no lock here because the list of threads
- * can not change while all threads in the process are
- * swapped out.
- */
- FOREACH_THREAD_IN_PROC(p, td)
- vm_thread_swapin(td);
- PROC_LOCK(p);
- swapclear(p);
- p->p_swtick = ticks;
-
- wakeup(&p->p_flag);
-
- /* Allow other threads to swap p out now. */
- --p->p_lock;
- }
-#endif /* NO_SWAPPING */
-}
-
-/*
- * This swapin algorithm attempts to swap-in processes only if there
- * is enough space for them. Of course, if a process waits for a long
- * time, it will be swapped in anyway.
- */
-void
-swapper(void)
-{
- struct proc *p;
- struct thread *td;
- struct proc *pp;
- int slptime;
- int swtime;
- int ppri;
- int pri;
-
-loop:
- if (vm_page_count_min()) {
- VM_WAIT;
- goto loop;
- }
-
- pp = NULL;
- ppri = INT_MIN;
- sx_slock(&allproc_lock);
- FOREACH_PROC_IN_SYSTEM(p) {
- PROC_LOCK(p);
- if (p->p_state == PRS_NEW ||
- p->p_flag & (P_SWAPPINGOUT | P_SWAPPINGIN | P_INMEM)) {
- PROC_UNLOCK(p);
- continue;
- }
- swtime = (ticks - p->p_swtick) / hz;
- FOREACH_THREAD_IN_PROC(p, td) {
- /*
- * An otherwise runnable thread of a process
- * swapped out has only the TDI_SWAPPED bit set.
- *
- */
- thread_lock(td);
- if (td->td_inhibitors == TDI_SWAPPED) {
- slptime = (ticks - td->td_slptick) / hz;
- pri = swtime + slptime;
- if ((td->td_flags & TDF_SWAPINREQ) == 0)
- pri -= p->p_nice * 8;
- /*
- * if this thread is higher priority
- * and there is enough space, then select
- * this process instead of the previous
- * selection.
- */
- if (pri > ppri) {
- pp = p;
- ppri = pri;
- }
- }
- thread_unlock(td);
- }
- PROC_UNLOCK(p);
- }
- sx_sunlock(&allproc_lock);
-
- /*
- * Nothing to do, back to sleep.
- */
- if ((p = pp) == NULL) {
- tsleep(&proc0, PVM, "swapin", MAXSLP * hz / 2);
- goto loop;
- }
- PROC_LOCK(p);
-
- /*
- * Another process may be bringing or may have already
- * brought this process in while we traverse all threads.
- * Or, this process may even be being swapped out again.
- */
- if (p->p_flag & (P_INMEM | P_SWAPPINGOUT | P_SWAPPINGIN)) {
- PROC_UNLOCK(p);
- goto loop;
- }
-
- /*
- * We would like to bring someone in. (only if there is space).
- * [What checks the space? ]
- */
- faultin(p);
- PROC_UNLOCK(p);
- goto loop;
-}
-
-void
kick_proc0(void)
{
wakeup(&proc0);
}
-
-#ifndef NO_SWAPPING
-
-/*
- * Swap_idle_threshold1 is the guaranteed swapped in time for a process
- */
-static int swap_idle_threshold1 = 2;
-SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold1, CTLFLAG_RW,
- &swap_idle_threshold1, 0, "Guaranteed swapped in time for a process");
-
-/*
- * Swap_idle_threshold2 is the time that a process can be idle before
- * it will be swapped out, if idle swapping is enabled.
- */
-static int swap_idle_threshold2 = 10;
-SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold2, CTLFLAG_RW,
- &swap_idle_threshold2, 0, "Time before a process will be swapped out");
-
-/*
- * First, if any processes have been sleeping or stopped for at least
- * "swap_idle_threshold1" seconds, they are swapped out. If, however,
- * no such processes exist, then the longest-sleeping or stopped
- * process is swapped out. Finally, and only as a last resort, if
- * there are no sleeping or stopped processes, the longest-resident
- * process is swapped out.
- */
-void
-swapout_procs(action)
-int action;
-{
- struct proc *p;
- struct thread *td;
- int didswap = 0;
-
-retry:
- sx_slock(&allproc_lock);
- FOREACH_PROC_IN_SYSTEM(p) {
- struct vmspace *vm;
- int minslptime = 100000;
- int slptime;
-
- PROC_LOCK(p);
- /*
- * Watch out for a process in
- * creation. It may have no
- * address space or lock yet.
- */
- if (p->p_state == PRS_NEW) {
- PROC_UNLOCK(p);
- continue;
- }
- /*
- * An aio daemon switches its
- * address space while running.
- * Perform a quick check whether
- * a process has P_SYSTEM.
- * Filter out exiting processes.
- */
- if ((p->p_flag & (P_SYSTEM | P_WEXIT)) != 0) {
- PROC_UNLOCK(p);
- continue;
- }
- _PHOLD_LITE(p);
- PROC_UNLOCK(p);
- sx_sunlock(&allproc_lock);
-
- /*
- * Do not swapout a process that
- * is waiting for VM data
- * structures as there is a possible
- * deadlock. Test this first as
- * this may block.
- *
- * Lock the map until swapout
- * finishes, or a thread of this
- * process may attempt to alter
- * the map.
- */
- vm = vmspace_acquire_ref(p);
- if (vm == NULL)
- goto nextproc2;
- if (!vm_map_trylock(&vm->vm_map))
- goto nextproc1;
-
- PROC_LOCK(p);
- if (p->p_lock != 1 || (p->p_flag & (P_STOPPED_SINGLE |
- P_TRACED | P_SYSTEM)) != 0)
- goto nextproc;
-
- /*
- * only aiod changes vmspace, however it will be
- * skipped because of the if statement above checking
- * for P_SYSTEM
- */
- if ((p->p_flag & (P_INMEM|P_SWAPPINGOUT|P_SWAPPINGIN)) != P_INMEM)
- goto nextproc;
-
- switch (p->p_state) {
- default:
- /* Don't swap out processes in any sort
- * of 'special' state. */
- break;
-
- case PRS_NORMAL:
- /*
- * do not swapout a realtime process
- * Check all the thread groups..
- */
- FOREACH_THREAD_IN_PROC(p, td) {
- thread_lock(td);
- if (PRI_IS_REALTIME(td->td_pri_class)) {
- thread_unlock(td);
- goto nextproc;
- }
- slptime = (ticks - td->td_slptick) / hz;
- /*
- * Guarantee swap_idle_threshold1
- * time in memory.
- */
- if (slptime < swap_idle_threshold1) {
- thread_unlock(td);
- goto nextproc;
- }
-
- /*
- * Do not swapout a process if it is
- * waiting on a critical event of some
- * kind or there is a thread whose
- * pageable memory may be accessed.
- *
- * This could be refined to support
- * swapping out a thread.
- */
- if (!thread_safetoswapout(td)) {
- thread_unlock(td);
- goto nextproc;
- }
- /*
- * If the system is under memory stress,
- * or if we are swapping
- * idle processes >= swap_idle_threshold2,
- * then swap the process out.
- */
- if (((action & VM_SWAP_NORMAL) == 0) &&
- (((action & VM_SWAP_IDLE) == 0) ||
- (slptime < swap_idle_threshold2))) {
- thread_unlock(td);
- goto nextproc;
- }
-
- if (minslptime > slptime)
- minslptime = slptime;
- thread_unlock(td);
- }
-
- /*
- * If the pageout daemon didn't free enough pages,
- * or if this process is idle and the system is
- * configured to swap proactively, swap it out.
- */
- if ((action & VM_SWAP_NORMAL) ||
- ((action & VM_SWAP_IDLE) &&
- (minslptime > swap_idle_threshold2))) {
- _PRELE(p);
- if (swapout(p) == 0)
- didswap++;
- PROC_UNLOCK(p);
- vm_map_unlock(&vm->vm_map);
- vmspace_free(vm);
- goto retry;
- }
- }
-nextproc:
- PROC_UNLOCK(p);
- vm_map_unlock(&vm->vm_map);
-nextproc1:
- vmspace_free(vm);
-nextproc2:
- sx_slock(&allproc_lock);
- PRELE(p);
- }
- sx_sunlock(&allproc_lock);
- /*
- * If we swapped something out, and another process needed memory,
- * then wakeup the sched process.
- */
- if (didswap)
- wakeup(&proc0);
-}
-
-static void
-swapclear(p)
- struct proc *p;
-{
- struct thread *td;
-
- PROC_LOCK_ASSERT(p, MA_OWNED);
-
- FOREACH_THREAD_IN_PROC(p, td) {
- thread_lock(td);
- td->td_flags |= TDF_INMEM;
- td->td_flags &= ~TDF_SWAPINREQ;
- TD_CLR_SWAPPED(td);
- if (TD_CAN_RUN(td))
- if (setrunnable(td)) {
-#ifdef INVARIANTS
- /*
- * XXX: We just cleared TDI_SWAPPED
- * above and set TDF_INMEM, so this
- * should never happen.
- */
- panic("not waking up swapper");
-#endif
- }
- thread_unlock(td);
- }
- p->p_flag &= ~(P_SWAPPINGIN|P_SWAPPINGOUT);
- p->p_flag |= P_INMEM;
-}
-
-static int
-swapout(p)
- struct proc *p;
-{
- struct thread *td;
-
- PROC_LOCK_ASSERT(p, MA_OWNED);
-#if defined(SWAP_DEBUG)
- printf("swapping out %d\n", p->p_pid);
-#endif
-
- /*
- * The states of this process and its threads may have changed
- * by now. Assuming that there is only one pageout daemon thread,
- * this process should still be in memory.
- */
- KASSERT((p->p_flag & (P_INMEM|P_SWAPPINGOUT|P_SWAPPINGIN)) == P_INMEM,
- ("swapout: lost a swapout race?"));
-
- /*
- * remember the process resident count
- */
- p->p_vmspace->vm_swrss = vmspace_resident_count(p->p_vmspace);
- /*
- * Check and mark all threads before we proceed.
- */
- p->p_flag &= ~P_INMEM;
- p->p_flag |= P_SWAPPINGOUT;
- FOREACH_THREAD_IN_PROC(p, td) {
- thread_lock(td);
- if (!thread_safetoswapout(td)) {
- thread_unlock(td);
- swapclear(p);
- return (EBUSY);
- }
- td->td_flags &= ~TDF_INMEM;
- TD_SET_SWAPPED(td);
- thread_unlock(td);
- }
- td = FIRST_THREAD_IN_PROC(p);
- ++td->td_ru.ru_nswap;
- PROC_UNLOCK(p);
-
- /*
- * This list is stable because all threads are now prevented from
- * running. The list is only modified in the context of a running
- * thread in this process.
- */
- FOREACH_THREAD_IN_PROC(p, td)
- vm_thread_swapout(td);
-
- PROC_LOCK(p);
- p->p_flag &= ~P_SWAPPINGOUT;
- p->p_swtick = ticks;
- return (0);
-}
-#endif /* !NO_SWAPPING */
diff --git a/sys/vm/vm_kern.c b/sys/vm/vm_kern.c
index e1c3db0..3778f76 100644
--- a/sys/vm/vm_kern.c
+++ b/sys/vm/vm_kern.c
@@ -121,8 +121,7 @@ SYSCTL_ULONG(_vm, OID_AUTO, max_kernel_address, CTLFLAG_RD,
* a mapping on demand through vm_fault() will result in a panic.
*/
vm_offset_t
-kva_alloc(size)
- vm_size_t size;
+kva_alloc(vm_size_t size)
{
vm_offset_t addr;
@@ -143,9 +142,7 @@ kva_alloc(size)
* This routine may not block on kernel maps.
*/
void
-kva_free(addr, size)
- vm_offset_t addr;
- vm_size_t size;
+kva_free(vm_offset_t addr, vm_size_t size)
{
size = round_page(size);
@@ -430,9 +427,7 @@ kmem_free(struct vmem *vmem, vm_offset_t addr, vm_size_t size)
* This routine may block.
*/
vm_offset_t
-kmap_alloc_wait(map, size)
- vm_map_t map;
- vm_size_t size;
+kmap_alloc_wait(vm_map_t map, vm_size_t size)
{
vm_offset_t addr;
@@ -470,10 +465,7 @@ kmap_alloc_wait(map, size)
* waiting for memory in that map.
*/
void
-kmap_free_wakeup(map, addr, size)
- vm_map_t map;
- vm_offset_t addr;
- vm_size_t size;
+kmap_free_wakeup(vm_map_t map, vm_offset_t addr, vm_size_t size)
{
vm_map_lock(map);
@@ -517,8 +509,7 @@ kmem_init_zero_region(void)
* `start' as allocated, and the range between `start' and `end' as free.
*/
void
-kmem_init(start, end)
- vm_offset_t start, end;
+kmem_init(vm_offset_t start, vm_offset_t end)
{
vm_map_t m;
diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c
index ce1696f..4a5eadf 100644
--- a/sys/vm/vm_map.c
+++ b/sys/vm/vm_map.c
@@ -1556,6 +1556,18 @@ again:
return (result);
}
+/*
+ * vm_map_find_min() is a variant of vm_map_find() that takes an
+ * additional parameter (min_addr) and treats the given address
+ * (*addr) differently. Specifically, it treats *addr as a hint
+ * and not as the minimum address where the mapping is created.
+ *
+ * This function works in two phases. First, it tries to
+ * allocate above the hint. If that fails and the hint is
+ * greater than min_addr, it performs a second pass, replacing
+ * the hint with min_addr as the minimum address for the
+ * allocation.
+ */
int
vm_map_find_min(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
vm_offset_t *addr, vm_size_t length, vm_offset_t min_addr,
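
(Sketch, not the committed body: the two-phase retry described in the comment above, assuming vm_map_find()'s usual KERN_NO_SPACE return when no range above the hint fits.)

	/* Hedged sketch of vm_map_find_min()'s structure. */
	hint = *addr;
	for (;;) {
		rv = vm_map_find(map, object, offset, addr, length, max_addr,
		    find_space, prot, max, cow);
		if (rv == KERN_SUCCESS || min_addr >= hint)
			return (rv);
		/* Second pass: restart the search from min_addr. */
		*addr = hint = min_addr;
	}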
@@ -1962,7 +1974,7 @@ vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
(pagesizes[p->psind] - 1)) == 0) {
mask = atop(pagesizes[p->psind]) - 1;
if (tmpidx + mask < psize &&
- vm_page_ps_is_valid(p)) {
+ vm_page_ps_test(p, PS_ALL_VALID, NULL)) {
p += mask;
threshold += mask;
}
@@ -3610,12 +3622,13 @@ vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
KASSERT(orient != (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP),
("bi-dir stack"));
- sgp = (vm_size_t)stack_guard_page * PAGE_SIZE;
if (addrbos < vm_map_min(map) ||
- addrbos > vm_map_max(map) ||
- addrbos + max_ssize < addrbos ||
- sgp >= max_ssize)
- return (KERN_NO_SPACE);
+ addrbos + max_ssize > vm_map_max(map) ||
+ addrbos + max_ssize <= addrbos)
+ return (KERN_INVALID_ADDRESS);
+ sgp = (vm_size_t)stack_guard_page * PAGE_SIZE;
+ if (sgp >= max_ssize)
+ return (KERN_INVALID_ARGUMENT);
init_ssize = growsize;
if (max_ssize < init_ssize + sgp)
diff --git a/sys/vm/vm_mmap.c b/sys/vm/vm_mmap.c
index 17631f9..258a96e 100644
--- a/sys/vm/vm_mmap.c
+++ b/sys/vm/vm_mmap.c
@@ -1196,7 +1196,7 @@ vm_mmap_vnode(struct thread *td, vm_size_t objsize,
{
struct vattr va;
vm_object_t obj;
- vm_offset_t foff;
+ vm_ooffset_t foff;
struct ucred *cred;
int error, flags, locktype;
diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c
index 5bf9e26..11e1f84 100644
--- a/sys/vm/vm_object.c
+++ b/sys/vm/vm_object.c
@@ -73,6 +73,7 @@ __FBSDID("$FreeBSD$");
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/kernel.h>
+#include <sys/pctrie.h>
#include <sys/sysctl.h>
#include <sys/mutex.h>
#include <sys/proc.h> /* for curproc, pageproc */
@@ -208,6 +209,7 @@ vm_object_zinit(void *mem, int size, int flags)
object->paging_in_progress = 0;
object->resident_page_count = 0;
object->shadow_count = 0;
+ object->flags = OBJ_DEAD;
mtx_lock(&vm_object_list_mtx);
TAILQ_INSERT_TAIL(&vm_object_list, object, object_list);
@@ -223,6 +225,16 @@ _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object)
LIST_INIT(&object->shadow_head);
object->type = type;
+ if (type == OBJT_SWAP)
+ pctrie_init(&object->un_pager.swp.swp_blks);
+
+ /*
+ * Ensure that swap_pager_swapoff() iteration over object_list
+ * sees up to date type and pctrie head if it observed
+ * non-dead object.
+ */
+ atomic_thread_fence_rel();
+
switch (type) {
case OBJT_DEAD:
panic("_vm_object_allocate: can't create OBJT_DEAD");
@@ -694,6 +706,89 @@ vm_object_destroy(vm_object_t object)
}
/*
+ * vm_object_terminate_pages removes any remaining pageable pages
+ * from the object and resets the object to an empty state.
+ */
+static void
+vm_object_terminate_pages(vm_object_t object)
+{
+ vm_page_t p, p_next;
+ struct mtx *mtx, *mtx1;
+ struct vm_pagequeue *pq, *pq1;
+
+ VM_OBJECT_ASSERT_WLOCKED(object);
+
+ mtx = NULL;
+ pq = NULL;
+
+ /*
+ * Free any remaining pageable pages. This also removes them from the
+ * paging queues. However, don't free wired pages, just remove them
+ * from the object. Rather than incrementally removing each page from
+ * the object, the page and object are reset to any empty state.
+ */
+ TAILQ_FOREACH_SAFE(p, &object->memq, listq, p_next) {
+ vm_page_assert_unbusied(p);
+ if ((object->flags & OBJ_UNMANAGED) == 0) {
+ /*
+ * vm_page_free_prep() only needs the page
+ * lock for managed pages.
+ */
+ mtx1 = vm_page_lockptr(p);
+ if (mtx1 != mtx) {
+ if (mtx != NULL)
+ mtx_unlock(mtx);
+ if (pq != NULL) {
+ vm_pagequeue_unlock(pq);
+ pq = NULL;
+ }
+ mtx = mtx1;
+ mtx_lock(mtx);
+ }
+ }
+ p->object = NULL;
+ if (p->wire_count != 0)
+ goto unlist;
+ PCPU_INC(cnt.v_pfree);
+ p->flags &= ~PG_ZERO;
+ if (p->queue != PQ_NONE) {
+ KASSERT(p->queue < PQ_COUNT, ("vm_object_terminate: "
+ "page %p is not queued", p));
+ pq1 = vm_page_pagequeue(p);
+ if (pq != pq1) {
+ if (pq != NULL)
+ vm_pagequeue_unlock(pq);
+ pq = pq1;
+ vm_pagequeue_lock(pq);
+ }
+ }
+ if (vm_page_free_prep(p, true))
+ continue;
+unlist:
+ TAILQ_REMOVE(&object->memq, p, listq);
+ }
+ if (pq != NULL)
+ vm_pagequeue_unlock(pq);
+ if (mtx != NULL)
+ mtx_unlock(mtx);
+
+ vm_page_free_phys_pglist(&object->memq);
+
+ /*
+ * If the object contained any pages, then reset it to an empty state.
+ * None of the object's fields, including "resident_page_count", were
+ * modified by the preceding loop.
+ */
+ if (object->resident_page_count != 0) {
+ vm_radix_reclaim_allnodes(&object->rtree);
+ TAILQ_INIT(&object->memq);
+ object->resident_page_count = 0;
+ if (object->type == OBJT_VNODE)
+ vdrop(object->handle);
+ }
+}
+
+/*
* vm_object_terminate actually destroys the specified object, freeing
* up all previously used resources.
*
@@ -703,7 +798,6 @@ vm_object_destroy(vm_object_t object)
void
vm_object_terminate(vm_object_t object)
{
- vm_page_t p, p_next;
VM_OBJECT_ASSERT_WLOCKED(object);
@@ -746,41 +840,8 @@ vm_object_terminate(vm_object_t object)
("vm_object_terminate: object with references, ref_count=%d",
object->ref_count));
- /*
- * Free any remaining pageable pages. This also removes them from the
- * paging queues. However, don't free wired pages, just remove them
- * from the object. Rather than incrementally removing each page from
- * the object, the page and object are reset to any empty state.
- */
- TAILQ_FOREACH_SAFE(p, &object->memq, listq, p_next) {
- vm_page_assert_unbusied(p);
- vm_page_lock(p);
- /*
- * Optimize the page's removal from the object by resetting
- * its "object" field. Specifically, if the page is not
- * wired, then the effect of this assignment is that
- * vm_page_free()'s call to vm_page_remove() will return
- * immediately without modifying the page or the object.
- */
- p->object = NULL;
- if (p->wire_count == 0) {
- vm_page_free(p);
- PCPU_INC(cnt.v_pfree);
- }
- vm_page_unlock(p);
- }
- /*
- * If the object contained any pages, then reset it to an empty state.
- * None of the object's fields, including "resident_page_count", were
- * modified by the preceding loop.
- */
- if (object->resident_page_count != 0) {
- vm_radix_reclaim_allnodes(&object->rtree);
- TAILQ_INIT(&object->memq);
- object->resident_page_count = 0;
- if (object->type == OBJT_VNODE)
- vdrop(object->handle);
- }
+ if ((object->flags & OBJ_PG_DTOR) == 0)
+ vm_object_terminate_pages(object);
#if VM_NRESERVLEVEL > 0
if (__predict_false(!LIST_EMPTY(&object->rvq)))
@@ -1022,8 +1083,8 @@ vm_object_sync(vm_object_t object, vm_ooffset_t offset, vm_size_t size,
* I/O.
*/
if (object->type == OBJT_VNODE &&
- (object->flags & OBJ_MIGHTBEDIRTY) != 0) {
- vp = object->handle;
+ (object->flags & OBJ_MIGHTBEDIRTY) != 0 &&
+ ((vp = object->handle)->v_vflag & VV_NOSYNC) == 0) {
VM_OBJECT_WUNLOCK(object);
(void) vn_start_write(vp, &mp, V_WAIT);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
@@ -1898,6 +1959,8 @@ vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
int options)
{
vm_page_t p, next;
+ struct mtx *mtx;
+ struct pglist pgl;
VM_OBJECT_ASSERT_WLOCKED(object);
KASSERT((object->flags & OBJ_UNMANAGED) == 0 ||
@@ -1906,8 +1969,10 @@ vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
if (object->resident_page_count == 0)
return;
vm_object_pip_add(object, 1);
+ TAILQ_INIT(&pgl);
again:
p = vm_page_find_least(object, start);
+ mtx = NULL;
/*
* Here, the variable "p" is either (1) the page with the least pindex
@@ -1924,7 +1989,7 @@ again:
* however, be invalidated if the option OBJPR_CLEANONLY is
* not specified.
*/
- vm_page_lock(p);
+ vm_page_change_lock(p, &mtx);
if (vm_page_xbusied(p)) {
VM_OBJECT_WUNLOCK(object);
vm_page_busy_sleep(p, "vmopax", true);
@@ -1932,13 +1997,14 @@ again:
goto again;
}
if (p->wire_count != 0) {
- if ((options & OBJPR_NOTMAPPED) == 0)
+ if ((options & OBJPR_NOTMAPPED) == 0 &&
+ object->ref_count != 0)
pmap_remove_all(p);
if ((options & OBJPR_CLEANONLY) == 0) {
p->valid = 0;
vm_page_undirty(p);
}
- goto next;
+ continue;
}
if (vm_page_busied(p)) {
VM_OBJECT_WUNLOCK(object);
@@ -1949,17 +2015,21 @@ again:
KASSERT((p->flags & PG_FICTITIOUS) == 0,
("vm_object_page_remove: page %p is fictitious", p));
if ((options & OBJPR_CLEANONLY) != 0 && p->valid != 0) {
- if ((options & OBJPR_NOTMAPPED) == 0)
+ if ((options & OBJPR_NOTMAPPED) == 0 &&
+ object->ref_count != 0)
pmap_remove_write(p);
- if (p->dirty)
- goto next;
+ if (p->dirty != 0)
+ continue;
}
- if ((options & OBJPR_NOTMAPPED) == 0)
+ if ((options & OBJPR_NOTMAPPED) == 0 && object->ref_count != 0)
pmap_remove_all(p);
- vm_page_free(p);
-next:
- vm_page_unlock(p);
+ p->flags &= ~PG_ZERO;
+ if (vm_page_free_prep(p, false))
+ TAILQ_INSERT_TAIL(&pgl, p, listq);
}
+ if (mtx != NULL)
+ mtx_unlock(mtx);
+ vm_page_free_phys_pglist(&pgl);
vm_object_pip_wakeup(object);
}
@@ -1982,7 +2052,7 @@ next:
void
vm_object_page_noreuse(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
{
- struct mtx *mtx, *new_mtx;
+ struct mtx *mtx;
vm_page_t p, next;
VM_OBJECT_ASSERT_LOCKED(object);
@@ -1999,17 +2069,7 @@ vm_object_page_noreuse(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
mtx = NULL;
for (; p != NULL && (p->pindex < end || end == 0); p = next) {
next = TAILQ_NEXT(p, listq);
-
- /*
- * Avoid releasing and reacquiring the same page lock.
- */
- new_mtx = vm_page_lockptr(p);
- if (mtx != new_mtx) {
- if (mtx != NULL)
- mtx_unlock(mtx);
- mtx = new_mtx;
- mtx_lock(mtx);
- }
+ vm_page_change_lock(p, &mtx);
vm_page_deactivate_noreuse(p);
}
if (mtx != NULL)
diff --git a/sys/vm/vm_object.h b/sys/vm/vm_object.h
index 9b2192e..17a885e 100644
--- a/sys/vm/vm_object.h
+++ b/sys/vm/vm_object.h
@@ -70,6 +70,7 @@
#include <sys/queue.h>
#include <sys/_lock.h>
#include <sys/_mutex.h>
+#include <sys/_pctrie.h>
#include <sys/_rwlock.h>
#include <vm/_vm_radix.h>
@@ -86,12 +87,17 @@
*
*/
+#ifndef VM_PAGE_HAVE_PGLIST
+TAILQ_HEAD(pglist, vm_page);
+#define VM_PAGE_HAVE_PGLIST
+#endif
+
struct vm_object {
struct rwlock lock;
TAILQ_ENTRY(vm_object) object_list; /* list of all objects */
LIST_HEAD(, vm_object) shadow_head; /* objects that this is a shadow for */
LIST_ENTRY(vm_object) shadow_list; /* chain of shadow objects */
- TAILQ_HEAD(respgs, vm_page) memq; /* list of resident pages */
+ struct pglist memq; /* list of resident pages */
struct vm_radix rtree; /* root of the resident page radix trie*/
vm_pindex_t size; /* Object size */
int generation; /* generation ID */
@@ -151,13 +157,12 @@ struct vm_object {
* the handle changed and hash-chain
* invalid.
*
- * swp_bcount - number of swap 'swblock' metablocks, each
- * contains up to 16 swapblk assignments.
- * see vm/swap_pager.h
+ * swp_blks - pc-trie of the allocated swap blocks.
+ *
*/
struct {
void *swp_tmpfs;
- int swp_bcount;
+ struct pctrie swp_blks;
} swp;
} un_pager;
struct ucred *cred;
@@ -171,11 +176,12 @@ struct vm_object {
#define OBJ_FICTITIOUS 0x0001 /* (c) contains fictitious pages */
#define OBJ_UNMANAGED 0x0002 /* (c) contains unmanaged pages */
#define OBJ_POPULATE 0x0004 /* pager implements populate() */
-#define OBJ_DEAD 0x0008 /* dead objects (during rundown) */
+#define OBJ_DEAD 0x0008 /* dead objects (during rundown) */
#define OBJ_NOSPLIT 0x0010 /* dont split this object */
#define OBJ_UMTXDEAD 0x0020 /* umtx pshared was terminated */
-#define OBJ_PIPWNT 0x0040 /* paging in progress wanted */
-#define OBJ_MIGHTBEDIRTY 0x0100 /* object might be dirty, only for vnode */
+#define OBJ_PIPWNT 0x0040 /* paging in progress wanted */
+#define OBJ_PG_DTOR 0x0080 /* dont reset object, leave that for dtor */
+#define OBJ_MIGHTBEDIRTY 0x0100 /* object might be dirty, only for vnode */
#define OBJ_TMPFS_NODE 0x0200 /* object belongs to tmpfs VREG node */
#define OBJ_TMPFS_DIRTY 0x0400 /* dirty tmpfs obj */
#define OBJ_COLORED 0x1000 /* pg_color is defined */
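
The new OBJ_PG_DTOR flag defers resident-page cleanup from vm_object_terminate() to the object's own destructor (see the check added before vm_object_terminate_pages() above). A minimal sketch of how a pager that manages its own pages might opt in; the helper name is hypothetical, while vm_object_set_flag() is the existing setter and requires the object write lock:

static void
example_defer_page_cleanup(vm_object_t object)
{

	/*
	 * Hypothetical helper: mark the object so that
	 * vm_object_terminate() skips vm_object_terminate_pages() and
	 * leaves the resident pages to the pager's own destructor.
	 */
	VM_OBJECT_WLOCK(object);
	vm_object_set_flag(object, OBJ_PG_DTOR);
	VM_OBJECT_WUNLOCK(object);
}
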
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index d8a9c21..16dc868 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -158,6 +158,7 @@ static uma_zone_t fakepg_zone;
static void vm_page_alloc_check(vm_page_t m);
static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits);
static void vm_page_enqueue(uint8_t queue, vm_page_t m);
+static void vm_page_free_phys(vm_page_t m);
static void vm_page_free_wakeup(void);
static void vm_page_init_fakepg(void *dummy);
static int vm_page_insert_after(vm_page_t m, vm_object_t object,
@@ -407,6 +408,29 @@ vm_page_domain_init(struct vm_domain *vmd)
}
/*
+ * Initialize a physical page in preparation for adding it to the free
+ * lists.
+ */
+static void
+vm_page_init_page(vm_page_t m, vm_paddr_t pa, int segind)
+{
+
+ m->object = NULL;
+ m->wire_count = 0;
+ m->busy_lock = VPB_UNBUSIED;
+ m->hold_count = 0;
+ m->flags = 0;
+ m->phys_addr = pa;
+ m->queue = PQ_NONE;
+ m->psind = 0;
+ m->segind = segind;
+ m->order = VM_NFREEORDER;
+ m->pool = VM_FREEPOOL_DEFAULT;
+ m->valid = m->dirty = 0;
+ pmap_page_init(m);
+}
+
+/*
* vm_page_startup:
*
* Initializes the resident memory module. Allocates physical memory for
@@ -417,17 +441,15 @@ vm_page_domain_init(struct vm_domain *vmd)
vm_offset_t
vm_page_startup(vm_offset_t vaddr)
{
- vm_offset_t mapped;
- vm_paddr_t high_avail, low_avail, page_range, size;
- vm_paddr_t new_end;
- int i;
- vm_paddr_t pa;
- vm_paddr_t last_pa;
+ struct vm_domain *vmd;
+ struct vm_phys_seg *seg;
+ vm_page_t m;
char *list, *listend;
- vm_paddr_t end;
- vm_paddr_t biggestsize;
- int biggestone;
- int pages_per_zone;
+ vm_offset_t mapped;
+ vm_paddr_t end, high_avail, low_avail, new_end, page_range, size;
+ vm_paddr_t biggestsize, last_pa, pa;
+ u_long pagecount;
+ int biggestone, i, pages_per_zone, segind;
biggestsize = 0;
biggestone = 0;
@@ -463,7 +485,8 @@ vm_page_startup(vm_offset_t vaddr)
* in proportion to the zone structure size.
*/
pages_per_zone = howmany(sizeof(struct uma_zone) +
- sizeof(struct uma_cache) * (mp_maxid + 1), UMA_SLAB_SIZE);
+ sizeof(struct uma_cache) * (mp_maxid + 1) +
+ roundup2(sizeof(struct uma_slab), sizeof(void *)), UMA_SLAB_SIZE);
if (pages_per_zone > 1) {
/* Reserve more pages so that we don't run out. */
boot_pages = UMA_BOOT_PAGES_ZONES * pages_per_zone;
@@ -507,6 +530,8 @@ vm_page_startup(vm_offset_t vaddr)
vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end,
new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE);
bzero((void *)vm_page_dump, vm_page_dump_size);
+#else
+ (void)last_pa;
#endif
#if defined(__aarch64__) || defined(__amd64__) || defined(__mips__)
/*
@@ -611,7 +636,9 @@ vm_page_startup(vm_offset_t vaddr)
new_end = trunc_page(end - page_range * sizeof(struct vm_page));
mapped = pmap_map(&vaddr, new_end, end,
VM_PROT_READ | VM_PROT_WRITE);
- vm_page_array = (vm_page_t) mapped;
+ vm_page_array = (vm_page_t)mapped;
+ vm_page_array_size = page_range;
+
#if VM_NRESERVLEVEL > 0
/*
* Allocate physical memory for the reservation management system's
@@ -638,33 +665,53 @@ vm_page_startup(vm_offset_t vaddr)
vm_phys_add_seg(phys_avail[i], phys_avail[i + 1]);
/*
- * Clear all of the page structures
- */
- bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page));
- for (i = 0; i < page_range; i++)
- vm_page_array[i].order = VM_NFREEORDER;
- vm_page_array_size = page_range;
-
- /*
* Initialize the physical memory allocator.
*/
vm_phys_init();
/*
- * Add every available physical page that is not blacklisted to
- * the free lists.
+ * Initialize the page structures and add every available page to the
+ * physical memory allocator's free lists.
*/
vm_cnt.v_page_count = 0;
vm_cnt.v_free_count = 0;
- for (i = 0; phys_avail[i + 1] != 0; i += 2) {
- pa = phys_avail[i];
- last_pa = phys_avail[i + 1];
- while (pa < last_pa) {
- vm_phys_add_page(pa);
- pa += PAGE_SIZE;
+ for (segind = 0; segind < vm_phys_nsegs; segind++) {
+ seg = &vm_phys_segs[segind];
+ for (m = seg->first_page, pa = seg->start; pa < seg->end;
+ m++, pa += PAGE_SIZE)
+ vm_page_init_page(m, pa, segind);
+
+ /*
+ * Add the segment to the free lists only if it is covered by
+ * one of the ranges in phys_avail. Because we've added the
+ * ranges to the vm_phys_segs array, we can assume that each
+ * segment is either entirely contained in one of the ranges,
+ * or doesn't overlap any of them.
+ */
+ for (i = 0; phys_avail[i + 1] != 0; i += 2) {
+ if (seg->start < phys_avail[i] ||
+ seg->end > phys_avail[i + 1])
+ continue;
+
+ m = seg->first_page;
+ pagecount = (u_long)atop(seg->end - seg->start);
+
+ mtx_lock(&vm_page_queue_free_mtx);
+ vm_phys_free_contig(m, pagecount);
+ vm_phys_freecnt_adj(m, (int)pagecount);
+ mtx_unlock(&vm_page_queue_free_mtx);
+ vm_cnt.v_page_count += (u_int)pagecount;
+
+ vmd = &vm_dom[seg->domain];
+ vmd->vmd_page_count += (u_int)pagecount;
+ vmd->vmd_segs |= 1UL << m->segind;
+ break;
}
}
+ /*
+ * Remove blacklisted pages from the physical memory allocator.
+ */
TAILQ_INIT(&blacklist_head);
vm_page_blacklist_load(&list, &listend);
vm_page_blacklist_check(list, listend);
@@ -905,6 +952,23 @@ vm_page_flash(vm_page_t m)
}
/*
+ * Avoid releasing and reacquiring the same page lock.
+ */
+void
+vm_page_change_lock(vm_page_t m, struct mtx **mtx)
+{
+ struct mtx *mtx1;
+
+ mtx1 = vm_page_lockptr(m);
+ if (*mtx == mtx1)
+ return;
+ if (*mtx != NULL)
+ mtx_unlock(*mtx);
+ *mtx = mtx1;
+ mtx_lock(mtx1);
+}
+
+/*
* Keep page from being freed by the page daemon
* much of the same effect as wiring, except much lower
* overhead and should be used only for *very* temporary
@@ -937,20 +1001,11 @@ vm_page_unhold(vm_page_t mem)
void
vm_page_unhold_pages(vm_page_t *ma, int count)
{
- struct mtx *mtx, *new_mtx;
+ struct mtx *mtx;
mtx = NULL;
for (; count != 0; count--) {
- /*
- * Avoid releasing and reacquiring the same page lock.
- */
- new_mtx = vm_page_lockptr(*ma);
- if (mtx != new_mtx) {
- if (mtx != NULL)
- mtx_unlock(mtx);
- mtx = new_mtx;
- mtx_lock(mtx);
- }
+ vm_page_change_lock(*ma, &mtx);
vm_page_unhold(*ma);
ma++;
}
@@ -1989,7 +2044,7 @@ vm_page_t
vm_page_scan_contig(u_long npages, vm_page_t m_start, vm_page_t m_end,
u_long alignment, vm_paddr_t boundary, int options)
{
- struct mtx *m_mtx, *new_mtx;
+ struct mtx *m_mtx;
vm_object_t object;
vm_paddr_t pa;
vm_page_t m, m_run;
@@ -2005,8 +2060,10 @@ vm_page_scan_contig(u_long npages, vm_page_t m_start, vm_page_t m_end,
run_len = 0;
m_mtx = NULL;
for (m = m_start; m < m_end && run_len < npages; m += m_inc) {
- KASSERT((m->flags & (PG_FICTITIOUS | PG_MARKER)) == 0,
- ("page %p is PG_FICTITIOUS or PG_MARKER", m));
+ KASSERT((m->flags & PG_MARKER) == 0,
+ ("page %p is PG_MARKER", m));
+ KASSERT((m->flags & PG_FICTITIOUS) == 0 || m->wire_count == 1,
+ ("fictitious page %p has invalid wire count", m));
/*
* If the current page would be the start of a run, check its
@@ -2032,16 +2089,7 @@ vm_page_scan_contig(u_long npages, vm_page_t m_start, vm_page_t m_end,
} else
KASSERT(m_run != NULL, ("m_run == NULL"));
- /*
- * Avoid releasing and reacquiring the same page lock.
- */
- new_mtx = vm_page_lockptr(m);
- if (m_mtx != new_mtx) {
- if (m_mtx != NULL)
- mtx_unlock(m_mtx);
- m_mtx = new_mtx;
- mtx_lock(m_mtx);
- }
+ vm_page_change_lock(m, &m_mtx);
m_inc = 1;
retry:
if (m->wire_count != 0 || m->hold_count != 0)
@@ -2191,7 +2239,7 @@ static int
vm_page_reclaim_run(int req_class, u_long npages, vm_page_t m_run,
vm_paddr_t high)
{
- struct mtx *m_mtx, *new_mtx;
+ struct mtx *m_mtx;
struct spglist free;
vm_object_t object;
vm_paddr_t pa;
@@ -2212,13 +2260,7 @@ vm_page_reclaim_run(int req_class, u_long npages, vm_page_t m_run,
/*
* Avoid releasing and reacquiring the same page lock.
*/
- new_mtx = vm_page_lockptr(m);
- if (m_mtx != new_mtx) {
- if (m_mtx != NULL)
- mtx_unlock(m_mtx);
- m_mtx = new_mtx;
- mtx_lock(m_mtx);
- }
+ vm_page_change_lock(m, &m_mtx);
retry:
if (m->wire_count != 0 || m->hold_count != 0)
error = EBUSY;
@@ -2331,12 +2373,7 @@ retry:
* The new page must be deactivated
* before the object is unlocked.
*/
- new_mtx = vm_page_lockptr(m_new);
- if (m_mtx != new_mtx) {
- mtx_unlock(m_mtx);
- m_mtx = new_mtx;
- mtx_lock(m_mtx);
- }
+ vm_page_change_lock(m_new, &m_mtx);
vm_page_deactivate(m_new);
} else {
m->flags &= ~PG_ZERO;
@@ -2379,13 +2416,7 @@ unlock:
mtx_lock(&vm_page_queue_free_mtx);
do {
SLIST_REMOVE_HEAD(&free, plinks.s.ss);
- vm_phys_freecnt_adj(m, 1);
-#if VM_NRESERVLEVEL > 0
- if (!vm_reserv_free_page(m))
-#else
- if (true)
-#endif
- vm_phys_free_pages(m, 0);
+ vm_page_free_phys(m);
} while ((m = SLIST_FIRST(&free)) != NULL);
vm_page_zero_idle_wakeup();
vm_page_free_wakeup();
@@ -2722,7 +2753,7 @@ vm_page_activate(vm_page_t m)
*
* The page queues must be locked.
*/
-static inline void
+static void
vm_page_free_wakeup(void)
{
@@ -2748,17 +2779,30 @@ vm_page_free_wakeup(void)
}
/*
- * vm_page_free_toq:
+ * vm_page_free_prep:
*
- * Returns the given page to the free list,
- * disassociating it with any VM object.
+ * Prepares the given page to be put on the free list,
+ * disassociating it from any VM object. The caller may return
+ * the page to the free list only if this function returns true.
*
- * The object must be locked. The page must be locked if it is managed.
+ * The object must be locked. The page must be locked if it is
+ * managed. For a queued managed page, the pagequeue_locked
+ * argument specifies whether the page queue is already locked.
*/
-void
-vm_page_free_toq(vm_page_t m)
+bool
+vm_page_free_prep(vm_page_t m, bool pagequeue_locked)
{
+#if defined(DIAGNOSTIC) && defined(PHYS_TO_DMAP)
+ if ((m->flags & PG_ZERO) != 0) {
+ uint64_t *p;
+ int i;
+ p = (uint64_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
+ for (i = 0; i < PAGE_SIZE / sizeof(uint64_t); i++, p++)
+ KASSERT(*p == 0, ("vm_page_free_prep %p PG_ZERO %d %jx",
+ m, i, (uintmax_t)*p));
+ }
+#endif
if ((m->oflags & VPO_UNMANAGED) == 0) {
vm_page_lock_assert(m, MA_OWNED);
KASSERT(!pmap_page_is_mapped(m),
@@ -2777,16 +2821,20 @@ vm_page_free_toq(vm_page_t m)
* callback routine until after we've put the page on the
* appropriate free queue.
*/
- vm_page_remque(m);
+ if (m->queue != PQ_NONE) {
+ if (pagequeue_locked)
+ vm_page_dequeue_locked(m);
+ else
+ vm_page_dequeue(m);
+ }
vm_page_remove(m);
/*
* If fictitious remove object association and
* return, otherwise delay object association removal.
*/
- if ((m->flags & PG_FICTITIOUS) != 0) {
- return;
- }
+ if ((m->flags & PG_FICTITIOUS) != 0)
+ return (false);
m->valid = 0;
vm_page_undirty(m);
@@ -2798,32 +2846,72 @@ vm_page_free_toq(vm_page_t m)
KASSERT((m->flags & PG_UNHOLDFREE) == 0,
("vm_page_free: freeing PG_UNHOLDFREE page %p", m));
m->flags |= PG_UNHOLDFREE;
- } else {
- /*
- * Restore the default memory attribute to the page.
- */
- if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT)
- pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT);
+ return (false);
+ }
- /*
- * Insert the page into the physical memory allocator's free
- * page queues.
- */
- mtx_lock(&vm_page_queue_free_mtx);
- vm_phys_freecnt_adj(m, 1);
+ /*
+ * Restore the default memory attribute to the page.
+ */
+ if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT)
+ pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT);
+
+ return (true);
+}
+
+/*
+ * Insert the page into the physical memory allocator's free page
+ * queues. This is the last step to free a page.
+ */
+static void
+vm_page_free_phys(vm_page_t m)
+{
+
+ mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+
+ vm_phys_freecnt_adj(m, 1);
#if VM_NRESERVLEVEL > 0
- if (!vm_reserv_free_page(m))
-#else
- if (TRUE)
+ if (!vm_reserv_free_page(m))
#endif
vm_phys_free_pages(m, 0);
- if ((m->flags & PG_ZERO) != 0)
- ++vm_page_zero_count;
- else
- vm_page_zero_idle_wakeup();
- vm_page_free_wakeup();
- mtx_unlock(&vm_page_queue_free_mtx);
- }
+ if ((m->flags & PG_ZERO) != 0)
+ ++vm_page_zero_count;
+ else
+ vm_page_zero_idle_wakeup();
+}
+
+void
+vm_page_free_phys_pglist(struct pglist *tq)
+{
+ vm_page_t m;
+
+ if (TAILQ_EMPTY(tq))
+ return;
+ mtx_lock(&vm_page_queue_free_mtx);
+ TAILQ_FOREACH(m, tq, listq)
+ vm_page_free_phys(m);
+ vm_page_free_wakeup();
+ mtx_unlock(&vm_page_queue_free_mtx);
+}
+
+/*
+ * vm_page_free_toq:
+ *
+ * Returns the given page to the free list, disassociating it
+ * from any VM object.
+ *
+ * The object must be locked. The page must be locked if it is
+ * managed.
+ */
+void
+vm_page_free_toq(vm_page_t m)
+{
+
+ if (!vm_page_free_prep(m, false))
+ return;
+ mtx_lock(&vm_page_queue_free_mtx);
+ vm_page_free_phys(m);
+ vm_page_free_wakeup();
+ mtx_unlock(&vm_page_queue_free_mtx);
}
/*
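
Together, vm_page_free_prep() and vm_page_free_phys_pglist() let a caller batch page frees so that the free-queue mutex is taken once per batch rather than once per page, as vm_object_page_remove() now does above. A minimal sketch, assuming managed pages that are already unmapped, unwired, and unbusied, with the object write-locked; the helper name is hypothetical:

static void
example_free_prepared(vm_object_t object, vm_page_t *ma, int count)
{
	struct pglist pgl;
	struct mtx *mtx;
	vm_page_t m;
	int i;

	VM_OBJECT_ASSERT_WLOCKED(object);
	TAILQ_INIT(&pgl);
	mtx = NULL;
	for (i = 0; i < count; i++) {
		m = ma[i];
		/* Switch page locks lazily, as in vm_object_page_remove(). */
		vm_page_change_lock(m, &mtx);
		/* The contents are not known to be zero. */
		m->flags &= ~PG_ZERO;
		if (vm_page_free_prep(m, false))
			TAILQ_INSERT_TAIL(&pgl, m, listq);
	}
	if (mtx != NULL)
		mtx_unlock(mtx);
	/* A single free-queue lock round trip frees the whole batch. */
	vm_page_free_phys_pglist(&pgl);
}
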
@@ -3006,23 +3094,25 @@ vm_page_launder(vm_page_t m)
* vm_page_try_to_free()
*
* Attempt to free the page. If we cannot free it, we do nothing.
- * 1 is returned on success, 0 on failure.
+ * true is returned on success, false on failure.
*/
-int
+bool
vm_page_try_to_free(vm_page_t m)
{
- vm_page_lock_assert(m, MA_OWNED);
+ vm_page_assert_locked(m);
if (m->object != NULL)
VM_OBJECT_ASSERT_WLOCKED(m->object);
- if (m->dirty || m->hold_count || m->wire_count ||
+ if (m->dirty != 0 || m->hold_count != 0 || m->wire_count != 0 ||
(m->oflags & VPO_UNMANAGED) != 0 || vm_page_busied(m))
- return (0);
- pmap_remove_all(m);
- if (m->dirty)
- return (0);
+ return (false);
+ if (m->object != NULL && m->object->ref_count != 0) {
+ pmap_remove_all(m);
+ if (m->dirty != 0)
+ return (false);
+ }
vm_page_free(m);
- return (1);
+ return (true);
}
/*
@@ -3142,6 +3232,107 @@ retrylookup:
}
/*
+ * Return the specified range of pages from the given object. For each
+ * page offset within the range, if a page already exists within the object
+ * at that offset and it is busy, then wait for it to change state. If,
+ * instead, the page doesn't exist, then allocate it.
+ *
+ * The caller must always specify an allocation class.
+ *
+ * allocation classes:
+ * VM_ALLOC_NORMAL normal process request
+ * VM_ALLOC_SYSTEM system *really* needs the pages
+ *
+ * The caller must always specify that the pages are to be busied and/or
+ * wired.
+ *
+ * optional allocation flags:
+ * VM_ALLOC_IGN_SBUSY do not sleep on soft busy pages
+ * VM_ALLOC_NOBUSY do not exclusive busy the page
+ * VM_ALLOC_NOWAIT do not sleep
+ * VM_ALLOC_SBUSY set page to sbusy state
+ * VM_ALLOC_WIRED wire the pages
+ * VM_ALLOC_ZERO zero and validate any invalid pages
+ *
+ * If VM_ALLOC_NOWAIT is not specified, this routine may sleep. Otherwise, it
+ * may return a partial prefix of the requested range.
+ */
+int
+vm_page_grab_pages(vm_object_t object, vm_pindex_t pindex, int allocflags,
+ vm_page_t *ma, int count)
+{
+ vm_page_t m;
+ int i;
+ bool sleep;
+
+ VM_OBJECT_ASSERT_WLOCKED(object);
+ KASSERT(((u_int)allocflags >> VM_ALLOC_COUNT_SHIFT) == 0,
+ ("vm_page_grap_pages: VM_ALLOC_COUNT() is not allowed"));
+ KASSERT((allocflags & VM_ALLOC_NOBUSY) == 0 ||
+ (allocflags & VM_ALLOC_WIRED) != 0,
+ ("vm_page_grab_pages: the pages must be busied or wired"));
+ KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 ||
+ (allocflags & VM_ALLOC_IGN_SBUSY) != 0,
+ ("vm_page_grab_pages: VM_ALLOC_SBUSY/IGN_SBUSY mismatch"));
+ if (count == 0)
+ return (0);
+ i = 0;
+retrylookup:
+ m = vm_page_lookup(object, pindex + i);
+ for (; i < count; i++) {
+ if (m != NULL) {
+ sleep = (allocflags & VM_ALLOC_IGN_SBUSY) != 0 ?
+ vm_page_xbusied(m) : vm_page_busied(m);
+ if (sleep) {
+ if ((allocflags & VM_ALLOC_NOWAIT) != 0)
+ break;
+ /*
+ * Reference the page before unlocking and
+ * sleeping so that the page daemon is less
+ * likely to reclaim it.
+ */
+ vm_page_aflag_set(m, PGA_REFERENCED);
+ vm_page_lock(m);
+ VM_OBJECT_WUNLOCK(object);
+ vm_page_busy_sleep(m, "grbmaw", (allocflags &
+ VM_ALLOC_IGN_SBUSY) != 0);
+ VM_OBJECT_WLOCK(object);
+ goto retrylookup;
+ }
+ if ((allocflags & VM_ALLOC_WIRED) != 0) {
+ vm_page_lock(m);
+ vm_page_wire(m);
+ vm_page_unlock(m);
+ }
+ if ((allocflags & (VM_ALLOC_NOBUSY |
+ VM_ALLOC_SBUSY)) == 0)
+ vm_page_xbusy(m);
+ if ((allocflags & VM_ALLOC_SBUSY) != 0)
+ vm_page_sbusy(m);
+ } else {
+ m = vm_page_alloc(object, pindex + i, (allocflags &
+ ~VM_ALLOC_IGN_SBUSY) | VM_ALLOC_COUNT(count - i));
+ if (m == NULL) {
+ if ((allocflags & VM_ALLOC_NOWAIT) != 0)
+ break;
+ VM_OBJECT_WUNLOCK(object);
+ VM_WAIT;
+ VM_OBJECT_WLOCK(object);
+ goto retrylookup;
+ }
+ }
+ if (m->valid == 0 && (allocflags & VM_ALLOC_ZERO) != 0) {
+ if ((m->flags & PG_ZERO) == 0)
+ pmap_zero_page(m);
+ m->valid = VM_PAGE_BITS_ALL;
+ }
+ ma[i] = m;
+ m = vm_page_next(m);
+ }
+ return (i);
+}
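
A hedged usage sketch for vm_page_grab_pages(): grab a run of wired, zero-filled pages without leaving them busied, use them, and then unwire them. The helper name is hypothetical; because VM_ALLOC_NOWAIT is not passed, the call sleeps as needed and returns the full count:

static void
example_grab_wired_run(vm_object_t object, vm_pindex_t pindex, vm_page_t *ma,
    int count)
{
	int i;

	VM_OBJECT_WLOCK(object);
	/* NOBUSY requires WIRED; ZERO zeroes and validates invalid pages. */
	(void)vm_page_grab_pages(object, pindex, VM_ALLOC_NORMAL |
	    VM_ALLOC_WIRED | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO, ma, count);
	VM_OBJECT_WUNLOCK(object);

	/* ... access the wired, valid pages here ... */

	for (i = 0; i < count; i++) {
		vm_page_lock(ma[i]);
		(void)vm_page_unwire(ma[i], PQ_ACTIVE);
		vm_page_unlock(ma[i]);
	}
}
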
+
+/*
* Mapping function for valid or dirty bits in a page.
*
* Inputs are required to range within a page.
@@ -3459,16 +3650,17 @@ vm_page_is_valid(vm_page_t m, int base, int size)
}
/*
- * vm_page_ps_is_valid:
- *
- * Returns TRUE if the entire (super)page is valid and FALSE otherwise.
+ * Returns true if all of the specified predicates are true for the entire
+ * (super)page and false otherwise.
*/
-boolean_t
-vm_page_ps_is_valid(vm_page_t m)
+bool
+vm_page_ps_test(vm_page_t m, int flags, vm_page_t skip_m)
{
+ vm_object_t object;
int i, npages;
- VM_OBJECT_ASSERT_LOCKED(m->object);
+ object = m->object;
+ VM_OBJECT_ASSERT_LOCKED(object);
npages = atop(pagesizes[m->psind]);
/*
@@ -3477,10 +3669,28 @@ vm_page_ps_is_valid(vm_page_t m)
* occupy adjacent entries in vm_page_array[].
*/
for (i = 0; i < npages; i++) {
- if (m[i].valid != VM_PAGE_BITS_ALL)
- return (FALSE);
+ /* Always test object consistency, including "skip_m". */
+ if (m[i].object != object)
+ return (false);
+ if (&m[i] == skip_m)
+ continue;
+ if ((flags & PS_NONE_BUSY) != 0 && vm_page_busied(&m[i]))
+ return (false);
+ if ((flags & PS_ALL_DIRTY) != 0) {
+ /*
+ * Calling vm_page_test_dirty() or pmap_is_modified()
+ * might stop this case from spuriously returning
+ * "false". However, that would require a write lock
+ * on the object containing "m[i]".
+ */
+ if (m[i].dirty != VM_PAGE_BITS_ALL)
+ return (false);
+ }
+ if ((flags & PS_ALL_VALID) != 0 &&
+ m[i].valid != VM_PAGE_BITS_ALL)
+ return (false);
}
- return (TRUE);
+ return (true);
}
/*
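
A hedged sketch of how a caller holding one exclusively busied base page might use vm_page_ps_test() and its "skip_m" argument: every other base page of the superpage must be valid and unbusied, while the caller's own page is exempted from the busy check. The helper name is hypothetical:

static bool
example_superpage_ready(vm_page_t m_super, vm_page_t m_busied)
{

	VM_OBJECT_ASSERT_LOCKED(m_super->object);
	/*
	 * "m_busied" is excl busied by the caller; exempt it from the
	 * PS_NONE_BUSY check while still requiring that it belong to the
	 * same object as the other base pages.
	 */
	return (vm_page_ps_test(m_super, PS_ALL_VALID | PS_NONE_BUSY,
	    m_busied));
}
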
diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h
index 1ee8dde..d27fe10 100644
--- a/sys/vm/vm_page.h
+++ b/sys/vm/vm_page.h
@@ -209,7 +209,10 @@ struct vm_page {
#define PQ_LAUNDRY 2
#define PQ_COUNT 3
+#ifndef VM_PAGE_HAVE_PGLIST
TAILQ_HEAD(pglist, vm_page);
+#define VM_PAGE_HAVE_PGLIST
+#endif
SLIST_HEAD(spglist, vm_page);
struct vm_pagequeue {
@@ -387,6 +390,9 @@ vm_page_t PHYS_TO_VM_PAGE(vm_paddr_t pa);
* vm_page_alloc_freelist(). Some functions support only a subset
* of the flags, and ignore others, see the flags legend.
*
+ * The meaning of VM_ALLOC_ZERO differs slightly between the vm_page_alloc*()
+ * and the vm_page_grab*() functions. See these functions for details.
+ *
* Bits 0 - 1 define class.
* Bits 2 - 15 dedicated for flags.
* Legend:
@@ -394,6 +400,7 @@ vm_page_t PHYS_TO_VM_PAGE(vm_paddr_t pa);
* (c) - vm_page_alloc_contig() supports the flag.
* (f) - vm_page_alloc_freelist() supports the flag.
* (g) - vm_page_grab() supports the flag.
+ * (p) - vm_page_grab_pages() supports the flag.
* Bits above 15 define the count of additional pages that the caller
* intends to allocate.
*/
@@ -401,16 +408,16 @@ vm_page_t PHYS_TO_VM_PAGE(vm_paddr_t pa);
#define VM_ALLOC_INTERRUPT 1
#define VM_ALLOC_SYSTEM 2
#define VM_ALLOC_CLASS_MASK 3
-#define VM_ALLOC_WIRED 0x0020 /* (acfg) Allocate non pageable page */
-#define VM_ALLOC_ZERO 0x0040 /* (acfg) Try to obtain a zeroed page */
+#define VM_ALLOC_WIRED 0x0020 /* (acfgp) Allocate a wired page */
+#define VM_ALLOC_ZERO 0x0040 /* (acfgp) Allocate a prezeroed page */
#define VM_ALLOC_NOOBJ 0x0100 /* (acg) No associated object */
-#define VM_ALLOC_NOBUSY 0x0200 /* (acg) Do not busy the page */
+#define VM_ALLOC_NOBUSY 0x0200 /* (acgp) Do not excl busy the page */
#define VM_ALLOC_IFCACHED 0x0400
#define VM_ALLOC_IFNOTCACHED 0x0800
-#define VM_ALLOC_IGN_SBUSY 0x1000 /* (g) Ignore shared busy flag */
+#define VM_ALLOC_IGN_SBUSY 0x1000 /* (gp) Ignore shared busy flag */
#define VM_ALLOC_NODUMP 0x2000 /* (ag) don't include in dump */
-#define VM_ALLOC_SBUSY 0x4000 /* (acg) Shared busy the page */
-#define VM_ALLOC_NOWAIT 0x8000 /* (g) Do not sleep, return NULL */
+#define VM_ALLOC_SBUSY 0x4000 /* (acgp) Shared busy the page */
+#define VM_ALLOC_NOWAIT 0x8000 /* (gp) Do not sleep */
#define VM_ALLOC_COUNT_SHIFT 16
#define VM_ALLOC_COUNT(count) ((count) << VM_ALLOC_COUNT_SHIFT)
@@ -433,6 +440,18 @@ malloc2vm_flags(int malloc_flags)
}
#endif
+/*
+ * Predicates supported by vm_page_ps_test():
+ *
+ * PS_ALL_DIRTY is true only if the entire (super)page is dirty.
+ * However, it can be spuriously false when the (super)page has become
+ * dirty in the pmap but that information has not been propagated to the
+ * machine-independent layer.
+ */
+#define PS_ALL_DIRTY 0x1
+#define PS_ALL_VALID 0x2
+#define PS_NONE_BUSY 0x4
+
void vm_page_busy_downgrade(vm_page_t m);
void vm_page_busy_sleep(vm_page_t m, const char *msg, bool nonshared);
void vm_page_flash(vm_page_t m);
@@ -448,13 +467,17 @@ vm_page_t vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req,
u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
vm_paddr_t boundary, vm_memattr_t memattr);
vm_page_t vm_page_alloc_freelist(int, int);
+void vm_page_change_lock(vm_page_t m, struct mtx **mtx);
vm_page_t vm_page_grab (vm_object_t, vm_pindex_t, int);
-int vm_page_try_to_free (vm_page_t);
+int vm_page_grab_pages(vm_object_t object, vm_pindex_t pindex, int allocflags,
+ vm_page_t *ma, int count);
void vm_page_deactivate (vm_page_t);
void vm_page_deactivate_noreuse(vm_page_t);
void vm_page_dequeue(vm_page_t m);
void vm_page_dequeue_locked(vm_page_t m);
vm_page_t vm_page_find_least(vm_object_t, vm_pindex_t);
+void vm_page_free_phys_pglist(struct pglist *tq);
+bool vm_page_free_prep(vm_page_t m, bool pagequeue_locked);
vm_page_t vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr);
void vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr);
int vm_page_insert (vm_page_t, vm_object_t, vm_pindex_t);
@@ -464,7 +487,7 @@ vm_page_t vm_page_next(vm_page_t m);
int vm_page_pa_tryrelock(pmap_t, vm_paddr_t, vm_paddr_t *);
struct vm_pagequeue *vm_page_pagequeue(vm_page_t m);
vm_page_t vm_page_prev(vm_page_t m);
-boolean_t vm_page_ps_is_valid(vm_page_t m);
+bool vm_page_ps_test(vm_page_t m, int flags, vm_page_t skip_m);
void vm_page_putfake(vm_page_t m);
void vm_page_readahead_finish(vm_page_t m);
bool vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low,
@@ -483,6 +506,7 @@ void vm_page_set_valid_range(vm_page_t m, int base, int size);
int vm_page_sleep_if_busy(vm_page_t m, const char *msg);
vm_offset_t vm_page_startup(vm_offset_t vaddr);
void vm_page_sunbusy(vm_page_t m);
+bool vm_page_try_to_free(vm_page_t m);
int vm_page_trysbusy(vm_page_t m);
void vm_page_unhold_pages(vm_page_t *ma, int count);
boolean_t vm_page_unwire(vm_page_t m, uint8_t queue);
diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c
index c9646cf..793e2e9 100644
--- a/sys/vm/vm_pageout.c
+++ b/sys/vm/vm_pageout.c
@@ -141,19 +141,6 @@ SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start,
SDT_PROVIDER_DEFINE(vm);
SDT_PROBE_DEFINE(vm, , , vm__lowmem_scan);
-#if !defined(NO_SWAPPING)
-/* the kernel process "vm_daemon"*/
-static void vm_daemon(void);
-static struct proc *vmproc;
-
-static struct kproc_desc vm_kp = {
- "vmdaemon",
- vm_daemon,
- &vmproc
-};
-SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp);
-#endif
-
/* Pagedaemon activity rates, in subdivisions of one second. */
#define VM_LAUNDER_RATE 10
#define VM_INACT_SCAN_RATE 2
@@ -171,26 +158,11 @@ static enum {
VM_LAUNDRY_SHORTFALL
} vm_laundry_request = VM_LAUNDRY_IDLE;
-#if !defined(NO_SWAPPING)
-static int vm_pageout_req_swapout; /* XXX */
-static int vm_daemon_needed;
-static struct mtx vm_daemon_mtx;
-/* Allow for use by vm_pageout before vm_daemon is initialized. */
-MTX_SYSINIT(vm_daemon, &vm_daemon_mtx, "vm daemon", MTX_DEF);
-#endif
static int vm_pageout_update_period;
static int disable_swap_pageouts;
static int lowmem_period = 10;
static time_t lowmem_uptime;
-#if defined(NO_SWAPPING)
-static int vm_swap_enabled = 0;
-static int vm_swap_idle_enabled = 0;
-#else
-static int vm_swap_enabled = 1;
-static int vm_swap_idle_enabled = 0;
-#endif
-
static int vm_panic_on_oom = 0;
SYSCTL_INT(_vm, OID_AUTO, panic_on_oom,
@@ -198,56 +170,44 @@ SYSCTL_INT(_vm, OID_AUTO, panic_on_oom,
"panic on out of memory instead of killing the largest process");
SYSCTL_INT(_vm, OID_AUTO, pageout_wakeup_thresh,
- CTLFLAG_RW, &vm_pageout_wakeup_thresh, 0,
+ CTLFLAG_RWTUN, &vm_pageout_wakeup_thresh, 0,
"free page threshold for waking up the pageout daemon");
SYSCTL_INT(_vm, OID_AUTO, pageout_update_period,
- CTLFLAG_RW, &vm_pageout_update_period, 0,
+ CTLFLAG_RWTUN, &vm_pageout_update_period, 0,
"Maximum active LRU update period");
-SYSCTL_INT(_vm, OID_AUTO, lowmem_period, CTLFLAG_RW, &lowmem_period, 0,
+SYSCTL_INT(_vm, OID_AUTO, lowmem_period, CTLFLAG_RWTUN, &lowmem_period, 0,
"Low memory callback period");
-#if defined(NO_SWAPPING)
-SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
- CTLFLAG_RD, &vm_swap_enabled, 0, "Enable entire process swapout");
-SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
- CTLFLAG_RD, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
-#else
-SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
- CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout");
-SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
- CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
-#endif
-
SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
- CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");
+ CTLFLAG_RWTUN, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");
static int pageout_lock_miss;
SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");
SYSCTL_INT(_vm, OID_AUTO, pageout_oom_seq,
- CTLFLAG_RW, &vm_pageout_oom_seq, 0,
+ CTLFLAG_RWTUN, &vm_pageout_oom_seq, 0,
"back-to-back calls to oom detector to start OOM");
static int act_scan_laundry_weight = 3;
-SYSCTL_INT(_vm, OID_AUTO, act_scan_laundry_weight, CTLFLAG_RW,
+SYSCTL_INT(_vm, OID_AUTO, act_scan_laundry_weight, CTLFLAG_RWTUN,
&act_scan_laundry_weight, 0,
"weight given to clean vs. dirty pages in active queue scans");
static u_int vm_background_launder_target;
-SYSCTL_UINT(_vm, OID_AUTO, background_launder_target, CTLFLAG_RW,
+SYSCTL_UINT(_vm, OID_AUTO, background_launder_target, CTLFLAG_RWTUN,
&vm_background_launder_target, 0,
"background laundering target, in pages");
static u_int vm_background_launder_rate = 4096;
-SYSCTL_UINT(_vm, OID_AUTO, background_launder_rate, CTLFLAG_RW,
+SYSCTL_UINT(_vm, OID_AUTO, background_launder_rate, CTLFLAG_RWTUN,
&vm_background_launder_rate, 0,
"background laundering rate, in kilobytes per second");
static u_int vm_background_launder_max = 20 * 1024;
-SYSCTL_UINT(_vm, OID_AUTO, background_launder_max, CTLFLAG_RW,
+SYSCTL_UINT(_vm, OID_AUTO, background_launder_max, CTLFLAG_RWTUN,
&vm_background_launder_max, 0, "background laundering cap, in kilobytes");
int vm_pageout_page_count = 32;
@@ -261,11 +221,6 @@ static boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *);
static int vm_pageout_launder(struct vm_domain *vmd, int launder,
bool in_shortfall);
static void vm_pageout_laundry_worker(void *arg);
-#if !defined(NO_SWAPPING)
-static void vm_pageout_map_deactivate_pages(vm_map_t, long);
-static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long);
-static void vm_req_vmdaemon(int req);
-#endif
static boolean_t vm_pageout_page_lock(vm_page_t, vm_page_t *);
/*
@@ -401,6 +356,8 @@ vm_pageout_cluster(vm_page_t m)
*/
vm_page_assert_unbusied(m);
KASSERT(m->hold_count == 0, ("page %p is held", m));
+
+ pmap_remove_write(m);
vm_page_unlock(m);
mc[vm_pageout_page_count] = pb = ps = m;
@@ -443,6 +400,7 @@ more:
ib = 0;
break;
}
+ pmap_remove_write(p);
vm_page_unlock(p);
mc[--page_base] = pb = p;
++pageout_count;
@@ -468,6 +426,7 @@ more:
vm_page_unlock(p);
break;
}
+ pmap_remove_write(p);
vm_page_unlock(p);
mc[page_base + pageout_count] = ps = p;
++pageout_count;
@@ -512,8 +471,8 @@ vm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen,
VM_OBJECT_ASSERT_WLOCKED(object);
/*
- * Initiate I/O. Bump the vm_page_t->busy counter and
- * mark the pages read-only.
+ * Initiate I/O. Mark the pages busy and verify that they're valid
+ * and read-only.
*
* We do not have to fixup the clean/dirty bits here... we can
* allow the pager to do it after the I/O completes.
@@ -525,8 +484,9 @@ vm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen,
KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
("vm_pageout_flush: partially invalid page %p index %d/%d",
mc[i], i, count));
+ KASSERT((mc[i]->aflags & PGA_WRITEABLE) == 0,
+ ("vm_pageout_flush: writeable page %p", mc[i]));
vm_page_sbusy(mc[i]);
- pmap_remove_write(mc[i]);
}
vm_object_pip_add(object, count);
@@ -599,171 +559,6 @@ vm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen,
return (numpagedout);
}
-#if !defined(NO_SWAPPING)
-/*
- * vm_pageout_object_deactivate_pages
- *
- * Deactivate enough pages to satisfy the inactive target
- * requirements.
- *
- * The object and map must be locked.
- */
-static void
-vm_pageout_object_deactivate_pages(pmap_t pmap, vm_object_t first_object,
- long desired)
-{
- vm_object_t backing_object, object;
- vm_page_t p;
- int act_delta, remove_mode;
-
- VM_OBJECT_ASSERT_LOCKED(first_object);
- if ((first_object->flags & OBJ_FICTITIOUS) != 0)
- return;
- for (object = first_object;; object = backing_object) {
- if (pmap_resident_count(pmap) <= desired)
- goto unlock_return;
- VM_OBJECT_ASSERT_LOCKED(object);
- if ((object->flags & OBJ_UNMANAGED) != 0 ||
- object->paging_in_progress != 0)
- goto unlock_return;
-
- remove_mode = 0;
- if (object->shadow_count > 1)
- remove_mode = 1;
- /*
- * Scan the object's entire memory queue.
- */
- TAILQ_FOREACH(p, &object->memq, listq) {
- if (pmap_resident_count(pmap) <= desired)
- goto unlock_return;
- if (vm_page_busied(p))
- continue;
- PCPU_INC(cnt.v_pdpages);
- vm_page_lock(p);
- if (p->wire_count != 0 || p->hold_count != 0 ||
- !pmap_page_exists_quick(pmap, p)) {
- vm_page_unlock(p);
- continue;
- }
- act_delta = pmap_ts_referenced(p);
- if ((p->aflags & PGA_REFERENCED) != 0) {
- if (act_delta == 0)
- act_delta = 1;
- vm_page_aflag_clear(p, PGA_REFERENCED);
- }
- if (!vm_page_active(p) && act_delta != 0) {
- vm_page_activate(p);
- p->act_count += act_delta;
- } else if (vm_page_active(p)) {
- if (act_delta == 0) {
- p->act_count -= min(p->act_count,
- ACT_DECLINE);
- if (!remove_mode && p->act_count == 0) {
- pmap_remove_all(p);
- vm_page_deactivate(p);
- } else
- vm_page_requeue(p);
- } else {
- vm_page_activate(p);
- if (p->act_count < ACT_MAX -
- ACT_ADVANCE)
- p->act_count += ACT_ADVANCE;
- vm_page_requeue(p);
- }
- } else if (vm_page_inactive(p))
- pmap_remove_all(p);
- vm_page_unlock(p);
- }
- if ((backing_object = object->backing_object) == NULL)
- goto unlock_return;
- VM_OBJECT_RLOCK(backing_object);
- if (object != first_object)
- VM_OBJECT_RUNLOCK(object);
- }
-unlock_return:
- if (object != first_object)
- VM_OBJECT_RUNLOCK(object);
-}
-
-/*
- * deactivate some number of pages in a map, try to do it fairly, but
- * that is really hard to do.
- */
-static void
-vm_pageout_map_deactivate_pages(map, desired)
- vm_map_t map;
- long desired;
-{
- vm_map_entry_t tmpe;
- vm_object_t obj, bigobj;
- int nothingwired;
-
- if (!vm_map_trylock(map))
- return;
-
- bigobj = NULL;
- nothingwired = TRUE;
-
- /*
- * first, search out the biggest object, and try to free pages from
- * that.
- */
- tmpe = map->header.next;
- while (tmpe != &map->header) {
- if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
- obj = tmpe->object.vm_object;
- if (obj != NULL && VM_OBJECT_TRYRLOCK(obj)) {
- if (obj->shadow_count <= 1 &&
- (bigobj == NULL ||
- bigobj->resident_page_count < obj->resident_page_count)) {
- if (bigobj != NULL)
- VM_OBJECT_RUNLOCK(bigobj);
- bigobj = obj;
- } else
- VM_OBJECT_RUNLOCK(obj);
- }
- }
- if (tmpe->wired_count > 0)
- nothingwired = FALSE;
- tmpe = tmpe->next;
- }
-
- if (bigobj != NULL) {
- vm_pageout_object_deactivate_pages(map->pmap, bigobj, desired);
- VM_OBJECT_RUNLOCK(bigobj);
- }
- /*
- * Next, hunt around for other pages to deactivate. We actually
- * do this search sort of wrong -- .text first is not the best idea.
- */
- tmpe = map->header.next;
- while (tmpe != &map->header) {
- if (pmap_resident_count(vm_map_pmap(map)) <= desired)
- break;
- if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
- obj = tmpe->object.vm_object;
- if (obj != NULL) {
- VM_OBJECT_RLOCK(obj);
- vm_pageout_object_deactivate_pages(map->pmap, obj, desired);
- VM_OBJECT_RUNLOCK(obj);
- }
- }
- tmpe = tmpe->next;
- }
-
- /*
- * Remove all mappings if a process is swapped out, this will free page
- * table pages.
- */
- if (desired == 0 && nothingwired) {
- pmap_remove(vm_map_pmap(map), vm_map_min(map),
- vm_map_max(map));
- }
-
- vm_map_unlock(map);
-}
-#endif /* !defined(NO_SWAPPING) */
-
/*
* Attempt to acquire all of the necessary locks to launder a page and
* then call through the clustering layer to PUTPAGES. Wait a short
@@ -822,7 +617,17 @@ vm_pageout_clean(vm_page_t m, int *numpagedout)
goto unlock_mp;
}
VM_OBJECT_WLOCK(object);
+
+ /*
+ * Ensure that the object and vnode were not disassociated
+ * while locks were dropped.
+ */
+ if (vp->v_object != object) {
+ error = ENOENT;
+ goto unlock_all;
+ }
vm_page_lock(m);
+
/*
* While the object and page were unlocked, the page
* may have been:
@@ -1506,14 +1311,12 @@ drop_page:
vm_pagequeue_unlock(pq);
}
-#if !defined(NO_SWAPPING)
/*
* Wakeup the swapout daemon if we didn't free the targeted number of
* pages.
*/
- if (vm_swap_enabled && page_shortage > 0)
- vm_req_vmdaemon(VM_SWAP_NORMAL);
-#endif
+ if (page_shortage > 0)
+ vm_swapout_run();
/*
* If the inactive queue scan fails repeatedly to meet its
@@ -1663,18 +1466,8 @@ drop_page:
vm_page_unlock(m);
}
vm_pagequeue_unlock(pq);
-#if !defined(NO_SWAPPING)
- /*
- * Idle process swapout -- run once per second.
- */
- if (vm_swap_idle_enabled) {
- static long lsec;
- if (time_second != lsec) {
- vm_req_vmdaemon(VM_SWAP_IDLE);
- lsec = time_second;
- }
- }
-#endif
+ if (pass > 0)
+ vm_swapout_run_idle();
return (page_shortage <= 0);
}
@@ -1971,7 +1764,7 @@ vm_pageout_worker(void *arg)
*/
mtx_unlock(&vm_page_queue_free_mtx);
if (pass >= 1)
- pause("psleep", hz / VM_INACT_SCAN_RATE);
+ pause("pwait", hz / VM_INACT_SCAN_RATE);
pass++;
} else {
/*
@@ -2100,167 +1893,3 @@ pagedaemon_wakeup(void)
wakeup(&vm_pageout_wanted);
}
}
-
-#if !defined(NO_SWAPPING)
-static void
-vm_req_vmdaemon(int req)
-{
- static int lastrun = 0;
-
- mtx_lock(&vm_daemon_mtx);
- vm_pageout_req_swapout |= req;
- if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
- wakeup(&vm_daemon_needed);
- lastrun = ticks;
- }
- mtx_unlock(&vm_daemon_mtx);
-}
-
-static void
-vm_daemon(void)
-{
- struct rlimit rsslim;
- struct proc *p;
- struct thread *td;
- struct vmspace *vm;
- int breakout, swapout_flags, tryagain, attempts;
-#ifdef RACCT
- uint64_t rsize, ravailable;
-#endif
-
- while (TRUE) {
- mtx_lock(&vm_daemon_mtx);
- msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep",
-#ifdef RACCT
- racct_enable ? hz : 0
-#else
- 0
-#endif
- );
- swapout_flags = vm_pageout_req_swapout;
- vm_pageout_req_swapout = 0;
- mtx_unlock(&vm_daemon_mtx);
- if (swapout_flags)
- swapout_procs(swapout_flags);
-
- /*
- * scan the processes for exceeding their rlimits or if
- * process is swapped out -- deactivate pages
- */
- tryagain = 0;
- attempts = 0;
-again:
- attempts++;
- sx_slock(&allproc_lock);
- FOREACH_PROC_IN_SYSTEM(p) {
- vm_pindex_t limit, size;
-
- /*
- * if this is a system process or if we have already
- * looked at this process, skip it.
- */
- PROC_LOCK(p);
- if (p->p_state != PRS_NORMAL ||
- p->p_flag & (P_INEXEC | P_SYSTEM | P_WEXIT)) {
- PROC_UNLOCK(p);
- continue;
- }
- /*
- * if the process is in a non-running type state,
- * don't touch it.
- */
- breakout = 0;
- FOREACH_THREAD_IN_PROC(p, td) {
- thread_lock(td);
- if (!TD_ON_RUNQ(td) &&
- !TD_IS_RUNNING(td) &&
- !TD_IS_SLEEPING(td) &&
- !TD_IS_SUSPENDED(td)) {
- thread_unlock(td);
- breakout = 1;
- break;
- }
- thread_unlock(td);
- }
- if (breakout) {
- PROC_UNLOCK(p);
- continue;
- }
- /*
- * get a limit
- */
- lim_rlimit_proc(p, RLIMIT_RSS, &rsslim);
- limit = OFF_TO_IDX(
- qmin(rsslim.rlim_cur, rsslim.rlim_max));
-
- /*
- * let processes that are swapped out really be
- * swapped out set the limit to nothing (will force a
- * swap-out.)
- */
- if ((p->p_flag & P_INMEM) == 0)
- limit = 0; /* XXX */
- vm = vmspace_acquire_ref(p);
- _PHOLD_LITE(p);
- PROC_UNLOCK(p);
- if (vm == NULL) {
- PRELE(p);
- continue;
- }
- sx_sunlock(&allproc_lock);
-
- size = vmspace_resident_count(vm);
- if (size >= limit) {
- vm_pageout_map_deactivate_pages(
- &vm->vm_map, limit);
- size = vmspace_resident_count(vm);
- }
-#ifdef RACCT
- if (racct_enable) {
- rsize = IDX_TO_OFF(size);
- PROC_LOCK(p);
- if (p->p_state == PRS_NORMAL)
- racct_set(p, RACCT_RSS, rsize);
- ravailable = racct_get_available(p, RACCT_RSS);
- PROC_UNLOCK(p);
- if (rsize > ravailable) {
- /*
- * Don't be overly aggressive; this
- * might be an innocent process,
- * and the limit could've been exceeded
- * by some memory hog. Don't try
- * to deactivate more than 1/4th
- * of process' resident set size.
- */
- if (attempts <= 8) {
- if (ravailable < rsize -
- (rsize / 4)) {
- ravailable = rsize -
- (rsize / 4);
- }
- }
- vm_pageout_map_deactivate_pages(
- &vm->vm_map,
- OFF_TO_IDX(ravailable));
- /* Update RSS usage after paging out. */
- size = vmspace_resident_count(vm);
- rsize = IDX_TO_OFF(size);
- PROC_LOCK(p);
- if (p->p_state == PRS_NORMAL)
- racct_set(p, RACCT_RSS, rsize);
- PROC_UNLOCK(p);
- if (rsize > ravailable)
- tryagain = 1;
- }
- }
-#endif
- vmspace_free(vm);
- sx_slock(&allproc_lock);
- PRELE(p);
- }
- sx_sunlock(&allproc_lock);
- if (tryagain != 0 && attempts <= 10)
- goto again;
- }
-}
-#endif /* !defined(NO_SWAPPING) */
diff --git a/sys/vm/vm_pageout.h b/sys/vm/vm_pageout.h
index b44ca2f..6f8e0e9 100644
--- a/sys/vm/vm_pageout.h
+++ b/sys/vm/vm_pageout.h
@@ -77,12 +77,6 @@ extern int vm_pageout_page_count;
extern bool vm_pageout_wanted;
extern bool vm_pages_needed;
-/*
- * Swap out requests
- */
-#define VM_SWAP_NORMAL 1
-#define VM_SWAP_IDLE 2
-
#define VM_OOM_MEM 1
#define VM_OOM_SWAPZ 2
@@ -109,5 +103,8 @@ extern void vm_waitpfault(void);
#ifdef _KERNEL
int vm_pageout_flush(vm_page_t *, int, int, int, int *, boolean_t *);
void vm_pageout_oom(int shortage);
+
+void vm_swapout_run(void);
+void vm_swapout_run_idle(void);
#endif
#endif /* _VM_VM_PAGEOUT_H_ */
diff --git a/sys/vm/vm_param.h b/sys/vm/vm_param.h
index 68e5cd1..20dc143 100644
--- a/sys/vm/vm_param.h
+++ b/sys/vm/vm_param.h
@@ -84,7 +84,8 @@
#define VM_V_PAGEOUT_FREE_MIN 9 /* vm_cnt.v_pageout_free_min */
#define VM_OBSOLETE_10 10 /* pageout algorithm */
#define VM_SWAPPING_ENABLED 11 /* swapping enabled */
-#define VM_MAXID 12 /* number of valid vm ids */
+#define VM_OVERCOMMIT 12 /* vm.overcommit */
+#define VM_MAXID 13 /* number of valid vm ids */
/*
* Structure for swap device statistics
diff --git a/sys/vm/vm_phys.c b/sys/vm/vm_phys.c
index 484417b..60b452f 100644
--- a/sys/vm/vm_phys.c
+++ b/sys/vm/vm_phys.c
@@ -175,7 +175,6 @@ static vm_page_t vm_phys_alloc_seg_contig(struct vm_phys_seg *seg,
vm_paddr_t boundary);
static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain);
static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end);
-static int vm_phys_paddr_to_segind(vm_paddr_t pa);
static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
int order);
@@ -731,35 +730,6 @@ vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order)
}
/*
- * Initialize a physical page and add it to the free lists.
- */
-void
-vm_phys_add_page(vm_paddr_t pa)
-{
- vm_page_t m;
- struct vm_domain *vmd;
-
- vm_cnt.v_page_count++;
- m = vm_phys_paddr_to_vm_page(pa);
- m->busy_lock = VPB_UNBUSIED;
- m->phys_addr = pa;
- m->queue = PQ_NONE;
- m->segind = vm_phys_paddr_to_segind(pa);
- vmd = vm_phys_domain(m);
- vmd->vmd_page_count++;
- vmd->vmd_segs |= 1UL << m->segind;
- KASSERT(m->order == VM_NFREEORDER,
- ("vm_phys_add_page: page %p has unexpected order %d",
- m, m->order));
- m->pool = VM_FREEPOOL_DEFAULT;
- pmap_page_init(m);
- mtx_lock(&vm_page_queue_free_mtx);
- vm_phys_freecnt_adj(m, 1);
- vm_phys_free_pages(m, 0);
- mtx_unlock(&vm_page_queue_free_mtx);
-}
-
-/*
* Allocate a contiguous, power of two-sized set of physical pages
* from the free lists.
*
@@ -912,6 +882,7 @@ vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start,
{
long i;
+ bzero(range, page_count * sizeof(*range));
for (i = 0; i < page_count; i++) {
vm_page_initfake(&range[i], start + PAGE_SIZE * i, memattr);
range[i].oflags &= ~VPO_UNMANAGED;
@@ -986,7 +957,7 @@ vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end,
alloc:
#endif
fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES,
- M_WAITOK | M_ZERO);
+ M_WAITOK);
#ifdef VM_PHYSSEG_DENSE
}
#endif
@@ -1067,24 +1038,6 @@ vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end)
}
/*
- * Find the segment containing the given physical address.
- */
-static int
-vm_phys_paddr_to_segind(vm_paddr_t pa)
-{
- struct vm_phys_seg *seg;
- int segind;
-
- for (segind = 0; segind < vm_phys_nsegs; segind++) {
- seg = &vm_phys_segs[segind];
- if (pa >= seg->start && pa < seg->end)
- return (segind);
- }
- panic("vm_phys_paddr_to_segind: paddr %#jx is not in any segment" ,
- (uintmax_t)pa);
-}
-
-/*
* Free a contiguous, power of two-sized set of physical pages.
*
* The free page queues must be locked.
diff --git a/sys/vm/vm_phys.h b/sys/vm/vm_phys.h
index ee4aa2d..c5dd58d 100644
--- a/sys/vm/vm_phys.h
+++ b/sys/vm/vm_phys.h
@@ -69,7 +69,6 @@ extern int vm_phys_nsegs;
/*
* The following functions are only to be used by the virtual memory system.
*/
-void vm_phys_add_page(vm_paddr_t pa);
void vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end);
vm_page_t vm_phys_alloc_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
u_long alignment, vm_paddr_t boundary);
diff --git a/sys/vm/vm_reserv.c b/sys/vm/vm_reserv.c
index 7e2bfb6..ce3289e 100644
--- a/sys/vm/vm_reserv.c
+++ b/sys/vm/vm_reserv.c
@@ -1120,4 +1120,18 @@ vm_reserv_startup(vm_offset_t *vaddr, vm_paddr_t end, vm_paddr_t high_water)
return (new_end);
}
+/*
+ * Returns the superpage containing the given page.
+ */
+vm_page_t
+vm_reserv_to_superpage(vm_page_t m)
+{
+ vm_reserv_t rv;
+
+ VM_OBJECT_ASSERT_LOCKED(m->object);
+ rv = vm_reserv_from_page(m);
+ return (rv->object == m->object && rv->popcnt == VM_LEVEL_0_NPAGES ?
+ rv->pages : NULL);
+}
+
#endif /* VM_NRESERVLEVEL > 0 */
diff --git a/sys/vm/vm_reserv.h b/sys/vm/vm_reserv.h
index 8b33b48..3d9472d 100644
--- a/sys/vm/vm_reserv.h
+++ b/sys/vm/vm_reserv.h
@@ -64,6 +64,7 @@ void vm_reserv_rename(vm_page_t m, vm_object_t new_object,
int vm_reserv_size(int level);
vm_paddr_t vm_reserv_startup(vm_offset_t *vaddr, vm_paddr_t end,
vm_paddr_t high_water);
+vm_page_t vm_reserv_to_superpage(vm_page_t m);
#endif /* VM_NRESERVLEVEL > 0 */
#endif /* _KERNEL */
diff --git a/sys/vm/vm_swapout.c b/sys/vm/vm_swapout.c
new file mode 100644
index 0000000..91b6422
--- /dev/null
+++ b/sys/vm/vm_swapout.c
@@ -0,0 +1,975 @@
+/*-
+ * Copyright (c) 1991 Regents of the University of California.
+ * All rights reserved.
+ * Copyright (c) 1994 John S. Dyson
+ * All rights reserved.
+ * Copyright (c) 1994 David Greenman
+ * All rights reserved.
+ * Copyright (c) 2005 Yahoo! Technologies Norway AS
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * The Mach Operating System project at Carnegie-Mellon University.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91
+ *
+ *
+ * Copyright (c) 1987, 1990 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Authors: Avadis Tevanian, Jr., Michael Wayne Young
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_kstack_pages.h"
+#include "opt_kstack_max_pages.h"
+#include "opt_vm.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/limits.h>
+#include <sys/kernel.h>
+#include <sys/eventhandler.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/_kstack_cache.h>
+#include <sys/kthread.h>
+#include <sys/ktr.h>
+#include <sys/mount.h>
+#include <sys/racct.h>
+#include <sys/resourcevar.h>
+#include <sys/sched.h>
+#include <sys/sdt.h>
+#include <sys/signalvar.h>
+#include <sys/smp.h>
+#include <sys/time.h>
+#include <sys/vnode.h>
+#include <sys/vmmeter.h>
+#include <sys/rwlock.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_map.h>
+#include <vm/vm_pageout.h>
+#include <vm/vm_pager.h>
+#include <vm/vm_phys.h>
+#include <vm/swap_pager.h>
+#include <vm/vm_extern.h>
+#include <vm/uma.h>
+
+/* the kernel process "vm_daemon" */
+static void vm_daemon(void);
+static struct proc *vmproc;
+
+static struct kproc_desc vm_kp = {
+ "vmdaemon",
+ vm_daemon,
+ &vmproc
+};
+SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp);
+
+static int vm_swap_enabled = 1;
+static int vm_swap_idle_enabled = 0;
+
+SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, CTLFLAG_RW,
+ &vm_swap_enabled, 0,
+ "Enable entire process swapout");
+SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, CTLFLAG_RW,
+ &vm_swap_idle_enabled, 0,
+ "Allow swapout on idle criteria");
+
+/*
+ * Swap_idle_threshold1 is the guaranteed swapped in time for a process
+ */
+static int swap_idle_threshold1 = 2;
+SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold1, CTLFLAG_RW,
+ &swap_idle_threshold1, 0,
+ "Guaranteed swapped in time for a process");
+
+/*
+ * Swap_idle_threshold2 is the time that a process can be idle before
+ * it will be swapped out, if idle swapping is enabled.
+ */
+static int swap_idle_threshold2 = 10;
+SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold2, CTLFLAG_RW,
+ &swap_idle_threshold2, 0,
+ "Time before a process will be swapped out");
+
+static int vm_pageout_req_swapout; /* XXX */
+static int vm_daemon_needed;
+static struct mtx vm_daemon_mtx;
+/* Allow for use by vm_pageout before vm_daemon is initialized. */
+MTX_SYSINIT(vm_daemon, &vm_daemon_mtx, "vm daemon", MTX_DEF);
+
+static void swapclear(struct proc *);
+static int swapout(struct proc *);
+static void vm_swapout_map_deactivate_pages(vm_map_t, long);
+static void vm_swapout_object_deactivate_pages(pmap_t, vm_object_t, long);
+static void swapout_procs(int action);
+static void vm_req_vmdaemon(int req);
+static void vm_thread_swapin(struct thread *td);
+static void vm_thread_swapout(struct thread *td);
+
+/*
+ * vm_swapout_object_deactivate_pages
+ *
+ * Deactivate enough pages to satisfy the inactive target
+ * requirements.
+ *
+ * The object and map must be locked.
+ */
+static void
+vm_swapout_object_deactivate_pages(pmap_t pmap, vm_object_t first_object,
+ long desired)
+{
+ vm_object_t backing_object, object;
+ vm_page_t p;
+ int act_delta, remove_mode;
+
+ VM_OBJECT_ASSERT_LOCKED(first_object);
+ if ((first_object->flags & OBJ_FICTITIOUS) != 0)
+ return;
+ for (object = first_object;; object = backing_object) {
+ if (pmap_resident_count(pmap) <= desired)
+ goto unlock_return;
+ VM_OBJECT_ASSERT_LOCKED(object);
+ if ((object->flags & OBJ_UNMANAGED) != 0 ||
+ object->paging_in_progress != 0)
+ goto unlock_return;
+
+ remove_mode = 0;
+ if (object->shadow_count > 1)
+ remove_mode = 1;
+ /*
+ * Scan the object's entire memory queue.
+ */
+ TAILQ_FOREACH(p, &object->memq, listq) {
+ if (pmap_resident_count(pmap) <= desired)
+ goto unlock_return;
+ if (vm_page_busied(p))
+ continue;
+ PCPU_INC(cnt.v_pdpages);
+ vm_page_lock(p);
+ if (p->wire_count != 0 || p->hold_count != 0 ||
+ !pmap_page_exists_quick(pmap, p)) {
+ vm_page_unlock(p);
+ continue;
+ }
+ act_delta = pmap_ts_referenced(p);
+ if ((p->aflags & PGA_REFERENCED) != 0) {
+ if (act_delta == 0)
+ act_delta = 1;
+ vm_page_aflag_clear(p, PGA_REFERENCED);
+ }
+ if (!vm_page_active(p) && act_delta != 0) {
+ vm_page_activate(p);
+ p->act_count += act_delta;
+ } else if (vm_page_active(p)) {
+ if (act_delta == 0) {
+ p->act_count -= min(p->act_count,
+ ACT_DECLINE);
+ if (!remove_mode && p->act_count == 0) {
+ pmap_remove_all(p);
+ vm_page_deactivate(p);
+ } else
+ vm_page_requeue(p);
+ } else {
+ vm_page_activate(p);
+ if (p->act_count < ACT_MAX -
+ ACT_ADVANCE)
+ p->act_count += ACT_ADVANCE;
+ vm_page_requeue(p);
+ }
+ } else if (vm_page_inactive(p))
+ pmap_remove_all(p);
+ vm_page_unlock(p);
+ }
+ if ((backing_object = object->backing_object) == NULL)
+ goto unlock_return;
+ VM_OBJECT_RLOCK(backing_object);
+ if (object != first_object)
+ VM_OBJECT_RUNLOCK(object);
+ }
+unlock_return:
+ if (object != first_object)
+ VM_OBJECT_RUNLOCK(object);
+}
+
+/*
+ * Deactivate some number of pages in a map; try to do it fairly, but
+ * that is really hard to do.
+ */
+static void
+vm_swapout_map_deactivate_pages(vm_map_t map, long desired)
+{
+ vm_map_entry_t tmpe;
+ vm_object_t obj, bigobj;
+ int nothingwired;
+
+ if (!vm_map_trylock(map))
+ return;
+
+ bigobj = NULL;
+ nothingwired = TRUE;
+
+ /*
+ * first, search out the biggest object, and try to free pages from
+ * that.
+ */
+ tmpe = map->header.next;
+ while (tmpe != &map->header) {
+ if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
+ obj = tmpe->object.vm_object;
+ if (obj != NULL && VM_OBJECT_TRYRLOCK(obj)) {
+ if (obj->shadow_count <= 1 &&
+ (bigobj == NULL ||
+ bigobj->resident_page_count <
+ obj->resident_page_count)) {
+ if (bigobj != NULL)
+ VM_OBJECT_RUNLOCK(bigobj);
+ bigobj = obj;
+ } else
+ VM_OBJECT_RUNLOCK(obj);
+ }
+ }
+ if (tmpe->wired_count > 0)
+ nothingwired = FALSE;
+ tmpe = tmpe->next;
+ }
+
+ if (bigobj != NULL) {
+ vm_swapout_object_deactivate_pages(map->pmap, bigobj, desired);
+ VM_OBJECT_RUNLOCK(bigobj);
+ }
+ /*
+ * Next, hunt around for other pages to deactivate. We actually
+ * do this search sort of wrong -- .text first is not the best idea.
+ */
+ tmpe = map->header.next;
+ while (tmpe != &map->header) {
+ if (pmap_resident_count(vm_map_pmap(map)) <= desired)
+ break;
+ if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
+ obj = tmpe->object.vm_object;
+ if (obj != NULL) {
+ VM_OBJECT_RLOCK(obj);
+ vm_swapout_object_deactivate_pages(map->pmap,
+ obj, desired);
+ VM_OBJECT_RUNLOCK(obj);
+ }
+ }
+ tmpe = tmpe->next;
+ }
+
+ /*
+	 * Remove all mappings if a process is swapped out; this will free page
+ * table pages.
+ */
+ if (desired == 0 && nothingwired) {
+ pmap_remove(vm_map_pmap(map), vm_map_min(map),
+ vm_map_max(map));
+ }
+
+ vm_map_unlock(map);
+}
+
+/*
+ * Swap out requests
+ */
+#define VM_SWAP_NORMAL 1
+#define VM_SWAP_IDLE 2
+
+void
+vm_swapout_run(void)
+{
+
+ if (vm_swap_enabled)
+ vm_req_vmdaemon(VM_SWAP_NORMAL);
+}
+
+/*
+ * Idle process swapout -- run once per second when pagedaemons are
+ * reclaiming pages.
+ */
+void
+vm_swapout_run_idle(void)
+{
+ static long lsec;
+
+ if (!vm_swap_idle_enabled || time_second == lsec)
+ return;
+ vm_req_vmdaemon(VM_SWAP_IDLE);
+ lsec = time_second;
+}
+
+static void
+vm_req_vmdaemon(int req)
+{
+ static int lastrun = 0;
+
+ mtx_lock(&vm_daemon_mtx);
+ vm_pageout_req_swapout |= req;
+ if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
+ wakeup(&vm_daemon_needed);
+ lastrun = ticks;
+ }
+ mtx_unlock(&vm_daemon_mtx);
+}
+
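
The gating in vm_req_vmdaemon() above latches the request bits immediately but throttles the wakeup of the vm_daemon to at most one per hz ticks; the second comparison catches tick-counter wraparound. Below is a standalone userland sketch of the same pattern; HZ, request_daemon(), and the tick values are stand-ins, not kernel interfaces.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define HZ 100			/* stand-in for the kernel's hz */

static int pending_reqs;	/* accumulated request bits */
static int last_wakeup;		/* tick count at the last wakeup */

static bool
request_daemon(int req, int now)
{
	pending_reqs |= req;	/* always remember the request */
	/* Wake at most once per HZ ticks; also fire on wraparound. */
	if (now > last_wakeup + HZ || now < last_wakeup) {
		last_wakeup = now;
		return (true);	/* the kernel would wakeup() here */
	}
	return (false);
}

int
main(void)
{
	int t[] = { 150, 160, 260, 261, 5 };	/* 5 simulates wraparound */

	for (size_t i = 0; i < sizeof(t) / sizeof(t[0]); i++)
		printf("tick %3d -> wakeup: %d\n", t[i],
		    request_daemon(1, t[i]));
	return (0);
}

Only ticks 150, 260, and 5 (the wraparound case) produce a wakeup; the others merely accumulate request bits.
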
+static void
+vm_daemon(void)
+{
+ struct rlimit rsslim;
+ struct proc *p;
+ struct thread *td;
+ struct vmspace *vm;
+ int breakout, swapout_flags, tryagain, attempts;
+#ifdef RACCT
+ uint64_t rsize, ravailable;
+#endif
+
+ while (TRUE) {
+ mtx_lock(&vm_daemon_mtx);
+ msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep",
+#ifdef RACCT
+ racct_enable ? hz : 0
+#else
+ 0
+#endif
+ );
+ swapout_flags = vm_pageout_req_swapout;
+ vm_pageout_req_swapout = 0;
+ mtx_unlock(&vm_daemon_mtx);
+ if (swapout_flags)
+ swapout_procs(swapout_flags);
+
+ /*
+		 * Scan the processes for those that exceed their rlimits or
+		 * are swapped out -- deactivate their pages.
+ */
+ tryagain = 0;
+ attempts = 0;
+again:
+ attempts++;
+ sx_slock(&allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ vm_pindex_t limit, size;
+
+ /*
+ * if this is a system process or if we have already
+ * looked at this process, skip it.
+ */
+ PROC_LOCK(p);
+ if (p->p_state != PRS_NORMAL ||
+ p->p_flag & (P_INEXEC | P_SYSTEM | P_WEXIT)) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ /*
+ * if the process is in a non-running type state,
+ * don't touch it.
+ */
+ breakout = 0;
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ if (!TD_ON_RUNQ(td) &&
+ !TD_IS_RUNNING(td) &&
+ !TD_IS_SLEEPING(td) &&
+ !TD_IS_SUSPENDED(td)) {
+ thread_unlock(td);
+ breakout = 1;
+ break;
+ }
+ thread_unlock(td);
+ }
+ if (breakout) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ /*
+ * get a limit
+ */
+ lim_rlimit_proc(p, RLIMIT_RSS, &rsslim);
+ limit = OFF_TO_IDX(
+ qmin(rsslim.rlim_cur, rsslim.rlim_max));
+
+ /*
+			 * Let processes that are swapped out really be
+			 * swapped out: set the limit to nothing (this
+			 * will force a swap-out).
+ */
+ if ((p->p_flag & P_INMEM) == 0)
+ limit = 0; /* XXX */
+ vm = vmspace_acquire_ref(p);
+ _PHOLD_LITE(p);
+ PROC_UNLOCK(p);
+ if (vm == NULL) {
+ PRELE(p);
+ continue;
+ }
+ sx_sunlock(&allproc_lock);
+
+ size = vmspace_resident_count(vm);
+ if (size >= limit) {
+ vm_swapout_map_deactivate_pages(
+ &vm->vm_map, limit);
+ size = vmspace_resident_count(vm);
+ }
+#ifdef RACCT
+ if (racct_enable) {
+ rsize = IDX_TO_OFF(size);
+ PROC_LOCK(p);
+ if (p->p_state == PRS_NORMAL)
+ racct_set(p, RACCT_RSS, rsize);
+ ravailable = racct_get_available(p, RACCT_RSS);
+ PROC_UNLOCK(p);
+ if (rsize > ravailable) {
+ /*
+ * Don't be overly aggressive; this
+ * might be an innocent process,
+ * and the limit could've been exceeded
+ * by some memory hog. Don't try
+ * to deactivate more than 1/4th
+ * of process' resident set size.
+ */
+ if (attempts <= 8) {
+ if (ravailable < rsize -
+ (rsize / 4)) {
+ ravailable = rsize -
+ (rsize / 4);
+ }
+ }
+ vm_swapout_map_deactivate_pages(
+ &vm->vm_map,
+ OFF_TO_IDX(ravailable));
+ /* Update RSS usage after paging out. */
+ size = vmspace_resident_count(vm);
+ rsize = IDX_TO_OFF(size);
+ PROC_LOCK(p);
+ if (p->p_state == PRS_NORMAL)
+ racct_set(p, RACCT_RSS, rsize);
+ PROC_UNLOCK(p);
+ if (rsize > ravailable)
+ tryagain = 1;
+ }
+ }
+#endif
+ vmspace_free(vm);
+ sx_slock(&allproc_lock);
+ PRELE(p);
+ }
+ sx_sunlock(&allproc_lock);
+ if (tryagain != 0 && attempts <= 10)
+ goto again;
+ }
+}
+
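
The RACCT branch of vm_daemon() above deliberately trims an over-limit process gradually: for the first eight attempts the deactivation target is clamped so that no single pass removes more than a quarter of the resident set. A minimal sketch of just that clamp, using plain byte counts and assuming each pass reaches its target (both assumptions are illustrative, not kernel behavior):

#include <stdint.h>
#include <stdio.h>

/* Clamp taken from the "attempts <= 8" branch above: at most a 25% cut. */
static uint64_t
rss_target(uint64_t rsize, uint64_t ravailable, int attempts)
{
	if (attempts <= 8 && ravailable < rsize - rsize / 4)
		ravailable = rsize - rsize / 4;
	return (ravailable);
}

int
main(void)
{
	uint64_t rsize = 400ULL << 20;	/* 400 MB resident */
	uint64_t ravail = 100ULL << 20;	/* 100 MB RACCT_RSS limit */

	for (int attempt = 1; attempt <= 6; attempt++) {
		uint64_t target = rss_target(rsize, ravail, attempt);

		printf("attempt %d: deactivate down to %ju MB\n", attempt,
		    (uintmax_t)(target >> 20));
		rsize = target;	/* assume the pass reached its target */
	}
	return (0);
}

Starting at 400 MB with a 100 MB limit, the target shrinks 300, 225, 168, 126, 100 MB across successive passes rather than dropping to 100 MB at once.
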
+/*
+ * Allow a thread's kernel stack to be paged out.
+ */
+static void
+vm_thread_swapout(struct thread *td)
+{
+ vm_object_t ksobj;
+ vm_page_t m;
+ int i, pages;
+
+ cpu_thread_swapout(td);
+ pages = td->td_kstack_pages;
+ ksobj = td->td_kstack_obj;
+ pmap_qremove(td->td_kstack, pages);
+ VM_OBJECT_WLOCK(ksobj);
+ for (i = 0; i < pages; i++) {
+ m = vm_page_lookup(ksobj, i);
+ if (m == NULL)
+ panic("vm_thread_swapout: kstack already missing?");
+ vm_page_dirty(m);
+ vm_page_lock(m);
+ vm_page_unwire(m, PQ_INACTIVE);
+ vm_page_unlock(m);
+ }
+ VM_OBJECT_WUNLOCK(ksobj);
+}
+
+/*
+ * Bring the kernel stack for a specified thread back in.
+ */
+static void
+vm_thread_swapin(struct thread *td)
+{
+ vm_object_t ksobj;
+ vm_page_t ma[KSTACK_MAX_PAGES];
+ int pages;
+
+ pages = td->td_kstack_pages;
+ ksobj = td->td_kstack_obj;
+ VM_OBJECT_WLOCK(ksobj);
+ (void)vm_page_grab_pages(ksobj, 0, VM_ALLOC_NORMAL | VM_ALLOC_WIRED, ma,
+ pages);
+ for (int i = 0; i < pages;) {
+ int j, a, count, rv;
+
+ vm_page_assert_xbusied(ma[i]);
+ if (ma[i]->valid == VM_PAGE_BITS_ALL) {
+ vm_page_xunbusy(ma[i]);
+ i++;
+ continue;
+ }
+ vm_object_pip_add(ksobj, 1);
+ for (j = i + 1; j < pages; j++)
+ if (ma[j]->valid == VM_PAGE_BITS_ALL)
+ break;
+ rv = vm_pager_has_page(ksobj, ma[i]->pindex, NULL, &a);
+ KASSERT(rv == 1, ("%s: missing page %p", __func__, ma[i]));
+ count = min(a + 1, j - i);
+ rv = vm_pager_get_pages(ksobj, ma + i, count, NULL, NULL);
+ KASSERT(rv == VM_PAGER_OK, ("%s: cannot get kstack for proc %d",
+ __func__, td->td_proc->p_pid));
+ vm_object_pip_wakeup(ksobj);
+ for (j = i; j < i + count; j++)
+ vm_page_xunbusy(ma[j]);
+ i += count;
+ }
+ VM_OBJECT_WUNLOCK(ksobj);
+ pmap_qenter(td->td_kstack, ma, pages);
+ cpu_thread_swapin(td);
+}
+
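
vm_thread_swapin() above clusters its pager reads: it finds a run of not-fully-valid stack pages, asks the pager how many pages are available after the first one, and reads min(after + 1, run length) pages in a single request. The sketch below reproduces that clustering over a plain boolean array; pages_after() is a stand-in for vm_pager_has_page()'s "after" count and always reports two extra pages.

#include <stdbool.h>
#include <stdio.h>

#define NPAGES 8

/* Stand-in for vm_pager_has_page(): always 2 more pages after "idx". */
static int
pages_after(int idx)
{
	(void)idx;
	return (2);
}

int
main(void)
{
	/* true = page already fully valid, false = must be paged in */
	bool valid[NPAGES] = { true, false, false, false, false, true,
	    false, false };
	int i, j, count;

	for (i = 0; i < NPAGES;) {
		if (valid[i]) {			/* already resident, skip */
			i++;
			continue;
		}
		/* Find the end of the run of invalid pages starting at i. */
		for (j = i + 1; j < NPAGES; j++)
			if (valid[j])
				break;
		/* One read may not exceed what the pager can supply. */
		count = pages_after(i) + 1;
		if (count > j - i)
			count = j - i;
		printf("pager read covers pages [%d, %d)\n", i, i + count);
		i += count;
	}
	return (0);
}

For the validity pattern in main() it issues three reads: [1, 4), [4, 5), and [6, 8).
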
+void
+faultin(struct proc *p)
+{
+ struct thread *td;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ /*
+ * If another process is swapping in this process,
+ * just wait until it finishes.
+ */
+ if (p->p_flag & P_SWAPPINGIN) {
+ while (p->p_flag & P_SWAPPINGIN)
+ msleep(&p->p_flag, &p->p_mtx, PVM, "faultin", 0);
+ return;
+ }
+ if ((p->p_flag & P_INMEM) == 0) {
+ /*
+ * Don't let another thread swap process p out while we are
+ * busy swapping it in.
+ */
+ ++p->p_lock;
+ p->p_flag |= P_SWAPPINGIN;
+ PROC_UNLOCK(p);
+
+ /*
+ * We hold no lock here because the list of threads
+ * can not change while all threads in the process are
+ * swapped out.
+ */
+ FOREACH_THREAD_IN_PROC(p, td)
+ vm_thread_swapin(td);
+ PROC_LOCK(p);
+ swapclear(p);
+ p->p_swtick = ticks;
+
+ wakeup(&p->p_flag);
+
+ /* Allow other threads to swap p out now. */
+ --p->p_lock;
+ }
+}
+
+/*
+ * This swapin algorithm attempts to swap-in processes only if there
+ * is enough space for them. Of course, if a process waits for a long
+ * time, it will be swapped in anyway.
+ */
+void
+swapper(void)
+{
+ struct proc *p;
+ struct thread *td;
+ struct proc *pp;
+ int slptime;
+ int swtime;
+ int ppri;
+ int pri;
+
+loop:
+ if (vm_page_count_min()) {
+ VM_WAIT;
+ goto loop;
+ }
+
+ pp = NULL;
+ ppri = INT_MIN;
+ sx_slock(&allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ if (p->p_state == PRS_NEW ||
+ p->p_flag & (P_SWAPPINGOUT | P_SWAPPINGIN | P_INMEM)) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ swtime = (ticks - p->p_swtick) / hz;
+ FOREACH_THREAD_IN_PROC(p, td) {
+ /*
+ * An otherwise runnable thread of a process
+ * swapped out has only the TDI_SWAPPED bit set.
+ */
+ thread_lock(td);
+ if (td->td_inhibitors == TDI_SWAPPED) {
+ slptime = (ticks - td->td_slptick) / hz;
+ pri = swtime + slptime;
+ if ((td->td_flags & TDF_SWAPINREQ) == 0)
+ pri -= p->p_nice * 8;
+ /*
+ * if this thread is higher priority
+ * and there is enough space, then select
+ * this process instead of the previous
+ * selection.
+ */
+ if (pri > ppri) {
+ pp = p;
+ ppri = pri;
+ }
+ }
+ thread_unlock(td);
+ }
+ PROC_UNLOCK(p);
+ }
+ sx_sunlock(&allproc_lock);
+
+ /*
+ * Nothing to do, back to sleep.
+ */
+ if ((p = pp) == NULL) {
+ tsleep(&proc0, PVM, "swapin", MAXSLP * hz / 2);
+ goto loop;
+ }
+ PROC_LOCK(p);
+
+ /*
+ * Another process may be bringing or may have already
+ * brought this process in while we traverse all threads.
+ * Or, this process may even be being swapped out again.
+ */
+ if (p->p_flag & (P_INMEM | P_SWAPPINGOUT | P_SWAPPINGIN)) {
+ PROC_UNLOCK(p);
+ goto loop;
+ }
+
+ /*
+	 * We would like to bring someone in (only if there is space).
+	 * [What checks the space?]
+ */
+ faultin(p);
+ PROC_UNLOCK(p);
+ goto loop;
+}
+
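
The selection loop in swapper() above scores each swapped-out candidate as time swapped out plus time asleep, subtracts a nice penalty unless the thread explicitly requested swap-in (TDF_SWAPINREQ), and brings in the highest scorer. A self-contained sketch of that scoring, with a made-up candidate table:

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

struct cand {
	const char *name;
	int swtime;		/* seconds swapped out */
	int slptime;		/* seconds the thread has slept */
	int nice;		/* nice value */
	bool swapinreq;		/* TDF_SWAPINREQ equivalent */
};

int
main(void)
{
	struct cand c[] = {
		{ "builder",  30, 30, 10, false },
		{ "shell",    20,  5,  0, false },
		{ "batchjob", 25, 25, 15, true  },
	};
	int best = -1, bestpri = INT_MIN;

	for (int i = 0; i < 3; i++) {
		/* pri = time swapped out + time asleep, minus nice penalty */
		int pri = c[i].swtime + c[i].slptime;

		if (!c[i].swapinreq)
			pri -= c[i].nice * 8;
		printf("%-8s priority %4d\n", c[i].name, pri);
		if (pri > bestpri) {
			bestpri = pri;
			best = i;
		}
	}
	printf("bring in: %s\n", c[best].name);
	return (0);
}

Here batchjob wins despite its high nice value because its explicit swap-in request waives the penalty.
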
+/*
+ * First, if any processes have been sleeping or stopped for at least
+ * "swap_idle_threshold1" seconds, they are swapped out. If, however,
+ * no such processes exist, then the longest-sleeping or stopped
+ * process is swapped out. Finally, and only as a last resort, if
+ * there are no sleeping or stopped processes, the longest-resident
+ * process is swapped out.
+ */
+static void
+swapout_procs(int action)
+{
+ struct proc *p;
+ struct thread *td;
+ int didswap = 0;
+
+retry:
+ sx_slock(&allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ struct vmspace *vm;
+ int minslptime = 100000;
+ int slptime;
+
+ PROC_LOCK(p);
+ /*
+ * Watch out for a process in
+ * creation. It may have no
+ * address space or lock yet.
+ */
+ if (p->p_state == PRS_NEW) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ /*
+ * An aio daemon switches its
+ * address space while running.
+ * Perform a quick check whether
+ * a process has P_SYSTEM.
+ * Filter out exiting processes.
+ */
+ if ((p->p_flag & (P_SYSTEM | P_WEXIT)) != 0) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ _PHOLD_LITE(p);
+ PROC_UNLOCK(p);
+ sx_sunlock(&allproc_lock);
+
+ /*
+ * Do not swapout a process that
+ * is waiting for VM data
+ * structures as there is a possible
+ * deadlock. Test this first as
+ * this may block.
+ *
+ * Lock the map until swapout
+ * finishes, or a thread of this
+ * process may attempt to alter
+ * the map.
+ */
+ vm = vmspace_acquire_ref(p);
+ if (vm == NULL)
+ goto nextproc2;
+ if (!vm_map_trylock(&vm->vm_map))
+ goto nextproc1;
+
+ PROC_LOCK(p);
+ if (p->p_lock != 1 || (p->p_flag & (P_STOPPED_SINGLE |
+ P_TRACED | P_SYSTEM)) != 0)
+ goto nextproc;
+
+ /*
+		 * Only aiod changes vmspace; however, it will be
+		 * skipped because of the if statement above checking
+		 * for P_SYSTEM.
+ */
+ if ((p->p_flag & (P_INMEM|P_SWAPPINGOUT|P_SWAPPINGIN)) != P_INMEM)
+ goto nextproc;
+
+ switch (p->p_state) {
+ default:
+ /* Don't swap out processes in any sort
+ * of 'special' state. */
+ break;
+
+ case PRS_NORMAL:
+ /*
+			 * Do not swap out a realtime process.
+			 * Check all the thread groups.
+ */
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ if (PRI_IS_REALTIME(td->td_pri_class)) {
+ thread_unlock(td);
+ goto nextproc;
+ }
+ slptime = (ticks - td->td_slptick) / hz;
+ /*
+ * Guarantee swap_idle_threshold1
+ * time in memory.
+ */
+ if (slptime < swap_idle_threshold1) {
+ thread_unlock(td);
+ goto nextproc;
+ }
+
+ /*
+ * Do not swapout a process if it is
+ * waiting on a critical event of some
+ * kind or there is a thread whose
+ * pageable memory may be accessed.
+ *
+ * This could be refined to support
+ * swapping out a thread.
+ */
+ if (!thread_safetoswapout(td)) {
+ thread_unlock(td);
+ goto nextproc;
+ }
+ /*
+ * If the system is under memory stress,
+ * or if we are swapping
+ * idle processes >= swap_idle_threshold2,
+ * then swap the process out.
+ */
+ if (((action & VM_SWAP_NORMAL) == 0) &&
+ (((action & VM_SWAP_IDLE) == 0) ||
+ (slptime < swap_idle_threshold2))) {
+ thread_unlock(td);
+ goto nextproc;
+ }
+
+ if (minslptime > slptime)
+ minslptime = slptime;
+ thread_unlock(td);
+ }
+
+ /*
+ * If the pageout daemon didn't free enough pages,
+ * or if this process is idle and the system is
+ * configured to swap proactively, swap it out.
+ */
+ if ((action & VM_SWAP_NORMAL) ||
+ ((action & VM_SWAP_IDLE) &&
+ (minslptime > swap_idle_threshold2))) {
+ _PRELE(p);
+ if (swapout(p) == 0)
+ didswap++;
+ PROC_UNLOCK(p);
+ vm_map_unlock(&vm->vm_map);
+ vmspace_free(vm);
+ goto retry;
+ }
+ }
+nextproc:
+ PROC_UNLOCK(p);
+ vm_map_unlock(&vm->vm_map);
+nextproc1:
+ vmspace_free(vm);
+nextproc2:
+ sx_slock(&allproc_lock);
+ PRELE(p);
+ }
+ sx_sunlock(&allproc_lock);
+ /*
+ * If we swapped something out, and another process needed memory,
+	 * then wake up the swapper.
+ */
+ if (didswap)
+ wakeup(&proc0);
+}
+
+static void
+swapclear(struct proc *p)
+{
+ struct thread *td;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ td->td_flags |= TDF_INMEM;
+ td->td_flags &= ~TDF_SWAPINREQ;
+ TD_CLR_SWAPPED(td);
+ if (TD_CAN_RUN(td))
+ if (setrunnable(td)) {
+#ifdef INVARIANTS
+ /*
+ * XXX: We just cleared TDI_SWAPPED
+ * above and set TDF_INMEM, so this
+ * should never happen.
+ */
+ panic("not waking up swapper");
+#endif
+ }
+ thread_unlock(td);
+ }
+ p->p_flag &= ~(P_SWAPPINGIN | P_SWAPPINGOUT);
+ p->p_flag |= P_INMEM;
+}
+
+static int
+swapout(struct proc *p)
+{
+ struct thread *td;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ /*
+ * The states of this process and its threads may have changed
+ * by now. Assuming that there is only one pageout daemon thread,
+ * this process should still be in memory.
+ */
+ KASSERT((p->p_flag & (P_INMEM | P_SWAPPINGOUT | P_SWAPPINGIN)) ==
+ P_INMEM, ("swapout: lost a swapout race?"));
+
+ /*
+ * remember the process resident count
+ */
+ p->p_vmspace->vm_swrss = vmspace_resident_count(p->p_vmspace);
+ /*
+ * Check and mark all threads before we proceed.
+ */
+ p->p_flag &= ~P_INMEM;
+ p->p_flag |= P_SWAPPINGOUT;
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ if (!thread_safetoswapout(td)) {
+ thread_unlock(td);
+ swapclear(p);
+ return (EBUSY);
+ }
+ td->td_flags &= ~TDF_INMEM;
+ TD_SET_SWAPPED(td);
+ thread_unlock(td);
+ }
+ td = FIRST_THREAD_IN_PROC(p);
+ ++td->td_ru.ru_nswap;
+ PROC_UNLOCK(p);
+
+ /*
+ * This list is stable because all threads are now prevented from
+ * running. The list is only modified in the context of a running
+ * thread in this process.
+ */
+ FOREACH_THREAD_IN_PROC(p, td)
+ vm_thread_swapout(td);
+
+ PROC_LOCK(p);
+ p->p_flag &= ~P_SWAPPINGOUT;
+ p->p_swtick = ticks;
+ return (0);
+}
diff --git a/sys/vm/vm_swapout_dummy.c b/sys/vm/vm_swapout_dummy.c
new file mode 100644
index 0000000..9cdfcb8
--- /dev/null
+++ b/sys/vm/vm_swapout_dummy.c
@@ -0,0 +1,122 @@
+/*-
+ * Copyright (c) 1991 Regents of the University of California.
+ * All rights reserved.
+ * Copyright (c) 1994 John S. Dyson
+ * All rights reserved.
+ * Copyright (c) 1994 David Greenman
+ * All rights reserved.
+ * Copyright (c) 2005 Yahoo! Technologies Norway AS
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * The Mach Operating System project at Carnegie-Mellon University.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91
+ *
+ *
+ * Copyright (c) 1987, 1990 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Authors: Avadis Tevanian, Jr., Michael Wayne Young
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sysctl.h>
+#include <sys/vmmeter.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_pageout.h>
+
+static int vm_swap_enabled = 0;
+SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, CTLFLAG_RD,
+ &vm_swap_enabled, 0,
+ "Enable entire process swapout");
+
+static int vm_swap_idle_enabled = 0;
+SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, CTLFLAG_RD,
+ &vm_swap_idle_enabled, 0,
+ "Allow swapout on idle criteria");
+
+void
+vm_swapout_run(void)
+{
+}
+
+void
+vm_swapout_run_idle(void)
+{
+}
+
+void
+faultin(struct proc *p)
+{
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ if ((p->p_flag & P_INMEM) == 0)
+ panic("faultin: proc %p swapped out with NO_SWAPPING", p);
+}
+
+void
+swapper(void)
+{
+
+ for (;;)
+ tsleep(&proc0, PVM, "swapin", MAXSLP * hz);
+}
diff --git a/sys/vm/vnode_pager.c b/sys/vm/vnode_pager.c
index 4c0dae9..30b02c4 100644
--- a/sys/vm/vnode_pager.c
+++ b/sys/vm/vnode_pager.c
@@ -1138,6 +1138,23 @@ vnode_pager_putpages(vm_object_t object, vm_page_t *m, int count,
VM_OBJECT_WLOCK(object);
}
+static int
+vn_off2bidx(vm_ooffset_t offset)
+{
+
+ return ((offset & PAGE_MASK) / DEV_BSIZE);
+}
+
+static bool
+vn_dirty_blk(vm_page_t m, vm_ooffset_t offset)
+{
+
+ KASSERT(IDX_TO_OFF(m->pindex) <= offset &&
+ offset < IDX_TO_OFF(m->pindex + 1),
+ ("page %p pidx %ju offset %ju", m, (uintmax_t)m->pindex,
+ (uintmax_t)offset));
+ return ((m->dirty & ((vm_page_bits_t)1 << vn_off2bidx(offset))) != 0);
+}
/*
* This is now called from local media FS's to operate against their
@@ -1154,10 +1171,12 @@ vnode_pager_generic_putpages(struct vnode *vp, vm_page_t *ma, int bytecount,
{
vm_object_t object;
vm_page_t m;
- vm_ooffset_t poffset;
+ vm_ooffset_t maxblksz, next_offset, poffset, prev_offset;
struct uio auio;
struct iovec aiov;
+ off_t prev_resid, wrsz;
int count, error, i, maxsize, ncount, pgoff, ppscheck;
+ bool in_hole;
static struct timeval lastfail;
static int curfail;
@@ -1192,8 +1211,14 @@ vnode_pager_generic_putpages(struct vnode *vp, vm_page_t *ma, int bytecount,
* We do not under any circumstances truncate the valid bits, as
* this will screw up bogus page replacement.
*/
- VM_OBJECT_WLOCK(object);
+ VM_OBJECT_RLOCK(object);
if (maxsize + poffset > object->un_pager.vnp.vnp_size) {
+ if (!VM_OBJECT_TRYUPGRADE(object)) {
+ VM_OBJECT_RUNLOCK(object);
+ VM_OBJECT_WLOCK(object);
+ if (maxsize + poffset <= object->un_pager.vnp.vnp_size)
+ goto downgrade;
+ }
if (object->un_pager.vnp.vnp_size > poffset) {
maxsize = object->un_pager.vnp.vnp_size - poffset;
ncount = btoc(maxsize);
@@ -1218,35 +1243,105 @@ vnode_pager_generic_putpages(struct vnode *vp, vm_page_t *ma, int bytecount,
}
for (i = ncount; i < count; i++)
rtvals[i] = VM_PAGER_BAD;
+downgrade:
+ VM_OBJECT_LOCK_DOWNGRADE(object);
}
- for (i = 0; i < ncount - ((btoc(maxsize) & PAGE_MASK) != 0); i++)
- MPASS(ma[i]->dirty == VM_PAGE_BITS_ALL);
- VM_OBJECT_WUNLOCK(object);
- aiov.iov_base = NULL;
- aiov.iov_len = maxsize;
auio.uio_iov = &aiov;
- auio.uio_iovcnt = 1;
- auio.uio_offset = poffset;
auio.uio_segflg = UIO_NOCOPY;
auio.uio_rw = UIO_WRITE;
- auio.uio_resid = maxsize;
auio.uio_td = NULL;
- error = VOP_WRITE(vp, &auio, vnode_pager_putpages_ioflags(flags),
- curthread->td_ucred);
- PCPU_INC(cnt.v_vnodeout);
- PCPU_ADD(cnt.v_vnodepgsout, ncount);
-
- ppscheck = 0;
- if (error != 0 && (ppscheck = ppsratecheck(&lastfail, &curfail, 1))
- != 0)
- printf("vnode_pager_putpages: I/O error %d\n", error);
- if (auio.uio_resid != 0 && (ppscheck != 0 ||
- ppsratecheck(&lastfail, &curfail, 1) != 0))
- printf("vnode_pager_putpages: residual I/O %zd at %ju\n",
- auio.uio_resid, (uintmax_t)ma[0]->pindex);
- for (i = 0; i < ncount; i++)
+ maxblksz = roundup2(poffset + maxsize, DEV_BSIZE);
+
+ for (prev_offset = poffset; prev_offset < maxblksz;) {
+ /* Skip clean blocks. */
+ for (in_hole = true; in_hole && prev_offset < maxblksz;) {
+ m = ma[OFF_TO_IDX(prev_offset - poffset)];
+ for (i = vn_off2bidx(prev_offset);
+ i < sizeof(vm_page_bits_t) * NBBY &&
+ prev_offset < maxblksz; i++) {
+ if (vn_dirty_blk(m, prev_offset)) {
+ in_hole = false;
+ break;
+ }
+ prev_offset += DEV_BSIZE;
+ }
+ }
+ if (in_hole)
+ goto write_done;
+
+ /* Find longest run of dirty blocks. */
+ for (next_offset = prev_offset; next_offset < maxblksz;) {
+ m = ma[OFF_TO_IDX(next_offset - poffset)];
+ for (i = vn_off2bidx(next_offset);
+ i < sizeof(vm_page_bits_t) * NBBY &&
+ next_offset < maxblksz; i++) {
+ if (!vn_dirty_blk(m, next_offset))
+ goto start_write;
+ next_offset += DEV_BSIZE;
+ }
+ }
+start_write:
+ if (next_offset > poffset + maxsize)
+ next_offset = poffset + maxsize;
+
+ /*
+ * Getting here requires finding a dirty block in the
+ * 'skip clean blocks' loop.
+ */
+ MPASS(prev_offset < next_offset);
+
+ VM_OBJECT_RUNLOCK(object);
+ aiov.iov_base = NULL;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = prev_offset;
+ prev_resid = auio.uio_resid = aiov.iov_len = next_offset -
+ prev_offset;
+ error = VOP_WRITE(vp, &auio,
+ vnode_pager_putpages_ioflags(flags), curthread->td_ucred);
+
+ wrsz = prev_resid - auio.uio_resid;
+ if (wrsz == 0) {
+ if (ppsratecheck(&lastfail, &curfail, 1) != 0) {
+ vn_printf(vp, "vnode_pager_putpages: "
+ "zero-length write at %ju resid %zd\n",
+ auio.uio_offset, auio.uio_resid);
+ }
+ VM_OBJECT_RLOCK(object);
+ break;
+ }
+
+ /* Adjust the starting offset for next iteration. */
+ prev_offset += wrsz;
+ MPASS(auio.uio_offset == prev_offset);
+
+ ppscheck = 0;
+ if (error != 0 && (ppscheck = ppsratecheck(&lastfail,
+ &curfail, 1)) != 0)
+ vn_printf(vp, "vnode_pager_putpages: I/O error %d\n",
+ error);
+ if (auio.uio_resid != 0 && (ppscheck != 0 ||
+ ppsratecheck(&lastfail, &curfail, 1) != 0))
+ vn_printf(vp, "vnode_pager_putpages: residual I/O %zd "
+ "at %ju\n", auio.uio_resid,
+ (uintmax_t)ma[0]->pindex);
+ VM_OBJECT_RLOCK(object);
+ if (error != 0 || auio.uio_resid != 0)
+ break;
+ }
+write_done:
+ /* Mark completely processed pages. */
+ for (i = 0; i < OFF_TO_IDX(prev_offset - poffset); i++)
rtvals[i] = VM_PAGER_OK;
+ /* Mark partial EOF page. */
+ if (prev_offset == poffset + maxsize && (prev_offset & PAGE_MASK) != 0)
+ rtvals[i++] = VM_PAGER_OK;
+	/* Unwritten pages in the range; as a bonus, clean ones count as OK. */
+ for (; i < ncount; i++)
+ rtvals[i] = ma[i]->dirty == 0 ? VM_PAGER_OK : VM_PAGER_ERROR;
+ VM_OBJECT_RUNLOCK(object);
+ PCPU_ADD(cnt.v_vnodepgsout, i);
+ PCPU_INC(cnt.v_vnodeout);
return (rtvals[0]);
}
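
The rewritten vnode_pager_generic_putpages() above no longer issues a single page-aligned VOP_WRITE; it walks the pages' per-DEV_BSIZE dirty bits, skips clean blocks, and writes one run of dirty blocks at a time. The standalone sketch below walks a small per-page dirty bitmap the same way; the page and block sizes, the bitmap, and dirty_blk() are stand-ins for the kernel's vm_page dirty machinery and the vn_dirty_blk() helper.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE	4096
#define DEV_BSIZE	512	/* 8 dirty bits per 4 KB page */

/* One bit per DEV_BSIZE block, like a vm_page's dirty field. */
static bool
dirty_blk(const uint8_t *dirty, long off)
{
	long pidx = off / PAGE_SIZE;
	int bidx = (int)((off % PAGE_SIZE) / DEV_BSIZE);	/* vn_off2bidx() */

	return ((dirty[pidx] & (1u << bidx)) != 0);
}

int
main(void)
{
	/* Three pages: fully dirty, low half dirty, dirty tail only. */
	uint8_t dirty[] = { 0xff, 0x0f, 0xc0 };
	long maxblksz = (long)sizeof(dirty) * PAGE_SIZE;
	long prev, next;

	for (prev = 0; prev < maxblksz;) {
		/* Skip clean blocks. */
		while (prev < maxblksz && !dirty_blk(dirty, prev))
			prev += DEV_BSIZE;
		if (prev >= maxblksz)
			break;
		/* Extend over the run of dirty blocks. */
		for (next = prev; next < maxblksz && dirty_blk(dirty, next);
		    next += DEV_BSIZE)
			;
		printf("write [%ld, %ld), %ld bytes\n", prev, next,
		    next - prev);
		prev = next;	/* a short VOP_WRITE could also stop here */
	}
	return (0);
}

With the bitmap in main() it reports two writes: [0, 6144) and [11264, 12288).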