-rw-r--r--  lib/libz/FREEBSD-upgrade                 |   2
-rw-r--r--  lib/libz/Makefile                        |   2
-rw-r--r--  lib/libz/Symbol.map                      |   2
-rw-r--r--  lib/libz/Versions.def                    |   2
-rw-r--r--  lib/libz/zopen.c                         |   2
-rw-r--r--  sys/amd64/amd64/pmap.c                   | 151
-rw-r--r--  sys/amd64/include/pmap.h                 |  16
-rw-r--r--  sys/cddl/compat/opensolaris/sys/vnode.h  |   3
-rw-r--r--  sys/conf/files                           |   1
-rw-r--r--  sys/fs/tmpfs/tmpfs_fifoops.c             |   1
-rw-r--r--  sys/fs/tmpfs/tmpfs_vnops.c               |  61
-rw-r--r--  sys/i386/i386/pmap.c                     | 155
-rw-r--r--  sys/i386/include/pmap.h                  |  16
-rw-r--r--  sys/kern/subr_uio.c                      |  11
-rw-r--r--  sys/kern/uipc_shm.c                      |   3
-rw-r--r--  sys/vm/device_pager.c                    |   9
-rw-r--r--  sys/vm/sg_pager.c                        |   5
-rw-r--r--  sys/vm/vm_fault.c                        |   2
-rw-r--r--  sys/vm/vm_init.c                         |   2
-rw-r--r--  sys/vm/vm_mmap.c                         |  12
-rw-r--r--  sys/vm/vm_object.c                       | 431
-rw-r--r--  sys/vm/vm_object.h                       |   6
-rw-r--r--  sys/vm/vm_page.c                         | 533
-rw-r--r--  sys/vm/vm_page.h                         |  10
-rw-r--r--  sys/vm/vm_radix.c                        | 956
-rw-r--r--  sys/vm/vm_radix.h                        | 115
-rw-r--r--  sys/vm/vm_reserv.c                       |  64
-rw-r--r--  sys/vm/vnode_pager.c                     |  47
28 files changed, 1860 insertions, 760 deletions
diff --git a/lib/libz/FREEBSD-upgrade b/lib/libz/FREEBSD-upgrade
index d2d251b..7482228 100644
--- a/lib/libz/FREEBSD-upgrade
+++ b/lib/libz/FREEBSD-upgrade
@@ -1,4 +1,4 @@
-$FreeBSD: head/lib/libz/FREEBSD-upgrade 146082 2005-05-11 03:50:50Z kientzle $
+$FreeBSD: user/attilio/vmcontention/lib/libz/FREEBSD-upgrade 146082 2005-05-11 03:50:50Z kientzle $
ZLib 1.2.2
diff --git a/lib/libz/Makefile b/lib/libz/Makefile
index 4ecb022..bd1fa60 100644
--- a/lib/libz/Makefile
+++ b/lib/libz/Makefile
@@ -1,5 +1,5 @@
#
-# $FreeBSD: head/lib/libz/Makefile 232263 2012-02-28 18:30:18Z dim $
+# $FreeBSD: user/attilio/vmcontention/lib/libz/Makefile 232325 2012-03-01 00:27:51Z attilio $
#
LIB= z
diff --git a/lib/libz/Symbol.map b/lib/libz/Symbol.map
index c9eef6f..088cd0c 100644
--- a/lib/libz/Symbol.map
+++ b/lib/libz/Symbol.map
@@ -1,5 +1,5 @@
/*
- * $FreeBSD: head/lib/libz/Symbol.map 206709 2010-04-16 20:07:24Z delphij $
+ * $FreeBSD: user/attilio/vmcontention/lib/libz/Symbol.map 206709 2010-04-16 20:07:24Z delphij $
*/
ZLIB_1.2.7.0 {
diff --git a/lib/libz/Versions.def b/lib/libz/Versions.def
index a5aeb28..6317174 100644
--- a/lib/libz/Versions.def
+++ b/lib/libz/Versions.def
@@ -1,4 +1,4 @@
-# $FreeBSD: head/lib/libz/Versions.def 205486 2010-03-22 22:12:27Z delphij $
+# $FreeBSD: user/attilio/vmcontention/lib/libz/Versions.def 205486 2010-03-22 22:12:27Z delphij $
ZLIB_1.2.4.0 {
};
diff --git a/lib/libz/zopen.c b/lib/libz/zopen.c
index 822a5e1..8c95ab6 100644
--- a/lib/libz/zopen.c
+++ b/lib/libz/zopen.c
@@ -3,7 +3,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/lib/libz/zopen.c 84228 2001-09-30 22:39:00Z dillon $");
+__FBSDID("$FreeBSD: user/attilio/vmcontention/lib/libz/zopen.c 84228 2001-09-30 22:39:00Z dillon $");
#include <stdio.h>
#include <zlib.h>
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index f73e956..5436b6f 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -302,6 +302,7 @@ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
pd_entry_t newpde);
static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde);
+static vm_page_t pmap_vmpage_splay(vm_pindex_t pindex, vm_page_t root);
static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
struct rwlock **lockp);
@@ -1452,7 +1453,8 @@ pmap_free_zero_pages(vm_page_t free)
while (free != NULL) {
m = free;
- free = m->right;
+ free = (void *)m->object;
+ m->object = NULL;
/* Preserve the page's PG_ZERO setting. */
vm_page_free_toq(m);
}
@@ -1471,7 +1473,7 @@ pmap_add_delayed_free_list(vm_page_t m, vm_page_t *free, boolean_t set_PG_ZERO)
m->flags |= PG_ZERO;
else
m->flags &= ~PG_ZERO;
- m->right = *free;
+ m->object = (void *)*free;
*free = m;
}
@@ -1489,20 +1491,20 @@ pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
root = pmap->pm_root;
if (root == NULL) {
- mpte->left = NULL;
- mpte->right = NULL;
+ mpte->md.pv_left = NULL;
+ mpte->md.pv_right = NULL;
} else {
- root = vm_page_splay(mpte->pindex, root);
+ root = pmap_vmpage_splay(mpte->pindex, root);
if (mpte->pindex < root->pindex) {
- mpte->left = root->left;
- mpte->right = root;
- root->left = NULL;
+ mpte->md.pv_left = root->md.pv_left;
+ mpte->md.pv_right = root;
+ root->md.pv_left = NULL;
} else if (mpte->pindex == root->pindex)
panic("pmap_insert_pt_page: pindex already inserted");
else {
- mpte->right = root->right;
- mpte->left = root;
- root->right = NULL;
+ mpte->md.pv_right = root->md.pv_right;
+ mpte->md.pv_left = root;
+ root->md.pv_right = NULL;
}
}
pmap->pm_root = mpte;
@@ -1521,7 +1523,7 @@ pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va)
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
if ((mpte = pmap->pm_root) != NULL && mpte->pindex != pindex) {
- mpte = vm_page_splay(pindex, mpte);
+ mpte = pmap_vmpage_splay(pindex, mpte);
if ((pmap->pm_root = mpte)->pindex != pindex)
mpte = NULL;
}
@@ -1540,18 +1542,24 @@ pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte)
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
if (mpte != pmap->pm_root) {
- root = vm_page_splay(mpte->pindex, pmap->pm_root);
+ root = pmap_vmpage_splay(mpte->pindex, pmap->pm_root);
KASSERT(mpte == root,
("pmap_remove_pt_page: mpte %p is missing from pmap %p",
mpte, pmap));
}
- if (mpte->left == NULL)
- root = mpte->right;
+ if (mpte->md.pv_left == NULL)
+ root = mpte->md.pv_right;
else {
- root = vm_page_splay(mpte->pindex, mpte->left);
- root->right = mpte->right;
+ root = pmap_vmpage_splay(mpte->pindex, mpte->md.pv_left);
+ root->md.pv_right = mpte->md.pv_right;
}
pmap->pm_root = root;
+
+ /*
+ * Reinitialize the pv_list which could be dirty now because of the
+ * splay tree work.
+ */
+ TAILQ_INIT(&mpte->md.pv_list);
}
/*
@@ -1627,6 +1635,61 @@ _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *free)
}
/*
+ * Implements Sleator and Tarjan's top-down splay algorithm. Returns
+ * the vm_page containing the given pindex. If, however, that
+ * pindex is not found in the pmap, returns a vm_page that is
+ * adjacent to the pindex, coming before or after it.
+ */
+static vm_page_t
+pmap_vmpage_splay(vm_pindex_t pindex, vm_page_t root)
+{
+ struct vm_page dummy;
+ vm_page_t lefttreemax, righttreemin, y;
+
+ if (root == NULL)
+ return (root);
+ lefttreemax = righttreemin = &dummy;
+ for (;; root = y) {
+ if (pindex < root->pindex) {
+ if ((y = root->md.pv_left) == NULL)
+ break;
+ if (pindex < y->pindex) {
+ /* Rotate right. */
+ root->md.pv_left = y->md.pv_right;
+ y->md.pv_right = root;
+ root = y;
+ if ((y = root->md.pv_left) == NULL)
+ break;
+ }
+ /* Link into the new root's right tree. */
+ righttreemin->md.pv_left = root;
+ righttreemin = root;
+ } else if (pindex > root->pindex) {
+ if ((y = root->md.pv_right) == NULL)
+ break;
+ if (pindex > y->pindex) {
+ /* Rotate left. */
+ root->md.pv_right = y->md.pv_left;
+ y->md.pv_left = root;
+ root = y;
+ if ((y = root->md.pv_right) == NULL)
+ break;
+ }
+ /* Link into the new root's left tree. */
+ lefttreemax->md.pv_right = root;
+ lefttreemax = root;
+ } else
+ break;
+ }
+ /* Assemble the new root. */
+ lefttreemax->md.pv_right = root->md.pv_left;
+ righttreemin->md.pv_left = root->md.pv_right;
+ root->md.pv_left = dummy.md.pv_right;
+ root->md.pv_right = dummy.md.pv_left;
+ return (root);
+}
+
+/*
* After removing a page table entry, this routine is used to
* conditionally free the page, and manage the hold/wire counts.
*/
@@ -2176,7 +2239,7 @@ reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
if ((tpte & PG_A) != 0)
vm_page_aflag_set(m, PGA_REFERENCED);
CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
- TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
+ TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
if (TAILQ_EMPTY(&m->md.pv_list) &&
(m->flags & PG_FICTITIOUS) == 0) {
pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
@@ -2228,7 +2291,7 @@ reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
}
if (m_pc == NULL && free != NULL) {
m_pc = free;
- free = m_pc->right;
+ free = (void *)m_pc->object;
/* Recycle a freed page table page. */
m_pc->wire_count = 1;
atomic_add_int(&cnt.v_wire_count, 1);
@@ -2460,9 +2523,9 @@ pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
pv_entry_t pv;
rw_assert(&pvh_global_lock, RA_LOCKED);
- TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
+ TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
- TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
+ TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
break;
}
}
@@ -2501,7 +2564,7 @@ pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
pv = pmap_pvh_remove(pvh, pmap, va);
KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
m = PHYS_TO_VM_PAGE(pa);
- TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
+ TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
/* Instantiate the remaining NPTEPG - 1 pv entries. */
PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1));
va_last = va + NBPDR - PAGE_SIZE;
@@ -2519,7 +2582,7 @@ pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
m++;
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
("pmap_pv_demote_pde: page %p is not managed", m));
- TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
+ TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
if (va == va_last)
goto out;
}
@@ -2567,7 +2630,7 @@ pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
pv = pmap_pvh_remove(&m->md, pmap, va);
KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
pvh = pa_to_pvh(pa);
- TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
+ TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
/* Free the remaining NPTEPG - 1 pv entries. */
va_last = va + NBPDR - PAGE_SIZE;
do {
@@ -2608,7 +2671,7 @@ pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
pv->pv_va = va;
CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
- TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
+ TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
return (TRUE);
} else
return (FALSE);
@@ -2632,7 +2695,7 @@ pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
pv->pv_va = va;
CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
pvh = pa_to_pvh(pa);
- TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
+ TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
return (TRUE);
} else
return (FALSE);
@@ -3111,7 +3174,7 @@ small_mappings:
vm_page_dirty(m);
pmap_unuse_pt(pmap, pv->pv_va, *pde, &free);
pmap_invalidate_page(pmap, pv->pv_va);
- TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
+ TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
free_pv_entry(pmap, pv);
PMAP_UNLOCK(pmap);
}
@@ -4250,7 +4313,7 @@ pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
rw_rlock(&pvh_global_lock);
lock = VM_PAGE_TO_PV_LIST_LOCK(m);
rw_rlock(lock);
- TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
+ TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
if (PV_PMAP(pv) == pmap) {
rv = TRUE;
break;
@@ -4261,7 +4324,7 @@ pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
}
if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
- TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
+ TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
if (PV_PMAP(pv) == pmap) {
rv = TRUE;
break;
@@ -4313,7 +4376,7 @@ pmap_pvh_wired_mappings(struct md_page *pvh, int count)
pv_entry_t pv;
rw_assert(&pvh_global_lock, RA_WLOCKED);
- TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
+ TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
pmap = PV_PMAP(pv);
PMAP_LOCK(pmap);
pte = pmap_pte(pmap, pv->pv_va);
@@ -4442,7 +4505,7 @@ pmap_remove_pages(pmap_t pmap)
if ((tpte & PG_PS) != 0) {
pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
pvh = pa_to_pvh(tpte & PG_PS_FRAME);
- TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
+ TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
if (TAILQ_EMPTY(&pvh->pv_list)) {
for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
if ((mt->aflags & PGA_WRITEABLE) != 0 &&
@@ -4461,7 +4524,7 @@ pmap_remove_pages(pmap_t pmap)
}
} else {
pmap_resident_count_dec(pmap, 1);
- TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
+ TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
if ((m->aflags & PGA_WRITEABLE) != 0 &&
TAILQ_EMPTY(&m->md.pv_list) &&
(m->flags & PG_FICTITIOUS) == 0) {
@@ -4536,7 +4599,7 @@ pmap_is_modified_pvh(struct md_page *pvh)
rw_assert(&pvh_global_lock, RA_WLOCKED);
rv = FALSE;
- TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
+ TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
pmap = PV_PMAP(pv);
PMAP_LOCK(pmap);
pte = pmap_pte(pmap, pv->pv_va);
@@ -4607,7 +4670,7 @@ pmap_is_referenced_pvh(struct md_page *pvh)
rw_assert(&pvh_global_lock, RA_WLOCKED);
rv = FALSE;
- TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
+ TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
pmap = PV_PMAP(pv);
PMAP_LOCK(pmap);
pte = pmap_pte(pmap, pv->pv_va);
@@ -4648,7 +4711,7 @@ pmap_remove_write(vm_page_t m)
if ((m->flags & PG_FICTITIOUS) != 0)
goto small_mappings;
pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
- TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
+ TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
pmap = PV_PMAP(pv);
PMAP_LOCK(pmap);
va = pv->pv_va;
@@ -4658,7 +4721,7 @@ pmap_remove_write(vm_page_t m)
PMAP_UNLOCK(pmap);
}
small_mappings:
- TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
+ TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
pmap = PV_PMAP(pv);
PMAP_LOCK(pmap);
pde = pmap_pde(pmap, pv->pv_va);
@@ -4711,7 +4774,7 @@ pmap_ts_referenced(vm_page_t m)
if ((m->flags & PG_FICTITIOUS) != 0)
goto small_mappings;
pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
- TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, pvn) {
+ TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, pvn) {
pmap = PV_PMAP(pv);
PMAP_LOCK(pmap);
va = pv->pv_va;
@@ -4745,9 +4808,9 @@ small_mappings:
if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
pvf = pv;
do {
- pvn = TAILQ_NEXT(pv, pv_list);
- TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
- TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
+ pvn = TAILQ_NEXT(pv, pv_next);
+ TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
+ TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
pmap = PV_PMAP(pv);
PMAP_LOCK(pmap);
pde = pmap_pde(pmap, pv->pv_va);
@@ -4799,7 +4862,7 @@ pmap_clear_modify(vm_page_t m)
if ((m->flags & PG_FICTITIOUS) != 0)
goto small_mappings;
pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
- TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
+ TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
pmap = PV_PMAP(pv);
PMAP_LOCK(pmap);
va = pv->pv_va;
@@ -4831,7 +4894,7 @@ pmap_clear_modify(vm_page_t m)
PMAP_UNLOCK(pmap);
}
small_mappings:
- TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
+ TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
pmap = PV_PMAP(pv);
PMAP_LOCK(pmap);
pde = pmap_pde(pmap, pv->pv_va);
@@ -4868,7 +4931,7 @@ pmap_clear_reference(vm_page_t m)
if ((m->flags & PG_FICTITIOUS) != 0)
goto small_mappings;
pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
- TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
+ TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
pmap = PV_PMAP(pv);
PMAP_LOCK(pmap);
va = pv->pv_va;
@@ -4891,7 +4954,7 @@ pmap_clear_reference(vm_page_t m)
PMAP_UNLOCK(pmap);
}
small_mappings:
- TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
+ TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
pmap = PV_PMAP(pv);
PMAP_LOCK(pmap);
pde = pmap_pde(pmap, pv->pv_va);
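
Note on the pmap changes above: with the object's resident pages moving to a radix tree, vm_page loses its dedicated left/right splay links. The pmap therefore keeps its page-table-page splay tree in the machine-dependent md.pv_left/pv_right fields, and the delayed-free list built by pmap_add_delayed_free_list() is threaded through the otherwise unused m->object pointer. The code below is a minimal user-space sketch of that free-list idiom only, not kernel code; the struct and field names are stand-ins.

/*
 * Sketch: an intrusive singly linked free list threaded through a pointer
 * field that is unused while the page sits on the list ("owner" stands in
 * for the repurposed m->object field).
 */
#include <stdio.h>

struct page {
	int	 id;
	void	*owner;			/* reused as the "next free" link */
};

/* Mirrors pmap_add_delayed_free_list(): push onto the free list. */
static void
delayed_free_push(struct page *p, struct page **freep)
{
	p->owner = *freep;
	*freep = p;
}

/* Mirrors pmap_free_zero_pages(): pop and release every queued page. */
static void
delayed_free_drain(struct page *free)
{
	struct page *p;

	while (free != NULL) {
		p = free;
		free = p->owner;
		p->owner = NULL;
		printf("freeing page %d\n", p->id);
	}
}

int
main(void)
{
	struct page pages[3] = { { 1, NULL }, { 2, NULL }, { 3, NULL } };
	struct page *free = NULL;

	for (int i = 0; i < 3; i++)
		delayed_free_push(&pages[i], &free);
	delayed_free_drain(free);	/* prints pages 3, 2, 1 (LIFO) */
	return (0);
}
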
diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h
index 71045ae..492354d 100644
--- a/sys/amd64/include/pmap.h
+++ b/sys/amd64/include/pmap.h
@@ -240,10 +240,20 @@ struct pv_entry;
struct pv_chunk;
struct md_page {
- TAILQ_HEAD(,pv_entry) pv_list;
- int pat_mode;
+ union {
+ TAILQ_HEAD(,pv_entry) pvi_list;
+ struct {
+ vm_page_t pii_left;
+ vm_page_t pii_right;
+ } pvi_siters;
+ } pv_structs;
+ int pat_mode;
};
+#define pv_list pv_structs.pvi_list
+#define pv_left pv_structs.pvi_siters.pii_left
+#define pv_right pv_structs.pvi_siters.pii_right
+
/*
* The kernel virtual address (KVA) of the level 4 page table page is always
* within the direct map (DMAP) region.
@@ -282,7 +292,7 @@ extern struct pmap kernel_pmap_store;
*/
typedef struct pv_entry {
vm_offset_t pv_va; /* virtual address for mapping */
- TAILQ_ENTRY(pv_entry) pv_list;
+ TAILQ_ENTRY(pv_entry) pv_next;
} *pv_entry_t;
/*
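
The md_page union in this header works because the two uses are mutually exclusive: a managed page carries a pv_entry list, while a page-table page is tracked only through the pmap's splay-tree links, never both at once. Below is a minimal, hypothetical sketch of the overlapping layout (simplified stand-in types, not the real amd64 definitions):

#include <stdio.h>
#include <sys/queue.h>

struct pv_entry;			/* opaque for the sketch */
struct vm_page;

struct md_page_sketch {
	union {
		TAILQ_HEAD(, pv_entry) pvi_list;	/* managed mappings */
		struct {
			struct vm_page *pii_left;	/* splay links for a */
			struct vm_page *pii_right;	/* page-table page   */
		} pvi_siters;
	} pv_structs;
	int pat_mode;
};

int
main(void)
{
	/* The union costs only the larger of its two members. */
	printf("md_page sketch: %zu bytes (list head %zu, two links %zu)\n",
	    sizeof(struct md_page_sketch),
	    sizeof(((struct md_page_sketch *)0)->pv_structs.pvi_list),
	    sizeof(((struct md_page_sketch *)0)->pv_structs.pvi_siters));
	return (0);
}
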
diff --git a/sys/cddl/compat/opensolaris/sys/vnode.h b/sys/cddl/compat/opensolaris/sys/vnode.h
index eee00a0..d90ed91 100644
--- a/sys/cddl/compat/opensolaris/sys/vnode.h
+++ b/sys/cddl/compat/opensolaris/sys/vnode.h
@@ -75,8 +75,7 @@ vn_is_readonly(vnode_t *vp)
#define vn_mountedvfs(vp) ((vp)->v_mountedhere)
#define vn_has_cached_data(vp) \
((vp)->v_object != NULL && \
- ((vp)->v_object->resident_page_count > 0 || \
- (vp)->v_object->cache != NULL))
+ (vp)->v_object->cached_page_count > 0)
#define vn_exists(vp) do { } while (0)
#define vn_invalid(vp) do { } while (0)
#define vn_renamepath(tdvp, svp, tnm, lentnm) do { } while (0)
diff --git a/sys/conf/files b/sys/conf/files
index f4961e9..2a3db4f 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -3626,6 +3626,7 @@ vm/vm_page.c standard
vm/vm_pageout.c standard
vm/vm_pager.c standard
vm/vm_phys.c standard
+vm/vm_radix.c standard
vm/vm_reserv.c standard
vm/vm_unix.c standard
vm/vm_zeroidle.c standard
diff --git a/sys/fs/tmpfs/tmpfs_fifoops.c b/sys/fs/tmpfs/tmpfs_fifoops.c
index 347732b..e312a1e 100644
--- a/sys/fs/tmpfs/tmpfs_fifoops.c
+++ b/sys/fs/tmpfs/tmpfs_fifoops.c
@@ -39,6 +39,7 @@
#include <sys/param.h>
#include <sys/filedesc.h>
#include <sys/proc.h>
+#include <sys/systm.h>
#include <sys/vnode.h>
#include <vm/vm.h>
diff --git a/sys/fs/tmpfs/tmpfs_vnops.c b/sys/fs/tmpfs/tmpfs_vnops.c
index 5e2ea65..7ffff53 100644
--- a/sys/fs/tmpfs/tmpfs_vnops.c
+++ b/sys/fs/tmpfs/tmpfs_vnops.c
@@ -511,11 +511,14 @@ tmpfs_mappedread(vm_object_t vobj, vm_object_t tobj, size_t len, struct uio *uio
offset = addr & PAGE_MASK;
tlen = MIN(PAGE_SIZE - offset, len);
- if ((vobj == NULL) ||
- (vobj->resident_page_count == 0 && vobj->cache == NULL))
+ if (vobj == NULL)
goto nocache;
VM_OBJECT_LOCK(vobj);
+ if (vobj->resident_page_count == 0 && vobj->cached_page_count == 0) {
+ VM_OBJECT_UNLOCK(vobj);
+ goto nocache;
+ }
lookupvpg:
if (((m = vm_page_lookup(vobj, idx)) != NULL) &&
vm_page_is_valid(m, offset, tlen)) {
@@ -639,35 +642,47 @@ tmpfs_mappedwrite(vm_object_t vobj, vm_object_t tobj, size_t len, struct uio *ui
offset = addr & PAGE_MASK;
tlen = MIN(PAGE_SIZE - offset, len);
- if ((vobj == NULL) ||
- (vobj->resident_page_count == 0 && vobj->cache == NULL)) {
+ if (vobj == NULL) {
vpg = NULL;
goto nocache;
}
VM_OBJECT_LOCK(vobj);
-lookupvpg:
- if (((vpg = vm_page_lookup(vobj, idx)) != NULL) &&
- vm_page_is_valid(vpg, offset, tlen)) {
- if ((vpg->oflags & VPO_BUSY) != 0) {
- /*
- * Reference the page before unlocking and sleeping so
- * that the page daemon is less likely to reclaim it.
- */
- vm_page_reference(vpg);
- vm_page_sleep(vpg, "tmfsmw");
- goto lookupvpg;
- }
- vm_page_busy(vpg);
- vm_page_undirty(vpg);
- VM_OBJECT_UNLOCK(vobj);
- error = uiomove_fromphys(&vpg, offset, tlen, uio);
- } else {
- if (vm_page_is_cached(vobj, idx))
- vm_page_cache_free(vobj, idx, idx + 1);
+ if (vobj->resident_page_count == 0 && vobj->cached_page_count == 0) {
VM_OBJECT_UNLOCK(vobj);
vpg = NULL;
+ goto nocache;
}
+lookupvpg:
+ vpg = vm_radix_lookup(&vobj->rtree, idx, VM_RADIX_ANY);
+ if (vpg != NULL) {
+ if (vm_page_is_valid(vpg, offset, tlen)) {
+ if ((vpg->oflags & VPO_BUSY) != 0) {
+ /*
+ * Reference the page before unlocking and
+ * sleeping so that the page daemon is less
+ * likely to reclaim it.
+ */
+ vm_page_reference(vpg);
+ vm_page_sleep(vpg, "tmfsmw");
+ goto lookupvpg;
+ }
+ vm_page_busy(vpg);
+ vm_page_undirty(vpg);
+ VM_OBJECT_UNLOCK(vobj);
+ error = uiomove_fromphys(&vpg, offset, tlen, uio);
+ } else {
+ if (vpg->flags & PG_CACHED) {
+ mtx_lock(&vm_page_queue_free_mtx);
+ if (vpg->object == vobj)
+ vm_page_cache_free(vpg);
+ mtx_unlock(&vm_page_queue_free_mtx);
+ }
+ VM_OBJECT_UNLOCK(vobj);
+ vpg = NULL;
+ }
+ } else
+ VM_OBJECT_UNLOCK(vobj);
nocache:
VM_OBJECT_LOCK(tobj);
tpg = vm_page_grab(tobj, idx, VM_ALLOC_WIRED |
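
In the tmpfs write path above, a single vm_radix_lookup() with VM_RADIX_ANY can now return either a resident page or a cached one, so the caller distinguishes the two with PG_CACHED and only frees a cached page under vm_page_queue_free_mtx after rechecking that it still belongs to the object (another thread may have reallocated it in the meantime). A rough user-space sketch of that check-under-lock pattern follows; every name in it is invented for illustration.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define	FLAG_CACHED	0x01

struct entry {
	int	 flags;
	void	*owner;
};

static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;

/* Free the cached entry only if it still belongs to "owner". */
static bool
cache_free_if_owned(struct entry *e, void *owner)
{
	bool freed = false;

	pthread_mutex_lock(&cache_lock);
	if ((e->flags & FLAG_CACHED) != 0 && e->owner == owner) {
		e->flags &= ~FLAG_CACHED;
		e->owner = NULL;
		freed = true;
	}
	pthread_mutex_unlock(&cache_lock);
	return (freed);
}

int
main(void)
{
	int obj;				/* stands in for the vm_object */
	struct entry e = { FLAG_CACHED, &obj };

	printf("freed: %d\n", cache_free_if_owned(&e, &obj));
	printf("freed again: %d\n", cache_free_if_owned(&e, &obj));
	return (0);
}
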
diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c
index a9d031d..f6ce067 100644
--- a/sys/i386/i386/pmap.c
+++ b/sys/i386/i386/pmap.c
@@ -330,6 +330,7 @@ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
pd_entry_t newpde);
static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde);
+static vm_page_t pmap_vmpage_splay(vm_pindex_t pindex, vm_page_t root);
static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags);
@@ -1574,7 +1575,8 @@ pmap_free_zero_pages(vm_page_t free)
while (free != NULL) {
m = free;
- free = m->right;
+ free = (void *)m->object;
+ m->object = NULL;
/* Preserve the page's PG_ZERO setting. */
vm_page_free_toq(m);
}
@@ -1593,7 +1595,7 @@ pmap_add_delayed_free_list(vm_page_t m, vm_page_t *free, boolean_t set_PG_ZERO)
m->flags |= PG_ZERO;
else
m->flags &= ~PG_ZERO;
- m->right = *free;
+ m->object = (void *)*free;
*free = m;
}
@@ -1611,20 +1613,20 @@ pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
root = pmap->pm_root;
if (root == NULL) {
- mpte->left = NULL;
- mpte->right = NULL;
+ mpte->md.pv_left = NULL;
+ mpte->md.pv_right = NULL;
} else {
- root = vm_page_splay(mpte->pindex, root);
+ root = pmap_vmpage_splay(mpte->pindex, root);
if (mpte->pindex < root->pindex) {
- mpte->left = root->left;
- mpte->right = root;
- root->left = NULL;
+ mpte->md.pv_left = root->md.pv_left;
+ mpte->md.pv_right = root;
+ root->md.pv_left = NULL;
} else if (mpte->pindex == root->pindex)
panic("pmap_insert_pt_page: pindex already inserted");
else {
- mpte->right = root->right;
- mpte->left = root;
- root->right = NULL;
+ mpte->md.pv_right = root->md.pv_right;
+ mpte->md.pv_left = root;
+ root->md.pv_right = NULL;
}
}
pmap->pm_root = mpte;
@@ -1643,7 +1645,7 @@ pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va)
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
if ((mpte = pmap->pm_root) != NULL && mpte->pindex != pindex) {
- mpte = vm_page_splay(pindex, mpte);
+ mpte = pmap_vmpage_splay(pindex, mpte);
if ((pmap->pm_root = mpte)->pindex != pindex)
mpte = NULL;
}
@@ -1662,14 +1664,20 @@ pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte)
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
if (mpte != pmap->pm_root)
- vm_page_splay(mpte->pindex, pmap->pm_root);
- if (mpte->left == NULL)
- root = mpte->right;
+ pmap_vmpage_splay(mpte->pindex, pmap->pm_root);
+ if (mpte->md.pv_left == NULL)
+ root = mpte->md.pv_right;
else {
- root = vm_page_splay(mpte->pindex, mpte->left);
- root->right = mpte->right;
+ root = pmap_vmpage_splay(mpte->pindex, mpte->md.pv_left);
+ root->md.pv_right = mpte->md.pv_right;
}
pmap->pm_root = root;
+
+ /*
+ * Reinitialize the pv_list which could be dirty now because of the
+ * splay tree work.
+ */
+ TAILQ_INIT(&mpte->md.pv_list);
}
/*
@@ -1723,6 +1731,61 @@ _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, vm_page_t *free)
}
/*
+ * Implements Sleator and Tarjan's top-down splay algorithm. Returns
+ * the vm_page containing the given pindex. If, however, that
+ * pindex is not found in the pmap, returns a vm_page that is
+ * adjacent to the pindex, coming before or after it.
+ */
+static vm_page_t
+pmap_vmpage_splay(vm_pindex_t pindex, vm_page_t root)
+{
+ struct vm_page dummy;
+ vm_page_t lefttreemax, righttreemin, y;
+
+ if (root == NULL)
+ return (root);
+ lefttreemax = righttreemin = &dummy;
+ for (;; root = y) {
+ if (pindex < root->pindex) {
+ if ((y = root->md.pv_left) == NULL)
+ break;
+ if (pindex < y->pindex) {
+ /* Rotate right. */
+ root->md.pv_left = y->md.pv_right;
+ y->md.pv_right = root;
+ root = y;
+ if ((y = root->md.pv_left) == NULL)
+ break;
+ }
+ /* Link into the new root's right tree. */
+ righttreemin->md.pv_left = root;
+ righttreemin = root;
+ } else if (pindex > root->pindex) {
+ if ((y = root->md.pv_right) == NULL)
+ break;
+ if (pindex > y->pindex) {
+ /* Rotate left. */
+ root->md.pv_right = y->md.pv_left;
+ y->md.pv_left = root;
+ root = y;
+ if ((y = root->md.pv_right) == NULL)
+ break;
+ }
+ /* Link into the new root's left tree. */
+ lefttreemax->md.pv_right = root;
+ lefttreemax = root;
+ } else
+ break;
+ }
+ /* Assemble the new root. */
+ lefttreemax->md.pv_right = root->md.pv_left;
+ righttreemin->md.pv_left = root->md.pv_right;
+ root->md.pv_left = dummy.md.pv_right;
+ root->md.pv_right = dummy.md.pv_left;
+ return (root);
+}
+
+/*
* After removing a page table entry, this routine is used to
* conditionally free the page, and manage the hold/wire counts.
*/
@@ -2286,7 +2349,7 @@ pmap_pv_reclaim(pmap_t locked_pmap)
vm_page_dirty(m);
if ((tpte & PG_A) != 0)
vm_page_aflag_set(m, PGA_REFERENCED);
- TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
+ TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
if (TAILQ_EMPTY(&m->md.pv_list) &&
(m->flags & PG_FICTITIOUS) == 0) {
pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
@@ -2344,7 +2407,7 @@ out:
}
if (m_pc == NULL && pv_vafree != 0 && free != NULL) {
m_pc = free;
- free = m_pc->right;
+ free = (void *)m_pc->object;
/* Recycle a freed page table page. */
m_pc->wire_count = 1;
atomic_add_int(&cnt.v_wire_count, 1);
@@ -2491,9 +2554,9 @@ pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
pv_entry_t pv;
rw_assert(&pvh_global_lock, RA_WLOCKED);
- TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
+ TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
- TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
+ TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
break;
}
}
@@ -2521,7 +2584,7 @@ pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
pv = pmap_pvh_remove(pvh, pmap, va);
KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
m = PHYS_TO_VM_PAGE(pa);
- TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
+ TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
/* Instantiate the remaining NPTEPG - 1 pv entries. */
va_last = va + NBPDR - PAGE_SIZE;
do {
@@ -2557,7 +2620,7 @@ pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
pv = pmap_pvh_remove(&m->md, pmap, va);
KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
pvh = pa_to_pvh(pa);
- TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
+ TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
/* Free the remaining NPTEPG - 1 pv entries. */
va_last = va + NBPDR - PAGE_SIZE;
do {
@@ -2604,7 +2667,7 @@ pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
pv = get_pv_entry(pmap, FALSE);
pv->pv_va = va;
- TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
+ TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
}
/*
@@ -2620,7 +2683,7 @@ pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
if (pv_entry_count < pv_entry_high_water &&
(pv = get_pv_entry(pmap, TRUE)) != NULL) {
pv->pv_va = va;
- TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
+ TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
return (TRUE);
} else
return (FALSE);
@@ -2640,7 +2703,7 @@ pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
(pv = get_pv_entry(pmap, TRUE)) != NULL) {
pv->pv_va = va;
pvh = pa_to_pvh(pa);
- TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
+ TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
return (TRUE);
} else
return (FALSE);
@@ -3095,7 +3158,7 @@ small_mappings:
vm_page_dirty(m);
pmap_unuse_pt(pmap, pv->pv_va, &free);
pmap_invalidate_page(pmap, pv->pv_va);
- TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
+ TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
free_pv_entry(pmap, pv);
PMAP_UNLOCK(pmap);
}
@@ -3551,7 +3614,7 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m,
if (pv == NULL)
pv = get_pv_entry(pmap, FALSE);
pv->pv_va = va;
- TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
+ TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
pa |= PG_MANAGED;
} else if (pv != NULL)
free_pv_entry(pmap, pv);
@@ -4259,7 +4322,7 @@ pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
("pmap_page_exists_quick: page %p is not managed", m));
rv = FALSE;
rw_wlock(&pvh_global_lock);
- TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
+ TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
if (PV_PMAP(pv) == pmap) {
rv = TRUE;
break;
@@ -4270,7 +4333,7 @@ pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
}
if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
- TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
+ TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
if (PV_PMAP(pv) == pmap) {
rv = TRUE;
break;
@@ -4322,7 +4385,7 @@ pmap_pvh_wired_mappings(struct md_page *pvh, int count)
rw_assert(&pvh_global_lock, RA_WLOCKED);
sched_pin();
- TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
+ TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
pmap = PV_PMAP(pv);
PMAP_LOCK(pmap);
pte = pmap_pte_quick(pmap, pv->pv_va);
@@ -4449,7 +4512,7 @@ pmap_remove_pages(pmap_t pmap)
if ((tpte & PG_PS) != 0) {
pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
pvh = pa_to_pvh(tpte & PG_PS_FRAME);
- TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
+ TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
if (TAILQ_EMPTY(&pvh->pv_list)) {
for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
if (TAILQ_EMPTY(&mt->md.pv_list))
@@ -4467,7 +4530,7 @@ pmap_remove_pages(pmap_t pmap)
}
} else {
pmap->pm_stats.resident_count--;
- TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
+ TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
if (TAILQ_EMPTY(&m->md.pv_list) &&
(m->flags & PG_FICTITIOUS) == 0) {
pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
@@ -4537,7 +4600,7 @@ pmap_is_modified_pvh(struct md_page *pvh)
rw_assert(&pvh_global_lock, RA_WLOCKED);
rv = FALSE;
sched_pin();
- TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
+ TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
pmap = PV_PMAP(pv);
PMAP_LOCK(pmap);
pte = pmap_pte_quick(pmap, pv->pv_va);
@@ -4610,7 +4673,7 @@ pmap_is_referenced_pvh(struct md_page *pvh)
rw_assert(&pvh_global_lock, RA_WLOCKED);
rv = FALSE;
sched_pin();
- TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
+ TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
pmap = PV_PMAP(pv);
PMAP_LOCK(pmap);
pte = pmap_pte_quick(pmap, pv->pv_va);
@@ -4653,7 +4716,7 @@ pmap_remove_write(vm_page_t m)
if ((m->flags & PG_FICTITIOUS) != 0)
goto small_mappings;
pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
- TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
+ TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
va = pv->pv_va;
pmap = PV_PMAP(pv);
PMAP_LOCK(pmap);
@@ -4663,7 +4726,7 @@ pmap_remove_write(vm_page_t m)
PMAP_UNLOCK(pmap);
}
small_mappings:
- TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
+ TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
pmap = PV_PMAP(pv);
PMAP_LOCK(pmap);
pde = pmap_pde(pmap, pv->pv_va);
@@ -4722,7 +4785,7 @@ pmap_ts_referenced(vm_page_t m)
sched_pin();
if ((m->flags & PG_FICTITIOUS) != 0)
goto small_mappings;
- TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, pvn) {
+ TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, pvn) {
va = pv->pv_va;
pmap = PV_PMAP(pv);
PMAP_LOCK(pmap);
@@ -4756,9 +4819,9 @@ small_mappings:
if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
pvf = pv;
do {
- pvn = TAILQ_NEXT(pv, pv_list);
- TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
- TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
+ pvn = TAILQ_NEXT(pv, pv_next);
+ TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
+ TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
pmap = PV_PMAP(pv);
PMAP_LOCK(pmap);
pde = pmap_pde(pmap, pv->pv_va);
@@ -4812,7 +4875,7 @@ pmap_clear_modify(vm_page_t m)
if ((m->flags & PG_FICTITIOUS) != 0)
goto small_mappings;
pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
- TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
+ TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
va = pv->pv_va;
pmap = PV_PMAP(pv);
PMAP_LOCK(pmap);
@@ -4849,7 +4912,7 @@ pmap_clear_modify(vm_page_t m)
PMAP_UNLOCK(pmap);
}
small_mappings:
- TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
+ TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
pmap = PV_PMAP(pv);
PMAP_LOCK(pmap);
pde = pmap_pde(pmap, pv->pv_va);
@@ -4893,7 +4956,7 @@ pmap_clear_reference(vm_page_t m)
if ((m->flags & PG_FICTITIOUS) != 0)
goto small_mappings;
pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
- TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
+ TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
va = pv->pv_va;
pmap = PV_PMAP(pv);
PMAP_LOCK(pmap);
@@ -4916,7 +4979,7 @@ pmap_clear_reference(vm_page_t m)
PMAP_UNLOCK(pmap);
}
small_mappings:
- TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
+ TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
pmap = PV_PMAP(pv);
PMAP_LOCK(pmap);
pde = pmap_pde(pmap, pv->pv_va);
@@ -5427,7 +5490,7 @@ pmap_pvdump(vm_paddr_t pa)
printf("pa %x", pa);
m = PHYS_TO_VM_PAGE(pa);
- TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
+ TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
pmap = PV_PMAP(pv);
printf(" -> pmap %p, va %x", (void *)pmap, pv->pv_va);
pads(pmap);
diff --git a/sys/i386/include/pmap.h b/sys/i386/include/pmap.h
index 7a827f8..9ba2752 100644
--- a/sys/i386/include/pmap.h
+++ b/sys/i386/include/pmap.h
@@ -426,10 +426,20 @@ struct pv_entry;
struct pv_chunk;
struct md_page {
- TAILQ_HEAD(,pv_entry) pv_list;
- int pat_mode;
+ union {
+ TAILQ_HEAD(,pv_entry) pvi_list;
+ struct {
+ vm_page_t pii_left;
+ vm_page_t pii_right;
+ } pvi_siters;
+ } pv_structs;
+ int pat_mode;
};
+#define pv_list pv_structs.pvi_list
+#define pv_left pv_structs.pvi_siters.pii_left
+#define pv_right pv_structs.pvi_siters.pii_right
+
struct pmap {
struct mtx pm_mtx;
pd_entry_t *pm_pdir; /* KVA of page directory */
@@ -468,7 +478,7 @@ extern struct pmap kernel_pmap_store;
*/
typedef struct pv_entry {
vm_offset_t pv_va; /* virtual address for mapping */
- TAILQ_ENTRY(pv_entry) pv_list;
+ TAILQ_ENTRY(pv_entry) pv_next;
} *pv_entry_t;
/*
diff --git a/sys/kern/subr_uio.c b/sys/kern/subr_uio.c
index 7b593487..becc0b8 100644
--- a/sys/kern/subr_uio.c
+++ b/sys/kern/subr_uio.c
@@ -85,6 +85,7 @@ vm_pgmoveco(vm_map_t mapa, vm_offset_t kaddr, vm_offset_t uaddr)
vm_map_entry_t entry;
vm_pindex_t upindex;
vm_prot_t prot;
+ vm_page_bits_t vbits;
boolean_t wired;
KASSERT((uaddr & PAGE_MASK) == 0,
@@ -95,6 +96,7 @@ vm_pgmoveco(vm_map_t mapa, vm_offset_t kaddr, vm_offset_t uaddr)
* unwired in sf_buf_mext().
*/
kern_pg = PHYS_TO_VM_PAGE(vtophys(kaddr));
+ vbits = kern_pg->valid;
kern_pg->valid = VM_PAGE_BITS_ALL;
KASSERT(kern_pg->queue == PQ_NONE && kern_pg->wire_count == 1,
("vm_pgmoveco: kern_pg is not correctly wired"));
@@ -105,6 +107,13 @@ vm_pgmoveco(vm_map_t mapa, vm_offset_t kaddr, vm_offset_t uaddr)
return(EFAULT);
}
VM_OBJECT_LOCK(uobject);
+ if (vm_page_insert(kern_pg, uobject, upindex) != 0) {
+ kern_pg->valid = vbits;
+ VM_OBJECT_UNLOCK(uobject);
+ vm_map_lookup_done(map, entry);
+ return(ENOMEM);
+ }
+ vm_page_dirty(kern_pg);
retry:
if ((user_pg = vm_page_lookup(uobject, upindex)) != NULL) {
if (vm_page_sleep_if_busy(user_pg, TRUE, "vm_pgmoveco"))
@@ -122,8 +131,6 @@ retry:
if (uobject->backing_object != NULL)
pmap_remove(map->pmap, uaddr, uaddr + PAGE_SIZE);
}
- vm_page_insert(kern_pg, uobject, upindex);
- vm_page_dirty(kern_pg);
VM_OBJECT_UNLOCK(uobject);
vm_map_lookup_done(map, entry);
return(KERN_SUCCESS);
diff --git a/sys/kern/uipc_shm.c b/sys/kern/uipc_shm.c
index 7f75bdc..64169b2 100644
--- a/sys/kern/uipc_shm.c
+++ b/sys/kern/uipc_shm.c
@@ -278,7 +278,8 @@ shm_dotruncate(struct shmfd *shmfd, off_t length)
if (base != 0) {
idx = OFF_TO_IDX(length);
retry:
- m = vm_page_lookup(object, idx);
+ m = vm_radix_lookup(&object->rtree, idx,
+ VM_RADIX_BLACK);
if (m != NULL) {
if ((m->oflags & VPO_BUSY) != 0 ||
m->busy != 0) {
diff --git a/sys/vm/device_pager.c b/sys/vm/device_pager.c
index 809c32c..30aaac0 100644
--- a/sys/vm/device_pager.c
+++ b/sys/vm/device_pager.c
@@ -54,6 +54,8 @@ __FBSDID("$FreeBSD$");
#include <vm/vm_phys.h>
#include <vm/uma.h>
+#include <machine/cpu.h>
+
static void dev_pager_init(void);
static vm_object_t dev_pager_alloc(void *, vm_ooffset_t, vm_prot_t,
vm_ooffset_t, struct ucred *);
@@ -299,7 +301,7 @@ old_dev_pager_fault(vm_object_t object, vm_ooffset_t offset, int prot,
struct file *fpop;
struct thread *td;
vm_memattr_t memattr;
- int ref, ret;
+ int i, ref, ret;
pidx = OFF_TO_IDX(offset);
memattr = object->memattr;
@@ -351,7 +353,10 @@ old_dev_pager_fault(vm_object_t object, vm_ooffset_t offset, int prot,
vm_page_free(*mres);
vm_page_unlock(*mres);
*mres = page;
- vm_page_insert(page, object, pidx);
+ while (vm_page_insert(page, object, pidx) != 0) {
+ for (i = 0; i < 10000000; i++)
+ cpu_spinwait();
+ }
}
page->valid = VM_PAGE_BITS_ALL;
return (VM_PAGER_OK);
diff --git a/sys/vm/sg_pager.c b/sys/vm/sg_pager.c
index 097039e..92c1857 100644
--- a/sys/vm/sg_pager.c
+++ b/sys/vm/sg_pager.c
@@ -181,6 +181,10 @@ sg_pager_getpages(vm_object_t object, vm_page_t *m, int count, int reqpage)
/* Construct a new fake page. */
page = vm_page_getfake(paddr, memattr);
VM_OBJECT_LOCK(object);
+ if (vm_page_insert(page, object, offset) != 0) {
+ vm_page_putfake(page);
+ return (VM_PAGER_FAIL);
+ }
TAILQ_INSERT_TAIL(&object->un_pager.sgp.sgp_pglist, page, pageq);
/* Free the original pages and insert this fake page into the object. */
@@ -189,7 +193,6 @@ sg_pager_getpages(vm_object_t object, vm_page_t *m, int count, int reqpage)
vm_page_free(m[i]);
vm_page_unlock(m[i]);
}
- vm_page_insert(page, object, offset);
m[reqpage] = page;
page->valid = VM_PAGE_BITS_ALL;
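
A pattern shared by the subr_uio.c, device_pager.c, and sg_pager.c hunks: because the radix tree may have to allocate interior nodes, vm_page_insert() now reports failure, so callers attempt the insert before publishing the page and undo their own state when it fails (vm_pgmoveco(), for example, restores the saved valid bits). A minimal sketch of that try-insert-then-rollback calling pattern, with invented helper names:

#include <errno.h>
#include <stdio.h>

struct page_sketch {
	int	valid;
	int	index;
};

/* Fails when the slot is taken; stands in for a node-allocation failure. */
static int
slot_insert(struct page_sketch **slots, int nslots, struct page_sketch *p,
    int index)
{
	if (index < 0 || index >= nslots || slots[index] != NULL)
		return (ENOMEM);
	slots[index] = p;
	p->index = index;
	return (0);
}

int
main(void)
{
	struct page_sketch *slots[2] = { NULL, NULL };
	struct page_sketch a = { 0, -1 }, b = { 0, -1 };
	int error, saved;

	/* Caller pattern: save state, attempt the insert, roll back on error. */
	saved = a.valid;
	a.valid = 1;
	if ((error = slot_insert(slots, 2, &a, 0)) != 0)
		a.valid = saved;
	printf("first insert: %d\n", error);

	saved = b.valid;
	b.valid = 1;
	if ((error = slot_insert(slots, 2, &b, 0)) != 0)
		b.valid = saved;		/* restore, nothing published */
	printf("second insert (slot taken): %d\n", error);
	return (0);
}
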
diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
index b79b3f5..9883dcf 100644
--- a/sys/vm/vm_fault.c
+++ b/sys/vm/vm_fault.c
@@ -741,9 +741,7 @@ vnode_locked:
* process'es object. The page is
* automatically made dirty.
*/
- vm_page_lock(fs.m);
vm_page_rename(fs.m, fs.first_object, fs.first_pindex);
- vm_page_unlock(fs.m);
vm_page_busy(fs.m);
fs.first_m = fs.m;
fs.m = NULL;
diff --git a/sys/vm/vm_init.c b/sys/vm/vm_init.c
index c507691..9761b21 100644
--- a/sys/vm/vm_init.c
+++ b/sys/vm/vm_init.c
@@ -82,6 +82,7 @@ __FBSDID("$FreeBSD$");
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
+#include <vm/vm_radix.h>
#include <vm/vm_map.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>
@@ -123,6 +124,7 @@ vm_mem_init(dummy)
vm_object_init();
vm_map_startup();
kmem_init(virtual_avail, virtual_end);
+ vm_radix_init();
pmap_init();
vm_pager_init();
}
diff --git a/sys/vm/vm_mmap.c b/sys/vm/vm_mmap.c
index 05bb8ae..4ae1f90 100644
--- a/sys/vm/vm_mmap.c
+++ b/sys/vm/vm_mmap.c
@@ -82,6 +82,7 @@ __FBSDID("$FreeBSD$");
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>
+#include <vm/vm_radix.h>
#include <vm/vnode_pager.h>
#ifdef HWPMC_HOOKS
@@ -912,10 +913,15 @@ RestartScan:
object->type == OBJT_VNODE) {
pindex = OFF_TO_IDX(current->offset +
(addr - current->start));
- m = vm_page_lookup(object, pindex);
- if (m == NULL &&
- vm_page_is_cached(object, pindex))
+ m = vm_radix_lookup(&object->rtree,
+ pindex, VM_RADIX_ANY);
+
+ /* Lock just for consistency. */
+ mtx_lock(&vm_page_queue_free_mtx);
+ if (m != NULL &&
+ (m->flags & PG_CACHED) != 0)
mincoreinfo = MINCORE_INCORE;
+ mtx_unlock(&vm_page_queue_free_mtx);
if (m != NULL && m->valid == 0)
m = NULL;
if (m != NULL)
diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c
index 32b0779..570befc 100644
--- a/sys/vm/vm_object.c
+++ b/sys/vm/vm_object.c
@@ -164,6 +164,9 @@ vm_object_zdtor(void *mem, int size, void *arg)
vm_object_t object;
object = (vm_object_t)mem;
+ KASSERT(object->resident_page_count == 0,
+ ("object %p resident_page_count = %d",
+ object, object->resident_page_count));
KASSERT(TAILQ_EMPTY(&object->memq),
("object %p has resident pages",
object));
@@ -172,15 +175,12 @@ vm_object_zdtor(void *mem, int size, void *arg)
("object %p has reservations",
object));
#endif
- KASSERT(object->cache == NULL,
+ KASSERT(object->cached_page_count == 0,
("object %p has cached pages",
object));
KASSERT(object->paging_in_progress == 0,
("object %p paging_in_progress = %d",
object, object->paging_in_progress));
- KASSERT(object->resident_page_count == 0,
- ("object %p resident_page_count = %d",
- object, object->resident_page_count));
KASSERT(object->shadow_count == 0,
("object %p shadow_count = %d",
object, object->shadow_count));
@@ -210,7 +210,7 @@ _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object)
TAILQ_INIT(&object->memq);
LIST_INIT(&object->shadow_head);
- object->root = NULL;
+ object->rtree.rt_root = 0;
object->type = type;
switch (type) {
case OBJT_DEAD:
@@ -248,7 +248,6 @@ _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object)
#if VM_NRESERVLEVEL > 0
LIST_INIT(&object->rvq);
#endif
- object->cache = NULL;
mtx_lock(&vm_object_list_mtx);
TAILQ_INSERT_TAIL(&vm_object_list, object, object_list);
@@ -326,7 +325,7 @@ vm_object_set_memattr(vm_object_t object, vm_memattr_t memattr)
case OBJT_SG:
case OBJT_SWAP:
case OBJT_VNODE:
- if (!TAILQ_EMPTY(&object->memq))
+ if (object->resident_page_count == 0)
return (KERN_FAILURE);
break;
case OBJT_DEAD:
@@ -673,7 +672,11 @@ vm_object_destroy(vm_object_t object)
void
vm_object_terminate(vm_object_t object)
{
- vm_page_t p, p_next;
+ vm_page_t pa[VM_RADIX_STACK];
+ vm_page_t p;
+ vm_pindex_t start;
+ u_int exhausted;
+ int n, i;
VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
@@ -718,43 +721,78 @@ vm_object_terminate(vm_object_t object)
* from the object. Rather than incrementally removing each page from
* the object, the page and object are reset to any empty state.
*/
- TAILQ_FOREACH_SAFE(p, &object->memq, listq, p_next) {
- KASSERT(!p->busy && (p->oflags & VPO_BUSY) == 0,
- ("vm_object_terminate: freeing busy page %p", p));
- vm_page_lock(p);
- /*
- * Optimize the page's removal from the object by resetting
- * its "object" field. Specifically, if the page is not
- * wired, then the effect of this assignment is that
- * vm_page_free()'s call to vm_page_remove() will return
- * immediately without modifying the page or the object.
- */
- p->object = NULL;
- if (p->wire_count == 0) {
- vm_page_free(p);
- PCPU_INC(cnt.v_pfree);
+ start = 0;
+ exhausted = 0;
+ while (exhausted == 0 && (n = vm_radix_lookupn(&object->rtree, start,
+ 0, VM_RADIX_ANY, (void **)pa, VM_RADIX_STACK, &start,
+ &exhausted)) != 0) {
+ for (i = 0; i < n; i++) {
+ p = pa[i];
+ /*
+ * Another thread may allocate this cached page from
+ * the queue before we acquire the page queue free
+ * mtx.
+ */
+ if (p->flags & PG_CACHED) {
+ mtx_lock(&vm_page_queue_free_mtx);
+ if (p->object == object) {
+ p->object = NULL;
+ p->valid = 0;
+ /* Clear PG_CACHED and set PG_FREE. */
+ p->flags ^= PG_CACHED | PG_FREE;
+ cnt.v_cache_count--;
+ cnt.v_free_count++;
+ }
+ mtx_unlock(&vm_page_queue_free_mtx);
+ continue;
+ } else if (p->object != object)
+ continue;
+ KASSERT(!p->busy && (p->oflags & VPO_BUSY) == 0,
+ ("vm_object_terminate: freeing busy page %p", p));
+ vm_page_lock(p);
+ /*
+ * Optimize the page's removal from the object by
+ * resetting its "object" field. Specifically, if
+ * the page is not wired, then the effect of this
+ * assignment is that vm_page_free()'s call to
+ * vm_page_remove() will return immediately without
+ * modifying the page or the object.
+ * Anyway, the radix tree cannot be accessed anymore
+ * from within the object, thus all the nodes need
+ * to be reclaimed later on.
+ */
+ p->object = NULL;
+ if (p->wire_count == 0) {
+ vm_page_free(p);
+ PCPU_INC(cnt.v_pfree);
+ }
+ vm_page_unlock(p);
}
- vm_page_unlock(p);
+ if (n < VM_RADIX_STACK)
+ break;
}
+ vm_radix_reclaim_allnodes(&object->rtree);
/*
* If the object contained any pages, then reset it to an empty state.
* None of the object's fields, including "resident_page_count", were
* modified by the preceding loop.
*/
if (object->resident_page_count != 0) {
- object->root = NULL;
TAILQ_INIT(&object->memq);
object->resident_page_count = 0;
if (object->type == OBJT_VNODE)
vdrop(object->handle);
}
+ if (object->cached_page_count != 0) {
+ object->cached_page_count = 0;
+ if (object->type == OBJT_VNODE)
+ vdrop(object->handle);
+ }
#if VM_NRESERVLEVEL > 0
if (__predict_false(!LIST_EMPTY(&object->rvq)))
vm_reserv_break_all(object);
#endif
- if (__predict_false(object->cache != NULL))
- vm_page_cache_free(object, 0, 0);
/*
* Let the pager know object is dead.
@@ -1266,10 +1304,13 @@ vm_object_shadow(
void
vm_object_split(vm_map_entry_t entry)
{
- vm_page_t m, m_next;
+ vm_page_t ma[VM_RADIX_STACK];
+ vm_page_t m;
vm_object_t orig_object, new_object, source;
- vm_pindex_t idx, offidxstart;
+ vm_pindex_t idx, offidxstart, start;
vm_size_t size;
+ u_int exhausted;
+ int i, n;
orig_object = entry->object.vm_object;
if (orig_object->type != OBJT_DEFAULT && orig_object->type != OBJT_SWAP)
@@ -1322,46 +1363,66 @@ vm_object_split(vm_map_entry_t entry)
("orig_object->charge < 0"));
orig_object->charge -= ptoa(size);
}
+ start = offidxstart;
retry:
- m = vm_page_find_least(orig_object, offidxstart);
- for (; m != NULL && (idx = m->pindex - offidxstart) < size;
- m = m_next) {
- m_next = TAILQ_NEXT(m, listq);
-
- /*
- * We must wait for pending I/O to complete before we can
- * rename the page.
- *
- * We do not have to VM_PROT_NONE the page as mappings should
- * not be changed by this operation.
- */
- if ((m->oflags & VPO_BUSY) || m->busy) {
- VM_OBJECT_UNLOCK(new_object);
- m->oflags |= VPO_WANTED;
- msleep(m, VM_OBJECT_MTX(orig_object), PVM, "spltwt", 0);
- VM_OBJECT_LOCK(new_object);
- goto retry;
- }
+ exhausted = 0;
+ while (exhausted == 0 && (n = vm_radix_lookupn(&orig_object->rtree,
+ start, offidxstart + size, VM_RADIX_ANY, (void **)ma,
+ VM_RADIX_STACK, &start, &exhausted)) != 0) {
+ for (i = 0; i < n; i++) {
+ m = ma[i];
+ idx = m->pindex - offidxstart;
+ if (m->flags & PG_CACHED) {
+ mtx_lock(&vm_page_queue_free_mtx);
+ if (m->object == orig_object)
+ vm_page_cache_rename(m, new_object,
+ idx);
+ mtx_unlock(&vm_page_queue_free_mtx);
+ continue;
+ } else if (m->object != orig_object)
+ continue;
+ /*
+ * We must wait for pending I/O to complete before
+ * we can rename the page.
+ *
+ * We do not have to VM_PROT_NONE the page as mappings
+ * should not be changed by this operation.
+ */
+ if ((m->oflags & VPO_BUSY) || m->busy) {
+ start = m->pindex;
+ VM_OBJECT_UNLOCK(new_object);
+ m->oflags |= VPO_WANTED;
+ msleep(m, VM_OBJECT_MTX(orig_object), PVM,
+ "spltwt", 0);
+ VM_OBJECT_LOCK(new_object);
+ goto retry;
+ }
#if VM_NRESERVLEVEL > 0
- /*
- * If some of the reservation's allocated pages remain with
- * the original object, then transferring the reservation to
- * the new object is neither particularly beneficial nor
- * particularly harmful as compared to leaving the reservation
- * with the original object. If, however, all of the
- * reservation's allocated pages are transferred to the new
- * object, then transferring the reservation is typically
- * beneficial. Determining which of these two cases applies
- * would be more costly than unconditionally renaming the
- * reservation.
- */
- vm_reserv_rename(m, new_object, orig_object, offidxstart);
+ /*
+ * If some of the reservation's allocated pages remain
+ * with the original object, then transferring the
+ * reservation to the new object is neither
+ * particularly beneficial nor particularly harmful as
+ * compared to leaving the reservation with the
+ * original object. If, however, all of the
+ * reservation's allocated pages are transferred to
+ * the new object, then transferring the reservation
+ * is typically beneficial. Determining which of
+ * these two cases applies would be more costly than
+ * unconditionally renaming the reservation.
+ */
+ vm_reserv_rename(m, new_object, orig_object,
+ offidxstart);
#endif
- vm_page_lock(m);
- vm_page_rename(m, new_object, idx);
- vm_page_unlock(m);
- /* page automatically made dirty by rename and cache handled */
- vm_page_busy(m);
+ vm_page_rename(m, new_object, idx);
+ /*
+ * page automatically made dirty by rename and
+ * cache handled
+ */
+ vm_page_busy(m);
+ }
+ if (n < VM_RADIX_STACK)
+ break;
}
if (orig_object->type == OBJT_SWAP) {
/*
@@ -1369,19 +1430,6 @@ retry:
* and new_object's locks are released and reacquired.
*/
swap_pager_copy(orig_object, new_object, offidxstart, 0);
-
- /*
- * Transfer any cached pages from orig_object to new_object.
- * If swap_pager_copy() found swapped out pages within the
- * specified range of orig_object, then it changed
- * new_object's type to OBJT_SWAP when it transferred those
- * pages to new_object. Otherwise, new_object's type
- * should still be OBJT_DEFAULT and orig_object should not
- * contain any cached pages within the specified range.
- */
- if (__predict_false(orig_object->cache != NULL))
- vm_page_cache_transfer(orig_object, offidxstart,
- new_object);
}
VM_OBJECT_UNLOCK(orig_object);
TAILQ_FOREACH(m, &new_object->memq, listq)
@@ -1400,10 +1448,14 @@ retry:
static int
vm_object_backing_scan(vm_object_t object, int op)
{
- int r = 1;
+ vm_page_t pa[VM_RADIX_STACK];
vm_page_t p;
vm_object_t backing_object;
- vm_pindex_t backing_offset_index;
+ vm_pindex_t backing_offset_index, new_pindex;
+ vm_pindex_t start;
+ u_int exhausted;
+ int color, i, n;
+ int r = 1;
VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
VM_OBJECT_LOCK_ASSERT(object->backing_object, MA_OWNED);
@@ -1431,15 +1483,41 @@ vm_object_backing_scan(vm_object_t object, int op)
if (op & OBSC_COLLAPSE_WAIT) {
vm_object_set_flag(backing_object, OBJ_DEAD);
}
-
+ color = VM_RADIX_BLACK;
+ if (op & OBSC_COLLAPSE_WAIT)
+ color |= VM_RADIX_RED;
/*
* Our scan
*/
- p = TAILQ_FIRST(&backing_object->memq);
- while (p) {
- vm_page_t next = TAILQ_NEXT(p, listq);
- vm_pindex_t new_pindex = p->pindex - backing_offset_index;
+restart:
+ start = 0;
+ i = n = VM_RADIX_STACK;
+ exhausted = 0;
+ for (;;) {
+ if (i == n) {
+ if (n < VM_RADIX_STACK)
+ break;
+ if (exhausted != 0 ||
+ (n = vm_radix_lookupn(&backing_object->rtree,
+ start, 0, color, (void **)pa, VM_RADIX_STACK,
+ &start, &exhausted)) == 0)
+ break;
+ i = 0;
+ }
+ p = pa[i++];
+ /*
+ * Free cached pages. XXX Why? Emulating old behavior here.
+ */
+ if (p->flags & PG_CACHED) {
+ mtx_lock(&vm_page_queue_free_mtx);
+ if (p->object == backing_object)
+ vm_page_cache_free(p);
+ mtx_unlock(&vm_page_queue_free_mtx);
+ continue;
+ } else if (p->object != backing_object)
+ continue;
+ new_pindex = p->pindex - backing_offset_index;
if (op & OBSC_TEST_ALL_SHADOWED) {
vm_page_t pp;
@@ -1451,13 +1529,9 @@ vm_object_backing_scan(vm_object_t object, int op)
* note that we do not busy the backing object's
* page.
*/
- if (
- p->pindex < backing_offset_index ||
- new_pindex >= object->size
- ) {
- p = next;
+ if (p->pindex < backing_offset_index ||
+ new_pindex >= object->size)
continue;
- }
/*
* See if the parent has the page or if the parent's
@@ -1486,12 +1560,9 @@ vm_object_backing_scan(vm_object_t object, int op)
vm_page_t pp;
if (op & OBSC_COLLAPSE_NOWAIT) {
- if ((p->oflags & VPO_BUSY) ||
- !p->valid ||
- p->busy) {
- p = next;
+ if ((p->oflags & VPO_BUSY) || !p->valid ||
+ p->busy)
continue;
- }
} else if (op & OBSC_COLLAPSE_WAIT) {
if ((p->oflags & VPO_BUSY) || p->busy) {
VM_OBJECT_UNLOCK(object);
@@ -1507,8 +1578,7 @@ vm_object_backing_scan(vm_object_t object, int op)
* should not have changed so we
* just restart our scan.
*/
- p = TAILQ_FIRST(&backing_object->memq);
- continue;
+ goto restart;
}
}
@@ -1544,7 +1614,6 @@ vm_object_backing_scan(vm_object_t object, int op)
else
vm_page_remove(p);
vm_page_unlock(p);
- p = next;
continue;
}
@@ -1564,7 +1633,6 @@ vm_object_backing_scan(vm_object_t object, int op)
* page before we can (re)lock the parent.
* Hence we can get here.
*/
- p = next;
continue;
}
if (
@@ -1586,7 +1654,6 @@ vm_object_backing_scan(vm_object_t object, int op)
else
vm_page_remove(p);
vm_page_unlock(p);
- p = next;
continue;
}
@@ -1605,12 +1672,9 @@ vm_object_backing_scan(vm_object_t object, int op)
* If the page was mapped to a process, it can remain
* mapped through the rename.
*/
- vm_page_lock(p);
vm_page_rename(p, object, new_pindex);
- vm_page_unlock(p);
/* page automatically made dirty by rename */
}
- p = next;
}
return (r);
}
@@ -1724,12 +1788,6 @@ vm_object_collapse(vm_object_t object)
backing_object,
object,
OFF_TO_IDX(object->backing_object_offset), TRUE);
-
- /*
- * Free any cached pages from backing_object.
- */
- if (__predict_false(backing_object->cache != NULL))
- vm_page_cache_free(backing_object, 0, 0);
}
/*
* Object now shadows whatever backing_object did.
@@ -1850,80 +1908,112 @@ void
vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
int options)
{
- vm_page_t p, next;
+ struct vnode *vp;
+ vm_page_t pa[VM_RADIX_STACK];
+ vm_page_t p;
+ u_int exhausted;
+ int i, n;
int wirings;
VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
KASSERT((object->flags & OBJ_UNMANAGED) == 0 ||
(options & (OBJPR_CLEANONLY | OBJPR_NOTMAPPED)) == OBJPR_NOTMAPPED,
("vm_object_page_remove: illegal options for object %p", object));
- if (object->resident_page_count == 0)
- goto skipmemq;
+ if (object->resident_page_count == 0 && object->cached_page_count == 0)
+ return;
+ vp = NULL;
vm_object_pip_add(object, 1);
-again:
- p = vm_page_find_least(object, start);
-
- /*
- * Here, the variable "p" is either (1) the page with the least pindex
- * greater than or equal to the parameter "start" or (2) NULL.
- */
- for (; p != NULL && (p->pindex < end || end == 0); p = next) {
- next = TAILQ_NEXT(p, listq);
-
- /*
- * If the page is wired for any reason besides the existence
- * of managed, wired mappings, then it cannot be freed. For
- * example, fictitious pages, which represent device memory,
- * are inherently wired and cannot be freed. They can,
- * however, be invalidated if the option OBJPR_CLEANONLY is
- * not specified.
- */
- vm_page_lock(p);
- if ((wirings = p->wire_count) != 0 &&
- (wirings = pmap_page_wired_mappings(p)) != p->wire_count) {
+restart:
+ exhausted = 0;
+ while (exhausted == 0 && (n = vm_radix_lookupn(&object->rtree, start,
+ end, VM_RADIX_ANY, (void **)pa, VM_RADIX_STACK, &start,
+ &exhausted)) != 0) {
+ for (i = 0; i < n; i++) {
+ p = pa[i];
+ /*
+ * Another thread may allocate this cached page from
+ * the queue before we acquire the page queue free
+ * mtx.
+ */
+ if (p->flags & PG_CACHED) {
+ mtx_lock(&vm_page_queue_free_mtx);
+ if (p->object == object) {
+ vm_page_cache_free(p);
+ if (object->type == OBJT_VNODE &&
+ object->cached_page_count == 0)
+ vp = object->handle;
+ }
+ mtx_unlock(&vm_page_queue_free_mtx);
+ continue;
+ } else if (p->object != object)
+ continue;
+ /*
+ * If the page is wired for any reason besides
+ * the existence of managed, wired mappings, then
+ * it cannot be freed. For example, fictitious
+ * pages, which represent device memory, are
+ * inherently wired and cannot be freed. They can,
+ * however, be invalidated if the option
+ * OBJPR_CLEANONLY is not specified.
+ */
+ vm_page_lock(p);
+ if ((wirings = p->wire_count) != 0 &&
+ (wirings = pmap_page_wired_mappings(p)) !=
+ p->wire_count) {
+ if ((options & OBJPR_NOTMAPPED) == 0) {
+ pmap_remove_all(p);
+ /*
+ * Account for removal of wired
+ * mappings.
+ */
+ if (wirings != 0)
+ p->wire_count -= wirings;
+ }
+ if ((options & OBJPR_CLEANONLY) == 0) {
+ p->valid = 0;
+ vm_page_undirty(p);
+ }
+ vm_page_unlock(p);
+ continue;
+ }
+ if (vm_page_sleep_if_busy(p, TRUE, "vmopar")) {
+ start = p->pindex;
+ goto restart;
+ }
+ KASSERT((p->flags & PG_FICTITIOUS) == 0,
+ ("vm_object_page_remove: page %p is fictitious",
+ p));
+ if ((options & OBJPR_CLEANONLY) != 0 && p->valid != 0) {
+ if ((options & OBJPR_NOTMAPPED) == 0)
+ pmap_remove_write(p);
+ if (p->dirty) {
+ vm_page_unlock(p);
+ continue;
+ }
+ }
if ((options & OBJPR_NOTMAPPED) == 0) {
pmap_remove_all(p);
/* Account for removal of wired mappings. */
+ if (wirings != 0) {
+ KASSERT(p->wire_count == wirings,
+ ("inconsistent wire count %d %d %p",
+ p->wire_count, wirings, p));
+ p->wire_count = 0;
+ atomic_subtract_int(&cnt.v_wire_count,
+ 1);
+ }
if (wirings != 0)
p->wire_count -= wirings;
}
- if ((options & OBJPR_CLEANONLY) == 0) {
- p->valid = 0;
- vm_page_undirty(p);
- }
+ vm_page_free(p);
vm_page_unlock(p);
- continue;
- }
- if (vm_page_sleep_if_busy(p, TRUE, "vmopar"))
- goto again;
- KASSERT((p->flags & PG_FICTITIOUS) == 0,
- ("vm_object_page_remove: page %p is fictitious", p));
- if ((options & OBJPR_CLEANONLY) != 0 && p->valid != 0) {
- if ((options & OBJPR_NOTMAPPED) == 0)
- pmap_remove_write(p);
- if (p->dirty) {
- vm_page_unlock(p);
- continue;
- }
}
- if ((options & OBJPR_NOTMAPPED) == 0) {
- pmap_remove_all(p);
- /* Account for removal of wired mappings. */
- if (wirings != 0) {
- KASSERT(p->wire_count == wirings,
- ("inconsistent wire count %d %d %p",
- p->wire_count, wirings, p));
- p->wire_count = 0;
- atomic_subtract_int(&cnt.v_wire_count, 1);
- }
- }
- vm_page_free(p);
- vm_page_unlock(p);
+ if (n < VM_RADIX_STACK)
+ break;
}
vm_object_pip_wakeup(object);
-skipmemq:
- if (__predict_false(object->cache != NULL))
- vm_page_cache_free(object, start, end);
+ if (vp)
+ vdrop(vp);
}
/*
@@ -2301,8 +2391,9 @@ DB_SHOW_COMMAND(object, vm_object_print_static)
db_printf(",");
count++;
- db_printf("(off=0x%jx,page=0x%jx)",
- (uintmax_t)p->pindex, (uintmax_t)VM_PAGE_TO_PHYS(p));
+ db_printf("(off=0x%jx,page=0x%jx,obj=%p,flags=0x%X)",
+ (uintmax_t)p->pindex, (uintmax_t)VM_PAGE_TO_PHYS(p),
+ p->object, p->flags);
}
if (count != 0)
db_printf("\n");
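
Aside: the rewritten vm_object_page_remove() above no longer steps through the object's memq directly; it drains the radix index in fixed-size batches of VM_RADIX_STACK pages, resumes from the index returned through "start", and jumps back to "restart" whenever it must sleep on a busy page. The following self-contained sketch is not part of the patch; it uses hypothetical names and a sorted array standing in for the tree, and illustrates only the batch-and-resume pattern, not the kernel API.

#include <stdio.h>
#include <stddef.h>

#define	BATCH	8		/* plays the role of VM_RADIX_STACK */

/* Stand-in for the radix index: a sorted set of page indexes. */
static const unsigned long keys[] = { 3, 4, 9, 17, 18, 19, 40, 41, 42, 77 };
static const size_t nkeys = sizeof(keys) / sizeof(keys[0]);

/*
 * Copy up to cnt keys from [start, end) into out and report, through
 * *next, where a later call should resume.
 */
static int
lookupn(unsigned long start, unsigned long end, unsigned long *out, int cnt,
    unsigned long *next)
{
	size_t i;
	int n;

	n = 0;
	for (i = 0; i < nkeys && n < cnt; i++) {
		if (keys[i] < start || keys[i] >= end)
			continue;
		out[n++] = keys[i];
		*next = keys[i] + 1;
	}
	return (n);
}

int
main(void)
{
	unsigned long buf[BATCH], start = 0;
	int i, n;

	/* Passing &start as "next" makes the loop self-advancing. */
	while ((n = lookupn(start, 100, buf, BATCH, &start)) != 0) {
		for (i = 0; i < n; i++)
			printf("visit %lu\n", buf[i]);
		if (n < BATCH)	/* short batch: the range is exhausted */
			break;
	}
	return (0);
}
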
diff --git a/sys/vm/vm_object.h b/sys/vm/vm_object.h
index 8134752..7beee59 100644
--- a/sys/vm/vm_object.h
+++ b/sys/vm/vm_object.h
@@ -71,6 +71,8 @@
#include <sys/_lock.h>
#include <sys/_mutex.h>
+#include <vm/vm_radix.h>
+
/*
* Types defined:
*
@@ -100,7 +102,7 @@ struct vm_object {
LIST_HEAD(, vm_object) shadow_head; /* objects that this is a shadow for */
LIST_ENTRY(vm_object) shadow_list; /* chain of shadow objects */
TAILQ_HEAD(, vm_page) memq; /* list of resident pages */
- vm_page_t root; /* root of the resident page splay tree */
+ struct vm_radix rtree; /* root of the resident page radix index tree */
vm_pindex_t size; /* Object size */
int generation; /* generation ID */
int ref_count; /* How many refs?? */
@@ -111,11 +113,11 @@ struct vm_object {
u_short pg_color; /* (c) color of first page in obj */
u_int paging_in_progress; /* Paging (in or out) so don't collapse or destroy */
int resident_page_count; /* number of resident pages */
+ int cached_page_count; /* number of cached pages */
struct vm_object *backing_object; /* object that I'm a shadow of */
vm_ooffset_t backing_object_offset;/* Offset in backing object */
TAILQ_ENTRY(vm_object) pager_object_list; /* list of all objects of this pager type */
LIST_HEAD(, vm_reserv) rvq; /* list of reservations */
- vm_page_t cache; /* (o + f) root of the cache page splay tree */
void *handle;
union {
/*
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index c952c05..fb49a6d 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -109,11 +109,13 @@ __FBSDID("$FreeBSD$");
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
+#include <vm/vm_radix.h>
#include <vm/vm_reserv.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>
#include <vm/uma_int.h>
+#include <machine/cpu.h>
#include <machine/md_var.h>
/*
@@ -316,7 +318,6 @@ vm_page_startup(vm_offset_t vaddr)
mtx_init(&pa_lock[i], "vm page", NULL, MTX_DEF);
for (i = 0; i < PQ_COUNT; i++)
vm_pagequeue_init_lock(&vm_pagequeues[i]);
-
/*
* Allocate memory for use when boot strapping the kernel memory
* allocator.
@@ -793,63 +794,6 @@ vm_page_dirty_KBI(vm_page_t m)
}
/*
- * vm_page_splay:
- *
- * Implements Sleator and Tarjan's top-down splay algorithm. Returns
- * the vm_page containing the given pindex. If, however, that
- * pindex is not found in the vm_object, returns a vm_page that is
- * adjacent to the pindex, coming before or after it.
- */
-vm_page_t
-vm_page_splay(vm_pindex_t pindex, vm_page_t root)
-{
- struct vm_page dummy;
- vm_page_t lefttreemax, righttreemin, y;
-
- if (root == NULL)
- return (root);
- lefttreemax = righttreemin = &dummy;
- for (;; root = y) {
- if (pindex < root->pindex) {
- if ((y = root->left) == NULL)
- break;
- if (pindex < y->pindex) {
- /* Rotate right. */
- root->left = y->right;
- y->right = root;
- root = y;
- if ((y = root->left) == NULL)
- break;
- }
- /* Link into the new root's right tree. */
- righttreemin->left = root;
- righttreemin = root;
- } else if (pindex > root->pindex) {
- if ((y = root->right) == NULL)
- break;
- if (pindex > y->pindex) {
- /* Rotate left. */
- root->right = y->left;
- y->left = root;
- root = y;
- if ((y = root->right) == NULL)
- break;
- }
- /* Link into the new root's left tree. */
- lefttreemax->right = root;
- lefttreemax = root;
- } else
- break;
- }
- /* Assemble the new root. */
- lefttreemax->right = root->left;
- righttreemin->left = root->right;
- root->left = dummy.right;
- root->right = dummy.left;
- return (root);
-}
-
-/*
* vm_page_insert: [ internal use only ]
*
* Inserts the given mem entry into the object and object list.
@@ -861,14 +805,15 @@ vm_page_splay(vm_pindex_t pindex, vm_page_t root)
*
* The object must be locked.
*/
-void
+int
vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
{
- vm_page_t root;
-
+ vm_page_t neighbor;
+ vm_pindex_t cpindex;
VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
if (m->object != NULL)
panic("vm_page_insert: page already inserted");
+ cpindex = m->pindex;
/*
* Record the object/offset pair in this page
@@ -876,31 +821,26 @@ vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
m->object = object;
m->pindex = pindex;
- /*
- * Now link into the object's ordered list of backed pages.
- */
- root = object->root;
- if (root == NULL) {
- m->left = NULL;
- m->right = NULL;
+ if (object->resident_page_count == 0) {
TAILQ_INSERT_TAIL(&object->memq, m, listq);
- } else {
- root = vm_page_splay(pindex, root);
- if (pindex < root->pindex) {
- m->left = root->left;
- m->right = root;
- root->left = NULL;
- TAILQ_INSERT_BEFORE(root, m, listq);
- } else if (pindex == root->pindex)
- panic("vm_page_insert: offset already allocated");
- else {
- m->right = root->right;
- m->left = root;
- root->right = NULL;
- TAILQ_INSERT_AFTER(&object->memq, root, m, listq);
- }
+ } else {
+ neighbor = vm_radix_lookup_ge(&object->rtree, pindex,
+ VM_RADIX_BLACK);
+ if (neighbor != NULL) {
+ KASSERT(pindex < neighbor->pindex,
+			    ("vm_page_insert: offset %ju not less than %ju",
+ (uintmax_t)pindex, (uintmax_t)neighbor->pindex));
+ TAILQ_INSERT_BEFORE(neighbor, m, listq);
+ } else
+ TAILQ_INSERT_TAIL(&object->memq, m, listq);
+ }
+
+ if (vm_radix_insert(&object->rtree, pindex, m) != 0) {
+ TAILQ_REMOVE(&object->memq, m, listq);
+ m->object = NULL;
+ m->pindex = cpindex;
+ return (ENOMEM);
}
- object->root = m;
/*
* Show that the object has one more resident page.
@@ -919,6 +859,7 @@ vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
*/
if (pmap_page_is_write_mapped(m))
vm_object_set_writeable_dirty(object);
+ return (0);
}
/*
@@ -936,7 +877,6 @@ void
vm_page_remove(vm_page_t m)
{
vm_object_t object;
- vm_page_t next, prev, root;
if ((m->oflags & VPO_UNMANAGED) == 0)
vm_page_lock_assert(m, MA_OWNED);
@@ -948,45 +888,7 @@ vm_page_remove(vm_page_t m)
vm_page_flash(m);
}
- /*
- * Now remove from the object's list of backed pages.
- */
- if ((next = TAILQ_NEXT(m, listq)) != NULL && next->left == m) {
- /*
- * Since the page's successor in the list is also its parent
- * in the tree, its right subtree must be empty.
- */
- next->left = m->left;
- KASSERT(m->right == NULL,
- ("vm_page_remove: page %p has right child", m));
- } else if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL &&
- prev->right == m) {
- /*
- * Since the page's predecessor in the list is also its parent
- * in the tree, its left subtree must be empty.
- */
- KASSERT(m->left == NULL,
- ("vm_page_remove: page %p has left child", m));
- prev->right = m->right;
- } else {
- if (m != object->root)
- vm_page_splay(m->pindex, object->root);
- if (m->left == NULL)
- root = m->right;
- else if (m->right == NULL)
- root = m->left;
- else {
- /*
- * Move the page's successor to the root, because
- * pages are usually removed in ascending order.
- */
- if (m->right != next)
- vm_page_splay(m->pindex, m->right);
- next->left = m->left;
- root = next;
- }
- object->root = root;
- }
+ vm_radix_remove(&object->rtree, m->pindex, VM_RADIX_BLACK);
TAILQ_REMOVE(&object->memq, m, listq);
/*
@@ -1014,15 +916,10 @@ vm_page_remove(vm_page_t m)
vm_page_t
vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
{
- vm_page_t m;
VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
- if ((m = object->root) != NULL && m->pindex != pindex) {
- m = vm_page_splay(pindex, m);
- if ((object->root = m)->pindex != pindex)
- m = NULL;
- }
- return (m);
+
+ return vm_radix_lookup(&object->rtree, pindex, VM_RADIX_BLACK);
}
/*
@@ -1036,17 +933,12 @@ vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
vm_page_t
vm_page_find_least(vm_object_t object, vm_pindex_t pindex)
{
- vm_page_t m;
VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
- if ((m = TAILQ_FIRST(&object->memq)) != NULL) {
- if (m->pindex < pindex) {
- m = vm_page_splay(pindex, object->root);
- if ((object->root = m)->pindex < pindex)
- m = TAILQ_NEXT(m, listq);
- }
- }
- return (m);
+ if (object->resident_page_count)
+ return vm_radix_lookup_ge(&object->rtree, pindex,
+ VM_RADIX_BLACK);
+ return (NULL);
}
/*
@@ -1108,75 +1000,21 @@ vm_page_prev(vm_page_t m)
void
vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
{
+ u_int i;
- vm_page_remove(m);
- vm_page_insert(m, new_object, new_pindex);
- vm_page_dirty(m);
-}
-
-/*
- * Convert all of the given object's cached pages that have a
- * pindex within the given range into free pages. If the value
- * zero is given for "end", then the range's upper bound is
- * infinity. If the given object is backed by a vnode and it
- * transitions from having one or more cached pages to none, the
- * vnode's hold count is reduced.
- */
-void
-vm_page_cache_free(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
-{
- vm_page_t m, m_next;
- boolean_t empty;
-
- mtx_lock(&vm_page_queue_free_mtx);
- if (__predict_false(object->cache == NULL)) {
- mtx_unlock(&vm_page_queue_free_mtx);
- return;
- }
- m = object->cache = vm_page_splay(start, object->cache);
- if (m->pindex < start) {
- if (m->right == NULL)
- m = NULL;
- else {
- m_next = vm_page_splay(start, m->right);
- m_next->left = m;
- m->right = NULL;
- m = object->cache = m_next;
- }
- }
+ MPASS(m->object != NULL);
+ VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
+ VM_OBJECT_LOCK_ASSERT(new_object, MA_OWNED);
- /*
- * At this point, "m" is either (1) a reference to the page
- * with the least pindex that is greater than or equal to
- * "start" or (2) NULL.
- */
- for (; m != NULL && (m->pindex < end || end == 0); m = m_next) {
- /*
- * Find "m"'s successor and remove "m" from the
- * object's cache.
- */
- if (m->right == NULL) {
- object->cache = m->left;
- m_next = NULL;
- } else {
- m_next = vm_page_splay(start, m->right);
- m_next->left = m->left;
- object->cache = m_next;
- }
- /* Convert "m" to a free page. */
- m->object = NULL;
- m->valid = 0;
- /* Clear PG_CACHED and set PG_FREE. */
- m->flags ^= PG_CACHED | PG_FREE;
- KASSERT((m->flags & (PG_CACHED | PG_FREE)) == PG_FREE,
- ("vm_page_cache_free: page %p has inconsistent flags", m));
- cnt.v_cache_count--;
- cnt.v_free_count++;
+ vm_page_lock(m);
+ vm_page_remove(m);
+ vm_page_unlock(m);
+ while (vm_page_insert(m, new_object, new_pindex) != 0) {
+ pagedaemon_wakeup();
+ for (i = 0; i < 10000000; i++)
+ cpu_spinwait();
}
- empty = object->cache == NULL;
- mtx_unlock(&vm_page_queue_free_mtx);
- if (object->type == OBJT_VNODE && empty)
- vdrop(object->handle);
+ vm_page_dirty(m);
}
/*
@@ -1188,15 +1026,12 @@ vm_page_cache_free(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
static inline vm_page_t
vm_page_cache_lookup(vm_object_t object, vm_pindex_t pindex)
{
- vm_page_t m;
+ VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
- if ((m = object->cache) != NULL && m->pindex != pindex) {
- m = vm_page_splay(pindex, m);
- if ((object->cache = m)->pindex != pindex)
- m = NULL;
- }
- return (m);
+ if (object->cached_page_count != 0)
+ return vm_radix_lookup(&object->rtree, pindex, VM_RADIX_RED);
+ return (NULL);
}
/*
@@ -1208,131 +1043,77 @@ vm_page_cache_lookup(vm_object_t object, vm_pindex_t pindex)
static void
vm_page_cache_remove(vm_page_t m)
{
- vm_object_t object;
- vm_page_t root;
mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
KASSERT((m->flags & PG_CACHED) != 0,
("vm_page_cache_remove: page %p is not cached", m));
- object = m->object;
- if (m != object->cache) {
- root = vm_page_splay(m->pindex, object->cache);
- KASSERT(root == m,
- ("vm_page_cache_remove: page %p is not cached in object %p",
- m, object));
- }
- if (m->left == NULL)
- root = m->right;
- else if (m->right == NULL)
- root = m->left;
- else {
- root = vm_page_splay(m->pindex, m->left);
- root->right = m->right;
- }
- object->cache = root;
+ vm_radix_remove(&m->object->rtree, m->pindex, VM_RADIX_RED);
+ m->object->cached_page_count--;
m->object = NULL;
cnt.v_cache_count--;
}
/*
- * Transfer all of the cached pages with offset greater than or
- * equal to 'offidxstart' from the original object's cache to the
- * new object's cache. However, any cached pages with offset
- * greater than or equal to the new object's size are kept in the
- * original object. Initially, the new object's cache must be
- * empty. Offset 'offidxstart' in the original object must
- * correspond to offset zero in the new object.
- *
- * The new object must be locked.
+ * Move a given cached page from an object's cached pages to
+ * the free list.
+ *
+ * The free page queue mtx and object lock must be locked.
*/
void
-vm_page_cache_transfer(vm_object_t orig_object, vm_pindex_t offidxstart,
- vm_object_t new_object)
+vm_page_cache_free(vm_page_t m)
{
- vm_page_t m, m_next;
+ vm_object_t object;
+
+ object = m->object;
+ VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+ mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+ KASSERT((m->flags & PG_CACHED) != 0,
+ ("vm_page_cache_free: page %p is not cached", m));
/*
- * Insertion into an object's collection of cached pages
- * requires the object to be locked. In contrast, removal does
- * not.
+ * Replicate vm_page_cache_remove with a version that can collapse
+ * internal nodes since the object lock is held.
*/
- VM_OBJECT_LOCK_ASSERT(new_object, MA_OWNED);
- KASSERT(new_object->cache == NULL,
- ("vm_page_cache_transfer: object %p has cached pages",
- new_object));
- mtx_lock(&vm_page_queue_free_mtx);
- if ((m = orig_object->cache) != NULL) {
- /*
- * Transfer all of the pages with offset greater than or
- * equal to 'offidxstart' from the original object's
- * cache to the new object's cache.
- */
- m = vm_page_splay(offidxstart, m);
- if (m->pindex < offidxstart) {
- orig_object->cache = m;
- new_object->cache = m->right;
- m->right = NULL;
- } else {
- orig_object->cache = m->left;
- new_object->cache = m;
- m->left = NULL;
- }
- while ((m = new_object->cache) != NULL) {
- if ((m->pindex - offidxstart) >= new_object->size) {
- /*
- * Return all of the cached pages with
- * offset greater than or equal to the
- * new object's size to the original
- * object's cache.
- */
- new_object->cache = m->left;
- m->left = orig_object->cache;
- orig_object->cache = m;
- break;
- }
- m_next = vm_page_splay(m->pindex, m->right);
- /* Update the page's object and offset. */
- m->object = new_object;
- m->pindex -= offidxstart;
- if (m_next == NULL)
- break;
- m->right = NULL;
- m_next->left = m;
- new_object->cache = m_next;
- }
- KASSERT(new_object->cache == NULL ||
- new_object->type == OBJT_SWAP,
- ("vm_page_cache_transfer: object %p's type is incompatible"
- " with cached pages", new_object));
- }
- mtx_unlock(&vm_page_queue_free_mtx);
+ vm_radix_remove(&object->rtree, m->pindex, VM_RADIX_ANY);
+ object->cached_page_count--;
+ m->object = NULL;
+ m->valid = 0;
+ /* Clear PG_CACHED and set PG_FREE. */
+ m->flags ^= PG_CACHED | PG_FREE;
+ cnt.v_cache_count--;
+ cnt.v_free_count++;
}
/*
- * Returns TRUE if a cached page is associated with the given object and
- * offset, and FALSE otherwise.
- *
- * The object must be locked.
+ * Attempt to rename a cached page from one object to another. If
+ * it fails the cached page is freed.
*/
-boolean_t
-vm_page_is_cached(vm_object_t object, vm_pindex_t pindex)
+void
+vm_page_cache_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t idx)
{
- vm_page_t m;
+ vm_object_t orig_object;
+ orig_object = m->object;
+ VM_OBJECT_LOCK_ASSERT(orig_object, MA_OWNED);
+ VM_OBJECT_LOCK_ASSERT(new_object, MA_OWNED);
+ mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
/*
- * Insertion into an object's collection of cached pages requires the
- * object to be locked. Therefore, if the object is locked and the
- * object's collection is empty, there is no need to acquire the free
- * page queues lock in order to prove that the specified page doesn't
- * exist.
+ * If the insert fails we simply free the cached page.
*/
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
- if (__predict_true(object->cache == NULL))
- return (FALSE);
- mtx_lock(&vm_page_queue_free_mtx);
- m = vm_page_cache_lookup(object, pindex);
- mtx_unlock(&vm_page_queue_free_mtx);
- return (m != NULL);
+ if (vm_radix_insert(&new_object->rtree, idx, m) != 0) {
+ vm_page_cache_free(m);
+ return;
+ }
+ vm_radix_color(&new_object->rtree, idx, VM_RADIX_RED);
+ /*
+	 * Use the any-color removal even though the page is known to be
+	 * red, so that tree compaction still works.
+ */
+ vm_radix_remove(&orig_object->rtree, m->pindex, VM_RADIX_ANY);
+ m->object = new_object;
+ m->pindex = idx;
+ new_object->cached_page_count++;
+ orig_object->cached_page_count--;
}
/*
@@ -1465,7 +1246,8 @@ vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req)
m->valid = 0;
m_object = m->object;
vm_page_cache_remove(m);
- if (m_object->type == OBJT_VNODE && m_object->cache == NULL)
+ if (m_object->type == OBJT_VNODE &&
+ m_object->cached_page_count == 0)
vp = m_object->handle;
} else {
KASSERT(VM_PAGE_IS_FREE(m),
@@ -1509,7 +1291,19 @@ vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req)
if (object->memattr != VM_MEMATTR_DEFAULT &&
(object->flags & OBJ_FICTITIOUS) == 0)
pmap_page_set_memattr(m, object->memattr);
- vm_page_insert(m, object, pindex);
+ if (vm_page_insert(m, object, pindex) != 0) {
+
+ /* See the comment below about hold count handling. */
+ if (vp != NULL)
+ vdrop(vp);
+ vm_page_lock(m);
+ if (req & VM_ALLOC_WIRED)
+ vm_page_unwire(m, 0);
+ vm_page_free(m);
+ vm_page_unlock(m);
+ pagedaemon_wakeup();
+ return (NULL);
+ }
} else
m->pindex = pindex;
@@ -1576,6 +1370,7 @@ vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req,
{
struct vnode *drop;
vm_page_t deferred_vdrop_list, m, m_ret;
+ vm_pindex_t cpindex;
u_int flags, oflags;
int req_class;
@@ -1662,6 +1457,7 @@ retry:
memattr == VM_MEMATTR_DEFAULT)
memattr = object->memattr;
}
+ cpindex = pindex;
for (m = m_ret; m < &m_ret[npages]; m++) {
m->aflags = 0;
m->flags = (m->flags | PG_NODUMP) & flags;
@@ -1671,12 +1467,30 @@ retry:
m->oflags = oflags;
if (memattr != VM_MEMATTR_DEFAULT)
pmap_page_set_memattr(m, memattr);
- if (object != NULL)
- vm_page_insert(m, object, pindex);
- else
- m->pindex = pindex;
pindex++;
}
+ for (m = m_ret; m < &m_ret[npages]; m++) {
+ if (object != NULL) {
+ if (vm_page_insert(m, object, cpindex) != 0) {
+ while (deferred_vdrop_list != NULL) {
+ vdrop((struct vnode *)deferred_vdrop_list->pageq.tqe_prev);
+ deferred_vdrop_list =
+ deferred_vdrop_list->pageq.tqe_next;
+ }
+ for (m = m_ret; m < &m_ret[npages]; m++) {
+ vm_page_lock(m);
+ if (req & VM_ALLOC_WIRED)
+ vm_page_unwire(m, 0);
+ vm_page_free(m);
+ vm_page_unlock(m);
+ }
+ pagedaemon_wakeup();
+ return (NULL);
+ }
+ } else
+ m->pindex = cpindex;
+ cpindex++;
+ }
while (deferred_vdrop_list != NULL) {
vdrop((struct vnode *)deferred_vdrop_list->pageq.tqe_prev);
deferred_vdrop_list = deferred_vdrop_list->pageq.tqe_next;
@@ -1722,7 +1536,8 @@ vm_page_alloc_init(vm_page_t m)
m->valid = 0;
m_object = m->object;
vm_page_cache_remove(m);
- if (m_object->type == OBJT_VNODE && m_object->cache == NULL)
+ if (m_object->type == OBJT_VNODE &&
+ m_object->cached_page_count == 0)
drop = m_object->handle;
} else {
KASSERT(VM_PAGE_IS_FREE(m),
@@ -2321,7 +2136,7 @@ void
vm_page_cache(vm_page_t m)
{
vm_object_t object;
- vm_page_t next, prev, root;
+ int old_cached;
vm_page_lock_assert(m, MA_OWNED);
object = m->object;
@@ -2352,46 +2167,7 @@ vm_page_cache(vm_page_t m)
*/
vm_page_remque(m);
- /*
- * Remove the page from the object's collection of resident
- * pages.
- */
- if ((next = TAILQ_NEXT(m, listq)) != NULL && next->left == m) {
- /*
- * Since the page's successor in the list is also its parent
- * in the tree, its right subtree must be empty.
- */
- next->left = m->left;
- KASSERT(m->right == NULL,
- ("vm_page_cache: page %p has right child", m));
- } else if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL &&
- prev->right == m) {
- /*
- * Since the page's predecessor in the list is also its parent
- * in the tree, its left subtree must be empty.
- */
- KASSERT(m->left == NULL,
- ("vm_page_cache: page %p has left child", m));
- prev->right = m->right;
- } else {
- if (m != object->root)
- vm_page_splay(m->pindex, object->root);
- if (m->left == NULL)
- root = m->right;
- else if (m->right == NULL)
- root = m->left;
- else {
- /*
- * Move the page's successor to the root, because
- * pages are usually removed in ascending order.
- */
- if (m->right != next)
- vm_page_splay(m->pindex, m->right);
- next->left = m->left;
- root = next;
- }
- object->root = root;
- }
+ vm_radix_color(&object->rtree, m->pindex, VM_RADIX_RED);
TAILQ_REMOVE(&object->memq, m, listq);
object->resident_page_count--;
@@ -2408,26 +2184,9 @@ vm_page_cache(vm_page_t m)
m->flags &= ~PG_ZERO;
mtx_lock(&vm_page_queue_free_mtx);
m->flags |= PG_CACHED;
+ old_cached = object->cached_page_count;
+ object->cached_page_count++;
cnt.v_cache_count++;
- root = object->cache;
- if (root == NULL) {
- m->left = NULL;
- m->right = NULL;
- } else {
- root = vm_page_splay(m->pindex, root);
- if (m->pindex < root->pindex) {
- m->left = root->left;
- m->right = root;
- root->left = NULL;
- } else if (__predict_false(m->pindex == root->pindex))
- panic("vm_page_cache: offset already cached");
- else {
- m->right = root->right;
- m->left = root;
- root->right = NULL;
- }
- }
- object->cache = m;
#if VM_NRESERVLEVEL > 0
if (!vm_reserv_free_page(m)) {
#else
@@ -2445,9 +2204,9 @@ vm_page_cache(vm_page_t m)
* the object's only resident page.
*/
if (object->type == OBJT_VNODE) {
- if (root == NULL && object->resident_page_count != 0)
+ if (old_cached == 0 && object->resident_page_count != 0)
vhold(object->handle);
- else if (root != NULL && object->resident_page_count == 0)
+ else if (old_cached != 0 && object->resident_page_count == 0)
vdrop(object->handle);
}
}
@@ -2969,11 +2728,8 @@ vm_page_cowfault(vm_page_t m)
pindex = m->pindex;
retry_alloc:
- pmap_remove_all(m);
- vm_page_remove(m);
mnew = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY);
if (mnew == NULL) {
- vm_page_insert(m, object, pindex);
vm_page_unlock(m);
VM_OBJECT_UNLOCK(object);
VM_WAIT;
@@ -2999,8 +2755,9 @@ vm_page_cowfault(vm_page_t m)
vm_page_lock(mnew);
vm_page_free(mnew);
vm_page_unlock(mnew);
- vm_page_insert(m, object, pindex);
} else { /* clear COW & copy page */
+ pmap_remove_all(m);
+ vm_page_remove(m);
if (!so_zerocp_fullpage)
pmap_copy_page(m, mnew);
mnew->valid = VM_PAGE_BITS_ALL;
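
Aside: vm_page_insert() now returns an int because inserting into the radix tree may fail to allocate interior nodes. The hunks above show the two caller strategies: vm_page_alloc() and vm_page_alloc_contig() unwind and free the freshly allocated pages, while vm_page_rename() wakes the pagedaemon and retries. Below is a minimal, self-contained sketch of that fail-and-recover contract, not part of the patch, with invented names and a simulated one-shot failure.

#include <errno.h>
#include <stdio.h>

static int fail_once = 1;

/*
 * Simulated radix insertion: fails once, as a node allocation might
 * under memory pressure, then succeeds.
 */
static int
radix_insert(unsigned long idx)
{
	(void)idx;
	if (fail_once) {
		fail_once = 0;
		return (ENOMEM);
	}
	return (0);
}

static int
page_insert(unsigned long idx)
{
	/* ...link the page into the object's list here... */
	if (radix_insert(idx) != 0) {
		/* ...unlink the page and restore its old index... */
		return (ENOMEM);
	}
	return (0);
}

int
main(void)
{
	int error;

	while ((error = page_insert(7)) != 0)
		fprintf(stderr, "insert failed (%d), backing off\n", error);
	printf("inserted\n");
	return (0);
}
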
diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h
index 7a3f138..8a3deb9 100644
--- a/sys/vm/vm_page.h
+++ b/sys/vm/vm_page.h
@@ -128,8 +128,6 @@ typedef uint64_t vm_page_bits_t;
struct vm_page {
TAILQ_ENTRY(vm_page) pageq; /* page queue or free list (Q) */
TAILQ_ENTRY(vm_page) listq; /* pages in same object (O) */
- struct vm_page *left; /* splay tree link (O) */
- struct vm_page *right; /* splay tree link (O) */
vm_object_t object; /* which object am I in (O,P)*/
vm_pindex_t pindex; /* offset into object (O,P) */
@@ -378,8 +376,8 @@ vm_page_t vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req,
vm_page_t vm_page_alloc_freelist(int, int);
vm_page_t vm_page_grab (vm_object_t, vm_pindex_t, int);
void vm_page_cache(vm_page_t);
-void vm_page_cache_free(vm_object_t, vm_pindex_t, vm_pindex_t);
-void vm_page_cache_transfer(vm_object_t, vm_pindex_t, vm_object_t);
+void vm_page_cache_free(vm_page_t);
+void vm_page_cache_rename(vm_page_t, vm_object_t, vm_pindex_t);
int vm_page_try_to_cache (vm_page_t);
int vm_page_try_to_free (vm_page_t);
void vm_page_dontneed(vm_page_t);
@@ -389,8 +387,7 @@ void vm_page_dequeue_locked(vm_page_t m);
vm_page_t vm_page_find_least(vm_object_t, vm_pindex_t);
vm_page_t vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr);
void vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr);
-void vm_page_insert (vm_page_t, vm_object_t, vm_pindex_t);
-boolean_t vm_page_is_cached(vm_object_t object, vm_pindex_t pindex);
+int vm_page_insert (vm_page_t, vm_object_t, vm_pindex_t);
vm_page_t vm_page_lookup (vm_object_t, vm_pindex_t);
vm_page_t vm_page_next(vm_page_t m);
int vm_page_pa_tryrelock(pmap_t, vm_paddr_t, vm_paddr_t *);
@@ -404,7 +401,6 @@ void vm_page_requeue(vm_page_t m);
void vm_page_requeue_locked(vm_page_t m);
void vm_page_set_valid_range(vm_page_t m, int base, int size);
void vm_page_sleep(vm_page_t m, const char *msg);
-vm_page_t vm_page_splay(vm_pindex_t, vm_page_t);
vm_offset_t vm_page_startup(vm_offset_t vaddr);
void vm_page_unhold_pages(vm_page_t *ma, int count);
void vm_page_unwire (vm_page_t, int);
diff --git a/sys/vm/vm_radix.c b/sys/vm/vm_radix.c
new file mode 100644
index 0000000..f29f5d3
--- /dev/null
+++ b/sys/vm/vm_radix.c
@@ -0,0 +1,956 @@
+/*
+ * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org>
+ * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+
+/*
+ * Radix tree implementation.
+ */
+
+#include <sys/cdefs.h>
+
+#include <sys/param.h>
+#include <sys/conf.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/queue.h>
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/ktr.h>
+#include <vm/uma.h>
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_page.h>
+#ifndef UMA_MD_SMALL_ALLOC
+#include <vm/vm_map.h>
+#endif
+#include <vm/vm_radix.h>
+#include <vm/vm_object.h>
+
+#include <sys/kdb.h>
+
+#ifndef UMA_MD_SMALL_ALLOC
+#define VM_RADIX_RNODE_MAP_SCALE (1024 * 1024 / 2)
+#define VM_RADIX_WIDTH 4
+
+/*
+ * Bits of height in root.
+ * This is the smallest mask that can hold a height of VM_RADIX_LIMIT.
+ */
+#define VM_RADIX_HEIGHT 0x1f
+#else
+#define VM_RADIX_WIDTH 5
+
+/* See the comment above. */
+#define VM_RADIX_HEIGHT 0xf
+#endif
+
+#define VM_RADIX_COUNT (1 << VM_RADIX_WIDTH)
+#define VM_RADIX_MASK (VM_RADIX_COUNT - 1)
+#define VM_RADIX_MAXVAL ((vm_pindex_t)-1)
+#define VM_RADIX_LIMIT howmany((sizeof(vm_pindex_t) * NBBY), VM_RADIX_WIDTH)
+
+/* Flag bits stored in node pointers. */
+#define VM_RADIX_FLAGS 0x3
+
+/* Calculates maximum value for a tree of height h. */
+#define VM_RADIX_MAX(h) \
+ ((h) == VM_RADIX_LIMIT ? VM_RADIX_MAXVAL : \
+ (((vm_pindex_t)1 << ((h) * VM_RADIX_WIDTH)) - 1))
+
+/*
+ * On 32-bit architectures KTR cannot handle 64-bit values.
+ * Add macros that split them into 32-bit halves and provide format strings.
+ * Note that braces are not used because they would break compilation.
+ * Also, note that arguments are cast to u_long in order to follow KTR
+ * convention.
+ */
+#ifdef KTR
+#define KFRMT64(x) __STRING(x)"l 0x%08lx, "__STRING(x)"h 0x%08lx"
+#define KSPLT64L(x) ((u_long)((x) & 0xFFFFFFFF))
+#define KSPLT64H(x) ((u_long)(((x) >> 32) & 0xFFFFFFFF))
+#endif
+
+struct vm_radix_node {
+ void *rn_child[VM_RADIX_COUNT]; /* Child nodes. */
+ volatile uint32_t rn_count; /* Valid children. */
+};
+
+CTASSERT(VM_RADIX_HEIGHT >= VM_RADIX_LIMIT);
+CTASSERT(sizeof(struct vm_radix_node) < PAGE_SIZE);
+
+static uma_zone_t vm_radix_node_zone;
+
+#ifndef UMA_MD_SMALL_ALLOC
+static vm_map_t rnode_map;
+static u_long rnode_map_scale;
+
+static void *
+vm_radix_node_zone_allocf(uma_zone_t zone, int size, uint8_t *flags, int wait)
+{
+ vm_offset_t addr;
+ vm_page_t m;
+ int pflags;
+
+ /* Inform UMA that this allocator uses rnode_map. */
+ *flags = UMA_SLAB_KERNEL;
+
+ pflags = VM_ALLOC_WIRED | VM_ALLOC_NOOBJ;
+
+ /*
+	 * Since kmem_alloc_nofault() can fail, just assume that
+	 * M_NOWAIT is in effect and act accordingly.
+ */
+ pflags |= ((wait & M_USE_RESERVE) != 0) ? VM_ALLOC_INTERRUPT :
+ VM_ALLOC_SYSTEM;
+ if ((wait & M_ZERO) != 0)
+ pflags |= VM_ALLOC_ZERO;
+ addr = kmem_alloc_nofault(rnode_map, size);
+ if (addr == 0)
+ return (NULL);
+
+ /* Just one page allocation is assumed here. */
+ m = vm_page_alloc(NULL, OFF_TO_IDX(addr - VM_MIN_KERNEL_ADDRESS),
+ pflags);
+ if (m == NULL) {
+ kmem_free(rnode_map, addr, size);
+ return (NULL);
+ }
+ if ((wait & M_ZERO) != 0 && (m->flags & PG_ZERO) == 0)
+ pmap_zero_page(m);
+ pmap_qenter(addr, &m, 1);
+ return ((void *)addr);
+}
+
+static void
+vm_radix_node_zone_freef(void *item, int size, uint8_t flags)
+{
+ vm_page_t m;
+ vm_offset_t voitem;
+
+ MPASS((flags & UMA_SLAB_KERNEL) != 0);
+
+ /* Just one page allocation is assumed here. */
+ voitem = (vm_offset_t)item;
+ m = PHYS_TO_VM_PAGE(pmap_kextract(voitem));
+ pmap_qremove(voitem, 1);
+ vm_page_lock(m);
+ vm_page_unwire(m, 0);
+ vm_page_free(m);
+ vm_page_unlock(m);
+ kmem_free(rnode_map, voitem, size);
+}
+
+static void
+init_vm_radix_alloc(void *dummy __unused)
+{
+
+ uma_zone_set_max(vm_radix_node_zone, rnode_map_scale);
+ uma_zone_set_allocf(vm_radix_node_zone, vm_radix_node_zone_allocf);
+ uma_zone_set_freef(vm_radix_node_zone, vm_radix_node_zone_freef);
+}
+SYSINIT(vm_radix, SI_SUB_KMEM, SI_ORDER_SECOND, init_vm_radix_alloc, NULL);
+#endif
+
+/*
+ * Radix node zone destructor.
+ */
+#ifdef INVARIANTS
+static void
+vm_radix_node_zone_dtor(void *mem, int size, void *arg)
+{
+ struct vm_radix_node *rnode;
+
+ rnode = mem;
+ KASSERT(rnode->rn_count == 0,
+ ("vm_radix_node_put: Freeing a node with %d children\n",
+ rnode->rn_count));
+}
+#endif
+
+/*
+ * Allocate a radix node. Initializes all elements to 0.
+ */
+static __inline struct vm_radix_node *
+vm_radix_node_get(void)
+{
+
+ return (uma_zalloc(vm_radix_node_zone, M_NOWAIT | M_ZERO));
+}
+
+/*
+ * Free radix node.
+ */
+static __inline void
+vm_radix_node_put(struct vm_radix_node *rnode)
+{
+
+ uma_zfree(vm_radix_node_zone, rnode);
+}
+
+/*
+ * Return the position in the array for a given level.
+ */
+static __inline int
+vm_radix_slot(vm_pindex_t index, int level)
+{
+
+ return ((index >> (level * VM_RADIX_WIDTH)) & VM_RADIX_MASK);
+}
+
+/*
+ * Initialize the radix node submap (for architectures not supporting
+ * direct-mapping) and the radix node zone.
+ *
+ * WITNESS reports a lock order reversal, for architectures not
+ * supporting direct-mapping, between the "system map" lock
+ * and the "vm object" lock. This is because the well established ordering
+ * "system map" -> "vm object" is not honoured in this case as allocating
+ * from the radix node submap ends up adding a mapping entry to it, meaning
+ * it is necessary to lock the submap. However, the radix node submap is
+ * a leaf and self-contained, thus a deadlock cannot happen here and
+ * adding MTX_NOWITNESS to all map locks would be largely sub-optimal.
+ */
+void
+vm_radix_init(void)
+{
+#ifndef UMA_MD_SMALL_ALLOC
+ vm_offset_t maxaddr, minaddr;
+
+ rnode_map_scale = VM_RADIX_RNODE_MAP_SCALE;
+ TUNABLE_ULONG_FETCH("hw.rnode_map_scale", &rnode_map_scale);
+ rnode_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr,
+ rnode_map_scale * sizeof(struct vm_radix_node), FALSE);
+ rnode_map->system_map = 1;
+#endif
+
+ vm_radix_node_zone = uma_zcreate("RADIX NODE",
+ sizeof(struct vm_radix_node), NULL,
+#ifdef INVARIANTS
+ vm_radix_node_zone_dtor,
+#else
+ NULL,
+#endif
+ NULL, NULL, VM_RADIX_HEIGHT, UMA_ZONE_VM);
+}
+
+/*
+ * Extract the root node and height from a radix tree with a single load.
+ */
+static __inline int
+vm_radix_height(struct vm_radix *rtree, struct vm_radix_node **rnode)
+{
+ uintptr_t root;
+ int height;
+
+ root = rtree->rt_root;
+ height = root & VM_RADIX_HEIGHT;
+ *rnode = (struct vm_radix_node *)(root - height);
+ return (height);
+}
+
+
+/*
+ * Set the root node and height for a radix tree.
+ */
+static inline void
+vm_radix_setroot(struct vm_radix *rtree, struct vm_radix_node *rnode,
+ int height)
+{
+ uintptr_t root;
+
+ root = (uintptr_t)rnode | height;
+ rtree->rt_root = root;
+}
+
+static inline void
+vm_radix_unwind_heightup(struct vm_radix *rtree, struct vm_radix_node *root,
+ struct vm_radix_node *iroot, int ilevel)
+{
+ struct vm_radix_node *rnode;
+
+ CTR4(KTR_VM, "unwind: tree %p, root %p, iroot %p, ilevel %d",
+ rtree, root, iroot, ilevel);
+ while (iroot != root && root != NULL) {
+ rnode = root;
+ MPASS(rnode->rn_count == 0 || rnode->rn_count == 1);
+ rnode->rn_count = 0;
+ root = rnode->rn_child[0];
+ vm_radix_node_put(rnode);
+ }
+ vm_radix_setroot(rtree, iroot, ilevel);
+}
+
+static inline void *
+vm_radix_match(void *child, int color)
+{
+ uintptr_t c;
+
+ c = (uintptr_t)child;
+
+ if ((c & color) == 0)
+ return (NULL);
+ return ((void *)(c & ~VM_RADIX_FLAGS));
+}
+
+static void
+vm_radix_reclaim_allnodes_internal(struct vm_radix_node *rnode, int level)
+{
+ int slot;
+
+ MPASS(rnode != NULL && level >= 0);
+
+ /*
+	 * Level 0 just contains pages as children, so treat it as a special
+	 * case: free the node and return.
+ */
+ if (level == 0) {
+ CTR2(KTR_VM, "reclaiming: node %p, level %d", rnode, level);
+ rnode->rn_count = 0;
+ vm_radix_node_put(rnode);
+ return;
+ }
+ for (slot = 0; slot < VM_RADIX_COUNT && rnode->rn_count != 0; slot++) {
+ if (rnode->rn_child[slot] == NULL)
+ continue;
+ CTR3(KTR_VM,
+ "reclaiming: node %p, level %d recursing in slot %d",
+ rnode, level, slot);
+ vm_radix_reclaim_allnodes_internal(rnode->rn_child[slot],
+ level - 1);
+ rnode->rn_count--;
+ }
+ MPASS(rnode->rn_count == 0);
+ CTR2(KTR_VM, "reclaiming: node %p, level %d", rnode, level);
+ vm_radix_node_put(rnode);
+}
+
+/*
+ * Remove the specified index from the tree. If possible the height of the
+ * tree is adjusted after deletion. May be used to clean up intermediate
+ * nodes of the path if the key is not entirely present.
+ */
+static void
+vm_radix_sweep(struct vm_radix *rtree, vm_pindex_t index, int color, int hard)
+{
+ struct vm_radix_node *stack[VM_RADIX_LIMIT];
+ struct vm_radix_node *rnode, *root;
+ int level;
+ int slot;
+
+ level = vm_radix_height(rtree, &root);
+ KASSERT(index <= VM_RADIX_MAX(level),
+ ("vm_radix_sweep: %p index out of range %jd.", rtree,
+ VM_RADIX_MAX(level)));
+ rnode = root;
+ level--;
+ /*
+ * Find the node and record the path in stack.
+ */
+ while (level && rnode) {
+ stack[level] = rnode;
+ slot = vm_radix_slot(index, level);
+ CTR6(KTR_VM,
+ "remove: tree %p, " KFRMT64(index) ", level %d, slot %d, rnode %p",
+ rtree, KSPLT64L(index), KSPLT64H(index), level, slot,
+ rnode);
+ CTR4(KTR_VM, "remove: tree %p, rnode %p, child %p, count %u",
+ rtree, rnode, rnode->rn_child[slot], rnode->rn_count);
+ rnode = rnode->rn_child[slot];
+ level--;
+ }
+ if (rnode == NULL) {
+ if (hard)
+ panic("vm_radix_sweep: index not present.\n");
+
+ /*
+		 * We almost certainly got a half-constructed tree, which
+		 * is expected here.
+ */
+ KASSERT(level < (VM_RADIX_LIMIT - 1),
+ ("vm_radix_sweep: level %d not consistent.\n", level));
+ ++level;
+ rnode = stack[level];
+ slot = vm_radix_slot(index, level);
+ } else {
+ slot = vm_radix_slot(index, 0);
+ if (hard &&
+ vm_radix_match(rnode->rn_child[slot], color) == NULL)
+ panic("vm_radix_sweep: index not present as leaf.\n");
+ }
+
+ for (;;) {
+ CTR6(KTR_VM,
+"remove: resetting tree %p, " KFRMT64(index) ", level %d, slot %d, rnode %p",
+ rtree, KSPLT64L(index), KSPLT64H(index), level, slot,
+ rnode);
+ CTR4(KTR_VM,
+ "remove: resetting tree %p, rnode %p, child %p, count %u",
+ rtree, rnode,
+ (rnode != NULL) ? rnode->rn_child[slot] : NULL,
+ (rnode != NULL) ? rnode->rn_count : 0);
+ rnode->rn_child[slot] = NULL;
+ /*
+ * Use atomics for the last level since red and black
+ * will both adjust it.
+ * Use a write memory barrier here in order to avoid
+		 * rn_count reaching 0 before the actual pointer is fetched.
+		 * Concurrent black removal may, in fact, want to reclaim
+		 * the radix node itself before it is read.
+ */
+ MPASS(rnode->rn_count != 0);
+ if (level == 0)
+ atomic_add_rel_32(&rnode->rn_count, -1);
+ else
+ rnode->rn_count--;
+
+ /*
+ * Only allow black removes to prune the tree.
+ */
+ if ((color & VM_RADIX_BLACK) == 0 || rnode->rn_count > 0)
+ break;
+ vm_radix_node_put(rnode);
+ if (rnode == root) {
+ vm_radix_setroot(rtree, NULL, 0);
+ break;
+ }
+ rnode = stack[++level];
+ slot = vm_radix_slot(index, level);
+
+ }
+}
+
+/*
+ * Inserts the key-value pair into the radix tree. Returns 0 or an errno value.
+ * Panics if the key already exists.
+ */
+int
+vm_radix_insert(struct vm_radix *rtree, vm_pindex_t index, void *val)
+{
+ struct vm_radix_node *iroot, *rnode, *root;
+ int ilevel, level, slot;
+
+ CTR4(KTR_VM,
+ "insert: tree %p, " KFRMT64(index) ", val %p", rtree,
+ KSPLT64L(index), KSPLT64H(index), val);
+ if (index == -1)
+ panic("vm_radix_insert: -1 is not a valid index.\n");
+ level = vm_radix_height(rtree, &root);
+ /*
+ * Increase the height by adding nodes at the root until
+ * there is sufficient space.
+ */
+ ilevel = level;
+ iroot = root;
+ while (level == 0 || index > VM_RADIX_MAX(level)) {
+ CTR5(KTR_VM,
+ "insert: expanding " KFRMT64(index) ">" KFRMT64(mxl) ", height %d",
+ KSPLT64L(index), KSPLT64H(index),
+ KSPLT64L(VM_RADIX_MAX(level)),
+ KSPLT64H(VM_RADIX_MAX(level)), level);
+ level++;
+ KASSERT(level <= VM_RADIX_LIMIT,
+ ("vm_radix_insert: Tree %p height %d too tall",
+ rtree, level));
+ /*
+ * Only allocate tree nodes if they are needed.
+ */
+ if (root == NULL || root->rn_count != 0) {
+ rnode = vm_radix_node_get();
+ if (rnode == NULL) {
+ CTR5(KTR_VM,
+ "insert: tree %p, root %p, " KFRMT64(index) ", level %d ENOMEM",
+ rtree, root, KSPLT64L(index),
+ KSPLT64H(index), level);
+ vm_radix_unwind_heightup(rtree, root, iroot,
+ ilevel);
+ return (ENOMEM);
+ }
+ /*
+ * Store the new pointer with a memory barrier so
+ * that it is visible before the new root.
+ */
+ if (root) {
+ atomic_store_rel_ptr((volatile uintptr_t *)
+ &rnode->rn_child[0], (uintptr_t)root);
+ rnode->rn_count = 1;
+ }
+ root = rnode;
+ }
+ vm_radix_setroot(rtree, root, level);
+ }
+
+ /* Now that the tree is tall enough, fill in the path to the index. */
+ rnode = root;
+ for (level = level - 1; level > 0; level--) {
+ slot = vm_radix_slot(index, level);
+		/* Add the required intermediate nodes. */
+ if (rnode->rn_child[slot] == NULL) {
+
+ /*
+			 * In case of a failed allocation, vm_radix_sweep()
+			 * will unwind rn_count appropriately.
+ */
+ rnode->rn_count++;
+ rnode->rn_child[slot] = vm_radix_node_get();
+ if (rnode->rn_child[slot] == NULL) {
+ CTR6(KTR_VM,
+ "insert: tree %p, " KFRMT64(index) ", level %d, slot %d, rnode %p ENOMEM",
+ rtree, KSPLT64L(index), KSPLT64H(index),
+ level, slot, rnode);
+ CTR4(KTR_VM,
+ "insert: tree %p, rnode %p, child %p, count %u ENOMEM",
+ rtree, rnode, rnode->rn_child[slot],
+ rnode->rn_count);
+ vm_radix_sweep(rtree, index, VM_RADIX_BLACK, 0);
+
+ /*
+ * vm_radix_sweep() may have changed the shape
+				 * of the tree; refetch the root.
+ */
+ vm_radix_height(rtree, &root);
+ vm_radix_unwind_heightup(rtree, root, iroot,
+ ilevel);
+ return (ENOMEM);
+ }
+ }
+ CTR6(KTR_VM,
+ "insert: tree %p, " KFRMT64(index) ", level %d, slot %d, rnode %p",
+ rtree, KSPLT64L(index), KSPLT64H(index), level, slot,
+ rnode);
+ CTR4(KTR_VM,
+ "insert: tree %p, rnode %p, child %p, count %u",
+ rtree, rnode, rnode->rn_child[slot], rnode->rn_count);
+ rnode = rnode->rn_child[slot];
+ }
+
+ slot = vm_radix_slot(index, 0);
+ MPASS(rnode != NULL);
+ KASSERT(rnode->rn_child[slot] == NULL,
+ ("vm_radix_insert: Duplicate value %p at index: %lu\n",
+ rnode->rn_child[slot], (u_long)index));
+ val = (void *)((uintptr_t)val | VM_RADIX_BLACK);
+ rnode->rn_child[slot] = val;
+ atomic_add_32(&rnode->rn_count, 1);
+ CTR5(KTR_VM,
+ "insert: tree %p, " KFRMT64(index) ", level %d, slot %d",
+ rtree, KSPLT64L(index), KSPLT64H(index), level, slot);
+ CTR3(KTR_VM, "insert: slot %d, rnode %p, count %u", slot, rnode,
+ rnode->rn_count);
+
+ return 0;
+}
+
+/*
+ * Returns the value stored at the index. If the index is not present
+ * NULL is returned.
+ */
+void *
+vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index, int color)
+{
+ struct vm_radix_node *rnode;
+ int slot;
+ int level;
+
+ level = vm_radix_height(rtree, &rnode);
+ if (index > VM_RADIX_MAX(level))
+ return NULL;
+ level--;
+ while (rnode) {
+ slot = vm_radix_slot(index, level);
+ CTR6(KTR_VM,
+ "lookup: tree %p, " KFRMT64(index) ", level %d, slot %d, rnode %p",
+ rtree, KSPLT64L(index), KSPLT64H(index), level, slot,
+ rnode);
+ CTR2(KTR_VM, "lookup: rnode %p, child %p", rnode,
+ rnode->rn_child[slot]);
+ if (level == 0)
+ return vm_radix_match(rnode->rn_child[slot], color);
+ rnode = rnode->rn_child[slot];
+ level--;
+ }
+ CTR3(KTR_VM, "lookup: tree %p, " KFRMT64(index) " failed", rtree,
+ KSPLT64L(index), KSPLT64H(index));
+
+ return NULL;
+}
+
+void *
+vm_radix_color(struct vm_radix *rtree, vm_pindex_t index, int color)
+{
+ struct vm_radix_node *rnode;
+ uintptr_t child;
+ int slot;
+ int level;
+
+ level = vm_radix_height(rtree, &rnode);
+ if (index > VM_RADIX_MAX(level))
+ return NULL;
+ level--;
+ while (rnode) {
+ slot = vm_radix_slot(index, level);
+ CTR6(KTR_VM,
+ "color: tree %p, " KFRMT64(index) ", level %d, slot %d, rnode %p",
+ rtree, KSPLT64L(index), KSPLT64H(index), level, slot,
+ rnode);
+ CTR2(KTR_VM, "color: rnode %p, child %p", rnode,
+ rnode->rn_child[slot]);
+ if (level == 0)
+ break;
+ rnode = rnode->rn_child[slot];
+ level--;
+ }
+ if (rnode == NULL || rnode->rn_child[slot] == NULL)
+ return (NULL);
+ child = (uintptr_t)rnode->rn_child[slot];
+ child &= ~VM_RADIX_FLAGS;
+ rnode->rn_child[slot] = (void *)(child | color);
+
+ return (void *)child;
+}
+
+/*
+ * Find the first leaf with a valid node between *startp and end. Return
+ * the index of the first valid item in the leaf in *startp.
+ */
+static struct vm_radix_node *
+vm_radix_leaf(struct vm_radix *rtree, vm_pindex_t *startp, vm_pindex_t end)
+{
+ struct vm_radix_node *rnode;
+ vm_pindex_t start;
+ vm_pindex_t inc;
+ int slot;
+ int level;
+
+ start = *startp;
+restart:
+ level = vm_radix_height(rtree, &rnode);
+ if (start > VM_RADIX_MAX(level) || (end && start >= end)) {
+ rnode = NULL;
+ goto out;
+ }
+ /*
+ * Search the tree from the top for any leaf node holding an index
+ * between start and end.
+ */
+ for (level--; level; level--) {
+ slot = vm_radix_slot(start, level);
+ CTR6(KTR_VM,
+ "leaf: tree %p, " KFRMT64(start) ", level %d, slot %d, rnode %p",
+ rtree, KSPLT64L(start), KSPLT64H(start), level, slot,
+ rnode);
+ CTR2(KTR_VM, "leaf: rnode %p, child %p", rnode,
+ rnode->rn_child[slot]);
+ if (rnode->rn_child[slot] != NULL) {
+ rnode = rnode->rn_child[slot];
+ continue;
+ }
+ /*
+ * Calculate how much to increment our index by
+ * based on the tree level. We must truncate the
+		 * lower bits to start from the beginning of the
+ * next leaf.
+ */
+ inc = 1LL << (level * VM_RADIX_WIDTH);
+ start &= ~VM_RADIX_MAX(level);
+
+		/* Avoid the start address wrapping around. */
+ if ((VM_RADIX_MAXVAL - start) < inc) {
+ rnode = NULL;
+ goto out;
+ }
+ start += inc;
+ slot++;
+ CTR6(KTR_VM,
+ "leaf: " KFRMT64(start) ", " KFRMT64(end) ", " KFRMT64(inc),
+ KSPLT64L(start), KSPLT64H(start), KSPLT64L(end),
+ KSPLT64H(end), KSPLT64L(inc), KSPLT64H(inc));
+ CTR2(KTR_VM, "leaf: level %d, slot %d", level, slot);
+ for (; slot < VM_RADIX_COUNT; slot++, start += inc) {
+ if (end != 0 && start >= end) {
+ rnode = NULL;
+ goto out;
+ }
+ if (rnode->rn_child[slot]) {
+ rnode = rnode->rn_child[slot];
+ break;
+ }
+ if ((VM_RADIX_MAXVAL - start) < inc) {
+ rnode = NULL;
+ goto out;
+ }
+ }
+ if (slot == VM_RADIX_COUNT)
+ goto restart;
+ }
+
+out:
+ *startp = start;
+ return (rnode);
+}
+
+
+
+/*
+ * Looks up as many as cnt values between start and end, and stores
+ * them in the caller-allocated array out. The next index can be used
+ * to restart the scan. This optimizes forward scans in the tree.
+ */
+int
+vm_radix_lookupn(struct vm_radix *rtree, vm_pindex_t start,
+ vm_pindex_t end, int color, void **out, int cnt, vm_pindex_t *next,
+ u_int *exhausted)
+{
+ struct vm_radix_node *rnode;
+ void *val;
+ int slot;
+ int outidx;
+
+ CTR5(KTR_VM, "lookupn: tree %p, " KFRMT64(start) ", " KFRMT64(end),
+ rtree, KSPLT64L(start), KSPLT64H(start), KSPLT64L(end),
+ KSPLT64H(end));
+ if (end == 0)
+ *exhausted = 0;
+ if (rtree->rt_root == 0)
+ return (0);
+ outidx = 0;
+ while ((rnode = vm_radix_leaf(rtree, &start, end)) != NULL) {
+ slot = vm_radix_slot(start, 0);
+ for (; slot < VM_RADIX_COUNT; slot++, start++) {
+ if (end != 0 && start >= end)
+ goto out;
+ val = vm_radix_match(rnode->rn_child[slot], color);
+ if (val == NULL) {
+
+ /*
+ * The start address can wrap at the
+ * VM_RADIX_MAXVAL value.
+			 * We need to make sure that the start address
+			 * points to the next chunk (even if wrapping)
+			 * to stay consistent with the default scanning
+			 * behaviour. Also, because of the nature
+			 * of the wrapping, the wrap-around checks must
+			 * be done after all the necessary checks
+			 * on start have completed.
+ */
+ if ((VM_RADIX_MAXVAL - start) == 0) {
+ start++;
+ if (end == 0)
+ *exhausted = 1;
+ goto out;
+ }
+ continue;
+ }
+ CTR5(KTR_VM,
+ "lookupn: tree %p " KFRMT64(index) " slot %d found child %p",
+ rtree, KSPLT64L(start), KSPLT64H(start), slot, val);
+ out[outidx] = val;
+ if (++outidx == cnt ||
+ (VM_RADIX_MAXVAL - start) == 0) {
+ start++;
+ if ((VM_RADIX_MAXVAL - start) == 0 && end == 0)
+ *exhausted = 1;
+ goto out;
+ }
+ }
+ MPASS((VM_RADIX_MAXVAL - start) != 0);
+ if (end != 0 && start >= end)
+ break;
+ }
+out:
+ *next = start;
+ return (outidx);
+}
+
+#if 0
+void
+vm_radix_foreach(struct vm_radix *rtree, vm_pindex_t start, vm_pindex_t end,
+ int color, void (*iter)(void *))
+{
+ struct vm_radix_node *rnode;
+ void *val;
+ int slot;
+
+ if (rtree->rt_root == 0)
+ return;
+ while ((rnode = vm_radix_leaf(rtree, &start, end)) != NULL) {
+ slot = vm_radix_slot(start, 0);
+ for (; slot < VM_RADIX_COUNT; slot++, start++) {
+ if (end != 0 && start >= end)
+ return;
+ val = vm_radix_match(rnode->rn_child[slot], color);
+ if (val)
+ iter(val);
+ }
+ if (end != 0 && start >= end)
+ return;
+ }
+}
+#endif
+
+
+/*
+ * Look up any entry at a position less than or equal to index.
+ */
+void *
+vm_radix_lookup_le(struct vm_radix *rtree, vm_pindex_t index, int color)
+{
+ struct vm_radix_node *rnode;
+ struct vm_radix_node *child;
+ vm_pindex_t max;
+ vm_pindex_t inc;
+ void *val;
+ int slot;
+ int level;
+
+ CTR3(KTR_VM, "lookup_le: tree %p, " KFRMT64(index), rtree,
+ KSPLT64L(index), KSPLT64H(index));
+restart:
+ level = vm_radix_height(rtree, &rnode);
+ if (rnode == NULL)
+ return (NULL);
+ max = VM_RADIX_MAX(level);
+ if (index > max || index == 0)
+ index = max;
+ /*
+ * Search the tree from the top for any leaf node holding an index
+ * lower than 'index'.
+ */
+ level--;
+ while (rnode) {
+ slot = vm_radix_slot(index, level);
+ CTR6(KTR_VM,
+ "lookup_le: tree %p, " KFRMT64(index) ", level %d, slot %d, rnode %p",
+ rtree, KSPLT64L(index), KSPLT64H(index), level, slot,
+ rnode);
+ CTR2(KTR_VM, "lookup_le: rnode %p, child %p", rnode,
+ rnode->rn_child[slot]);
+ if (level == 0)
+ break;
+ /*
+ * If we don't have an exact match we must start our search
+ * from the next leaf and adjust our index appropriately.
+ */
+ if ((child = rnode->rn_child[slot]) == NULL) {
+ /*
+ * Calculate how much to decrement our index by
+ * based on the tree level. We must set the
+ * lower bits to start from the end of the next
+ * leaf.
+ */
+ inc = 1LL << (level * VM_RADIX_WIDTH);
+ index |= VM_RADIX_MAX(level);
+ index -= inc;
+ slot--;
+ CTR6(KTR_VM,
+ "lookup_le: " KFRMT64(start) ", " KFRMT64(inc) ", level %d slot %d",
+ KSPLT64L(index), KSPLT64H(index), KSPLT64L(inc),
+ KSPLT64H(inc), level, slot);
+ for (; slot >= 0; slot--, index -= inc) {
+ child = rnode->rn_child[slot];
+ if (child)
+ break;
+ }
+ }
+ rnode = child;
+ level--;
+ }
+ if (rnode) {
+ for (; slot >= 0; slot--, index--) {
+ val = vm_radix_match(rnode->rn_child[slot], color);
+ if (val)
+ return (val);
+ }
+ }
+ if (index != -1)
+ goto restart;
+ return (NULL);
+}
+
+/*
+ * Remove an entry from the tree. Expects the entry to be already present.
+ */
+void
+vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index, int color)
+{
+
+ vm_radix_sweep(rtree, index, color, 1);
+}
+
+/*
+ * Remove and free all the nodes from the radix tree.
+ * This function is recursive, but recursion is tightly bounded because the
+ * maximum depth of the tree is fixed.
+ */
+void
+vm_radix_reclaim_allnodes(struct vm_radix *rtree)
+{
+ struct vm_radix_node *root;
+ int level;
+
+ if (rtree->rt_root == 0)
+ return;
+ level = vm_radix_height(rtree, &root);
+ vm_radix_reclaim_allnodes_internal(root, level - 1);
+ rtree->rt_root = 0;
+}
+
+#ifdef notyet
+/*
+ * Attempts to reduce the height of the tree.
+ */
+void
+vm_radix_shrink(struct vm_radix *rtree)
+{
+ struct vm_radix_node *tmp, *root;
+ int level;
+
+ if (rtree->rt_root == 0)
+ return;
+ level = vm_radix_height(rtree, &root);
+
+ /* Adjust the height of the tree. */
+ while (root->rn_count == 1 && root->rn_child[0] != NULL) {
+ tmp = root;
+ root->rn_count--;
+ root = root->rn_child[0];
+ level--;
+ vm_radix_node_put(tmp);
+ }
+ /* Finally see if we have an empty tree. */
+ if (root->rn_count == 0) {
+ vm_radix_node_put(root);
+ root = NULL;
+ level--;
+ }
+ vm_radix_setroot(rtree, root, level);
+}
+#endif
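
Aside: the tree above consumes the page index VM_RADIX_WIDTH bits at a time, one digit per level, with level 0 holding the leaves; VM_RADIX_MAX(h) is simply the largest index a tree of height h can address. The self-contained illustration below is not part of the patch; it uses the 4-bit width chosen for platforms without UMA_MD_SMALL_ALLOC, and omits the special case the real macro applies at full height so the shift never reaches the word size.

#include <stdio.h>
#include <stdint.h>

#define	WIDTH	4			/* VM_RADIX_WIDTH */
#define	COUNT	(1 << WIDTH)		/* VM_RADIX_COUNT */
#define	MASK	(COUNT - 1)		/* VM_RADIX_MASK */

/* Slot selecting the child at a given level, as vm_radix_slot() does. */
static int
slot(uint64_t index, int level)
{
	return ((index >> (level * WIDTH)) & MASK);
}

/* Largest index a tree of the given height can hold. */
static uint64_t
maxval(int height)
{
	return (((uint64_t)1 << (height * WIDTH)) - 1);
}

int
main(void)
{
	uint64_t index = 0x2a9f1;	/* arbitrary page index */
	int level;

	for (level = 4; level >= 0; level--)
		printf("level %d -> slot %#x\n", level, slot(index, level));
	printf("a height-3 tree covers indexes up to %#jx\n",
	    (uintmax_t)maxval(3));
	return (0);
}
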
diff --git a/sys/vm/vm_radix.h b/sys/vm/vm_radix.h
new file mode 100644
index 0000000..72a77e6
--- /dev/null
+++ b/sys/vm/vm_radix.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org>
+ * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#ifndef _VM_RADIX_H_
+#define _VM_RADIX_H_
+
+#define VM_RADIX_BLACK 0x1 /* Black node. (leaf only) */
+#define VM_RADIX_RED 0x2 /* Red node. (leaf only) */
+#define VM_RADIX_ANY (VM_RADIX_RED | VM_RADIX_BLACK)
+#define VM_RADIX_STACK 8 /* Nodes to store on stack. */
+
+/*
+ * Radix tree root. The height and pointer are set together to permit
+ * coherent lookups while the root is modified.
+ */
+struct vm_radix {
+ uintptr_t rt_root; /* root + height */
+};
+
+#ifdef _KERNEL
+
+/*
+ * Initialize the radix tree subsystem.
+ */
+void vm_radix_init(void);
+
+/*
+ * Functions which only work with black nodes. (object lock)
+ */
+int vm_radix_insert(struct vm_radix *, vm_pindex_t, void *);
+
+/*
+ * Functions which work on specified colors. (object, vm_page_queue_free locks)
+ */
+void *vm_radix_color(struct vm_radix *, vm_pindex_t, int);
+void *vm_radix_lookup(struct vm_radix *, vm_pindex_t, int);
+int vm_radix_lookupn(struct vm_radix *, vm_pindex_t, vm_pindex_t, int,
+ void **, int, vm_pindex_t *, u_int *);
+void *vm_radix_lookup_le(struct vm_radix *, vm_pindex_t, int);
+void vm_radix_reclaim_allnodes(struct vm_radix *);
+void vm_radix_remove(struct vm_radix *, vm_pindex_t, int);
+
+/*
+ * Look up any entry at a position greater than or equal to index.
+ */
+static inline void *
+vm_radix_lookup_ge(struct vm_radix *rtree, vm_pindex_t index, int color)
+{
+ void *val;
+ u_int dummy;
+
+ if (vm_radix_lookupn(rtree, index, 0, color, &val, 1, &index, &dummy))
+ return (val);
+ return (NULL);
+}
+
+static inline void *
+vm_radix_last(struct vm_radix *rtree, int color)
+{
+
+ return vm_radix_lookup_le(rtree, 0, color);
+}
+
+static inline void *
+vm_radix_first(struct vm_radix *rtree, int color)
+{
+
+ return vm_radix_lookup_ge(rtree, 0, color);
+}
+
+static inline void *
+vm_radix_next(struct vm_radix *rtree, vm_pindex_t index, int color)
+{
+
+ if (index == -1)
+ return (NULL);
+ return vm_radix_lookup_ge(rtree, index + 1, color);
+}
+
+static inline void *
+vm_radix_prev(struct vm_radix *rtree, vm_pindex_t index, int color)
+{
+
+ if (index == 0)
+ return (NULL);
+ return vm_radix_lookup_le(rtree, index - 1, color);
+}
+
+#endif /* _KERNEL */
+#endif /* !_VM_RADIX_H_ */
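
Aside: the interface above leans on low-bit pointer tagging twice: leaf pointers carry a color (VM_RADIX_BLACK for resident pages, VM_RADIX_RED for cached ones) that lookups filter on, and the root word packs the tree height into its low bits the same way. The self-contained sketch below, not part of the patch and using names local to the example, mirrors what vm_radix_match() does for the leaf-color half.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define	BLACK	0x1
#define	RED	0x2
#define	FLAGS	0x3

/* Return the pointer if its color matches the request, else NULL. */
static void *
match(void *child, int color)
{
	uintptr_t c = (uintptr_t)child;

	if ((c & color) == 0)
		return (NULL);
	return ((void *)(c & ~(uintptr_t)FLAGS));
}

int
main(void)
{
	/*
	 * malloc() returns suitably aligned storage, so the two low bits
	 * of the pointer are free to hold the color.
	 */
	int *page = malloc(sizeof(*page));
	void *tagged;

	if (page == NULL)
		return (1);
	tagged = (void *)((uintptr_t)page | RED);
	assert(match(tagged, RED) == page);		/* cached lookup */
	assert(match(tagged, BLACK) == NULL);		/* resident lookup */
	assert(match(tagged, RED | BLACK) == page);	/* "any" lookup */
	printf("tagging ok\n");
	free(page);
	return (0);
}
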
diff --git a/sys/vm/vm_reserv.c b/sys/vm/vm_reserv.c
index e5ac1a5..549e710 100644
--- a/sys/vm/vm_reserv.c
+++ b/sys/vm/vm_reserv.c
@@ -341,33 +341,21 @@ vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t pindex, u_long npages,
/*
* Look for an existing reservation.
*/
- msucc = NULL;
- mpred = object->root;
- while (mpred != NULL) {
+ mpred = vm_radix_lookup_le(&object->rtree, pindex, VM_RADIX_BLACK);
+ if (mpred != NULL) {
KASSERT(mpred->pindex != pindex,
("vm_reserv_alloc_contig: pindex already allocated"));
rv = vm_reserv_from_page(mpred);
if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
goto found;
- else if (mpred->pindex < pindex) {
- if (msucc != NULL ||
- (msucc = TAILQ_NEXT(mpred, listq)) == NULL)
- break;
- KASSERT(msucc->pindex != pindex,
- ("vm_reserv_alloc_contig: pindex already allocated"));
- rv = vm_reserv_from_page(msucc);
- if (rv->object == object &&
- vm_reserv_has_pindex(rv, pindex))
- goto found;
- else if (pindex < msucc->pindex)
- break;
- } else if (msucc == NULL) {
- msucc = mpred;
- mpred = TAILQ_PREV(msucc, pglist, listq);
- continue;
- }
- msucc = NULL;
- mpred = object->root = vm_page_splay(pindex, object->root);
+ }
+ msucc = vm_radix_lookup_ge(&object->rtree, pindex, VM_RADIX_BLACK);
+ if (msucc != NULL) {
+ KASSERT(msucc->pindex != pindex,
+ ("vm_reserv_alloc_page: pindex already allocated"));
+ rv = vm_reserv_from_page(msucc);
+ if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
+ goto found;
}
/*
@@ -507,33 +495,21 @@ vm_reserv_alloc_page(vm_object_t object, vm_pindex_t pindex)
/*
* Look for an existing reservation.
*/
- msucc = NULL;
- mpred = object->root;
- while (mpred != NULL) {
+ mpred = vm_radix_lookup_le(&object->rtree, pindex, VM_RADIX_BLACK);
+ if (mpred != NULL) {
KASSERT(mpred->pindex != pindex,
("vm_reserv_alloc_page: pindex already allocated"));
rv = vm_reserv_from_page(mpred);
if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
goto found;
- else if (mpred->pindex < pindex) {
- if (msucc != NULL ||
- (msucc = TAILQ_NEXT(mpred, listq)) == NULL)
- break;
- KASSERT(msucc->pindex != pindex,
- ("vm_reserv_alloc_page: pindex already allocated"));
- rv = vm_reserv_from_page(msucc);
- if (rv->object == object &&
- vm_reserv_has_pindex(rv, pindex))
- goto found;
- else if (pindex < msucc->pindex)
- break;
- } else if (msucc == NULL) {
- msucc = mpred;
- mpred = TAILQ_PREV(msucc, pglist, listq);
- continue;
- }
- msucc = NULL;
- mpred = object->root = vm_page_splay(pindex, object->root);
+ }
+ msucc = vm_radix_lookup_ge(&object->rtree, pindex, VM_RADIX_BLACK);
+ if (msucc != NULL) {
+ KASSERT(msucc->pindex != pindex,
+ ("vm_reserv_alloc_page: pindex already allocated"));
+ rv = vm_reserv_from_page(msucc);
+ if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
+ goto found;
}
/*
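
Aside: both vm_reserv_alloc_contig() and vm_reserv_alloc_page() now locate the neighbors of "pindex" with one "<=" lookup and one ">=" lookup instead of splaying the tree and walking the page list. The self-contained sketch below is not part of the patch; it shows that pair of ordered queries over a plain sorted array, with helper names invented for the example.

#include <stdio.h>

static const unsigned long keys[] = { 2, 5, 9, 14, 21 };
static const int nkeys = sizeof(keys) / sizeof(keys[0]);

/* Greatest key <= idx, or -1 if none (a vm_radix_lookup_le analogue). */
static long
lookup_le(unsigned long idx)
{
	int i;

	for (i = nkeys - 1; i >= 0; i--)
		if (keys[i] <= idx)
			return ((long)keys[i]);
	return (-1);
}

/* Least key >= idx, or -1 if none (a vm_radix_lookup_ge analogue). */
static long
lookup_ge(unsigned long idx)
{
	int i;

	for (i = 0; i < nkeys; i++)
		if (keys[i] >= idx)
			return ((long)keys[i]);
	return (-1);
}

int
main(void)
{
	unsigned long pindex = 11;

	printf("predecessor of %lu: %ld\n", pindex, lookup_le(pindex));
	printf("successor of %lu: %ld\n", pindex, lookup_ge(pindex));
	return (0);
}
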
diff --git a/sys/vm/vnode_pager.c b/sys/vm/vnode_pager.c
index a6d78f4..1e3f4b4 100644
--- a/sys/vm/vnode_pager.c
+++ b/sys/vm/vnode_pager.c
@@ -373,6 +373,7 @@ vnode_pager_setsize(vp, nsize)
vm_ooffset_t nsize;
{
vm_object_t object;
+ struct vnode *drop;
vm_page_t m;
vm_pindex_t nobjsize;
@@ -398,16 +399,41 @@ vnode_pager_setsize(vp, nsize)
/*
* this gets rid of garbage at the end of a page that is now
* only partially backed by the vnode.
- *
- * XXX for some reason (I don't know yet), if we take a
- * completely invalid page and mark it partially valid
- * it can screw up NFS reads, so we don't allow the case.
*/
if ((nsize & PAGE_MASK) &&
- (m = vm_page_lookup(object, OFF_TO_IDX(nsize))) != NULL &&
- m->valid != 0) {
- int base = (int)nsize & PAGE_MASK;
- int size = PAGE_SIZE - base;
+ (m = vm_radix_lookup(&object->rtree, OFF_TO_IDX(nsize),
+ VM_RADIX_ANY)) != NULL) {
+ int base;
+ int size;
+
+ /*
+ * Eliminate any cached page as we would have to
+ * do too much work to save it.
+ */
+ if (m->flags & PG_CACHED) {
+ drop = NULL;
+ mtx_lock(&vm_page_queue_free_mtx);
+ if (m->object == object) {
+ vm_page_cache_free(m);
+ if (object->cached_page_count == 0)
+ drop = vp;
+ }
+ mtx_unlock(&vm_page_queue_free_mtx);
+ if (drop)
+ vdrop(drop);
+ goto out;
+ }
+ /*
+ * XXX for some reason (I don't know yet), if we take a
+ * completely invalid page and mark it partially valid
+ * it can screw up NFS reads, so we don't allow the
+ * case.
+ */
+ if (m->valid != 0 || m->object != object)
+ goto out;
+
+ base = (int)nsize & PAGE_MASK;
+ size = PAGE_SIZE - base;
/*
* Clear out partial-page garbage in case
@@ -437,12 +463,9 @@ vnode_pager_setsize(vp, nsize)
* replacement from working properly.
*/
vm_page_clear_dirty(m, base, PAGE_SIZE - base);
- } else if ((nsize & PAGE_MASK) &&
- vm_page_is_cached(object, OFF_TO_IDX(nsize))) {
- vm_page_cache_free(object, OFF_TO_IDX(nsize),
- nobjsize);
}
}
+out:
object->un_pager.vnp.vnp_size = nsize;
object->size = nobjsize;
VM_OBJECT_UNLOCK(object);