-rw-r--r--  lib/libz/FREEBSD-upgrade                 |    2
-rw-r--r--  lib/libz/Makefile                        |    2
-rw-r--r--  lib/libz/Symbol.map                      |    2
-rw-r--r--  lib/libz/Versions.def                    |    2
-rw-r--r--  lib/libz/zopen.c                         |    2
-rw-r--r--  sys/amd64/amd64/pmap.c                   |  151
-rw-r--r--  sys/amd64/include/pmap.h                 |   16
-rw-r--r--  sys/cddl/compat/opensolaris/sys/vnode.h  |    3
-rw-r--r--  sys/conf/files                           |    1
-rw-r--r--  sys/fs/tmpfs/tmpfs_fifoops.c             |    1
-rw-r--r--  sys/fs/tmpfs/tmpfs_vnops.c               |   61
-rw-r--r--  sys/i386/i386/pmap.c                     |  155
-rw-r--r--  sys/i386/include/pmap.h                  |   16
-rw-r--r--  sys/kern/subr_uio.c                      |   11
-rw-r--r--  sys/kern/uipc_shm.c                      |    3
-rw-r--r--  sys/vm/device_pager.c                    |    9
-rw-r--r--  sys/vm/sg_pager.c                        |    5
-rw-r--r--  sys/vm/vm_fault.c                        |    2
-rw-r--r--  sys/vm/vm_init.c                         |    2
-rw-r--r--  sys/vm/vm_mmap.c                         |   12
-rw-r--r--  sys/vm/vm_object.c                       |  431
-rw-r--r--  sys/vm/vm_object.h                       |    6
-rw-r--r--  sys/vm/vm_page.c                         |  533
-rw-r--r--  sys/vm/vm_page.h                         |   10
-rw-r--r--  sys/vm/vm_radix.c                        |  956
-rw-r--r--  sys/vm/vm_radix.h                        |  115
-rw-r--r--  sys/vm/vm_reserv.c                       |   64
-rw-r--r--  sys/vm/vnode_pager.c                     |   47
28 files changed, 1860 insertions, 760 deletions
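Orientation for the hunks below: the diff replaces the per-object resident-page splay tree (vm_object->root, vm_page->left/right) with a radix trie (vm_object->rtree, new files sys/vm/vm_radix.[ch]), while the amd64 and i386 pmaps keep a private copy of the splay as pmap_vmpage_splay(), rooted in the new md_page pv_left/pv_right union members. A related recurring change is that vm_page_insert() can now fail, returning ENOMEM when a trie node cannot be allocated, so callers such as sg_pager_getpages() and vm_pgmoveco() grow error paths. As a reading aid, here is a minimal userland transcription of the Sleator-Tarjan top-down splay that the pmaps duplicate; struct node and node_splay() are illustrative stand-ins for struct vm_page and pmap_vmpage_splay() and are not part of the patch.

#include <stddef.h>
#include <stdint.h>

typedef uint64_t vm_pindex_t;

/* Illustrative stand-in for struct vm_page; not part of the patch. */
struct node {
	vm_pindex_t	 pindex;
	struct node	*left;
	struct node	*right;
};

/*
 * Top-down splay (Sleator and Tarjan): bring the node with the given
 * pindex, or a neighboring node if that pindex is absent, to the root
 * and return the new root.  Mirrors the structure of pmap_vmpage_splay().
 */
static struct node *
node_splay(vm_pindex_t pindex, struct node *root)
{
	struct node dummy, *lefttreemax, *righttreemin, *y;

	if (root == NULL)
		return (root);
	dummy.left = dummy.right = NULL;
	lefttreemax = righttreemin = &dummy;
	for (;; root = y) {
		if (pindex < root->pindex) {
			if ((y = root->left) == NULL)
				break;
			if (pindex < y->pindex) {
				/* Rotate right. */
				root->left = y->right;
				y->right = root;
				root = y;
				if ((y = root->left) == NULL)
					break;
			}
			/* Link into the new root's right tree. */
			righttreemin->left = root;
			righttreemin = root;
		} else if (pindex > root->pindex) {
			if ((y = root->right) == NULL)
				break;
			if (pindex > y->pindex) {
				/* Rotate left. */
				root->right = y->left;
				y->left = root;
				root = y;
				if ((y = root->right) == NULL)
					break;
			}
			/* Link into the new root's left tree. */
			lefttreemax->right = root;
			lefttreemax = root;
		} else
			break;
	}
	/* Assemble the new root from the two side trees. */
	lefttreemax->right = root->left;
	righttreemin->left = root->right;
	root->left = dummy.right;
	root->right = dummy.left;
	return (root);
}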
diff --git a/lib/libz/FREEBSD-upgrade b/lib/libz/FREEBSD-upgrade index d2d251b..7482228 100644 --- a/lib/libz/FREEBSD-upgrade +++ b/lib/libz/FREEBSD-upgrade @@ -1,4 +1,4 @@ -$FreeBSD: head/lib/libz/FREEBSD-upgrade 146082 2005-05-11 03:50:50Z kientzle $ +$FreeBSD: user/attilio/vmcontention/lib/libz/FREEBSD-upgrade 146082 2005-05-11 03:50:50Z kientzle $ ZLib 1.2.2 diff --git a/lib/libz/Makefile b/lib/libz/Makefile index 4ecb022..bd1fa60 100644 --- a/lib/libz/Makefile +++ b/lib/libz/Makefile @@ -1,5 +1,5 @@ # -# $FreeBSD: head/lib/libz/Makefile 232263 2012-02-28 18:30:18Z dim $ +# $FreeBSD: user/attilio/vmcontention/lib/libz/Makefile 232325 2012-03-01 00:27:51Z attilio $ # LIB= z diff --git a/lib/libz/Symbol.map b/lib/libz/Symbol.map index c9eef6f..088cd0c 100644 --- a/lib/libz/Symbol.map +++ b/lib/libz/Symbol.map @@ -1,5 +1,5 @@ /* - * $FreeBSD: head/lib/libz/Symbol.map 206709 2010-04-16 20:07:24Z delphij $ + * $FreeBSD: user/attilio/vmcontention/lib/libz/Symbol.map 206709 2010-04-16 20:07:24Z delphij $ */ ZLIB_1.2.7.0 { diff --git a/lib/libz/Versions.def b/lib/libz/Versions.def index a5aeb28..6317174 100644 --- a/lib/libz/Versions.def +++ b/lib/libz/Versions.def @@ -1,4 +1,4 @@ -# $FreeBSD: head/lib/libz/Versions.def 205486 2010-03-22 22:12:27Z delphij $ +# $FreeBSD: user/attilio/vmcontention/lib/libz/Versions.def 205486 2010-03-22 22:12:27Z delphij $ ZLIB_1.2.4.0 { }; diff --git a/lib/libz/zopen.c b/lib/libz/zopen.c index 822a5e1..8c95ab6 100644 --- a/lib/libz/zopen.c +++ b/lib/libz/zopen.c @@ -3,7 +3,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/lib/libz/zopen.c 84228 2001-09-30 22:39:00Z dillon $"); +__FBSDID("$FreeBSD: user/attilio/vmcontention/lib/libz/zopen.c 84228 2001-09-30 22:39:00Z dillon $"); #include <stdio.h> #include <zlib.h> diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index f73e956..5436b6f 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -302,6 +302,7 @@ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde); static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde); +static vm_page_t pmap_vmpage_splay(vm_pindex_t pindex, vm_page_t root); static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp); @@ -1452,7 +1453,8 @@ pmap_free_zero_pages(vm_page_t free) while (free != NULL) { m = free; - free = m->right; + free = (void *)m->object; + m->object = NULL; /* Preserve the page's PG_ZERO setting. 
*/ vm_page_free_toq(m); } @@ -1471,7 +1473,7 @@ pmap_add_delayed_free_list(vm_page_t m, vm_page_t *free, boolean_t set_PG_ZERO) m->flags |= PG_ZERO; else m->flags &= ~PG_ZERO; - m->right = *free; + m->object = (void *)*free; *free = m; } @@ -1489,20 +1491,20 @@ pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) PMAP_LOCK_ASSERT(pmap, MA_OWNED); root = pmap->pm_root; if (root == NULL) { - mpte->left = NULL; - mpte->right = NULL; + mpte->md.pv_left = NULL; + mpte->md.pv_right = NULL; } else { - root = vm_page_splay(mpte->pindex, root); + root = pmap_vmpage_splay(mpte->pindex, root); if (mpte->pindex < root->pindex) { - mpte->left = root->left; - mpte->right = root; - root->left = NULL; + mpte->md.pv_left = root->md.pv_left; + mpte->md.pv_right = root; + root->md.pv_left = NULL; } else if (mpte->pindex == root->pindex) panic("pmap_insert_pt_page: pindex already inserted"); else { - mpte->right = root->right; - mpte->left = root; - root->right = NULL; + mpte->md.pv_right = root->md.pv_right; + mpte->md.pv_left = root; + root->md.pv_right = NULL; } } pmap->pm_root = mpte; @@ -1521,7 +1523,7 @@ pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va) PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((mpte = pmap->pm_root) != NULL && mpte->pindex != pindex) { - mpte = vm_page_splay(pindex, mpte); + mpte = pmap_vmpage_splay(pindex, mpte); if ((pmap->pm_root = mpte)->pindex != pindex) mpte = NULL; } @@ -1540,18 +1542,24 @@ pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte) PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (mpte != pmap->pm_root) { - root = vm_page_splay(mpte->pindex, pmap->pm_root); + root = pmap_vmpage_splay(mpte->pindex, pmap->pm_root); KASSERT(mpte == root, ("pmap_remove_pt_page: mpte %p is missing from pmap %p", mpte, pmap)); } - if (mpte->left == NULL) - root = mpte->right; + if (mpte->md.pv_left == NULL) + root = mpte->md.pv_right; else { - root = vm_page_splay(mpte->pindex, mpte->left); - root->right = mpte->right; + root = pmap_vmpage_splay(mpte->pindex, mpte->md.pv_left); + root->md.pv_right = mpte->md.pv_right; } pmap->pm_root = root; + + /* + * Reinitialize the pv_list which could be dirty now because of the + * splay tree work. + */ + TAILQ_INIT(&mpte->md.pv_list); } /* @@ -1627,6 +1635,61 @@ _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *free) } /* + * Implements Sleator and Tarjan's top-down splay algorithm. Returns + * the vm_page containing the given pindex. If, however, that + * pindex is not found in the pmap, returns a vm_page that is + * adjacent to the pindex, coming before or after it. + */ +static vm_page_t +pmap_vmpage_splay(vm_pindex_t pindex, vm_page_t root) +{ + struct vm_page dummy; + vm_page_t lefttreemax, righttreemin, y; + + if (root == NULL) + return (root); + lefttreemax = righttreemin = &dummy; + for (;; root = y) { + if (pindex < root->pindex) { + if ((y = root->md.pv_left) == NULL) + break; + if (pindex < y->pindex) { + /* Rotate right. */ + root->md.pv_left = y->md.pv_right; + y->md.pv_right = root; + root = y; + if ((y = root->md.pv_left) == NULL) + break; + } + /* Link into the new root's right tree. */ + righttreemin->md.pv_left = root; + righttreemin = root; + } else if (pindex > root->pindex) { + if ((y = root->md.pv_right) == NULL) + break; + if (pindex > y->pindex) { + /* Rotate left. */ + root->md.pv_right = y->md.pv_left; + y->md.pv_left = root; + root = y; + if ((y = root->md.pv_right) == NULL) + break; + } + /* Link into the new root's left tree. 
*/ + lefttreemax->md.pv_right = root; + lefttreemax = root; + } else + break; + } + /* Assemble the new root. */ + lefttreemax->md.pv_right = root->md.pv_left; + righttreemin->md.pv_left = root->md.pv_right; + root->md.pv_left = dummy.md.pv_right; + root->md.pv_right = dummy.md.pv_left; + return (root); +} + +/* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ @@ -2176,7 +2239,7 @@ reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) if ((tpte & PG_A) != 0) vm_page_aflag_set(m, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); - TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); + TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); @@ -2228,7 +2291,7 @@ reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) } if (m_pc == NULL && free != NULL) { m_pc = free; - free = m_pc->right; + free = (void *)m_pc->object; /* Recycle a freed page table page. */ m_pc->wire_count = 1; atomic_add_int(&cnt.v_wire_count, 1); @@ -2460,9 +2523,9 @@ pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) pv_entry_t pv; rw_assert(&pvh_global_lock, RA_LOCKED); - TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { + TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { - TAILQ_REMOVE(&pvh->pv_list, pv, pv_list); + TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); break; } } @@ -2501,7 +2564,7 @@ pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); m = PHYS_TO_VM_PAGE(pa); - TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); /* Instantiate the remaining NPTEPG - 1 pv entries. */ PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1)); va_last = va + NBPDR - PAGE_SIZE; @@ -2519,7 +2582,7 @@ pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, m++; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_pv_demote_pde: page %p is not managed", m)); - TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); if (va == va_last) goto out; } @@ -2567,7 +2630,7 @@ pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, pv = pmap_pvh_remove(&m->md, pmap, va); KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); pvh = pa_to_pvh(pa); - TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list); + TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); /* Free the remaining NPTEPG - 1 pv entries. 
*/ va_last = va + NBPDR - PAGE_SIZE; do { @@ -2608,7 +2671,7 @@ pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, if ((pv = get_pv_entry(pmap, NULL)) != NULL) { pv->pv_va = va; CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); - TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); return (TRUE); } else return (FALSE); @@ -2632,7 +2695,7 @@ pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, pv->pv_va = va; CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); pvh = pa_to_pvh(pa); - TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list); + TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); return (TRUE); } else return (FALSE); @@ -3111,7 +3174,7 @@ small_mappings: vm_page_dirty(m); pmap_unuse_pt(pmap, pv->pv_va, *pde, &free); pmap_invalidate_page(pmap, pv->pv_va); - TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); + TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } @@ -4250,7 +4313,7 @@ pmap_page_exists_quick(pmap_t pmap, vm_page_t m) rw_rlock(&pvh_global_lock); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); - TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { + TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; @@ -4261,7 +4324,7 @@ pmap_page_exists_quick(pmap_t pmap, vm_page_t m) } if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); - TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { + TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; @@ -4313,7 +4376,7 @@ pmap_pvh_wired_mappings(struct md_page *pvh, int count) pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); - TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { + TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte(pmap, pv->pv_va); @@ -4442,7 +4505,7 @@ pmap_remove_pages(pmap_t pmap) if ((tpte & PG_PS) != 0) { pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE); pvh = pa_to_pvh(tpte & PG_PS_FRAME); - TAILQ_REMOVE(&pvh->pv_list, pv, pv_list); + TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) if ((mt->aflags & PGA_WRITEABLE) != 0 && @@ -4461,7 +4524,7 @@ pmap_remove_pages(pmap_t pmap) } } else { pmap_resident_count_dec(pmap, 1); - TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); + TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); if ((m->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { @@ -4536,7 +4599,7 @@ pmap_is_modified_pvh(struct md_page *pvh) rw_assert(&pvh_global_lock, RA_WLOCKED); rv = FALSE; - TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { + TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte(pmap, pv->pv_va); @@ -4607,7 +4670,7 @@ pmap_is_referenced_pvh(struct md_page *pvh) rw_assert(&pvh_global_lock, RA_WLOCKED); rv = FALSE; - TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { + TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte(pmap, pv->pv_va); @@ -4648,7 +4711,7 @@ pmap_remove_write(vm_page_t m) if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); - TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) { + TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); va = pv->pv_va; @@ -4658,7 +4721,7 @@ pmap_remove_write(vm_page_t m) PMAP_UNLOCK(pmap); } small_mappings: - TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { + TAILQ_FOREACH(pv, 
&m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); @@ -4711,7 +4774,7 @@ pmap_ts_referenced(vm_page_t m) if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); - TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, pvn) { + TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, pvn) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); va = pv->pv_va; @@ -4745,9 +4808,9 @@ small_mappings: if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pvf = pv; do { - pvn = TAILQ_NEXT(pv, pv_list); - TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); - TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); + pvn = TAILQ_NEXT(pv, pv_next); + TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); @@ -4799,7 +4862,7 @@ pmap_clear_modify(vm_page_t m) if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); - TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) { + TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); va = pv->pv_va; @@ -4831,7 +4894,7 @@ pmap_clear_modify(vm_page_t m) PMAP_UNLOCK(pmap); } small_mappings: - TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { + TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); @@ -4868,7 +4931,7 @@ pmap_clear_reference(vm_page_t m) if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); - TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) { + TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); va = pv->pv_va; @@ -4891,7 +4954,7 @@ pmap_clear_reference(vm_page_t m) PMAP_UNLOCK(pmap); } small_mappings: - TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { + TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h index 71045ae..492354d 100644 --- a/sys/amd64/include/pmap.h +++ b/sys/amd64/include/pmap.h @@ -240,10 +240,20 @@ struct pv_entry; struct pv_chunk; struct md_page { - TAILQ_HEAD(,pv_entry) pv_list; - int pat_mode; + union { + TAILQ_HEAD(,pv_entry) pvi_list; + struct { + vm_page_t pii_left; + vm_page_t pii_right; + } pvi_siters; + } pv_structs; + int pat_mode; }; +#define pv_list pv_structs.pvi_list +#define pv_left pv_structs.pvi_siters.pii_left +#define pv_right pv_structs.pvi_siters.pii_right + /* * The kernel virtual address (KVA) of the level 4 page table page is always * within the direct map (DMAP) region. 
@@ -282,7 +292,7 @@ extern struct pmap kernel_pmap_store; */ typedef struct pv_entry { vm_offset_t pv_va; /* virtual address for mapping */ - TAILQ_ENTRY(pv_entry) pv_list; + TAILQ_ENTRY(pv_entry) pv_next; } *pv_entry_t; /* diff --git a/sys/cddl/compat/opensolaris/sys/vnode.h b/sys/cddl/compat/opensolaris/sys/vnode.h index eee00a0..d90ed91 100644 --- a/sys/cddl/compat/opensolaris/sys/vnode.h +++ b/sys/cddl/compat/opensolaris/sys/vnode.h @@ -75,8 +75,7 @@ vn_is_readonly(vnode_t *vp) #define vn_mountedvfs(vp) ((vp)->v_mountedhere) #define vn_has_cached_data(vp) \ ((vp)->v_object != NULL && \ - ((vp)->v_object->resident_page_count > 0 || \ - (vp)->v_object->cache != NULL)) + (vp)->v_object->cached_page_count > 0) #define vn_exists(vp) do { } while (0) #define vn_invalid(vp) do { } while (0) #define vn_renamepath(tdvp, svp, tnm, lentnm) do { } while (0) diff --git a/sys/conf/files b/sys/conf/files index f4961e9..2a3db4f 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -3626,6 +3626,7 @@ vm/vm_page.c standard vm/vm_pageout.c standard vm/vm_pager.c standard vm/vm_phys.c standard +vm/vm_radix.c standard vm/vm_reserv.c standard vm/vm_unix.c standard vm/vm_zeroidle.c standard diff --git a/sys/fs/tmpfs/tmpfs_fifoops.c b/sys/fs/tmpfs/tmpfs_fifoops.c index 347732b..e312a1e 100644 --- a/sys/fs/tmpfs/tmpfs_fifoops.c +++ b/sys/fs/tmpfs/tmpfs_fifoops.c @@ -39,6 +39,7 @@ #include <sys/param.h> #include <sys/filedesc.h> #include <sys/proc.h> +#include <sys/systm.h> #include <sys/vnode.h> #include <vm/vm.h> diff --git a/sys/fs/tmpfs/tmpfs_vnops.c b/sys/fs/tmpfs/tmpfs_vnops.c index 5e2ea65..7ffff53 100644 --- a/sys/fs/tmpfs/tmpfs_vnops.c +++ b/sys/fs/tmpfs/tmpfs_vnops.c @@ -511,11 +511,14 @@ tmpfs_mappedread(vm_object_t vobj, vm_object_t tobj, size_t len, struct uio *uio offset = addr & PAGE_MASK; tlen = MIN(PAGE_SIZE - offset, len); - if ((vobj == NULL) || - (vobj->resident_page_count == 0 && vobj->cache == NULL)) + if (vobj == NULL) goto nocache; VM_OBJECT_LOCK(vobj); + if (vobj->resident_page_count == 0 && vobj->cached_page_count == 0) { + VM_OBJECT_UNLOCK(vobj); + goto nocache; + } lookupvpg: if (((m = vm_page_lookup(vobj, idx)) != NULL) && vm_page_is_valid(m, offset, tlen)) { @@ -639,35 +642,47 @@ tmpfs_mappedwrite(vm_object_t vobj, vm_object_t tobj, size_t len, struct uio *ui offset = addr & PAGE_MASK; tlen = MIN(PAGE_SIZE - offset, len); - if ((vobj == NULL) || - (vobj->resident_page_count == 0 && vobj->cache == NULL)) { + if (vobj == NULL) { vpg = NULL; goto nocache; } VM_OBJECT_LOCK(vobj); -lookupvpg: - if (((vpg = vm_page_lookup(vobj, idx)) != NULL) && - vm_page_is_valid(vpg, offset, tlen)) { - if ((vpg->oflags & VPO_BUSY) != 0) { - /* - * Reference the page before unlocking and sleeping so - * that the page daemon is less likely to reclaim it. - */ - vm_page_reference(vpg); - vm_page_sleep(vpg, "tmfsmw"); - goto lookupvpg; - } - vm_page_busy(vpg); - vm_page_undirty(vpg); - VM_OBJECT_UNLOCK(vobj); - error = uiomove_fromphys(&vpg, offset, tlen, uio); - } else { - if (vm_page_is_cached(vobj, idx)) - vm_page_cache_free(vobj, idx, idx + 1); + if (vobj->resident_page_count == 0 && vobj->cached_page_count == 0) { VM_OBJECT_UNLOCK(vobj); vpg = NULL; + goto nocache; } +lookupvpg: + vpg = vm_radix_lookup(&vobj->rtree, idx, VM_RADIX_ANY); + if (vpg != NULL) { + if (vm_page_is_valid(vpg, offset, tlen)) { + if ((vpg->oflags & VPO_BUSY) != 0) { + /* + * Reference the page before unlocking and + * sleeping so that the page daemon is less + * likely to reclaim it. 
+ */ + vm_page_reference(vpg); + vm_page_sleep(vpg, "tmfsmw"); + goto lookupvpg; + } + vm_page_busy(vpg); + vm_page_undirty(vpg); + VM_OBJECT_UNLOCK(vobj); + error = uiomove_fromphys(&vpg, offset, tlen, uio); + } else { + if (vpg->flags & PG_CACHED) { + mtx_lock(&vm_page_queue_free_mtx); + if (vpg->object == vobj) + vm_page_cache_free(vpg); + mtx_unlock(&vm_page_queue_free_mtx); + } + VM_OBJECT_UNLOCK(vobj); + vpg = NULL; + } + } else + VM_OBJECT_UNLOCK(vobj); nocache: VM_OBJECT_LOCK(tobj); tpg = vm_page_grab(tobj, idx, VM_ALLOC_WIRED | diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c index a9d031d..f6ce067 100644 --- a/sys/i386/i386/pmap.c +++ b/sys/i386/i386/pmap.c @@ -330,6 +330,7 @@ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde); static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde); +static vm_page_t pmap_vmpage_splay(vm_pindex_t pindex, vm_page_t root); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags); @@ -1574,7 +1575,8 @@ pmap_free_zero_pages(vm_page_t free) while (free != NULL) { m = free; - free = m->right; + free = (void *)m->object; + m->object = NULL; /* Preserve the page's PG_ZERO setting. */ vm_page_free_toq(m); } @@ -1593,7 +1595,7 @@ pmap_add_delayed_free_list(vm_page_t m, vm_page_t *free, boolean_t set_PG_ZERO) m->flags |= PG_ZERO; else m->flags &= ~PG_ZERO; - m->right = *free; + m->object = (void *)*free; *free = m; } @@ -1611,20 +1613,20 @@ pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) PMAP_LOCK_ASSERT(pmap, MA_OWNED); root = pmap->pm_root; if (root == NULL) { - mpte->left = NULL; - mpte->right = NULL; + mpte->md.pv_left = NULL; + mpte->md.pv_right = NULL; } else { - root = vm_page_splay(mpte->pindex, root); + root = pmap_vmpage_splay(mpte->pindex, root); if (mpte->pindex < root->pindex) { - mpte->left = root->left; - mpte->right = root; - root->left = NULL; + mpte->md.pv_left = root->md.pv_left; + mpte->md.pv_right = root; + root->md.pv_left = NULL; } else if (mpte->pindex == root->pindex) panic("pmap_insert_pt_page: pindex already inserted"); else { - mpte->right = root->right; - mpte->left = root; - root->right = NULL; + mpte->md.pv_right = root->md.pv_right; + mpte->md.pv_left = root; + root->md.pv_right = NULL; } } pmap->pm_root = mpte; @@ -1643,7 +1645,7 @@ pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va) PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((mpte = pmap->pm_root) != NULL && mpte->pindex != pindex) { - mpte = vm_page_splay(pindex, mpte); + mpte = pmap_vmpage_splay(pindex, mpte); if ((pmap->pm_root = mpte)->pindex != pindex) mpte = NULL; } @@ -1662,14 +1664,20 @@ pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte) PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (mpte != pmap->pm_root) - vm_page_splay(mpte->pindex, pmap->pm_root); - if (mpte->left == NULL) - root = mpte->right; + pmap_vmpage_splay(mpte->pindex, pmap->pm_root); + if (mpte->md.pv_left == NULL) + root = mpte->md.pv_right; else { - root = vm_page_splay(mpte->pindex, mpte->left); - root->right = mpte->right; + root = pmap_vmpage_splay(mpte->pindex, mpte->md.pv_left); + root->md.pv_right = mpte->md.pv_right; } pmap->pm_root = root; + + /* + * Reinitialize the pv_list which could be dirty now because of the + * splay tree work. + */ + TAILQ_INIT(&mpte->md.pv_list); } /* @@ -1723,6 +1731,61 @@ _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, vm_page_t *free) } /* + * Implements Sleator and Tarjan's top-down splay algorithm. 
Returns + * the vm_page containing the given pindex. If, however, that + * pindex is not found in the pmap, returns a vm_page that is + * adjacent to the pindex, coming before or after it. + */ +static vm_page_t +pmap_vmpage_splay(vm_pindex_t pindex, vm_page_t root) +{ + struct vm_page dummy; + vm_page_t lefttreemax, righttreemin, y; + + if (root == NULL) + return (root); + lefttreemax = righttreemin = &dummy; + for (;; root = y) { + if (pindex < root->pindex) { + if ((y = root->md.pv_left) == NULL) + break; + if (pindex < y->pindex) { + /* Rotate right. */ + root->md.pv_left = y->md.pv_right; + y->md.pv_right = root; + root = y; + if ((y = root->md.pv_left) == NULL) + break; + } + /* Link into the new root's right tree. */ + righttreemin->md.pv_left = root; + righttreemin = root; + } else if (pindex > root->pindex) { + if ((y = root->md.pv_right) == NULL) + break; + if (pindex > y->pindex) { + /* Rotate left. */ + root->md.pv_right = y->md.pv_left; + y->md.pv_left = root; + root = y; + if ((y = root->md.pv_right) == NULL) + break; + } + /* Link into the new root's left tree. */ + lefttreemax->md.pv_right = root; + lefttreemax = root; + } else + break; + } + /* Assemble the new root. */ + lefttreemax->md.pv_right = root->md.pv_left; + righttreemin->md.pv_left = root->md.pv_right; + root->md.pv_left = dummy.md.pv_right; + root->md.pv_right = dummy.md.pv_left; + return (root); +} + +/* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ @@ -2286,7 +2349,7 @@ pmap_pv_reclaim(pmap_t locked_pmap) vm_page_dirty(m); if ((tpte & PG_A) != 0) vm_page_aflag_set(m, PGA_REFERENCED); - TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); + TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); @@ -2344,7 +2407,7 @@ out: } if (m_pc == NULL && pv_vafree != 0 && free != NULL) { m_pc = free; - free = m_pc->right; + free = (void *)m_pc->object; /* Recycle a freed page table page. */ m_pc->wire_count = 1; atomic_add_int(&cnt.v_wire_count, 1); @@ -2491,9 +2554,9 @@ pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); - TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { + TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { - TAILQ_REMOVE(&pvh->pv_list, pv, pv_list); + TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); break; } } @@ -2521,7 +2584,7 @@ pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); m = PHYS_TO_VM_PAGE(pa); - TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); /* Instantiate the remaining NPTEPG - 1 pv entries. */ va_last = va + NBPDR - PAGE_SIZE; do { @@ -2557,7 +2620,7 @@ pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) pv = pmap_pvh_remove(&m->md, pmap, va); KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); pvh = pa_to_pvh(pa); - TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list); + TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); /* Free the remaining NPTEPG - 1 pv entries. 
*/ va_last = va + NBPDR - PAGE_SIZE; do { @@ -2604,7 +2667,7 @@ pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) PMAP_LOCK_ASSERT(pmap, MA_OWNED); pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; - TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); } /* @@ -2620,7 +2683,7 @@ pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) if (pv_entry_count < pv_entry_high_water && (pv = get_pv_entry(pmap, TRUE)) != NULL) { pv->pv_va = va; - TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); return (TRUE); } else return (FALSE); @@ -2640,7 +2703,7 @@ pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) (pv = get_pv_entry(pmap, TRUE)) != NULL) { pv->pv_va = va; pvh = pa_to_pvh(pa); - TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list); + TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); return (TRUE); } else return (FALSE); @@ -3095,7 +3158,7 @@ small_mappings: vm_page_dirty(m); pmap_unuse_pt(pmap, pv->pv_va, &free); pmap_invalidate_page(pmap, pv->pv_va); - TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); + TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } @@ -3551,7 +3614,7 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m, if (pv == NULL) pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; - TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); pa |= PG_MANAGED; } else if (pv != NULL) free_pv_entry(pmap, pv); @@ -4259,7 +4322,7 @@ pmap_page_exists_quick(pmap_t pmap, vm_page_t m) ("pmap_page_exists_quick: page %p is not managed", m)); rv = FALSE; rw_wlock(&pvh_global_lock); - TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { + TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; @@ -4270,7 +4333,7 @@ pmap_page_exists_quick(pmap_t pmap, vm_page_t m) } if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); - TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { + TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; @@ -4322,7 +4385,7 @@ pmap_pvh_wired_mappings(struct md_page *pvh, int count) rw_assert(&pvh_global_lock, RA_WLOCKED); sched_pin(); - TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { + TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); @@ -4449,7 +4512,7 @@ pmap_remove_pages(pmap_t pmap) if ((tpte & PG_PS) != 0) { pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; pvh = pa_to_pvh(tpte & PG_PS_FRAME); - TAILQ_REMOVE(&pvh->pv_list, pv, pv_list); + TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) if (TAILQ_EMPTY(&mt->md.pv_list)) @@ -4467,7 +4530,7 @@ pmap_remove_pages(pmap_t pmap) } } else { pmap->pm_stats.resident_count--; - TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); + TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); @@ -4537,7 +4600,7 @@ pmap_is_modified_pvh(struct md_page *pvh) rw_assert(&pvh_global_lock, RA_WLOCKED); rv = FALSE; sched_pin(); - TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { + TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); @@ -4610,7 +4673,7 @@ pmap_is_referenced_pvh(struct md_page *pvh) rw_assert(&pvh_global_lock, RA_WLOCKED); rv = FALSE; sched_pin(); - 
TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { + TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); @@ -4653,7 +4716,7 @@ pmap_remove_write(vm_page_t m) if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); - TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) { + TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); @@ -4663,7 +4726,7 @@ pmap_remove_write(vm_page_t m) PMAP_UNLOCK(pmap); } small_mappings: - TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { + TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); @@ -4722,7 +4785,7 @@ pmap_ts_referenced(vm_page_t m) sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; - TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, pvn) { + TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, pvn) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); @@ -4756,9 +4819,9 @@ small_mappings: if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pvf = pv; do { - pvn = TAILQ_NEXT(pv, pv_list); - TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); - TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); + pvn = TAILQ_NEXT(pv, pv_next); + TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); @@ -4812,7 +4875,7 @@ pmap_clear_modify(vm_page_t m) if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); - TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) { + TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); @@ -4849,7 +4912,7 @@ pmap_clear_modify(vm_page_t m) PMAP_UNLOCK(pmap); } small_mappings: - TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { + TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); @@ -4893,7 +4956,7 @@ pmap_clear_reference(vm_page_t m) if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); - TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) { + TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); @@ -4916,7 +4979,7 @@ pmap_clear_reference(vm_page_t m) PMAP_UNLOCK(pmap); } small_mappings: - TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { + TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); @@ -5427,7 +5490,7 @@ pmap_pvdump(vm_paddr_t pa) printf("pa %x", pa); m = PHYS_TO_VM_PAGE(pa); - TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { + TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); printf(" -> pmap %p, va %x", (void *)pmap, pv->pv_va); pads(pmap); diff --git a/sys/i386/include/pmap.h b/sys/i386/include/pmap.h index 7a827f8..9ba2752 100644 --- a/sys/i386/include/pmap.h +++ b/sys/i386/include/pmap.h @@ -426,10 +426,20 @@ struct pv_entry; struct pv_chunk; struct md_page { - TAILQ_HEAD(,pv_entry) pv_list; - int pat_mode; + union { + TAILQ_HEAD(,pv_entry) pvi_list; + struct { + vm_page_t pii_left; + vm_page_t pii_right; + } pvi_siters; + } pv_structs; + int pat_mode; }; +#define pv_list pv_structs.pvi_list +#define pv_left pv_structs.pvi_siters.pii_left +#define pv_right pv_structs.pvi_siters.pii_right + struct pmap { struct mtx pm_mtx; pd_entry_t *pm_pdir; /* KVA of page directory */ @@ -468,7 +478,7 
@@ extern struct pmap kernel_pmap_store; */ typedef struct pv_entry { vm_offset_t pv_va; /* virtual address for mapping */ - TAILQ_ENTRY(pv_entry) pv_list; + TAILQ_ENTRY(pv_entry) pv_next; } *pv_entry_t; /* diff --git a/sys/kern/subr_uio.c b/sys/kern/subr_uio.c index 7b593487..becc0b8 100644 --- a/sys/kern/subr_uio.c +++ b/sys/kern/subr_uio.c @@ -85,6 +85,7 @@ vm_pgmoveco(vm_map_t mapa, vm_offset_t kaddr, vm_offset_t uaddr) vm_map_entry_t entry; vm_pindex_t upindex; vm_prot_t prot; + vm_page_bits_t vbits; boolean_t wired; KASSERT((uaddr & PAGE_MASK) == 0, @@ -95,6 +96,7 @@ vm_pgmoveco(vm_map_t mapa, vm_offset_t kaddr, vm_offset_t uaddr) * unwired in sf_buf_mext(). */ kern_pg = PHYS_TO_VM_PAGE(vtophys(kaddr)); + vbits = kern_pg->valid; kern_pg->valid = VM_PAGE_BITS_ALL; KASSERT(kern_pg->queue == PQ_NONE && kern_pg->wire_count == 1, ("vm_pgmoveco: kern_pg is not correctly wired")); @@ -105,6 +107,13 @@ vm_pgmoveco(vm_map_t mapa, vm_offset_t kaddr, vm_offset_t uaddr) return(EFAULT); } VM_OBJECT_LOCK(uobject); + if (vm_page_insert(kern_pg, uobject, upindex) != 0) { + kern_pg->valid = vbits; + VM_OBJECT_UNLOCK(uobject); + vm_map_lookup_done(map, entry); + return(ENOMEM); + } + vm_page_dirty(kern_pg); retry: if ((user_pg = vm_page_lookup(uobject, upindex)) != NULL) { if (vm_page_sleep_if_busy(user_pg, TRUE, "vm_pgmoveco")) @@ -122,8 +131,6 @@ retry: if (uobject->backing_object != NULL) pmap_remove(map->pmap, uaddr, uaddr + PAGE_SIZE); } - vm_page_insert(kern_pg, uobject, upindex); - vm_page_dirty(kern_pg); VM_OBJECT_UNLOCK(uobject); vm_map_lookup_done(map, entry); return(KERN_SUCCESS); diff --git a/sys/kern/uipc_shm.c b/sys/kern/uipc_shm.c index 7f75bdc..64169b2 100644 --- a/sys/kern/uipc_shm.c +++ b/sys/kern/uipc_shm.c @@ -278,7 +278,8 @@ shm_dotruncate(struct shmfd *shmfd, off_t length) if (base != 0) { idx = OFF_TO_IDX(length); retry: - m = vm_page_lookup(object, idx); + m = vm_radix_lookup(&object->rtree, idx, + VM_RADIX_BLACK); if (m != NULL) { if ((m->oflags & VPO_BUSY) != 0 || m->busy != 0) { diff --git a/sys/vm/device_pager.c b/sys/vm/device_pager.c index 809c32c..30aaac0 100644 --- a/sys/vm/device_pager.c +++ b/sys/vm/device_pager.c @@ -54,6 +54,8 @@ __FBSDID("$FreeBSD$"); #include <vm/vm_phys.h> #include <vm/uma.h> +#include <machine/cpu.h> + static void dev_pager_init(void); static vm_object_t dev_pager_alloc(void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t, struct ucred *); @@ -299,7 +301,7 @@ old_dev_pager_fault(vm_object_t object, vm_ooffset_t offset, int prot, struct file *fpop; struct thread *td; vm_memattr_t memattr; - int ref, ret; + int i, ref, ret; pidx = OFF_TO_IDX(offset); memattr = object->memattr; @@ -351,7 +353,10 @@ old_dev_pager_fault(vm_object_t object, vm_ooffset_t offset, int prot, vm_page_free(*mres); vm_page_unlock(*mres); *mres = page; - vm_page_insert(page, object, pidx); + while (vm_page_insert(page, object, pidx) != 0) { + for (i = 0; i < 10000000; i++) + cpu_spinwait(); + } } page->valid = VM_PAGE_BITS_ALL; return (VM_PAGER_OK); diff --git a/sys/vm/sg_pager.c b/sys/vm/sg_pager.c index 097039e..92c1857 100644 --- a/sys/vm/sg_pager.c +++ b/sys/vm/sg_pager.c @@ -181,6 +181,10 @@ sg_pager_getpages(vm_object_t object, vm_page_t *m, int count, int reqpage) /* Construct a new fake page. 
*/ page = vm_page_getfake(paddr, memattr); VM_OBJECT_LOCK(object); + if (vm_page_insert(page, object, offset) != 0) { + vm_page_putfake(page); + return (VM_PAGER_FAIL); + } TAILQ_INSERT_TAIL(&object->un_pager.sgp.sgp_pglist, page, pageq); /* Free the original pages and insert this fake page into the object. */ @@ -189,7 +193,6 @@ sg_pager_getpages(vm_object_t object, vm_page_t *m, int count, int reqpage) vm_page_free(m[i]); vm_page_unlock(m[i]); } - vm_page_insert(page, object, offset); m[reqpage] = page; page->valid = VM_PAGE_BITS_ALL; diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c index b79b3f5..9883dcf 100644 --- a/sys/vm/vm_fault.c +++ b/sys/vm/vm_fault.c @@ -741,9 +741,7 @@ vnode_locked: * process'es object. The page is * automatically made dirty. */ - vm_page_lock(fs.m); vm_page_rename(fs.m, fs.first_object, fs.first_pindex); - vm_page_unlock(fs.m); vm_page_busy(fs.m); fs.first_m = fs.m; fs.m = NULL; diff --git a/sys/vm/vm_init.c b/sys/vm/vm_init.c index c507691..9761b21 100644 --- a/sys/vm/vm_init.c +++ b/sys/vm/vm_init.c @@ -82,6 +82,7 @@ __FBSDID("$FreeBSD$"); #include <vm/vm_kern.h> #include <vm/vm_object.h> #include <vm/vm_page.h> +#include <vm/vm_radix.h> #include <vm/vm_map.h> #include <vm/vm_pager.h> #include <vm/vm_extern.h> @@ -123,6 +124,7 @@ vm_mem_init(dummy) vm_object_init(); vm_map_startup(); kmem_init(virtual_avail, virtual_end); + vm_radix_init(); pmap_init(); vm_pager_init(); } diff --git a/sys/vm/vm_mmap.c b/sys/vm/vm_mmap.c index 05bb8ae..4ae1f90 100644 --- a/sys/vm/vm_mmap.c +++ b/sys/vm/vm_mmap.c @@ -82,6 +82,7 @@ __FBSDID("$FreeBSD$"); #include <vm/vm_pageout.h> #include <vm/vm_extern.h> #include <vm/vm_page.h> +#include <vm/vm_radix.h> #include <vm/vnode_pager.h> #ifdef HWPMC_HOOKS @@ -912,10 +913,15 @@ RestartScan: object->type == OBJT_VNODE) { pindex = OFF_TO_IDX(current->offset + (addr - current->start)); - m = vm_page_lookup(object, pindex); - if (m == NULL && - vm_page_is_cached(object, pindex)) + m = vm_radix_lookup(&object->rtree, + pindex, VM_RADIX_ANY); + + /* Lock just for consistency. 
*/ + mtx_lock(&vm_page_queue_free_mtx); + if (m != NULL && + (m->flags & PG_CACHED) != 0) mincoreinfo = MINCORE_INCORE; + mtx_unlock(&vm_page_queue_free_mtx); if (m != NULL && m->valid == 0) m = NULL; if (m != NULL) diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c index 32b0779..570befc 100644 --- a/sys/vm/vm_object.c +++ b/sys/vm/vm_object.c @@ -164,6 +164,9 @@ vm_object_zdtor(void *mem, int size, void *arg) vm_object_t object; object = (vm_object_t)mem; + KASSERT(object->resident_page_count == 0, + ("object %p resident_page_count = %d", + object, object->resident_page_count)); KASSERT(TAILQ_EMPTY(&object->memq), ("object %p has resident pages", object)); @@ -172,15 +175,12 @@ vm_object_zdtor(void *mem, int size, void *arg) ("object %p has reservations", object)); #endif - KASSERT(object->cache == NULL, + KASSERT(object->cached_page_count == 0, ("object %p has cached pages", object)); KASSERT(object->paging_in_progress == 0, ("object %p paging_in_progress = %d", object, object->paging_in_progress)); - KASSERT(object->resident_page_count == 0, - ("object %p resident_page_count = %d", - object, object->resident_page_count)); KASSERT(object->shadow_count == 0, ("object %p shadow_count = %d", object, object->shadow_count)); @@ -210,7 +210,7 @@ _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object) TAILQ_INIT(&object->memq); LIST_INIT(&object->shadow_head); - object->root = NULL; + object->rtree.rt_root = 0; object->type = type; switch (type) { case OBJT_DEAD: @@ -248,7 +248,6 @@ _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object) #if VM_NRESERVLEVEL > 0 LIST_INIT(&object->rvq); #endif - object->cache = NULL; mtx_lock(&vm_object_list_mtx); TAILQ_INSERT_TAIL(&vm_object_list, object, object_list); @@ -326,7 +325,7 @@ vm_object_set_memattr(vm_object_t object, vm_memattr_t memattr) case OBJT_SG: case OBJT_SWAP: case OBJT_VNODE: - if (!TAILQ_EMPTY(&object->memq)) + if (object->resident_page_count == 0) return (KERN_FAILURE); break; case OBJT_DEAD: @@ -673,7 +672,11 @@ vm_object_destroy(vm_object_t object) void vm_object_terminate(vm_object_t object) { - vm_page_t p, p_next; + vm_page_t pa[VM_RADIX_STACK]; + vm_page_t p; + vm_pindex_t start; + u_int exhausted; + int n, i; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); @@ -718,43 +721,78 @@ vm_object_terminate(vm_object_t object) * from the object. Rather than incrementally removing each page from * the object, the page and object are reset to any empty state. */ - TAILQ_FOREACH_SAFE(p, &object->memq, listq, p_next) { - KASSERT(!p->busy && (p->oflags & VPO_BUSY) == 0, - ("vm_object_terminate: freeing busy page %p", p)); - vm_page_lock(p); - /* - * Optimize the page's removal from the object by resetting - * its "object" field. Specifically, if the page is not - * wired, then the effect of this assignment is that - * vm_page_free()'s call to vm_page_remove() will return - * immediately without modifying the page or the object. - */ - p->object = NULL; - if (p->wire_count == 0) { - vm_page_free(p); - PCPU_INC(cnt.v_pfree); + start = 0; + exhausted = 0; + while (exhausted == 0 && (n = vm_radix_lookupn(&object->rtree, start, + 0, VM_RADIX_ANY, (void **)pa, VM_RADIX_STACK, &start, + &exhausted)) != 0) { + for (i = 0; i < n; i++) { + p = pa[i]; + /* + * Another thread may allocate this cached page from + * the queue before we acquire the page queue free + * mtx. 
+ */ + if (p->flags & PG_CACHED) { + mtx_lock(&vm_page_queue_free_mtx); + if (p->object == object) { + p->object = NULL; + p->valid = 0; + /* Clear PG_CACHED and set PG_FREE. */ + p->flags ^= PG_CACHED | PG_FREE; + cnt.v_cache_count--; + cnt.v_free_count++; + } + mtx_unlock(&vm_page_queue_free_mtx); + continue; + } else if (p->object != object) + continue; + KASSERT(!p->busy && (p->oflags & VPO_BUSY) == 0, + ("vm_object_terminate: freeing busy page %p", p)); + vm_page_lock(p); + /* + * Optimize the page's removal from the object by + * resetting its "object" field. Specifically, if + * the page is not wired, then the effect of this + * assignment is that vm_page_free()'s call to + * vm_page_remove() will return immediately without + * modifying the page or the object. + * Anyway, the radix tree cannot be accessed anymore + * from within the object, thus all the nodes need + * to be reclaimed later on. + */ + p->object = NULL; + if (p->wire_count == 0) { + vm_page_free(p); + PCPU_INC(cnt.v_pfree); + } + vm_page_unlock(p); } - vm_page_unlock(p); + if (n < VM_RADIX_STACK) + break; } + vm_radix_reclaim_allnodes(&object->rtree); /* * If the object contained any pages, then reset it to an empty state. * None of the object's fields, including "resident_page_count", were * modified by the preceding loop. */ if (object->resident_page_count != 0) { - object->root = NULL; TAILQ_INIT(&object->memq); object->resident_page_count = 0; if (object->type == OBJT_VNODE) vdrop(object->handle); } + if (object->cached_page_count != 0) { + object->cached_page_count = 0; + if (object->type == OBJT_VNODE) + vdrop(object->handle); + } #if VM_NRESERVLEVEL > 0 if (__predict_false(!LIST_EMPTY(&object->rvq))) vm_reserv_break_all(object); #endif - if (__predict_false(object->cache != NULL)) - vm_page_cache_free(object, 0, 0); /* * Let the pager know object is dead. @@ -1266,10 +1304,13 @@ vm_object_shadow( void vm_object_split(vm_map_entry_t entry) { - vm_page_t m, m_next; + vm_page_t ma[VM_RADIX_STACK]; + vm_page_t m; vm_object_t orig_object, new_object, source; - vm_pindex_t idx, offidxstart; + vm_pindex_t idx, offidxstart, start; vm_size_t size; + u_int exhausted; + int i, n; orig_object = entry->object.vm_object; if (orig_object->type != OBJT_DEFAULT && orig_object->type != OBJT_SWAP) @@ -1322,46 +1363,66 @@ vm_object_split(vm_map_entry_t entry) ("orig_object->charge < 0")); orig_object->charge -= ptoa(size); } + start = offidxstart; retry: - m = vm_page_find_least(orig_object, offidxstart); - for (; m != NULL && (idx = m->pindex - offidxstart) < size; - m = m_next) { - m_next = TAILQ_NEXT(m, listq); - - /* - * We must wait for pending I/O to complete before we can - * rename the page. - * - * We do not have to VM_PROT_NONE the page as mappings should - * not be changed by this operation. 
- */ - if ((m->oflags & VPO_BUSY) || m->busy) { - VM_OBJECT_UNLOCK(new_object); - m->oflags |= VPO_WANTED; - msleep(m, VM_OBJECT_MTX(orig_object), PVM, "spltwt", 0); - VM_OBJECT_LOCK(new_object); - goto retry; - } + exhausted = 0; + while (exhausted == 0 && (n = vm_radix_lookupn(&orig_object->rtree, + start, offidxstart + size, VM_RADIX_ANY, (void **)ma, + VM_RADIX_STACK, &start, &exhausted)) != 0) { + for (i = 0; i < n; i++) { + m = ma[i]; + idx = m->pindex - offidxstart; + if (m->flags & PG_CACHED) { + mtx_lock(&vm_page_queue_free_mtx); + if (m->object == orig_object) + vm_page_cache_rename(m, new_object, + idx); + mtx_unlock(&vm_page_queue_free_mtx); + continue; + } else if (m->object != orig_object) + continue; + /* + * We must wait for pending I/O to complete before + * we can rename the page. + * + * We do not have to VM_PROT_NONE the page as mappings + * should not be changed by this operation. + */ + if ((m->oflags & VPO_BUSY) || m->busy) { + start = m->pindex; + VM_OBJECT_UNLOCK(new_object); + m->oflags |= VPO_WANTED; + msleep(m, VM_OBJECT_MTX(orig_object), PVM, + "spltwt", 0); + VM_OBJECT_LOCK(new_object); + goto retry; + } #if VM_NRESERVLEVEL > 0 - /* - * If some of the reservation's allocated pages remain with - * the original object, then transferring the reservation to - * the new object is neither particularly beneficial nor - * particularly harmful as compared to leaving the reservation - * with the original object. If, however, all of the - * reservation's allocated pages are transferred to the new - * object, then transferring the reservation is typically - * beneficial. Determining which of these two cases applies - * would be more costly than unconditionally renaming the - * reservation. - */ - vm_reserv_rename(m, new_object, orig_object, offidxstart); + /* + * If some of the reservation's allocated pages remain + * with the original object, then transferring the + * reservation to the new object is neither + * particularly beneficial nor particularly harmful as + * compared to leaving the reservation with the + * original object. If, however, all of the + * reservation's allocated pages are transferred to + * the new object, then transferring the reservation + * is typically beneficial. Determining which of + * these two cases applies would be more costly than + * unconditionally renaming the reservation. + */ + vm_reserv_rename(m, new_object, orig_object, + offidxstart); #endif - vm_page_lock(m); - vm_page_rename(m, new_object, idx); - vm_page_unlock(m); - /* page automatically made dirty by rename and cache handled */ - vm_page_busy(m); + vm_page_rename(m, new_object, idx); + /* + * page automatically made dirty by rename and + * cache handled + */ + vm_page_busy(m); + } + if (n < VM_RADIX_STACK) + break; } if (orig_object->type == OBJT_SWAP) { /* @@ -1369,19 +1430,6 @@ retry: * and new_object's locks are released and reacquired. */ swap_pager_copy(orig_object, new_object, offidxstart, 0); - - /* - * Transfer any cached pages from orig_object to new_object. - * If swap_pager_copy() found swapped out pages within the - * specified range of orig_object, then it changed - * new_object's type to OBJT_SWAP when it transferred those - * pages to new_object. Otherwise, new_object's type - * should still be OBJT_DEFAULT and orig_object should not - * contain any cached pages within the specified range. 
- */ - if (__predict_false(orig_object->cache != NULL)) - vm_page_cache_transfer(orig_object, offidxstart, - new_object); } VM_OBJECT_UNLOCK(orig_object); TAILQ_FOREACH(m, &new_object->memq, listq) @@ -1400,10 +1448,14 @@ retry: static int vm_object_backing_scan(vm_object_t object, int op) { - int r = 1; + vm_page_t pa[VM_RADIX_STACK]; vm_page_t p; vm_object_t backing_object; - vm_pindex_t backing_offset_index; + vm_pindex_t backing_offset_index, new_pindex; + vm_pindex_t start; + u_int exhausted; + int color, i, n; + int r = 1; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); VM_OBJECT_LOCK_ASSERT(object->backing_object, MA_OWNED); @@ -1431,15 +1483,41 @@ vm_object_backing_scan(vm_object_t object, int op) if (op & OBSC_COLLAPSE_WAIT) { vm_object_set_flag(backing_object, OBJ_DEAD); } - + color = VM_RADIX_BLACK; + if (op & OBSC_COLLAPSE_WAIT) + color |= VM_RADIX_RED; /* * Our scan */ - p = TAILQ_FIRST(&backing_object->memq); - while (p) { - vm_page_t next = TAILQ_NEXT(p, listq); - vm_pindex_t new_pindex = p->pindex - backing_offset_index; +restart: + start = 0; + i = n = VM_RADIX_STACK; + exhausted = 0; + for (;;) { + if (i == n) { + if (n < VM_RADIX_STACK) + break; + if (exhausted != 0 || + (n = vm_radix_lookupn(&backing_object->rtree, + start, 0, color, (void **)pa, VM_RADIX_STACK, + &start, &exhausted)) == 0) + break; + i = 0; + } + p = pa[i++]; + /* + * Free cached pages. XXX Why? Emulating old behavior here. + */ + if (p->flags & PG_CACHED) { + mtx_lock(&vm_page_queue_free_mtx); + if (p->object == backing_object) + vm_page_cache_free(p); + mtx_unlock(&vm_page_queue_free_mtx); + continue; + } else if (p->object != backing_object) + continue; + new_pindex = p->pindex - backing_offset_index; if (op & OBSC_TEST_ALL_SHADOWED) { vm_page_t pp; @@ -1451,13 +1529,9 @@ vm_object_backing_scan(vm_object_t object, int op) * note that we do not busy the backing object's * page. */ - if ( - p->pindex < backing_offset_index || - new_pindex >= object->size - ) { - p = next; + if (p->pindex < backing_offset_index || + new_pindex >= object->size) continue; - } /* * See if the parent has the page or if the parent's @@ -1486,12 +1560,9 @@ vm_object_backing_scan(vm_object_t object, int op) vm_page_t pp; if (op & OBSC_COLLAPSE_NOWAIT) { - if ((p->oflags & VPO_BUSY) || - !p->valid || - p->busy) { - p = next; + if ((p->oflags & VPO_BUSY) || !p->valid || + p->busy) continue; - } } else if (op & OBSC_COLLAPSE_WAIT) { if ((p->oflags & VPO_BUSY) || p->busy) { VM_OBJECT_UNLOCK(object); @@ -1507,8 +1578,7 @@ vm_object_backing_scan(vm_object_t object, int op) * should not have changed so we * just restart our scan. */ - p = TAILQ_FIRST(&backing_object->memq); - continue; + goto restart; } } @@ -1544,7 +1614,6 @@ vm_object_backing_scan(vm_object_t object, int op) else vm_page_remove(p); vm_page_unlock(p); - p = next; continue; } @@ -1564,7 +1633,6 @@ vm_object_backing_scan(vm_object_t object, int op) * page before we can (re)lock the parent. * Hence we can get here. */ - p = next; continue; } if ( @@ -1586,7 +1654,6 @@ vm_object_backing_scan(vm_object_t object, int op) else vm_page_remove(p); vm_page_unlock(p); - p = next; continue; } @@ -1605,12 +1672,9 @@ vm_object_backing_scan(vm_object_t object, int op) * If the page was mapped to a process, it can remain * mapped through the rename. 
*/ - vm_page_lock(p); vm_page_rename(p, object, new_pindex); - vm_page_unlock(p); /* page automatically made dirty by rename */ } - p = next; } return (r); } @@ -1724,12 +1788,6 @@ vm_object_collapse(vm_object_t object) backing_object, object, OFF_TO_IDX(object->backing_object_offset), TRUE); - - /* - * Free any cached pages from backing_object. - */ - if (__predict_false(backing_object->cache != NULL)) - vm_page_cache_free(backing_object, 0, 0); } /* * Object now shadows whatever backing_object did. @@ -1850,80 +1908,112 @@ void vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end, int options) { - vm_page_t p, next; + struct vnode *vp; + vm_page_t pa[VM_RADIX_STACK]; + vm_page_t p; + u_int exhausted; + int i, n; int wirings; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); KASSERT((object->flags & OBJ_UNMANAGED) == 0 || (options & (OBJPR_CLEANONLY | OBJPR_NOTMAPPED)) == OBJPR_NOTMAPPED, ("vm_object_page_remove: illegal options for object %p", object)); - if (object->resident_page_count == 0) - goto skipmemq; + if (object->resident_page_count == 0 && object->cached_page_count == 0) + return; + vp = NULL; vm_object_pip_add(object, 1); -again: - p = vm_page_find_least(object, start); - - /* - * Here, the variable "p" is either (1) the page with the least pindex - * greater than or equal to the parameter "start" or (2) NULL. - */ - for (; p != NULL && (p->pindex < end || end == 0); p = next) { - next = TAILQ_NEXT(p, listq); - - /* - * If the page is wired for any reason besides the existence - * of managed, wired mappings, then it cannot be freed. For - * example, fictitious pages, which represent device memory, - * are inherently wired and cannot be freed. They can, - * however, be invalidated if the option OBJPR_CLEANONLY is - * not specified. - */ - vm_page_lock(p); - if ((wirings = p->wire_count) != 0 && - (wirings = pmap_page_wired_mappings(p)) != p->wire_count) { +restart: + exhausted = 0; + while (exhausted == 0 && (n = vm_radix_lookupn(&object->rtree, start, + end, VM_RADIX_ANY, (void **)pa, VM_RADIX_STACK, &start, + &exhausted)) != 0) { + for (i = 0; i < n; i++) { + p = pa[i]; + /* + * Another thread may allocate this cached page from + * the queue before we acquire the page queue free + * mtx. + */ + if (p->flags & PG_CACHED) { + mtx_lock(&vm_page_queue_free_mtx); + if (p->object == object) { + vm_page_cache_free(p); + if (object->type == OBJT_VNODE && + object->cached_page_count == 0) + vp = object->handle; + } + mtx_unlock(&vm_page_queue_free_mtx); + continue; + } else if (p->object != object) + continue; + /* + * If the page is wired for any reason besides + * the existence of managed, wired mappings, then + * it cannot be freed. For example, fictitious + * pages, which represent device memory, are + * inherently wired and cannot be freed. They can, + * however, be invalidated if the option + * OBJPR_CLEANONLY is not specified. + */ + vm_page_lock(p); + if ((wirings = p->wire_count) != 0 && + (wirings = pmap_page_wired_mappings(p)) != + p->wire_count) { + if ((options & OBJPR_NOTMAPPED) == 0) { + pmap_remove_all(p); + /* + * Account for removal of wired + * mappings. 
+ */ + if (wirings != 0) + p->wire_count -= wirings; + } + if ((options & OBJPR_CLEANONLY) == 0) { + p->valid = 0; + vm_page_undirty(p); + } + vm_page_unlock(p); + continue; + } + if (vm_page_sleep_if_busy(p, TRUE, "vmopar")) { + start = p->pindex; + goto restart; + } + KASSERT((p->flags & PG_FICTITIOUS) == 0, + ("vm_object_page_remove: page %p is fictitious", + p)); + if ((options & OBJPR_CLEANONLY) != 0 && p->valid != 0) { + if ((options & OBJPR_NOTMAPPED) == 0) + pmap_remove_write(p); + if (p->dirty) { + vm_page_unlock(p); + continue; + } + } if ((options & OBJPR_NOTMAPPED) == 0) { pmap_remove_all(p); /* Account for removal of wired mappings. */ + if (wirings != 0) { + KASSERT(p->wire_count == wirings, + ("inconsistent wire count %d %d %p", + p->wire_count, wirings, p)); + p->wire_count = 0; + atomic_subtract_int(&cnt.v_wire_count, + 1); + } if (wirings != 0) p->wire_count -= wirings; } - if ((options & OBJPR_CLEANONLY) == 0) { - p->valid = 0; - vm_page_undirty(p); - } + vm_page_free(p); vm_page_unlock(p); - continue; - } - if (vm_page_sleep_if_busy(p, TRUE, "vmopar")) - goto again; - KASSERT((p->flags & PG_FICTITIOUS) == 0, - ("vm_object_page_remove: page %p is fictitious", p)); - if ((options & OBJPR_CLEANONLY) != 0 && p->valid != 0) { - if ((options & OBJPR_NOTMAPPED) == 0) - pmap_remove_write(p); - if (p->dirty) { - vm_page_unlock(p); - continue; - } } - if ((options & OBJPR_NOTMAPPED) == 0) { - pmap_remove_all(p); - /* Account for removal of wired mappings. */ - if (wirings != 0) { - KASSERT(p->wire_count == wirings, - ("inconsistent wire count %d %d %p", - p->wire_count, wirings, p)); - p->wire_count = 0; - atomic_subtract_int(&cnt.v_wire_count, 1); - } - } - vm_page_free(p); - vm_page_unlock(p); + if (n < VM_RADIX_STACK) + break; } vm_object_pip_wakeup(object); -skipmemq: - if (__predict_false(object->cache != NULL)) - vm_page_cache_free(object, start, end); + if (vp) + vdrop(vp); } /* @@ -2301,8 +2391,9 @@ DB_SHOW_COMMAND(object, vm_object_print_static) db_printf(","); count++; - db_printf("(off=0x%jx,page=0x%jx)", - (uintmax_t)p->pindex, (uintmax_t)VM_PAGE_TO_PHYS(p)); + db_printf("(off=0x%jx,page=0x%jx,obj=%p,flags=0x%X)", + (uintmax_t)p->pindex, (uintmax_t)VM_PAGE_TO_PHYS(p), + p->object, p->flags); } if (count != 0) db_printf("\n"); diff --git a/sys/vm/vm_object.h b/sys/vm/vm_object.h index 8134752..7beee59 100644 --- a/sys/vm/vm_object.h +++ b/sys/vm/vm_object.h @@ -71,6 +71,8 @@ #include <sys/_lock.h> #include <sys/_mutex.h> +#include <vm/vm_radix.h> + /* * Types defined: * @@ -100,7 +102,7 @@ struct vm_object { LIST_HEAD(, vm_object) shadow_head; /* objects that this is a shadow for */ LIST_ENTRY(vm_object) shadow_list; /* chain of shadow objects */ TAILQ_HEAD(, vm_page) memq; /* list of resident pages */ - vm_page_t root; /* root of the resident page splay tree */ + struct vm_radix rtree; /* root of the resident page radix index tree */ vm_pindex_t size; /* Object size */ int generation; /* generation ID */ int ref_count; /* How many refs?? 
*/ @@ -111,11 +113,11 @@ struct vm_object { u_short pg_color; /* (c) color of first page in obj */ u_int paging_in_progress; /* Paging (in or out) so don't collapse or destroy */ int resident_page_count; /* number of resident pages */ + int cached_page_count; /* number of cached pages */ struct vm_object *backing_object; /* object that I'm a shadow of */ vm_ooffset_t backing_object_offset;/* Offset in backing object */ TAILQ_ENTRY(vm_object) pager_object_list; /* list of all objects of this pager type */ LIST_HEAD(, vm_reserv) rvq; /* list of reservations */ - vm_page_t cache; /* (o + f) root of the cache page splay tree */ void *handle; union { /* diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index c952c05..fb49a6d 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -109,11 +109,13 @@ __FBSDID("$FreeBSD$"); #include <vm/vm_pageout.h> #include <vm/vm_pager.h> #include <vm/vm_phys.h> +#include <vm/vm_radix.h> #include <vm/vm_reserv.h> #include <vm/vm_extern.h> #include <vm/uma.h> #include <vm/uma_int.h> +#include <machine/cpu.h> #include <machine/md_var.h> /* @@ -316,7 +318,6 @@ vm_page_startup(vm_offset_t vaddr) mtx_init(&pa_lock[i], "vm page", NULL, MTX_DEF); for (i = 0; i < PQ_COUNT; i++) vm_pagequeue_init_lock(&vm_pagequeues[i]); - /* * Allocate memory for use when boot strapping the kernel memory * allocator. @@ -793,63 +794,6 @@ vm_page_dirty_KBI(vm_page_t m) } /* - * vm_page_splay: - * - * Implements Sleator and Tarjan's top-down splay algorithm. Returns - * the vm_page containing the given pindex. If, however, that - * pindex is not found in the vm_object, returns a vm_page that is - * adjacent to the pindex, coming before or after it. - */ -vm_page_t -vm_page_splay(vm_pindex_t pindex, vm_page_t root) -{ - struct vm_page dummy; - vm_page_t lefttreemax, righttreemin, y; - - if (root == NULL) - return (root); - lefttreemax = righttreemin = &dummy; - for (;; root = y) { - if (pindex < root->pindex) { - if ((y = root->left) == NULL) - break; - if (pindex < y->pindex) { - /* Rotate right. */ - root->left = y->right; - y->right = root; - root = y; - if ((y = root->left) == NULL) - break; - } - /* Link into the new root's right tree. */ - righttreemin->left = root; - righttreemin = root; - } else if (pindex > root->pindex) { - if ((y = root->right) == NULL) - break; - if (pindex > y->pindex) { - /* Rotate left. */ - root->right = y->left; - y->left = root; - root = y; - if ((y = root->right) == NULL) - break; - } - /* Link into the new root's left tree. */ - lefttreemax->right = root; - lefttreemax = root; - } else - break; - } - /* Assemble the new root. */ - lefttreemax->right = root->left; - righttreemin->left = root->right; - root->left = dummy.right; - root->right = dummy.left; - return (root); -} - -/* * vm_page_insert: [ internal use only ] * * Inserts the given mem entry into the object and object list. @@ -861,14 +805,15 @@ vm_page_splay(vm_pindex_t pindex, vm_page_t root) * * The object must be locked. */ -void +int vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex) { - vm_page_t root; - + vm_page_t neighbor; + vm_pindex_t cpindex; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); if (m->object != NULL) panic("vm_page_insert: page already inserted"); + cpindex = m->pindex; /* * Record the object/offset pair in this page @@ -876,31 +821,26 @@ vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex) m->object = object; m->pindex = pindex; - /* - * Now link into the object's ordered list of backed pages. 
- */ - root = object->root; - if (root == NULL) { - m->left = NULL; - m->right = NULL; + if (object->resident_page_count == 0) { TAILQ_INSERT_TAIL(&object->memq, m, listq); - } else { - root = vm_page_splay(pindex, root); - if (pindex < root->pindex) { - m->left = root->left; - m->right = root; - root->left = NULL; - TAILQ_INSERT_BEFORE(root, m, listq); - } else if (pindex == root->pindex) - panic("vm_page_insert: offset already allocated"); - else { - m->right = root->right; - m->left = root; - root->right = NULL; - TAILQ_INSERT_AFTER(&object->memq, root, m, listq); - } + } else { + neighbor = vm_radix_lookup_ge(&object->rtree, pindex, + VM_RADIX_BLACK); + if (neighbor != NULL) { + KASSERT(pindex < neighbor->pindex, + ("vm_page_insert: offset %ju not minor than %ju", + (uintmax_t)pindex, (uintmax_t)neighbor->pindex)); + TAILQ_INSERT_BEFORE(neighbor, m, listq); + } else + TAILQ_INSERT_TAIL(&object->memq, m, listq); + } + + if (vm_radix_insert(&object->rtree, pindex, m) != 0) { + TAILQ_REMOVE(&object->memq, m, listq); + m->object = NULL; + m->pindex = cpindex; + return (ENOMEM); } - object->root = m; /* * Show that the object has one more resident page. @@ -919,6 +859,7 @@ vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex) */ if (pmap_page_is_write_mapped(m)) vm_object_set_writeable_dirty(object); + return (0); } /* @@ -936,7 +877,6 @@ void vm_page_remove(vm_page_t m) { vm_object_t object; - vm_page_t next, prev, root; if ((m->oflags & VPO_UNMANAGED) == 0) vm_page_lock_assert(m, MA_OWNED); @@ -948,45 +888,7 @@ vm_page_remove(vm_page_t m) vm_page_flash(m); } - /* - * Now remove from the object's list of backed pages. - */ - if ((next = TAILQ_NEXT(m, listq)) != NULL && next->left == m) { - /* - * Since the page's successor in the list is also its parent - * in the tree, its right subtree must be empty. - */ - next->left = m->left; - KASSERT(m->right == NULL, - ("vm_page_remove: page %p has right child", m)); - } else if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL && - prev->right == m) { - /* - * Since the page's predecessor in the list is also its parent - * in the tree, its left subtree must be empty. - */ - KASSERT(m->left == NULL, - ("vm_page_remove: page %p has left child", m)); - prev->right = m->right; - } else { - if (m != object->root) - vm_page_splay(m->pindex, object->root); - if (m->left == NULL) - root = m->right; - else if (m->right == NULL) - root = m->left; - else { - /* - * Move the page's successor to the root, because - * pages are usually removed in ascending order. 
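With the splay tree gone, vm_page_insert() keeps the object's memq sorted by issuing one greater-or-equal lookup for the new pindex and linking the page before the successor it finds; if the radix insertion then fails for lack of nodes, the list linkage is undone and ENOMEM is returned. Below is a minimal userland model of that order-preserving insert with rollback; struct toy_index, index_lookup_ge() and index_insert() are invented stand-ins for the radix tree, and the small capacity limit only simulates an allocation failure.

#include <errno.h>
#include <stdio.h>
#include <sys/queue.h>

struct page {
        TAILQ_ENTRY(page) listq;
        unsigned long pindex;
};
TAILQ_HEAD(pglist, page);

#define INDEX_CAP 4

struct toy_index {
        struct page *slots[INDEX_CAP];
        int n;
};

/* Stand-in for vm_radix_lookup_ge(): least entry with key >= pindex. */
static struct page *
index_lookup_ge(struct toy_index *ix, unsigned long pindex)
{
        int i;

        for (i = 0; i < ix->n; i++)
                if (ix->slots[i]->pindex >= pindex)
                        return (ix->slots[i]);
        return (NULL);
}

/* Keep slots[] sorted; fail with ENOMEM once full, like a node shortage. */
static int
index_insert(struct toy_index *ix, struct page *m)
{
        int i, j;

        if (ix->n == INDEX_CAP)
                return (ENOMEM);
        for (i = 0; i < ix->n && ix->slots[i]->pindex < m->pindex; i++)
                ;
        for (j = ix->n; j > i; j--)
                ix->slots[j] = ix->slots[j - 1];
        ix->slots[i] = m;
        ix->n++;
        return (0);
}

static int
page_insert(struct pglist *memq, struct toy_index *ix, struct page *m)
{
        struct page *neighbor;

        neighbor = index_lookup_ge(ix, m->pindex);
        if (neighbor != NULL)
                TAILQ_INSERT_BEFORE(neighbor, m, listq);
        else
                TAILQ_INSERT_TAIL(memq, m, listq);
        if (index_insert(ix, m) != 0) {
                TAILQ_REMOVE(memq, m, listq);   /* roll the list change back */
                return (ENOMEM);
        }
        return (0);
}

int
main(void)
{
        struct pglist memq = TAILQ_HEAD_INITIALIZER(memq);
        struct toy_index ix = { .n = 0 };
        struct page pages[5] = { { .pindex = 3 }, { .pindex = 1 },
            { .pindex = 7 }, { .pindex = 5 }, { .pindex = 9 } };
        struct page *m;
        int i;

        for (i = 0; i < 5; i++)
                if (page_insert(&memq, &ix, &pages[i]) != 0)
                        printf("pindex %lu: ENOMEM, rolled back\n",
                            pages[i].pindex);
        TAILQ_FOREACH(m, &memq, listq)
                printf("%lu ", m->pindex);
        printf("\n");
        return (0);
}

Returning an error instead of panicking is what forces the callers elsewhere in the patch, such as vm_page_alloc() and vm_page_rename(), to cope with insertion failure.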
- */ - if (m->right != next) - vm_page_splay(m->pindex, m->right); - next->left = m->left; - root = next; - } - object->root = root; - } + vm_radix_remove(&object->rtree, m->pindex, VM_RADIX_BLACK); TAILQ_REMOVE(&object->memq, m, listq); /* @@ -1014,15 +916,10 @@ vm_page_remove(vm_page_t m) vm_page_t vm_page_lookup(vm_object_t object, vm_pindex_t pindex) { - vm_page_t m; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); - if ((m = object->root) != NULL && m->pindex != pindex) { - m = vm_page_splay(pindex, m); - if ((object->root = m)->pindex != pindex) - m = NULL; - } - return (m); + + return vm_radix_lookup(&object->rtree, pindex, VM_RADIX_BLACK); } /* @@ -1036,17 +933,12 @@ vm_page_lookup(vm_object_t object, vm_pindex_t pindex) vm_page_t vm_page_find_least(vm_object_t object, vm_pindex_t pindex) { - vm_page_t m; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); - if ((m = TAILQ_FIRST(&object->memq)) != NULL) { - if (m->pindex < pindex) { - m = vm_page_splay(pindex, object->root); - if ((object->root = m)->pindex < pindex) - m = TAILQ_NEXT(m, listq); - } - } - return (m); + if (object->resident_page_count) + return vm_radix_lookup_ge(&object->rtree, pindex, + VM_RADIX_BLACK); + return (NULL); } /* @@ -1108,75 +1000,21 @@ vm_page_prev(vm_page_t m) void vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex) { + u_int i; - vm_page_remove(m); - vm_page_insert(m, new_object, new_pindex); - vm_page_dirty(m); -} - -/* - * Convert all of the given object's cached pages that have a - * pindex within the given range into free pages. If the value - * zero is given for "end", then the range's upper bound is - * infinity. If the given object is backed by a vnode and it - * transitions from having one or more cached pages to none, the - * vnode's hold count is reduced. - */ -void -vm_page_cache_free(vm_object_t object, vm_pindex_t start, vm_pindex_t end) -{ - vm_page_t m, m_next; - boolean_t empty; - - mtx_lock(&vm_page_queue_free_mtx); - if (__predict_false(object->cache == NULL)) { - mtx_unlock(&vm_page_queue_free_mtx); - return; - } - m = object->cache = vm_page_splay(start, object->cache); - if (m->pindex < start) { - if (m->right == NULL) - m = NULL; - else { - m_next = vm_page_splay(start, m->right); - m_next->left = m; - m->right = NULL; - m = object->cache = m_next; - } - } + MPASS(m->object != NULL); + VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); + VM_OBJECT_LOCK_ASSERT(new_object, MA_OWNED); - /* - * At this point, "m" is either (1) a reference to the page - * with the least pindex that is greater than or equal to - * "start" or (2) NULL. - */ - for (; m != NULL && (m->pindex < end || end == 0); m = m_next) { - /* - * Find "m"'s successor and remove "m" from the - * object's cache. - */ - if (m->right == NULL) { - object->cache = m->left; - m_next = NULL; - } else { - m_next = vm_page_splay(start, m->right); - m_next->left = m->left; - object->cache = m_next; - } - /* Convert "m" to a free page. */ - m->object = NULL; - m->valid = 0; - /* Clear PG_CACHED and set PG_FREE. 
*/ - m->flags ^= PG_CACHED | PG_FREE; - KASSERT((m->flags & (PG_CACHED | PG_FREE)) == PG_FREE, - ("vm_page_cache_free: page %p has inconsistent flags", m)); - cnt.v_cache_count--; - cnt.v_free_count++; + vm_page_lock(m); + vm_page_remove(m); + vm_page_unlock(m); + while (vm_page_insert(m, new_object, new_pindex) != 0) { + pagedaemon_wakeup(); + for (i = 0; i < 10000000; i++) + cpu_spinwait(); } - empty = object->cache == NULL; - mtx_unlock(&vm_page_queue_free_mtx); - if (object->type == OBJT_VNODE && empty) - vdrop(object->handle); + vm_page_dirty(m); } /* @@ -1188,15 +1026,12 @@ vm_page_cache_free(vm_object_t object, vm_pindex_t start, vm_pindex_t end) static inline vm_page_t vm_page_cache_lookup(vm_object_t object, vm_pindex_t pindex) { - vm_page_t m; + VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); - if ((m = object->cache) != NULL && m->pindex != pindex) { - m = vm_page_splay(pindex, m); - if ((object->cache = m)->pindex != pindex) - m = NULL; - } - return (m); + if (object->cached_page_count != 0) + return vm_radix_lookup(&object->rtree, pindex, VM_RADIX_RED); + return (NULL); } /* @@ -1208,131 +1043,77 @@ vm_page_cache_lookup(vm_object_t object, vm_pindex_t pindex) static void vm_page_cache_remove(vm_page_t m) { - vm_object_t object; - vm_page_t root; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); KASSERT((m->flags & PG_CACHED) != 0, ("vm_page_cache_remove: page %p is not cached", m)); - object = m->object; - if (m != object->cache) { - root = vm_page_splay(m->pindex, object->cache); - KASSERT(root == m, - ("vm_page_cache_remove: page %p is not cached in object %p", - m, object)); - } - if (m->left == NULL) - root = m->right; - else if (m->right == NULL) - root = m->left; - else { - root = vm_page_splay(m->pindex, m->left); - root->right = m->right; - } - object->cache = root; + vm_radix_remove(&m->object->rtree, m->pindex, VM_RADIX_RED); + m->object->cached_page_count--; m->object = NULL; cnt.v_cache_count--; } /* - * Transfer all of the cached pages with offset greater than or - * equal to 'offidxstart' from the original object's cache to the - * new object's cache. However, any cached pages with offset - * greater than or equal to the new object's size are kept in the - * original object. Initially, the new object's cache must be - * empty. Offset 'offidxstart' in the original object must - * correspond to offset zero in the new object. - * - * The new object must be locked. + * Move a given cached page from an object's cached pages to + * the free list. + * + * The free page queue mtx and object lock must be locked. */ void -vm_page_cache_transfer(vm_object_t orig_object, vm_pindex_t offidxstart, - vm_object_t new_object) +vm_page_cache_free(vm_page_t m) { - vm_page_t m, m_next; + vm_object_t object; + + object = m->object; + VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); + mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); + KASSERT((m->flags & PG_CACHED) != 0, + ("vm_page_cache_free: page %p is not cached", m)); /* - * Insertion into an object's collection of cached pages - * requires the object to be locked. In contrast, removal does - * not. + * Replicate vm_page_cache_remove with a version that can collapse + * internal nodes since the object lock is held. 
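Cached pages now live in the same radix tree as resident pages and are told apart only by a color kept in the low bits of the leaf pointer: VM_RADIX_BLACK for resident, VM_RADIX_RED for cached. That is why vm_page_cache_lookup() and vm_page_cache_remove() can simply pass a color instead of consulting a separate splay tree. A small sketch of the tag-and-filter idea, assuming leaves are at least 4-byte aligned; leaf_color(), leaf_match() and struct item are illustrative names only.

#include <stdint.h>
#include <stdio.h>

#define COLOR_BLACK 0x1
#define COLOR_RED   0x2
#define COLOR_ANY   (COLOR_BLACK | COLOR_RED)
#define COLOR_MASK  0x3

struct item {
        int value;
};

/* Tag a pointer with a color; the item must be at least 4-byte aligned. */
static void *
leaf_color(struct item *p, int color)
{

        return ((void *)((uintptr_t)p | (uintptr_t)color));
}

/* Return the item if its color matches the filter, stripping the tag. */
static struct item *
leaf_match(void *child, int color)
{
        uintptr_t c = (uintptr_t)child;

        if ((c & (uintptr_t)color) == 0)
                return (NULL);
        return ((struct item *)(c & ~(uintptr_t)COLOR_MASK));
}

int
main(void)
{
        struct item resident = { 42 }, cached = { 7 };
        void *leaves[2];

        leaves[0] = leaf_color(&resident, COLOR_BLACK);
        leaves[1] = leaf_color(&cached, COLOR_RED);

        /* A black-only lookup sees the resident item, not the cached one. */
        printf("black resident: %p\n", (void *)leaf_match(leaves[0], COLOR_BLACK));
        printf("black cached:   %p\n", (void *)leaf_match(leaves[1], COLOR_BLACK));
        /* COLOR_ANY matches either, as vm_object_page_remove() now requests. */
        printf("any cached:     %d\n", leaf_match(leaves[1], COLOR_ANY)->value);
        return (0);
}

vm_radix_match() in the patch applies the same mask-and-test to every leaf it hands back.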
*/ - VM_OBJECT_LOCK_ASSERT(new_object, MA_OWNED); - KASSERT(new_object->cache == NULL, - ("vm_page_cache_transfer: object %p has cached pages", - new_object)); - mtx_lock(&vm_page_queue_free_mtx); - if ((m = orig_object->cache) != NULL) { - /* - * Transfer all of the pages with offset greater than or - * equal to 'offidxstart' from the original object's - * cache to the new object's cache. - */ - m = vm_page_splay(offidxstart, m); - if (m->pindex < offidxstart) { - orig_object->cache = m; - new_object->cache = m->right; - m->right = NULL; - } else { - orig_object->cache = m->left; - new_object->cache = m; - m->left = NULL; - } - while ((m = new_object->cache) != NULL) { - if ((m->pindex - offidxstart) >= new_object->size) { - /* - * Return all of the cached pages with - * offset greater than or equal to the - * new object's size to the original - * object's cache. - */ - new_object->cache = m->left; - m->left = orig_object->cache; - orig_object->cache = m; - break; - } - m_next = vm_page_splay(m->pindex, m->right); - /* Update the page's object and offset. */ - m->object = new_object; - m->pindex -= offidxstart; - if (m_next == NULL) - break; - m->right = NULL; - m_next->left = m; - new_object->cache = m_next; - } - KASSERT(new_object->cache == NULL || - new_object->type == OBJT_SWAP, - ("vm_page_cache_transfer: object %p's type is incompatible" - " with cached pages", new_object)); - } - mtx_unlock(&vm_page_queue_free_mtx); + vm_radix_remove(&object->rtree, m->pindex, VM_RADIX_ANY); + object->cached_page_count--; + m->object = NULL; + m->valid = 0; + /* Clear PG_CACHED and set PG_FREE. */ + m->flags ^= PG_CACHED | PG_FREE; + cnt.v_cache_count--; + cnt.v_free_count++; } /* - * Returns TRUE if a cached page is associated with the given object and - * offset, and FALSE otherwise. - * - * The object must be locked. + * Attempt to rename a cached page from one object to another. If + * it fails the cached page is freed. */ -boolean_t -vm_page_is_cached(vm_object_t object, vm_pindex_t pindex) +void +vm_page_cache_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t idx) { - vm_page_t m; + vm_object_t orig_object; + orig_object = m->object; + VM_OBJECT_LOCK_ASSERT(orig_object, MA_OWNED); + VM_OBJECT_LOCK_ASSERT(new_object, MA_OWNED); + mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); /* - * Insertion into an object's collection of cached pages requires the - * object to be locked. Therefore, if the object is locked and the - * object's collection is empty, there is no need to acquire the free - * page queues lock in order to prove that the specified page doesn't - * exist. + * If the insert fails we simply free the cached page. */ - VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); - if (__predict_true(object->cache == NULL)) - return (FALSE); - mtx_lock(&vm_page_queue_free_mtx); - m = vm_page_cache_lookup(object, pindex); - mtx_unlock(&vm_page_queue_free_mtx); - return (m != NULL); + if (vm_radix_insert(&new_object->rtree, idx, m) != 0) { + vm_page_cache_free(m); + return; + } + vm_radix_color(&new_object->rtree, idx, VM_RADIX_RED); + /* + * We use any color here though we know it's red so that tree + * compaction will still work. 
+ */ + vm_radix_remove(&orig_object->rtree, m->pindex, VM_RADIX_ANY); + m->object = new_object; + m->pindex = idx; + new_object->cached_page_count++; + orig_object->cached_page_count--; } /* @@ -1465,7 +1246,8 @@ vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req) m->valid = 0; m_object = m->object; vm_page_cache_remove(m); - if (m_object->type == OBJT_VNODE && m_object->cache == NULL) + if (m_object->type == OBJT_VNODE && + m_object->cached_page_count == 0) vp = m_object->handle; } else { KASSERT(VM_PAGE_IS_FREE(m), @@ -1509,7 +1291,19 @@ vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req) if (object->memattr != VM_MEMATTR_DEFAULT && (object->flags & OBJ_FICTITIOUS) == 0) pmap_page_set_memattr(m, object->memattr); - vm_page_insert(m, object, pindex); + if (vm_page_insert(m, object, pindex) != 0) { + + /* See the comment below about hold count handling. */ + if (vp != NULL) + vdrop(vp); + vm_page_lock(m); + if (req & VM_ALLOC_WIRED) + vm_page_unwire(m, 0); + vm_page_free(m); + vm_page_unlock(m); + pagedaemon_wakeup(); + return (NULL); + } } else m->pindex = pindex; @@ -1576,6 +1370,7 @@ vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req, { struct vnode *drop; vm_page_t deferred_vdrop_list, m, m_ret; + vm_pindex_t cpindex; u_int flags, oflags; int req_class; @@ -1662,6 +1457,7 @@ retry: memattr == VM_MEMATTR_DEFAULT) memattr = object->memattr; } + cpindex = pindex; for (m = m_ret; m < &m_ret[npages]; m++) { m->aflags = 0; m->flags = (m->flags | PG_NODUMP) & flags; @@ -1671,12 +1467,30 @@ retry: m->oflags = oflags; if (memattr != VM_MEMATTR_DEFAULT) pmap_page_set_memattr(m, memattr); - if (object != NULL) - vm_page_insert(m, object, pindex); - else - m->pindex = pindex; pindex++; } + for (m = m_ret; m < &m_ret[npages]; m++) { + if (object != NULL) { + if (vm_page_insert(m, object, cpindex) != 0) { + while (deferred_vdrop_list != NULL) { + vdrop((struct vnode *)deferred_vdrop_list->pageq.tqe_prev); + deferred_vdrop_list = + deferred_vdrop_list->pageq.tqe_next; + } + for (m = m_ret; m < &m_ret[npages]; m++) { + vm_page_lock(m); + if (req & VM_ALLOC_WIRED) + vm_page_unwire(m, 0); + vm_page_free(m); + vm_page_unlock(m); + } + pagedaemon_wakeup(); + return (NULL); + } + } else + m->pindex = cpindex; + cpindex++; + } while (deferred_vdrop_list != NULL) { vdrop((struct vnode *)deferred_vdrop_list->pageq.tqe_prev); deferred_vdrop_list = deferred_vdrop_list->pageq.tqe_next; @@ -1722,7 +1536,8 @@ vm_page_alloc_init(vm_page_t m) m->valid = 0; m_object = m->object; vm_page_cache_remove(m); - if (m_object->type == OBJT_VNODE && m_object->cache == NULL) + if (m_object->type == OBJT_VNODE && + m_object->cached_page_count == 0) drop = m_object->handle; } else { KASSERT(VM_PAGE_IS_FREE(m), @@ -2321,7 +2136,7 @@ void vm_page_cache(vm_page_t m) { vm_object_t object; - vm_page_t next, prev, root; + int old_cached; vm_page_lock_assert(m, MA_OWNED); object = m->object; @@ -2352,46 +2167,7 @@ vm_page_cache(vm_page_t m) */ vm_page_remque(m); - /* - * Remove the page from the object's collection of resident - * pages. - */ - if ((next = TAILQ_NEXT(m, listq)) != NULL && next->left == m) { - /* - * Since the page's successor in the list is also its parent - * in the tree, its right subtree must be empty. 
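Because vm_page_insert() can now fail, vm_page_alloc() and vm_page_alloc_contig() must unwind whatever they have already set up (the vnode reference, the wiring, the freshly allocated pages) and wake the page daemon before returning NULL. The schematic model below shows only that unwind order; every function in it is a stub invented for the sketch, and index_insert() fails unconditionally just to exercise the error path.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct page {
        int wired;
};

static struct page *
page_get(void)
{

        return (calloc(1, sizeof(struct page)));
}

static void page_put(struct page *m)    { free(m); }
static void page_wire(struct page *m)   { m->wired = 1; }
static void page_unwire(struct page *m) { m->wired = 0; }
static void reclaimer_wakeup(void)      { printf("wake the page daemon\n"); }

/* Pretend the index is out of nodes, as vm_radix_insert() may be. */
static int
index_insert(struct page *m, unsigned long pindex)
{

        (void)m;
        (void)pindex;
        return (-1);
}

static struct page *
page_alloc(unsigned long pindex, bool wired)
{
        struct page *m;

        if ((m = page_get()) == NULL)
                return (NULL);
        if (wired)
                page_wire(m);
        if (index_insert(m, pindex) != 0) {
                /* Undo in reverse order, then ask for memory back. */
                if (wired)
                        page_unwire(m);
                page_put(m);
                reclaimer_wakeup();
                return (NULL);
        }
        return (m);
}

int
main(void)
{

        if (page_alloc(10, true) == NULL)
                printf("allocation failed cleanly\n");
        return (0);
}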
- */ - next->left = m->left; - KASSERT(m->right == NULL, - ("vm_page_cache: page %p has right child", m)); - } else if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL && - prev->right == m) { - /* - * Since the page's predecessor in the list is also its parent - * in the tree, its left subtree must be empty. - */ - KASSERT(m->left == NULL, - ("vm_page_cache: page %p has left child", m)); - prev->right = m->right; - } else { - if (m != object->root) - vm_page_splay(m->pindex, object->root); - if (m->left == NULL) - root = m->right; - else if (m->right == NULL) - root = m->left; - else { - /* - * Move the page's successor to the root, because - * pages are usually removed in ascending order. - */ - if (m->right != next) - vm_page_splay(m->pindex, m->right); - next->left = m->left; - root = next; - } - object->root = root; - } + vm_radix_color(&object->rtree, m->pindex, VM_RADIX_RED); TAILQ_REMOVE(&object->memq, m, listq); object->resident_page_count--; @@ -2408,26 +2184,9 @@ vm_page_cache(vm_page_t m) m->flags &= ~PG_ZERO; mtx_lock(&vm_page_queue_free_mtx); m->flags |= PG_CACHED; + old_cached = object->cached_page_count; + object->cached_page_count++; cnt.v_cache_count++; - root = object->cache; - if (root == NULL) { - m->left = NULL; - m->right = NULL; - } else { - root = vm_page_splay(m->pindex, root); - if (m->pindex < root->pindex) { - m->left = root->left; - m->right = root; - root->left = NULL; - } else if (__predict_false(m->pindex == root->pindex)) - panic("vm_page_cache: offset already cached"); - else { - m->right = root->right; - m->left = root; - root->right = NULL; - } - } - object->cache = m; #if VM_NRESERVLEVEL > 0 if (!vm_reserv_free_page(m)) { #else @@ -2445,9 +2204,9 @@ vm_page_cache(vm_page_t m) * the object's only resident page. */ if (object->type == OBJT_VNODE) { - if (root == NULL && object->resident_page_count != 0) + if (old_cached == 0 && object->resident_page_count != 0) vhold(object->handle); - else if (root != NULL && object->resident_page_count == 0) + else if (old_cached != 0 && object->resident_page_count == 0) vdrop(object->handle); } } @@ -2969,11 +2728,8 @@ vm_page_cowfault(vm_page_t m) pindex = m->pindex; retry_alloc: - pmap_remove_all(m); - vm_page_remove(m); mnew = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY); if (mnew == NULL) { - vm_page_insert(m, object, pindex); vm_page_unlock(m); VM_OBJECT_UNLOCK(object); VM_WAIT; @@ -2999,8 +2755,9 @@ vm_page_cowfault(vm_page_t m) vm_page_lock(mnew); vm_page_free(mnew); vm_page_unlock(mnew); - vm_page_insert(m, object, pindex); } else { /* clear COW & copy page */ + pmap_remove_all(m); + vm_page_remove(m); if (!so_zerocp_fullpage) pmap_copy_page(m, mnew); mnew->valid = VM_PAGE_BITS_ALL; diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h index 7a3f138..8a3deb9 100644 --- a/sys/vm/vm_page.h +++ b/sys/vm/vm_page.h @@ -128,8 +128,6 @@ typedef uint64_t vm_page_bits_t; struct vm_page { TAILQ_ENTRY(vm_page) pageq; /* page queue or free list (Q) */ TAILQ_ENTRY(vm_page) listq; /* pages in same object (O) */ - struct vm_page *left; /* splay tree link (O) */ - struct vm_page *right; /* splay tree link (O) */ vm_object_t object; /* which object am I in (O,P)*/ vm_pindex_t pindex; /* offset into object (O,P) */ @@ -378,8 +376,8 @@ vm_page_t vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req, vm_page_t vm_page_alloc_freelist(int, int); vm_page_t vm_page_grab (vm_object_t, vm_pindex_t, int); void vm_page_cache(vm_page_t); -void vm_page_cache_free(vm_object_t, vm_pindex_t, vm_pindex_t); -void 
vm_page_cache_transfer(vm_object_t, vm_pindex_t, vm_object_t); +void vm_page_cache_free(vm_page_t); +void vm_page_cache_rename(vm_page_t, vm_object_t, vm_pindex_t); int vm_page_try_to_cache (vm_page_t); int vm_page_try_to_free (vm_page_t); void vm_page_dontneed(vm_page_t); @@ -389,8 +387,7 @@ void vm_page_dequeue_locked(vm_page_t m); vm_page_t vm_page_find_least(vm_object_t, vm_pindex_t); vm_page_t vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr); void vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr); -void vm_page_insert (vm_page_t, vm_object_t, vm_pindex_t); -boolean_t vm_page_is_cached(vm_object_t object, vm_pindex_t pindex); +int vm_page_insert (vm_page_t, vm_object_t, vm_pindex_t); vm_page_t vm_page_lookup (vm_object_t, vm_pindex_t); vm_page_t vm_page_next(vm_page_t m); int vm_page_pa_tryrelock(pmap_t, vm_paddr_t, vm_paddr_t *); @@ -404,7 +401,6 @@ void vm_page_requeue(vm_page_t m); void vm_page_requeue_locked(vm_page_t m); void vm_page_set_valid_range(vm_page_t m, int base, int size); void vm_page_sleep(vm_page_t m, const char *msg); -vm_page_t vm_page_splay(vm_pindex_t, vm_page_t); vm_offset_t vm_page_startup(vm_offset_t vaddr); void vm_page_unhold_pages(vm_page_t *ma, int count); void vm_page_unwire (vm_page_t, int); diff --git a/sys/vm/vm_radix.c b/sys/vm/vm_radix.c new file mode 100644 index 0000000..f29f5d3 --- /dev/null +++ b/sys/vm/vm_radix.c @@ -0,0 +1,956 @@ +/* + * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org> + * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + + +/* + * Radix tree implementation. 
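Before the implementation, it may help to restate its geometry: every level of the tree consumes VM_RADIX_WIDTH bits of the pindex, so a node has 2^WIDTH children, and a tree of height h addresses indices up to VM_RADIX_MAX(h), with VM_RADIX_LIMIT levels enough for any vm_pindex_t. A compilable restatement of that arithmetic, with WIDTH pinned to 4 purely for the printout (the patch picks 4 or 5 depending on UMA_MD_SMALL_ALLOC):

#include <stdint.h>
#include <stdio.h>

#define WIDTH   4                               /* bits consumed per level */
#define COUNT   (1 << WIDTH)                    /* children per node */
#define MASK    (COUNT - 1)
#define MAXVAL  ((uint64_t)-1)
#define LIMIT   ((64 + WIDTH - 1) / WIDTH)      /* howmany(64, WIDTH) */

/* Child slot used for "index" at "level" (level 0 is the leaf level). */
static int
slot(uint64_t index, int level)
{

        return ((int)((index >> (level * WIDTH)) & MASK));
}

/* Largest index a tree of height h can hold. */
static uint64_t
maxval(int h)
{

        return (h == LIMIT ? MAXVAL : (((uint64_t)1 << (h * WIDTH)) - 1));
}

int
main(void)
{
        uint64_t index = 0x2a5f;
        int level;

        for (level = 3; level >= 0; level--)
                printf("level %d -> slot %d\n", level, slot(index, level));
        printf("height 4 covers indices up to 0x%jx\n", (uintmax_t)maxval(4));
        return (0);
}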
+ */ + +#include <sys/cdefs.h> + +#include <sys/param.h> +#include <sys/conf.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/queue.h> +#include <sys/param.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/ktr.h> +#include <vm/uma.h> +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_extern.h> +#include <vm/vm_kern.h> +#include <vm/vm_page.h> +#ifndef UMA_MD_SMALL_ALLOC +#include <vm/vm_map.h> +#endif +#include <vm/vm_radix.h> +#include <vm/vm_object.h> + +#include <sys/kdb.h> + +#ifndef UMA_MD_SMALL_ALLOC +#define VM_RADIX_RNODE_MAP_SCALE (1024 * 1024 / 2) +#define VM_RADIX_WIDTH 4 + +/* + * Bits of height in root. + * The mask of smaller power of 2 containing VM_RADIX_LIMIT. + */ +#define VM_RADIX_HEIGHT 0x1f +#else +#define VM_RADIX_WIDTH 5 + +/* See the comment above. */ +#define VM_RADIX_HEIGHT 0xf +#endif + +#define VM_RADIX_COUNT (1 << VM_RADIX_WIDTH) +#define VM_RADIX_MASK (VM_RADIX_COUNT - 1) +#define VM_RADIX_MAXVAL ((vm_pindex_t)-1) +#define VM_RADIX_LIMIT howmany((sizeof(vm_pindex_t) * NBBY), VM_RADIX_WIDTH) + +/* Flag bits stored in node pointers. */ +#define VM_RADIX_FLAGS 0x3 + +/* Calculates maximum value for a tree of height h. */ +#define VM_RADIX_MAX(h) \ + ((h) == VM_RADIX_LIMIT ? VM_RADIX_MAXVAL : \ + (((vm_pindex_t)1 << ((h) * VM_RADIX_WIDTH)) - 1)) + +/* + * On 32-bits architectures KTR cannot handle 64-bits values. + * Add macros for splitting in 32-bits quantity and provide format strings. + * Note that braces are not used because they would break compilation. + * Also, note that arguments are cast to u_long in order to follow KTR + * convention. + */ +#ifdef KTR +#define KFRMT64(x) __STRING(x)"l 0x%08lx, "__STRING(x)"h 0x%08lx" +#define KSPLT64L(x) ((u_long)((x) & 0xFFFFFFFF)) +#define KSPLT64H(x) ((u_long)(((x) >> 32) & 0xFFFFFFFF)) +#endif + +struct vm_radix_node { + void *rn_child[VM_RADIX_COUNT]; /* Child nodes. */ + volatile uint32_t rn_count; /* Valid children. */ +}; + +CTASSERT(VM_RADIX_HEIGHT >= VM_RADIX_LIMIT); +CTASSERT(sizeof(struct vm_radix_node) < PAGE_SIZE); + +static uma_zone_t vm_radix_node_zone; + +#ifndef UMA_MD_SMALL_ALLOC +static vm_map_t rnode_map; +static u_long rnode_map_scale; + +static void * +vm_radix_node_zone_allocf(uma_zone_t zone, int size, uint8_t *flags, int wait) +{ + vm_offset_t addr; + vm_page_t m; + int pflags; + + /* Inform UMA that this allocator uses rnode_map. */ + *flags = UMA_SLAB_KERNEL; + + pflags = VM_ALLOC_WIRED | VM_ALLOC_NOOBJ; + + /* + * As kmem_alloc_nofault() can however fail, let just assume that + * M_NOWAIT is on and act accordingly. + */ + pflags |= ((wait & M_USE_RESERVE) != 0) ? VM_ALLOC_INTERRUPT : + VM_ALLOC_SYSTEM; + if ((wait & M_ZERO) != 0) + pflags |= VM_ALLOC_ZERO; + addr = kmem_alloc_nofault(rnode_map, size); + if (addr == 0) + return (NULL); + + /* Just one page allocation is assumed here. */ + m = vm_page_alloc(NULL, OFF_TO_IDX(addr - VM_MIN_KERNEL_ADDRESS), + pflags); + if (m == NULL) { + kmem_free(rnode_map, addr, size); + return (NULL); + } + if ((wait & M_ZERO) != 0 && (m->flags & PG_ZERO) == 0) + pmap_zero_page(m); + pmap_qenter(addr, &m, 1); + return ((void *)addr); +} + +static void +vm_radix_node_zone_freef(void *item, int size, uint8_t flags) +{ + vm_page_t m; + vm_offset_t voitem; + + MPASS((flags & UMA_SLAB_KERNEL) != 0); + + /* Just one page allocation is assumed here. 
*/ + voitem = (vm_offset_t)item; + m = PHYS_TO_VM_PAGE(pmap_kextract(voitem)); + pmap_qremove(voitem, 1); + vm_page_lock(m); + vm_page_unwire(m, 0); + vm_page_free(m); + vm_page_unlock(m); + kmem_free(rnode_map, voitem, size); +} + +static void +init_vm_radix_alloc(void *dummy __unused) +{ + + uma_zone_set_max(vm_radix_node_zone, rnode_map_scale); + uma_zone_set_allocf(vm_radix_node_zone, vm_radix_node_zone_allocf); + uma_zone_set_freef(vm_radix_node_zone, vm_radix_node_zone_freef); +} +SYSINIT(vm_radix, SI_SUB_KMEM, SI_ORDER_SECOND, init_vm_radix_alloc, NULL); +#endif + +/* + * Radix node zone destructor. + */ +#ifdef INVARIANTS +static void +vm_radix_node_zone_dtor(void *mem, int size, void *arg) +{ + struct vm_radix_node *rnode; + + rnode = mem; + KASSERT(rnode->rn_count == 0, + ("vm_radix_node_put: Freeing a node with %d children\n", + rnode->rn_count)); +} +#endif + +/* + * Allocate a radix node. Initializes all elements to 0. + */ +static __inline struct vm_radix_node * +vm_radix_node_get(void) +{ + + return (uma_zalloc(vm_radix_node_zone, M_NOWAIT | M_ZERO)); +} + +/* + * Free radix node. + */ +static __inline void +vm_radix_node_put(struct vm_radix_node *rnode) +{ + + uma_zfree(vm_radix_node_zone, rnode); +} + +/* + * Return the position in the array for a given level. + */ +static __inline int +vm_radix_slot(vm_pindex_t index, int level) +{ + + return ((index >> (level * VM_RADIX_WIDTH)) & VM_RADIX_MASK); +} + +/* + * Initialize the radix node submap (for architectures not supporting + * direct-mapping) and the radix node zone. + * + * WITNESS reports a lock order reversal, for architectures not + * supporting direct-mapping, between the "system map" lock + * and the "vm object" lock. This is because the well established ordering + * "system map" -> "vm object" is not honoured in this case as allocating + * from the radix node submap ends up adding a mapping entry to it, meaning + * it is necessary to lock the submap. However, the radix node submap is + * a leaf and self-contained, thus a deadlock cannot happen here and + * adding MTX_NOWITNESS to all map locks would be largerly sub-optimal. + */ +void +vm_radix_init(void) +{ +#ifndef UMA_MD_SMALL_ALLOC + vm_offset_t maxaddr, minaddr; + + rnode_map_scale = VM_RADIX_RNODE_MAP_SCALE; + TUNABLE_ULONG_FETCH("hw.rnode_map_scale", &rnode_map_scale); + rnode_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, + rnode_map_scale * sizeof(struct vm_radix_node), FALSE); + rnode_map->system_map = 1; +#endif + + vm_radix_node_zone = uma_zcreate("RADIX NODE", + sizeof(struct vm_radix_node), NULL, +#ifdef INVARIANTS + vm_radix_node_zone_dtor, +#else + NULL, +#endif + NULL, NULL, VM_RADIX_HEIGHT, UMA_ZONE_VM); +} + +/* + * Extract the root node and height from a radix tree with a single load. + */ +static __inline int +vm_radix_height(struct vm_radix *rtree, struct vm_radix_node **rnode) +{ + uintptr_t root; + int height; + + root = rtree->rt_root; + height = root & VM_RADIX_HEIGHT; + *rnode = (struct vm_radix_node *)(root - height); + return (height); +} + + +/* + * Set the root node and height for a radix tree. 
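vm_radix_height() and vm_radix_setroot() depend on the root word packing both the pointer to the top node and the current height, so a single load observes a coherent pair even while inserts grow the tree. A minimal model of the encode/decode, assuming node allocations are aligned enough to leave the low bits free; the 0x1f mask matches one of the two configurations in the patch.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define HEIGHT_MASK 0x1f        /* low bits reserved for the height */

struct node {
        void *child[16];
};

struct tree_root {
        uintptr_t rt_root;      /* node pointer | height */
};

static void
setroot(struct tree_root *rt, struct node *n, int height)
{

        rt->rt_root = (uintptr_t)n | (uintptr_t)height;
}

static int
getroot(struct tree_root *rt, struct node **n)
{
        uintptr_t root = rt->rt_root;
        int height = root & HEIGHT_MASK;

        *n = (struct node *)(root - height);
        return (height);
}

int
main(void)
{
        struct tree_root rt = { 0 };
        struct node *n, *decoded;
        int h;

        /* aligned_alloc() keeps the low HEIGHT_MASK bits of the pointer clear. */
        if ((n = aligned_alloc(64, sizeof(struct node))) == NULL)
                return (1);
        setroot(&rt, n, 5);
        h = getroot(&rt, &decoded);
        printf("height %d, pointer intact: %d\n", h, decoded == n);
        free(n);
        return (0);
}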
+ */ +static inline void +vm_radix_setroot(struct vm_radix *rtree, struct vm_radix_node *rnode, + int height) +{ + uintptr_t root; + + root = (uintptr_t)rnode | height; + rtree->rt_root = root; +} + +static inline void +vm_radix_unwind_heightup(struct vm_radix *rtree, struct vm_radix_node *root, + struct vm_radix_node *iroot, int ilevel) +{ + struct vm_radix_node *rnode; + + CTR4(KTR_VM, "unwind: tree %p, root %p, iroot %p, ilevel %d", + rtree, root, iroot, ilevel); + while (iroot != root && root != NULL) { + rnode = root; + MPASS(rnode->rn_count == 0 || rnode->rn_count == 1); + rnode->rn_count = 0; + root = rnode->rn_child[0]; + vm_radix_node_put(rnode); + } + vm_radix_setroot(rtree, iroot, ilevel); +} + +static inline void * +vm_radix_match(void *child, int color) +{ + uintptr_t c; + + c = (uintptr_t)child; + + if ((c & color) == 0) + return (NULL); + return ((void *)(c & ~VM_RADIX_FLAGS)); +} + +static void +vm_radix_reclaim_allnodes_internal(struct vm_radix_node *rnode, int level) +{ + int slot; + + MPASS(rnode != NULL && level >= 0); + + /* + * Level 0 just contains pages as children, thus make it a special + * case, free the node and return. + */ + if (level == 0) { + CTR2(KTR_VM, "reclaiming: node %p, level %d", rnode, level); + rnode->rn_count = 0; + vm_radix_node_put(rnode); + return; + } + for (slot = 0; slot < VM_RADIX_COUNT && rnode->rn_count != 0; slot++) { + if (rnode->rn_child[slot] == NULL) + continue; + CTR3(KTR_VM, + "reclaiming: node %p, level %d recursing in slot %d", + rnode, level, slot); + vm_radix_reclaim_allnodes_internal(rnode->rn_child[slot], + level - 1); + rnode->rn_count--; + } + MPASS(rnode->rn_count == 0); + CTR2(KTR_VM, "reclaiming: node %p, level %d", rnode, level); + vm_radix_node_put(rnode); +} + +/* + * Remove the specified index from the tree. If possible the height of the + * tree is adjusted after deletion. May be used to cleanup intermediate + * nodes of the path if the key is not entirely present. + */ +static void +vm_radix_sweep(struct vm_radix *rtree, vm_pindex_t index, int color, int hard) +{ + struct vm_radix_node *stack[VM_RADIX_LIMIT]; + struct vm_radix_node *rnode, *root; + int level; + int slot; + + level = vm_radix_height(rtree, &root); + KASSERT(index <= VM_RADIX_MAX(level), + ("vm_radix_sweep: %p index out of range %jd.", rtree, + VM_RADIX_MAX(level))); + rnode = root; + level--; + /* + * Find the node and record the path in stack. + */ + while (level && rnode) { + stack[level] = rnode; + slot = vm_radix_slot(index, level); + CTR6(KTR_VM, + "remove: tree %p, " KFRMT64(index) ", level %d, slot %d, rnode %p", + rtree, KSPLT64L(index), KSPLT64H(index), level, slot, + rnode); + CTR4(KTR_VM, "remove: tree %p, rnode %p, child %p, count %u", + rtree, rnode, rnode->rn_child[slot], rnode->rn_count); + rnode = rnode->rn_child[slot]; + level--; + } + if (rnode == NULL) { + if (hard) + panic("vm_radix_sweep: index not present.\n"); + + /* + * Almost certainly it got an half-constructed tree it was + * expected. 
+ */ + KASSERT(level < (VM_RADIX_LIMIT - 1), + ("vm_radix_sweep: level %d not consistent.\n", level)); + ++level; + rnode = stack[level]; + slot = vm_radix_slot(index, level); + } else { + slot = vm_radix_slot(index, 0); + if (hard && + vm_radix_match(rnode->rn_child[slot], color) == NULL) + panic("vm_radix_sweep: index not present as leaf.\n"); + } + + for (;;) { + CTR6(KTR_VM, +"remove: resetting tree %p, " KFRMT64(index) ", level %d, slot %d, rnode %p", + rtree, KSPLT64L(index), KSPLT64H(index), level, slot, + rnode); + CTR4(KTR_VM, + "remove: resetting tree %p, rnode %p, child %p, count %u", + rtree, rnode, + (rnode != NULL) ? rnode->rn_child[slot] : NULL, + (rnode != NULL) ? rnode->rn_count : 0); + rnode->rn_child[slot] = NULL; + /* + * Use atomics for the last level since red and black + * will both adjust it. + * Use a write memory barrier here in order to avoid + * rn_count reaching 0 before to fetch the actual pointer. + * Concurrent black removal, infact, may want to reclaim + * the radix node itself before to read it. + */ + MPASS(rnode->rn_count != 0); + if (level == 0) + atomic_add_rel_32(&rnode->rn_count, -1); + else + rnode->rn_count--; + + /* + * Only allow black removes to prune the tree. + */ + if ((color & VM_RADIX_BLACK) == 0 || rnode->rn_count > 0) + break; + vm_radix_node_put(rnode); + if (rnode == root) { + vm_radix_setroot(rtree, NULL, 0); + break; + } + rnode = stack[++level]; + slot = vm_radix_slot(index, level); + + } +} + +/* + * Inserts the key-value pair in to the radix tree. Returns errno. + * Panics if the key already exists. + */ +int +vm_radix_insert(struct vm_radix *rtree, vm_pindex_t index, void *val) +{ + struct vm_radix_node *iroot, *rnode, *root; + int ilevel, level, slot; + + CTR4(KTR_VM, + "insert: tree %p, " KFRMT64(index) ", val %p", rtree, + KSPLT64L(index), KSPLT64H(index), val); + if (index == -1) + panic("vm_radix_insert: -1 is not a valid index.\n"); + level = vm_radix_height(rtree, &root); + /* + * Increase the height by adding nodes at the root until + * there is sufficient space. + */ + ilevel = level; + iroot = root; + while (level == 0 || index > VM_RADIX_MAX(level)) { + CTR5(KTR_VM, + "insert: expanding " KFRMT64(index) ">" KFRMT64(mxl) ", height %d", + KSPLT64L(index), KSPLT64H(index), + KSPLT64L(VM_RADIX_MAX(level)), + KSPLT64H(VM_RADIX_MAX(level)), level); + level++; + KASSERT(level <= VM_RADIX_LIMIT, + ("vm_radix_insert: Tree %p height %d too tall", + rtree, level)); + /* + * Only allocate tree nodes if they are needed. + */ + if (root == NULL || root->rn_count != 0) { + rnode = vm_radix_node_get(); + if (rnode == NULL) { + CTR5(KTR_VM, + "insert: tree %p, root %p, " KFRMT64(index) ", level %d ENOMEM", + rtree, root, KSPLT64L(index), + KSPLT64H(index), level); + vm_radix_unwind_heightup(rtree, root, iroot, + ilevel); + return (ENOMEM); + } + /* + * Store the new pointer with a memory barrier so + * that it is visible before the new root. + */ + if (root) { + atomic_store_rel_ptr((volatile uintptr_t *) + &rnode->rn_child[0], (uintptr_t)root); + rnode->rn_count = 1; + } + root = rnode; + } + vm_radix_setroot(rtree, root, level); + } + + /* Now that the tree is tall enough, fill in the path to the index. */ + rnode = root; + for (level = level - 1; level > 0; level--) { + slot = vm_radix_slot(index, level); + /* Add the required intermidiate nodes. */ + if (rnode->rn_child[slot] == NULL) { + + /* + * In case of failed allocation, the vm_radix_sweep() + * will unwind back rn_count appropriately. 
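vm_radix_insert() grows the tree from the top before descending: new root nodes are stacked above the old one, each adding VM_RADIX_WIDTH bits of key space, until VM_RADIX_MAX(height) covers the index being inserted, and vm_radix_unwind_heightup() tears those nodes back down if a later allocation fails. The height computation on its own, as a sketch:

#include <stdint.h>
#include <stdio.h>

#define WIDTH   4
#define LIMIT   ((64 + WIDTH - 1) / WIDTH)

static uint64_t
maxval(int h)
{

        return (h == LIMIT ? (uint64_t)-1 : (((uint64_t)1 << (h * WIDTH)) - 1));
}

/* Grow the height until "index" fits; mirrors the expansion loop. */
static int
height_for(int height, uint64_t index)
{

        while (height == 0 || index > maxval(height)) {
                height++;
                printf("push a new root: height %d covers up to 0x%jx\n",
                    height, (uintmax_t)maxval(height));
        }
        return (height);
}

int
main(void)
{
        int h;

        h = height_for(0, 0x1234);      /* empty tree, first insert */
        h = height_for(h, 0xfffff);     /* a larger index forces more levels */
        printf("final height %d\n", h);
        return (0);
}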
+ */ + rnode->rn_count++; + rnode->rn_child[slot] = vm_radix_node_get(); + if (rnode->rn_child[slot] == NULL) { + CTR6(KTR_VM, + "insert: tree %p, " KFRMT64(index) ", level %d, slot %d, rnode %p ENOMEM", + rtree, KSPLT64L(index), KSPLT64H(index), + level, slot, rnode); + CTR4(KTR_VM, + "insert: tree %p, rnode %p, child %p, count %u ENOMEM", + rtree, rnode, rnode->rn_child[slot], + rnode->rn_count); + vm_radix_sweep(rtree, index, VM_RADIX_BLACK, 0); + + /* + * vm_radix_sweep() may have changed the shape + * of the tree, refetch the root. + */ + vm_radix_height(rtree, &root); + vm_radix_unwind_heightup(rtree, root, iroot, + ilevel); + return (ENOMEM); + } + } + CTR6(KTR_VM, + "insert: tree %p, " KFRMT64(index) ", level %d, slot %d, rnode %p", + rtree, KSPLT64L(index), KSPLT64H(index), level, slot, + rnode); + CTR4(KTR_VM, + "insert: tree %p, rnode %p, child %p, count %u", + rtree, rnode, rnode->rn_child[slot], rnode->rn_count); + rnode = rnode->rn_child[slot]; + } + + slot = vm_radix_slot(index, 0); + MPASS(rnode != NULL); + KASSERT(rnode->rn_child[slot] == NULL, + ("vm_radix_insert: Duplicate value %p at index: %lu\n", + rnode->rn_child[slot], (u_long)index)); + val = (void *)((uintptr_t)val | VM_RADIX_BLACK); + rnode->rn_child[slot] = val; + atomic_add_32(&rnode->rn_count, 1); + CTR5(KTR_VM, + "insert: tree %p, " KFRMT64(index) ", level %d, slot %d", + rtree, KSPLT64L(index), KSPLT64H(index), level, slot); + CTR3(KTR_VM, "insert: slot %d, rnode %p, count %u", slot, rnode, + rnode->rn_count); + + return 0; +} + +/* + * Returns the value stored at the index. If the index is not present + * NULL is returned. + */ +void * +vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index, int color) +{ + struct vm_radix_node *rnode; + int slot; + int level; + + level = vm_radix_height(rtree, &rnode); + if (index > VM_RADIX_MAX(level)) + return NULL; + level--; + while (rnode) { + slot = vm_radix_slot(index, level); + CTR6(KTR_VM, + "lookup: tree %p, " KFRMT64(index) ", level %d, slot %d, rnode %p", + rtree, KSPLT64L(index), KSPLT64H(index), level, slot, + rnode); + CTR2(KTR_VM, "lookup: rnode %p, child %p", rnode, + rnode->rn_child[slot]); + if (level == 0) + return vm_radix_match(rnode->rn_child[slot], color); + rnode = rnode->rn_child[slot]; + level--; + } + CTR3(KTR_VM, "lookup: tree %p, " KFRMT64(index) " failed", rtree, + KSPLT64L(index), KSPLT64H(index)); + + return NULL; +} + +void * +vm_radix_color(struct vm_radix *rtree, vm_pindex_t index, int color) +{ + struct vm_radix_node *rnode; + uintptr_t child; + int slot; + int level; + + level = vm_radix_height(rtree, &rnode); + if (index > VM_RADIX_MAX(level)) + return NULL; + level--; + while (rnode) { + slot = vm_radix_slot(index, level); + CTR6(KTR_VM, + "color: tree %p, " KFRMT64(index) ", level %d, slot %d, rnode %p", + rtree, KSPLT64L(index), KSPLT64H(index), level, slot, + rnode); + CTR2(KTR_VM, "color: rnode %p, child %p", rnode, + rnode->rn_child[slot]); + if (level == 0) + break; + rnode = rnode->rn_child[slot]; + level--; + } + if (rnode == NULL || rnode->rn_child[slot] == NULL) + return (NULL); + child = (uintptr_t)rnode->rn_child[slot]; + child &= ~VM_RADIX_FLAGS; + rnode->rn_child[slot] = (void *)(child | color); + + return (void *)child; +} + +/* + * Find the first leaf with a valid node between *startp and end. Return + * the index of the first valid item in the leaf in *startp. 
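The lookup itself is a plain fixed-stride trie walk: peel VM_RADIX_WIDTH bits per level, follow that child, and read the leaf slot at level 0. The toy trie below performs the same descent without colors, locking or error handling; WIDTH is shrunk to 2 so an 8-bit key fits a height-4 tree, and intermediate nodes are leaked for brevity.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define WIDTH   2
#define COUNT   (1 << WIDTH)
#define MASK    (COUNT - 1)

struct node {
        void *child[COUNT];
};

static int
slot(uint32_t index, int level)
{

        return ((int)((index >> (level * WIDTH)) & MASK));
}

/* Insert val at index into a trie of the given height (no error handling). */
static void
trie_insert(struct node *root, int height, uint32_t index, void *val)
{
        struct node *n = root;
        int level;

        for (level = height - 1; level > 0; level--) {
                if (n->child[slot(index, level)] == NULL)
                        n->child[slot(index, level)] =
                            calloc(1, sizeof(struct node));
                n = n->child[slot(index, level)];
        }
        n->child[slot(index, 0)] = val;
}

/* Walk one level per WIDTH bits; the answer lives in the level-0 slot. */
static void *
trie_lookup(struct node *root, int height, uint32_t index)
{
        struct node *n = root;
        int level;

        for (level = height - 1; n != NULL; level--) {
                if (level == 0)
                        return (n->child[slot(index, 0)]);
                n = n->child[slot(index, level)];
        }
        return (NULL);
}

int
main(void)
{
        struct node root = { { NULL } };
        int value = 99;

        trie_insert(&root, 4, 0x36, &value);    /* height 4: 8-bit keys */
        printf("0x36 -> %d\n", *(int *)trie_lookup(&root, 4, 0x36));
        printf("0x37 -> %p\n", trie_lookup(&root, 4, 0x37));
        return (0);
}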
+ */ +static struct vm_radix_node * +vm_radix_leaf(struct vm_radix *rtree, vm_pindex_t *startp, vm_pindex_t end) +{ + struct vm_radix_node *rnode; + vm_pindex_t start; + vm_pindex_t inc; + int slot; + int level; + + start = *startp; +restart: + level = vm_radix_height(rtree, &rnode); + if (start > VM_RADIX_MAX(level) || (end && start >= end)) { + rnode = NULL; + goto out; + } + /* + * Search the tree from the top for any leaf node holding an index + * between start and end. + */ + for (level--; level; level--) { + slot = vm_radix_slot(start, level); + CTR6(KTR_VM, + "leaf: tree %p, " KFRMT64(start) ", level %d, slot %d, rnode %p", + rtree, KSPLT64L(start), KSPLT64H(start), level, slot, + rnode); + CTR2(KTR_VM, "leaf: rnode %p, child %p", rnode, + rnode->rn_child[slot]); + if (rnode->rn_child[slot] != NULL) { + rnode = rnode->rn_child[slot]; + continue; + } + /* + * Calculate how much to increment our index by + * based on the tree level. We must truncate the + * lower bits to start from the begnning of the + * next leaf. + */ + inc = 1LL << (level * VM_RADIX_WIDTH); + start &= ~VM_RADIX_MAX(level); + + /* Avoid start address wrapping up. */ + if ((VM_RADIX_MAXVAL - start) < inc) { + rnode = NULL; + goto out; + } + start += inc; + slot++; + CTR6(KTR_VM, + "leaf: " KFRMT64(start) ", " KFRMT64(end) ", " KFRMT64(inc), + KSPLT64L(start), KSPLT64H(start), KSPLT64L(end), + KSPLT64H(end), KSPLT64L(inc), KSPLT64H(inc)); + CTR2(KTR_VM, "leaf: level %d, slot %d", level, slot); + for (; slot < VM_RADIX_COUNT; slot++, start += inc) { + if (end != 0 && start >= end) { + rnode = NULL; + goto out; + } + if (rnode->rn_child[slot]) { + rnode = rnode->rn_child[slot]; + break; + } + if ((VM_RADIX_MAXVAL - start) < inc) { + rnode = NULL; + goto out; + } + } + if (slot == VM_RADIX_COUNT) + goto restart; + } + +out: + *startp = start; + return (rnode); +} + + + +/* + * Looks up as many as cnt values between start and end, and stores + * them in the caller allocated array out. The next index can be used + * to restart the scan. This optimizes forward scans in the tree. + */ +int +vm_radix_lookupn(struct vm_radix *rtree, vm_pindex_t start, + vm_pindex_t end, int color, void **out, int cnt, vm_pindex_t *next, + u_int *exhausted) +{ + struct vm_radix_node *rnode; + void *val; + int slot; + int outidx; + + CTR5(KTR_VM, "lookupn: tree %p, " KFRMT64(start) ", " KFRMT64(end), + rtree, KSPLT64L(start), KSPLT64H(start), KSPLT64L(end), + KSPLT64H(end)); + if (end == 0) + *exhausted = 0; + if (rtree->rt_root == 0) + return (0); + outidx = 0; + while ((rnode = vm_radix_leaf(rtree, &start, end)) != NULL) { + slot = vm_radix_slot(start, 0); + for (; slot < VM_RADIX_COUNT; slot++, start++) { + if (end != 0 && start >= end) + goto out; + val = vm_radix_match(rnode->rn_child[slot], color); + if (val == NULL) { + + /* + * The start address can wrap at the + * VM_RADIX_MAXVAL value. + * We need to make sure that start address + * point to the next chunk (even if wrapping) + * to stay consistent with default scanning + * behaviour. Also, because of the nature + * of the wrapping, the wrap up checks must + * be done after all the necessary controls + * on start are completed. 
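When vm_radix_leaf() finds an empty subtree it does not step the search index one value at a time: it truncates the low bits back to the start of the current subtree and jumps to the first index of the next one, giving up if the addition would wrap past VM_RADIX_MAXVAL. The arithmetic in isolation, with small made-up values:

#include <stdint.h>
#include <stdio.h>

#define WIDTH   4

static uint64_t
maxval(int h)
{

        return (((uint64_t)1 << (h * WIDTH)) - 1);
}

/* Advance *start to the first index of the next subtree at "level". */
static int
next_subtree(uint64_t *start, int level)
{
        uint64_t inc = (uint64_t)1 << (level * WIDTH);

        *start &= ~maxval(level);       /* back to the start of this subtree */
        if (UINT64_MAX - *start < inc)
                return (0);             /* would wrap: the scan is exhausted */
        *start += inc;
        return (1);
}

int
main(void)
{
        uint64_t start = 0x12345;

        /* The level-2 subtree holding 0x12345 is empty: jump to 0x12400. */
        if (next_subtree(&start, 2))
                printf("next start 0x%jx\n", (uintmax_t)start);
        return (0);
}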
+ */ + if ((VM_RADIX_MAXVAL - start) == 0) { + start++; + if (end == 0) + *exhausted = 1; + goto out; + } + continue; + } + CTR5(KTR_VM, + "lookupn: tree %p " KFRMT64(index) " slot %d found child %p", + rtree, KSPLT64L(start), KSPLT64H(start), slot, val); + out[outidx] = val; + if (++outidx == cnt || + (VM_RADIX_MAXVAL - start) == 0) { + start++; + if ((VM_RADIX_MAXVAL - start) == 0 && end == 0) + *exhausted = 1; + goto out; + } + } + MPASS((VM_RADIX_MAXVAL - start) != 0); + if (end != 0 && start >= end) + break; + } +out: + *next = start; + return (outidx); +} + +#if 0 +void +vm_radix_foreach(struct vm_radix *rtree, vm_pindex_t start, vm_pindex_t end, + int color, void (*iter)(void *)) +{ + struct vm_radix_node *rnode; + void *val; + int slot; + + if (rtree->rt_root == 0) + return; + while ((rnode = vm_radix_leaf(rtree, &start, end)) != NULL) { + slot = vm_radix_slot(start, 0); + for (; slot < VM_RADIX_COUNT; slot++, start++) { + if (end != 0 && start >= end) + return; + val = vm_radix_match(rnode->rn_child[slot], color); + if (val) + iter(val); + } + if (end != 0 && start >= end) + return; + } +} +#endif + + +/* + * Look up any entry at a position less than or equal to index. + */ +void * +vm_radix_lookup_le(struct vm_radix *rtree, vm_pindex_t index, int color) +{ + struct vm_radix_node *rnode; + struct vm_radix_node *child; + vm_pindex_t max; + vm_pindex_t inc; + void *val; + int slot; + int level; + + CTR3(KTR_VM, "lookup_le: tree %p, " KFRMT64(index), rtree, + KSPLT64L(index), KSPLT64H(index)); +restart: + level = vm_radix_height(rtree, &rnode); + if (rnode == NULL) + return (NULL); + max = VM_RADIX_MAX(level); + if (index > max || index == 0) + index = max; + /* + * Search the tree from the top for any leaf node holding an index + * lower than 'index'. + */ + level--; + while (rnode) { + slot = vm_radix_slot(index, level); + CTR6(KTR_VM, + "lookup_le: tree %p, " KFRMT64(index) ", level %d, slot %d, rnode %p", + rtree, KSPLT64L(index), KSPLT64H(index), level, slot, + rnode); + CTR2(KTR_VM, "lookup_le: rnode %p, child %p", rnode, + rnode->rn_child[slot]); + if (level == 0) + break; + /* + * If we don't have an exact match we must start our search + * from the next leaf and adjust our index appropriately. + */ + if ((child = rnode->rn_child[slot]) == NULL) { + /* + * Calculate how much to decrement our index by + * based on the tree level. We must set the + * lower bits to start from the end of the next + * leaf. + */ + inc = 1LL << (level * VM_RADIX_WIDTH); + index |= VM_RADIX_MAX(level); + index -= inc; + slot--; + CTR6(KTR_VM, + "lookup_le: " KFRMT64(start) ", " KFRMT64(inc) ", level %d slot %d", + KSPLT64L(index), KSPLT64H(index), KSPLT64L(inc), + KSPLT64H(inc), level, slot); + for (; slot >= 0; slot--, index -= inc) { + child = rnode->rn_child[slot]; + if (child) + break; + } + } + rnode = child; + level--; + } + if (rnode) { + for (; slot >= 0; slot--, index--) { + val = vm_radix_match(rnode->rn_child[slot], color); + if (val) + return (val); + } + } + if (index != -1) + goto restart; + return (NULL); +} + +/* + * Remove an entry from the tree. Expects the entry to be already present. + */ +void +vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index, int color) +{ + + vm_radix_sweep(rtree, index, color, 1); +} + +/* + * Remove and free all the nodes from the radix tree. + * This function is recrusive but there is a tight control on it as the + * maximum depth of the tree is fixed. 
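vm_radix_reclaim_allnodes() is the one recursive routine, and its depth is bounded by the fixed maximum height of the tree, so stack use stays predictable. A toy teardown over the same kind of fixed-stride node; level-0 children are deliberately left alone because, as in the kernel, they are stored values rather than tree nodes.

#include <stdio.h>
#include <stdlib.h>

#define COUNT   16

struct node {
        void *child[COUNT];
        int count;              /* number of non-NULL children */
};

/* Free a subtree; recursion depth is capped by the tree's fixed height. */
static void
reclaim(struct node *n, int level)
{
        int slot;

        if (level > 0) {
                for (slot = 0; slot < COUNT && n->count != 0; slot++) {
                        if (n->child[slot] == NULL)
                                continue;
                        reclaim(n->child[slot], level - 1);
                        n->count--;
                }
        }
        free(n);
}

int
main(void)
{
        struct node *root = calloc(1, sizeof(*root));
        struct node *leaf = calloc(1, sizeof(*leaf));

        if (root == NULL || leaf == NULL)
                return (1);
        root->child[3] = leaf;
        root->count = 1;
        reclaim(root, 1);       /* frees the leaf node, then the root */
        printf("tree reclaimed\n");
        return (0);
}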
+ */ +void +vm_radix_reclaim_allnodes(struct vm_radix *rtree) +{ + struct vm_radix_node *root; + int level; + + if (rtree->rt_root == 0) + return; + level = vm_radix_height(rtree, &root); + vm_radix_reclaim_allnodes_internal(root, level - 1); + rtree->rt_root = 0; +} + +#ifdef notyet +/* + * Attempts to reduce the height of the tree. + */ +void +vm_radix_shrink(struct vm_radix *rtree) +{ + struct vm_radix_node *tmp, *root; + int level; + + if (rtree->rt_root == 0) + return; + level = vm_radix_height(rtree, &root); + + /* Adjust the height of the tree. */ + while (root->rn_count == 1 && root->rn_child[0] != NULL) { + tmp = root; + root->rn_count--; + root = root->rn_child[0]; + level--; + vm_radix_node_put(tmp); + } + /* Finally see if we have an empty tree. */ + if (root->rn_count == 0) { + vm_radix_node_put(root); + root = NULL; + level--; + } + vm_radix_setroot(rtree, root, level); +} +#endif diff --git a/sys/vm/vm_radix.h b/sys/vm/vm_radix.h new file mode 100644 index 0000000..72a77e6 --- /dev/null +++ b/sys/vm/vm_radix.h @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org> + * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#ifndef _VM_RADIX_H_ +#define _VM_RADIX_H_ + +#define VM_RADIX_BLACK 0x1 /* Black node. (leaf only) */ +#define VM_RADIX_RED 0x2 /* Red node. (leaf only) */ +#define VM_RADIX_ANY (VM_RADIX_RED | VM_RADIX_BLACK) +#define VM_RADIX_STACK 8 /* Nodes to store on stack. */ + +/* + * Radix tree root. The height and pointer are set together to permit + * coherent lookups while the root is modified. + */ +struct vm_radix { + uintptr_t rt_root; /* root + height */ +}; + +#ifdef _KERNEL + +/* + * Initialize the radix tree subsystem. + */ +void vm_radix_init(void); + +/* + * Functions which only work with black nodes. (object lock) + */ +int vm_radix_insert(struct vm_radix *, vm_pindex_t, void *); + +/* + * Functions which work on specified colors. 
(object, vm_page_queue_free locks) + */ +void *vm_radix_color(struct vm_radix *, vm_pindex_t, int); +void *vm_radix_lookup(struct vm_radix *, vm_pindex_t, int); +int vm_radix_lookupn(struct vm_radix *, vm_pindex_t, vm_pindex_t, int, + void **, int, vm_pindex_t *, u_int *); +void *vm_radix_lookup_le(struct vm_radix *, vm_pindex_t, int); +void vm_radix_reclaim_allnodes(struct vm_radix *); +void vm_radix_remove(struct vm_radix *, vm_pindex_t, int); + +/* + * Look up any entry at a position greater or equal to index. + */ +static inline void * +vm_radix_lookup_ge(struct vm_radix *rtree, vm_pindex_t index, int color) +{ + void *val; + u_int dummy; + + if (vm_radix_lookupn(rtree, index, 0, color, &val, 1, &index, &dummy)) + return (val); + return (NULL); +} + +static inline void * +vm_radix_last(struct vm_radix *rtree, int color) +{ + + return vm_radix_lookup_le(rtree, 0, color); +} + +static inline void * +vm_radix_first(struct vm_radix *rtree, int color) +{ + + return vm_radix_lookup_ge(rtree, 0, color); +} + +static inline void * +vm_radix_next(struct vm_radix *rtree, vm_pindex_t index, int color) +{ + + if (index == -1) + return (NULL); + return vm_radix_lookup_ge(rtree, index + 1, color); +} + +static inline void * +vm_radix_prev(struct vm_radix *rtree, vm_pindex_t index, int color) +{ + + if (index == 0) + return (NULL); + return vm_radix_lookup_le(rtree, index - 1, color); +} + +#endif /* _KERNEL */ +#endif /* !_VM_RADIX_H_ */ diff --git a/sys/vm/vm_reserv.c b/sys/vm/vm_reserv.c index e5ac1a5..549e710 100644 --- a/sys/vm/vm_reserv.c +++ b/sys/vm/vm_reserv.c @@ -341,33 +341,21 @@ vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t pindex, u_long npages, /* * Look for an existing reservation. */ - msucc = NULL; - mpred = object->root; - while (mpred != NULL) { + mpred = vm_radix_lookup_le(&object->rtree, pindex, VM_RADIX_BLACK); + if (mpred != NULL) { KASSERT(mpred->pindex != pindex, ("vm_reserv_alloc_contig: pindex already allocated")); rv = vm_reserv_from_page(mpred); if (rv->object == object && vm_reserv_has_pindex(rv, pindex)) goto found; - else if (mpred->pindex < pindex) { - if (msucc != NULL || - (msucc = TAILQ_NEXT(mpred, listq)) == NULL) - break; - KASSERT(msucc->pindex != pindex, - ("vm_reserv_alloc_contig: pindex already allocated")); - rv = vm_reserv_from_page(msucc); - if (rv->object == object && - vm_reserv_has_pindex(rv, pindex)) - goto found; - else if (pindex < msucc->pindex) - break; - } else if (msucc == NULL) { - msucc = mpred; - mpred = TAILQ_PREV(msucc, pglist, listq); - continue; - } - msucc = NULL; - mpred = object->root = vm_page_splay(pindex, object->root); + } + msucc = vm_radix_lookup_ge(&object->rtree, pindex, VM_RADIX_BLACK); + if (msucc != NULL) { + KASSERT(msucc->pindex != pindex, + ("vm_reserv_alloc_page: pindex already allocated")); + rv = vm_reserv_from_page(msucc); + if (rv->object == object && vm_reserv_has_pindex(rv, pindex)) + goto found; } /* @@ -507,33 +495,21 @@ vm_reserv_alloc_page(vm_object_t object, vm_pindex_t pindex) /* * Look for an existing reservation. 
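In vm_reserv_alloc_contig() and vm_reserv_alloc_page() the old splay-and-walk hunt for the resident pages neighboring pindex collapses into one less-or-equal and one greater-or-equal lookup, and either neighbor may belong to a reservation that already covers the index. A sketch of that two-sided neighbor search, with a sorted array standing in for the tree:

#include <stdio.h>

static const unsigned long keys[] = { 2, 8, 15, 31, 64 };
static const int nkeys = sizeof(keys) / sizeof(keys[0]);

/* Greatest key <= pindex, or -1 if there is none. */
static long
lookup_le(unsigned long pindex)
{
        long best = -1;
        int i;

        for (i = 0; i < nkeys && keys[i] <= pindex; i++)
                best = (long)keys[i];
        return (best);
}

/* Least key >= pindex, or -1 if there is none. */
static long
lookup_ge(unsigned long pindex)
{
        int i;

        for (i = 0; i < nkeys; i++)
                if (keys[i] >= pindex)
                        return ((long)keys[i]);
        return (-1);
}

int
main(void)
{
        unsigned long pindex = 20;

        /* Both neighbors are candidates for an existing reservation. */
        printf("mpred %ld, msucc %ld\n", lookup_le(pindex), lookup_ge(pindex));
        return (0);
}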
*/ - msucc = NULL; - mpred = object->root; - while (mpred != NULL) { + mpred = vm_radix_lookup_le(&object->rtree, pindex, VM_RADIX_BLACK); + if (mpred != NULL) { KASSERT(mpred->pindex != pindex, ("vm_reserv_alloc_page: pindex already allocated")); rv = vm_reserv_from_page(mpred); if (rv->object == object && vm_reserv_has_pindex(rv, pindex)) goto found; - else if (mpred->pindex < pindex) { - if (msucc != NULL || - (msucc = TAILQ_NEXT(mpred, listq)) == NULL) - break; - KASSERT(msucc->pindex != pindex, - ("vm_reserv_alloc_page: pindex already allocated")); - rv = vm_reserv_from_page(msucc); - if (rv->object == object && - vm_reserv_has_pindex(rv, pindex)) - goto found; - else if (pindex < msucc->pindex) - break; - } else if (msucc == NULL) { - msucc = mpred; - mpred = TAILQ_PREV(msucc, pglist, listq); - continue; - } - msucc = NULL; - mpred = object->root = vm_page_splay(pindex, object->root); + } + msucc = vm_radix_lookup_ge(&object->rtree, pindex, VM_RADIX_BLACK); + if (msucc != NULL) { + KASSERT(msucc->pindex != pindex, + ("vm_reserv_alloc_page: pindex already allocated")); + rv = vm_reserv_from_page(msucc); + if (rv->object == object && vm_reserv_has_pindex(rv, pindex)) + goto found; } /* diff --git a/sys/vm/vnode_pager.c b/sys/vm/vnode_pager.c index a6d78f4..1e3f4b4 100644 --- a/sys/vm/vnode_pager.c +++ b/sys/vm/vnode_pager.c @@ -373,6 +373,7 @@ vnode_pager_setsize(vp, nsize) vm_ooffset_t nsize; { vm_object_t object; + struct vnode *drop; vm_page_t m; vm_pindex_t nobjsize; @@ -398,16 +399,41 @@ vnode_pager_setsize(vp, nsize) /* * this gets rid of garbage at the end of a page that is now * only partially backed by the vnode. - * - * XXX for some reason (I don't know yet), if we take a - * completely invalid page and mark it partially valid - * it can screw up NFS reads, so we don't allow the case. */ if ((nsize & PAGE_MASK) && - (m = vm_page_lookup(object, OFF_TO_IDX(nsize))) != NULL && - m->valid != 0) { - int base = (int)nsize & PAGE_MASK; - int size = PAGE_SIZE - base; + (m = vm_radix_lookup(&object->rtree, OFF_TO_IDX(nsize), + VM_RADIX_ANY)) != NULL) { + int base; + int size; + + /* + * Eliminate any cached page as we would have to + * do too much work to save it. + */ + if (m->flags & PG_CACHED) { + drop = NULL; + mtx_lock(&vm_page_queue_free_mtx); + if (m->object == object) { + vm_page_cache_free(m); + if (object->cached_page_count == 0) + drop = vp; + } + mtx_unlock(&vm_page_queue_free_mtx); + if (drop) + vdrop(drop); + goto out; + } + /* + * XXX for some reason (I don't know yet), if we take a + * completely invalid page and mark it partially valid + * it can screw up NFS reads, so we don't allow the + * case. + */ + if (m->valid != 0 || m->object != object) + goto out; + + base = (int)nsize & PAGE_MASK; + size = PAGE_SIZE - base; /* * Clear out partial-page garbage in case @@ -437,12 +463,9 @@ vnode_pager_setsize(vp, nsize) * replacement from working properly. */ vm_page_clear_dirty(m, base, PAGE_SIZE - base); - } else if ((nsize & PAGE_MASK) && - vm_page_is_cached(object, OFF_TO_IDX(nsize))) { - vm_page_cache_free(object, OFF_TO_IDX(nsize), - nobjsize); } } +out: object->un_pager.vnp.vnp_size = nsize; object->size = nobjsize; VM_OBJECT_UNLOCK(object); |