author | markj <markj@FreeBSD.org> | 2017-05-23 07:27:30 +0000
committer | markj <markj@FreeBSD.org> | 2017-05-23 07:27:30 +0000
commit | b1d352b152cd31bddbbb83670b7f45a213ca52cb (patch)
tree | d6aa7d62302443402a45530a50b241fd0f5128b9 /sys
parent | a6749049ae872846ba11cc9c7c14e8f8d61425f1 (diff)
download | FreeBSD-src-b1d352b152cd31bddbbb83670b7f45a213ca52cb.zip FreeBSD-src-b1d352b152cd31bddbbb83670b7f45a213ca52cb.tar.gz
MFC r308474, r308691, r309203, r309365, r309703, r309898, r310720,
r308489, r308706:
Add PQ_LAUNDRY and remove PG_CACHED pages.
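
This change retires the old page cache (PG_CACHED pages and the per-object cache radix trie) and introduces a dedicated laundry queue (PQ_LAUNDRY) for dirty pages awaiting writeback; callers now hand such pages to vm_page_launder() under the page lock. A minimal, hypothetical sketch of that call pattern follows (the helper name is invented; the lock/launder/unlock sequence mirrors the swp_pager_force_pagein() hunk in sys/vm/swap_pager.c below):

```c
/*
 * Hypothetical helper, not part of this commit: queue a dirty, managed
 * page for laundering so the laundry thread will write it back later.
 * The locking pattern follows the swap_pager.c hunk in this diff.
 */
#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <vm/vm.h>
#include <vm/vm_page.h>

static void
example_queue_for_laundering(vm_page_t m)
{
	vm_page_lock(m);
	vm_page_launder(m);	/* enqueue on PQ_LAUNDRY, added by this change */
	vm_page_unlock(m);
}
```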
Diffstat (limited to 'sys')
-rw-r--r-- | sys/amd64/amd64/pmap.c | 42
-rw-r--r-- | sys/arm64/arm64/pmap.c | 31
-rw-r--r-- | sys/cddl/compat/opensolaris/sys/vnode.h | 3
-rw-r--r-- | sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c | 4
-rw-r--r-- | sys/fs/tmpfs/tmpfs_subr.c | 9
-rw-r--r-- | sys/i386/i386/pmap.c | 42
-rw-r--r-- | sys/kern/kern_exec.c | 2
-rw-r--r-- | sys/kern/uipc_shm.c | 9
-rw-r--r-- | sys/sys/vmmeter.h | 24
-rw-r--r-- | sys/vm/_vm_radix.h | 4
-rw-r--r-- | sys/vm/swap_pager.c | 33
-rw-r--r-- | sys/vm/vm_fault.c | 12
-rw-r--r-- | sys/vm/vm_map.c | 4
-rw-r--r-- | sys/vm/vm_meter.c | 38
-rw-r--r-- | sys/vm/vm_mmap.c | 3
-rw-r--r-- | sys/vm/vm_object.c | 56
-rw-r--r-- | sys/vm/vm_object.h | 19
-rw-r--r-- | sys/vm/vm_page.c | 764
-rw-r--r-- | sys/vm/vm_page.h | 49
-rw-r--r-- | sys/vm/vm_pageout.c | 693
-rw-r--r-- | sys/vm/vm_phys.c | 2
-rw-r--r-- | sys/vm/vm_radix.c | 75
-rw-r--r-- | sys/vm/vm_radix.h | 2
-rw-r--r-- | sys/vm/vm_reserv.c | 109
-rw-r--r-- | sys/vm/vm_reserv.h | 3
-rw-r--r-- | sys/vm/vnode_pager.c | 10
26 files changed, 938 insertions, 1104 deletions
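
Among the supporting cleanups, the sysctl rework in sys/vm/vm_meter.c exports vm_meter_cnt(), and sys/sys/vmmeter.h gains a VM_METER_PCPU_CNT() wrapper that sums a per-CPU counter across the global vm_cnt and every CPU's pcpu copy. A hypothetical in-kernel consumer (function name invented for illustration) might read a "(p)" per-CPU statistic like this:

```c
/*
 * Hypothetical sketch, not part of this commit: total up a per-CPU
 * vmmeter counter using the VM_METER_PCPU_CNT() macro added to
 * sys/sys/vmmeter.h by this change.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/vmmeter.h>

static u_int
example_total_pdpages(void)
{
	/* v_pdpages is a "(p)" per-CPU field in struct vmmeter. */
	return (VM_METER_PCPU_CNT(v_pdpages));
}
```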
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 45d4c1e..551413f 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -614,7 +614,6 @@ static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte); static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); -static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va); static void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask); static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, struct rwlock **lockp); @@ -625,7 +624,7 @@ static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free, struct rwlock **lockp); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp); -static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte); +static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, struct spglist *free); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, @@ -2218,29 +2217,17 @@ pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) } /* - * Looks for a page table page mapping the specified virtual address in the - * specified pmap's collection of idle page table pages. Returns NULL if there - * is no page table page corresponding to the specified virtual address. + * Removes the page table page mapping the specified virtual address from the + * specified pmap's collection of idle page table pages, and returns it. + * Otherwise, returns NULL if there is no page table page corresponding to the + * specified virtual address. */ static __inline vm_page_t -pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va) +pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); - return (vm_radix_lookup(&pmap->pm_root, pmap_pde_pindex(va))); -} - -/* - * Removes the specified page table page from the specified pmap's collection - * of idle page table pages. The specified page table page must be a member of - * the pmap's collection. 
- */ -static __inline void -pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte) -{ - - PMAP_LOCK_ASSERT(pmap, MA_OWNED); - vm_radix_remove(&pmap->pm_root, mpte->pindex); + return (vm_radix_remove(&pmap->pm_root, pmap_pde_pindex(va))); } /* @@ -3460,10 +3447,8 @@ pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, oldpde = *pde; KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); - if ((oldpde & PG_A) != 0 && (mpte = pmap_lookup_pt_page(pmap, va)) != - NULL) - pmap_remove_pt_page(pmap, mpte); - else { + if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == + NULL) { KASSERT((oldpde & PG_W) == 0, ("pmap_demote_pde: page table page for a wired mapping" " is missing")); @@ -3577,11 +3562,10 @@ pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); - mpte = pmap_lookup_pt_page(pmap, va); + mpte = pmap_remove_pt_page(pmap, va); if (mpte == NULL) panic("pmap_remove_kernel_pde: Missing pt page."); - pmap_remove_pt_page(pmap, mpte); mptepa = VM_PAGE_TO_PHYS(mpte); newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V; @@ -3668,9 +3652,8 @@ pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, if (pmap == kernel_pmap) { pmap_remove_kernel_pde(pmap, pdq, sva); } else { - mpte = pmap_lookup_pt_page(pmap, sva); + mpte = pmap_remove_pt_page(pmap, sva); if (mpte != NULL) { - pmap_remove_pt_page(pmap, mpte); pmap_resident_count_dec(pmap, 1); KASSERT(mpte->wire_count == NPTEPG, ("pmap_remove_pde: pte page wire count error")); @@ -5533,9 +5516,8 @@ pmap_remove_pages(pmap_t pmap) TAILQ_EMPTY(&mt->md.pv_list)) vm_page_aflag_clear(mt, PGA_WRITEABLE); } - mpte = pmap_lookup_pt_page(pmap, pv->pv_va); + mpte = pmap_remove_pt_page(pmap, pv->pv_va); if (mpte != NULL) { - pmap_remove_pt_page(pmap, mpte); pmap_resident_count_dec(pmap, 1); KASSERT(mpte->wire_count == NPTEPG, ("pmap_remove_pages: pte page wire count error")); diff --git a/sys/arm64/arm64/pmap.c b/sys/arm64/arm64/pmap.c index 6839758..5e2e7fc 100644 --- a/sys/arm64/arm64/pmap.c +++ b/sys/arm64/arm64/pmap.c @@ -2514,29 +2514,17 @@ pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) } /* - * Looks for a page table page mapping the specified virtual address in the - * specified pmap's collection of idle page table pages. Returns NULL if there - * is no page table page corresponding to the specified virtual address. + * Removes the page table page mapping the specified virtual address from the + * specified pmap's collection of idle page table pages, and returns it. + * Otherwise, returns NULL if there is no page table page corresponding to the + * specified virtual address. */ static __inline vm_page_t -pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va) +pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); - return (vm_radix_lookup(&pmap->pm_root, pmap_l2_pindex(va))); -} - -/* - * Removes the specified page table page from the specified pmap's collection - * of idle page table pages. The specified page table page must be a member of - * the pmap's collection. 
- */ -static __inline void -pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte) -{ - - PMAP_LOCK_ASSERT(pmap, MA_OWNED); - vm_radix_remove(&pmap->pm_root, mpte->pindex); + return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va))); } /* @@ -3605,10 +3593,9 @@ pmap_remove_pages(pmap_t pmap) TAILQ_EMPTY(&mt->md.pv_list)) vm_page_aflag_clear(mt, PGA_WRITEABLE); } - ml3 = pmap_lookup_pt_page(pmap, + ml3 = pmap_remove_pt_page(pmap, pv->pv_va); if (ml3 != NULL) { - pmap_remove_pt_page(pmap, ml3); pmap_resident_count_dec(pmap,1); KASSERT(ml3->wire_count == NL3PG, ("pmap_remove_pages: l3 page wire count error")); @@ -4381,9 +4368,7 @@ pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va, return (NULL); } - if ((ml3 = pmap_lookup_pt_page(pmap, va)) != NULL) { - pmap_remove_pt_page(pmap, ml3); - } else { + if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) { ml3 = vm_page_alloc(NULL, pmap_l2_pindex(va), (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); diff --git a/sys/cddl/compat/opensolaris/sys/vnode.h b/sys/cddl/compat/opensolaris/sys/vnode.h index e7a92ae..d15cd88 100644 --- a/sys/cddl/compat/opensolaris/sys/vnode.h +++ b/sys/cddl/compat/opensolaris/sys/vnode.h @@ -75,8 +75,7 @@ vn_is_readonly(vnode_t *vp) #define vn_mountedvfs(vp) ((vp)->v_mountedhere) #define vn_has_cached_data(vp) \ ((vp)->v_object != NULL && \ - ((vp)->v_object->resident_page_count > 0 || \ - !vm_object_cache_is_empty((vp)->v_object))) + (vp)->v_object->resident_page_count > 0) #define vn_exists(vp) do { } while (0) #define vn_invalid(vp) do { } while (0) #define vn_renamepath(tdvp, svp, tnm, lentnm) do { } while (0) diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c index 3a44201..b715a48 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c @@ -426,10 +426,6 @@ page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes) continue; } vm_page_sbusy(pp); - } else if (pp == NULL) { - pp = vm_page_alloc(obj, OFF_TO_IDX(start), - VM_ALLOC_SYSTEM | VM_ALLOC_IFCACHED | - VM_ALLOC_SBUSY); } else { ASSERT(pp != NULL && !pp->valid); pp = NULL; diff --git a/sys/fs/tmpfs/tmpfs_subr.c b/sys/fs/tmpfs/tmpfs_subr.c index f507807..2aa879a 100644 --- a/sys/fs/tmpfs/tmpfs_subr.c +++ b/sys/fs/tmpfs/tmpfs_subr.c @@ -1401,12 +1401,9 @@ retry: VM_WAIT; VM_OBJECT_WLOCK(uobj); goto retry; - } else if (m->valid != VM_PAGE_BITS_ALL) - rv = vm_pager_get_pages(uobj, &m, 1, - NULL, NULL); - else - /* A cached page was reactivated. 
*/ - rv = VM_PAGER_OK; + } + rv = vm_pager_get_pages(uobj, &m, 1, NULL, + NULL); vm_page_lock(m); if (rv == VM_PAGER_OK) { vm_page_deactivate(m); diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c index db71c4d..1da9241 100644 --- a/sys/i386/i386/pmap.c +++ b/sys/i386/i386/pmap.c @@ -306,7 +306,6 @@ static boolean_t pmap_is_modified_pvh(struct md_page *pvh); static boolean_t pmap_is_referenced_pvh(struct md_page *pvh); static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde); -static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va); static void pmap_pde_attr(pd_entry_t *pde, int cache_bits); static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, @@ -316,7 +315,7 @@ static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, struct spglist *free); -static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte); +static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); static void pmap_remove_page(struct pmap *pmap, vm_offset_t va, struct spglist *free); static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, @@ -1727,29 +1726,17 @@ pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) } /* - * Looks for a page table page mapping the specified virtual address in the - * specified pmap's collection of idle page table pages. Returns NULL if there - * is no page table page corresponding to the specified virtual address. + * Removes the page table page mapping the specified virtual address from the + * specified pmap's collection of idle page table pages, and returns it. + * Otherwise, returns NULL if there is no page table page corresponding to the + * specified virtual address. */ static __inline vm_page_t -pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va) +pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); - return (vm_radix_lookup(&pmap->pm_root, va >> PDRSHIFT)); -} - -/* - * Removes the specified page table page from the specified pmap's collection - * of idle page table pages. The specified page table page must be a member of - * the pmap's collection. 
- */ -static __inline void -pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte) -{ - - PMAP_LOCK_ASSERT(pmap, MA_OWNED); - vm_radix_remove(&pmap->pm_root, mpte->pindex); + return (vm_radix_remove(&pmap->pm_root, va >> PDRSHIFT)); } /* @@ -2645,10 +2632,8 @@ pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) oldpde = *pde; KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); - if ((oldpde & PG_A) != 0 && (mpte = pmap_lookup_pt_page(pmap, va)) != - NULL) - pmap_remove_pt_page(pmap, mpte); - else { + if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == + NULL) { KASSERT((oldpde & PG_W) == 0, ("pmap_demote_pde: page table page for a wired mapping" " is missing")); @@ -2786,11 +2771,10 @@ pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) vm_page_t mpte; PMAP_LOCK_ASSERT(pmap, MA_OWNED); - mpte = pmap_lookup_pt_page(pmap, va); + mpte = pmap_remove_pt_page(pmap, va); if (mpte == NULL) panic("pmap_remove_kernel_pde: Missing pt page."); - pmap_remove_pt_page(pmap, mpte); mptepa = VM_PAGE_TO_PHYS(mpte); newpde = mptepa | PG_M | PG_A | PG_RW | PG_V; @@ -2872,9 +2856,8 @@ pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, if (pmap == kernel_pmap) { pmap_remove_kernel_pde(pmap, pdq, sva); } else { - mpte = pmap_lookup_pt_page(pmap, sva); + mpte = pmap_remove_pt_page(pmap, sva); if (mpte != NULL) { - pmap_remove_pt_page(pmap, mpte); pmap->pm_stats.resident_count--; KASSERT(mpte->wire_count == NPTEPG, ("pmap_remove_pde: pte page wire count error")); @@ -4616,9 +4599,8 @@ pmap_remove_pages(pmap_t pmap) if (TAILQ_EMPTY(&mt->md.pv_list)) vm_page_aflag_clear(mt, PGA_WRITEABLE); } - mpte = pmap_lookup_pt_page(pmap, pv->pv_va); + mpte = pmap_remove_pt_page(pmap, pv->pv_va); if (mpte != NULL) { - pmap_remove_pt_page(pmap, mpte); pmap->pm_stats.resident_count--; KASSERT(mpte->wire_count == NPTEPG, ("pmap_remove_pages: pte page wire count error")); diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c index 7d9adb0..1a41aac 100644 --- a/sys/kern/kern_exec.c +++ b/sys/kern/kern_exec.c @@ -1006,7 +1006,7 @@ exec_map_first_page(imgp) break; } else { ma[i] = vm_page_alloc(object, i, - VM_ALLOC_NORMAL | VM_ALLOC_IFNOTCACHED); + VM_ALLOC_NORMAL); if (ma[i] == NULL) break; } diff --git a/sys/kern/uipc_shm.c b/sys/kern/uipc_shm.c index 0a45380..0aee62f 100644 --- a/sys/kern/uipc_shm.c +++ b/sys/kern/uipc_shm.c @@ -455,12 +455,9 @@ retry: VM_WAIT; VM_OBJECT_WLOCK(object); goto retry; - } else if (m->valid != VM_PAGE_BITS_ALL) - rv = vm_pager_get_pages(object, &m, 1, - NULL, NULL); - else - /* A cached page was reactivated. 
*/ - rv = VM_PAGER_OK; + } + rv = vm_pager_get_pages(object, &m, 1, NULL, + NULL); vm_page_lock(m); if (rv == VM_PAGER_OK) { vm_page_deactivate(m); diff --git a/sys/sys/vmmeter.h b/sys/sys/vmmeter.h index 55d3053..517be2d 100644 --- a/sys/sys/vmmeter.h +++ b/sys/sys/vmmeter.h @@ -75,9 +75,10 @@ struct vmmeter { u_int v_vnodepgsin; /* (p) vnode_pager pages paged in */ u_int v_vnodepgsout; /* (p) vnode pager pages paged out */ u_int v_intrans; /* (p) intransit blocking page faults */ - u_int v_reactivated; /* (f) pages reactivated from free list */ + u_int v_reactivated; /* (p) pages reactivated by the pagedaemon */ u_int v_pdwakeups; /* (p) times daemon has awaken from sleep */ u_int v_pdpages; /* (p) pages analyzed by daemon */ + u_int v_pdshortfalls; /* (p) page reclamation shortfalls */ u_int v_tcached; /* (p) total pages cached */ u_int v_dfree; /* (p) pages freed by daemon */ @@ -96,6 +97,7 @@ struct vmmeter { u_int v_active_count; /* (q) pages active */ u_int v_inactive_target; /* (c) pages desired inactive */ u_int v_inactive_count; /* (q) pages inactive */ + u_int v_laundry_count; /* (q) pages eligible for laundering */ u_int v_cache_count; /* (f) pages on cache queue */ u_int v_pageout_free_min; /* (c) min pages reserved for kernel */ u_int v_interrupt_free_min; /* (c) reserved pages for int code */ @@ -111,7 +113,6 @@ struct vmmeter { u_int v_vforkpages; /* (p) VM pages affected by vfork() */ u_int v_rforkpages; /* (p) VM pages affected by rfork() */ u_int v_kthreadpages; /* (p) VM pages affected by fork() by kernel */ - u_int v_spare[2]; }; #ifdef _KERNEL @@ -184,6 +185,25 @@ vm_paging_needed(void) (u_int)vm_pageout_wakeup_thresh); } +/* + * Return the number of pages we need to launder. + * A positive number indicates that we have a shortfall of clean pages. + */ +static inline int +vm_laundry_target(void) +{ + + return (vm_paging_target()); +} + +/* + * Obtain the value of a per-CPU counter. + */ +#define VM_METER_PCPU_CNT(member) \ + vm_meter_cnt(__offsetof(struct vmmeter, member)) + +u_int vm_meter_cnt(size_t); + #endif /* systemwide totals computed every five seconds */ diff --git a/sys/vm/_vm_radix.h b/sys/vm/_vm_radix.h index 1d06d0a..f066462 100644 --- a/sys/vm/_vm_radix.h +++ b/sys/vm/_vm_radix.h @@ -36,12 +36,8 @@ */ struct vm_radix { uintptr_t rt_root; - uint8_t rt_flags; }; -#define RT_INSERT_INPROG 0x01 -#define RT_TRIE_MODIFIED 0x02 - #ifdef _KERNEL static __inline boolean_t diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c index 30f6d97..0167117 100644 --- a/sys/vm/swap_pager.c +++ b/sys/vm/swap_pager.c @@ -1126,7 +1126,7 @@ swap_pager_getpages(vm_object_t object, vm_page_t *m, int count, int *rbehind, if (shift != 0) { for (i = 1; i <= shift; i++) { p = vm_page_alloc(object, m[0]->pindex - i, - VM_ALLOC_NORMAL | VM_ALLOC_IFNOTCACHED); + VM_ALLOC_NORMAL); if (p == NULL) { /* Shift allocated pages to the left. */ for (j = 0; j < i - 1; j++) @@ -1144,8 +1144,7 @@ swap_pager_getpages(vm_object_t object, vm_page_t *m, int count, int *rbehind, if (rahead != NULL) { for (i = 0; i < *rahead; i++) { p = vm_page_alloc(object, - m[reqcount - 1]->pindex + i + 1, - VM_ALLOC_NORMAL | VM_ALLOC_IFNOTCACHED); + m[reqcount - 1]->pindex + i + 1, VM_ALLOC_NORMAL); if (p == NULL) break; bp->b_pages[shift + reqcount + i] = p; @@ -1549,17 +1548,18 @@ swp_pager_async_iodone(struct buf *bp) * For write success, clear the dirty * status, then finish the I/O ( which decrements the * busy count and possibly wakes waiter's up ). 
+ * A page is only written to swap after a period of + * inactivity. Therefore, we do not expect it to be + * reused. */ KASSERT(!pmap_page_is_write_mapped(m), ("swp_pager_async_iodone: page %p is not write" " protected", m)); vm_page_undirty(m); + vm_page_lock(m); + vm_page_deactivate_noreuse(m); + vm_page_unlock(m); vm_page_sunbusy(m); - if (vm_page_count_severe()) { - vm_page_lock(m); - vm_page_try_to_cache(m); - vm_page_unlock(m); - } } } @@ -1635,12 +1635,15 @@ swap_pager_isswapped(vm_object_t object, struct swdevt *sp) /* * SWP_PAGER_FORCE_PAGEIN() - force a swap block to be paged in * - * This routine dissociates the page at the given index within a - * swap block from its backing store, paging it in if necessary. - * If the page is paged in, it is placed in the inactive queue, - * since it had its backing store ripped out from under it. - * We also attempt to swap in all other pages in the swap block, - * we only guarantee that the one at the specified index is + * This routine dissociates the page at the given index within an object + * from its backing store, paging it in if it does not reside in memory. + * If the page is paged in, it is marked dirty and placed in the laundry + * queue. The page is marked dirty because it no longer has backing + * store. It is placed in the laundry queue because it has not been + * accessed recently. Otherwise, it would already reside in memory. + * + * We also attempt to swap in all other pages in the swap block. + * However, we only guarantee that the one at the specified index is * paged in. * * XXX - The code to page the whole block in doesn't work, so we @@ -1669,7 +1672,7 @@ swp_pager_force_pagein(vm_object_t object, vm_pindex_t pindex) vm_object_pip_wakeup(object); vm_page_dirty(m); vm_page_lock(m); - vm_page_deactivate(m); + vm_page_launder(m); vm_page_unlock(m); vm_page_xunbusy(m); vm_pager_page_unswapped(m); diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c index ba0c775..2a90c15 100644 --- a/sys/vm/vm_fault.c +++ b/sys/vm/vm_fault.c @@ -485,11 +485,12 @@ int vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags, vm_page_t *m_hold) { - vm_prot_t prot; - vm_object_t next_object; struct faultstate fs; struct vnode *vp; + vm_object_t next_object, retry_object; vm_offset_t e_end, e_start; + vm_pindex_t retry_pindex; + vm_prot_t prot, retry_prot; int ahead, alloc_req, behind, cluster_offset, error, era, faultcount; int locked, nera, result, rv; u_char behavior; @@ -755,8 +756,7 @@ RetryFault:; unlock_and_deallocate(&fs); VM_WAITPFAULT; goto RetryFault; - } else if (fs.m->valid == VM_PAGE_BITS_ALL) - break; + } } readrest: @@ -1143,10 +1143,6 @@ readrest: * lookup. */ if (!fs.lookup_still_valid) { - vm_object_t retry_object; - vm_pindex_t retry_pindex; - vm_prot_t retry_prot; - if (!vm_map_trylock_read(fs.map)) { release_page(&fs); unlock_and_deallocate(&fs); diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c index cd72cf8..2296fb1 100644 --- a/sys/vm/vm_map.c +++ b/sys/vm/vm_map.c @@ -1858,9 +1858,7 @@ vm_map_submap( * limited number of page mappings are created at the low-end of the * specified address range. (For this purpose, a superpage mapping * counts as one page mapping.) Otherwise, all resident pages within - * the specified address range are mapped. Because these mappings are - * being created speculatively, cached pages are not reactivated and - * mapped. + * the specified address range are mapped. 
*/ static void vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot, diff --git a/sys/vm/vm_meter.c b/sys/vm/vm_meter.c index 5aa6085..6e2199e 100644 --- a/sys/vm/vm_meter.c +++ b/sys/vm/vm_meter.c @@ -209,29 +209,37 @@ vmtotal(SYSCTL_HANDLER_ARGS) } /* - * vcnt() - accumulate statistics from all cpus and the global cnt - * structure. + * vm_meter_cnt() - accumulate statistics from all cpus and the global cnt + * structure. * * The vmmeter structure is now per-cpu as well as global. Those * statistics which can be kept on a per-cpu basis (to avoid cache * stalls between cpus) can be moved to the per-cpu vmmeter. Remaining * statistics, such as v_free_reserved, are left in the global * structure. - * - * (sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req) */ -static int -vcnt(SYSCTL_HANDLER_ARGS) +u_int +vm_meter_cnt(size_t offset) { - int count = *(int *)arg1; - int offset = (char *)arg1 - (char *)&vm_cnt; + struct pcpu *pcpu; + u_int count; int i; + count = *(u_int *)((char *)&vm_cnt + offset); CPU_FOREACH(i) { - struct pcpu *pcpu = pcpu_find(i); - count += *(int *)((char *)&pcpu->pc_cnt + offset); + pcpu = pcpu_find(i); + count += *(u_int *)((char *)&pcpu->pc_cnt + offset); } - return (SYSCTL_OUT(req, &count, sizeof(int))); + return (count); +} + +static int +cnt_sysctl(SYSCTL_HANDLER_ARGS) +{ + u_int count; + + count = vm_meter_cnt((char *)arg1 - (char *)&vm_cnt); + return (SYSCTL_OUT(req, &count, sizeof(count))); } SYSCTL_PROC(_vm, VM_TOTAL, vmtotal, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE, @@ -246,8 +254,8 @@ SYSCTL_NODE(_vm_stats, OID_AUTO, misc, CTLFLAG_RW, 0, "VM meter misc stats"); #define VM_STATS(parent, var, descr) \ SYSCTL_PROC(parent, OID_AUTO, var, \ - CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, &vm_cnt.var, 0, vcnt, \ - "IU", descr) + CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, &vm_cnt.var, 0, \ + cnt_sysctl, "IU", descr) #define VM_STATS_VM(var, descr) VM_STATS(_vm_stats_vm, var, descr) #define VM_STATS_SYS(var, descr) VM_STATS(_vm_stats_sys, var, descr) @@ -271,9 +279,10 @@ VM_STATS_VM(v_vnodeout, "Vnode pager pageouts"); VM_STATS_VM(v_vnodepgsin, "Vnode pages paged in"); VM_STATS_VM(v_vnodepgsout, "Vnode pages paged out"); VM_STATS_VM(v_intrans, "In transit page faults"); -VM_STATS_VM(v_reactivated, "Pages reactivated from free list"); +VM_STATS_VM(v_reactivated, "Pages reactivated by pagedaemon"); VM_STATS_VM(v_pdwakeups, "Pagedaemon wakeups"); VM_STATS_VM(v_pdpages, "Pages analyzed by pagedaemon"); +VM_STATS_VM(v_pdshortfalls, "Page reclamation shortfalls"); VM_STATS_VM(v_tcached, "Total pages cached"); VM_STATS_VM(v_dfree, "Pages freed by pagedaemon"); VM_STATS_VM(v_pfree, "Pages freed by exiting processes"); @@ -288,6 +297,7 @@ VM_STATS_VM(v_wire_count, "Wired pages"); VM_STATS_VM(v_active_count, "Active pages"); VM_STATS_VM(v_inactive_target, "Desired inactive pages"); VM_STATS_VM(v_inactive_count, "Inactive pages"); +VM_STATS_VM(v_laundry_count, "Pages eligible for laundering"); VM_STATS_VM(v_cache_count, "Pages on cache queue"); VM_STATS_VM(v_pageout_free_min, "Min pages reserved for kernel"); VM_STATS_VM(v_interrupt_free_min, "Reserved pages for interrupt code"); diff --git a/sys/vm/vm_mmap.c b/sys/vm/vm_mmap.c index aae3771..68c2108 100644 --- a/sys/vm/vm_mmap.c +++ b/sys/vm/vm_mmap.c @@ -849,9 +849,6 @@ RestartScan: pindex = OFF_TO_IDX(current->offset + (addr - current->start)); m = vm_page_lookup(object, pindex); - if (m == NULL && - vm_page_is_cached(object, pindex)) - mincoreinfo = MINCORE_INCORE; if (m != NULL && m->valid == 0) m = 
NULL; if (m != NULL) diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c index 2e9d16f..6db1ac4 100644 --- a/sys/vm/vm_object.c +++ b/sys/vm/vm_object.c @@ -178,9 +178,6 @@ vm_object_zdtor(void *mem, int size, void *arg) ("object %p has reservations", object)); #endif - KASSERT(vm_object_cache_is_empty(object), - ("object %p has cached pages", - object)); KASSERT(object->paging_in_progress == 0, ("object %p paging_in_progress = %d", object, object->paging_in_progress)); @@ -208,12 +205,9 @@ vm_object_zinit(void *mem, int size, int flags) object->type = OBJT_DEAD; object->ref_count = 0; object->rtree.rt_root = 0; - object->rtree.rt_flags = 0; object->paging_in_progress = 0; object->resident_page_count = 0; object->shadow_count = 0; - object->cache.rt_root = 0; - object->cache.rt_flags = 0; mtx_lock(&vm_object_list_mtx); TAILQ_INSERT_TAIL(&vm_object_list, object, object_list); @@ -792,8 +786,6 @@ vm_object_terminate(vm_object_t object) if (__predict_false(!LIST_EMPTY(&object->rvq))) vm_reserv_break_all(object); #endif - if (__predict_false(!vm_object_cache_is_empty(object))) - vm_page_cache_free(object, 0, 0); KASSERT(object->cred == NULL || object->type == OBJT_DEFAULT || object->type == OBJT_SWAP, @@ -1135,13 +1127,6 @@ shadowlookup: } else if ((tobject->flags & OBJ_UNMANAGED) != 0) goto unlock_tobject; m = vm_page_lookup(tobject, tpindex); - if (m == NULL && advise == MADV_WILLNEED) { - /* - * If the page is cached, reactivate it. - */ - m = vm_page_alloc(tobject, tpindex, VM_ALLOC_IFCACHED | - VM_ALLOC_NOBUSY); - } if (m == NULL) { /* * There may be swap even if there is no backing page @@ -1371,7 +1356,7 @@ retry: goto retry; } - /* vm_page_rename() will handle dirty and cache. */ + /* vm_page_rename() will dirty the page. */ if (vm_page_rename(m, new_object, idx)) { VM_OBJECT_WUNLOCK(new_object); VM_OBJECT_WUNLOCK(orig_object); @@ -1406,19 +1391,6 @@ retry: swap_pager_copy(orig_object, new_object, offidxstart, 0); TAILQ_FOREACH(m, &new_object->memq, listq) vm_page_xunbusy(m); - - /* - * Transfer any cached pages from orig_object to new_object. - * If swap_pager_copy() found swapped out pages within the - * specified range of orig_object, then it changed - * new_object's type to OBJT_SWAP when it transferred those - * pages to new_object. Otherwise, new_object's type - * should still be OBJT_DEFAULT and orig_object should not - * contain any cached pages within the specified range. - */ - if (__predict_false(!vm_object_cache_is_empty(orig_object))) - vm_page_cache_transfer(orig_object, offidxstart, - new_object); } VM_OBJECT_WUNLOCK(orig_object); VM_OBJECT_WUNLOCK(new_object); @@ -1471,6 +1443,13 @@ vm_object_scan_all_shadowed(vm_object_t object) backing_object = object->backing_object; + /* + * Initial conditions: + * + * We do not want to have to test for the existence of swap + * pages in the backing object. XXX but with the new swapper this + * would be pretty easy to do. + */ if (backing_object->type != OBJT_DEFAULT && backing_object->type != OBJT_SWAP) return (false); @@ -1622,8 +1601,7 @@ vm_object_collapse_scan(vm_object_t object, int op) * backing object to the main object. * * If the page was mapped to a process, it can remain mapped - * through the rename. vm_page_rename() will handle dirty and - * cache. + * through the rename. vm_page_rename() will dirty the page. 
*/ if (vm_page_rename(p, object, new_pindex)) { next = vm_object_collapse_scan_wait(object, NULL, next, @@ -1758,13 +1736,6 @@ vm_object_collapse(vm_object_t object) backing_object, object, OFF_TO_IDX(object->backing_object_offset), TRUE); - - /* - * Free any cached pages from backing_object. - */ - if (__predict_false( - !vm_object_cache_is_empty(backing_object))) - vm_page_cache_free(backing_object, 0, 0); } /* * Object now shadows whatever backing_object did. @@ -1893,7 +1864,7 @@ vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end, (options & (OBJPR_CLEANONLY | OBJPR_NOTMAPPED)) == OBJPR_NOTMAPPED, ("vm_object_page_remove: illegal options for object %p", object)); if (object->resident_page_count == 0) - goto skipmemq; + return; vm_object_pip_add(object, 1); again: p = vm_page_find_least(object, start); @@ -1950,9 +1921,6 @@ next: vm_page_unlock(p); } vm_object_pip_wakeup(object); -skipmemq: - if (__predict_false(!vm_object_cache_is_empty(object))) - vm_page_cache_free(object, start, end); } /* @@ -2333,9 +2301,9 @@ sysctl_vm_object_list(SYSCTL_HANDLER_ARGS) * sysctl is only meant to give an * approximation of the system anyway. */ - if (m->queue == PQ_ACTIVE) + if (vm_page_active(m)) kvo.kvo_active++; - else if (m->queue == PQ_INACTIVE) + else if (vm_page_inactive(m)) kvo.kvo_inactive++; } diff --git a/sys/vm/vm_object.h b/sys/vm/vm_object.h index 5b65d76..9b2192e 100644 --- a/sys/vm/vm_object.h +++ b/sys/vm/vm_object.h @@ -79,17 +79,6 @@ * * vm_object_t Virtual memory object. * - * The root of cached pages pool is protected by both the per-object lock - * and the free pages queue mutex. - * On insert in the cache radix trie, the per-object lock is expected - * to be already held and the free pages queue mutex will be - * acquired during the operation too. - * On remove and lookup from the cache radix trie, only the free - * pages queue mutex is expected to be locked. - * These rules allow for reliably checking for the presence of cached - * pages with only the per-object lock held, thereby reducing contention - * for the free pages queue mutex. 
- * * List of locks * (c) const until freed * (o) per-object lock @@ -118,7 +107,6 @@ struct vm_object { vm_ooffset_t backing_object_offset;/* Offset in backing object */ TAILQ_ENTRY(vm_object) pager_object_list; /* list of all objects of this pager type */ LIST_HEAD(, vm_reserv) rvq; /* list of reservations */ - struct vm_radix cache; /* (o + f) root of the cache page radix trie */ void *handle; union { /* @@ -306,13 +294,6 @@ void vm_object_pip_wakeup(vm_object_t object); void vm_object_pip_wakeupn(vm_object_t object, short i); void vm_object_pip_wait(vm_object_t object, char *waitid); -static __inline boolean_t -vm_object_cache_is_empty(vm_object_t object) -{ - - return (vm_radix_is_empty(&object->cache)); -} - void umtx_shm_object_init(vm_object_t object); void umtx_shm_object_terminated(vm_object_t object); extern int umtx_shm_vnobj_persistent; diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index 7c77b22..6d8b364 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -155,8 +155,7 @@ static int vm_pageout_pages_needed; static uma_zone_t fakepg_zone; -static struct vnode *vm_page_alloc_init(vm_page_t m); -static void vm_page_cache_turn_free(vm_page_t m); +static void vm_page_alloc_check(vm_page_t m); static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits); static void vm_page_enqueue(uint8_t queue, vm_page_t m); static void vm_page_free_wakeup(void); @@ -391,6 +390,10 @@ vm_page_domain_init(struct vm_domain *vmd) "vm active pagequeue"; *__DECONST(u_int **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_vcnt) = &vm_cnt.v_active_count; + *__DECONST(char **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_name) = + "vm laundry pagequeue"; + *__DECONST(int **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_vcnt) = + &vm_cnt.v_laundry_count; vmd->vmd_page_count = 0; vmd->vmd_free_count = 0; vmd->vmd_segs = 0; @@ -1136,9 +1139,7 @@ void vm_page_dirty_KBI(vm_page_t m) { - /* These assertions refer to this operation by its public name. */ - KASSERT((m->flags & PG_CACHED) == 0, - ("vm_page_dirty: page in cache!")); + /* Refer to this operation by its public name. */ KASSERT(m->valid == VM_PAGE_BITS_ALL, ("vm_page_dirty: page is invalid!")); m->dirty = VM_PAGE_BITS_ALL; @@ -1262,9 +1263,8 @@ vm_page_insert_radixdone(vm_page_t m, vm_object_t object, vm_page_t mpred) /* * vm_page_remove: * - * Removes the given mem entry from the object/offset-page - * table and the object page list, but do not invalidate/terminate - * the backing store. + * Removes the specified page from its containing object, but does not + * invalidate any backing storage. * * The object must be locked. The page must be locked if it is managed. */ @@ -1272,6 +1272,7 @@ void vm_page_remove(vm_page_t m) { vm_object_t object; + vm_page_t mrem; if ((m->oflags & VPO_UNMANAGED) == 0) vm_page_assert_locked(m); @@ -1280,11 +1281,12 @@ vm_page_remove(vm_page_t m) VM_OBJECT_ASSERT_WLOCKED(object); if (vm_page_xbusied(m)) vm_page_xunbusy_maybelocked(m); + mrem = vm_radix_remove(&object->rtree, m->pindex); + KASSERT(mrem == m, ("removed page %p, expected page %p", mrem, m)); /* * Now remove from the object's list of backed pages. */ - vm_radix_remove(&object->rtree, m->pindex); TAILQ_REMOVE(&object->memq, m, listq); /* @@ -1433,9 +1435,7 @@ vm_page_replace(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex) * * Note: we *always* dirty the page. It is necessary both for the * fact that we moved it, and because we may be invalidating - * swap. If the page is on the cache, we have to deactivate it - * or vm_page_dirty() will panic. 
Dirty pages are not allowed - * on the cache. + * swap. * * The objects must be locked. */ @@ -1481,142 +1481,6 @@ vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex) } /* - * Convert all of the given object's cached pages that have a - * pindex within the given range into free pages. If the value - * zero is given for "end", then the range's upper bound is - * infinity. If the given object is backed by a vnode and it - * transitions from having one or more cached pages to none, the - * vnode's hold count is reduced. - */ -void -vm_page_cache_free(vm_object_t object, vm_pindex_t start, vm_pindex_t end) -{ - vm_page_t m; - boolean_t empty; - - mtx_lock(&vm_page_queue_free_mtx); - if (__predict_false(vm_radix_is_empty(&object->cache))) { - mtx_unlock(&vm_page_queue_free_mtx); - return; - } - while ((m = vm_radix_lookup_ge(&object->cache, start)) != NULL) { - if (end != 0 && m->pindex >= end) - break; - vm_radix_remove(&object->cache, m->pindex); - vm_page_cache_turn_free(m); - } - empty = vm_radix_is_empty(&object->cache); - mtx_unlock(&vm_page_queue_free_mtx); - if (object->type == OBJT_VNODE && empty) - vdrop(object->handle); -} - -/* - * Returns the cached page that is associated with the given - * object and offset. If, however, none exists, returns NULL. - * - * The free page queue must be locked. - */ -static inline vm_page_t -vm_page_cache_lookup(vm_object_t object, vm_pindex_t pindex) -{ - - mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); - return (vm_radix_lookup(&object->cache, pindex)); -} - -/* - * Remove the given cached page from its containing object's - * collection of cached pages. - * - * The free page queue must be locked. - */ -static void -vm_page_cache_remove(vm_page_t m) -{ - - mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); - KASSERT((m->flags & PG_CACHED) != 0, - ("vm_page_cache_remove: page %p is not cached", m)); - vm_radix_remove(&m->object->cache, m->pindex); - m->object = NULL; - vm_cnt.v_cache_count--; -} - -/* - * Transfer all of the cached pages with offset greater than or - * equal to 'offidxstart' from the original object's cache to the - * new object's cache. However, any cached pages with offset - * greater than or equal to the new object's size are kept in the - * original object. Initially, the new object's cache must be - * empty. Offset 'offidxstart' in the original object must - * correspond to offset zero in the new object. - * - * The new object must be locked. - */ -void -vm_page_cache_transfer(vm_object_t orig_object, vm_pindex_t offidxstart, - vm_object_t new_object) -{ - vm_page_t m; - - /* - * Insertion into an object's collection of cached pages - * requires the object to be locked. In contrast, removal does - * not. - */ - VM_OBJECT_ASSERT_WLOCKED(new_object); - KASSERT(vm_radix_is_empty(&new_object->cache), - ("vm_page_cache_transfer: object %p has cached pages", - new_object)); - mtx_lock(&vm_page_queue_free_mtx); - while ((m = vm_radix_lookup_ge(&orig_object->cache, - offidxstart)) != NULL) { - /* - * Transfer all of the pages with offset greater than or - * equal to 'offidxstart' from the original object's - * cache to the new object's cache. - */ - if ((m->pindex - offidxstart) >= new_object->size) - break; - vm_radix_remove(&orig_object->cache, m->pindex); - /* Update the page's object and offset. 
*/ - m->object = new_object; - m->pindex -= offidxstart; - if (vm_radix_insert(&new_object->cache, m)) - vm_page_cache_turn_free(m); - } - mtx_unlock(&vm_page_queue_free_mtx); -} - -/* - * Returns TRUE if a cached page is associated with the given object and - * offset, and FALSE otherwise. - * - * The object must be locked. - */ -boolean_t -vm_page_is_cached(vm_object_t object, vm_pindex_t pindex) -{ - vm_page_t m; - - /* - * Insertion into an object's collection of cached pages requires the - * object to be locked. Therefore, if the object is locked and the - * object's collection is empty, there is no need to acquire the free - * page queues lock in order to prove that the specified page doesn't - * exist. - */ - VM_OBJECT_ASSERT_WLOCKED(object); - if (__predict_true(vm_object_cache_is_empty(object))) - return (FALSE); - mtx_lock(&vm_page_queue_free_mtx); - m = vm_page_cache_lookup(object, pindex); - mtx_unlock(&vm_page_queue_free_mtx); - return (m != NULL); -} - -/* * vm_page_alloc: * * Allocate and return a page that is associated with the specified @@ -1632,9 +1496,6 @@ vm_page_is_cached(vm_object_t object, vm_pindex_t pindex) * optional allocation flags: * VM_ALLOC_COUNT(number) the number of additional pages that the caller * intends to allocate - * VM_ALLOC_IFCACHED return page only if it is cached - * VM_ALLOC_IFNOTCACHED return NULL, do not reactivate if the page - * is cached * VM_ALLOC_NOBUSY do not exclusive busy the page * VM_ALLOC_NODUMP do not include the page in a kernel core dump * VM_ALLOC_NOOBJ page is not associated with an object and @@ -1648,21 +1509,21 @@ vm_page_is_cached(vm_object_t object, vm_pindex_t pindex) vm_page_t vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req) { - struct vnode *vp = NULL; - vm_object_t m_object; vm_page_t m, mpred; int flags, req_class; - mpred = 0; /* XXX: pacify gcc */ + mpred = NULL; /* XXX: pacify gcc */ KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) && (object != NULL || (req & VM_ALLOC_SBUSY) == 0) && ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) != (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)), - ("vm_page_alloc: inconsistent object(%p)/req(%x)", (void *)object, - req)); + ("vm_page_alloc: inconsistent object(%p)/req(%x)", object, req)); if (object != NULL) VM_OBJECT_ASSERT_WLOCKED(object); + if (__predict_false((req & VM_ALLOC_IFCACHED) != 0)) + return (NULL); + req_class = req & VM_ALLOC_CLASS_MASK; /* @@ -1678,45 +1539,27 @@ vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req) } /* - * The page allocation request can came from consumers which already - * hold the free page queue mutex, like vm_page_insert() in - * vm_page_cache(). + * Allocate a page if the number of free pages exceeds the minimum + * for the request class. */ - mtx_lock_flags(&vm_page_queue_free_mtx, MTX_RECURSE); + mtx_lock(&vm_page_queue_free_mtx); if (vm_cnt.v_free_count + vm_cnt.v_cache_count > vm_cnt.v_free_reserved || (req_class == VM_ALLOC_SYSTEM && vm_cnt.v_free_count + vm_cnt.v_cache_count > vm_cnt.v_interrupt_free_min) || (req_class == VM_ALLOC_INTERRUPT && vm_cnt.v_free_count + vm_cnt.v_cache_count > 0)) { /* - * Allocate from the free queue if the number of free pages - * exceeds the minimum for the request class. + * Can we allocate the page from a reservation? 
*/ - if (object != NULL && - (m = vm_page_cache_lookup(object, pindex)) != NULL) { - if ((req & VM_ALLOC_IFNOTCACHED) != 0) { - mtx_unlock(&vm_page_queue_free_mtx); - return (NULL); - } - if (vm_phys_unfree_page(m)) - vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, 0); #if VM_NRESERVLEVEL > 0 - else if (!vm_reserv_reactivate_page(m)) -#else - else -#endif - panic("vm_page_alloc: cache page %p is missing" - " from the free queue", m); - } else if ((req & VM_ALLOC_IFCACHED) != 0) { - mtx_unlock(&vm_page_queue_free_mtx); - return (NULL); -#if VM_NRESERVLEVEL > 0 - } else if (object == NULL || (object->flags & (OBJ_COLORED | + if (object == NULL || (object->flags & (OBJ_COLORED | OBJ_FICTITIOUS)) != OBJ_COLORED || (m = - vm_reserv_alloc_page(object, pindex, mpred)) == NULL) { -#else - } else { + vm_reserv_alloc_page(object, pindex, mpred)) == NULL) #endif + { + /* + * If not, allocate it from the free page queues. + */ m = vm_phys_alloc_pages(object != NULL ? VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 0); #if VM_NRESERVLEVEL > 0 @@ -1742,37 +1585,11 @@ vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req) * At this point we had better have found a good page. */ KASSERT(m != NULL, ("vm_page_alloc: missing page")); - KASSERT(m->queue == PQ_NONE, - ("vm_page_alloc: page %p has unexpected queue %d", m, m->queue)); - KASSERT(m->wire_count == 0, ("vm_page_alloc: page %p is wired", m)); - KASSERT(m->hold_count == 0, ("vm_page_alloc: page %p is held", m)); - KASSERT(!vm_page_busied(m), ("vm_page_alloc: page %p is busy", m)); - KASSERT(m->dirty == 0, ("vm_page_alloc: page %p is dirty", m)); - KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, - ("vm_page_alloc: page %p has unexpected memattr %d", m, - pmap_page_get_memattr(m))); - if ((m->flags & PG_CACHED) != 0) { - KASSERT((m->flags & PG_ZERO) == 0, - ("vm_page_alloc: cached page %p is PG_ZERO", m)); - KASSERT(m->valid != 0, - ("vm_page_alloc: cached page %p is invalid", m)); - if (m->object == object && m->pindex == pindex) - vm_cnt.v_reactivated++; - else - m->valid = 0; - m_object = m->object; - vm_page_cache_remove(m); - if (m_object->type == OBJT_VNODE && - vm_object_cache_is_empty(m_object)) - vp = m_object->handle; - } else { - KASSERT(m->valid == 0, - ("vm_page_alloc: free page %p is valid", m)); - vm_phys_freecnt_adj(m, -1); - if ((m->flags & PG_ZERO) != 0) - vm_page_zero_count--; - } + vm_phys_freecnt_adj(m, -1); + if ((m->flags & PG_ZERO) != 0) + vm_page_zero_count--; mtx_unlock(&vm_page_queue_free_mtx); + vm_page_alloc_check(m); /* * Initialize the page. Only the PG_ZERO flag is inherited. @@ -1804,18 +1621,16 @@ vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req) if (object != NULL) { if (vm_page_insert_after(m, object, pindex, mpred)) { - /* See the comment below about hold count. */ - if (vp != NULL) - vdrop(vp); pagedaemon_wakeup(); if (req & VM_ALLOC_WIRED) { atomic_subtract_int(&vm_cnt.v_wire_count, 1); m->wire_count = 0; } - m->object = NULL; + KASSERT(m->object == NULL, ("page %p has object", m)); m->oflags = VPO_UNMANAGED; m->busy_lock = VPB_UNBUSIED; - vm_page_free(m); + /* Don't change PG_ZERO. */ + vm_page_free_toq(m); return (NULL); } @@ -1827,15 +1642,6 @@ vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req) m->pindex = pindex; /* - * The following call to vdrop() must come after the above call - * to vm_page_insert() in case both affect the same object and - * vnode. Otherwise, the affected vnode's hold count could - * temporarily become zero. 
- */ - if (vp != NULL) - vdrop(vp); - - /* * Don't wakeup too often - wakeup the pageout daemon when * we would be nearly out of memory. */ @@ -1845,16 +1651,6 @@ vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req) return (m); } -static void -vm_page_alloc_contig_vdrop(struct spglist *lst) -{ - - while (!SLIST_EMPTY(lst)) { - vdrop((struct vnode *)SLIST_FIRST(lst)-> plinks.s.pv); - SLIST_REMOVE_HEAD(lst, plinks.s.ss); - } -} - /* * vm_page_alloc_contig: * @@ -1876,6 +1672,8 @@ vm_page_alloc_contig_vdrop(struct spglist *lst) * memory attribute setting for the physical pages cannot be configured * to VM_MEMATTR_DEFAULT. * + * The specified object may not contain fictitious pages. + * * The caller must always specify an allocation class. * * allocation classes: @@ -1899,22 +1697,21 @@ vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr) { - struct vnode *drop; - struct spglist deferred_vdrop_list; - vm_page_t m, m_tmp, m_ret; - u_int flags; + vm_page_t m, m_ret, mpred; + u_int busy_lock, flags, oflags; int req_class; + mpred = NULL; /* XXX: pacify gcc */ KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) && (object != NULL || (req & VM_ALLOC_SBUSY) == 0) && ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) != (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)), - ("vm_page_alloc: inconsistent object(%p)/req(%x)", (void *)object, + ("vm_page_alloc_contig: inconsistent object(%p)/req(%x)", object, req)); if (object != NULL) { VM_OBJECT_ASSERT_WLOCKED(object); - KASSERT(object->type == OBJT_PHYS, - ("vm_page_alloc_contig: object %p isn't OBJT_PHYS", + KASSERT((object->flags & OBJ_FICTITIOUS) == 0, + ("vm_page_alloc_contig: object %p has fictitious pages", object)); } KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero")); @@ -1926,19 +1723,34 @@ vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req, if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) req_class = VM_ALLOC_SYSTEM; - SLIST_INIT(&deferred_vdrop_list); + if (object != NULL) { + mpred = vm_radix_lookup_le(&object->rtree, pindex); + KASSERT(mpred == NULL || mpred->pindex != pindex, + ("vm_page_alloc_contig: pindex already allocated")); + } + + /* + * Can we allocate the pages without the number of free pages falling + * below the lower bound for the allocation class? + */ mtx_lock(&vm_page_queue_free_mtx); if (vm_cnt.v_free_count + vm_cnt.v_cache_count >= npages + vm_cnt.v_free_reserved || (req_class == VM_ALLOC_SYSTEM && vm_cnt.v_free_count + vm_cnt.v_cache_count >= npages + vm_cnt.v_interrupt_free_min) || (req_class == VM_ALLOC_INTERRUPT && vm_cnt.v_free_count + vm_cnt.v_cache_count >= npages)) { + /* + * Can we allocate the pages from a reservation? + */ #if VM_NRESERVLEVEL > 0 retry: if (object == NULL || (object->flags & OBJ_COLORED) == 0 || (m_ret = vm_reserv_alloc_contig(object, pindex, npages, - low, high, alignment, boundary)) == NULL) + low, high, alignment, boundary, mpred)) == NULL) #endif + /* + * If not, allocate them from the free page queues. + */ m_ret = vm_phys_alloc_contig(npages, low, high, alignment, boundary); } else { @@ -1948,17 +1760,7 @@ retry: return (NULL); } if (m_ret != NULL) - for (m = m_ret; m < &m_ret[npages]; m++) { - drop = vm_page_alloc_init(m); - if (drop != NULL) { - /* - * Enqueue the vnode for deferred vdrop(). 
- */ - m->plinks.s.pv = drop; - SLIST_INSERT_HEAD(&deferred_vdrop_list, m, - plinks.s.ss); - } - } + vm_phys_freecnt_adj(m_ret, -npages); else { #if VM_NRESERVLEVEL > 0 if (vm_reserv_reclaim_contig(npages, low, high, alignment, @@ -1966,9 +1768,14 @@ retry: goto retry; #endif } + for (m = m_ret; m < &m_ret[npages]; m++) + if ((m->flags & PG_ZERO) != 0) + vm_page_zero_count--; mtx_unlock(&vm_page_queue_free_mtx); if (m_ret == NULL) return (NULL); + for (m = m_ret; m < &m_ret[npages]; m++) + vm_page_alloc_check(m); /* * Initialize the pages. Only the PG_ZERO flag is inherited. @@ -1978,6 +1785,13 @@ retry: flags = PG_ZERO; if ((req & VM_ALLOC_NODUMP) != 0) flags |= PG_NODUMP; + oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ? + VPO_UNMANAGED : 0; + busy_lock = VPB_UNBUSIED; + if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ | VM_ALLOC_SBUSY)) == 0) + busy_lock = VPB_SINGLE_EXCLUSIVER; + if ((req & VM_ALLOC_SBUSY) != 0) + busy_lock = VPB_SHARERS_WORD(1); if ((req & VM_ALLOC_WIRED) != 0) atomic_add_int(&vm_cnt.v_wire_count, npages); if (object != NULL) { @@ -1988,98 +1802,61 @@ retry: for (m = m_ret; m < &m_ret[npages]; m++) { m->aflags = 0; m->flags = (m->flags | PG_NODUMP) & flags; - m->busy_lock = VPB_UNBUSIED; - if (object != NULL) { - if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0) - m->busy_lock = VPB_SINGLE_EXCLUSIVER; - if ((req & VM_ALLOC_SBUSY) != 0) - m->busy_lock = VPB_SHARERS_WORD(1); - } + m->busy_lock = busy_lock; if ((req & VM_ALLOC_WIRED) != 0) m->wire_count = 1; - /* Unmanaged pages don't use "act_count". */ - m->oflags = VPO_UNMANAGED; + m->act_count = 0; + m->oflags = oflags; if (object != NULL) { - if (vm_page_insert(m, object, pindex)) { - vm_page_alloc_contig_vdrop( - &deferred_vdrop_list); - if (vm_paging_needed()) - pagedaemon_wakeup(); + if (vm_page_insert_after(m, object, pindex, mpred)) { + pagedaemon_wakeup(); if ((req & VM_ALLOC_WIRED) != 0) - atomic_subtract_int(&vm_cnt.v_wire_count, - npages); - for (m_tmp = m, m = m_ret; - m < &m_ret[npages]; m++) { - if ((req & VM_ALLOC_WIRED) != 0) + atomic_subtract_int( + &vm_cnt.v_wire_count, npages); + KASSERT(m->object == NULL, + ("page %p has object", m)); + mpred = m; + for (m = m_ret; m < &m_ret[npages]; m++) { + if (m <= mpred && + (req & VM_ALLOC_WIRED) != 0) m->wire_count = 0; - if (m >= m_tmp) { - m->object = NULL; - m->oflags |= VPO_UNMANAGED; - } + m->oflags = VPO_UNMANAGED; m->busy_lock = VPB_UNBUSIED; - vm_page_free(m); + /* Don't change PG_ZERO. */ + vm_page_free_toq(m); } return (NULL); } + mpred = m; } else m->pindex = pindex; if (memattr != VM_MEMATTR_DEFAULT) pmap_page_set_memattr(m, memattr); pindex++; } - vm_page_alloc_contig_vdrop(&deferred_vdrop_list); if (vm_paging_needed()) pagedaemon_wakeup(); return (m_ret); } /* - * Initialize a page that has been freshly dequeued from a freelist. - * The caller has to drop the vnode returned, if it is not NULL. - * - * This function may only be used to initialize unmanaged pages. - * - * To be called with vm_page_queue_free_mtx held. + * Check a page that has been freshly dequeued from a freelist. 
*/ -static struct vnode * -vm_page_alloc_init(vm_page_t m) +static void +vm_page_alloc_check(vm_page_t m) { - struct vnode *drop; - vm_object_t m_object; + KASSERT(m->object == NULL, ("page %p has object", m)); KASSERT(m->queue == PQ_NONE, - ("vm_page_alloc_init: page %p has unexpected queue %d", - m, m->queue)); - KASSERT(m->wire_count == 0, - ("vm_page_alloc_init: page %p is wired", m)); - KASSERT(m->hold_count == 0, - ("vm_page_alloc_init: page %p is held", m)); - KASSERT(!vm_page_busied(m), - ("vm_page_alloc_init: page %p is busy", m)); - KASSERT(m->dirty == 0, - ("vm_page_alloc_init: page %p is dirty", m)); + ("page %p has unexpected queue %d", m, m->queue)); + KASSERT(m->wire_count == 0, ("page %p is wired", m)); + KASSERT(m->hold_count == 0, ("page %p is held", m)); + KASSERT(!vm_page_busied(m), ("page %p is busy", m)); + KASSERT(m->dirty == 0, ("page %p is dirty", m)); KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, - ("vm_page_alloc_init: page %p has unexpected memattr %d", + ("page %p has unexpected memattr %d", m, pmap_page_get_memattr(m))); - mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); - drop = NULL; - if ((m->flags & PG_CACHED) != 0) { - KASSERT((m->flags & PG_ZERO) == 0, - ("vm_page_alloc_init: cached page %p is PG_ZERO", m)); - m->valid = 0; - m_object = m->object; - vm_page_cache_remove(m); - if (m_object->type == OBJT_VNODE && - vm_object_cache_is_empty(m_object)) - drop = m_object->handle; - } else { - KASSERT(m->valid == 0, - ("vm_page_alloc_init: free page %p is valid", m)); - vm_phys_freecnt_adj(m, -1); - if ((m->flags & PG_ZERO) != 0) - vm_page_zero_count--; - } - return (drop); + KASSERT(m->valid == 0, ("free page %p is valid", m)); } /* @@ -2105,7 +1882,6 @@ vm_page_alloc_init(vm_page_t m) vm_page_t vm_page_alloc_freelist(int flind, int req) { - struct vnode *drop; vm_page_t m; u_int flags; int req_class; @@ -2121,7 +1897,7 @@ vm_page_alloc_freelist(int flind, int req) /* * Do not allocate reserved pages unless the req has asked for it. */ - mtx_lock_flags(&vm_page_queue_free_mtx, MTX_RECURSE); + mtx_lock(&vm_page_queue_free_mtx); if (vm_cnt.v_free_count + vm_cnt.v_cache_count > vm_cnt.v_free_reserved || (req_class == VM_ALLOC_SYSTEM && vm_cnt.v_free_count + vm_cnt.v_cache_count > vm_cnt.v_interrupt_free_min) || @@ -2139,8 +1915,11 @@ vm_page_alloc_freelist(int flind, int req) mtx_unlock(&vm_page_queue_free_mtx); return (NULL); } - drop = vm_page_alloc_init(m); + vm_phys_freecnt_adj(m, -1); + if ((m->flags & PG_ZERO) != 0) + vm_page_zero_count--; mtx_unlock(&vm_page_queue_free_mtx); + vm_page_alloc_check(m); /* * Initialize the page. Only the PG_ZERO flag is inherited. @@ -2160,8 +1939,6 @@ vm_page_alloc_freelist(int flind, int req) } /* Unmanaged pages don't use "act_count". */ m->oflags = VPO_UNMANAGED; - if (drop != NULL) - vdrop(drop); if (vm_paging_needed()) pagedaemon_wakeup(); return (m); @@ -2284,41 +2061,11 @@ retry: } KASSERT((m->flags & PG_UNHOLDFREE) == 0, ("page %p is PG_UNHOLDFREE", m)); - /* Don't care: PG_NODUMP, PG_WINATCFLS, PG_ZERO. */ + /* Don't care: PG_NODUMP, PG_ZERO. */ if (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP && - object->type != OBJT_VNODE) + object->type != OBJT_VNODE) { run_ext = 0; - else if ((m->flags & PG_CACHED) != 0 || - m != vm_page_lookup(object, m->pindex)) { - /* - * The page is cached or recently converted - * from cached to free. - */ -#if VM_NRESERVLEVEL > 0 - if (level >= 0) { - /* - * The page is reserved. Extend the - * current run by one page. 
- */ - run_ext = 1; - } else -#endif - if ((order = m->order) < VM_NFREEORDER) { - /* - * The page is enqueued in the - * physical memory allocator's cache/ - * free page queues. Moreover, it is - * the first page in a power-of-two- - * sized run of contiguous cache/free - * pages. Add these pages to the end - * of the current run, and jump - * ahead. - */ - run_ext = 1 << order; - m_inc = 1 << order; - } else - run_ext = 0; #if VM_NRESERVLEVEL > 0 } else if ((options & VPSC_NOSUPER) != 0 && (level = vm_reserv_level_iffullpop(m)) >= 0) { @@ -2351,18 +2098,18 @@ unlock: } else if (level >= 0) { /* * The page is reserved but not yet allocated. In - * other words, it is still cached or free. Extend - * the current run by one page. + * other words, it is still free. Extend the current + * run by one page. */ run_ext = 1; #endif } else if ((order = m->order) < VM_NFREEORDER) { /* * The page is enqueued in the physical memory - * allocator's cache/free page queues. Moreover, it - * is the first page in a power-of-two-sized run of - * contiguous cache/free pages. Add these pages to - * the end of the current run, and jump ahead. + * allocator's free page queues. Moreover, it is the + * first page in a power-of-two-sized run of + * contiguous free pages. Add these pages to the end + * of the current run, and jump ahead. */ run_ext = 1 << order; m_inc = 1 << order; @@ -2370,16 +2117,15 @@ unlock: /* * Skip the page for one of the following reasons: (1) * It is enqueued in the physical memory allocator's - * cache/free page queues. However, it is not the - * first page in a run of contiguous cache/free pages. - * (This case rarely occurs because the scan is - * performed in ascending order.) (2) It is not - * reserved, and it is transitioning from free to - * allocated. (Conversely, the transition from - * allocated to free for managed pages is blocked by - * the page lock.) (3) It is allocated but not - * contained by an object and not wired, e.g., - * allocated by Xen's balloon driver. + * free page queues. However, it is not the first + * page in a run of contiguous free pages. (This case + * rarely occurs because the scan is performed in + * ascending order.) (2) It is not reserved, and it is + * transitioning from free to allocated. (Conversely, + * the transition from allocated to free for managed + * pages is blocked by the page lock.) (3) It is + * allocated but not contained by an object and not + * wired, e.g., allocated by Xen's balloon driver. */ run_ext = 0; } @@ -2480,20 +2226,12 @@ retry: } KASSERT((m->flags & PG_UNHOLDFREE) == 0, ("page %p is PG_UNHOLDFREE", m)); - /* Don't care: PG_NODUMP, PG_WINATCFLS, PG_ZERO. */ + /* Don't care: PG_NODUMP, PG_ZERO. */ if (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP && object->type != OBJT_VNODE) error = EINVAL; - else if ((m->flags & PG_CACHED) != 0 || - m != vm_page_lookup(object, m->pindex)) { - /* - * The page is cached or recently converted - * from cached to free. - */ - VM_OBJECT_WUNLOCK(object); - goto cached; - } else if (object->memattr != VM_MEMATTR_DEFAULT) + else if (object->memattr != VM_MEMATTR_DEFAULT) error = EINVAL; else if (m->queue != PQ_NONE && !vm_page_busied(m)) { KASSERT(pmap_page_get_memattr(m) == @@ -2594,17 +2332,16 @@ retry: unlock: VM_OBJECT_WUNLOCK(object); } else { -cached: mtx_lock(&vm_page_queue_free_mtx); order = m->order; if (order < VM_NFREEORDER) { /* * The page is enqueued in the physical memory - * allocator's cache/free page queues. 
- * Moreover, it is the first page in a power- - * of-two-sized run of contiguous cache/free - * pages. Jump ahead to the last page within - * that run, and continue from there. + * allocator's free page queues. Moreover, it + * is the first page in a power-of-two-sized + * run of contiguous free pages. Jump ahead + * to the last page within that run, and + * continue from there. */ m += (1 << order) - 1; } @@ -2653,9 +2390,9 @@ CTASSERT(powerof2(NRUNS)); * conditions by relocating the virtual pages using that physical memory. * Returns true if reclamation is successful and false otherwise. Since * relocation requires the allocation of physical pages, reclamation may - * fail due to a shortage of cache/free pages. When reclamation fails, - * callers are expected to perform VM_WAIT before retrying a failed - * allocation operation, e.g., vm_page_alloc_contig(). + * fail due to a shortage of free pages. When reclamation fails, callers + * are expected to perform VM_WAIT before retrying a failed allocation + * operation, e.g., vm_page_alloc_contig(). * * The caller must always specify an allocation class through "req". * @@ -2690,8 +2427,8 @@ vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high, req_class = VM_ALLOC_SYSTEM; /* - * Return if the number of cached and free pages cannot satisfy the - * requested allocation. + * Return if the number of free pages cannot satisfy the requested + * allocation. */ count = vm_cnt.v_free_count + vm_cnt.v_cache_count; if (count < npages + vm_cnt.v_free_reserved || (count < npages + @@ -2809,7 +2546,10 @@ struct vm_pagequeue * vm_page_pagequeue(vm_page_t m) { - return (&vm_phys_domain(m)->vmd_pagequeues[m->queue]); + if (vm_page_in_laundry(m)) + return (&vm_dom[0].vmd_pagequeues[m->queue]); + else + return (&vm_phys_domain(m)->vmd_pagequeues[m->queue]); } /* @@ -2871,7 +2611,10 @@ vm_page_enqueue(uint8_t queue, vm_page_t m) KASSERT(queue < PQ_COUNT, ("vm_page_enqueue: invalid queue %u request for page %p", queue, m)); - pq = &vm_phys_domain(m)->vmd_pagequeues[queue]; + if (queue == PQ_LAUNDRY) + pq = &vm_dom[0].vmd_pagequeues[queue]; + else + pq = &vm_phys_domain(m)->vmd_pagequeues[queue]; vm_pagequeue_lock(pq); m->queue = queue; TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); @@ -2955,9 +2698,8 @@ vm_page_activate(vm_page_t m) /* * vm_page_free_wakeup: * - * Helper routine for vm_page_free_toq() and vm_page_cache(). This - * routine is called when a page has been added to the cache or free - * queues. + * Helper routine for vm_page_free_toq(). This routine is called + * when a page is added to the free queues. * * The page queues must be locked. */ @@ -2987,27 +2729,6 @@ vm_page_free_wakeup(void) } /* - * Turn a cached page into a free page, by changing its attributes. - * Keep the statistics up-to-date. - * - * The free page queue must be locked. - */ -static void -vm_page_cache_turn_free(vm_page_t m) -{ - - mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); - - m->object = NULL; - m->valid = 0; - KASSERT((m->flags & PG_CACHED) != 0, - ("vm_page_cache_turn_free: page %p is not cached", m)); - m->flags &= ~PG_CACHED; - vm_cnt.v_cache_count--; - vm_phys_freecnt_adj(m, 1); -} - -/* * vm_page_free_toq: * * Returns the given page to the free list, @@ -3066,8 +2787,8 @@ vm_page_free_toq(vm_page_t m) pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT); /* - * Insert the page into the physical memory allocator's - * cache/free page queues. + * Insert the page into the physical memory allocator's free + * page queues. 
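A minimal sketch of the queue selection performed just above, assuming the only point of interest is that this version of the patch keeps one system-wide laundry queue in domain 0 while the active and inactive queues stay per-domain. The toy structures below are stand-ins, not the kernel's.

#include <stdio.h>

#define PQ_INACTIVE     0
#define PQ_ACTIVE       1
#define PQ_LAUNDRY      2
#define PQ_COUNT        3
#define TOY_NDOMAINS    4

struct toy_pagequeue {
    int owner_domain;
};

static struct toy_pagequeue toy_queues[TOY_NDOMAINS][PQ_COUNT];

/* Mirrors the special case: laundry pages always resolve to domain 0. */
static struct toy_pagequeue *
toy_pagequeue(int page_domain, int queue)
{

    if (queue == PQ_LAUNDRY)
        return (&toy_queues[0][queue]);
    return (&toy_queues[page_domain][queue]);
}

int
main(void)
{
    static const char *names[PQ_COUNT] = { "inactive", "active", "laundry" };
    int dom, q;

    for (dom = 0; dom < TOY_NDOMAINS; dom++)
        for (q = 0; q < PQ_COUNT; q++)
            toy_queues[dom][q].owner_domain = dom;

    for (dom = 0; dom < TOY_NDOMAINS; dom++)
        for (q = 0; q < PQ_COUNT; q++)
            printf("page in domain %d, %-8s queue -> served by domain %d\n",
                dom, names[q], toy_pagequeue(dom, q)->owner_domain);
    return (0);
}

The single laundry queue matches the single laundry thread started later in this patch ("laundry: dom0").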
*/ mtx_lock(&vm_page_queue_free_mtx); vm_phys_freecnt_adj(m, 1); @@ -3159,11 +2880,8 @@ vm_page_unwire(vm_page_t m, uint8_t queue) if (m->wire_count == 0) { atomic_subtract_int(&vm_cnt.v_wire_count, 1); if ((m->oflags & VPO_UNMANAGED) == 0 && - m->object != NULL && queue != PQ_NONE) { - if (queue == PQ_INACTIVE) - m->flags &= ~PG_WINATCFLS; + m->object != NULL && queue != PQ_NONE) vm_page_enqueue(queue, m); - } return (TRUE); } else return (FALSE); @@ -3174,21 +2892,10 @@ vm_page_unwire(vm_page_t m, uint8_t queue) /* * Move the specified page to the inactive queue. * - * Many pages placed on the inactive queue should actually go - * into the cache, but it is difficult to figure out which. What - * we do instead, if the inactive target is well met, is to put - * clean pages at the head of the inactive queue instead of the tail. - * This will cause them to be moved to the cache more quickly and - * if not actively re-referenced, reclaimed more quickly. If we just - * stick these pages at the end of the inactive queue, heavy filesystem - * meta-data accesses can cause an unnecessary paging load on memory bound - * processes. This optimization causes one-time-use metadata to be - * reused more quickly. - * - * Normally noreuse is FALSE, resulting in LRU operation. noreuse is set - * to TRUE if we want this page to be 'as if it were placed in the cache', - * except without unmapping it from the process address space. In - * practice this is implemented by inserting the page at the head of the + * Normally, "noreuse" is FALSE, resulting in LRU ordering of the inactive + * queue. However, setting "noreuse" to TRUE will accelerate the specified + * page's reclamation, but it will not unmap the page from any address space. + * This is implemented by inserting the page near the head of the inactive * queue, using a marker page to guide FIFO insertion ordering. * * The page must be locked. @@ -3216,7 +2923,6 @@ _vm_page_deactivate(vm_page_t m, boolean_t noreuse) } else { if (queue != PQ_NONE) vm_page_dequeue(m); - m->flags &= ~PG_WINATCFLS; vm_pagequeue_lock(pq); } m->queue = PQ_INACTIVE; @@ -3256,24 +2962,25 @@ vm_page_deactivate_noreuse(vm_page_t m) } /* - * vm_page_try_to_cache: + * vm_page_launder * - * Returns 0 on failure, 1 on success + * Put a page in the laundry. */ -int -vm_page_try_to_cache(vm_page_t m) +void +vm_page_launder(vm_page_t m) { + int queue; - vm_page_lock_assert(m, MA_OWNED); - VM_OBJECT_ASSERT_WLOCKED(m->object); - if (m->dirty || m->hold_count || m->wire_count || - (m->oflags & VPO_UNMANAGED) != 0 || vm_page_busied(m)) - return (0); - pmap_remove_all(m); - if (m->dirty) - return (0); - vm_page_cache(m); - return (1); + vm_page_assert_locked(m); + if ((queue = m->queue) != PQ_LAUNDRY) { + if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) { + if (queue != PQ_NONE) + vm_page_dequeue(m); + vm_page_enqueue(PQ_LAUNDRY, m); + } else + KASSERT(queue == PQ_NONE, + ("wired page %p is queued", m)); + } } /* @@ -3300,112 +3007,6 @@ vm_page_try_to_free(vm_page_t m) } /* - * vm_page_cache - * - * Put the specified page onto the page cache queue (if appropriate). - * - * The object and page must be locked. 
- */ -void -vm_page_cache(vm_page_t m) -{ - vm_object_t object; - boolean_t cache_was_empty; - - vm_page_lock_assert(m, MA_OWNED); - object = m->object; - VM_OBJECT_ASSERT_WLOCKED(object); - if (vm_page_busied(m) || (m->oflags & VPO_UNMANAGED) || - m->hold_count || m->wire_count) - panic("vm_page_cache: attempting to cache busy page"); - KASSERT(!pmap_page_is_mapped(m), - ("vm_page_cache: page %p is mapped", m)); - KASSERT(m->dirty == 0, ("vm_page_cache: page %p is dirty", m)); - if (m->valid == 0 || object->type == OBJT_DEFAULT || - (object->type == OBJT_SWAP && - !vm_pager_has_page(object, m->pindex, NULL, NULL))) { - /* - * Hypothesis: A cache-eligible page belonging to a - * default object or swap object but without a backing - * store must be zero filled. - */ - vm_page_free(m); - return; - } - KASSERT((m->flags & PG_CACHED) == 0, - ("vm_page_cache: page %p is already cached", m)); - - /* - * Remove the page from the paging queues. - */ - vm_page_remque(m); - - /* - * Remove the page from the object's collection of resident - * pages. - */ - vm_radix_remove(&object->rtree, m->pindex); - TAILQ_REMOVE(&object->memq, m, listq); - object->resident_page_count--; - - /* - * Restore the default memory attribute to the page. - */ - if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT) - pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT); - - /* - * Insert the page into the object's collection of cached pages - * and the physical memory allocator's cache/free page queues. - */ - m->flags &= ~PG_ZERO; - mtx_lock(&vm_page_queue_free_mtx); - cache_was_empty = vm_radix_is_empty(&object->cache); - if (vm_radix_insert(&object->cache, m)) { - mtx_unlock(&vm_page_queue_free_mtx); - if (object->type == OBJT_VNODE && - object->resident_page_count == 0) - vdrop(object->handle); - m->object = NULL; - vm_page_free(m); - return; - } - - /* - * The above call to vm_radix_insert() could reclaim the one pre- - * existing cached page from this object, resulting in a call to - * vdrop(). - */ - if (!cache_was_empty) - cache_was_empty = vm_radix_is_singleton(&object->cache); - - m->flags |= PG_CACHED; - vm_cnt.v_cache_count++; - PCPU_INC(cnt.v_tcached); -#if VM_NRESERVLEVEL > 0 - if (!vm_reserv_free_page(m)) { -#else - if (TRUE) { -#endif - vm_phys_free_pages(m, 0); - } - vm_page_free_wakeup(); - mtx_unlock(&vm_page_queue_free_mtx); - - /* - * Increment the vnode's hold count if this is the object's only - * cached page. Decrement the vnode's hold count if this was - * the object's only resident page. - */ - if (object->type == OBJT_VNODE) { - if (cache_was_empty && object->resident_page_count != 0) - vhold(object->handle); - else if (!cache_was_empty && object->resident_page_count == 0) - vdrop(object->handle); - } -} - -/* * vm_page_advise * * Deactivate or do nothing, as appropriate. @@ -3421,16 +3022,9 @@ vm_page_advise(vm_page_t m, int advice) if (advice == MADV_FREE) /* * Mark the page clean. This will allow the page to be freed - * up by the system. However, such pages are often reused - * quickly by malloc() so we do not do anything that would - * cause a page fault if we can help it. - * - * Specifically, we do not try to actually free the page now - * nor do we try to put it in the cache (which would cause a - * page fault on reuse). - * - * But we do make the page as freeable as we can without - * actually taking the step of unmapping it. + * without first paging it out. MADV_FREE pages are often + * quickly reused by malloc(3), so we do not do anything that + * would result in a page fault on a later access. 
*/ vm_page_undirty(m); else if (advice != MADV_DONTNEED) @@ -3448,11 +3042,13 @@ vm_page_advise(vm_page_t m, int advice) /* * Place clean pages near the head of the inactive queue rather than * the tail, thus defeating the queue's LRU operation and ensuring that - * the page will be reused quickly. Dirty pages are given a chance to - * cycle once through the inactive queue before becoming eligible for - * laundering. + * the page will be reused quickly. Dirty pages not already in the + * laundry are moved there. */ - _vm_page_deactivate(m, m->dirty == 0); + if (m->dirty == 0) + vm_page_deactivate_noreuse(m); + else + vm_page_launder(m); } /* @@ -3517,8 +3113,7 @@ retrylookup: VM_WAIT; VM_OBJECT_WLOCK(object); goto retrylookup; - } else if (m->valid != 0) - return (m); + } if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); return (m); @@ -3961,6 +3556,7 @@ DB_SHOW_COMMAND(page, vm_page_print_page_info) db_printf("vm_cnt.v_cache_count: %d\n", vm_cnt.v_cache_count); db_printf("vm_cnt.v_inactive_count: %d\n", vm_cnt.v_inactive_count); db_printf("vm_cnt.v_active_count: %d\n", vm_cnt.v_active_count); + db_printf("vm_cnt.v_laundry_count: %d\n", vm_cnt.v_laundry_count); db_printf("vm_cnt.v_wire_count: %d\n", vm_cnt.v_wire_count); db_printf("vm_cnt.v_free_reserved: %d\n", vm_cnt.v_free_reserved); db_printf("vm_cnt.v_free_min: %d\n", vm_cnt.v_free_min); @@ -3975,12 +3571,14 @@ DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info) db_printf("pq_free %d pq_cache %d\n", vm_cnt.v_free_count, vm_cnt.v_cache_count); for (dom = 0; dom < vm_ndomains; dom++) { - db_printf("dom %d page_cnt %d free %d pq_act %d pq_inact %d\n", + db_printf( + "dom %d page_cnt %d free %d pq_act %d pq_inact %d pq_laund %d\n", dom, vm_dom[dom].vmd_page_count, vm_dom[dom].vmd_free_count, vm_dom[dom].vmd_pagequeues[PQ_ACTIVE].pq_cnt, - vm_dom[dom].vmd_pagequeues[PQ_INACTIVE].pq_cnt); + vm_dom[dom].vmd_pagequeues[PQ_INACTIVE].pq_cnt, + vm_dom[dom].vmd_pagequeues[PQ_LAUNDRY].pq_cnt); } } diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h index 85c6ac5..1ee8dde 100644 --- a/sys/vm/vm_page.h +++ b/sys/vm/vm_page.h @@ -206,7 +206,8 @@ struct vm_page { #define PQ_NONE 255 #define PQ_INACTIVE 0 #define PQ_ACTIVE 1 -#define PQ_COUNT 2 +#define PQ_LAUNDRY 2 +#define PQ_COUNT 3 TAILQ_HEAD(pglist, vm_page); SLIST_HEAD(spglist, vm_page); @@ -228,6 +229,7 @@ struct vm_domain { boolean_t vmd_oom; int vmd_oom_seq; int vmd_last_active_scan; + struct vm_page vmd_laundry_marker; struct vm_page vmd_marker; /* marker for pagedaemon private use */ struct vm_page vmd_inacthead; /* marker for LRU-defeating insertions */ }; @@ -236,6 +238,7 @@ extern struct vm_domain vm_dom[MAXMEMDOM]; #define vm_pagequeue_assert_locked(pq) mtx_assert(&(pq)->pq_mutex, MA_OWNED) #define vm_pagequeue_lock(pq) mtx_lock(&(pq)->pq_mutex) +#define vm_pagequeue_lockptr(pq) (&(pq)->pq_mutex) #define vm_pagequeue_unlock(pq) mtx_unlock(&(pq)->pq_mutex) #ifdef _KERNEL @@ -323,11 +326,9 @@ extern struct mtx_padalign pa_lock[]; * Page flags. If changed at any other time than page allocation or * freeing, the modification must be protected by the vm_page lock. 
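The net effect of the vm_page_advise() changes in this hunk can be summarized with a hedged toy model: MADV_FREE undirties the page so it takes the clean path, clean pages go to the head of the inactive queue, and dirty pages go to the laundry. The enum values and helper below are invented; only the decision order mirrors the code above.

#include <stdbool.h>
#include <stdio.h>

enum toy_advice { TOY_MADV_DONTNEED, TOY_MADV_FREE };   /* stand-in values */

static const char *
toy_advise(enum toy_advice advice, bool dirty)
{

    if (advice == TOY_MADV_FREE)
        dirty = false;  /* vm_page_undirty(): contents may be dropped */
    if (!dirty)
        return ("head of inactive queue (vm_page_deactivate_noreuse)");
    return ("laundry queue (vm_page_launder)");
}

int
main(void)
{

    printf("dirty page, MADV_FREE:     %s\n",
        toy_advise(TOY_MADV_FREE, true));
    printf("dirty page, MADV_DONTNEED: %s\n",
        toy_advise(TOY_MADV_DONTNEED, true));
    printf("clean page, MADV_DONTNEED: %s\n",
        toy_advise(TOY_MADV_DONTNEED, false));
    return (0);
}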
*/ -#define PG_CACHED 0x0001 /* page is cached */ #define PG_FICTITIOUS 0x0004 /* physical page doesn't exist */ #define PG_ZERO 0x0008 /* page is zeroed */ #define PG_MARKER 0x0010 /* special queue marker page */ -#define PG_WINATCFLS 0x0040 /* flush dirty page on inactive q */ #define PG_NODUMP 0x0080 /* don't include this page in a dump */ #define PG_UNHOLDFREE 0x0100 /* delayed free of a held page */ @@ -351,19 +352,16 @@ extern struct mtx_padalign pa_lock[]; * free * Available for allocation now. * - * cache - * Almost available for allocation. Still associated with - * an object, but clean and immediately freeable. - * - * The following lists are LRU sorted: - * * inactive * Low activity, candidates for reclamation. + * This list is approximately LRU ordered. + * + * laundry * This is the list of pages that should be * paged out next. * * active - * Pages that are "active" i.e. they have been + * Pages that are "active", i.e., they have been * recently referenced. * */ @@ -407,8 +405,8 @@ vm_page_t PHYS_TO_VM_PAGE(vm_paddr_t pa); #define VM_ALLOC_ZERO 0x0040 /* (acfg) Try to obtain a zeroed page */ #define VM_ALLOC_NOOBJ 0x0100 /* (acg) No associated object */ #define VM_ALLOC_NOBUSY 0x0200 /* (acg) Do not busy the page */ -#define VM_ALLOC_IFCACHED 0x0400 /* (ag) Fail if page is not cached */ -#define VM_ALLOC_IFNOTCACHED 0x0800 /* (ag) Fail if page is cached */ +#define VM_ALLOC_IFCACHED 0x0400 +#define VM_ALLOC_IFNOTCACHED 0x0800 #define VM_ALLOC_IGN_SBUSY 0x1000 /* (g) Ignore shared busy flag */ #define VM_ALLOC_NODUMP 0x2000 /* (ag) don't include in dump */ #define VM_ALLOC_SBUSY 0x4000 /* (acg) Shared busy the page */ @@ -451,10 +449,6 @@ vm_page_t vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req, vm_paddr_t boundary, vm_memattr_t memattr); vm_page_t vm_page_alloc_freelist(int, int); vm_page_t vm_page_grab (vm_object_t, vm_pindex_t, int); -void vm_page_cache(vm_page_t); -void vm_page_cache_free(vm_object_t, vm_pindex_t, vm_pindex_t); -void vm_page_cache_transfer(vm_object_t, vm_pindex_t, vm_object_t); -int vm_page_try_to_cache (vm_page_t); int vm_page_try_to_free (vm_page_t); void vm_page_deactivate (vm_page_t); void vm_page_deactivate_noreuse(vm_page_t); @@ -464,7 +458,7 @@ vm_page_t vm_page_find_least(vm_object_t, vm_pindex_t); vm_page_t vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr); void vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr); int vm_page_insert (vm_page_t, vm_object_t, vm_pindex_t); -boolean_t vm_page_is_cached(vm_object_t object, vm_pindex_t pindex); +void vm_page_launder(vm_page_t m); vm_page_t vm_page_lookup (vm_object_t, vm_pindex_t); vm_page_t vm_page_next(vm_page_t m); int vm_page_pa_tryrelock(pmap_t, vm_paddr_t, vm_paddr_t *); @@ -698,5 +692,26 @@ vm_page_replace_checked(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex, (void)mret; } +static inline bool +vm_page_active(vm_page_t m) +{ + + return (m->queue == PQ_ACTIVE); +} + +static inline bool +vm_page_inactive(vm_page_t m) +{ + + return (m->queue == PQ_INACTIVE); +} + +static inline bool +vm_page_in_laundry(vm_page_t m) +{ + + return (m->queue == PQ_LAUNDRY); +} + #endif /* _KERNEL */ #endif /* !_VM_PAGE_ */ diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c index cd8fe45..cd7bfb6 100644 --- a/sys/vm/vm_pageout.c +++ b/sys/vm/vm_pageout.c @@ -119,7 +119,7 @@ __FBSDID("$FreeBSD$"); /* the kernel process "vm_pageout"*/ static void vm_pageout(void); static void vm_pageout_init(void); -static int vm_pageout_clean(vm_page_t m); +static int 
vm_pageout_clean(vm_page_t m, int *numpagedout); static int vm_pageout_cluster(vm_page_t m); static bool vm_pageout_scan(struct vm_domain *vmd, int pass); static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage, @@ -154,6 +154,9 @@ static struct kproc_desc vm_kp = { SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp); #endif +/* Pagedaemon activity rates, in subdivisions of one second. */ +#define VM_LAUNDER_RATE 10 +#define VM_INACT_SCAN_RATE 2 int vm_pageout_deficit; /* Estimated number of pages deficit */ int vm_pageout_wakeup_thresh; @@ -161,6 +164,13 @@ static int vm_pageout_oom_seq = 12; bool vm_pageout_wanted; /* Event on which pageout daemon sleeps */ bool vm_pages_needed; /* Are threads waiting for free pages? */ +/* Pending request for dirty page laundering. */ +static enum { + VM_LAUNDRY_IDLE, + VM_LAUNDRY_BACKGROUND, + VM_LAUNDRY_SHORTFALL +} vm_laundry_request = VM_LAUNDRY_IDLE; + #if !defined(NO_SWAPPING) static int vm_pageout_req_swapout; /* XXX */ static int vm_daemon_needed; @@ -168,9 +178,7 @@ static struct mtx vm_daemon_mtx; /* Allow for use by vm_pageout before vm_daemon is initialized. */ MTX_SYSINIT(vm_daemon, &vm_daemon_mtx, "vm daemon", MTX_DEF); #endif -static int vm_max_launder = 32; static int vm_pageout_update_period; -static int defer_swap_pageouts; static int disable_swap_pageouts; static int lowmem_period = 10; static time_t lowmem_uptime; @@ -193,9 +201,6 @@ SYSCTL_INT(_vm, OID_AUTO, pageout_wakeup_thresh, CTLFLAG_RW, &vm_pageout_wakeup_thresh, 0, "free page threshold for waking up the pageout daemon"); -SYSCTL_INT(_vm, OID_AUTO, max_launder, - CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout"); - SYSCTL_INT(_vm, OID_AUTO, pageout_update_period, CTLFLAG_RW, &vm_pageout_update_period, 0, "Maximum active LRU update period"); @@ -215,9 +220,6 @@ SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria"); #endif -SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts, - CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem"); - SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts, CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages"); @@ -229,6 +231,25 @@ SYSCTL_INT(_vm, OID_AUTO, pageout_oom_seq, CTLFLAG_RW, &vm_pageout_oom_seq, 0, "back-to-back calls to oom detector to start OOM"); +static int act_scan_laundry_weight = 3; +SYSCTL_INT(_vm, OID_AUTO, act_scan_laundry_weight, CTLFLAG_RW, + &act_scan_laundry_weight, 0, + "weight given to clean vs. 
dirty pages in active queue scans"); + +static u_int vm_background_launder_target; +SYSCTL_UINT(_vm, OID_AUTO, background_launder_target, CTLFLAG_RW, + &vm_background_launder_target, 0, + "background laundering target, in pages"); + +static u_int vm_background_launder_rate = 4096; +SYSCTL_UINT(_vm, OID_AUTO, background_launder_rate, CTLFLAG_RW, + &vm_background_launder_rate, 0, + "background laundering rate, in kilobytes per second"); + +static u_int vm_background_launder_max = 20 * 1024; +SYSCTL_UINT(_vm, OID_AUTO, background_launder_max, CTLFLAG_RW, + &vm_background_launder_max, 0, "background laundering cap, in kilobytes"); + #define VM_PAGEOUT_PAGE_COUNT 16 int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT; @@ -236,7 +257,11 @@ int vm_page_max_wired; /* XXX max # of wired pages system-wide */ SYSCTL_INT(_vm, OID_AUTO, max_wired, CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count"); +static u_int isqrt(u_int num); static boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *); +static int vm_pageout_launder(struct vm_domain *vmd, int launder, + bool in_shortfall); +static void vm_pageout_laundry_worker(void *arg); #if !defined(NO_SWAPPING) static void vm_pageout_map_deactivate_pages(vm_map_t, long); static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long); @@ -387,7 +412,7 @@ vm_pageout_cluster(vm_page_t m) /* * We can cluster only if the page is not clean, busy, or held, and - * the page is inactive. + * the page is in the laundry queue. * * During heavy mmap/modification loads the pageout * daemon can really fragment the underlying file @@ -413,7 +438,7 @@ more: break; } vm_page_lock(p); - if (p->queue != PQ_INACTIVE || + if (!vm_page_in_laundry(p) || p->hold_count != 0) { /* may be undergoing I/O */ vm_page_unlock(p); ib = 0; @@ -439,7 +464,7 @@ more: if (p->dirty == 0) break; vm_page_lock(p); - if (p->queue != PQ_INACTIVE || + if (!vm_page_in_laundry(p) || p->hold_count != 0) { /* may be undergoing I/O */ vm_page_unlock(p); break; @@ -519,23 +544,33 @@ vm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen, ("vm_pageout_flush: page %p is not write protected", mt)); switch (pageout_status[i]) { case VM_PAGER_OK: + vm_page_lock(mt); + if (vm_page_in_laundry(mt)) + vm_page_deactivate_noreuse(mt); + vm_page_unlock(mt); + /* FALLTHROUGH */ case VM_PAGER_PEND: numpagedout++; break; case VM_PAGER_BAD: /* - * Page outside of range of object. Right now we - * essentially lose the changes by pretending it - * worked. + * The page is outside the object's range. We pretend + * that the page out worked and clean the page, so the + * changes will be lost if the page is reclaimed by + * the page daemon. */ vm_page_undirty(mt); + vm_page_lock(mt); + if (vm_page_in_laundry(mt)) + vm_page_deactivate_noreuse(mt); + vm_page_unlock(mt); break; case VM_PAGER_ERROR: case VM_PAGER_FAIL: /* - * If page couldn't be paged out, then reactivate the - * page so it doesn't clog the inactive list. (We - * will try paging out it again later). + * If the page couldn't be paged out, then reactivate + * it so that it doesn't clog the laundry and inactive + * queues. (We will try paging it out again later). 
*/ vm_page_lock(mt); vm_page_activate(mt); @@ -617,10 +652,10 @@ vm_pageout_object_deactivate_pages(pmap_t pmap, vm_object_t first_object, act_delta = 1; vm_page_aflag_clear(p, PGA_REFERENCED); } - if (p->queue != PQ_ACTIVE && act_delta != 0) { + if (!vm_page_active(p) && act_delta != 0) { vm_page_activate(p); p->act_count += act_delta; - } else if (p->queue == PQ_ACTIVE) { + } else if (vm_page_active(p)) { if (act_delta == 0) { p->act_count -= min(p->act_count, ACT_DECLINE); @@ -636,7 +671,7 @@ vm_pageout_object_deactivate_pages(pmap_t pmap, vm_object_t first_object, p->act_count += ACT_ADVANCE; vm_page_requeue(p); } - } else if (p->queue == PQ_INACTIVE) + } else if (vm_page_inactive(p)) pmap_remove_all(p); vm_page_unlock(p); } @@ -739,7 +774,7 @@ vm_pageout_map_deactivate_pages(map, desired) * Returns 0 on success and an errno otherwise. */ static int -vm_pageout_clean(vm_page_t m) +vm_pageout_clean(vm_page_t m, int *numpagedout) { struct vnode *vp; struct mount *mp; @@ -797,7 +832,7 @@ vm_pageout_clean(vm_page_t m) * (3) reallocated to a different offset, or * (4) cleaned. */ - if (m->queue != PQ_INACTIVE || m->object != object || + if (!vm_page_in_laundry(m) || m->object != object || m->pindex != pindex || m->dirty == 0) { vm_page_unlock(m); error = ENXIO; @@ -821,7 +856,7 @@ vm_pageout_clean(vm_page_t m) * laundry. If it is still in the laundry, then we * start the cleaning operation. */ - if (vm_pageout_cluster(m) == 0) + if ((*numpagedout = vm_pageout_cluster(m)) == 0) error = EIO; unlock_all: @@ -840,11 +875,390 @@ unlock_mp: } /* + * Attempt to launder the specified number of pages. + * + * Returns the number of pages successfully laundered. + */ +static int +vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall) +{ + struct vm_pagequeue *pq; + vm_object_t object; + vm_page_t m, next; + int act_delta, error, maxscan, numpagedout, starting_target; + int vnodes_skipped; + bool pageout_ok, queue_locked; + + starting_target = launder; + vnodes_skipped = 0; + + /* + * Scan the laundry queue for pages eligible to be laundered. We stop + * once the target number of dirty pages have been laundered, or once + * we've reached the end of the queue. A single iteration of this loop + * may cause more than one page to be laundered because of clustering. + * + * maxscan ensures that we don't re-examine requeued pages. Any + * additional pages written as part of a cluster are subtracted from + * maxscan since they must be taken from the laundry queue. 
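A small standalone summary of how vm_pageout_flush() now disposes of laundry pages per pager status, as in the switch a little earlier in this hunk. The enum and strings are illustrative only; the code above is authoritative.

#include <stdio.h>

/* Stand-ins for the VM_PAGER_* status codes used in the switch above. */
enum toy_pager_status { TOY_OK, TOY_PEND, TOY_BAD, TOY_ERROR, TOY_FAIL };

static const char *
toy_flush_disposition(enum toy_pager_status status)
{

    switch (status) {
    case TOY_OK:
        return ("counted as paged out; laundry page moved near the "
            "head of the inactive queue");
    case TOY_PEND:
        return ("counted as paged out; page stays busy until the "
            "asynchronous write completes");
    case TOY_BAD:
        return ("page lies beyond the object's end: undirtied and moved "
            "near the head of the inactive queue");
    case TOY_ERROR:
    case TOY_FAIL:
        return ("write failed: reactivated so it does not clog the "
            "laundry and inactive queues");
    }
    return ("unknown");
}

int
main(void)
{
    static const char *names[] = { "OK", "PEND", "BAD", "ERROR", "FAIL" };
    int status;

    for (status = TOY_OK; status <= TOY_FAIL; status++)
        printf("VM_PAGER_%-5s -> %s\n", names[status],
            toy_flush_disposition((enum toy_pager_status)status));
    return (0);
}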
+ */ + pq = &vmd->vmd_pagequeues[PQ_LAUNDRY]; + maxscan = pq->pq_cnt; + + vm_pagequeue_lock(pq); + queue_locked = true; + for (m = TAILQ_FIRST(&pq->pq_pl); + m != NULL && maxscan-- > 0 && launder > 0; + m = next) { + vm_pagequeue_assert_locked(pq); + KASSERT(queue_locked, ("unlocked laundry queue")); + KASSERT(vm_page_in_laundry(m), + ("page %p has an inconsistent queue", m)); + next = TAILQ_NEXT(m, plinks.q); + if ((m->flags & PG_MARKER) != 0) + continue; + KASSERT((m->flags & PG_FICTITIOUS) == 0, + ("PG_FICTITIOUS page %p cannot be in laundry queue", m)); + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("VPO_UNMANAGED page %p cannot be in laundry queue", m)); + if (!vm_pageout_page_lock(m, &next) || m->hold_count != 0) { + vm_page_unlock(m); + continue; + } + object = m->object; + if ((!VM_OBJECT_TRYWLOCK(object) && + (!vm_pageout_fallback_object_lock(m, &next) || + m->hold_count != 0)) || vm_page_busied(m)) { + VM_OBJECT_WUNLOCK(object); + vm_page_unlock(m); + continue; + } + + /* + * Unlock the laundry queue, invalidating the 'next' pointer. + * Use a marker to remember our place in the laundry queue. + */ + TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_laundry_marker, + plinks.q); + vm_pagequeue_unlock(pq); + queue_locked = false; + + /* + * Invalid pages can be easily freed. They cannot be + * mapped; vm_page_free() asserts this. + */ + if (m->valid == 0) + goto free_page; + + /* + * If the page has been referenced and the object is not dead, + * reactivate or requeue the page depending on whether the + * object is mapped. + */ + if ((m->aflags & PGA_REFERENCED) != 0) { + vm_page_aflag_clear(m, PGA_REFERENCED); + act_delta = 1; + } else + act_delta = 0; + if (object->ref_count != 0) + act_delta += pmap_ts_referenced(m); + else { + KASSERT(!pmap_page_is_mapped(m), + ("page %p is mapped", m)); + } + if (act_delta != 0) { + if (object->ref_count != 0) { + PCPU_INC(cnt.v_reactivated); + vm_page_activate(m); + + /* + * Increase the activation count if the page + * was referenced while in the laundry queue. + * This makes it less likely that the page will + * be returned prematurely to the inactive + * queue. + */ + m->act_count += act_delta + ACT_ADVANCE; + + /* + * If this was a background laundering, count + * activated pages towards our target. The + * purpose of background laundering is to ensure + * that pages are eventually cycled through the + * laundry queue, and an activation is a valid + * way out. + */ + if (!in_shortfall) + launder--; + goto drop_page; + } else if ((object->flags & OBJ_DEAD) == 0) + goto requeue_page; + } + + /* + * If the page appears to be clean at the machine-independent + * layer, then remove all of its mappings from the pmap in + * anticipation of freeing it. If, however, any of the page's + * mappings allow write access, then the page may still be + * modified until the last of those mappings are removed. + */ + if (object->ref_count != 0) { + vm_page_test_dirty(m); + if (m->dirty == 0) + pmap_remove_all(m); + } + + /* + * Clean pages are freed, and dirty pages are paged out unless + * they belong to a dead object. Requeueing dirty pages from + * dead objects is pointless, as they are being paged out and + * freed by the thread that destroyed the object. 
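The marker trick used above, parking vmd_laundry_marker after the current page so the queue lock can be dropped and the scan resumed afterwards, can be exercised in userland with the sys/queue.h TAILQ macros. This is only a sketch with no real locking; toy_entry and its id field are inventions.

/*
 * Userland model of the laundry-queue marker trick: remember the scan
 * position with a dummy entry so that the list "lock" can be dropped while
 * the current element is processed, then resume from the marker.
 */
#include <sys/queue.h>
#include <stdio.h>

struct toy_entry {
    int id;                         /* -1 identifies the marker */
    TAILQ_ENTRY(toy_entry) link;
};

TAILQ_HEAD(toy_list, toy_entry);

int
main(void)
{
    struct toy_list list = TAILQ_HEAD_INITIALIZER(list);
    struct toy_entry pages[5], marker, *m, *next;
    int i;

    marker.id = -1;
    for (i = 0; i < 5; i++) {
        pages[i].id = i;
        TAILQ_INSERT_TAIL(&list, &pages[i], link);
    }

    for (m = TAILQ_FIRST(&list); m != NULL; m = next) {
        next = TAILQ_NEXT(m, link);
        if (m->id == -1)
            continue;       /* skip markers, as the PG_MARKER check does */

        /* Park the marker after "m" and pretend to drop the queue lock. */
        TAILQ_INSERT_AFTER(&list, m, &marker, link);
        printf("processing %d with the queue unlocked\n", m->id);

        /* "Relock" and resume from wherever the marker ended up. */
        next = TAILQ_NEXT(&marker, link);
        TAILQ_REMOVE(&list, &marker, link);
    }
    return (0);
}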
+ */ + if (m->dirty == 0) { +free_page: + vm_page_free(m); + PCPU_INC(cnt.v_dfree); + } else if ((object->flags & OBJ_DEAD) == 0) { + if (object->type != OBJT_SWAP && + object->type != OBJT_DEFAULT) + pageout_ok = true; + else if (disable_swap_pageouts) + pageout_ok = false; + else + pageout_ok = true; + if (!pageout_ok) { +requeue_page: + vm_pagequeue_lock(pq); + queue_locked = true; + vm_page_requeue_locked(m); + goto drop_page; + } + + /* + * Form a cluster with adjacent, dirty pages from the + * same object, and page out that entire cluster. + * + * The adjacent, dirty pages must also be in the + * laundry. However, their mappings are not checked + * for new references. Consequently, a recently + * referenced page may be paged out. However, that + * page will not be prematurely reclaimed. After page + * out, the page will be placed in the inactive queue, + * where any new references will be detected and the + * page reactivated. + */ + error = vm_pageout_clean(m, &numpagedout); + if (error == 0) { + launder -= numpagedout; + maxscan -= numpagedout - 1; + } else if (error == EDEADLK) { + pageout_lock_miss++; + vnodes_skipped++; + } + goto relock_queue; + } +drop_page: + vm_page_unlock(m); + VM_OBJECT_WUNLOCK(object); +relock_queue: + if (!queue_locked) { + vm_pagequeue_lock(pq); + queue_locked = true; + } + next = TAILQ_NEXT(&vmd->vmd_laundry_marker, plinks.q); + TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_laundry_marker, plinks.q); + } + vm_pagequeue_unlock(pq); + + /* + * Wakeup the sync daemon if we skipped a vnode in a writeable object + * and we didn't launder enough pages. + */ + if (vnodes_skipped > 0 && launder > 0) + (void)speedup_syncer(); + + return (starting_target - launder); +} + +/* + * Compute the integer square root. + */ +static u_int +isqrt(u_int num) +{ + u_int bit, root, tmp; + + bit = 1u << ((NBBY * sizeof(u_int)) - 2); + while (bit > num) + bit >>= 2; + root = 0; + while (bit != 0) { + tmp = root + bit; + root >>= 1; + if (num >= tmp) { + num -= tmp; + root += bit; + } + bit >>= 2; + } + return (root); +} + +/* + * Perform the work of the laundry thread: periodically wake up and determine + * whether any pages need to be laundered. If so, determine the number of pages + * that need to be laundered, and launder them. + */ +static void +vm_pageout_laundry_worker(void *arg) +{ + struct vm_domain *domain; + struct vm_pagequeue *pq; + uint64_t nclean, ndirty; + u_int last_launder, wakeups; + int domidx, last_target, launder, shortfall, shortfall_cycle, target; + bool in_shortfall; + + domidx = (uintptr_t)arg; + domain = &vm_dom[domidx]; + pq = &domain->vmd_pagequeues[PQ_LAUNDRY]; + KASSERT(domain->vmd_segs != 0, ("domain without segments")); + vm_pageout_init_marker(&domain->vmd_laundry_marker, PQ_LAUNDRY); + + shortfall = 0; + in_shortfall = false; + shortfall_cycle = 0; + target = 0; + last_launder = 0; + + /* + * The pageout laundry worker is never done, so loop forever. + */ + for (;;) { + KASSERT(target >= 0, ("negative target %d", target)); + KASSERT(shortfall_cycle >= 0, + ("negative cycle %d", shortfall_cycle)); + launder = 0; + wakeups = VM_METER_PCPU_CNT(v_pdwakeups); + + /* + * First determine whether we need to launder pages to meet a + * shortage of free pages. 
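The isqrt() routine above is the usual digit-by-digit binary integer square root. The standalone check below reuses the same algorithm, under the assumption that u_int is a 32-bit unsigned int, and verifies root*root <= num < (root+1)^2 for a few values; the harness itself is not part of the patch.

#include <limits.h>
#include <stdio.h>

/* Same digit-by-digit algorithm as the kernel's isqrt() above. */
static unsigned int
toy_isqrt(unsigned int num)
{
    unsigned int bit, root, tmp;

    bit = 1u << (sizeof(num) * 8 - 2);
    while (bit > num)
        bit >>= 2;
    root = 0;
    while (bit != 0) {
        tmp = root + bit;
        root >>= 1;
        if (num >= tmp) {
            num -= tmp;
            root += bit;
        }
        bit >>= 2;
    }
    return (root);
}

int
main(void)
{
    static const unsigned int values[] =
        { 0, 1, 2, 3, 4, 15, 16, 17, 99, 100, 65535, 65536, UINT_MAX };
    unsigned long long lo, hi;
    unsigned int r, v;
    size_t i;

    for (i = 0; i < sizeof(values) / sizeof(values[0]); i++) {
        v = values[i];
        r = toy_isqrt(v);
        lo = (unsigned long long)r * r;
        hi = ((unsigned long long)r + 1) * (r + 1);
        printf("isqrt(%10u) = %5u  %s\n", v, r,
            lo <= v && v < hi ? "ok" : "WRONG");
    }
    return (0);
}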
+ */ + if (shortfall > 0) { + in_shortfall = true; + shortfall_cycle = VM_LAUNDER_RATE / VM_INACT_SCAN_RATE; + target = shortfall; + } else if (!in_shortfall) + goto trybackground; + else if (shortfall_cycle == 0 || vm_laundry_target() <= 0) { + /* + * We recently entered shortfall and began laundering + * pages. If we have completed that laundering run + * (and we are no longer in shortfall) or we have met + * our laundry target through other activity, then we + * can stop laundering pages. + */ + in_shortfall = false; + target = 0; + goto trybackground; + } + last_launder = wakeups; + launder = target / shortfall_cycle--; + goto dolaundry; + + /* + * There's no immediate need to launder any pages; see if we + * meet the conditions to perform background laundering: + * + * 1. The ratio of dirty to clean inactive pages exceeds the + * background laundering threshold and the pagedaemon has + * been woken up to reclaim pages since our last + * laundering, or + * 2. we haven't yet reached the target of the current + * background laundering run. + * + * The background laundering threshold is not a constant. + * Instead, it is a slowly growing function of the number of + * page daemon wakeups since the last laundering. Thus, as the + * ratio of dirty to clean inactive pages grows, the amount of + * memory pressure required to trigger laundering decreases. + */ +trybackground: + nclean = vm_cnt.v_inactive_count + vm_cnt.v_free_count; + ndirty = vm_cnt.v_laundry_count; + if (target == 0 && wakeups != last_launder && + ndirty * isqrt(wakeups - last_launder) >= nclean) { + target = vm_background_launder_target; + } + + /* + * We have a non-zero background laundering target. If we've + * laundered up to our maximum without observing a page daemon + * wakeup, just stop. This is a safety belt that ensures we + * don't launder an excessive amount if memory pressure is low + * and the ratio of dirty to clean pages is large. Otherwise, + * proceed at the background laundering rate. + */ + if (target > 0) { + if (wakeups != last_launder) { + last_launder = wakeups; + last_target = target; + } else if (last_target - target >= + vm_background_launder_max * PAGE_SIZE / 1024) { + target = 0; + } + launder = vm_background_launder_rate * PAGE_SIZE / 1024; + launder /= VM_LAUNDER_RATE; + if (launder > target) + launder = target; + } + +dolaundry: + if (launder > 0) { + /* + * Because of I/O clustering, the number of laundered + * pages could exceed "target" by the maximum size of + * a cluster minus one. + */ + target -= min(vm_pageout_launder(domain, launder, + in_shortfall), target); + pause("laundp", hz / VM_LAUNDER_RATE); + } + + /* + * If we're not currently laundering pages and the page daemon + * hasn't posted a new request, sleep until the page daemon + * kicks us. + */ + vm_pagequeue_lock(pq); + if (target == 0 && vm_laundry_request == VM_LAUNDRY_IDLE) + (void)mtx_sleep(&vm_laundry_request, + vm_pagequeue_lockptr(pq), PVM, "launds", 0); + + /* + * If the pagedaemon has indicated that it's in shortfall, start + * a shortfall laundering unless we're already in the middle of + * one. This may preempt a background laundering. + */ + if (vm_laundry_request == VM_LAUNDRY_SHORTFALL && + (!in_shortfall || shortfall_cycle == 0)) { + shortfall = vm_laundry_target() + vm_pageout_deficit; + target = 0; + } else + shortfall = 0; + + if (target == 0) + vm_laundry_request = VM_LAUNDRY_IDLE; + vm_pagequeue_unlock(pq); + } +} + +/* * vm_pageout_scan does the dirty work for the pageout daemon. 
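To make the background-laundering trigger concrete: laundering starts once ndirty * isqrt(page daemon wakeups since the last laundering) >= nclean, per the comparison above. The standalone sketch below computes, for a few invented clean and dirty page counts, how many page daemon wakeups must accumulate before that condition holds; only the formula is taken from the code.

#include <stdio.h>

static unsigned int
toy_isqrt(unsigned int num)
{
    unsigned int bit, root, tmp;

    bit = 1u << (sizeof(num) * 8 - 2);
    while (bit > num)
        bit >>= 2;
    root = 0;
    while (bit != 0) {
        tmp = root + bit;
        root >>= 1;
        if (num >= tmp) {
            num -= tmp;
            root += bit;
        }
        bit >>= 2;
    }
    return (root);
}

/* Smallest number of page daemon wakeups satisfying the trigger. */
static unsigned int
wakeups_until_laundering(unsigned int nclean, unsigned int ndirty)
{
    unsigned int w;

    for (w = 1; (unsigned long long)ndirty * toy_isqrt(w) < nclean; w++)
        ;
    return (w);
}

int
main(void)
{
    /* nclean = inactive + free pages, ndirty = laundry pages (invented). */
    printf("nclean 400000, ndirty 100000 -> %u wakeups\n",
        wakeups_until_laundering(400000, 100000));
    printf("nclean 400000, ndirty  20000 -> %u wakeups\n",
        wakeups_until_laundering(400000, 20000));
    printf("nclean 400000, ndirty 400000 -> %u wakeups\n",
        wakeups_until_laundering(400000, 400000));
    return (0);
}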
* - * pass 0 - Update active LRU/deactivate pages - * pass 1 - Free inactive pages - * pass 2 - Launder dirty pages + * pass == 0: Update active LRU/deactivate pages + * pass >= 1: Free inactive pages * * Returns true if pass was zero or enough pages were freed by the inactive * queue scan to meet the target. @@ -856,10 +1270,9 @@ vm_pageout_scan(struct vm_domain *vmd, int pass) struct vm_pagequeue *pq; vm_object_t object; long min_scan; - int act_delta, addl_page_shortage, deficit, error, inactq_shortage; - int maxlaunder, maxscan, page_shortage, scan_tick, scanned; - int starting_page_shortage, vnodes_skipped; - boolean_t pageout_ok, queue_locked; + int act_delta, addl_page_shortage, deficit, inactq_shortage, maxscan; + int page_shortage, scan_tick, scanned, starting_page_shortage; + boolean_t queue_locked; /* * If we need to reclaim memory ask kernel caches to return @@ -901,23 +1314,6 @@ vm_pageout_scan(struct vm_domain *vmd, int pass) starting_page_shortage = page_shortage; /* - * maxlaunder limits the number of dirty pages we flush per scan. - * For most systems a smaller value (16 or 32) is more robust under - * extreme memory and disk pressure because any unnecessary writes - * to disk can result in extreme performance degredation. However, - * systems with excessive dirty pages (especially when MAP_NOSYNC is - * used) will die horribly with limited laundering. If the pageout - * daemon cannot clean enough pages in the first pass, we let it go - * all out in succeeding passes. - */ - if ((maxlaunder = vm_max_launder) <= 1) - maxlaunder = 1; - if (pass > 1) - maxlaunder = 10000; - - vnodes_skipped = 0; - - /* * Start scanning the inactive queue for pages that we can free. The * scan will stop when we reach the target or we have scanned the * entire queue. (Note that m->act_count is not used to make @@ -932,7 +1328,7 @@ vm_pageout_scan(struct vm_domain *vmd, int pass) m = next) { vm_pagequeue_assert_locked(pq); KASSERT(queue_locked, ("unlocked inactive queue")); - KASSERT(m->queue == PQ_INACTIVE, ("Inactive queue %p", m)); + KASSERT(vm_page_inactive(m), ("Inactive queue %p", m)); PCPU_INC(cnt.v_pdpages); next = TAILQ_NEXT(m, plinks.q); @@ -995,11 +1391,15 @@ unlock_page: KASSERT(m->hold_count == 0, ("Held page %p", m)); /* - * We unlock the inactive page queue, invalidating the - * 'next' pointer. Use our marker to remember our - * place. + * Dequeue the inactive page and unlock the inactive page + * queue, invalidating the 'next' pointer. Dequeueing the + * page here avoids a later reacquisition (and release) of + * the inactive page queue lock when vm_page_activate(), + * vm_page_free(), or vm_page_launder() is called. Use a + * marker to remember our place in the inactive queue. 
*/ TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_marker, plinks.q); + vm_page_dequeue_locked(m); vm_pagequeue_unlock(pq); queue_locked = FALSE; @@ -1028,6 +1428,7 @@ unlock_page: } if (act_delta != 0) { if (object->ref_count != 0) { + PCPU_INC(cnt.v_reactivated); vm_page_activate(m); /* @@ -1039,8 +1440,14 @@ unlock_page: */ m->act_count += act_delta + ACT_ADVANCE; goto drop_page; - } else if ((object->flags & OBJ_DEAD) == 0) - goto requeue_page; + } else if ((object->flags & OBJ_DEAD) == 0) { + vm_pagequeue_lock(pq); + queue_locked = TRUE; + m->queue = PQ_INACTIVE; + TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); + vm_pagequeue_cnt_inc(pq); + goto drop_page; + } } /* @@ -1056,83 +1463,23 @@ unlock_page: pmap_remove_all(m); } + /* + * Clean pages can be freed, but dirty pages must be sent back + * to the laundry, unless they belong to a dead object. + * Requeueing dirty pages from dead objects is pointless, as + * they are being paged out and freed by the thread that + * destroyed the object. + */ if (m->dirty == 0) { - /* - * Clean pages can be freed. - */ free_page: vm_page_free(m); PCPU_INC(cnt.v_dfree); --page_shortage; - } else if ((object->flags & OBJ_DEAD) != 0) { - /* - * Leave dirty pages from dead objects at the front of - * the queue. They are being paged out and freed by - * the thread that destroyed the object. They will - * leave the queue shortly after the scan finishes, so - * they should be discounted from the inactive count. - */ - addl_page_shortage++; - } else if ((m->flags & PG_WINATCFLS) == 0 && pass < 2) { - /* - * Dirty pages need to be paged out, but flushing - * a page is extremely expensive versus freeing - * a clean page. Rather then artificially limiting - * the number of pages we can flush, we instead give - * dirty pages extra priority on the inactive queue - * by forcing them to be cycled through the queue - * twice before being flushed, after which the - * (now clean) page will cycle through once more - * before being freed. This significantly extends - * the thrash point for a heavily loaded machine. - */ - m->flags |= PG_WINATCFLS; -requeue_page: - vm_pagequeue_lock(pq); - queue_locked = TRUE; - vm_page_requeue_locked(m); - } else if (maxlaunder > 0) { - /* - * We always want to try to flush some dirty pages if - * we encounter them, to keep the system stable. - * Normally this number is small, but under extreme - * pressure where there are insufficient clean pages - * on the inactive queue, we may have to go all out. - */ - - if (object->type != OBJT_SWAP && - object->type != OBJT_DEFAULT) - pageout_ok = TRUE; - else if (disable_swap_pageouts) - pageout_ok = FALSE; - else if (defer_swap_pageouts) - pageout_ok = vm_page_count_min(); - else - pageout_ok = TRUE; - if (!pageout_ok) - goto requeue_page; - error = vm_pageout_clean(m); - /* - * Decrement page_shortage on success to account for - * the (future) cleaned page. Otherwise we could wind - * up laundering or cleaning too many pages. 
- */ - if (error == 0) { - page_shortage--; - maxlaunder--; - } else if (error == EDEADLK) { - pageout_lock_miss++; - vnodes_skipped++; - } else if (error == EBUSY) { - addl_page_shortage++; - } - vm_page_lock_assert(m, MA_NOTOWNED); - goto relock_queue; - } + } else if ((object->flags & OBJ_DEAD) == 0) + vm_page_launder(m); drop_page: vm_page_unlock(m); VM_OBJECT_WUNLOCK(object); -relock_queue: if (!queue_locked) { vm_pagequeue_lock(pq); queue_locked = TRUE; @@ -1142,6 +1489,24 @@ relock_queue: } vm_pagequeue_unlock(pq); + /* + * Wake up the laundry thread so that it can perform any needed + * laundering. If we didn't meet our target, we're in shortfall and + * need to launder more aggressively. + */ + if (vm_laundry_request == VM_LAUNDRY_IDLE && + starting_page_shortage > 0) { + pq = &vm_dom[0].vmd_pagequeues[PQ_LAUNDRY]; + vm_pagequeue_lock(pq); + if (page_shortage > 0) { + vm_laundry_request = VM_LAUNDRY_SHORTFALL; + PCPU_INC(cnt.v_pdshortfalls); + } else if (vm_laundry_request != VM_LAUNDRY_SHORTFALL) + vm_laundry_request = VM_LAUNDRY_BACKGROUND; + wakeup(&vm_laundry_request); + vm_pagequeue_unlock(pq); + } + #if !defined(NO_SWAPPING) /* * Wakeup the swapout daemon if we didn't free the targeted number of @@ -1152,14 +1517,6 @@ relock_queue: #endif /* - * Wakeup the sync daemon if we skipped a vnode in a writeable object - * and we didn't free enough pages. - */ - if (vnodes_skipped > 0 && page_shortage > vm_cnt.v_free_target - - vm_cnt.v_free_min) - (void)speedup_syncer(); - - /* * If the inactive queue scan fails repeatedly to meet its * target, kill the largest process. */ @@ -1167,10 +1524,20 @@ relock_queue: /* * Compute the number of pages we want to try to move from the - * active queue to the inactive queue. + * active queue to either the inactive or laundry queue. + * + * When scanning active pages, we make clean pages count more heavily + * towards the page shortage than dirty pages. This is because dirty + * pages must be laundered before they can be reused and thus have less + * utility when attempting to quickly alleviate a shortage. However, + * this weighting also causes the scan to deactivate dirty pages more + * more aggressively, improving the effectiveness of clustering and + * ensuring that they can eventually be reused. */ - inactq_shortage = vm_cnt.v_inactive_target - vm_cnt.v_inactive_count + + inactq_shortage = vm_cnt.v_inactive_target - (vm_cnt.v_inactive_count + + vm_cnt.v_laundry_count / act_scan_laundry_weight) + vm_paging_target() + deficit + addl_page_shortage; + page_shortage *= act_scan_laundry_weight; pq = &vmd->vmd_pagequeues[PQ_ACTIVE]; vm_pagequeue_lock(pq); @@ -1254,14 +1621,44 @@ relock_queue: m->act_count -= min(m->act_count, ACT_DECLINE); /* - * Move this page to the tail of the active or inactive + * Move this page to the tail of the active, inactive or laundry * queue depending on usage. */ if (m->act_count == 0) { /* Dequeue to avoid later lock recursion. */ vm_page_dequeue_locked(m); - vm_page_deactivate(m); - inactq_shortage--; + + /* + * When not short for inactive pages, let dirty pages go + * through the inactive queue before moving to the + * laundry queues. This gives them some extra time to + * be reactivated, potentially avoiding an expensive + * pageout. During a page shortage, the inactive queue + * is necessarily small, so we may move dirty pages + * directly to the laundry queue. 
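A worked example of the act_scan_laundry_weight bias described above, using invented counter values and the default weight of 3: laundry pages count for only a third of an inactive page when sizing the shortage, and a deactivated clean page repays the shortage three times as fast as a laundered dirty one. The per-page decrements of the real scan are collapsed into bulk arithmetic here.

#include <stdio.h>

int
main(void)
{
    const int act_scan_laundry_weight = 3;      /* default of the sysctl */
    /* Invented snapshot of the counters used in the computation above. */
    int inactive_target = 50000, inactive_count = 30000;
    int laundry_count = 9000, paging_target = 2000;
    int deficit = 0, addl_page_shortage = 0;
    int inactq_shortage, ndeact_clean, nlaunder;

    inactq_shortage = inactive_target - (inactive_count +
        laundry_count / act_scan_laundry_weight) + paging_target +
        deficit + addl_page_shortage;
    printf("initial inactq_shortage: %d pages\n", inactq_shortage);

    /*
     * Pretend the active queue scan deactivates 5000 clean pages and
     * launders 4000 dirty ones.
     */
    ndeact_clean = 5000;
    nlaunder = 4000;
    inactq_shortage -= ndeact_clean * act_scan_laundry_weight;
    inactq_shortage -= nlaunder;
    printf("after deactivating %d clean and laundering %d dirty: %d\n",
        ndeact_clean, nlaunder, inactq_shortage);
    return (0);
}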
+ */ + if (inactq_shortage <= 0) + vm_page_deactivate(m); + else { + /* + * Calling vm_page_test_dirty() here would + * require acquisition of the object's write + * lock. However, during a page shortage, + * directing dirty pages into the laundry + * queue is only an optimization and not a + * requirement. Therefore, we simply rely on + * the opportunistic updates to the page's + * dirty field by the pmap. + */ + if (m->dirty == 0) { + vm_page_deactivate(m); + inactq_shortage -= + act_scan_laundry_weight; + } else { + vm_page_launder(m); + inactq_shortage--; + } + } } else vm_page_requeue_locked(m); vm_page_unlock(m); @@ -1569,14 +1966,14 @@ vm_pageout_worker(void *arg) * thread during the previous scan, which must have * been a level 0 scan, or vm_pageout_wanted was * already set and the scan failed to free enough - * pages. If we haven't yet performed a level >= 2 - * scan (unlimited dirty cleaning), then upgrade the - * level and scan again now. Otherwise, sleep a bit - * and try again later. + * pages. If we haven't yet performed a level >= 1 + * (page reclamation) scan, then increase the level + * and scan again now. Otherwise, sleep a bit and + * try again later. */ mtx_unlock(&vm_page_queue_free_mtx); - if (pass > 1) - pause("psleep", hz / 2); + if (pass >= 1) + pause("psleep", hz / VM_INACT_SCAN_RATE); pass++; } else { /* @@ -1647,6 +2044,14 @@ vm_pageout_init(void) /* XXX does not really belong here */ if (vm_page_max_wired == 0) vm_page_max_wired = vm_cnt.v_free_count / 3; + + /* + * Target amount of memory to move out of the laundry queue during a + * background laundering. This is proportional to the amount of system + * memory. + */ + vm_background_launder_target = (vm_cnt.v_free_target - + vm_cnt.v_free_min) / 10; } /* @@ -1661,6 +2066,10 @@ vm_pageout(void) #endif swap_pager_swap_init(); + error = kthread_add(vm_pageout_laundry_worker, NULL, curproc, NULL, + 0, 0, "laundry: dom0"); + if (error != 0) + panic("starting laundry for domain 0, error %d", error); #ifdef VM_NUMA_ALLOC for (i = 1; i < vm_ndomains; i++) { error = kthread_add(vm_pageout_worker, (void *)(uintptr_t)i, diff --git a/sys/vm/vm_phys.c b/sys/vm/vm_phys.c index ab48f58..484417b 100644 --- a/sys/vm/vm_phys.c +++ b/sys/vm/vm_phys.c @@ -1314,7 +1314,7 @@ vm_phys_zero_pages_idle(void) for (;;) { TAILQ_FOREACH_REVERSE(m, &fl[oind].pl, pglist, plinks.q) { for (m_tmp = m; m_tmp < &m[1 << oind]; m_tmp++) { - if ((m_tmp->flags & (PG_CACHED | PG_ZERO)) == 0) { + if ((m_tmp->flags & PG_ZERO) == 0) { vm_phys_unfree_page(m_tmp); vm_phys_freecnt_adj(m, -1); mtx_unlock(&vm_page_queue_free_mtx); diff --git a/sys/vm/vm_radix.c b/sys/vm/vm_radix.c index 80c8bd0..4f0a575 100644 --- a/sys/vm/vm_radix.c +++ b/sys/vm/vm_radix.c @@ -339,8 +339,6 @@ vm_radix_insert(struct vm_radix *rtree, vm_page_t page) index = page->pindex; -restart: - /* * The owner of record for root is not really important because it * will never be used. @@ -358,32 +356,10 @@ restart: panic("%s: key %jx is already present", __func__, (uintmax_t)index); clev = vm_radix_keydiff(m->pindex, index); - - /* - * During node allocation the trie that is being - * walked can be modified because of recursing radix - * trie operations. - * If this is the case, the recursing functions signal - * such situation and the insert operation must - * start from scratch again. - * The freed radix node will then be in the UMA - * caches very likely to avoid the same situation - * to happen. 
- */ - rtree->rt_flags |= RT_INSERT_INPROG; tmp = vm_radix_node_get(vm_radix_trimkey(index, clev + 1), 2, clev); - rtree->rt_flags &= ~RT_INSERT_INPROG; - if (tmp == NULL) { - rtree->rt_flags &= ~RT_TRIE_MODIFIED; + if (tmp == NULL) return (ENOMEM); - } - if ((rtree->rt_flags & RT_TRIE_MODIFIED) != 0) { - rtree->rt_flags &= ~RT_TRIE_MODIFIED; - tmp->rn_count = 0; - vm_radix_node_put(tmp); - goto restart; - } *parentp = tmp; vm_radix_addpage(tmp, index, clev, page); vm_radix_addpage(tmp, m->pindex, clev, m); @@ -407,21 +383,9 @@ restart: */ newind = rnode->rn_owner; clev = vm_radix_keydiff(newind, index); - - /* See the comments above. */ - rtree->rt_flags |= RT_INSERT_INPROG; tmp = vm_radix_node_get(vm_radix_trimkey(index, clev + 1), 2, clev); - rtree->rt_flags &= ~RT_INSERT_INPROG; - if (tmp == NULL) { - rtree->rt_flags &= ~RT_TRIE_MODIFIED; + if (tmp == NULL) return (ENOMEM); - } - if ((rtree->rt_flags & RT_TRIE_MODIFIED) != 0) { - rtree->rt_flags &= ~RT_TRIE_MODIFIED; - tmp->rn_count = 0; - vm_radix_node_put(tmp); - goto restart; - } *parentp = tmp; vm_radix_addpage(tmp, index, clev, page); slot = vm_radix_slot(newind, clev); @@ -696,51 +660,37 @@ descend: } /* - * Remove the specified index from the tree. - * Panics if the key is not present. + * Remove the specified index from the trie, and return the value stored at + * that index. If the index is not present, return NULL. */ -void +vm_page_t vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index) { struct vm_radix_node *rnode, *parent; vm_page_t m; int i, slot; - /* - * Detect if a page is going to be removed from a trie which is - * already undergoing another trie operation. - * Right now this is only possible for vm_radix_remove() recursing - * into vm_radix_insert(). - * If this is the case, the caller must be notified about this - * situation. It will also takecare to update the RT_TRIE_MODIFIED - * accordingly. - * The RT_TRIE_MODIFIED bit is set here because the remove operation - * will always succeed. 
- */ - if ((rtree->rt_flags & RT_INSERT_INPROG) != 0) - rtree->rt_flags |= RT_TRIE_MODIFIED; - rnode = vm_radix_getroot(rtree); if (vm_radix_isleaf(rnode)) { m = vm_radix_topage(rnode); if (m->pindex != index) - panic("%s: invalid key found", __func__); + return (NULL); vm_radix_setroot(rtree, NULL); - return; + return (m); } parent = NULL; for (;;) { if (rnode == NULL) - panic("vm_radix_remove: impossible to locate the key"); + return (NULL); slot = vm_radix_slot(index, rnode->rn_clev); if (vm_radix_isleaf(rnode->rn_child[slot])) { m = vm_radix_topage(rnode->rn_child[slot]); if (m->pindex != index) - panic("%s: invalid key found", __func__); + return (NULL); rnode->rn_child[slot] = NULL; rnode->rn_count--; if (rnode->rn_count > 1) - break; + return (m); for (i = 0; i < VM_RADIX_COUNT; i++) if (rnode->rn_child[i] != NULL) break; @@ -757,7 +707,7 @@ vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index) rnode->rn_count--; rnode->rn_child[i] = NULL; vm_radix_node_put(rnode); - break; + return (m); } parent = rnode; rnode = rnode->rn_child[slot]; @@ -774,9 +724,6 @@ vm_radix_reclaim_allnodes(struct vm_radix *rtree) { struct vm_radix_node *root; - KASSERT((rtree->rt_flags & RT_INSERT_INPROG) == 0, - ("vm_radix_reclaim_allnodes: unexpected trie recursion")); - root = vm_radix_getroot(rtree); if (root == NULL) return; diff --git a/sys/vm/vm_radix.h b/sys/vm/vm_radix.h index 63d27d4..b8a722d 100644 --- a/sys/vm/vm_radix.h +++ b/sys/vm/vm_radix.h @@ -42,7 +42,7 @@ vm_page_t vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index); vm_page_t vm_radix_lookup_ge(struct vm_radix *rtree, vm_pindex_t index); vm_page_t vm_radix_lookup_le(struct vm_radix *rtree, vm_pindex_t index); void vm_radix_reclaim_allnodes(struct vm_radix *rtree); -void vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index); +vm_page_t vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index); vm_page_t vm_radix_replace(struct vm_radix *rtree, vm_page_t newpage); #endif /* _KERNEL */ diff --git a/sys/vm/vm_reserv.c b/sys/vm/vm_reserv.c index 8bb1788..7e2bfb6 100644 --- a/sys/vm/vm_reserv.c +++ b/sys/vm/vm_reserv.c @@ -62,7 +62,7 @@ __FBSDID("$FreeBSD$"); /* * The reservation system supports the speculative allocation of large physical - * pages ("superpages"). Speculative allocation enables the fully-automatic + * pages ("superpages"). Speculative allocation enables the fully automatic * utilization of superpages by the virtual memory system. In other words, no * programmatic directives are required to use superpages. */ @@ -155,11 +155,11 @@ popmap_is_set(popmap_t popmap[], int i) * physical pages for the range [pindex, pindex + VM_LEVEL_0_NPAGES) of offsets * within that object. The reservation's "popcnt" tracks the number of these * small physical pages that are in use at any given time. When and if the - * reservation is not fully utilized, it appears in the queue of partially- + * reservation is not fully utilized, it appears in the queue of partially * populated reservations. The reservation always appears on the containing * object's list of reservations. * - * A partially-populated reservation can be broken and reclaimed at any time. + * A partially populated reservation can be broken and reclaimed at any time. 
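The vm_radix_remove() change in this hunk turns a panic on a missing key into a NULL return and hands back the removed page. A deliberately trivial model of that contract, with a flat array standing in for the radix trie, is sketched below; nothing in it reflects the trie's actual structure.

#include <stddef.h>
#include <stdio.h>

struct toy_page { unsigned long pindex; };

#define TOY_SIZE 8
static struct toy_page *toy_slots[TOY_SIZE];

/* New-style remove: return the stored page, or NULL if nothing is there. */
static struct toy_page *
toy_remove(unsigned long pindex)
{
    struct toy_page *m;

    if (pindex >= TOY_SIZE || toy_slots[pindex] == NULL)
        return (NULL);          /* the old code would have panicked */
    m = toy_slots[pindex];
    toy_slots[pindex] = NULL;
    return (m);
}

int
main(void)
{
    struct toy_page a = { .pindex = 3 };

    toy_slots[3] = &a;
    printf("remove(3):       %s\n", toy_remove(3) != NULL ? "page" : "NULL");
    printf("remove(3) again: %s\n", toy_remove(3) != NULL ? "page" : "NULL");
    return (0);
}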
 */
 struct vm_reserv {
         TAILQ_ENTRY(vm_reserv) partpopq;
@@ -196,11 +196,11 @@ struct vm_reserv {
 static vm_reserv_t vm_reserv_array;

 /*
- * The partially-populated reservation queue
+ * The partially populated reservation queue
  *
- * This queue enables the fast recovery of an unused cached or free small page
- * from a partially-populated reservation. The reservation at the head of
- * this queue is the least-recently-changed, partially-populated reservation.
+ * This queue enables the fast recovery of an unused free small page from a
+ * partially populated reservation. The reservation at the head of this queue
+ * is the least recently changed, partially populated reservation.
  *
  * Access to this queue is synchronized by the free page queue lock.
  */
@@ -225,7 +225,7 @@ SYSCTL_PROC(_vm_reserv, OID_AUTO, fullpop, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
 static int sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS);

 SYSCTL_OID(_vm_reserv, OID_AUTO, partpopq, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0,
-    sysctl_vm_reserv_partpopq, "A", "Partially-populated reservation queues");
+    sysctl_vm_reserv_partpopq, "A", "Partially populated reservation queues");

 static long vm_reserv_reclaimed;
 SYSCTL_LONG(_vm_reserv, OID_AUTO, reclaimed, CTLFLAG_RD,
@@ -267,7 +267,7 @@ sysctl_vm_reserv_fullpop(SYSCTL_HANDLER_ARGS)
 }

 /*
- * Describes the current state of the partially-populated reservation queue.
+ * Describes the current state of the partially populated reservation queue.
  */
 static int
 sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS)
@@ -301,7 +301,7 @@ sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS)
 /*
  * Reduces the given reservation's population count. If the population count
  * becomes zero, the reservation is destroyed. Additionally, moves the
- * reservation to the tail of the partially-populated reservation queue if the
+ * reservation to the tail of the partially populated reservation queue if the
  * population count is non-zero.
  *
  * The free page queue lock must be held.
@@ -363,7 +363,7 @@ vm_reserv_has_pindex(vm_reserv_t rv, vm_pindex_t pindex)

 /*
  * Increases the given reservation's population count. Moves the reservation
- * to the tail of the partially-populated reservation queue.
+ * to the tail of the partially populated reservation queue.
  *
  * The free page queue must be locked.
  */
@@ -404,14 +404,18 @@ vm_reserv_populate(vm_reserv_t rv, int index)
  * physical address boundary that is a multiple of that value. Both
  * "alignment" and "boundary" must be a power of two.
  *
+ * The page "mpred" must immediately precede the offset "pindex" within the
+ * specified object.
+ *
  * The object and free page queue must be locked.
  */
 vm_page_t
 vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t pindex, u_long npages,
-    vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
+    vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary,
+    vm_page_t mpred)
 {
         vm_paddr_t pa, size;
-        vm_page_t m, m_ret, mpred, msucc;
+        vm_page_t m, m_ret, msucc;
         vm_pindex_t first, leftcap, rightcap;
         vm_reserv_t rv;
         u_long allocpages, maxpages, minpages;
@@ -448,10 +452,11 @@ vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t pindex, u_long npages,
         /*
          * Look for an existing reservation.
          */
-        mpred = vm_radix_lookup_le(&object->rtree, pindex);
         if (mpred != NULL) {
+                KASSERT(mpred->object == object,
+                    ("vm_reserv_alloc_contig: object doesn't contain mpred"));
                 KASSERT(mpred->pindex < pindex,
-                    ("vm_reserv_alloc_contig: pindex already allocated"));
+                    ("vm_reserv_alloc_contig: mpred doesn't precede pindex"));
                 rv = vm_reserv_from_page(mpred);
                 if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
                         goto found;
@@ -460,7 +465,7 @@ vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t pindex, u_long npages,
                 msucc = TAILQ_FIRST(&object->memq);
         if (msucc != NULL) {
                 KASSERT(msucc->pindex > pindex,
-                    ("vm_reserv_alloc_contig: pindex already allocated"));
+                    ("vm_reserv_alloc_contig: msucc doesn't succeed pindex"));
                 rv = vm_reserv_from_page(msucc);
                 if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
                         goto found;
@@ -597,7 +602,7 @@ found:
 }

 /*
- * Allocates a page from an existing or newly-created reservation.
+ * Allocates a page from an existing or newly created reservation.
  *
  * The page "mpred" must immediately precede the offset "pindex" within the
  * specified object.
@@ -721,12 +726,12 @@ found:
 }

 /*
- * Breaks the given reservation. Except for the specified cached or free
- * page, all cached and free pages in the reservation are returned to the
- * physical memory allocator. The reservation's population count and map are
- * reset to their initial state.
+ * Breaks the given reservation. Except for the specified free page, all free
+ * pages in the reservation are returned to the physical memory allocator.
+ * The reservation's population count and map are reset to their initial
+ * state.
  *
- * The given reservation must not be in the partially-populated reservation
+ * The given reservation must not be in the partially populated reservation
  * queue. The free page queue lock must be held.
  */
 static void
@@ -895,7 +900,7 @@ vm_reserv_level(vm_page_t m)
 }

 /*
- * Returns a reservation level if the given page belongs to a fully-populated
+ * Returns a reservation level if the given page belongs to a fully populated
  * reservation and -1 otherwise.
  */
 int
@@ -908,47 +913,8 @@ vm_reserv_level_iffullpop(vm_page_t m)
 }

 /*
- * Prepare for the reactivation of a cached page.
- *
- * First, suppose that the given page "m" was allocated individually, i.e., not
- * as part of a reservation, and cached. Then, suppose a reservation
- * containing "m" is allocated by the same object. Although "m" and the
- * reservation belong to the same object, "m"'s pindex may not match the
- * reservation's.
- *
- * The free page queue must be locked.
- */
-boolean_t
-vm_reserv_reactivate_page(vm_page_t m)
-{
-        vm_reserv_t rv;
-        int index;
-
-        mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
-        rv = vm_reserv_from_page(m);
-        if (rv->object == NULL)
-                return (FALSE);
-        KASSERT((m->flags & PG_CACHED) != 0,
-            ("vm_reserv_reactivate_page: page %p is not cached", m));
-        if (m->object == rv->object &&
-            m->pindex - rv->pindex == (index = VM_RESERV_INDEX(m->object,
-            m->pindex)))
-                vm_reserv_populate(rv, index);
-        else {
-                KASSERT(rv->inpartpopq,
-                    ("vm_reserv_reactivate_page: reserv %p's inpartpopq is FALSE",
-                    rv));
-                TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq);
-                rv->inpartpopq = FALSE;
-                /* Don't release "m" to the physical memory allocator. */
-                vm_reserv_break(rv, m);
-        }
-        return (TRUE);
-}
-
-/*
- * Breaks the given partially-populated reservation, releasing its cached and
- * free pages to the physical memory allocator.
+ * Breaks the given partially populated reservation, releasing its free pages
+ * to the physical memory allocator.
  *
  * The free page queue lock must be held.
  */
@@ -966,9 +932,9 @@ vm_reserv_reclaim(vm_reserv_t rv)
 }

 /*
- * Breaks the reservation at the head of the partially-populated reservation
- * queue, releasing its cached and free pages to the physical memory
- * allocator. Returns TRUE if a reservation is broken and FALSE otherwise.
+ * Breaks the reservation at the head of the partially populated reservation
+ * queue, releasing its free pages to the physical memory allocator. Returns
+ * TRUE if a reservation is broken and FALSE otherwise.
  *
  * The free page queue lock must be held.
  */
@@ -986,11 +952,10 @@ vm_reserv_reclaim_inactive(void)
 }

 /*
- * Searches the partially-populated reservation queue for the least recently
- * active reservation with unused pages, i.e., cached or free, that satisfy the
- * given request for contiguous physical memory. If a satisfactory reservation
- * is found, it is broken. Returns TRUE if a reservation is broken and FALSE
- * otherwise.
+ * Searches the partially populated reservation queue for the least recently
+ * changed reservation with free pages that satisfy the given request for
+ * contiguous physical memory. If a satisfactory reservation is found, it is
+ * broken. Returns TRUE if a reservation is broken and FALSE otherwise.
  *
  * The free page queue lock must be held.
  */
diff --git a/sys/vm/vm_reserv.h b/sys/vm/vm_reserv.h
index 52f6ab2..8b33b48 100644
--- a/sys/vm/vm_reserv.h
+++ b/sys/vm/vm_reserv.h
@@ -47,7 +47,7 @@
  */
 vm_page_t vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t pindex,
             u_long npages, vm_paddr_t low, vm_paddr_t high,
-            u_long alignment, vm_paddr_t boundary);
+            u_long alignment, vm_paddr_t boundary, vm_page_t mpred);
 vm_page_t vm_reserv_alloc_page(vm_object_t object, vm_pindex_t pindex,
             vm_page_t mpred);
 void vm_reserv_break_all(vm_object_t object);
@@ -56,7 +56,6 @@ void vm_reserv_init(void);
 bool vm_reserv_is_page_free(vm_page_t m);
 int vm_reserv_level(vm_page_t m);
 int vm_reserv_level_iffullpop(vm_page_t m);
-boolean_t vm_reserv_reactivate_page(vm_page_t m);
 boolean_t vm_reserv_reclaim_contig(u_long npages, vm_paddr_t low,
             vm_paddr_t high, u_long alignment, vm_paddr_t boundary);
 boolean_t vm_reserv_reclaim_inactive(void);
diff --git a/sys/vm/vnode_pager.c b/sys/vm/vnode_pager.c
index f9dfbf0..3349101 100644
--- a/sys/vm/vnode_pager.c
+++ b/sys/vm/vnode_pager.c
@@ -466,10 +466,6 @@ vnode_pager_setsize(struct vnode *vp, vm_ooffset_t nsize)
                          * replacement from working properly.
                          */
                         vm_page_clear_dirty(m, base, PAGE_SIZE - base);
-                } else if ((nsize & PAGE_MASK) &&
-                    vm_page_is_cached(object, OFF_TO_IDX(nsize))) {
-                        vm_page_cache_free(object, OFF_TO_IDX(nsize),
-                            nobjsize);
                 }
         }
         object->un_pager.vnp.vnp_size = nsize;
@@ -894,8 +890,7 @@ vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m, int count,

         for (tpindex = m[0]->pindex - 1; tpindex >= startpindex &&
             tpindex < m[0]->pindex; tpindex--, i++) {
-                p = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL |
-                    VM_ALLOC_IFNOTCACHED);
+                p = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL);
                 if (p == NULL) {
                         /* Shift the array. */
                         for (int j = 0; j < i; j++)
@@ -932,8 +927,7 @@ vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m, int count,

         for (tpindex = m[count - 1]->pindex + 1; tpindex < endpindex;
             i++, tpindex++) {
-                p = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL |
-                    VM_ALLOC_IFNOTCACHED);
+                p = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL);
                 if (p == NULL)
                         break;
                 bp->b_pages[i] = p;
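
The vm_reserv_alloc_contig() prototype change above moves the "mpred" lookup
out of the reservation code and onto the caller, which must now pass the page
that immediately precedes "pindex" in the object. A minimal caller-side sketch
follows; the helper name example_alloc_contig_mpred() is hypothetical and not
part of this commit, and the in-tree caller (presumably vm_page_alloc_contig()
in sys/vm/vm_page.c, which is also touched by this MFC) is assumed to perform
the equivalent lookup under the locks named in the function's header comment.

/*
 * Illustrative sketch only: resolve the page that immediately precedes
 * "pindex" within the object, then hand it to the new
 * vm_reserv_alloc_contig() as "mpred".  Per the header comment in the
 * diff above, the object and free page queue must be locked; the
 * reservation code now asserts that "mpred" belongs to "object" and
 * precedes "pindex".
 */
static vm_page_t
example_alloc_contig_mpred(vm_object_t object, vm_pindex_t pindex,
    u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
    vm_paddr_t boundary)
{
        vm_page_t mpred;

        VM_OBJECT_ASSERT_WLOCKED(object);
        mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
        /* Largest pindex <= the requested one, or NULL if none exists. */
        mpred = vm_radix_lookup_le(&object->rtree, pindex);
        return (vm_reserv_alloc_contig(object, pindex, npages, low, high,
            alignment, boundary, mpred));
}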