Diffstat (limited to 'sys/vm/vm_pageout.c')
-rw-r--r-- | sys/vm/vm_pageout.c | 350 |
1 files changed, 240 insertions, 110 deletions
diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c
index 841820b..5bdc464 100644
--- a/sys/vm/vm_pageout.c
+++ b/sys/vm/vm_pageout.c
@@ -90,6 +90,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/signalvar.h>
+#include <sys/smp.h>
 #include <sys/vnode.h>
 #include <sys/vmmeter.h>
 #include <sys/rwlock.h>
@@ -103,6 +104,7 @@ __FBSDID("$FreeBSD$");
 #include <vm/vm_map.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
+#include <vm/vm_phys.h>
 #include <vm/swap_pager.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
@@ -114,7 +116,8 @@ __FBSDID("$FreeBSD$");
 /* the kernel process "vm_pageout"*/
 static void vm_pageout(void);
 static int vm_pageout_clean(vm_page_t);
-static void vm_pageout_scan(int pass);
+static void vm_pageout_scan(struct vm_domain *vmd, int pass);
+static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int pass);
 
 struct proc *pageproc;
 
@@ -216,14 +219,15 @@ SYSCTL_INT(_vm, OID_AUTO, max_wired,
 	CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count");
 
 static boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *);
-static boolean_t vm_pageout_launder(int, int, vm_paddr_t, vm_paddr_t);
+static boolean_t vm_pageout_launder(struct vm_pagequeue *pq, int, vm_paddr_t,
+    vm_paddr_t);
 #if !defined(NO_SWAPPING)
 static void vm_pageout_map_deactivate_pages(vm_map_t, long);
 static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long);
 static void vm_req_vmdaemon(int req);
 #endif
 static boolean_t vm_pageout_page_lock(vm_page_t, vm_page_t *);
-static void vm_pageout_page_stats(void);
+static void vm_pageout_page_stats(struct vm_domain *vmd);
 
 /*
  * Initialize a dummy page for marking the caller's place in the specified
@@ -267,7 +271,7 @@ vm_pageout_fallback_object_lock(vm_page_t m, vm_page_t *next)
 
 	queue = m->queue;
 	vm_pageout_init_marker(&marker, queue);
-	pq = &vm_pagequeues[queue];
+	pq = vm_page_pagequeue(m);
 	object = m->object;
 
 	TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, pageq);
@@ -309,7 +313,7 @@ vm_pageout_page_lock(vm_page_t m, vm_page_t *next)
 
 	queue = m->queue;
 	vm_pageout_init_marker(&marker, queue);
-	pq = &vm_pagequeues[queue];
+	pq = vm_page_pagequeue(m);
 
 	TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, pageq);
 	vm_pagequeue_unlock(pq);
@@ -567,21 +571,17 @@ vm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen,
 }
 
 static boolean_t
-vm_pageout_launder(int queue, int tries, vm_paddr_t low, vm_paddr_t high)
+vm_pageout_launder(struct vm_pagequeue *pq, int tries, vm_paddr_t low,
+    vm_paddr_t high)
 {
 	struct mount *mp;
-	struct vm_pagequeue *pq;
 	struct vnode *vp;
 	vm_object_t object;
 	vm_paddr_t pa;
 	vm_page_t m, m_tmp, next;
 
-	pq = &vm_pagequeues[queue];
 	vm_pagequeue_lock(pq);
 	TAILQ_FOREACH_SAFE(m, &pq->pq_pl, pageq, next) {
-		KASSERT(m->queue == queue,
-		    ("vm_pageout_launder: page %p's queue is not %d", m,
-		    queue));
 		if ((m->flags & PG_MARKER) != 0)
 			continue;
 		pa = VM_PAGE_TO_PHYS(m);
@@ -661,7 +661,8 @@ vm_pageout_launder(int queue, int tries, vm_paddr_t low, vm_paddr_t high)
 void
 vm_pageout_grow_cache(int tries, vm_paddr_t low, vm_paddr_t high)
 {
-	int actl, actmax, inactl, inactmax;
+	int actl, actmax, inactl, inactmax, dom, initial_dom;
+	static int start_dom = 0;
 
 	if (tries > 0) {
 		/*
@@ -677,19 +678,55 @@ vm_pageout_grow_cache(int tries, vm_paddr_t low, vm_paddr_t high)
 		 */
 		uma_reclaim();
 	}
+
+	/*
+	 * Make the next scan start on the next domain.
+	 */
+	initial_dom = atomic_fetchadd_int(&start_dom, 1) % vm_ndomains;
+
 	inactl = 0;
 	inactmax = cnt.v_inactive_count;
 	actl = 0;
 	actmax = tries < 2 ? 0 : cnt.v_active_count;
+	dom = initial_dom;
+
+	/*
+	 * Scan domains in round-robin order, first the inactive queues,
+	 * then the active ones.  Since a domain usually owns a large
+	 * physically contiguous chunk of memory, it makes sense to
+	 * completely exhaust one domain before switching to the next,
+	 * while growing the pool of contiguous physical pages.
+	 *
+	 * Do not even start laundering a domain which cannot contain
+	 * the specified address range, as indicated by the segments
+	 * constituting the domain.
+	 */
 again:
-	if (inactl < inactmax && vm_pageout_launder(PQ_INACTIVE, tries, low,
-	    high)) {
-		inactl++;
-		goto again;
+	if (inactl < inactmax) {
+		if (vm_phys_domain_intersects(vm_dom[dom].vmd_segs,
+		    low, high) &&
+		    vm_pageout_launder(&vm_dom[dom].vmd_pagequeues[PQ_INACTIVE],
+		    tries, low, high)) {
+			inactl++;
+			goto again;
+		}
+		if (++dom == vm_ndomains)
+			dom = 0;
+		if (dom != initial_dom)
+			goto again;
 	}
-	if (actl < actmax && vm_pageout_launder(PQ_ACTIVE, tries, low, high)) {
-		actl++;
-		goto again;
+	if (actl < actmax) {
+		if (vm_phys_domain_intersects(vm_dom[dom].vmd_segs,
+		    low, high) &&
+		    vm_pageout_launder(&vm_dom[dom].vmd_pagequeues[PQ_ACTIVE],
+		    tries, low, high)) {
+			actl++;
+			goto again;
+		}
+		if (++dom == vm_ndomains)
+			dom = 0;
+		if (dom != initial_dom)
+			goto again;
 	}
 }
@@ -861,10 +898,9 @@ vm_pageout_map_deactivate_pages(map, desired)
  * vm_pageout_scan does the dirty work for the pageout daemon.
  */
 static void
-vm_pageout_scan(int pass)
+vm_pageout_scan(struct vm_domain *vmd, int pass)
 {
 	vm_page_t m, next;
-	struct vm_page marker;
 	struct vm_pagequeue *pq;
 	int page_shortage, maxscan, pcount;
 	int addl_page_shortage;
@@ -874,8 +910,6 @@ vm_pageout_scan(int pass)
 	int maxlaunder;
 	boolean_t queues_locked;
 
-	vm_pageout_init_marker(&marker, PQ_INACTIVE);
-
 	/*
 	 * Decrease registered cache sizes.
 	 */
@@ -888,7 +922,7 @@ vm_pageout_scan(int pass)
 	/*
 	 * The addl_page_shortage is the number of temporarily
 	 * stuck pages in the inactive queue.  In other words, the
-	 * number of pages from cnt.v_inactive_count that should be
+	 * number of pages from the inactive count that should be
	 * discounted in setting the target for the active queue scan.
 	 */
 	addl_page_shortage = atomic_readandclear_int(&vm_pageout_deficit);
@@ -914,8 +948,6 @@ vm_pageout_scan(int pass)
 	if (pass)
 		maxlaunder = 10000;
 
-	maxscan = cnt.v_inactive_count;
-
 	/*
 	 * Start scanning the inactive queue for pages we can move to the
 	 * cache or free.  The scan will stop when the target is reached or
@@ -923,7 +955,8 @@ vm_pageout_scan(int pass)
 	 * is not used to form decisions for the inactive queue, only for the
 	 * active queue.
 	 */
-	pq = &vm_pagequeues[PQ_INACTIVE];
+	pq = &vmd->vmd_pagequeues[PQ_INACTIVE];
+	maxscan = pq->pq_cnt;
 	vm_pagequeue_lock(pq);
 	queues_locked = TRUE;
 	for (m = TAILQ_FIRST(&pq->pq_pl);
@@ -984,7 +1017,7 @@ vm_pageout_scan(int pass)
 		 * 'next' pointer.  Use our marker to remember our
 		 * place.
 		 */
-		TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, pageq);
+		TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_marker, pageq);
 		vm_pagequeue_unlock(pq);
 		queues_locked = FALSE;
@@ -1034,7 +1067,7 @@ vm_pageout_scan(int pass)
 			/*
 			 * Held pages are essentially stuck in the
 			 * queue.  So, they ought to be discounted
-			 * from cnt.v_inactive_count.  See the
+			 * from the inactive count.  See the
 			 * calculation of the page_shortage for the
 			 * loop over the active queue below.
 			 */
@@ -1178,7 +1211,7 @@ vm_pageout_scan(int pass)
 		 */
 		if (m->queue != PQ_INACTIVE || m->object != object ||
-		    TAILQ_NEXT(m, pageq) != &marker) {
+		    TAILQ_NEXT(m, pageq) != &vmd->vmd_marker) {
 			vm_page_unlock(m);
 			if (object->flags & OBJ_MIGHTBEDIRTY)
 				vnodes_skipped++;
@@ -1248,8 +1281,8 @@ relock_queues:
 			vm_pagequeue_lock(pq);
 			queues_locked = TRUE;
 		}
-		next = TAILQ_NEXT(&marker, pageq);
-		TAILQ_REMOVE(&pq->pq_pl, &marker, pageq);
+		next = TAILQ_NEXT(&vmd->vmd_marker, pageq);
+		TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_marker, pageq);
 	}
 	vm_pagequeue_unlock(pq);
@@ -1258,7 +1291,7 @@ relock_queues:
 	 * active queue to the inactive queue.
 	 */
 	page_shortage = vm_paging_target() +
-		cnt.v_inactive_target - cnt.v_inactive_count;
+	    cnt.v_inactive_target - cnt.v_inactive_count;
 	page_shortage += addl_page_shortage;
@@ -1266,8 +1299,8 @@ relock_queues:
 	 * track the per-page activity counter and use it to locate
 	 * deactivation candidates.
 	 */
-	pcount = cnt.v_active_count;
-	pq = &vm_pagequeues[PQ_ACTIVE];
+	pq = &vmd->vmd_pagequeues[PQ_ACTIVE];
+	pcount = pq->pq_cnt;
 	vm_pagequeue_lock(pq);
 	m = TAILQ_FIRST(&pq->pq_pl);
 	while ((m != NULL) && (pcount-- > 0) && (page_shortage > 0)) {
@@ -1393,12 +1426,54 @@ relock_queues:
 	 * chance to flush out dirty vnode-backed pages and to allow
 	 * active pages to be moved to the inactive queue and reclaimed.
 	 */
-	if (pass != 0 &&
-	    ((swap_pager_avail < 64 && vm_page_count_min()) ||
-	    (swap_pager_full && vm_paging_target() > 0)))
-		vm_pageout_oom(VM_OOM_MEM);
+	vm_pageout_mightbe_oom(vmd, pass);
 }
 
+static int vm_pageout_oom_vote;
+
+/*
+ * The pagedaemon threads randomly select one of themselves to perform
+ * the OOM.  Trying to kill processes before all pagedaemons have
+ * failed to reach the free page target is premature.
+ */
+static void
+vm_pageout_mightbe_oom(struct vm_domain *vmd, int pass)
+{
+	int old_vote;
+
+	if (pass == 0 || !((swap_pager_avail < 64 && vm_page_count_min()) ||
+	    (swap_pager_full && vm_paging_target() > 0))) {
+		if (vmd->vmd_oom) {
+			vmd->vmd_oom = FALSE;
+			atomic_subtract_int(&vm_pageout_oom_vote, 1);
+		}
+		return;
+	}
+
+	if (vmd->vmd_oom)
+		return;
+
+	vmd->vmd_oom = TRUE;
+	old_vote = atomic_fetchadd_int(&vm_pageout_oom_vote, 1);
+	if (old_vote != vm_ndomains - 1)
+		return;
+
+	/*
+	 * The current pagedaemon thread is the last in the quorum to
+	 * start OOM.  Initiate the selection and signaling of the
+	 * victim.
+	 */
+	vm_pageout_oom(VM_OOM_MEM);
+
+	/*
+	 * After one round of OOM terror, recall our vote.  On the
+	 * next pass, the current pagedaemon would vote again if the
+	 * low memory condition is still there, due to vmd_oom being
+	 * false.
+	 */
+	vmd->vmd_oom = FALSE;
+	atomic_subtract_int(&vm_pageout_oom_vote, 1);
+}
 
 void
 vm_pageout_oom(int shortage)
@@ -1501,14 +1576,13 @@ vm_pageout_oom(int shortage)
  * helps the situation where paging just starts to occur.
  */
 static void
-vm_pageout_page_stats(void)
+vm_pageout_page_stats(struct vm_domain *vmd)
 {
 	struct vm_pagequeue *pq;
 	vm_object_t object;
 	vm_page_t m, next;
 	int pcount, tpcount;		/* Number of pages to check */
-	static int fullintervalcount = 0;
-	int page_shortage;
+	int actcount, page_shortage;
 
 	page_shortage =
 	    (cnt.v_inactive_target + cnt.v_cache_max + cnt.v_free_min) -
@@ -1517,25 +1591,30 @@ vm_pageout_page_stats(void)
 	if (page_shortage <= 0)
 		return;
 
-	pcount = cnt.v_active_count;
-	fullintervalcount += vm_pageout_stats_interval;
-	if (fullintervalcount < vm_pageout_full_stats_interval) {
-		vm_pageout_stats++;
-		tpcount = (int64_t)vm_pageout_stats_max * cnt.v_active_count /
-		    cnt.v_page_count;
+	pq = &vmd->vmd_pagequeues[PQ_ACTIVE];
+
+	/*
+	 * pcount limits the depth of the queue scan.  In particular,
+	 * for the full scan, it prevents the iteration from looking
+	 * into the requeued pages.  The limit is not exact since the
+	 * page queue lock is dropped during the iteration.
+	 */
+	pcount = pq->pq_cnt;
+	vmd->vmd_fullintervalcount += vm_pageout_stats_interval;
+	if (vmd->vmd_fullintervalcount < vm_pageout_full_stats_interval) {
+		atomic_add_int(&vm_pageout_stats, 1);
+		tpcount = (int64_t)vm_pageout_stats_max * pcount /
+		    vmd->vmd_page_count;
 		if (pcount > tpcount)
 			pcount = tpcount;
 	} else {
-		vm_pageout_full_stats++;
-		fullintervalcount = 0;
+		atomic_add_int(&vm_pageout_full_stats, 1);
+		vmd->vmd_fullintervalcount = 0;
 	}
 
-	pq = &vm_pagequeues[PQ_ACTIVE];
 	vm_pagequeue_lock(pq);
 	m = TAILQ_FIRST(&pq->pq_pl);
-	while ((m != NULL) && (pcount-- > 0)) {
-		int actcount;
-
+	while (m != NULL && pcount-- > 0) {
 		KASSERT(m->queue == PQ_ACTIVE,
 		    ("vm_pageout_page_stats: page %p isn't active", m));
@@ -1560,11 +1639,11 @@ vm_pageout_page_stats(void)
 		}
 
 		/*
-		 * Don't deactivate pages that are busy.
+		 * Don't deactivate pages that are busy or held.
 		 */
-		if ((m->busy != 0) ||
-		    (m->oflags & VPO_BUSY) ||
-		    (m->hold_count != 0)) {
+		if (m->busy != 0 ||
+		    (m->oflags & VPO_BUSY) != 0 ||
+		    m->hold_count != 0) {
 			vm_page_unlock(m);
 			VM_OBJECT_WUNLOCK(object);
 			vm_page_requeue_locked(m);
@@ -1579,7 +1658,7 @@ vm_pageout_page_stats(void)
 		}
 
 		actcount += pmap_ts_referenced(m);
-		if (actcount) {
+		if (actcount != 0) {
 			m->act_count += ACT_ADVANCE + actcount;
 			if (m->act_count > ACT_MAX)
 				m->act_count = ACT_MAX;
@@ -1611,13 +1690,105 @@ vm_pageout_page_stats(void)
 	vm_pagequeue_unlock(pq);
 }
 
+static void
+vm_pageout_worker(void *arg)
+{
+	struct vm_domain *domain;
+	struct pcpu *pc;
+	int cpu, error, domidx;
+
+	domidx = (uintptr_t)arg;
+	domain = &vm_dom[domidx];
+
+	/*
+	 * XXXKIB The bind is rather arbitrary.  With some minor
+	 * complications, we could assign the cpuset consisting of all
+	 * CPUs in the same domain.  In fact, it even does not matter
+	 * if the CPU we bind to is in the affinity domain of this
+	 * page queue, we only need to establish the fair distribution
+	 * of pagedaemon threads among CPUs.
+	 *
+	 * XXXKIB It would be useful to allocate vm_pages for the
+	 * domain from the domain, and put pcpu area into the page
+	 * owned by the domain.
+	 */
+	if (mem_affinity != NULL) {
+		CPU_FOREACH(cpu) {
+			pc = pcpu_find(cpu);
+			if (pc->pc_domain == domidx) {
+				thread_lock(curthread);
+				sched_bind(curthread, cpu);
+				thread_unlock(curthread);
+				break;
+			}
+		}
+	}
+
+	KASSERT(domain->vmd_segs != 0, ("domain without segments"));
+	vm_pageout_init_marker(&domain->vmd_marker, PQ_INACTIVE);
+
+	/*
+	 * The pageout daemon worker is never done, so loop forever.
+	 */
+	while (TRUE) {
+		/*
+		 * If we have enough free memory, wakeup waiters.  Do
+		 * not clear vm_pages_needed until we reach our target,
+		 * otherwise we may be woken up over and over again and
+		 * waste a lot of cpu.
+		 */
+		mtx_lock(&vm_page_queue_free_mtx);
+		if (vm_pages_needed && !vm_page_count_min()) {
+			if (!vm_paging_needed())
+				vm_pages_needed = 0;
+			wakeup(&cnt.v_free_count);
+		}
+		if (vm_pages_needed) {
+			/*
+			 * Still not done, take a second pass without waiting
+			 * (unlimited dirty cleaning), otherwise sleep a bit
+			 * and try again.
+			 */
+			++(domain->vmd_pass);
+			if (domain->vmd_pass > 1)
+				msleep(&vm_pages_needed,
+				    &vm_page_queue_free_mtx, PVM, "psleep",
+				    hz / 2);
+		} else {
+			/*
+			 * Good enough, sleep & handle stats.  Prime the pass
+			 * for the next run.
+			 */
+			if (domain->vmd_pass > 1)
+				domain->vmd_pass = 1;
+			else
+				domain->vmd_pass = 0;
+			error = msleep(&vm_pages_needed,
+			    &vm_page_queue_free_mtx, PVM, "psleep",
+			    vm_pageout_stats_interval * hz);
+			if (error && !vm_pages_needed) {
+				mtx_unlock(&vm_page_queue_free_mtx);
+				domain->vmd_pass = 0;
+				vm_pageout_page_stats(domain);
+				continue;
+			}
+		}
+		if (vm_pages_needed)
+			cnt.v_pdwakeups++;
+		mtx_unlock(&vm_page_queue_free_mtx);
+		vm_pageout_scan(domain, domain->vmd_pass);
+	}
+}
+
 /*
  * vm_pageout is the high level pageout daemon.
  */
 static void
 vm_pageout(void)
 {
-	int error, pass;
+#if MAXMEMDOM > 1
+	int error, i;
+#endif
 
 	/*
 	 * Initialize some paging parameters.
@@ -1687,58 +1858,17 @@ vm_pageout(void)
 	vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4;
 	swap_pager_swap_init();
-	pass = 0;
-	/*
-	 * The pageout daemon is never done, so loop forever.
-	 */
-	while (TRUE) {
-		/*
-		 * If we have enough free memory, wakeup waiters.  Do
-		 * not clear vm_pages_needed until we reach our target,
-		 * otherwise we may be woken up over and over again and
-		 * waste a lot of cpu.
-		 */
-		mtx_lock(&vm_page_queue_free_mtx);
-		if (vm_pages_needed && !vm_page_count_min()) {
-			if (!vm_paging_needed())
-				vm_pages_needed = 0;
-			wakeup(&cnt.v_free_count);
-		}
-		if (vm_pages_needed) {
-			/*
-			 * Still not done, take a second pass without waiting
-			 * (unlimited dirty cleaning), otherwise sleep a bit
-			 * and try again.
-			 */
-			++pass;
-			if (pass > 1)
-				msleep(&vm_pages_needed,
-				    &vm_page_queue_free_mtx, PVM, "psleep",
-				    hz / 2);
-		} else {
-			/*
-			 * Good enough, sleep & handle stats.  Prime the pass
-			 * for the next run.
-			 */
-			if (pass > 1)
-				pass = 1;
-			else
-				pass = 0;
-			error = msleep(&vm_pages_needed,
-			    &vm_page_queue_free_mtx, PVM, "psleep",
-			    vm_pageout_stats_interval * hz);
-			if (error && !vm_pages_needed) {
-				mtx_unlock(&vm_page_queue_free_mtx);
-				pass = 0;
-				vm_pageout_page_stats();
-				continue;
-			}
+#if MAXMEMDOM > 1
+	for (i = 1; i < vm_ndomains; i++) {
+		error = kthread_add(vm_pageout_worker, (void *)(uintptr_t)i,
+		    curproc, NULL, 0, 0, "dom%d", i);
+		if (error != 0) {
+			panic("starting pageout for domain %d, error %d\n",
+			    i, error);
 		}
-		if (vm_pages_needed)
-			cnt.v_pdwakeups++;
-		mtx_unlock(&vm_page_queue_free_mtx);
-		vm_pageout_scan(pass);
 	}
+#endif
+	vm_pageout_worker((uintptr_t)0);
 }
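
The round-robin domain walk added to vm_pageout_grow_cache() above can be summarized outside the kernel. The following is an illustrative userland sketch, not the committed code: NDOM, laundry[], launder_one() and grow_cache() are hypothetical stand-ins for vm_ndomains, the per-domain page queues, vm_pageout_launder() and the real function. It shows the two properties the added comment describes: the starting domain rotates between calls, and one domain is exhausted before the scan advances to the next.

/*
 * Userland sketch of the round-robin domain scan (compile with a C11
 * compiler).  All names below are made up for illustration.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define	NDOM	4

static atomic_int start_dom;			/* rotates the starting domain */
static int laundry[NDOM] = { 3, 0, 5, 1 };	/* pretend per-domain laundry */

static bool
launder_one(int dom)
{

	if (laundry[dom] > 0) {
		laundry[dom]--;
		return (true);
	}
	return (false);
}

static int
grow_cache(int target)
{
	int dom, initial_dom, done;

	/* Make the next scan start on the next domain. */
	initial_dom = atomic_fetch_add(&start_dom, 1) % NDOM;
	dom = initial_dom;
	done = 0;
	while (done < target) {
		if (launder_one(dom)) {
			done++;		/* stay on this domain until empty */
			continue;
		}
		if (++dom == NDOM)
			dom = 0;
		if (dom == initial_dom)
			break;		/* every domain visited once */
	}
	return (done);
}

int
main(void)
{

	printf("laundered %d pages\n", grow_cache(6));
	return (0);
}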
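
vm_pageout_mightbe_oom() in the diff implements a simple quorum: each per-domain pagedaemon registers a vote when it keeps failing to reach the free target, and only the thread whose vote completes the quorum calls vm_pageout_oom(). Below is a minimal userland sketch of that voting pattern, assuming C11 atomics; NDOM, struct dom_state, mightbe_oom() and trigger_oom() are hypothetical names, not kernel interfaces.

/*
 * Userland sketch of the OOM vote protocol.  The last voter fires the
 * handler and then withdraws its own vote, mirroring vmd_oom and
 * vm_pageout_oom_vote above.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define	NDOM	4

static atomic_int oom_votes;

struct dom_state {
	bool oom;		/* this domain has an outstanding vote */
};

static void
trigger_oom(void)
{

	printf("all %d domains under pressure: selecting a victim\n", NDOM);
}

static void
mightbe_oom(struct dom_state *d, bool shortage)
{
	int old_vote;

	if (!shortage) {
		/* Pressure relieved: withdraw any outstanding vote. */
		if (d->oom) {
			d->oom = false;
			atomic_fetch_sub(&oom_votes, 1);
		}
		return;
	}
	if (d->oom)
		return;			/* already voted */
	d->oom = true;
	old_vote = atomic_fetch_add(&oom_votes, 1);
	if (old_vote != NDOM - 1)
		return;			/* quorum not complete yet */

	/* Last voter runs the OOM pass, then recalls its own vote. */
	trigger_oom();
	d->oom = false;
	atomic_fetch_sub(&oom_votes, 1);
}

int
main(void)
{
	struct dom_state dom[NDOM] = {{ false }};

	for (int i = 0; i < NDOM; i++)
		mightbe_oom(&dom[i], true);	/* 4th call completes quorum */
	return (0);
}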
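
The tail of vm_pageout() shows the startup pattern for the per-domain workers: one kernel thread is added for every domain past the first, and the original pagedaemon thread then runs vm_pageout_worker() for domain 0 itself. A hedged userland analogue using POSIX threads is sketched below; NDOM, worker() and do_scan() are made-up stand-ins, and the loop runs three passes instead of forever so the program terminates.

/*
 * Userland sketch of the per-domain worker startup (link with -lpthread).
 */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define	NDOM	4

static void
do_scan(int dom, int pass)
{

	printf("domain %d: scan pass %d\n", dom, pass);
}

static void *
worker(void *arg)
{
	int dom = (int)(uintptr_t)arg;

	/* The real worker loops forever; three iterations suffice here. */
	for (int pass = 0; pass < 3; pass++) {
		do_scan(dom, pass);
		usleep(1000);
	}
	return (NULL);
}

int
main(void)
{
	pthread_t tid[NDOM];

	/* Domains 1..N-1 get their own worker thread. */
	for (int i = 1; i < NDOM; i++) {
		if (pthread_create(&tid[i], NULL, worker,
		    (void *)(uintptr_t)i) != 0) {
			fprintf(stderr, "cannot start worker for domain %d\n",
			    i);
			exit(1);
		}
	}
	/* The original thread services domain 0 itself. */
	worker((void *)(uintptr_t)0);

	for (int i = 1; i < NDOM; i++)
		pthread_join(tid[i], NULL);
	return (0);
}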