Diffstat (limited to 'sys/vm/vm_pageout.c')
-rw-r--r-- | sys/vm/vm_pageout.c | 228
1 file changed, 117 insertions, 111 deletions
diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c
index dbea3d6..943fb11 100644
--- a/sys/vm/vm_pageout.c
+++ b/sys/vm/vm_pageout.c
@@ -106,7 +106,7 @@
 /* the kernel process "vm_pageout"*/
 static void vm_pageout __P((void));
 static int vm_pageout_clean __P((vm_page_t));
-static int vm_pageout_scan __P((void));
+static void vm_pageout_scan __P((int pass));
 static int vm_pageout_free_page_calc __P((vm_size_t count));
 
 struct proc *pageproc;
@@ -140,14 +140,13 @@
 static int vm_pageout_req_swapout;	/* XXX */
 static int vm_daemon_needed;
 #endif
 extern int vm_swap_size;
+static int vm_max_launder = 32;
 static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0;
 static int vm_pageout_full_stats_interval = 0;
-static int vm_pageout_stats_free_max=0, vm_pageout_algorithm_lru=0;
+static int vm_pageout_stats_free_max=0, vm_pageout_algorithm=0;
 static int defer_swap_pageouts=0;
 static int disable_swap_pageouts=0;
-static int max_page_launder=100;
-static int vm_pageout_actcmp=0;
 #if defined(NO_SWAPPING)
 static int vm_swap_enabled=0;
 static int vm_swap_idle_enabled=0;
@@ -157,7 +156,10 @@ static int vm_swap_idle_enabled=0;
 #endif
 
 SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, pageout_algorithm,
-	CTLFLAG_RW, &vm_pageout_algorithm_lru, 0, "LRU page mgmt");
+	CTLFLAG_RW, &vm_pageout_algorithm, 0, "LRU page mgmt");
+
+SYSCTL_INT(_vm, OID_AUTO, max_launder,
+	CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");
 
 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max,
 	CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length");
@@ -189,12 +191,6 @@ SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
 SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
 	CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");
 
-SYSCTL_INT(_vm, OID_AUTO, max_page_launder,
-	CTLFLAG_RW, &max_page_launder, 0, "Maximum number of pages to clean per pass");
-SYSCTL_INT(_vm, OID_AUTO, vm_pageout_actcmp,
-	CTLFLAG_RD, &vm_pageout_actcmp, 0, "pagedaemon agressiveness");
-
-
 #define VM_PAGEOUT_PAGE_COUNT 16
 int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT;
 
@@ -509,7 +505,7 @@ vm_pageout_object_deactivate_pages(map, object, desired, map_remove_only)
 		} else if (p->queue == PQ_ACTIVE) {
 			if ((p->flags & PG_REFERENCED) == 0) {
 				p->act_count -= min(p->act_count, ACT_DECLINE);
-				if (!remove_mode && (vm_pageout_algorithm_lru || (p->act_count == 0))) {
+				if (!remove_mode && (vm_pageout_algorithm || (p->act_count == 0))) {
 					vm_page_protect(p, VM_PROT_NONE);
 					vm_page_deactivate(p);
 				} else {
@@ -627,20 +623,21 @@ vm_pageout_page_free(vm_page_t m) {
 /*
  *	vm_pageout_scan does the dirty work for the pageout daemon.
  */
-static int
-vm_pageout_scan()
+static void
+vm_pageout_scan(int pass)
 {
 	vm_page_t m, next;
 	struct vm_page marker;
+	int save_page_shortage;
+	int save_inactive_count;
 	int page_shortage, maxscan, pcount;
 	int addl_page_shortage, addl_page_shortage_init;
-	int maxlaunder;
 	struct proc *p, *bigproc;
 	vm_offset_t size, bigsize;
 	vm_object_t object;
-	int force_wakeup = 0;
 	int actcount;
 	int vnodes_skipped = 0;
+	int maxlaunder;
 	int s;
 
 	/*
@@ -651,27 +648,13 @@ vm_pageout_scan()
 	addl_page_shortage_init = vm_pageout_deficit;
 	vm_pageout_deficit = 0;
 
-	if (max_page_launder == 0)
-		max_page_launder = 1;
-
 	/*
 	 * Calculate the number of pages we want to either free or move
-	 * to the cache.  Be more agressive if we aren't making our target.
+	 * to the cache.
 	 */
-
-	page_shortage = vm_paging_target() +
-		addl_page_shortage_init + vm_pageout_actcmp;
-
-	/*
-	 * Figure out how agressively we should flush dirty pages.
-	 */
-	{
-		int factor = vm_pageout_actcmp;
-
-		maxlaunder = cnt.v_inactive_target / 3 + factor;
-		if (maxlaunder > max_page_launder + factor)
-			maxlaunder = max_page_launder + factor;
-	}
+	page_shortage = vm_paging_target() + addl_page_shortage_init;
+	save_page_shortage = page_shortage;
+	save_inactive_count = cnt.v_inactive_count;
 
 	/*
 	 * Initialize our marker
@@ -687,8 +670,22 @@
 	 * we have scanned the entire inactive queue.  Note that m->act_count
 	 * is not used to form decisions for the inactive queue, only for the
 	 * active queue.
+	 *
+	 * maxlaunder limits the number of dirty pages we flush per scan.
+	 * For most systems a smaller value (16 or 32) is more robust under
+	 * extreme memory and disk pressure because any unnecessary writes
+	 * to disk can result in extreme performance degredation.  However,
+	 * systems with excessive dirty pages (especially when MAP_NOSYNC is
+	 * used) will die horribly with limited laundering.  If the pageout
+	 * daemon cannot clean enough pages in the first pass, we let it go
+	 * all out in succeeding passes.
 	 */
+	if ((maxlaunder = vm_max_launder) <= 1)
+		maxlaunder = 1;
+	if (pass)
+		maxlaunder = 10000;
+
 rescan0:
 	addl_page_shortage = addl_page_shortage_init;
 	maxscan = cnt.v_inactive_count;
 
@@ -792,12 +789,32 @@
 		} else if (m->dirty == 0) {
 			vm_page_cache(m);
 			--page_shortage;
-
-		/*
-		 * Dirty pages need to be paged out.  Note that we clean
-		 * only a limited number of pages per pagedaemon pass.
-		 */
+		} else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
+			/*
+			 * Dirty pages need to be paged out, but flushing
+			 * a page is extremely expensive verses freeing
+			 * a clean page.  Rather then artificially limiting
+			 * the number of pages we can flush, we instead give
+			 * dirty pages extra priority on the inactive queue
+			 * by forcing them to be cycled through the queue
+			 * twice before being flushed, after which the
+			 * (now clean) page will cycle through once more
+			 * before being freed.  This significantly extends
+			 * the thrash point for a heavily loaded machine.
+			 */
+			s = splvm();
+			vm_page_flag_set(m, PG_WINATCFLS);
+			TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
+			TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
+			splx(s);
 		} else if (maxlaunder > 0) {
+			/*
+			 * We always want to try to flush some dirty pages if
+			 * we encounter them, to keep the system stable.
+			 * Normally this number is small, but under extreme
+			 * pressure where there are insufficient clean pages
+			 * on the inactive queue, we may have to go all out.
+			 */
 			int swap_pageouts_ok;
 			struct vnode *vp = NULL;
 			struct mount *mp;
@@ -826,29 +843,24 @@
 			}
 
 			/*
-			 * Presumably we have sufficient free memory to do
-			 * the more sophisticated checks and locking required
-			 * for vnodes.
-			 *
-			 * The object is already known NOT to be dead.  The
-			 * vget() may still block, though, because
-			 * VOP_ISLOCKED() doesn't check to see if an inode
-			 * (v_data) is associated with the vnode.  If it isn't,
-			 * vget() will load in it from disk.  Worse, vget()
-			 * may actually get stuck waiting on "inode" if another
-			 * process is in the process of bringing the inode in.
-			 * This is bad news for us either way.
+			 * The object is already known NOT to be dead.   It
+			 * is possible for the vget() to block the whole
+			 * pageout daemon, but the new low-memory handling
+			 * code should prevent it.
 			 *
-			 * So for the moment we check v_data == NULL as a
-			 * workaround.  This means that vnodes which do not
-			 * use v_data in the way we expect probably will not
-			 * wind up being paged out by the pager and it will be
-			 * up to the syncer to get them.  That's better then
-			 * us blocking here.
+			 * The previous code skipped locked vnodes and, worse,
+			 * reordered pages in the queue.  This results in
+			 * completely non-deterministic operation and, on a
+			 * busy system, can lead to extremely non-optimal
+			 * pageouts.  For example, it can cause clean pages
+			 * to be freed and dirty pages to be moved to the end
+			 * of the queue.  Since dirty pages are also moved to
+			 * the end of the queue once-cleaned, this gives
+			 * way too large a weighting to defering the freeing
+			 * of dirty pages.
 			 *
-			 * This whole code section is bogus - we need to fix
-			 * the vnode pager to handle vm_page_t's without us
-			 * having to do any sophisticated VOP tests.
+			 * XXX we need to be able to apply a timeout to the
+			 * vget() lock attempt.
 			 */
 
 			if (object->type == OBJT_VNODE) {
@@ -857,19 +869,8 @@
 				mp = NULL;
 				if (vp->v_type == VREG)
 					vn_start_write(vp, &mp, V_NOWAIT);
-				if (VOP_ISLOCKED(vp, NULL) ||
-				    vp->v_data == NULL ||
-				    vget(vp, LK_EXCLUSIVE|LK_NOOBJ, curproc)) {
+				if (vget(vp, LK_EXCLUSIVE|LK_NOOBJ, curproc)) {
					vn_finished_write(mp);
-					if ((m->queue == PQ_INACTIVE) &&
-					    (m->hold_count == 0) &&
-					    (m->busy == 0) &&
-					    (m->flags & PG_BUSY) == 0) {
-						s = splvm();
-						TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
-						TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
-						splx(s);
-					}
 					if (object->flags & OBJ_MIGHTBEDIRTY)
 						vnodes_skipped++;
 					continue;
@@ -924,18 +925,23 @@
 			 * If a page is dirty, then it is either being washed
 			 * (but not yet cleaned) or it is still in the
 			 * laundry.  If it is still in the laundry, then we
-			 * start the cleaning operation.  maxlaunder nominally
-			 * counts I/O cost (seeks) rather then bytes.
+			 * start the cleaning operation.
 			 *
 			 * This operation may cluster, invalidating the 'next'
 			 * pointer.  To prevent an inordinate number of
 			 * restarts we use our marker to remember our place.
+			 *
+			 * decrement page_shortage on success to account for
+			 * the (future) cleaned page.  Otherwise we could wind
+			 * up laundering or cleaning too many pages.
 			 */
 			s = splvm();
 			TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE].pl, m, &marker, pageq);
 			splx(s);
-			if (vm_pageout_clean(m) != 0)
+			if (vm_pageout_clean(m) != 0) {
+				--page_shortage;
 				--maxlaunder;
+			}
 			s = splvm();
 			next = TAILQ_NEXT(&marker, pageq);
 			TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, &marker, pageq);
@@ -948,28 +954,12 @@
 	}
 
 	/*
-	 * If we were not able to meet our target, increase actcmp
-	 */
-
-	if (vm_page_count_min()) {
-		if (vm_pageout_actcmp < ACT_MAX / 2)
-			vm_pageout_actcmp += ACT_ADVANCE;
-	} else {
-		if (vm_pageout_actcmp < ACT_DECLINE)
-			vm_pageout_actcmp = 0;
-		else
-			vm_pageout_actcmp -= ACT_DECLINE;
-	}
-
-	/*
 	 * Compute the number of pages we want to try to move from the
 	 * active queue to the inactive queue.
 	 */
-
 	page_shortage = vm_paging_target() +
 		cnt.v_inactive_target - cnt.v_inactive_count;
 	page_shortage += addl_page_shortage;
-	page_shortage += vm_pageout_actcmp;
 
 	/*
 	 * Scan the active queue for things we can deactivate.  We nominally
@@ -1043,9 +1033,9 @@
 				splx(s);
 			} else {
 				m->act_count -= min(m->act_count, ACT_DECLINE);
-				if (vm_pageout_algorithm_lru ||
-				    (m->object->ref_count == 0) ||
-				    (m->act_count <= vm_pageout_actcmp)) {
+				if (vm_pageout_algorithm ||
+				    m->object->ref_count == 0 ||
+				    m->act_count == 0) {
 					page_shortage--;
 					if (m->object->ref_count == 0) {
 						vm_page_protect(m, VM_PROT_NONE);
@@ -1175,7 +1165,6 @@
 			wakeup(&cnt.v_free_count);
 		}
 	}
-	return force_wakeup;
 }
 
 /*
@@ -1254,11 +1243,13 @@ vm_pageout_page_stats()
 		} else {
 			if (m->act_count == 0) {
 				/*
-				 * We turn off page access, so that we have more accurate
-				 * RSS stats.  We don't do this in the normal page deactivation
-				 * when the system is loaded VM wise, because the cost of
-				 * the large number of page protect operations would be higher
-				 * than the value of doing the operation.
+				 * We turn off page access, so that we have
+				 * more accurate RSS stats.  We don't do this
+				 * in the normal page deactivation when the
+				 * system is loaded VM wise, because the
+				 * cost of the large number of page protect
+				 * operations would be higher than the value
+				 * of doing the operation.
 				 */
 				vm_page_protect(m, VM_PROT_NONE);
 				vm_page_deactivate(m);
@@ -1307,6 +1298,7 @@ vm_size_t count;
 static void
 vm_pageout()
 {
+	int pass;
 
 	mtx_enter(&Giant, MTX_DEF);
 
@@ -1320,11 +1312,18 @@ vm_pageout()
 	vm_pageout_free_page_calc(cnt.v_page_count);
 
 	/*
-	 * free_reserved needs to include enough for the largest swap pager
-	 * structures plus enough for any pv_entry structs when paging.
+	 * v_free_target and v_cache_min control pageout hysteresis.  Note
+	 * that these are more a measure of the VM cache queue hysteresis
+	 * then the VM free queue.  Specifically, v_free_target is the
+	 * high water mark (free+cache pages).
+	 *
+	 * v_free_reserved + v_cache_min (mostly means v_cache_min) is the
+	 * low water mark, while v_free_min is the stop.  v_cache_min must
+	 * be big enough to handle memory needs while the pageout daemon
+	 * is signalled and run to free more pages.
 	 */
 	if (cnt.v_free_count > 6144)
-		cnt.v_free_target = 3 * cnt.v_free_min + cnt.v_free_reserved;
+		cnt.v_free_target = 4 * cnt.v_free_min + cnt.v_free_reserved;
 	else
 		cnt.v_free_target = 2 * cnt.v_free_min + cnt.v_free_reserved;
 
@@ -1362,10 +1361,9 @@ vm_pageout()
 	if (vm_pageout_stats_free_max == 0)
 		vm_pageout_stats_free_max = 5;
 
-	max_page_launder = (cnt.v_page_count > 1800 ? 32 : 16);
-
 	curproc->p_flag |= P_BUFEXHAUST;
 	swap_pager_swap_init();
+	pass = 0;
 	/*
 	 * The pageout daemon is never done, so loop forever.
 	 */
@@ -1386,19 +1384,27 @@ vm_pageout()
 		}
 		if (vm_pages_needed) {
 			/*
-			 * Still not done, sleep a bit and go again
+			 * Still not done, take a second pass without waiting
+			 * (unlimited dirty cleaning), otherwise sleep a bit
+			 * and try again.
 			 */
-			tsleep(&vm_pages_needed, PVM, "psleep", hz/2);
+			++pass;
+			if (pass > 1)
+				tsleep(&vm_pages_needed, PVM, "psleep", hz/2);
 		} else {
 			/*
-			 * Good enough, sleep & handle stats
+			 * Good enough, sleep & handle stats.  Prime the pass
+			 * for the next run.
 			 */
+			if (pass > 1)
+				pass = 1;
+			else
+				pass = 0;
 			error = tsleep(&vm_pages_needed,
 				PVM, "psleep", vm_pageout_stats_interval * hz);
 			if (error && !vm_pages_needed) {
-				if (vm_pageout_actcmp > 0)
-					--vm_pageout_actcmp;
 				splx(s);
+				pass = 0;
 				vm_pageout_page_stats();
 				continue;
 			}
@@ -1407,7 +1413,7 @@
 		if (vm_pages_needed)
 			cnt.v_pdwakeups++;
 		splx(s);
-		vm_pageout_scan();
+		vm_pageout_scan(pass);
 		vm_pageout_deficit = 0;
 	}
 }
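A note on the new two-pass laundering policy: the hunk at @@ -687,8 +670,22 @@ clamps dirty-page flushes to vm.max_launder (default 32) on the first pass, then effectively removes the limit (10000) on later passes. Below is a minimal user-space model of that selection logic; compute_maxlaunder() and the driver loop are illustrative names, not kernel code.

#include <stdio.h>

static int vm_max_launder = 32;		/* mirrors the vm.max_launder sysctl */

/*
 * Model of the maxlaunder selection at the top of vm_pageout_scan():
 * the first pass flushes at most vm_max_launder dirty pages; any
 * later pass is effectively unlimited so the daemon can dig out.
 */
static int
compute_maxlaunder(int pass)
{
	int maxlaunder;

	if ((maxlaunder = vm_max_launder) <= 1)
		maxlaunder = 1;
	if (pass)
		maxlaunder = 10000;
	return (maxlaunder);
}

int
main(void)
{
	for (int pass = 0; pass < 3; pass++)
		printf("pass %d: maxlaunder %d\n", pass,
		    compute_maxlaunder(pass));
	return (0);
}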
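The marker dance around vm_pageout_clean() also deserves a closer look: a clustered clean may unlink the scan's 'next' page, so the code parks a dummy entry after the current page and resumes from it once the clean completes. Here is a self-contained sketch of the same technique using <sys/queue.h>; struct page, clean_page(), and the is_marker field are illustrative stand-ins for the kernel's vm_page and vm_pageout_clean(), not the real types.

#include <sys/queue.h>
#include <stdio.h>
#include <stdlib.h>

struct page {
	TAILQ_ENTRY(page) pageq;
	int	id;
	int	is_marker;	/* dummy entry used to hold our place */
};
TAILQ_HEAD(pagelist, page);

/* Stand-in for an operation that may unlink arbitrary queue entries. */
static void
clean_page(struct pagelist *q, struct page *m)
{
	TAILQ_REMOVE(q, m, pageq);
	free(m);
}

int
main(void)
{
	struct pagelist q = TAILQ_HEAD_INITIALIZER(q);
	struct page *m, *next, marker = { .is_marker = 1 };

	for (int i = 0; i < 4; i++) {
		m = calloc(1, sizeof(*m));
		m->id = i;
		TAILQ_INSERT_TAIL(&q, m, pageq);
	}
	for (m = TAILQ_FIRST(&q); m != NULL; m = next) {
		if (m->is_marker) {	/* skip any marker we encounter */
			next = TAILQ_NEXT(m, pageq);
			continue;
		}
		/* Park the marker after m so we survive queue surgery. */
		TAILQ_INSERT_AFTER(&q, m, &marker, pageq);
		clean_page(&q, m);
		next = TAILQ_NEXT(&marker, pageq);
		TAILQ_REMOVE(&q, &marker, pageq);
	}
	puts("queue drained");
	return (0);
}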
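The retuned hysteresis in vm_pageout() is plain arithmetic: on machines with more than 6144 free pages, the high water mark moves from 3 * v_free_min + v_free_reserved to 4 * v_free_min + v_free_reserved, giving the cache queue more headroom before the daemon stops. A worked example with made-up inputs follows; the real values are derived in vm_pageout_free_page_calc(), which this diff does not touch.

#include <stdio.h>

int
main(void)
{
	/* Illustrative values only; see vm_pageout_free_page_calc(). */
	unsigned v_free_min = 1024, v_free_reserved = 256;

	unsigned old_target = 3 * v_free_min + v_free_reserved;	/* before */
	unsigned new_target = 4 * v_free_min + v_free_reserved;	/* after  */

	printf("old v_free_target: %u pages\n", old_target);	/* 3328 */
	printf("new v_free_target: %u pages\n", new_target);	/* 4352 */
	return (0);
}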
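Finally, the pass bookkeeping in the daemon loop is a small state machine: a wakeup that still misses the target bumps pass, so the next scan launders without limit, and once the target is met pass is primed to 1 after an all-out run or reset to 0 otherwise. The sketch below is a simplified user-space model; next_pass() and the shortfall trace are invented for illustration and omit the tsleep() and page-stats paths.

#include <stdio.h>

/*
 * Model of the pass transitions in the vm_pageout() loop: while the
 * target is unmet each wakeup bumps pass (second and later passes
 * launder without limit); once met, pass is primed to 1 if the last
 * scan ran flat out, else reset to 0.
 */
static int
next_pass(int pass, int still_short)
{
	if (still_short)
		return (pass + 1);
	return (pass > 1 ? 1 : 0);
}

int
main(void)
{
	int shortfall[] = { 1, 1, 1, 0, 0 };	/* hypothetical wakeups */
	int pass = 0;

	for (int i = 0; i < 5; i++) {
		printf("wakeup %d: scan with pass %d\n", i, pass);
		pass = next_pass(pass, shortfall[i]);
	}
	return (0);
}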