-rw-r--r--  sys/amd64/amd64/minidump_machdep.c  |   1
-rw-r--r--  sys/ofed/include/linux/page.h       |   1
-rw-r--r--  sys/sparc64/sparc64/genassym.c      |   1
-rw-r--r--  sys/vm/vm_page.c                    | 111
-rw-r--r--  sys/vm/vm_page.h                    |  37
-rw-r--r--  sys/vm/vm_pageout.c                 | 350
-rw-r--r--  sys/vm/vm_phys.c                    |  45
-rw-r--r--  sys/vm/vm_phys.h                    |  47
-rw-r--r--  sys/vm/vm_zeroidle.c                |   1
-rw-r--r--  sys/x86/acpica/srat.c               |   4
10 files changed, 421 insertions, 177 deletions
diff --git a/sys/amd64/amd64/minidump_machdep.c b/sys/amd64/amd64/minidump_machdep.c
index 79d8bde..152d12d 100644
--- a/sys/amd64/amd64/minidump_machdep.c
+++ b/sys/amd64/amd64/minidump_machdep.c
@@ -39,6 +39,7 @@ __FBSDID("$FreeBSD$");
#include <sys/msgbuf.h>
#include <sys/watchdog.h>
#include <vm/vm.h>
+#include <vm/vm_param.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>
#include <vm/pmap.h>
diff --git a/sys/ofed/include/linux/page.h b/sys/ofed/include/linux/page.h
index 9e15201..748014c 100644
--- a/sys/ofed/include/linux/page.h
+++ b/sys/ofed/include/linux/page.h
@@ -32,6 +32,7 @@
#include <sys/param.h>
+#include <machine/atomic.h>
#include <vm/vm.h>
#include <vm/vm_page.h>
diff --git a/sys/sparc64/sparc64/genassym.c b/sys/sparc64/sparc64/genassym.c
index df31805..0b4a10c 100644
--- a/sys/sparc64/sparc64/genassym.c
+++ b/sys/sparc64/sparc64/genassym.c
@@ -39,6 +39,7 @@ __FBSDID("$FreeBSD$");
#include <sys/vmmeter.h>
#include <sys/_cpuset.h>
+#include <machine/atomic.h>
#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index 4a167c1..36689f6 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -64,8 +64,7 @@
* GENERAL RULES ON VM_PAGE MANIPULATION
*
* - A page queue lock is required when adding or removing a page from a
- * page queue (vm_pagequeues[]), regardless of other locks or the
- * busy state of a page.
+ * page queue regardless of other locks or the busy state of a page.
*
* * In general, no thread besides the page daemon can acquire or
* hold more than one page queue lock at a time.
@@ -124,20 +123,7 @@ __FBSDID("$FreeBSD$");
* page structure.
*/
-struct vm_pagequeue vm_pagequeues[PQ_COUNT] = {
- [PQ_INACTIVE] = {
- .pq_pl = TAILQ_HEAD_INITIALIZER(
- vm_pagequeues[PQ_INACTIVE].pq_pl),
- .pq_cnt = &cnt.v_inactive_count,
- .pq_name = "vm inactive pagequeue"
- },
- [PQ_ACTIVE] = {
- .pq_pl = TAILQ_HEAD_INITIALIZER(
- vm_pagequeues[PQ_ACTIVE].pq_pl),
- .pq_cnt = &cnt.v_active_count,
- .pq_name = "vm active pagequeue"
- }
-};
+struct vm_domain vm_dom[MAXMEMDOM];
struct mtx_padalign vm_page_queue_free_mtx;
struct mtx_padalign pa_lock[PA_LOCK_COUNT];
@@ -256,6 +242,34 @@ vm_page_blacklist_lookup(char *list, vm_paddr_t pa)
return (0);
}
+static void
+vm_page_domain_init(struct vm_domain *vmd)
+{
+ struct vm_pagequeue *pq;
+ int i;
+
+ *__DECONST(char **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_name) =
+ "vm inactive pagequeue";
+ *__DECONST(int **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_vcnt) =
+ &cnt.v_inactive_count;
+ *__DECONST(char **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_name) =
+ "vm active pagequeue";
+ *__DECONST(int **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_vcnt) =
+ &cnt.v_active_count;
+ vmd->vmd_fullintervalcount = 0;
+ vmd->vmd_page_count = 0;
+ vmd->vmd_free_count = 0;
+ vmd->vmd_segs = 0;
+ vmd->vmd_oom = FALSE;
+ vmd->vmd_pass = 0;
+ for (i = 0; i < PQ_COUNT; i++) {
+ pq = &vmd->vmd_pagequeues[i];
+ TAILQ_INIT(&pq->pq_pl);
+ mtx_init(&pq->pq_mutex, pq->pq_name, "vm pagequeue",
+ MTX_DEF | MTX_DUPOK);
+ }
+}
+
/*
* vm_page_startup:
*
@@ -319,8 +333,8 @@ vm_page_startup(vm_offset_t vaddr)
mtx_init(&vm_page_queue_free_mtx, "vm page free queue", NULL, MTX_DEF);
for (i = 0; i < PA_LOCK_COUNT; i++)
mtx_init(&pa_lock[i], "vm page", NULL, MTX_DEF);
- for (i = 0; i < PQ_COUNT; i++)
- vm_pagequeue_init_lock(&vm_pagequeues[i]);
+ for (i = 0; i < vm_ndomains; i++)
+ vm_page_domain_init(&vm_dom[i]);
/*
* Allocate memory for use when boot strapping the kernel memory
@@ -1055,7 +1069,7 @@ vm_page_cache_free(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
KASSERT((m->flags & (PG_CACHED | PG_FREE)) == PG_FREE,
("vm_page_cache_free: page %p has inconsistent flags", m));
cnt.v_cache_count--;
- cnt.v_free_count++;
+ vm_phys_freecnt_adj(m, 1);
}
empty = vm_radix_is_empty(&object->cache);
mtx_unlock(&vm_page_queue_free_mtx);
@@ -1311,7 +1325,7 @@ vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req)
("vm_page_alloc: page %p is not free", m));
KASSERT(m->valid == 0,
("vm_page_alloc: free page %p is valid", m));
- cnt.v_free_count--;
+ vm_phys_freecnt_adj(m, -1);
}
/*
@@ -1569,7 +1583,7 @@ vm_page_alloc_init(vm_page_t m)
("vm_page_alloc_init: page %p is not free", m));
KASSERT(m->valid == 0,
("vm_page_alloc_init: free page %p is valid", m));
- cnt.v_free_count--;
+ vm_phys_freecnt_adj(m, -1);
if ((m->flags & PG_ZERO) != 0)
vm_page_zero_count--;
}
@@ -1711,6 +1725,13 @@ vm_waitpfault(void)
"pfault", 0);
}
+struct vm_pagequeue *
+vm_page_pagequeue(vm_page_t m)
+{
+
+ return (&vm_phys_domain(m)->vmd_pagequeues[m->queue]);
+}
+
/*
* vm_page_dequeue:
*
@@ -1726,11 +1747,11 @@ vm_page_dequeue(vm_page_t m)
vm_page_lock_assert(m, MA_OWNED);
KASSERT(m->queue != PQ_NONE,
("vm_page_dequeue: page %p is not queued", m));
- pq = &vm_pagequeues[m->queue];
+ pq = vm_page_pagequeue(m);
vm_pagequeue_lock(pq);
m->queue = PQ_NONE;
TAILQ_REMOVE(&pq->pq_pl, m, pageq);
- (*pq->pq_cnt)--;
+ vm_pagequeue_cnt_dec(pq);
vm_pagequeue_unlock(pq);
}
@@ -1747,11 +1768,11 @@ vm_page_dequeue_locked(vm_page_t m)
struct vm_pagequeue *pq;
vm_page_lock_assert(m, MA_OWNED);
- pq = &vm_pagequeues[m->queue];
+ pq = vm_page_pagequeue(m);
vm_pagequeue_assert_locked(pq);
m->queue = PQ_NONE;
TAILQ_REMOVE(&pq->pq_pl, m, pageq);
- (*pq->pq_cnt)--;
+ vm_pagequeue_cnt_dec(pq);
}
/*
@@ -1767,11 +1788,11 @@ vm_page_enqueue(int queue, vm_page_t m)
struct vm_pagequeue *pq;
vm_page_lock_assert(m, MA_OWNED);
- pq = &vm_pagequeues[queue];
+ pq = &vm_phys_domain(m)->vmd_pagequeues[queue];
vm_pagequeue_lock(pq);
m->queue = queue;
TAILQ_INSERT_TAIL(&pq->pq_pl, m, pageq);
- ++*pq->pq_cnt;
+ vm_pagequeue_cnt_inc(pq);
vm_pagequeue_unlock(pq);
}
@@ -1790,7 +1811,7 @@ vm_page_requeue(vm_page_t m)
vm_page_lock_assert(m, MA_OWNED);
KASSERT(m->queue != PQ_NONE,
("vm_page_requeue: page %p is not queued", m));
- pq = &vm_pagequeues[m->queue];
+ pq = vm_page_pagequeue(m);
vm_pagequeue_lock(pq);
TAILQ_REMOVE(&pq->pq_pl, m, pageq);
TAILQ_INSERT_TAIL(&pq->pq_pl, m, pageq);
@@ -1811,7 +1832,7 @@ vm_page_requeue_locked(vm_page_t m)
KASSERT(m->queue != PQ_NONE,
("vm_page_requeue_locked: page %p is not queued", m));
- pq = &vm_pagequeues[m->queue];
+ pq = vm_page_pagequeue(m);
vm_pagequeue_assert_locked(pq);
TAILQ_REMOVE(&pq->pq_pl, m, pageq);
TAILQ_INSERT_TAIL(&pq->pq_pl, m, pageq);
@@ -1948,7 +1969,7 @@ vm_page_free_toq(vm_page_t m)
*/
mtx_lock(&vm_page_queue_free_mtx);
m->flags |= PG_FREE;
- cnt.v_free_count++;
+ vm_phys_freecnt_adj(m, 1);
#if VM_NRESERVLEVEL > 0
if (!vm_reserv_free_page(m))
#else
@@ -2081,14 +2102,14 @@ _vm_page_deactivate(vm_page_t m, int athead)
if (queue != PQ_NONE)
vm_page_dequeue(m);
m->flags &= ~PG_WINATCFLS;
- pq = &vm_pagequeues[PQ_INACTIVE];
+ pq = &vm_phys_domain(m)->vmd_pagequeues[PQ_INACTIVE];
vm_pagequeue_lock(pq);
m->queue = PQ_INACTIVE;
if (athead)
TAILQ_INSERT_HEAD(&pq->pq_pl, m, pageq);
else
TAILQ_INSERT_TAIL(&pq->pq_pl, m, pageq);
- cnt.v_inactive_count++;
+ vm_pagequeue_cnt_inc(pq);
vm_pagequeue_unlock(pq);
}
}
@@ -2888,18 +2909,20 @@ DB_SHOW_COMMAND(page, vm_page_print_page_info)
DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
{
-
- db_printf("PQ_FREE:");
- db_printf(" %d", cnt.v_free_count);
- db_printf("\n");
-
- db_printf("PQ_CACHE:");
- db_printf(" %d", cnt.v_cache_count);
- db_printf("\n");
-
- db_printf("PQ_ACTIVE: %d, PQ_INACTIVE: %d\n",
- *vm_pagequeues[PQ_ACTIVE].pq_cnt,
- *vm_pagequeues[PQ_INACTIVE].pq_cnt);
+ int dom;
+
+ db_printf("pq_free %d pq_cache %d\n",
+ cnt.v_free_count, cnt.v_cache_count);
+ for (dom = 0; dom < vm_ndomains; dom++) {
+ db_printf(
+ "dom %d page_cnt %d free %d pq_act %d pq_inact %d pass %d\n",
+ dom,
+ vm_dom[dom].vmd_page_count,
+ vm_dom[dom].vmd_free_count,
+ vm_dom[dom].vmd_pagequeues[PQ_ACTIVE].pq_cnt,
+ vm_dom[dom].vmd_pagequeues[PQ_INACTIVE].pq_cnt,
+ vm_dom[dom].vmd_pass);
+ }
}
DB_SHOW_COMMAND(pginfo, vm_page_print_pginfo)
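
The vm_page.c changes above replace the single global vm_pagequeues[] array with per-domain queues reached through the new vm_page_pagequeue(). The following standalone C sketch (illustrative names only; page, pagequeue, domain, seg_to_dom and NDOM are not kernel identifiers) shows that lookup pattern: a page records its physical segment, the segment maps to a domain, and the domain holds its own set of page queues.

/*
 * Standalone sketch (not kernel code) of the per-domain queue lookup
 * introduced above.  All names and numbers are made up for illustration.
 */
#include <sys/queue.h>
#include <stdio.h>

#define NDOM		2
#define PQ_INACTIVE	0
#define PQ_ACTIVE	1
#define PQ_COUNT	2

struct page;
struct pagequeue {
	TAILQ_HEAD(, page) pq_pl;	/* list of queued pages */
	int		pq_cnt;		/* per-domain, per-queue count */
};
struct domain {
	struct pagequeue d_pagequeues[PQ_COUNT];
};
struct page {
	TAILQ_ENTRY(page) pageq;
	int	segind;			/* which physical segment */
	int	queue;			/* PQ_ACTIVE, PQ_INACTIVE, ... */
};

static struct domain dom[NDOM];
static int seg_to_dom[] = { 0, 0, 1, 1 };	/* segment -> domain map */

/* Analogue of vm_page_pagequeue(): resolve the domain, then the queue. */
static struct pagequeue *
page_pagequeue(struct page *m)
{
	return (&dom[seg_to_dom[m->segind]].d_pagequeues[m->queue]);
}

int
main(void)
{
	struct page m = { .segind = 2, .queue = PQ_INACTIVE };
	struct pagequeue *pq;
	int i, j;

	for (i = 0; i < NDOM; i++)
		for (j = 0; j < PQ_COUNT; j++)
			TAILQ_INIT(&dom[i].d_pagequeues[j].pq_pl);
	pq = page_pagequeue(&m);
	TAILQ_INSERT_TAIL(&pq->pq_pl, &m, pageq);
	pq->pq_cnt++;
	printf("page lands in domain 1, inactive count %d\n", pq->pq_cnt);
	return (0);
}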
diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h
index 4fe5d7e..d8de0de 100644
--- a/sys/vm/vm_page.h
+++ b/sys/vm/vm_page.h
@@ -181,18 +181,44 @@ TAILQ_HEAD(pglist, vm_page);
struct vm_pagequeue {
struct mtx pq_mutex;
struct pglist pq_pl;
- int *const pq_cnt;
- const char *const pq_name;
+ int pq_cnt;
+ int * const pq_vcnt;
+ const char * const pq_name;
} __aligned(CACHE_LINE_SIZE);
-extern struct vm_pagequeue vm_pagequeues[PQ_COUNT];
+
+struct vm_domain {
+ struct vm_pagequeue vmd_pagequeues[PQ_COUNT];
+ int vmd_fullintervalcount;
+ u_int vmd_page_count;
+ u_int vmd_free_count;
+ long vmd_segs; /* bitmask of the segments */
+ boolean_t vmd_oom;
+ int vmd_pass; /* local pagedaemon pass */
+ struct vm_page vmd_marker; /* marker for pagedaemon private use */
+};
+
+extern struct vm_domain vm_dom[MAXMEMDOM];
#define vm_pagequeue_assert_locked(pq) mtx_assert(&(pq)->pq_mutex, MA_OWNED)
-#define vm_pagequeue_init_lock(pq) mtx_init(&(pq)->pq_mutex, \
- (pq)->pq_name, "vm pagequeue", MTX_DEF | MTX_DUPOK);
#define vm_pagequeue_lock(pq) mtx_lock(&(pq)->pq_mutex)
#define vm_pagequeue_unlock(pq) mtx_unlock(&(pq)->pq_mutex)
+#ifdef _KERNEL
+static __inline void
+vm_pagequeue_cnt_add(struct vm_pagequeue *pq, int addend)
+{
+
+#ifdef notyet
+ vm_pagequeue_assert_locked(pq);
+#endif
+ pq->pq_cnt += addend;
+ atomic_add_int(pq->pq_vcnt, addend);
+}
+#define vm_pagequeue_cnt_inc(pq) vm_pagequeue_cnt_add((pq), 1)
+#define vm_pagequeue_cnt_dec(pq) vm_pagequeue_cnt_add((pq), -1)
+#endif /* _KERNEL */
+
extern struct mtx_padalign vm_page_queue_free_mtx;
extern struct mtx_padalign pa_lock[];
@@ -393,6 +419,7 @@ boolean_t vm_page_is_cached(vm_object_t object, vm_pindex_t pindex);
vm_page_t vm_page_lookup (vm_object_t, vm_pindex_t);
vm_page_t vm_page_next(vm_page_t m);
int vm_page_pa_tryrelock(pmap_t, vm_paddr_t, vm_paddr_t *);
+struct vm_pagequeue *vm_page_pagequeue(vm_page_t m);
vm_page_t vm_page_prev(vm_page_t m);
void vm_page_putfake(vm_page_t m);
void vm_page_readahead_finish(vm_page_t m);
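
The new vm_pagequeue_cnt_add() inline above maintains two counters: the per-queue pq_cnt, protected by that queue's lock, and the legacy system-wide vmmeter field reached through pq_vcnt, which is updated atomically because every domain's queues now feed it concurrently. Below is a minimal userland sketch of the same dual-counter idea, using C11 atomics in place of the kernel's atomic_add_int(); the names are illustrative.

/* Sketch of the dual-counter update in vm_pagequeue_cnt_add(). */
#include <stdatomic.h>
#include <stdio.h>

struct pagequeue {
	int		 pq_cnt;	/* per-queue, queue lock protects it */
	atomic_int	*pq_vcnt;	/* global counter shared by all domains */
};

static atomic_int v_inactive_count;	/* stand-in for cnt.v_inactive_count */

static void
pagequeue_cnt_add(struct pagequeue *pq, int addend)
{
	pq->pq_cnt += addend;			/* caller holds the queue lock */
	atomic_fetch_add(pq->pq_vcnt, addend);	/* global, updated atomically */
}

int
main(void)
{
	struct pagequeue pq = { .pq_cnt = 0, .pq_vcnt = &v_inactive_count };

	pagequeue_cnt_add(&pq, 1);
	pagequeue_cnt_add(&pq, 1);
	pagequeue_cnt_add(&pq, -1);
	printf("queue %d global %d\n", pq.pq_cnt,
	    atomic_load(&v_inactive_count));
	return (0);
}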
diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c
index 841820b..5bdc464 100644
--- a/sys/vm/vm_pageout.c
+++ b/sys/vm/vm_pageout.c
@@ -90,6 +90,7 @@ __FBSDID("$FreeBSD$");
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
+#include <sys/smp.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <sys/rwlock.h>
@@ -103,6 +104,7 @@ __FBSDID("$FreeBSD$");
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
+#include <vm/vm_phys.h>
#include <vm/swap_pager.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>
@@ -114,7 +116,8 @@ __FBSDID("$FreeBSD$");
/* the kernel process "vm_pageout"*/
static void vm_pageout(void);
static int vm_pageout_clean(vm_page_t);
-static void vm_pageout_scan(int pass);
+static void vm_pageout_scan(struct vm_domain *vmd, int pass);
+static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int pass);
struct proc *pageproc;
@@ -216,14 +219,15 @@ SYSCTL_INT(_vm, OID_AUTO, max_wired,
CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count");
static boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *);
-static boolean_t vm_pageout_launder(int, int, vm_paddr_t, vm_paddr_t);
+static boolean_t vm_pageout_launder(struct vm_pagequeue *pq, int, vm_paddr_t,
+ vm_paddr_t);
#if !defined(NO_SWAPPING)
static void vm_pageout_map_deactivate_pages(vm_map_t, long);
static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long);
static void vm_req_vmdaemon(int req);
#endif
static boolean_t vm_pageout_page_lock(vm_page_t, vm_page_t *);
-static void vm_pageout_page_stats(void);
+static void vm_pageout_page_stats(struct vm_domain *vmd);
/*
* Initialize a dummy page for marking the caller's place in the specified
@@ -267,7 +271,7 @@ vm_pageout_fallback_object_lock(vm_page_t m, vm_page_t *next)
queue = m->queue;
vm_pageout_init_marker(&marker, queue);
- pq = &vm_pagequeues[queue];
+ pq = vm_page_pagequeue(m);
object = m->object;
TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, pageq);
@@ -309,7 +313,7 @@ vm_pageout_page_lock(vm_page_t m, vm_page_t *next)
queue = m->queue;
vm_pageout_init_marker(&marker, queue);
- pq = &vm_pagequeues[queue];
+ pq = vm_page_pagequeue(m);
TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, pageq);
vm_pagequeue_unlock(pq);
@@ -567,21 +571,17 @@ vm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen,
}
static boolean_t
-vm_pageout_launder(int queue, int tries, vm_paddr_t low, vm_paddr_t high)
+vm_pageout_launder(struct vm_pagequeue *pq, int tries, vm_paddr_t low,
+ vm_paddr_t high)
{
struct mount *mp;
- struct vm_pagequeue *pq;
struct vnode *vp;
vm_object_t object;
vm_paddr_t pa;
vm_page_t m, m_tmp, next;
- pq = &vm_pagequeues[queue];
vm_pagequeue_lock(pq);
TAILQ_FOREACH_SAFE(m, &pq->pq_pl, pageq, next) {
- KASSERT(m->queue == queue,
- ("vm_pageout_launder: page %p's queue is not %d", m,
- queue));
if ((m->flags & PG_MARKER) != 0)
continue;
pa = VM_PAGE_TO_PHYS(m);
@@ -661,7 +661,8 @@ vm_pageout_launder(int queue, int tries, vm_paddr_t low, vm_paddr_t high)
void
vm_pageout_grow_cache(int tries, vm_paddr_t low, vm_paddr_t high)
{
- int actl, actmax, inactl, inactmax;
+ int actl, actmax, inactl, inactmax, dom, initial_dom;
+ static int start_dom = 0;
if (tries > 0) {
/*
@@ -677,19 +678,55 @@ vm_pageout_grow_cache(int tries, vm_paddr_t low, vm_paddr_t high)
*/
uma_reclaim();
}
+
+ /*
+ * Make the next scan start on the next domain.
+ */
+ initial_dom = atomic_fetchadd_int(&start_dom, 1) % vm_ndomains;
+
inactl = 0;
inactmax = cnt.v_inactive_count;
actl = 0;
actmax = tries < 2 ? 0 : cnt.v_active_count;
+ dom = initial_dom;
+
+ /*
+ * Scan domains in round-robin order, first inactive queues,
+ * then active. Since a domain usually owns a large physically
+ * contiguous chunk of memory, it makes sense to completely
+ * exhaust one domain before switching to the next, while growing
+ * the pool of contiguous physical pages.
+ *
+ * Do not even start laundering a domain which cannot contain
+ * the specified address range, as indicated by the segments
+ * constituting the domain.
+ */
again:
- if (inactl < inactmax && vm_pageout_launder(PQ_INACTIVE, tries, low,
- high)) {
- inactl++;
- goto again;
+ if (inactl < inactmax) {
+ if (vm_phys_domain_intersects(vm_dom[dom].vmd_segs,
+ low, high) &&
+ vm_pageout_launder(&vm_dom[dom].vmd_pagequeues[PQ_INACTIVE],
+ tries, low, high)) {
+ inactl++;
+ goto again;
+ }
+ if (++dom == vm_ndomains)
+ dom = 0;
+ if (dom != initial_dom)
+ goto again;
}
- if (actl < actmax && vm_pageout_launder(PQ_ACTIVE, tries, low, high)) {
- actl++;
- goto again;
+ if (actl < actmax) {
+ if (vm_phys_domain_intersects(vm_dom[dom].vmd_segs,
+ low, high) &&
+ vm_pageout_launder(&vm_dom[dom].vmd_pagequeues[PQ_ACTIVE],
+ tries, low, high)) {
+ actl++;
+ goto again;
+ }
+ if (++dom == vm_ndomains)
+ dom = 0;
+ if (dom != initial_dom)
+ goto again;
}
}
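
The vm_pageout_grow_cache() hunk above walks the domains round-robin: it stays on one domain while laundering there keeps making progress, then wraps to the next, and stops once it is back at the rotating start domain. A small standalone sketch of that control flow follows; budget[] and launder() are made-up stand-ins for the real laundering work.

/* Sketch of the round-robin domain walk with wraparound. */
#include <stdio.h>

#define NDOM	3

/* Pretend each domain can satisfy a fixed number of launder requests. */
static int budget[NDOM] = { 2, 0, 1 };

static int
launder(int dom)
{
	if (budget[dom] > 0) {
		budget[dom]--;
		return (1);		/* progress was made */
	}
	return (0);			/* this domain is exhausted */
}

int
main(void)
{
	static int start_dom;		/* rotates between invocations */
	int dom, initial_dom, laundered;

	initial_dom = start_dom++ % NDOM;
	dom = initial_dom;
	laundered = 0;
again:
	if (launder(dom)) {
		laundered++;
		goto again;		/* stay here while pages keep coming */
	}
	if (++dom == NDOM)
		dom = 0;
	if (dom != initial_dom)
		goto again;		/* try the next domain */
	printf("laundered %d pages starting at domain %d\n",
	    laundered, initial_dom);
	return (0);
}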
@@ -861,10 +898,9 @@ vm_pageout_map_deactivate_pages(map, desired)
* vm_pageout_scan does the dirty work for the pageout daemon.
*/
static void
-vm_pageout_scan(int pass)
+vm_pageout_scan(struct vm_domain *vmd, int pass)
{
vm_page_t m, next;
- struct vm_page marker;
struct vm_pagequeue *pq;
int page_shortage, maxscan, pcount;
int addl_page_shortage;
@@ -874,8 +910,6 @@ vm_pageout_scan(int pass)
int maxlaunder;
boolean_t queues_locked;
- vm_pageout_init_marker(&marker, PQ_INACTIVE);
-
/*
* Decrease registered cache sizes.
*/
@@ -888,7 +922,7 @@ vm_pageout_scan(int pass)
/*
* The addl_page_shortage is the number of temporarily
* stuck pages in the inactive queue. In other words, the
- * number of pages from cnt.v_inactive_count that should be
+ * number of pages from the inactive count that should be
* discounted in setting the target for the active queue scan.
*/
addl_page_shortage = atomic_readandclear_int(&vm_pageout_deficit);
@@ -914,8 +948,6 @@ vm_pageout_scan(int pass)
if (pass)
maxlaunder = 10000;
- maxscan = cnt.v_inactive_count;
-
/*
* Start scanning the inactive queue for pages we can move to the
* cache or free. The scan will stop when the target is reached or
@@ -923,7 +955,8 @@ vm_pageout_scan(int pass)
* is not used to form decisions for the inactive queue, only for the
* active queue.
*/
- pq = &vm_pagequeues[PQ_INACTIVE];
+ pq = &vmd->vmd_pagequeues[PQ_INACTIVE];
+ maxscan = pq->pq_cnt;
vm_pagequeue_lock(pq);
queues_locked = TRUE;
for (m = TAILQ_FIRST(&pq->pq_pl);
@@ -984,7 +1017,7 @@ vm_pageout_scan(int pass)
* 'next' pointer. Use our marker to remember our
* place.
*/
- TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, pageq);
+ TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_marker, pageq);
vm_pagequeue_unlock(pq);
queues_locked = FALSE;
@@ -1034,7 +1067,7 @@ vm_pageout_scan(int pass)
/*
* Held pages are essentially stuck in the
* queue. So, they ought to be discounted
- * from cnt.v_inactive_count. See the
+ * from the inactive count. See the
* calculation of the page_shortage for the
* loop over the active queue below.
*/
@@ -1178,7 +1211,7 @@ vm_pageout_scan(int pass)
*/
if (m->queue != PQ_INACTIVE ||
m->object != object ||
- TAILQ_NEXT(m, pageq) != &marker) {
+ TAILQ_NEXT(m, pageq) != &vmd->vmd_marker) {
vm_page_unlock(m);
if (object->flags & OBJ_MIGHTBEDIRTY)
vnodes_skipped++;
@@ -1248,8 +1281,8 @@ relock_queues:
vm_pagequeue_lock(pq);
queues_locked = TRUE;
}
- next = TAILQ_NEXT(&marker, pageq);
- TAILQ_REMOVE(&pq->pq_pl, &marker, pageq);
+ next = TAILQ_NEXT(&vmd->vmd_marker, pageq);
+ TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_marker, pageq);
}
vm_pagequeue_unlock(pq);
@@ -1258,7 +1291,7 @@ relock_queues:
* active queue to the inactive queue.
*/
page_shortage = vm_paging_target() +
- cnt.v_inactive_target - cnt.v_inactive_count;
+ cnt.v_inactive_target - cnt.v_inactive_count;
page_shortage += addl_page_shortage;
/*
@@ -1266,8 +1299,8 @@ relock_queues:
* track the per-page activity counter and use it to locate
* deactivation candidates.
*/
- pcount = cnt.v_active_count;
- pq = &vm_pagequeues[PQ_ACTIVE];
+ pq = &vmd->vmd_pagequeues[PQ_ACTIVE];
+ pcount = pq->pq_cnt;
vm_pagequeue_lock(pq);
m = TAILQ_FIRST(&pq->pq_pl);
while ((m != NULL) && (pcount-- > 0) && (page_shortage > 0)) {
@@ -1393,12 +1426,54 @@ relock_queues:
* chance to flush out dirty vnode-backed pages and to allow
* active pages to be moved to the inactive queue and reclaimed.
*/
- if (pass != 0 &&
- ((swap_pager_avail < 64 && vm_page_count_min()) ||
- (swap_pager_full && vm_paging_target() > 0)))
- vm_pageout_oom(VM_OOM_MEM);
+ vm_pageout_mightbe_oom(vmd, pass);
}
+static int vm_pageout_oom_vote;
+
+/*
+ * The pagedaemon threads randomly select one of themselves to
+ * perform the OOM kill. Trying to kill processes before all
+ * pagedaemons have failed to reach the free page target is
+ * premature.
+ */
+static void
+vm_pageout_mightbe_oom(struct vm_domain *vmd, int pass)
+{
+ int old_vote;
+
+ if (pass == 0 || !((swap_pager_avail < 64 && vm_page_count_min()) ||
+ (swap_pager_full && vm_paging_target() > 0))) {
+ if (vmd->vmd_oom) {
+ vmd->vmd_oom = FALSE;
+ atomic_subtract_int(&vm_pageout_oom_vote, 1);
+ }
+ return;
+ }
+
+ if (vmd->vmd_oom)
+ return;
+
+ vmd->vmd_oom = TRUE;
+ old_vote = atomic_fetchadd_int(&vm_pageout_oom_vote, 1);
+ if (old_vote != vm_ndomains - 1)
+ return;
+
+ /*
+ * The current pagedaemon thread is the last in the quorum to
+ * start OOM. Initiate the selection and signaling of the
+ * victim.
+ */
+ vm_pageout_oom(VM_OOM_MEM);
+
+ /*
+ * After one round of OOM terror, recall our vote. On the
+ * next pass, the current pagedaemon will vote again if the
+ * low memory condition persists, since vmd_oom is now false.
+ */
+ vmd->vmd_oom = FALSE;
+ atomic_subtract_int(&vm_pageout_oom_vote, 1);
+}
void
vm_pageout_oom(int shortage)
@@ -1501,14 +1576,13 @@ vm_pageout_oom(int shortage)
* helps the situation where paging just starts to occur.
*/
static void
-vm_pageout_page_stats(void)
+vm_pageout_page_stats(struct vm_domain *vmd)
{
struct vm_pagequeue *pq;
vm_object_t object;
vm_page_t m, next;
int pcount, tpcount; /* Number of pages to check */
- static int fullintervalcount = 0;
- int page_shortage;
+ int actcount, page_shortage;
page_shortage =
(cnt.v_inactive_target + cnt.v_cache_max + cnt.v_free_min) -
@@ -1517,25 +1591,30 @@ vm_pageout_page_stats(void)
if (page_shortage <= 0)
return;
- pcount = cnt.v_active_count;
- fullintervalcount += vm_pageout_stats_interval;
- if (fullintervalcount < vm_pageout_full_stats_interval) {
- vm_pageout_stats++;
- tpcount = (int64_t)vm_pageout_stats_max * cnt.v_active_count /
- cnt.v_page_count;
+ pq = &vmd->vmd_pagequeues[PQ_ACTIVE];
+
+ /*
+ * pcount limits the depth of the queue scan. In particular,
+ * for the full scan, it prevents the iteration from looking
+ * into the requeued pages. The limit is not exact since the
+ * page queue lock is dropped during the iteration.
+ */
+ pcount = pq->pq_cnt;
+ vmd->vmd_fullintervalcount += vm_pageout_stats_interval;
+ if (vmd->vmd_fullintervalcount < vm_pageout_full_stats_interval) {
+ atomic_add_int(&vm_pageout_stats, 1);
+ tpcount = (int64_t)vm_pageout_stats_max * pcount /
+ vmd->vmd_page_count;
if (pcount > tpcount)
pcount = tpcount;
} else {
- vm_pageout_full_stats++;
- fullintervalcount = 0;
+ atomic_add_int(&vm_pageout_full_stats, 1);
+ vmd->vmd_fullintervalcount = 0;
}
- pq = &vm_pagequeues[PQ_ACTIVE];
vm_pagequeue_lock(pq);
m = TAILQ_FIRST(&pq->pq_pl);
- while ((m != NULL) && (pcount-- > 0)) {
- int actcount;
-
+ while (m != NULL && pcount-- > 0) {
KASSERT(m->queue == PQ_ACTIVE,
("vm_pageout_page_stats: page %p isn't active", m));
@@ -1560,11 +1639,11 @@ vm_pageout_page_stats(void)
}
/*
- * Don't deactivate pages that are busy.
+ * Don't deactivate pages that are busy or held.
*/
- if ((m->busy != 0) ||
- (m->oflags & VPO_BUSY) ||
- (m->hold_count != 0)) {
+ if (m->busy != 0 ||
+ (m->oflags & VPO_BUSY) != 0 ||
+ m->hold_count != 0) {
vm_page_unlock(m);
VM_OBJECT_WUNLOCK(object);
vm_page_requeue_locked(m);
@@ -1579,7 +1658,7 @@ vm_pageout_page_stats(void)
}
actcount += pmap_ts_referenced(m);
- if (actcount) {
+ if (actcount != 0) {
m->act_count += ACT_ADVANCE + actcount;
if (m->act_count > ACT_MAX)
m->act_count = ACT_MAX;
@@ -1611,13 +1690,105 @@ vm_pageout_page_stats(void)
vm_pagequeue_unlock(pq);
}
+static void
+vm_pageout_worker(void *arg)
+{
+ struct vm_domain *domain;
+ struct pcpu *pc;
+ int cpu, error, domidx;
+
+ domidx = (uintptr_t)arg;
+ domain = &vm_dom[domidx];
+
+ /*
+ * XXXKIB The bind is rather arbitrary. With some minor
+ * complications, we could assign the cpuset consisting of all
+ * CPUs in the same domain. In fact, it does not even matter
+ * whether the CPU we bind to is in the affinity domain of this
+ * page queue; we only need to establish a fair distribution
+ * of pagedaemon threads among CPUs.
+ *
+ * XXXKIB It would be useful to allocate vm_pages for the
+ * domain from the domain, and put pcpu area into the page
+ * owned by the domain.
+ */
+ if (mem_affinity != NULL) {
+ CPU_FOREACH(cpu) {
+ pc = pcpu_find(cpu);
+ if (pc->pc_domain == domidx) {
+ thread_lock(curthread);
+ sched_bind(curthread, cpu);
+ thread_unlock(curthread);
+ break;
+ }
+ }
+ }
+
+ KASSERT(domain->vmd_segs != 0, ("domain without segments"));
+ vm_pageout_init_marker(&domain->vmd_marker, PQ_INACTIVE);
+
+ /*
+ * The pageout daemon worker is never done, so loop forever.
+ */
+ while (TRUE) {
+ /*
+ * If we have enough free memory, wakeup waiters. Do
+ * not clear vm_pages_needed until we reach our target,
+ * otherwise we may be woken up over and over again and
+ * waste a lot of cpu.
+ */
+ mtx_lock(&vm_page_queue_free_mtx);
+ if (vm_pages_needed && !vm_page_count_min()) {
+ if (!vm_paging_needed())
+ vm_pages_needed = 0;
+ wakeup(&cnt.v_free_count);
+ }
+ if (vm_pages_needed) {
+ /*
+ * Still not done, take a second pass without waiting
+ * (unlimited dirty cleaning), otherwise sleep a bit
+ * and try again.
+ */
+ ++(domain->vmd_pass);
+ if (domain->vmd_pass > 1)
+ msleep(&vm_pages_needed,
+ &vm_page_queue_free_mtx, PVM, "psleep",
+ hz / 2);
+ } else {
+ /*
+ * Good enough, sleep & handle stats. Prime the pass
+ * for the next run.
+ */
+ if (domain->vmd_pass > 1)
+ domain->vmd_pass = 1;
+ else
+ domain->vmd_pass = 0;
+ error = msleep(&vm_pages_needed,
+ &vm_page_queue_free_mtx, PVM, "psleep",
+ vm_pageout_stats_interval * hz);
+ if (error && !vm_pages_needed) {
+ mtx_unlock(&vm_page_queue_free_mtx);
+ domain->vmd_pass = 0;
+ vm_pageout_page_stats(domain);
+ continue;
+ }
+ }
+ if (vm_pages_needed)
+ cnt.v_pdwakeups++;
+ mtx_unlock(&vm_page_queue_free_mtx);
+ vm_pageout_scan(domain, domain->vmd_pass);
+ }
+}
+
/*
* vm_pageout is the high level pageout daemon.
*/
static void
vm_pageout(void)
{
- int error, pass;
+#if MAXMEMDOM > 1
+ int error, i;
+#endif
/*
* Initialize some paging parameters.
@@ -1687,58 +1858,17 @@ vm_pageout(void)
vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4;
swap_pager_swap_init();
- pass = 0;
- /*
- * The pageout daemon is never done, so loop forever.
- */
- while (TRUE) {
- /*
- * If we have enough free memory, wakeup waiters. Do
- * not clear vm_pages_needed until we reach our target,
- * otherwise we may be woken up over and over again and
- * waste a lot of cpu.
- */
- mtx_lock(&vm_page_queue_free_mtx);
- if (vm_pages_needed && !vm_page_count_min()) {
- if (!vm_paging_needed())
- vm_pages_needed = 0;
- wakeup(&cnt.v_free_count);
- }
- if (vm_pages_needed) {
- /*
- * Still not done, take a second pass without waiting
- * (unlimited dirty cleaning), otherwise sleep a bit
- * and try again.
- */
- ++pass;
- if (pass > 1)
- msleep(&vm_pages_needed,
- &vm_page_queue_free_mtx, PVM, "psleep",
- hz / 2);
- } else {
- /*
- * Good enough, sleep & handle stats. Prime the pass
- * for the next run.
- */
- if (pass > 1)
- pass = 1;
- else
- pass = 0;
- error = msleep(&vm_pages_needed,
- &vm_page_queue_free_mtx, PVM, "psleep",
- vm_pageout_stats_interval * hz);
- if (error && !vm_pages_needed) {
- mtx_unlock(&vm_page_queue_free_mtx);
- pass = 0;
- vm_pageout_page_stats();
- continue;
- }
+#if MAXMEMDOM > 1
+ for (i = 1; i < vm_ndomains; i++) {
+ error = kthread_add(vm_pageout_worker, (void *)(uintptr_t)i,
+ curproc, NULL, 0, 0, "dom%d", i);
+ if (error != 0) {
+ panic("starting pageout for domain %d, error %d\n",
+ i, error);
}
- if (vm_pages_needed)
- cnt.v_pdwakeups++;
- mtx_unlock(&vm_page_queue_free_mtx);
- vm_pageout_scan(pass);
}
+#endif
+ vm_pageout_worker((uintptr_t)0);
}
/*
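
vm_pageout_mightbe_oom() above turns the old single-threaded OOM trigger into a vote: each per-domain pagedaemon casts one vote while it cannot reach its free target, and only the thread whose vote completes the quorum runs vm_pageout_oom(). A userland sketch of the vote bookkeeping is shown below, with threads replaced by plain function calls and illustrative names.

/* Sketch of the per-domain OOM voting quorum. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NDOM	2

static atomic_int oom_vote;

struct domain {
	bool	oom;		/* has this domain already voted? */
};

static void
mightbe_oom(struct domain *d, int id, bool starving)
{
	int old_vote;

	if (!starving) {
		if (d->oom) {		/* recall a previously cast vote */
			d->oom = false;
			atomic_fetch_sub(&oom_vote, 1);
		}
		return;
	}
	if (d->oom)
		return;			/* already voted, nothing to add */
	d->oom = true;
	old_vote = atomic_fetch_add(&oom_vote, 1);
	if (old_vote != NDOM - 1)
		return;			/* quorum not complete yet */
	printf("domain %d completed the quorum: run the OOM killer\n", id);
	d->oom = false;			/* recall the vote for the next pass */
	atomic_fetch_sub(&oom_vote, 1);
}

int
main(void)
{
	struct domain dom[NDOM] = { { false }, { false } };

	mightbe_oom(&dom[0], 0, true);	/* one starving domain: no OOM yet */
	mightbe_oom(&dom[1], 1, true);	/* second vote completes the quorum */
	return (0);
}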
diff --git a/sys/vm/vm_phys.c b/sys/vm/vm_phys.c
index e55f841..1fa223b 100644
--- a/sys/vm/vm_phys.c
+++ b/sys/vm/vm_phys.c
@@ -65,26 +65,15 @@ __FBSDID("$FreeBSD$");
#include <vm/vm_page.h>
#include <vm/vm_phys.h>
-struct vm_freelist {
- struct pglist pl;
- int lcnt;
-};
-
-struct vm_phys_seg {
- vm_paddr_t start;
- vm_paddr_t end;
- vm_page_t first_page;
- int domain;
- struct vm_freelist (*free_queues)[VM_NFREEPOOL][VM_NFREEORDER];
-};
+_Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
+ "Too many physsegs.");
struct mem_affinity *mem_affinity;
int vm_ndomains = 1;
-static struct vm_phys_seg vm_phys_segs[VM_PHYSSEG_MAX];
-
-static int vm_phys_nsegs;
+struct vm_phys_seg vm_phys_segs[VM_PHYSSEG_MAX];
+int vm_phys_nsegs;
#define VM_PHYS_FICTITIOUS_NSEGS 8
static struct vm_phys_fictitious_seg {
@@ -140,6 +129,22 @@ vm_rr_selectdomain(void)
#endif
}
+boolean_t
+vm_phys_domain_intersects(long mask, vm_paddr_t low, vm_paddr_t high)
+{
+ struct vm_phys_seg *s;
+ int idx;
+
+ while ((idx = ffsl(mask)) != 0) {
+ idx--; /* ffsl counts from 1 */
+ mask &= ~(1UL << idx);
+ s = &vm_phys_segs[idx];
+ if (low < s->end && high > s->start)
+ return (TRUE);
+ }
+ return (FALSE);
+}
+
/*
* Outputs the state of the physical memory allocator, specifically,
* the amount of physical memory in each free list.
@@ -378,12 +383,16 @@ void
vm_phys_add_page(vm_paddr_t pa)
{
vm_page_t m;
+ struct vm_domain *vmd;
cnt.v_page_count++;
m = vm_phys_paddr_to_vm_page(pa);
m->phys_addr = pa;
m->queue = PQ_NONE;
m->segind = vm_phys_paddr_to_segind(pa);
+ vmd = vm_phys_domain(m);
+ vmd->vmd_page_count++;
+ vmd->vmd_segs |= 1UL << m->segind;
m->flags = PG_FREE;
KASSERT(m->order == VM_NFREEORDER,
("vm_phys_add_page: page %p has unexpected order %d",
@@ -391,7 +400,7 @@ vm_phys_add_page(vm_paddr_t pa)
m->pool = VM_FREEPOOL_DEFAULT;
pmap_page_init(m);
mtx_lock(&vm_page_queue_free_mtx);
- cnt.v_free_count++;
+ vm_phys_freecnt_adj(m, 1);
vm_phys_free_pages(m, 0);
mtx_unlock(&vm_page_queue_free_mtx);
}
@@ -813,12 +822,12 @@ vm_phys_zero_pages_idle(void)
for (m_tmp = m; m_tmp < &m[1 << oind]; m_tmp++) {
if ((m_tmp->flags & (PG_CACHED | PG_ZERO)) == 0) {
vm_phys_unfree_page(m_tmp);
- cnt.v_free_count--;
+ vm_phys_freecnt_adj(m, -1);
mtx_unlock(&vm_page_queue_free_mtx);
pmap_zero_page_idle(m_tmp);
m_tmp->flags |= PG_ZERO;
mtx_lock(&vm_page_queue_free_mtx);
- cnt.v_free_count++;
+ vm_phys_freecnt_adj(m, 1);
vm_phys_free_pages(m_tmp, 0);
vm_page_zero_count++;
cnt_prezero++;
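
vm_phys_domain_intersects() above tests a domain's segment bitmask against a physical address range, so laundering can skip domains that cannot possibly contain pages in [low, high). Below is a standalone sketch of the same bit-walk; it uses the __builtin_ffsl() compiler builtin instead of the kernel's ffsl(), and the segment table is made up.

/* Sketch of the segment-mask intersection test. */
#include <stdbool.h>
#include <stdio.h>

struct seg {
	unsigned long	start;
	unsigned long	end;
};

static struct seg segs[] = {
	{ 0x00000000, 0x40000000 },	/* segment 0: domain 0 */
	{ 0x40000000, 0x80000000 },	/* segment 1: domain 1 */
};

static bool
domain_intersects(long mask, unsigned long low, unsigned long high)
{
	int idx;

	while ((idx = __builtin_ffsl(mask)) != 0) {
		idx--;				/* ffsl counts from 1 */
		mask &= ~(1UL << idx);
		if (low < segs[idx].end && high > segs[idx].start)
			return (true);
	}
	return (false);
}

int
main(void)
{
	long dom1_mask = 1L << 1;	/* domain 1 owns segment 1 only */

	/* Range entirely inside segment 0: no intersection, prints 0. */
	printf("%d\n", domain_intersects(dom1_mask, 0x10000000, 0x20000000));
	/* Range straddling the segment boundary: prints 1. */
	printf("%d\n", domain_intersects(dom1_mask, 0x30000000, 0x50000000));
	return (0);
}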
diff --git a/sys/vm/vm_phys.h b/sys/vm/vm_phys.h
index 9812816..f39943c 100644
--- a/sys/vm/vm_phys.h
+++ b/sys/vm/vm_phys.h
@@ -47,8 +47,23 @@ struct mem_affinity {
int domain;
};
+struct vm_freelist {
+ struct pglist pl;
+ int lcnt;
+};
+
+struct vm_phys_seg {
+ vm_paddr_t start;
+ vm_paddr_t end;
+ vm_page_t first_page;
+ int domain;
+ struct vm_freelist (*free_queues)[VM_NFREEPOOL][VM_NFREEORDER];
+};
+
extern struct mem_affinity *mem_affinity;
extern int vm_ndomains;
+extern struct vm_phys_seg vm_phys_segs[];
+extern int vm_phys_nsegs;
/*
* The following functions are only to be used by the virtual memory system.
@@ -58,6 +73,7 @@ vm_page_t vm_phys_alloc_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
u_long alignment, vm_paddr_t boundary);
vm_page_t vm_phys_alloc_freelist_pages(int flind, int pool, int order);
vm_page_t vm_phys_alloc_pages(int pool, int order);
+boolean_t vm_phys_domain_intersects(long mask, vm_paddr_t low, vm_paddr_t high);
int vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end,
vm_memattr_t memattr);
void vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end);
@@ -70,5 +86,36 @@ void vm_phys_set_pool(int pool, vm_page_t m, int order);
boolean_t vm_phys_unfree_page(vm_page_t m);
boolean_t vm_phys_zero_pages_idle(void);
+/*
+ * vm_phys_domain:
+ *
+ * Return the memory domain the page belongs to.
+ */
+static inline struct vm_domain *
+vm_phys_domain(vm_page_t m)
+{
+#if MAXMEMDOM > 1
+ int domn, segind;
+
+ /* XXXKIB try to assert that the page is managed */
+ segind = m->segind;
+ KASSERT(segind < vm_phys_nsegs, ("segind %d m %p", segind, m));
+ domn = vm_phys_segs[segind].domain;
+ KASSERT(domn < vm_ndomains, ("domain %d m %p", domn, m));
+ return (&vm_dom[domn]);
+#else
+ return (&vm_dom[0]);
+#endif
+}
+
+static inline void
+vm_phys_freecnt_adj(vm_page_t m, int adj)
+{
+
+ mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+ cnt.v_free_count += adj;
+ vm_phys_domain(m)->vmd_free_count += adj;
+}
+
#endif /* _KERNEL */
#endif /* !_VM_PHYS_H_ */
diff --git a/sys/vm/vm_zeroidle.c b/sys/vm/vm_zeroidle.c
index 6ba96e1..8c191c0 100644
--- a/sys/vm/vm_zeroidle.c
+++ b/sys/vm/vm_zeroidle.c
@@ -50,6 +50,7 @@ __FBSDID("$FreeBSD$");
#include <sys/unistd.h>
#include <vm/vm.h>
+#include <vm/vm_param.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>
diff --git a/sys/x86/acpica/srat.c b/sys/x86/acpica/srat.c
index 7ea715e..8b5082c 100644
--- a/sys/x86/acpica/srat.c
+++ b/sys/x86/acpica/srat.c
@@ -31,10 +31,14 @@ __FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/bus.h>
#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
#include <sys/smp.h>
+#include <sys/vmmeter.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_param.h>
+#include <vm/vm_page.h>
#include <vm/vm_phys.h>
#include <contrib/dev/acpica/include/acpi.h>