summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorjhb <jhb@FreeBSD.org>2010-07-27 20:33:50 +0000
committerjhb <jhb@FreeBSD.org>2010-07-27 20:33:50 +0000
commitf27c8b35e2b318d78a7d1b1a5da79b6443afc069 (patch)
tree7e82572d7902fd76fd7aca9ed824e46e66521ec3
parentc5f2739ce161682d9aa352169fde83a2aa2073ae (diff)
downloadFreeBSD-src-f27c8b35e2b318d78a7d1b1a5da79b6443afc069.zip
FreeBSD-src-f27c8b35e2b318d78a7d1b1a5da79b6443afc069.tar.gz
Very rough first cut at NUMA support for the physical page allocator. For
now it uses a very dumb first-touch allocation policy. This will change in the future.
- Each architecture indicates the maximum number of supported memory domains via a new VM_NDOMAIN parameter in <machine/vmparam.h>.
- Each cpu now has a PCPU_GET(domain) member to indicate the memory domain a CPU belongs to. Domain values are dense and numbered from 0.
- When a platform supports multiple domains, the default freelist (VM_FREELIST_DEFAULT) is split up into N freelists, one for each domain. The MD code is required to populate an array of mem_affinity structures. Each entry in the array defines a range of memory (start and end) and a domain for the range. Multiple entries may be present for a single domain. The list is terminated by an entry where all fields are zero. This array of structures is used to split up phys_avail[] regions that fall in VM_FREELIST_DEFAULT into per-domain freelists.
- Each memory domain has a separate lookup-array of freelists that is used when fulfilling a physical memory allocation. Right now the per-domain freelists are listed in a round-robin order for each domain. In the future a table such as the ACPI SLIT table may be used to order the per-domain lookup lists based on the penalty for each memory domain relative to a specific domain. The lookup lists may be examined via a new vm.phys_lookup_lists sysctl.
- The first-touch policy is implemented by using PCPU_GET(domain) to pick a lookup list when allocating memory.
Reviewed by: alc
-rw-r--r--sys/amd64/include/vmparam.h7
-rw-r--r--sys/arm/include/vmparam.h7
-rw-r--r--sys/i386/include/vmparam.h7
-rw-r--r--sys/ia64/include/vmparam.h7
-rw-r--r--sys/mips/include/vmparam.h7
-rw-r--r--sys/powerpc/include/vmparam.h7
-rw-r--r--sys/sparc64/include/vmparam.h7
-rw-r--r--sys/sun4v/include/vmparam.h7
-rw-r--r--sys/sys/pcpu.h1
-rw-r--r--sys/vm/vm_phys.c147
-rw-r--r--sys/vm/vm_phys.h9
11 files changed, 206 insertions, 7 deletions
diff --git a/sys/amd64/include/vmparam.h b/sys/amd64/include/vmparam.h
index 6dbe371..f86b184 100644
--- a/sys/amd64/include/vmparam.h
+++ b/sys/amd64/include/vmparam.h
@@ -132,6 +132,13 @@
#define VM_NFREEORDER 13
/*
+ * Only one memory domain.
+ */
+#ifndef VM_NDOMAIN
+#define VM_NDOMAIN 1
+#endif
+
+/*
* Enable superpage reservations: 1 level.
*/
#ifndef VM_NRESERVLEVEL
diff --git a/sys/arm/include/vmparam.h b/sys/arm/include/vmparam.h
index d54671d..23b0f86 100644
--- a/sys/arm/include/vmparam.h
+++ b/sys/arm/include/vmparam.h
@@ -86,6 +86,13 @@
#define VM_NFREEORDER 9
/*
+ * Only one memory domain.
+ */
+#ifndef VM_NDOMAIN
+#define VM_NDOMAIN 1
+#endif
+
+/*
* Disable superpage reservations.
*/
#ifndef VM_NRESERVLEVEL
diff --git a/sys/i386/include/vmparam.h b/sys/i386/include/vmparam.h
index e5f596c..2b159b6 100644
--- a/sys/i386/include/vmparam.h
+++ b/sys/i386/include/vmparam.h
@@ -119,6 +119,13 @@
#endif
/*
+ * Only one memory domain.
+ */
+#ifndef VM_NDOMAIN
+#define VM_NDOMAIN 1
+#endif
+
+/*
* Enable superpage reservations: 1 level.
*/
#ifndef VM_NRESERVLEVEL
diff --git a/sys/ia64/include/vmparam.h b/sys/ia64/include/vmparam.h
index 2a0f061..a827e01 100644
--- a/sys/ia64/include/vmparam.h
+++ b/sys/ia64/include/vmparam.h
@@ -120,6 +120,13 @@
#define VM_NFREEORDER 16
/*
+ * Only one memory domain.
+ */
+#ifndef VM_NDOMAIN
+#define VM_NDOMAIN 1
+#endif
+
+/*
* Disable superpage reservations.
*/
#ifndef VM_NRESERVLEVEL
diff --git a/sys/mips/include/vmparam.h b/sys/mips/include/vmparam.h
index c7ce73e..385d309 100644
--- a/sys/mips/include/vmparam.h
+++ b/sys/mips/include/vmparam.h
@@ -118,6 +118,13 @@
#endif
/*
+ * Only one memory domain.
+ */
+#ifndef VM_NDOMAIN
+#define VM_NDOMAIN 1
+#endif
+
+/*
* Disable superpage reservations. (not sure if this is right
* I copied it from ARM)
*/
diff --git a/sys/powerpc/include/vmparam.h b/sys/powerpc/include/vmparam.h
index ad8c67d..13f2e57 100644
--- a/sys/powerpc/include/vmparam.h
+++ b/sys/powerpc/include/vmparam.h
@@ -167,6 +167,13 @@ struct pmap_physseg {
#define VM_NFREEORDER 11
/*
+ * Only one memory domain.
+ */
+#ifndef VM_NDOMAIN
+#define VM_NDOMAIN 1
+#endif
+
+/*
* Disable superpage reservations.
*/
#ifndef VM_NRESERVLEVEL
diff --git a/sys/sparc64/include/vmparam.h b/sys/sparc64/include/vmparam.h
index 7849884..b9ce546 100644
--- a/sys/sparc64/include/vmparam.h
+++ b/sys/sparc64/include/vmparam.h
@@ -121,6 +121,13 @@
#define VM_NFREEORDER 12
/*
+ * Only one memory domain.
+ */
+#ifndef VM_NDOMAIN
+#define VM_NDOMAIN 1
+#endif
+
+/*
* Disable superpage reservations.
*/
#ifndef VM_NRESERVLEVEL
diff --git a/sys/sun4v/include/vmparam.h b/sys/sun4v/include/vmparam.h
index bd133e8..f4b24b5 100644
--- a/sys/sun4v/include/vmparam.h
+++ b/sys/sun4v/include/vmparam.h
@@ -121,6 +121,13 @@
#define VM_NFREEORDER 12
/*
+ * Only one memory domain.
+ */
+#ifndef VM_NDOMAIN
+#define VM_NDOMAIN 1
+#endif
+
+/*
* Disable superpage reservations.
*/
#ifndef VM_NRESERVLEVEL
diff --git a/sys/sys/pcpu.h b/sys/sys/pcpu.h
index 917f329..7d510c9 100644
--- a/sys/sys/pcpu.h
+++ b/sys/sys/pcpu.h
@@ -179,6 +179,7 @@ struct pcpu {
struct device *pc_device;
void *pc_netisr; /* netisr SWI cookie */
int pc_dnweight; /* vm_page_dontneed() */
+ int pc_domain; /* Memory domain. */
/*
* Stuff for read mostly lock
diff --git a/sys/vm/vm_phys.c b/sys/vm/vm_phys.c
index 3482337..e75c340 100644
--- a/sys/vm/vm_phys.c
+++ b/sys/vm/vm_phys.c
@@ -56,6 +56,13 @@ __FBSDID("$FreeBSD$");
#include <vm/vm_phys.h>
#include <vm/vm_reserv.h>
+/*
+ * VM_FREELIST_DEFAULT is split into VM_NDOMAIN lists, one for each
+ * domain. These extra lists are stored at the end of the regular
+ * free lists starting with VM_NFREELIST.
+ */
+#define VM_RAW_NFREELIST (VM_NFREELIST + VM_NDOMAIN - 1)
+
struct vm_freelist {
struct pglist pl;
int lcnt;
@@ -65,15 +72,20 @@ struct vm_phys_seg {
vm_paddr_t start;
vm_paddr_t end;
vm_page_t first_page;
+ int domain;
struct vm_freelist (*free_queues)[VM_NFREEPOOL][VM_NFREEORDER];
};
+struct mem_affinity *mem_affinity;
+
static struct vm_phys_seg vm_phys_segs[VM_PHYSSEG_MAX];
static int vm_phys_nsegs;
static struct vm_freelist
- vm_phys_free_queues[VM_NFREELIST][VM_NFREEPOOL][VM_NFREEORDER];
+ vm_phys_free_queues[VM_RAW_NFREELIST][VM_NFREEPOOL][VM_NFREEORDER];
+static struct vm_freelist
+(*vm_phys_lookup_lists[VM_NDOMAIN][VM_RAW_NFREELIST])[VM_NFREEPOOL][VM_NFREEORDER];
static int vm_nfreelists = VM_FREELIST_DEFAULT + 1;
@@ -89,6 +101,14 @@ static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_segs, CTLTYPE_STRING | CTLFLAG_RD,
NULL, 0, sysctl_vm_phys_segs, "A", "Phys Seg Info");
+#if VM_NDOMAIN > 1
+static int sysctl_vm_phys_lookup_lists(SYSCTL_HANDLER_ARGS);
+SYSCTL_OID(_vm, OID_AUTO, phys_lookup_lists, CTLTYPE_STRING | CTLFLAG_RD,
+ NULL, 0, sysctl_vm_phys_lookup_lists, "A", "Phys Lookup Lists");
+#endif
+
+static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind,
+ int domain);
static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind);
static int vm_phys_paddr_to_segind(vm_paddr_t pa);
static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
@@ -157,6 +177,7 @@ sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)
(uintmax_t)seg->start);
sbuf_printf(&sbuf, "end: %#jx\n",
(uintmax_t)seg->end);
+ sbuf_printf(&sbuf, "domain: %d\n", seg->domain);
sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues);
}
sbuf_finish(&sbuf);
@@ -166,11 +187,40 @@ sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)
return (error);
}
+#if VM_NDOMAIN > 1
+/*
+ * Outputs the set of free list lookup lists.
+ *
+ * For each domain, emits a "DOMAIN n:" header followed by one line per
+ * lookup-list slot giving the slot index and the pointer stored in
+ * vm_phys_lookup_lists[domain][slot]. The text is built in a fixed-length
+ * sbuf over a temporary M_TEMP buffer and copied out with SYSCTL_OUT();
+ * the SYSCTL_OUT() result is returned.
+ */
+static int
+sysctl_vm_phys_lookup_lists(SYSCTL_HANDLER_ARGS)
+{
+ struct sbuf sbuf;
+ char *cbuf;
+ const int cbufsize = (vm_nfreelists + 1) * VM_NDOMAIN * 81; /* NOTE(review): 81 looks like a per-line byte budget -- confirm. */
+ int domain, error, flind, ndomains;
+
+ ndomains = vm_nfreelists - VM_NFREELIST + 1; /* Domain count derived from how many free lists exist beyond VM_NFREELIST. */
+ cbuf = malloc(cbufsize, M_TEMP, M_WAITOK | M_ZERO);
+ sbuf_new(&sbuf, cbuf, cbufsize, SBUF_FIXEDLEN); /* Fixed length: the sbuf uses cbuf as its storage. */
+ for (domain = 0; domain < ndomains; domain++) {
+ sbuf_printf(&sbuf, "\nDOMAIN %d:\n\n", domain);
+ for (flind = 0; flind < vm_nfreelists; flind++)
+ sbuf_printf(&sbuf, " [%d]:\t%p\n", flind,
+ vm_phys_lookup_lists[domain][flind]);
+ }
+ sbuf_finish(&sbuf);
+ error = SYSCTL_OUT(req, sbuf_data(&sbuf), sbuf_len(&sbuf));
+ sbuf_delete(&sbuf);
+ free(cbuf, M_TEMP); /* cbuf was allocated here, so it is released here after the sbuf is deleted. */
+ return (error);
+}
+#endif
+
/*
* Create a physical memory segment.
*/
static void
-vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind)
+_vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind, int domain)
{
struct vm_phys_seg *seg;
#ifdef VM_PHYSSEG_SPARSE
@@ -188,14 +238,51 @@ vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind)
seg = &vm_phys_segs[vm_phys_nsegs++];
seg->start = start;
seg->end = end;
+ seg->domain = domain;
#ifdef VM_PHYSSEG_SPARSE
seg->first_page = &vm_page_array[pages];
#else
seg->first_page = PHYS_TO_VM_PAGE(start);
#endif
+#if VM_NDOMAIN > 1
+ if (flind == VM_FREELIST_DEFAULT && domain != 0) {
+ flind = VM_NFREELIST + (domain - 1);
+ if (flind >= vm_nfreelists)
+ vm_nfreelists = flind + 1;
+ }
+#endif
seg->free_queues = &vm_phys_free_queues[flind];
}
+static void
+vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind)
+{ /* Create segment(s) for [start, end), one per mem_affinity range it spans. */
+ int i;
+
+ if (mem_affinity == NULL) { /* No affinity table: the whole range goes to domain 0. */
+ _vm_phys_create_seg(start, end, flind, 0);
+ return;
+ }
+
+ for (i = 0;; i++) {
+ if (mem_affinity[i].end == 0) /* An entry with end == 0 marks the end of the table. */
+ panic("Reached end of affinity info");
+ if (mem_affinity[i].end <= start) /* Entry ends at or before start; skip it. */
+ continue;
+ if (mem_affinity[i].start > start) /* Entry begins past start, so it does not cover start. */
+ panic("No affinity info for start %jx",
+ (uintmax_t)start);
+ if (mem_affinity[i].end >= end) { /* Entry covers the rest of the range: last segment. */
+ _vm_phys_create_seg(start, end, flind,
+ mem_affinity[i].domain);
+ break;
+ }
+ _vm_phys_create_seg(start, mem_affinity[i].end, flind, /* Range runs past this entry: create the part inside it. */
+ mem_affinity[i].domain);
+ start = mem_affinity[i].end; /* Continue with the remainder from where this entry ends. */
+ }
+}
+
/*
* Initialize the physical memory allocator.
*/
@@ -204,6 +291,9 @@ vm_phys_init(void)
{
struct vm_freelist *fl;
int flind, i, oind, pind;
+#if VM_NDOMAIN > 1
+ int ndomains, j;
+#endif
for (i = 0; phys_avail[i + 1] != 0; i += 2) {
#ifdef VM_FREELIST_ISADMA
@@ -246,6 +336,37 @@ vm_phys_init(void)
TAILQ_INIT(&fl[oind].pl);
}
}
+#if VM_NDOMAIN > 1
+ /*
+ * Build a free list lookup list for each domain. All of the
+ * memory domain lists are inserted at the VM_FREELIST_DEFAULT
+ * index in a round-robin order starting with the current
+ * domain.
+ */
+ ndomains = vm_nfreelists - VM_NFREELIST + 1;
+ for (flind = 0; flind < VM_FREELIST_DEFAULT; flind++)
+ for (i = 0; i < ndomains; i++)
+ vm_phys_lookup_lists[i][flind] =
+ &vm_phys_free_queues[flind];
+ for (i = 0; i < ndomains; i++)
+ for (j = 0; j < ndomains; j++) {
+ flind = (i + j) % ndomains;
+ if (flind == 0)
+ flind = VM_FREELIST_DEFAULT;
+ else
+ flind += VM_NFREELIST - 1;
+ vm_phys_lookup_lists[i][VM_FREELIST_DEFAULT + j] =
+ &vm_phys_free_queues[flind];
+ }
+ for (flind = VM_FREELIST_DEFAULT + 1; flind < VM_NFREELIST;
+ flind++)
+ for (i = 0; i < ndomains; i++)
+ vm_phys_lookup_lists[i][flind + ndomains - 1] =
+ &vm_phys_free_queues[flind];
+#else
+ for (flind = 0; flind < vm_nfreelists; flind++)
+ vm_phys_lookup_lists[0][flind] = &vm_phys_free_queues[flind];
+#endif
}
/*
@@ -321,7 +442,7 @@ vm_phys_alloc_freelist_pages(int flind, int pool, int order)
{
struct vm_freelist *fl;
struct vm_freelist *alt;
- int oind, pind;
+ int domain, oind, pind;
vm_page_t m;
KASSERT(flind < VM_NFREELIST,
@@ -330,8 +451,14 @@ vm_phys_alloc_freelist_pages(int flind, int pool, int order)
("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
KASSERT(order < VM_NFREEORDER,
("vm_phys_alloc_freelist_pages: order %d is out of range", order));
+
+#if VM_NDOMAIN > 1
+ domain = PCPU_GET(domain);
+#else
+ domain = 0;
+#endif
mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
- fl = vm_phys_free_queues[flind][pool];
+ fl = (*vm_phys_lookup_lists[domain][flind])[pool];
for (oind = order; oind < VM_NFREEORDER; oind++) {
m = TAILQ_FIRST(&fl[oind].pl);
if (m != NULL) {
@@ -351,7 +478,7 @@ vm_phys_alloc_freelist_pages(int flind, int pool, int order)
*/
for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
for (pind = 0; pind < VM_NFREEPOOL; pind++) {
- alt = vm_phys_free_queues[flind][pind];
+ alt = (*vm_phys_lookup_lists[domain][flind])[pind];
m = TAILQ_FIRST(&alt[oind].pl);
if (m != NULL) {
TAILQ_REMOVE(&alt[oind].pl, m, pageq);
@@ -613,8 +740,13 @@ vm_phys_alloc_contig(unsigned long npages, vm_paddr_t low, vm_paddr_t high,
struct vnode *vp;
vm_paddr_t pa, pa_last, size;
vm_page_t deferred_vdrop_list, m, m_ret;
- int flind, i, oind, order, pind;
+ int domain, flind, i, oind, order, pind;
+#if VM_NDOMAIN > 1
+ domain = PCPU_GET(domain);
+#else
+ domain = 0;
+#endif
size = npages << PAGE_SHIFT;
KASSERT(size != 0,
("vm_phys_alloc_contig: size must not be 0"));
@@ -632,7 +764,8 @@ retry:
for (flind = 0; flind < vm_nfreelists; flind++) {
for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER; oind++) {
for (pind = 0; pind < VM_NFREEPOOL; pind++) {
- fl = vm_phys_free_queues[flind][pind];
+ fl = (*vm_phys_lookup_lists[domain][flind])
+ [pind];
TAILQ_FOREACH(m_ret, &fl[oind].pl, pageq) {
/*
* A free list may contain physical pages
diff --git a/sys/vm/vm_phys.h b/sys/vm/vm_phys.h
index 0dbd96a..a5b9e93 100644
--- a/sys/vm/vm_phys.h
+++ b/sys/vm/vm_phys.h
@@ -40,6 +40,15 @@
#ifdef _KERNEL
+/*
+ * One range of physical memory and the memory domain it belongs to.
+ * Domains must be dense (non-sparse) and zero-based.
+ */
+struct mem_affinity {
+ vm_paddr_t start; /* Start of the range. */
+ vm_paddr_t end; /* End of the range; vm_phys_create_seg() treats end == 0 as the end of the array. */
+ int domain; /* Memory domain for the range. */
+};
+
+extern struct mem_affinity *mem_affinity;
+
void vm_phys_add_page(vm_paddr_t pa);
vm_page_t vm_phys_alloc_contig(unsigned long npages,
vm_paddr_t low, vm_paddr_t high,
OpenPOWER on IntegriCloud