summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authoradrian <adrian@FreeBSD.org>2015-07-11 15:21:37 +0000
committeradrian <adrian@FreeBSD.org>2015-07-11 15:21:37 +0000
commit41db4b88e03d18ebeca593d652b664a3a7b2ad61 (patch)
tree21c5a6bf05b26992a7aef1cefc9874f34b43fbec
parent015d4dceeddc18d2cbb4ed91caed9a2bc2964ded (diff)
downloadFreeBSD-src-41db4b88e03d18ebeca593d652b664a3a7b2ad61.zip
FreeBSD-src-41db4b88e03d18ebeca593d652b664a3a7b2ad61.tar.gz
Add an initial NUMA affinity/policy configuration for threads and processes.
This is based on work done by jeff@ and jhb@, as well as the numa.diff patch that has been circulating when someone asks for first-touch NUMA on -10 or -11. * Introduce a simple set of VM policy and iterator types. * tie the policy types into the vm_phys path for now, mirroring how the initial first-touch allocation work was enabled. * add syscalls to control changing thread and process defaults. * add a global NUMA VM domain policy. * implement a simple cascade policy order - if a thread policy exists, use it; if a process policy exists, use it; use the default policy. * processes inherit policies from their parent processes, threads inherit policies from their parent threads. * add a simple tool (numactl) to query and modify default thread/process policities. * add documentation for the new syscalls, for numa and for numactl. * re-enable first touch NUMA again by default, as now policies can be set in a variety of methods. This is only relevant for very specific workloads. This doesn't pretend to be a final NUMA solution. The previous defaults in -HEAD (with MAXMEMDOM set) can be achieved by 'sysctl vm.default_policy=rr'. This is only relevant if MAXMEMDOM is set to something other than 1. Ie, if you're using GENERIC or a modified kernel with non-NUMA, then this is a glorified no-op for you. Thank you to Norse Corp for giving me access to rather large (for FreeBSD!) NUMA machines in order to develop and verify this. Thank you to Dell for providing me with dual socket sandybridge and westmere v3 hardware to do NUMA development with. Thank you to Scott Long at Netflix for providing me with access to the two-socket, four-domain haswell v3 hardware. Thank you to Peter Holm for running the stress testing suite against the NUMA branch during various stages of development! Tested: * MIPS (regression testing; non-NUMA) * i386 (regression testing; non-NUMA GENERIC) * amd64 (regression testing; non-NUMA GENERIC) * westmere, 2 socket (thankyou norse!) 
* sandy bridge, 2 socket (thankyou dell!) * ivy bridge, 2 socket (thankyou norse!) * westmere-EX, 4 socket / 1TB RAM (thankyou norse!) * haswell, 2 socket (thankyou norse!) * haswell v3, 2 socket (thankyou dell) * haswell v3, 2x18 core (thankyou scott long / netflix!) * Peter Holm ran a stress test suite on this work and found one issue, but has not been able to verify it (it doesn't look NUMA related, and he only saw it once over many testing runs.) * I've tested bhyve instances running in fixed NUMA domains and cpusets; all seems to work correctly. Verified: * intel-pcm - pcm-numa.x and pcm-memory.x, whilst selecting different NUMA policies for processes under test. Review: This was reviewed through phabricator (https://reviews.freebsd.org/D2559) as well as privately and via emails to freebsd-arch@. The git history with specific attributes is available at https://github.com/erikarn/freebsd/ in the NUMA branch (https://github.com/erikarn/freebsd/compare/local/adrian_numa_policy). This has been reviewed by a number of people (stas, rpaulo, kib, ngie, wblock) but not achieved a clear consensus. My hope is that with further exposure and testing more functionality can be implemented and evaluated. Notes: * The VM doesn't handle unbalanced domains very well, and if you have an overly unbalanced memory setup whilst under high memory pressure, VM page allocation may fail leading to a kernel panic. This was a problem in the past, but it's much more easily triggered now with these tools. * This work only controls the path through vm_phys; it doesn't yet strongly/predictably affect contigmalloc, KVA placement, UMA, etc. So, driver placement of memory isn't really guaranteed in any way. That's next on my plate. Sponsored by: Norse Corp, Inc.; Dell
-rw-r--r--lib/libc/sys/Makefile.inc2
-rw-r--r--lib/libc/sys/Symbol.map2
-rw-r--r--lib/libc/sys/numa_getaffinity.2197
-rw-r--r--share/man/man4/Makefile1
-rw-r--r--share/man/man4/numa.4172
-rw-r--r--sys/conf/files2
-rw-r--r--sys/kern/init_main.c5
-rw-r--r--sys/kern/init_sysent.c2
-rw-r--r--sys/kern/kern_exit.c6
-rw-r--r--sys/kern/kern_fork.c10
-rw-r--r--sys/kern/kern_numa.c170
-rw-r--r--sys/kern/kern_thr.c9
-rw-r--r--sys/kern/kern_thread.c3
-rw-r--r--sys/sys/_vm_domain.h61
-rw-r--r--sys/sys/numa.h41
-rw-r--r--sys/sys/proc.h3
-rw-r--r--sys/vm/vm_domain.c374
-rw-r--r--sys/vm/vm_domain.h66
-rw-r--r--sys/vm/vm_phys.c166
-rw-r--r--sys/vm/vm_phys.h1
-rw-r--r--usr.bin/Makefile1
-rw-r--r--usr.bin/numactl/Makefile5
-rw-r--r--usr.bin/numactl/numactl.1132
-rw-r--r--usr.bin/numactl/numactl.c284
24 files changed, 1704 insertions, 11 deletions
diff --git a/lib/libc/sys/Makefile.inc b/lib/libc/sys/Makefile.inc
index 5162563..e672b69 100644
--- a/lib/libc/sys/Makefile.inc
+++ b/lib/libc/sys/Makefile.inc
@@ -235,6 +235,7 @@ MAN+= abort2.2 \
nanosleep.2 \
nfssvc.2 \
ntp_adjtime.2 \
+ numa_getaffinity.2 \
open.2 \
pathconf.2 \
pdfork.2 \
@@ -395,6 +396,7 @@ MLINKS+=mount.2 nmount.2 \
MLINKS+=mq_receive.2 mq_timedreceive.2
MLINKS+=mq_send.2 mq_timedsend.2
MLINKS+=ntp_adjtime.2 ntp_gettime.2
+MLINKS+=numa_getaffinity.2 numa_setaffinity.2
MLINKS+=open.2 openat.2
MLINKS+=pathconf.2 fpathconf.2
MLINKS+=pathconf.2 lpathconf.2
diff --git a/lib/libc/sys/Symbol.map b/lib/libc/sys/Symbol.map
index 194aa5b..a3a613e 100644
--- a/lib/libc/sys/Symbol.map
+++ b/lib/libc/sys/Symbol.map
@@ -400,6 +400,8 @@ FBSD_1.4 {
futimens;
ppoll;
utimensat;
+ numa_setaffinity;
+ numa_getaffinity;
};
FBSDprivate_1.0 {
diff --git a/lib/libc/sys/numa_getaffinity.2 b/lib/libc/sys/numa_getaffinity.2
new file mode 100644
index 0000000..efb78e9
--- /dev/null
+++ b/lib/libc/sys/numa_getaffinity.2
@@ -0,0 +1,197 @@
+.\" Copyright (c) 2008 Christian Brueffer
+.\" Copyright (c) 2008 Jeffrey Roberson
+.\" Copyright (c) 2015 Adrian Chadd
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd May 7, 2015
+.Dt NUMA_GETAFFINITY 2
+.Os
+.Sh NAME
+.Nm numa_getaffinity ,
+.Nm numa_setaffinity
+.Nd manage NUMA affinity
+.Sh LIBRARY
+.Lb libc
+.Sh SYNOPSIS
+.In sys/param.h
+.In sys/numa.h
+.Ft int
+.Fn numa_getaffinity "cpuwhich_t which" "id_t id" "struct vm_domain_policy_entry *policy"
+.Ft int
+.Fn numa_setaffinity "cpuwhich_t which" "id_t id" "const struct vm_domain_policy_entry *policy"
+.Sh DESCRIPTION
+.Fn numa_getaffinity
+and
+.Fn numa_setaffinity
+allow the manipulation of NUMA policies available to processes and threads.
+These functions may manipulate NUMA policies that contain many processes
+or affect only a single object.
+.Pp
+Valid values for the
+.Fa which
+argument are documented in
+.Xr cpuset 2 .
+This argument specifies which kind of object is operated on.
+Only
+.Dv CPU_WHICH_TID
+and
+.Dv CPU_WHICH_PID
+can be manipulated.
+.Pp
+The
+.Fa policy
+entry contains a vm_domain_policy_entry with the following fields:
+.Bd -literal
+struct vm_domain_policy_entry {
+ vm_domain_policy_type_t policy; /* VM policy */
+ int domain; /* VM domain, if applicable */
+}
+.Ed
+.Fa vm_domain_policy_type_t policy
+is one of these:
+.Bl -tag -width VM_POLICY_NONE
+.It Dv VM_POLICY_NONE
+Reset the domain back to none.
+Any parent object NUMA domain policy will apply.
+The only valid value for
+.Dv domain
+is -1.
+.It Dv VM_POLICY_ROUND_ROBIN
+Select round-robin policy.
+Pages will be allocated round-robin from each VM domain in order.
+The only valid value for
+.Dv domain
+is -1.
+.It Dv VM_POLICY_FIXED_DOMAIN
+Select fixed-domain only policy.
+Pages will be allocated from the given
+.Dv domain
+which must be set to a valid VM domain.
+Pages will not be allocated from another domain if
+.Dv domain
+is out of free pages.
+.It Dv VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN
+Select fixed-domain policy with round-robin fallback.
+Pages will be allocated from
+.Dv domain
+which must be set to a valid VM domain.
+If
+.Dv domain
+is out of free pages, pages will be allocated round-robin
+from the other domains.
+.It Dv VM_POLICY_FIRST_TOUCH
+Select first-touch policy.
+Pages will be allocated from the NUMA domain which the thread
+is currently scheduled upon.
+Pages will not be allocated from another domain if the current domain
+is out of free pages.
+The only valid value for
+.Dv domain
+is -1.
+.It Dv VM_POLICY_FIRST_TOUCH_ROUND_ROBIN
+Select first-touch policy with round-robin fallback.
+Pages will be allocated from the NUMA domain which the thread
+is currently scheduled upon.
+Pages will be allocated round-robin from another domain if the
+current domain is out of free pages.
+The only valid value for
+.Dv domain
+is -1.
+.El
+.Pp
+Note that the VM might assign some pages from other domains,
+for example if an existing page allocation is covered by a superpage
+allocation.
+.Pp
+.Fn numa_getaffinity
+retrieves the
+NUMA policy from the object specified by
+.Fa which
+and
+.Fa id
+and stores it in the space provided by
+.Fa policy .
+.Pp
+.Fn numa_setaffinity
+attempts to set the NUMA policy for the object specified by
+.Fa which
+and
+.Fa id
+to the policy in
+.Fa policy .
+.Sh RETURN VALUES
+.Rv -std
+.Sh ERRORS
+.Va errno
+can contain these error codes:
+.Bl -tag -width Er
+.It Bq Er EINVAL
+The
+.Fa which
+argument was not a valid value.
+.It Bq Er EINVAL
+The
+.Fa policy
+argument specified when calling
+.Fn numa_setaffinity
+did not contain a valid policy.
+.It Bq Er EFAULT
+The policy pointer passed was invalid.
+.It Bq Er ESRCH
+The object specified by the
+.Fa id
+and
+.Fa which
+arguments could not be found.
+.It Bq Er ERANGE
+The
+.Fa domain
+in the given policy
+was out of the range of possible VM domains available.
+.It Bq Er EPERM
+The calling process did not have the credentials required to complete the
+operation.
+.El
+.Sh SEE ALSO
+.Xr cpuset 1 ,
+.Xr numactl 1 ,
+.Xr cpuset 2 ,
+.Xr cpuset_getaffinity 2 ,
+.Xr cpuset_getid 2 ,
+.Xr cpuset_setaffinity 2 ,
+.Xr cpuset_setid 2 ,
+.Xr pthread_affinity_np 3 ,
+.Xr pthread_attr_affinity_np 3 ,
+.Xr numa 4
+.Sh HISTORY
+The
+.Nm
+family of system calls first appeared in
+.Fx 11.0 .
+.Sh AUTHORS
+.An Adrian Chadd Aq Mt adrian@FreeBSD.org
diff --git a/share/man/man4/Makefile b/share/man/man4/Makefile
index 498f905..cd4c91b 100644
--- a/share/man/man4/Makefile
+++ b/share/man/man4/Makefile
@@ -364,6 +364,7 @@ MAN= aac.4 \
nsp.4 \
${_ntb.4} \
null.4 \
+ numa.4 \
${_nvd.4} \
${_nvme.4} \
${_nvram.4} \
diff --git a/share/man/man4/numa.4 b/share/man/man4/numa.4
new file mode 100644
index 0000000..984c464
--- /dev/null
+++ b/share/man/man4/numa.4
@@ -0,0 +1,172 @@
+.\" Copyright (c) 2015 Adrian Chadd <adrian@FreeBSD.org>
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd May 10, 2015
+.Dt NUMA 4
+.Os
+.Sh NAME
+.Nm NUMA
+.Nd Non-Uniform Memory Access
+.Sh SYNOPSIS
+.Cd options SMP
+.Cd options MAXMEMDOM=16
+.Pp
+.In sys/numa.h
+.In sys/cpuset.h
+.In sys/bus.h
+.Sh DESCRIPTION
+Non-Uniform Memory Access is a computer architecture design which
+involves unequal costs between processors, memory and IO devices
+in a given system.
+.Pp
+In a
+.Nm
+architecture, the latency to access specific memory or IO devices
+depends upon which processor the memory or device is attached to.
+Accessing memory local to a processor is faster than accessing memory
+that is connected to one of the other processors.
+.Pp
+.Nm
+is enabled when the
+.Cd MAXMEMDOM
+option is used in a kernel configuration
+file and is set to a value greater than 1.
+.Pp
+Thread and process
+.Nm
+policies are controlled with the
+.Xr numa_setaffinity 2
+and
+.Xr numa_getaffinity 2
+syscalls.
+.Pp
+The
+.Xr numactl 1
+tool is available for starting processes with a non-default
+policy, or to change the policy of an existing thread or process.
+.Pp
+Systems with non-uniform access to I/O devices may mark those devices
+with the local VM domain identifier.
+Drivers can find out their local domain information by calling
+.Xr bus_get_domain 9 .
+.Ss MIB Variables
+The operation of
+.Nm
+is controlled by, and exposes information through, these
+.Xr sysctl 8
+MIB variables:
+.Pp
+.Bl -tag -width indent -compact
+.It Va vm.ndomains
+The number of VM domains which have been detected.
+.Pp
+.It Va vm.default_policy
+The default VM domain allocation policy.
+Defaults to "first-touch-rr".
+The valid values are "first-touch", "first-touch-rr",
+"rr", where "rr" is a short-hand for "round-robin."
+See
+.Xr numa_setaffinity 2
+for more information about the available policies.
+.Pp
+.It Va vm.phys_locality
+A table indicating the relative cost of each VM domain to each other.
+A value of 10 indicates equal cost.
+A value of -1 means the locality map is not available or no
+locality information is available.
+.Pp
+.It Va vm.phys_segs
+The map of physical memory, grouped by VM domain.
+.El
+.Sh IMPLEMENTATION NOTES
+The current
+.Nm
+implementation is VM-focused.
+The hardware
+.Nm
+domains are mapped into a contiguous, non-sparse
+VM domain space, starting from 0.
+Thus, VM domain information (for example, the domain identifier) is not
+necessarily the same as is found in the hardware specific information.
+.Pp
+The
+.Nm
+allocation policies are implemented as a policy and iterator in
+.Pa sys/vm/vm_domain.c
+and
+.Pa sys/vm/vm_domain.h .
+Policy information is available in both struct thread and struct proc.
+Processes inherit
+.Nm
+policy from parent processes and threads inherit
+.Nm
+policy from parent threads.
+Note that threads do not explicitly inherit their
+.Nm
+policy from processes.
+Instead, if no thread policy is set, the system
+will fall back to the process policy.
+.Pp
+For now,
+.Nm
+domain policies only influence physical page allocation in
+.Pa sys/vm/vm_phys.c .
+This is useful for userland memory allocation, but not for kernel
+and driver memory allocation.
+These features will be implemented in future work.
+.Sh SEE ALSO
+.Xr numactl 1 ,
+.Xr numa_getaffinity 2 ,
+.Xr numa_setaffinity 2 ,
+.Xr bus_get_domain 9
+.Sh HISTORY
+.Nm
+first appeared in
+.Fx 9.0
+as a first-touch allocation policy with a fail-over to round-robin allocation
+and was not configurable.
+It was then modified in
+.Fx 10.0
+to implement a round-robin allocation policy and was also not configurable.
+.Pp
+The
+.Xr numa_getaffinity 2
+and
+.Xr numa_setaffinity 2
+syscalls first appeared in
+.Fx 11.0 .
+.Pp
+The
+.Xr numactl 1
+tool first appeared in
+.Fx 11.0 .
+.Sh AUTHORS
+This manual page was written by
+.An Adrian Chadd Aq Mt adrian@FreeBSD.org .
+.Sh NOTES
+No statistics are kept to indicate how often
+.Nm
+allocation policies succeed or fail.
diff --git a/sys/conf/files b/sys/conf/files
index e2fdfc8..0448ad4 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -3017,6 +3017,7 @@ kern/kern_module.c standard
kern/kern_mtxpool.c standard
kern/kern_mutex.c standard
kern/kern_ntptime.c standard
+kern/kern_numa.c standard
kern/kern_osd.c standard
kern/kern_physio.c standard
kern/kern_pmc.c standard
@@ -4043,6 +4044,7 @@ vm/vm_pager.c standard
vm/vm_phys.c standard
vm/vm_radix.c standard
vm/vm_reserv.c standard
+vm/vm_domain.c standard
vm/vm_unix.c standard
vm/vm_zeroidle.c standard
vm/vnode_pager.c standard
diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c
index 37539c4..7552d51 100644
--- a/sys/kern/init_main.c
+++ b/sys/kern/init_main.c
@@ -87,6 +87,7 @@ __FBSDID("$FreeBSD$");
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
+#include <vm/vm_domain.h>
#include <sys/copyright.h>
#include <ddb/ddb.h>
@@ -496,6 +497,10 @@ proc0_init(void *dummy __unused)
td->td_flags = TDF_INMEM;
td->td_pflags = TDP_KTHREAD;
td->td_cpuset = cpuset_thread0();
+ vm_domain_policy_init(&td->td_vm_dom_policy);
+ vm_domain_policy_set(&td->td_vm_dom_policy, VM_POLICY_NONE, -1);
+ vm_domain_policy_init(&p->p_vm_dom_policy);
+ vm_domain_policy_set(&p->p_vm_dom_policy, VM_POLICY_NONE, -1);
prison0_init();
p->p_peers = 0;
p->p_leader = p;
diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c
index 14e8281..a328906 100644
--- a/sys/kern/init_sysent.c
+++ b/sys/kern/init_sysent.c
@@ -588,4 +588,6 @@ struct sysent sysent[] = {
{ AS(ppoll_args), (sy_call_t *)sys_ppoll, AUE_POLL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 545 = ppoll */
{ AS(futimens_args), (sy_call_t *)sys_futimens, AUE_FUTIMES, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 546 = futimens */
{ AS(utimensat_args), (sy_call_t *)sys_utimensat, AUE_FUTIMESAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 547 = utimensat */
+ { AS(numa_getaffinity_args), (sy_call_t *)sys_numa_getaffinity, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 548 = numa_getaffinity */
+ { AS(numa_setaffinity_args), (sy_call_t *)sys_numa_setaffinity, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 549 = numa_setaffinity */
};
diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c
index 60691f0..8fe968e 100644
--- a/sys/kern/kern_exit.c
+++ b/sys/kern/kern_exit.c
@@ -86,6 +86,7 @@ __FBSDID("$FreeBSD$");
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/uma.h>
+#include <vm/vm_domain.h>
#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>
@@ -950,6 +951,11 @@ proc_reap(struct thread *td, struct proc *p, int *status, int options)
#ifdef MAC
mac_proc_destroy(p);
#endif
+ /*
+ * Free any domain policy that's still hiding around.
+ */
+ vm_domain_policy_cleanup(&p->p_vm_dom_policy);
+
KASSERT(FIRST_THREAD_IN_PROC(p),
("proc_reap: no residual thread!"));
uma_zfree(proc_zone, p);
diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c
index 3fd4f09..a031435 100644
--- a/sys/kern/kern_fork.c
+++ b/sys/kern/kern_fork.c
@@ -80,6 +80,7 @@ __FBSDID("$FreeBSD$");
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>
+#include <vm/vm_domain.h>
#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>
@@ -405,6 +406,7 @@ do_fork(struct thread *td, int flags, struct proc *p2, struct thread *td2,
bcopy(&p1->p_startcopy, &p2->p_startcopy,
__rangeof(struct proc, p_startcopy, p_endcopy));
pargs_hold(p2->p_args);
+
PROC_UNLOCK(p1);
bzero(&p2->p_startzero,
@@ -497,6 +499,14 @@ do_fork(struct thread *td, int flags, struct proc *p2, struct thread *td2,
if (p1->p_flag & P_PROFIL)
startprofclock(p2);
+ /*
+ * Whilst the proc lock is held, copy the VM domain data out
+ * using the VM domain method.
+ */
+ vm_domain_policy_init(&p2->p_vm_dom_policy);
+ vm_domain_policy_localcopy(&p2->p_vm_dom_policy,
+ &p1->p_vm_dom_policy);
+
if (flags & RFSIGSHARE) {
p2->p_sigacts = sigacts_hold(p1->p_sigacts);
} else {
diff --git a/sys/kern/kern_numa.c b/sys/kern/kern_numa.c
new file mode 100644
index 0000000..e3a5837
--- /dev/null
+++ b/sys/kern/kern_numa.c
@@ -0,0 +1,170 @@
+/*-
+ * Copyright (c) 2015, Adrian Chadd <adrian@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/refcount.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/syscallsubr.h>
+#include <sys/cpuset.h>
+#include <sys/sx.h>
+#include <sys/queue.h>
+#include <sys/libkern.h>
+#include <sys/limits.h>
+#include <sys/bus.h>
+#include <sys/interrupt.h>
+
+#include <vm/uma.h>
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+#include <vm/vm_param.h>
+#include <vm/vm_phys.h>
+#include <vm/vm_domain.h>
+
+/*
+ * numa_setaffinity(2): set the NUMA VM domain policy of a thread or process.
+ *
+ * uap->which and uap->id name the target and uap->policy points at a
+ * userland struct vm_domain_policy_entry.  Only CPU_WHICH_TID and
+ * CPU_WHICH_PID targets are acted upon.
+ *
+ * Returns 0 on success; otherwise the copyin() error, EINVAL if the
+ * policy fails vm_domain_policy_validate() or 'which' is unsupported,
+ * or the error reported by cpuset_which().
+ */
+int
+sys_numa_setaffinity(struct thread *td, struct numa_setaffinity_args *uap)
+{
+	int error;
+	struct vm_domain_policy vp;
+	struct thread *ttd;
+	struct proc *p;
+	struct cpuset *set;
+
+	set = NULL;
+	p = NULL;
+
+	/*
+	 * Copy in just the policy information into the policy
+	 * struct.  Userland only supplies vm_domain_policy_entry.
+	 */
+	error = copyin(uap->policy, &vp.p, sizeof(vp.p));
+	if (error)
+		goto out;
+
+	/*
+	 * Ensure the seq number is zero - otherwise seq.h
+	 * may get very confused.
+	 */
+	vp.seq = 0;
+
+	/*
+	 * Validate policy.
+	 */
+	if (vm_domain_policy_validate(&vp) != 0) {
+		error = EINVAL;
+		goto out;
+	}
+
+	/*
+	 * Go find the desired proc/tid for this operation.
+	 */
+	error = cpuset_which(uap->which, uap->id, &p,
+	    &ttd, &set);
+	if (error)
+		goto out;
+
+	/* Only handle CPU_WHICH_TID and CPU_WHICH_PID */
+	/*
+	 * XXX if cpuset_which is called with WHICH_CPUSET and NULL cpuset,
+	 * it'll return ESRCH.  We should just return EINVAL.
+	 */
+	switch (uap->which) {
+	case CPU_WHICH_TID:
+		vm_domain_policy_copy(&ttd->td_vm_dom_policy, &vp);
+		break;
+	case CPU_WHICH_PID:
+		vm_domain_policy_copy(&p->p_vm_dom_policy, &vp);
+		break;
+	default:
+		error = EINVAL;
+		break;
+	}
+
+	/*
+	 * cpuset_which() is also asked to resolve 'which' values other than
+	 * TID/PID; p is still NULL when it handed back no process, so only
+	 * unlock a process we were actually given (as sys_numa_getaffinity()
+	 * does).  Unlocking unconditionally would dereference NULL.
+	 */
+	if (p != NULL)
+		PROC_UNLOCK(p);
+out:
+	if (set)
+		cpuset_rel(set);
+	return (error);
+}
+
+/*
+ * numa_getaffinity(2): report the NUMA VM domain policy of a thread or
+ * process.
+ *
+ * The target is looked up from uap->which / uap->id, its policy is
+ * snapshotted into a local copy, and only the vm_domain_policy_entry
+ * portion is copied out to uap->policy.
+ *
+ * Returns 0 on success, the cpuset_which() error if the lookup fails,
+ * EINVAL for a 'which' other than CPU_WHICH_TID / CPU_WHICH_PID, or the
+ * copyout() error.
+ */
+int
+sys_numa_getaffinity(struct thread *td, struct numa_getaffinity_args *uap)
+{
+	struct vm_domain_policy snapshot;
+	struct cpuset *cset = NULL;
+	struct proc *target_p = NULL;
+	struct thread *target_td;
+	int error;
+
+	error = cpuset_which(uap->which, uap->id, &target_p,
+	    &target_td, &cset);
+	if (error == 0) {
+		/* Only handle CPU_WHICH_TID and CPU_WHICH_PID */
+		/*
+		 * XXX if cpuset_which is called with WHICH_CPUSET and NULL
+		 * cpuset, it'll return ESRCH.  We should just return EINVAL.
+		 */
+		if (uap->which == CPU_WHICH_TID)
+			vm_domain_policy_localcopy(&snapshot,
+			    &target_td->td_vm_dom_policy);
+		else if (uap->which == CPU_WHICH_PID)
+			vm_domain_policy_localcopy(&snapshot,
+			    &target_p->p_vm_dom_policy);
+		else
+			error = EINVAL;
+
+		/* The lookup may not have handed back a process at all. */
+		if (target_p != NULL)
+			PROC_UNLOCK(target_p);
+
+		/*
+		 * Copy out only the vm_domain_policy_entry part, after the
+		 * process has been unlocked.
+		 */
+		if (error == 0)
+			error = copyout(&snapshot.p, uap->policy,
+			    sizeof(snapshot.p));
+	}
+
+	if (cset != NULL)
+		cpuset_rel(cset);
+	return (error);
+}
diff --git a/sys/kern/kern_thr.c b/sys/kern/kern_thr.c
index e6b0a59..7e7b88f 100644
--- a/sys/kern/kern_thr.c
+++ b/sys/kern/kern_thr.c
@@ -54,6 +54,8 @@ __FBSDID("$FreeBSD$");
#include <sys/umtx.h>
#include <sys/limits.h>
+#include <vm/vm_domain.h>
+
#include <machine/frame.h>
#include <security/audit/audit.h>
@@ -254,6 +256,13 @@ create_thread(struct thread *td, mcontext_t *ctx,
thread_unlock(td);
if (P_SHOULDSTOP(p))
newtd->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK;
+
+ /*
+ * Copy the existing thread VM policy into the new thread.
+ */
+ vm_domain_policy_localcopy(&newtd->td_vm_dom_policy,
+ &td->td_vm_dom_policy);
+
PROC_UNLOCK(p);
tidhash_add(newtd);
diff --git a/sys/kern/kern_thread.c b/sys/kern/kern_thread.c
index 4343b64..0f65403 100644
--- a/sys/kern/kern_thread.c
+++ b/sys/kern/kern_thread.c
@@ -60,6 +60,7 @@ __FBSDID("$FreeBSD$");
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>
+#include <vm/vm_domain.h>
#include <sys/eventhandler.h>
SDT_PROVIDER_DECLARE(proc);
@@ -351,6 +352,7 @@ thread_alloc(int pages)
return (NULL);
}
cpu_thread_alloc(td);
+ vm_domain_policy_init(&td->td_vm_dom_policy);
return (td);
}
@@ -380,6 +382,7 @@ thread_free(struct thread *td)
cpu_thread_free(td);
if (td->td_kstack != 0)
vm_thread_dispose(td);
+ vm_domain_policy_cleanup(&td->td_vm_dom_policy);
uma_zfree(thread_zone, td);
}
diff --git a/sys/sys/_vm_domain.h b/sys/sys/_vm_domain.h
new file mode 100644
index 0000000..36d107a
--- /dev/null
+++ b/sys/sys/_vm_domain.h
@@ -0,0 +1,61 @@
+/*-
+ * Copyright (c) 2015 Adrian Chadd <adrian@FreeBSD.org>.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer,
+ * without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any
+ * redistribution must be conditioned upon including a substantially
+ * similar Disclaimer requirement for further binary redistribution.
+ *
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGES.
+ *
+ * $FreeBSD$
+ */
+#ifndef __SYS_VM_DOMAIN_H__
+#define __SYS_VM_DOMAIN_H__
+
+#include <sys/seq.h>
+
+typedef enum { /* NUMA VM domain page allocation policy types; see numa_getaffinity(2) */
+ VM_POLICY_NONE, /* no policy set; any parent object policy applies */
+ VM_POLICY_ROUND_ROBIN, /* allocate round-robin from each VM domain */
+ VM_POLICY_FIXED_DOMAIN, /* allocate only from the given domain */
+ VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN, /* given domain, round-robin fallback when it is out of pages */
+ VM_POLICY_FIRST_TOUCH, /* allocate only from the domain the thread is scheduled on */
+ VM_POLICY_FIRST_TOUCH_ROUND_ROBIN, /* thread's current domain, round-robin fallback */
+ VM_POLICY_MAX /* NOTE(review): presumably an end-of-list sentinel, not a policy -- confirm */
+} vm_domain_policy_type_t;
+
+struct vm_domain_policy_entry { /* the part of a policy exchanged with userland by numa_{get,set}affinity(2) */
+ vm_domain_policy_type_t policy; /* VM policy */
+ int domain; /* VM domain, if applicable; numa_getaffinity(2) documents -1 for policies that take no domain */
+};
+
+struct vm_domain_policy { /* policy as embedded in struct thread and struct proc */
+ seq_t seq; /* sequence number (sys/seq.h); sys_numa_setaffinity() zeroes it on its local copy */
+ struct vm_domain_policy_entry p; /* policy type and domain */
+};
+
+/*
+ * Static initialiser for a struct vm_domain_policy: sequence number zero,
+ * policy type 'vt' and VM domain 'vd'.  The arguments are parenthesised so
+ * that any expression passed in expands as a single initialiser value.
+ */
+#define VM_DOMAIN_POLICY_STATIC_INITIALISER(vt, vd) \
+	{ .seq = 0, \
+	  .p.policy = (vt), \
+	  .p.domain = (vd) }
+
+#endif /* __SYS_VM_DOMAIN_H__ */
diff --git a/sys/sys/numa.h b/sys/sys/numa.h
new file mode 100644
index 0000000..982f9e7
--- /dev/null
+++ b/sys/sys/numa.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2015 Adrian Chadd <adrian@FreeBSD.org>.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+#ifndef __SYS_NUMA_H__
+#define __SYS_NUMA_H__
+
+#include <sys/_vm_domain.h>
+
+/*
+ * Userland prototypes for the NUMA policy system calls documented in
+ * numa_getaffinity(2).  'which' and 'id' select the thread or process;
+ * 'vd' is the policy to install (set, read-only) or the buffer that
+ * receives the current policy (get).
+ */
+extern int numa_setaffinity(cpuwhich_t which, id_t id,
+    const struct vm_domain_policy_entry *vd);
+extern int numa_getaffinity(cpuwhich_t which, id_t id,
+    struct vm_domain_policy_entry *vd);
+
+#endif /* __SYS_NUMA_H__ */
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
index e6c83b4..95a4e041 100644
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -63,6 +63,7 @@
#endif
#include <sys/ucontext.h>
#include <sys/ucred.h>
+#include <sys/_vm_domain.h>
#include <machine/proc.h> /* Machine-dependent proc substruct. */
/*
@@ -217,6 +218,7 @@ struct thread {
struct turnstile *td_turnstile; /* (k) Associated turnstile. */
struct rl_q_entry *td_rlqe; /* (k) Associated range lock entry. */
struct umtx_q *td_umtxq; /* (c?) Link for when we're blocked. */
+ struct vm_domain_policy td_vm_dom_policy; /* (c) current numa domain policy */
lwpid_t td_tid; /* (b) Thread ID. */
sigqueue_t td_sigqueue; /* (c) Sigs arrived, not delivered. */
#define td_siglist td_sigqueue.sq_signals
@@ -606,6 +608,7 @@ struct proc {
uint64_t p_prev_runtime; /* (c) Resource usage accounting. */
struct racct *p_racct; /* (b) Resource accounting. */
u_char p_throttled; /* (c) Flag for racct pcpu throttling */
+ struct vm_domain_policy p_vm_dom_policy; /* (c) process default VM domain, or -1 */
/*
* An orphan is the child that has beed re-parented to the
* debugger as a result of attaching to it. Need to keep
diff --git a/sys/vm/vm_domain.c b/sys/vm/vm_domain.c
new file mode 100644
index 0000000..83814d5
--- /dev/null
+++ b/sys/vm/vm_domain.c
@@ -0,0 +1,374 @@
+/*-
+ * Copyright (c) 2015 Adrian Chadd <adrian@FreeBSD.org>.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer,
+ * without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any
+ * redistribution must be conditioned upon including a substantially
+ * similar Disclaimer requirement for further binary redistribution.
+ *
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGES.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_vm.h"
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#if MAXMEMDOM > 1
+#include <sys/proc.h>
+#endif
+#include <sys/queue.h>
+#include <sys/rwlock.h>
+#include <sys/sbuf.h>
+#include <sys/sysctl.h>
+#include <sys/tree.h>
+#include <sys/vmmeter.h>
+#include <sys/seq.h>
+
+#include <ddb/ddb.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_phys.h>
+
+#include <vm/vm_domain.h>
+
+/*
+ * Pick the next domain in round-robin order.
+ *
+ * The cursor is kept in the current thread (td_dom_rr_idx): it is
+ * advanced by one, wrapped at vm_ndomains, and the new value is
+ * returned.  Kernels built with MAXMEMDOM <= 1 always get domain 0.
+ */
+static __inline int
+vm_domain_rr_selectdomain(void)
+{
+#if MAXMEMDOM > 1
+	struct thread *self = curthread;
+
+	self->td_dom_rr_idx = (self->td_dom_rr_idx + 1) % vm_ndomains;
+	return (self->td_dom_rr_idx);
+#else
+	return (0);
+#endif
+}
+
+/*
+ * This implements a very simple set of VM domain memory allocation
+ * policies and iterators.
+ */
+
+/*
+ * A VM domain policy represents a desired VM domain policy.
+ * Iterators implement searching through VM domains in a specific
+ * order.
+ */
+
+/*
+ * When setting a policy, the caller must establish their own
+ * exclusive write protection for the contents of the domain
+ * policy.
+ */
+
+/*
+ * Reset a policy to its empty state.
+ *
+ * The whole structure is zeroed (sequence counter included), then the
+ * entry is marked VM_POLICY_NONE with no domain (-1).  Always returns 0.
+ */
+int
+vm_domain_policy_init(struct vm_domain_policy *vp)
+{
+
+	memset(vp, 0, sizeof(*vp));
+	vp->p.domain = -1;
+	vp->p.policy = VM_POLICY_NONE;
+	return (0);
+}
+
+/*
+ * Store a new policy type and domain into vp.
+ *
+ * Both fields are written between seq_write_begin() and
+ * seq_write_end() on vp->seq.  No validation is performed here.
+ * Always returns 0.
+ */
+int
+vm_domain_policy_set(struct vm_domain_policy *vp,
+    vm_domain_policy_type_t vt, int domain)
+{
+
+	seq_write_begin(&vp->seq);
+	vp->p.domain = domain;
+	vp->p.policy = vt;
+	seq_write_end(&vp->seq);
+
+	return (0);
+}
+
+/*
+ * Snapshot a policy into a private destination.
+ *
+ * The structure is copied wholesale and the copy is retried, with a
+ * cpu_spinwait() between attempts, until seq_consistent() reports that
+ * src->seq did not change while it was being read.
+ *
+ * Nothing protects the destination: it must not be visible to any
+ * other thread.
+ */
+void
+vm_domain_policy_localcopy(struct vm_domain_policy *dst,
+    const struct vm_domain_policy *src)
+{
+	seq_t snap;
+
+	snap = seq_read(&src->seq);
+	*dst = *src;
+	while (!seq_consistent(&src->seq, snap)) {
+		cpu_spinwait();
+		snap = seq_read(&src->seq);
+		*dst = *src;
+	}
+}
+
+/*
+ * Take a write-barriered copy of a policy.
+ *
+ * A consistent snapshot of the source is obtained first; its policy
+ * type and domain are then stored into the destination between
+ * seq_write_begin() and seq_write_end() on dst->seq.  Use this when
+ * the destination may be read by other threads.
+ */
+void
+vm_domain_policy_copy(struct vm_domain_policy *dst,
+    const struct vm_domain_policy *src)
+{
+	struct vm_domain_policy snap;
+
+	/* Retries internally until the read of *src is consistent. */
+	vm_domain_policy_localcopy(&snap, src);
+
+	seq_write_begin(&dst->seq);
+	dst->p.domain = snap.p.domain;
+	dst->p.policy = snap.p.policy;
+	seq_write_end(&dst->seq);
+}
+
+/*
+ * Check that a policy's type and domain agree.
+ *
+ * The fixed-domain policies need a domain in [0, vm_ndomains); every
+ * other known policy needs the domain to be -1.  Unknown policy types
+ * are rejected.
+ *
+ * Returns 0 if the policy is valid and -1 otherwise.
+ */
+int
+vm_domain_policy_validate(const struct vm_domain_policy *vp)
+{
+	int dom;
+
+	dom = vp->p.domain;
+	switch (vp->p.policy) {
+	case VM_POLICY_FIXED_DOMAIN:
+	case VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN:
+		return ((dom < 0 || dom >= vm_ndomains) ? -1 : 0);
+	case VM_POLICY_NONE:
+	case VM_POLICY_ROUND_ROBIN:
+	case VM_POLICY_FIRST_TOUCH:
+	case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN:
+		return (dom == -1 ? 0 : -1);
+	default:
+		return (-1);
+	}
+}
+
+/*
+ * Tear down a policy.
+ *
+ * Currently a placeholder: nothing is released and 0 is always
+ * returned.
+ */
+int
+vm_domain_policy_cleanup(struct vm_domain_policy *vp)
+{
+
+	return (0);
+}
+
+/*
+ * Prepare an iterator for use.
+ *
+ * Currently a placeholder: the iterator is left untouched and 0 is
+ * always returned.  The fields are filled in by
+ * vm_domain_iterator_set() or vm_domain_iterator_set_policy().
+ */
+int
+vm_domain_iterator_init(struct vm_domain_iterator *vi)
+{
+
+	return (0);
+}
+
+/*
+ * Manually set up an iterator from a policy type and a domain.
+ *
+ * Fixed-domain policies start at the supplied domain; first-touch
+ * policies ignore it and start at the current CPU's domain
+ * (PCPU_GET(domain)).  The plain variants get one attempt, the
+ * round-robin variants get vm_ndomains attempts.  Any other type is
+ * treated as plain round-robin with no starting domain (-1).
+ *
+ * Always returns 0.
+ */
+int
+vm_domain_iterator_set(struct vm_domain_iterator *vi,
+    vm_domain_policy_type_t vt, int domain)
+{
+	int start, tries;
+
+	switch (vt) {
+	case VM_POLICY_FIXED_DOMAIN:
+		start = domain;
+		tries = 1;
+		break;
+	case VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN:
+		start = domain;
+		tries = vm_ndomains;
+		break;
+	case VM_POLICY_FIRST_TOUCH:
+		start = PCPU_GET(domain);
+		tries = 1;
+		break;
+	case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN:
+		start = PCPU_GET(domain);
+		tries = vm_ndomains;
+		break;
+	case VM_POLICY_ROUND_ROBIN:
+	default:
+		/* Everything else degrades to plain round-robin. */
+		vt = VM_POLICY_ROUND_ROBIN;
+		start = -1;
+		tries = vm_ndomains;
+		break;
+	}
+
+	vi->policy = vt;
+	vi->domain = start;
+	vi->n = tries;
+	return (0);
+}
+
+/*
+ * Load an iterator from a policy snapshot.
+ *
+ * The starting domain comes from the policy itself for the
+ * fixed-domain policies and from the current CPU's domain
+ * (PCPU_GET(domain)) for the first-touch policies.  The plain variants
+ * get a single attempt; the round-robin variants get vm_ndomains
+ * attempts.  Any other policy type is treated as plain round-robin
+ * with no starting domain (-1).
+ */
+static inline void
+_vm_domain_iterator_set_policy(struct vm_domain_iterator *vi,
+    const struct vm_domain_policy *vt)
+{
+	int single;
+
+	single = 0;
+	switch (vt->p.policy) {
+	case VM_POLICY_FIXED_DOMAIN:
+		single = 1;
+		/* FALLTHROUGH */
+	case VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN:
+		vi->domain = vt->p.domain;
+		break;
+	case VM_POLICY_FIRST_TOUCH:
+		single = 1;
+		/* FALLTHROUGH */
+	case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN:
+		vi->domain = PCPU_GET(domain);
+		break;
+	case VM_POLICY_ROUND_ROBIN:
+	default:
+		/*
+		 * Default to round-robin policy.
+		 */
+		vi->policy = VM_POLICY_ROUND_ROBIN;
+		vi->domain = -1;
+		vi->n = vm_ndomains;
+		return;
+	}
+
+	vi->policy = vt->p.policy;
+	vi->n = single ? 1 : vm_ndomains;
+}
+
+/*
+ * Set up an iterator from a policy that other threads may be
+ * updating.
+ *
+ * A consistent private snapshot of the policy is taken first and the
+ * iterator is then loaded from that snapshot.
+ */
+void
+vm_domain_iterator_set_policy(struct vm_domain_iterator *vi,
+    const struct vm_domain_policy *vt)
+{
+	struct vm_domain_policy snap;
+
+	/* Retries internally until the read of *vt is consistent. */
+	vm_domain_policy_localcopy(&snap, vt);
+	_vm_domain_iterator_set_policy(vi, &snap);
+}
+
+/*
+ * Return the next VM domain to use.
+ *
+ * Returns 0 with *domain set to the next domain to try, or -1 (leaving
+ * *domain untouched) once the iterator has no attempts left.  Each
+ * successful call consumes one attempt.
+ */
+int
+vm_domain_iterator_run(struct vm_domain_iterator *vi, int *domain)
+{
+
+	/* General catch-all */
+	if (vi->n <= 0)
+		return (-1);
+
+	switch (vi->policy) {
+	case VM_POLICY_FIXED_DOMAIN:
+	case VM_POLICY_FIRST_TOUCH:
+		*domain = vi->domain;
+		break;
+	case VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN:
+	case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN:
+		/*
+		 * The first attempt uses the preferred domain; later
+		 * attempts fall back to round-robin selection.
+		 *
+		 * XXX TODO: skip over the rr'ed domain
+		 * if it equals the one we started with.
+		 */
+		*domain = (vi->n == vm_ndomains) ? vi->domain :
+		    vm_domain_rr_selectdomain();
+		break;
+	case VM_POLICY_ROUND_ROBIN:
+	default:
+		*domain = vm_domain_rr_selectdomain();
+		break;
+	}
+
+	vi->n--;
+	return (0);
+}
+
+/*
+ * Returns 1 if the iteration is done, or 0 if it is not.
+ *
+ * This can only be called after at least one loop through
+ * the iterator.  Ie, it's designed to be used as a tail
+ * check of a loop, not the head check of a loop.
+ */
+int
+vm_domain_iterator_isdone(struct vm_domain_iterator *vi)
+{
+
+	/* Done once no attempts remain. */
+	return (vi->n > 0 ? 0 : 1);
+}
+
+/*
+ * Tear down an iterator.
+ *
+ * Currently a placeholder: nothing is released and 0 is always
+ * returned.
+ */
+int
+vm_domain_iterator_cleanup(struct vm_domain_iterator *vi)
+{
+
+	return (0);
+}
diff --git a/sys/vm/vm_domain.h b/sys/vm/vm_domain.h
new file mode 100644
index 0000000..7d9e07c
--- /dev/null
+++ b/sys/vm/vm_domain.h
@@ -0,0 +1,66 @@
+/*-
+ * Copyright (c) 2015 Adrian Chadd <adrian@FreeBSD.org>.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer,
+ * without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any
+ * redistribution must be conditioned upon including a substantially
+ * similar Disclaimer requirement for further binary redistribution.
+ *
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGES.
+ *
+ * $FreeBSD$
+ */
+#ifndef __VM_DOMAIN_H__
+#define __VM_DOMAIN_H__
+
+#include <sys/_vm_domain.h>
+
+struct vm_domain_iterator { /* cursor over the VM domains to try for one allocation */
+ vm_domain_policy_type_t policy; /* policy type the iterator was set up with */
+ int domain; /* preferred starting domain, or -1 for plain round-robin */
+ int n; /* attempts left; vm_domain_iterator_run() fails once this is <= 0 */
+};
+
+/*
+ * TODO: check to see if these should just become inline functions
+ * at some point.
+ */
+extern int vm_domain_policy_init(struct vm_domain_policy *vp);
+extern int vm_domain_policy_set(struct vm_domain_policy *vp,
+ vm_domain_policy_type_t vt, int domain);
+extern int vm_domain_policy_cleanup(struct vm_domain_policy *vp);
+extern void vm_domain_policy_localcopy(struct vm_domain_policy *dst,
+ const struct vm_domain_policy *src);
+extern void vm_domain_policy_copy(struct vm_domain_policy *dst,
+ const struct vm_domain_policy *src);
+extern int vm_domain_policy_validate(const struct vm_domain_policy *vp);
+
+extern int vm_domain_iterator_init(struct vm_domain_iterator *vi);
+extern int vm_domain_iterator_set(struct vm_domain_iterator *vi,
+ vm_domain_policy_type_t vt, int domain);
+extern void vm_domain_iterator_set_policy(struct vm_domain_iterator *vi,
+ const struct vm_domain_policy *vt);
+extern int vm_domain_iterator_run(struct vm_domain_iterator *vi,
+ int *domain);
+extern int vm_domain_iterator_isdone(struct vm_domain_iterator *vi);
+extern int vm_domain_iterator_cleanup(struct vm_domain_iterator *vi);
+
+#endif /* __VM_DOMAIN_H__ */
diff --git a/sys/vm/vm_phys.c b/sys/vm/vm_phys.c
index 71fadd7..d26b8b5 100644
--- a/sys/vm/vm_phys.c
+++ b/sys/vm/vm_phys.c
@@ -57,6 +57,7 @@ __FBSDID("$FreeBSD$");
#include <sys/sysctl.h>
#include <sys/tree.h>
#include <sys/vmmeter.h>
+#include <sys/seq.h>
#include <ddb/ddb.h>
@@ -67,6 +68,8 @@ __FBSDID("$FreeBSD$");
#include <vm/vm_page.h>
#include <vm/vm_phys.h>
+#include <vm/vm_domain.h>
+
_Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
"Too many physsegs.");
@@ -141,13 +144,30 @@ static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_segs, CTLTYPE_STRING | CTLFLAG_RD,
NULL, 0, sysctl_vm_phys_segs, "A", "Phys Seg Info");
+#if MAXMEMDOM > 1
static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_locality, CTLTYPE_STRING | CTLFLAG_RD,
NULL, 0, sysctl_vm_phys_locality, "A", "Phys Locality Info");
+#endif
SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD,
&vm_ndomains, 0, "Number of physical memory domains available.");
+/*
+ * Default to first-touch + round-robin.
+ */
+static struct mtx vm_default_policy_mtx;
+MTX_SYSINIT(vm_default_policy, &vm_default_policy_mtx, "default policy mutex",
+ MTX_DEF);
+#if MAXMEMDOM > 1
+static struct vm_domain_policy vm_default_policy =
+ VM_DOMAIN_POLICY_STATIC_INITIALISER(VM_POLICY_FIRST_TOUCH_ROUND_ROBIN, 0);
+#else
+/* Use round-robin so the domain policy code will only try once per allocation */
+static struct vm_domain_policy vm_default_policy =
+ VM_DOMAIN_POLICY_STATIC_INITIALISER(VM_POLICY_ROUND_ROBIN, 0);
+#endif
+
static vm_page_t vm_phys_alloc_domain_pages(int domain, int flind, int pool,
int order);
static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain);
@@ -156,6 +176,60 @@ static int vm_phys_paddr_to_segind(vm_paddr_t pa);
static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
int order);
+/*
+ * Handler for the vm.default_policy sysctl.
+ *
+ * Reads report the current system default as "first-touch",
+ * "first-touch-rr" or "rr" (any other policy type is reported as
+ * "rr").  Writes accept exactly those three names and install the
+ * matching policy with domain 0; any other string yields EINVAL.
+ * Accesses to vm_default_policy are made with vm_default_policy_mtx
+ * held.
+ */
+static int
+sysctl_vm_default_policy(SYSCTL_HANDLER_ARGS)
+{
+	char policy_name[32];
+	const char *current;
+	vm_domain_policy_type_t wanted;
+	int error;
+
+	/* Map policy to output string */
+	mtx_lock(&vm_default_policy_mtx);
+	switch (vm_default_policy.p.policy) {
+	case VM_POLICY_FIRST_TOUCH:
+		current = "first-touch";
+		break;
+	case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN:
+		current = "first-touch-rr";
+		break;
+	case VM_POLICY_ROUND_ROBIN:
+	default:
+		current = "rr";
+		break;
+	}
+	mtx_unlock(&vm_default_policy_mtx);
+	strcpy(policy_name, current);
+
+	error = sysctl_handle_string(oidp, &policy_name[0],
+	    sizeof(policy_name), req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+
+	/* Set: match on the subset of policies that make sense as a default */
+	if (strcmp("first-touch-rr", policy_name) == 0)
+		wanted = VM_POLICY_FIRST_TOUCH_ROUND_ROBIN;
+	else if (strcmp("first-touch", policy_name) == 0)
+		wanted = VM_POLICY_FIRST_TOUCH;
+	else if (strcmp("rr", policy_name) == 0)
+		wanted = VM_POLICY_ROUND_ROBIN;
+	else
+		return (EINVAL);
+
+	mtx_lock(&vm_default_policy_mtx);
+	vm_domain_policy_set(&vm_default_policy, wanted, 0);
+	mtx_unlock(&vm_default_policy_mtx);
+	return (0);
+}
+
+/* vm.default_policy: read/write string naming the system default policy. */
+SYSCTL_PROC(_vm, OID_AUTO, default_policy, CTLTYPE_STRING | CTLFLAG_RW,
+    0, 0, sysctl_vm_default_policy, "A",
+    "Default policy (rr, first-touch, first-touch-rr)");
+
/*
* Red-black tree helpers for vm fictitious range management.
*/
@@ -213,6 +287,53 @@ vm_rr_selectdomain(void)
#endif
}
+/*
+ * Initialise a VM domain iterator for the current thread.
+ *
+ * The policies are consulted in order: the thread's, then its
+ * process's, then the system default.  The first one that is not
+ * VM_POLICY_NONE is used.  On kernels built with MAXMEMDOM <= 1 only
+ * the system default is consulted.
+ *
+ * Later on the various layers will have this logic
+ * plumbed into them and the phys code will be explicitly
+ * handed a VM domain policy to use.
+ */
+static void
+vm_policy_iterator_init(struct vm_domain_iterator *vi)
+{
+#if MAXMEMDOM > 1
+	const struct vm_domain_policy *candidates[2];
+	struct vm_domain_policy snap;
+	int i;
+#endif
+
+	vm_domain_iterator_init(vi);
+
+#if MAXMEMDOM > 1
+	/* Most specific first: thread, then owning process. */
+	candidates[0] = &curthread->td_vm_dom_policy;
+	candidates[1] = &curthread->td_proc->p_vm_dom_policy;
+
+	for (i = 0; i < 2; i++) {
+		vm_domain_policy_localcopy(&snap, candidates[i]);
+		if (snap.p.policy == VM_POLICY_NONE)
+			continue;
+		/* A policy is present at this level; use it */
+		vm_domain_iterator_set_policy(vi, &snap);
+		return;
+	}
+#endif
+	/* Use system default policy */
+	vm_domain_iterator_set_policy(vi, &vm_default_policy);
+}
+
+/*
+ * Release an iterator set up by vm_policy_iterator_init().
+ *
+ * Simply forwards to vm_domain_iterator_cleanup(); its return value is
+ * not used.
+ */
+static void
+vm_policy_iterator_finish(struct vm_domain_iterator *vi)
+{
+
+	(void)vm_domain_iterator_cleanup(vi);
+}
+
boolean_t
vm_phys_domain_intersects(long mask, vm_paddr_t low, vm_paddr_t high)
{
@@ -305,17 +426,22 @@ sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)
/*
* Return affinity, or -1 if there's no affinity information.
*/
-static int
+int
vm_phys_mem_affinity(int f, int t)
{
+#if MAXMEMDOM > 1
if (mem_locality == NULL)
return (-1);
if (f >= vm_ndomains || t >= vm_ndomains)
return (-1);
return (mem_locality[f * vm_ndomains + t]);
+#else
+ return (-1);
+#endif
}
+#if MAXMEMDOM > 1
/*
* Outputs the VM locality table.
*/
@@ -343,6 +469,7 @@ sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS)
sbuf_delete(&sbuf);
return (error);
}
+#endif
static void
vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int tail)
@@ -634,15 +761,17 @@ vm_page_t
vm_phys_alloc_pages(int pool, int order)
{
vm_page_t m;
- int dom, domain, flind;
+ int domain, flind;
+ struct vm_domain_iterator vi;
KASSERT(pool < VM_NFREEPOOL,
("vm_phys_alloc_pages: pool %d is out of range", pool));
KASSERT(order < VM_NFREEORDER,
("vm_phys_alloc_pages: order %d is out of range", order));
- for (dom = 0; dom < vm_ndomains; dom++) {
- domain = vm_rr_selectdomain();
+ vm_policy_iterator_init(&vi);
+
+ while ((vm_domain_iterator_run(&vi, &domain)) == 0) {
for (flind = 0; flind < vm_nfreelists; flind++) {
m = vm_phys_alloc_domain_pages(domain, flind, pool,
order);
@@ -650,6 +779,8 @@ vm_phys_alloc_pages(int pool, int order)
return (m);
}
}
+
+ vm_policy_iterator_finish(&vi);
return (NULL);
}
@@ -664,7 +795,8 @@ vm_page_t
vm_phys_alloc_freelist_pages(int freelist, int pool, int order)
{
vm_page_t m;
- int dom, domain;
+ struct vm_domain_iterator vi;
+ int domain;
KASSERT(freelist < VM_NFREELIST,
("vm_phys_alloc_freelist_pages: freelist %d is out of range",
@@ -673,13 +805,17 @@ vm_phys_alloc_freelist_pages(int freelist, int pool, int order)
("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
KASSERT(order < VM_NFREEORDER,
("vm_phys_alloc_freelist_pages: order %d is out of range", order));
- for (dom = 0; dom < vm_ndomains; dom++) {
- domain = vm_rr_selectdomain();
+
+ vm_policy_iterator_init(&vi);
+
+ while ((vm_domain_iterator_run(&vi, &domain)) == 0) {
m = vm_phys_alloc_domain_pages(domain,
vm_freelist_to_flind[freelist], pool, order);
if (m != NULL)
return (m);
}
+
+ vm_policy_iterator_finish(&vi);
return (NULL);
}
@@ -1169,7 +1305,8 @@ vm_phys_alloc_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
vm_paddr_t pa, pa_last, size;
vm_page_t m, m_ret;
u_long npages_end;
- int dom, domain, flind, oind, order, pind;
+ int domain, flind, oind, order, pind;
+ struct vm_domain_iterator vi;
mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
size = npages << PAGE_SHIFT;
@@ -1181,9 +1318,15 @@ vm_phys_alloc_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
("vm_phys_alloc_contig: boundary must be a power of 2"));
/* Compute the queue that is the best fit for npages. */
for (order = 0; (1 << order) < npages; order++);
- dom = 0;
+
+ vm_policy_iterator_init(&vi);
+
restartdom:
- domain = vm_rr_selectdomain();
+ if (vm_domain_iterator_run(&vi, &domain) != 0) {
+ vm_policy_iterator_finish(&vi);
+ return (NULL);
+ }
+
for (flind = 0; flind < vm_nfreelists; flind++) {
for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER; oind++) {
for (pind = 0; pind < VM_NFREEPOOL; pind++) {
@@ -1241,8 +1384,9 @@ restartdom:
}
}
}
- if (++dom < vm_ndomains)
+ if (!vm_domain_iterator_isdone(&vi))
goto restartdom;
+ vm_policy_iterator_finish(&vi);
return (NULL);
done:
for (m = m_ret; m < &m_ret[npages]; m = &m[1 << oind]) {
diff --git a/sys/vm/vm_phys.h b/sys/vm/vm_phys.h
index 575b93c..37864db 100644
--- a/sys/vm/vm_phys.h
+++ b/sys/vm/vm_phys.h
@@ -87,6 +87,7 @@ vm_page_t vm_phys_paddr_to_vm_page(vm_paddr_t pa);
void vm_phys_set_pool(int pool, vm_page_t m, int order);
boolean_t vm_phys_unfree_page(vm_page_t m);
boolean_t vm_phys_zero_pages_idle(void);
+int vm_phys_mem_affinity(int f, int t);
/*
* vm_phys_domain:
diff --git a/usr.bin/Makefile b/usr.bin/Makefile
index 1187dc2..fd78602 100644
--- a/usr.bin/Makefile
+++ b/usr.bin/Makefile
@@ -117,6 +117,7 @@ SUBDIR= ${_addr2line} \
nice \
nl \
${_nm} \
+ numactl \
nohup \
opieinfo \
opiekey \
diff --git a/usr.bin/numactl/Makefile b/usr.bin/numactl/Makefile
new file mode 100644
index 0000000..7158487
--- /dev/null
+++ b/usr.bin/numactl/Makefile
@@ -0,0 +1,5 @@
+# $FreeBSD$
+
+PROG= numactl
+
+.include <bsd.prog.mk>
diff --git a/usr.bin/numactl/numactl.1 b/usr.bin/numactl/numactl.1
new file mode 100644
index 0000000..750e23b
--- /dev/null
+++ b/usr.bin/numactl/numactl.1
@@ -0,0 +1,132 @@
+.\" Copyright (c) 2015 Adrian Chadd <adrian@FreeBSD.org>
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd May 9, 2015
+.Dt NUMACTL 1
+.Os
+.Sh NAME
+.Nm numactl
+.Nd "manage NUMA policy configuration"
+.Sh SYNOPSIS
+.Nm
+.Op Fl l Ar policy
+.Op Fl m Ar domain
+.Op Fl c Ar domain
+.Ar cmd ...
+.Nm
+.Fl g
+.Op Fl p Ar pid
+.Op Fl t Ar tid
+.Nm
+.Fl s
+.Op Fl l Ar policy
+.Op Fl m Ar domain
+.Op Fl c Ar domain
+.Op Fl p Ar pid
+.Op Fl t Ar tid
+.Sh DESCRIPTION
+The
+.Nm
+command can be used to assign NUMA policies to processes/threads,
+run commands with a given NUMA policy, and query information
+about NUMA policies on running processes.
+.Pp
+.Nm
+requires a target to modify or query.
+The target may be specified as a command, process id or a thread id.
+Using
+.Fl -get
+the target's NUMA policy may be queried.
+Using
+.Fl -set
+the target's NUMA policy may be modified.
+If no target is specified,
+.Nm
+prints an error and exits.
+Not all combinations of operations and targets are supported.
+For example,
+you may not query or set the policy of an existing process or thread
+and launch a command at the same time.
+.Pp
+Each process and thread has a NUMA policy.
+By default the policy is NONE.
+If a thread policy is NONE, then the policy will fall back to the process.
+If the process policy is NONE, then the policy will fall back to the
+system default.
+The policy may be queried by using
+.Fl -get .
+.Pp
+The options are as follows:
+.Bl -tag -width ".Fl -cpudomain Ar domain"
+.It Fl -cpudomain Ar domain , Fl c Ar domain
+Set the given CPU scheduling policy.
+Constrain the object (tid, pid, command) to run on CPUs
+that belong to the given domain.
+.It Fl -get , Fl g
+Retrieve the NUMA policy for the given thread or process id.
+.It Fl -set , Fl s
+Set the NUMA policy for the given thread or process id.
+.It Fl -memdomain Ar domain , Fl m Ar domain
+Constrain the object (tid, pid, command) to the given
+domain.
+This is only valid for fixed-domain and fixed-domain-rr.
+It must not be set for other policies.
+.It Fl -mempolicy Ar policy , Fl l Ar policy
+Set the given memory allocation policy.
+Valid policies are none, rr, fixed-domain, fixed-domain-rr,
+first-touch, and first-touch-rr.
+A memdomain argument is required for fixed-domain and
+fixed-domain-rr.
+.It Fl -pid Ar pid , Fl p Ar pid
+Operate on the given pid.
+.It Fl -tid Ar tid , Fl t Ar tid
+Operate on the given tid.
+.El
+.Sh EXIT STATUS
+.Ex -std
+.Sh EXAMPLES
+Create a
+.Pa /bin/sh
+process with memory coming from domain 0, but
+CPUs coming from domain 1:
+.Dl numactl --mempolicy=fixed-domain --memdomain=0 --cpudomain=1 /bin/sh
+.Pp
+Query the NUMA policy for the
+.Aq sh pid :
+.Dl numactl --get --pid=<sh pid>
+.Pp
+Set the NUMA policy for the given TID to round-robin:
+.Dl numactl --set --mempolicy=rr --tid=<tid>
+.Sh SEE ALSO
+.Xr cpuset 2 ,
+.Xr numa 4
+.Sh HISTORY
+The
+.Nm
+command first appeared in
+.Fx 11.0 .
+.Sh AUTHORS
+.An Adrian Chadd Aq Mt adrian@FreeBSD.org
diff --git a/usr.bin/numactl/numactl.c b/usr.bin/numactl/numactl.c
new file mode 100644
index 0000000..ce1dfae
--- /dev/null
+++ b/usr.bin/numactl/numactl.c
@@ -0,0 +1,284 @@
+/*
+ * Copyright (c) 2015 Adrian Chadd <adrian@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/cpuset.h>
+#include <sys/numa.h>
+#include <sys/resource.h>
+#include <sys/time.h>
+
+#include <ctype.h>
+#include <err.h>
+#include <errno.h>
+#include <getopt.h>
+#include <libgen.h>
+#include <limits.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sysexits.h>
+#include <unistd.h>
+
+static struct option longopts[] = { /* long options; each maps to the short flag handled in main() */
+ { "tid", required_argument, NULL, 't' }, /* target thread id */
+ { "pid", required_argument, NULL, 'p' }, /* target process id */
+ { "memdomain", required_argument, NULL, 'm' }, /* memory domain; must follow --mempolicy */
+ { "cpudomain", required_argument, NULL, 'c' }, /* domain whose CPUs to run on */
+ { "mempolicy", required_argument, NULL, 'l' }, /* policy name, see parse_policy() */
+ { "set", no_argument, NULL, 's' },
+ { "get", no_argument, NULL, 'g' },
+ { NULL, 0, NULL, 0 } /* terminator */
+};
+
+/*
+ * Map a policy type to the name used on the command line.
+ *
+ * Returns a pointer to a string constant; types that are not
+ * recognised map to "unknown".
+ */
+static const char *
+policy_to_str(vm_domain_policy_type_t vt)
+{
+	static const struct {
+		vm_domain_policy_type_t	 type;
+		const char		*name;
+	} names[] = {
+		{ VM_POLICY_NONE,			"none" },
+		{ VM_POLICY_ROUND_ROBIN,		"rr" },
+		{ VM_POLICY_FIXED_DOMAIN,		"fixed-domain" },
+		{ VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN,	"fixed-domain-rr" },
+		{ VM_POLICY_FIRST_TOUCH,		"first-touch" },
+		{ VM_POLICY_FIRST_TOUCH_ROUND_ROBIN,	"first-touch-rr" },
+	};
+	size_t i;
+
+	for (i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
+		if (names[i].type == vt)
+			return (names[i].name);
+	}
+	return ("unknown");
+}
+
+/*
+ * Translate a policy name into a policy entry.
+ *
+ * Recognised names are "none", "rr", "first-touch", "first-touch-rr",
+ * "fixed-domain" and "fixed-domain-rr" -- the same names that
+ * policy_to_str() produces.  The fixed-domain policies default to
+ * domain 0 (override with --memdomain); every other policy gets
+ * domain -1.
+ *
+ * Returns 0 on success.  Returns -1, leaving *vd untouched, if the
+ * name is not recognised.
+ */
+static int
+parse_policy(struct vm_domain_policy_entry *vd, const char *str)
+{
+	static const struct {
+		const char		*name;
+		vm_domain_policy_type_t	 policy;
+		int			 domain;
+	} known[] = {
+		/* "none" lets a previously set policy be cleared again. */
+		{ "none",		VM_POLICY_NONE,				-1 },
+		{ "rr",			VM_POLICY_ROUND_ROBIN,			-1 },
+		{ "first-touch-rr",	VM_POLICY_FIRST_TOUCH_ROUND_ROBIN,	-1 },
+		{ "first-touch",	VM_POLICY_FIRST_TOUCH,			-1 },
+		{ "fixed-domain",	VM_POLICY_FIXED_DOMAIN,			0 },
+		{ "fixed-domain-rr",	VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN,	0 },
+	};
+	size_t i;
+
+	for (i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
+		if (strcmp(str, known[i].name) == 0) {
+			vd->policy = known[i].policy;
+			vd->domain = known[i].domain;
+			return (0);
+		}
+	}
+
+	return (-1);
+}
+
+/*
+ * Print the command synopsis on stderr and exit with EX_USAGE.
+ *
+ * Does not return.
+ */
+static void
+usage(void)
+{
+
+	fprintf(stderr,
+	    "usage: numactl --get [--tid/-t <tid>] [--pid/-p <pid>]\n"
+	    "       numactl --set [--tid/-t <tid>] [--pid/-p <pid>]\n"
+	    "               [--mempolicy/-l <policy>] [--memdomain/"
+	    "-m <domain>]\n"
+	    "               [--cpudomain/-c <domain>]\n"
+	    "       numactl [--mempolicy/-l <policy>] [--memdomain/-m "
+	    "<domain>]\n"
+	    "               [--cpudomain/-c <domain>] <cmd> ...\n");
+
+	exit(EX_USAGE);
+}
+
+/*
+ * Restrict the calling process to the CPUs of one domain.
+ *
+ * The CPU mask of cpu_domain is looked up (CPU_WHICH_DOMAIN) and then
+ * installed for CPU_WHICH_PID with id -1.  Either call failing
+ * terminates the program through err(3), so the function only ever
+ * returns 0.
+ */
+static int
+set_numa_domain_cpuaffinity(int cpu_domain)
+{
+	cpuset_t mask;
+
+	if (cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_DOMAIN,
+	    cpu_domain, sizeof(mask), &mask) != 0)
+		err(1, "cpuset_getaffinity");
+
+	if (cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
+	    sizeof(mask), &mask) != 0)
+		err(1, "cpuset_setaffinity");
+
+	return (0);
+}
+
+/*
+ * Parse a decimal integer command-line argument.
+ *
+ * Exits with a diagnostic unless the whole of arg is a base-10 number
+ * in [minval, INT_MAX]; "what" names the option in the message.
+ */
+static int
+parse_int_arg(const char *arg, const char *what, int minval)
+{
+	char *end;
+	long val;
+
+	errno = 0;
+	val = strtol(arg, &end, 10);
+	if (end == arg || *end != '\0' || errno == ERANGE ||
+	    val < minval || val > INT_MAX)
+		errx(1, "invalid %s: '%s'", what, arg);
+	return ((int)val);
+}
+
+/*
+ * numactl entry point.
+ *
+ * Three modes, matching usage():
+ *  - run a command: apply the memory policy (and optionally a CPU
+ *    domain) to this process, then exec the command;
+ *  - --get: print the policy of the given tid/pid;
+ *  - otherwise: set the policy of the given tid/pid.
+ */
+int
+main(int argc, char *argv[])
+{
+	struct vm_domain_policy_entry vd;
+	lwpid_t tid;
+	pid_t pid;
+	cpuwhich_t which;
+	id_t id;
+	int error;
+	int is_set, is_get;
+	int mem_policy_set;
+	int ch;
+	int cpu_domain;
+
+	id = -1;
+	which = -1;
+	is_set = 0;
+	is_get = 0;
+	mem_policy_set = 0;
+	tid = -1;
+	pid = -1;
+	cpu_domain = -1;
+
+	/* Never hand uninitialised stack contents to the kernel. */
+	memset(&vd, 0, sizeof(vd));
+
+	while ((ch = getopt_long(argc, argv, "c:gl:m:p:st:", longopts,
+	    NULL)) != -1) {
+		switch (ch) {
+		case 'c':
+			cpu_domain = parse_int_arg(optarg, "cpu domain", 0);
+			break;
+		case 'g':
+			is_get = 1;
+			break;
+		case 'l':
+			if (parse_policy(&vd, optarg) != 0) {
+				fprintf(stderr,
+				    "Could not parse policy: '%s'\n", optarg);
+				exit(1);
+			}
+			mem_policy_set = 1;
+			break;
+		case 'm':
+			/*
+			 * parse_policy() overwrites vd.domain, so the
+			 * domain is only meaningful after the policy.
+			 */
+			if (mem_policy_set == 0) {
+				fprintf(stderr,
+				    "Error: set policy first before domain\n");
+				exit(1);
+			}
+			vd.domain = parse_int_arg(optarg, "memory domain", -1);
+			break;
+		case 'p':
+			pid = parse_int_arg(optarg, "pid", 0);
+			break;
+		case 's':
+			is_set = 1;
+			break;
+		case 't':
+			tid = parse_int_arg(optarg, "tid", 0);
+			break;
+		default:
+			usage();
+		}
+	}
+	argc -= optind;
+	argv += optind;
+
+	/* Handle the user wishing to run a command */
+	if (argc) {
+		/* Ensure that a policy was set */
+		if (mem_policy_set == 0) {
+			fprintf(stderr, "Error: no policy given\n");
+			usage();
+		}
+
+		/* Set current memory process policy, will be inherited */
+		if (numa_setaffinity(CPU_WHICH_PID, -1, &vd) != 0)
+			err(1, "numa_setaffinity");
+
+		/* If a CPU domain policy was given, include that too */
+		if (cpu_domain != -1)
+			(void) set_numa_domain_cpuaffinity(cpu_domain);
+
+		errno = 0;
+		execvp(*argv, argv);
+		/* Only reached if the exec failed. */
+		err(errno == ENOENT ? 127 : 126, "%s", *argv);
+	}
+
+	/* Figure out which */
+	if (tid != -1) {
+		which = CPU_WHICH_TID;
+		id = tid;
+	} else if (pid != -1) {
+		which = CPU_WHICH_PID;
+		id = pid;
+	} else {
+		fprintf(stderr, "Error: one of tid or pid must be given\n");
+		usage();
+	}
+
+	/* Sanity checks */
+	if (is_set && is_get) {
+		fprintf(stderr, "Error: can't set both 'set' and 'get'\n");
+		usage();
+	}
+
+	/*
+	 * Everything that is not a get is treated as a set, and a set
+	 * without a policy would pass a meaningless entry to the kernel.
+	 */
+	if (!is_get && !mem_policy_set) {
+		if (is_set)
+			fprintf(stderr, "Error: --set given, but no policy\n");
+		else
+			fprintf(stderr,
+			    "Error: neither --get nor a policy to set given\n");
+		usage();
+	}
+
+	/* If it's get, then get the policy and return */
+	if (is_get) {
+		error = numa_getaffinity(which, id, &vd);
+		if (error != 0)
+			err(1, "numa_getaffinity");
+		printf(" Policy: %s; domain: %d\n",
+		    policy_to_str(vd.policy),
+		    vd.domain);
+		exit(0);
+	}
+
+	/* Assume it's set */
+
+	/* Syscall */
+	error = numa_setaffinity(which, id, &vd);
+	if (error != 0)
+		err(1, "numa_setaffinity");
+
+	/*
+	 * If a CPU domain policy was given, include that too.
+	 *
+	 * NOTE(review): set_numa_domain_cpuaffinity() applies the mask to
+	 * the calling process (CPU_WHICH_PID, -1), not to which/id --
+	 * confirm whether that is intended for --set on another target.
+	 */
+	if (cpu_domain != -1)
+		(void) set_numa_domain_cpuaffinity(cpu_domain);
+
+	exit(0);
+}
OpenPOWER on IntegriCloud