Diffstat (limited to 'sys/kern')
43 files changed, 1029 insertions, 321 deletions
diff --git a/sys/kern/Make.tags.inc b/sys/kern/Make.tags.inc index ad7ea11..6f5a13a 100644 --- a/sys/kern/Make.tags.inc +++ b/sys/kern/Make.tags.inc @@ -37,6 +37,7 @@ COMM= ${SYS}/dev/advansys/*.[ch] \ ${SYS}/fs/smbfs/*.[ch] \ ${SYS}/fs/udf/*.[ch] \ ${SYS}/fs/unionfs/*.[ch] \ + ${SYS}/geom/*.[ch] \ ${SYS}/kern/*.[ch] \ ${SYS}/net/*.[ch] \ ${SYS}/netatalk/*.[ch] \ @@ -55,6 +56,7 @@ COMM= ${SYS}/dev/advansys/*.[ch] \ ${SYS}/sys/*.[ch] COMMDIR1= ${SYS}/conf \ + ${SYS}/geom \ ${SYS}/kern \ ${SYS}/net \ ${SYS}/netatalk \ diff --git a/sys/kern/imgact_aout.c b/sys/kern/imgact_aout.c index 2f889ca..3908da7 100644 --- a/sys/kern/imgact_aout.c +++ b/sys/kern/imgact_aout.c @@ -103,7 +103,7 @@ struct sysentvec aout_sysvec = { #elif defined(__amd64__) -#define AOUT32_USRSTACK 0xbfc0000 +#define AOUT32_USRSTACK 0xbfc00000 #define AOUT32_PS_STRINGS \ (AOUT32_USRSTACK - sizeof(struct freebsd32_ps_strings)) @@ -152,7 +152,7 @@ aout_fixup(register_t **stack_base, struct image_params *imgp) { *(char **)stack_base -= sizeof(uint32_t); - return (suword(*stack_base, imgp->args->argc)); + return (suword32(*stack_base, imgp->args->argc)); } static int diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c index b41741a..45f6d64 100644 --- a/sys/kern/imgact_elf.c +++ b/sys/kern/imgact_elf.c @@ -31,10 +31,12 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); +#include "opt_capsicum.h" #include "opt_compat.h" #include "opt_core.h" #include <sys/param.h> +#include <sys/capability.h> #include <sys/exec.h> #include <sys/fcntl.h> #include <sys/imgact.h> @@ -578,6 +580,15 @@ __elfN(load_file)(struct proc *p, const char *file, u_long *addr, u_long base_addr = 0; int vfslocked, error, i, numsegs; +#ifdef CAPABILITY_MODE + /* + * XXXJA: This check can go away once we are sufficiently confident + * that the checks in namei() are correct. 
+ */ + if (IN_CAPABILITY_MODE(curthread)) + return (ECAPMODE); +#endif + tempdata = malloc(sizeof(*tempdata), M_TEMP, M_WAITOK); nd = &tempdata->nd; attr = &tempdata->attr; @@ -1104,6 +1115,7 @@ __elfN(coredump)(struct thread *td, struct vnode *vp, off_t limit, int flags) hdrsize = 0; __elfN(puthdr)(td, (void *)NULL, &hdrsize, seginfo.count); +#ifdef RACCT PROC_LOCK(td->td_proc); error = racct_add(td->td_proc, RACCT_CORE, hdrsize + seginfo.size); PROC_UNLOCK(td->td_proc); @@ -1111,6 +1123,7 @@ __elfN(coredump)(struct thread *td, struct vnode *vp, off_t limit, int flags) error = EFAULT; goto done; } +#endif if (hdrsize + seginfo.size >= limit) { error = EFAULT; goto done; diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c index e45ffc5..004516b 100644 --- a/sys/kern/init_sysent.c +++ b/sys/kern/init_sysent.c @@ -548,8 +548,8 @@ struct sysent sysent[] = { { AS(msgctl_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 511 = msgctl */ { AS(shmctl_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 512 = shmctl */ { AS(lpathconf_args), (sy_call_t *)lpathconf, AUE_LPATHCONF, NULL, 0, 0, 0, SY_THR_STATIC }, /* 513 = lpathconf */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 514 = cap_new */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 515 = cap_getrights */ + { AS(cap_new_args), (sy_call_t *)cap_new, AUE_CAP_NEW, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 514 = cap_new */ + { AS(cap_getrights_args), (sy_call_t *)cap_getrights, AUE_CAP_GETRIGHTS, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 515 = cap_getrights */ { 0, (sy_call_t *)cap_enter, AUE_CAP_ENTER, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 516 = cap_enter */ { AS(cap_getmode_args), (sy_call_t *)cap_getmode, AUE_CAP_GETMODE, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 517 = cap_getmode */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 518 = pdfork */ diff --git a/sys/kern/kern_clocksource.c b/sys/kern/kern_clocksource.c index dd8bab5..ecfd408 100644 --- a/sys/kern/kern_clocksource.c +++ b/sys/kern/kern_clocksource.c @@ -59,6 +59,7 @@ __FBSDID("$FreeBSD$"); cyclic_clock_func_t cyclic_clock_func = NULL; #endif +int cpu_can_deep_sleep = 0; /* C3 state is available. */ int cpu_disable_deep_sleep = 0; /* Timer dies in C3. */ static void setuptimer(void); diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c index 180d598..829ece2 100644 --- a/sys/kern/kern_descrip.c +++ b/sys/kern/kern_descrip.c @@ -37,6 +37,7 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); +#include "opt_capsicum.h" #include "opt_compat.h" #include "opt_ddb.h" #include "opt_ktrace.h" @@ -44,6 +45,7 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/systm.h> +#include <sys/capability.h> #include <sys/conf.h> #include <sys/domain.h> #include <sys/fcntl.h> @@ -91,6 +93,7 @@ __FBSDID("$FreeBSD$"); #include <security/audit/audit.h> #include <vm/uma.h> +#include <vm/vm.h> #include <ddb/ddb.h> @@ -818,6 +821,7 @@ do_dup(struct thread *td, int flags, int old, int new, * descriptors, just put the limit on the size of the file * descriptor table. 
*/ +#ifdef RACCT PROC_LOCK(p); error = racct_set(p, RACCT_NOFILE, new + 1); PROC_UNLOCK(p); @@ -826,6 +830,7 @@ do_dup(struct thread *td, int flags, int old, int new, fdrop(fp, td); return (EMFILE); } +#endif fdgrowtable(fdp, new + 1); } if (fdp->fd_ofiles[new] == NULL) @@ -1155,7 +1160,7 @@ kern_close(td, fd) int fd; { struct filedesc *fdp; - struct file *fp; + struct file *fp, *fp_object; int error; int holdleaders; @@ -1190,8 +1195,14 @@ kern_close(td, fd) * added, and deleteing a knote for the new fd. */ knote_fdclose(td, fd); - if (fp->f_type == DTYPE_MQUEUE) - mq_fdclose(td, fd, fp); + + /* + * When we're closing an fd with a capability, we need to notify + * mqueue if the underlying object is of type mqueue. + */ + (void)cap_funwrap(fp, 0, &fp_object); + if (fp_object->f_type == DTYPE_MQUEUE) + mq_fdclose(td, fd, fp_object); FILEDESC_XUNLOCK(fdp); error = closef(fp, td); @@ -1473,7 +1484,10 @@ fdalloc(struct thread *td, int minfd, int *result) { struct proc *p = td->td_proc; struct filedesc *fdp = p->p_fd; - int fd = -1, maxfd, error; + int fd = -1, maxfd; +#ifdef RACCT + int error; +#endif FILEDESC_XLOCK_ASSERT(fdp); @@ -1496,11 +1510,13 @@ fdalloc(struct thread *td, int minfd, int *result) return (EMFILE); if (fd < fdp->fd_nfiles) break; +#ifdef RACCT PROC_LOCK(p); error = racct_set(p, RACCT_NOFILE, min(fdp->fd_nfiles * 2, maxfd)); PROC_UNLOCK(p); if (error != 0) return (EMFILE); +#endif fdgrowtable(fdp, min(fdp->fd_nfiles * 2, maxfd)); } @@ -1561,54 +1577,85 @@ fdavail(struct thread *td, int n) int falloc(struct thread *td, struct file **resultfp, int *resultfd, int flags) { - struct proc *p = td->td_proc; struct file *fp; - int error, i; + int error, fd; + + error = falloc_noinstall(td, &fp); + if (error) + return (error); /* no reference held on error */ + + error = finstall(td, fp, &fd, flags); + if (error) { + fdrop(fp, td); /* one reference (fp only) */ + return (error); + } + + if (resultfp != NULL) + *resultfp = fp; /* copy out result */ + else + fdrop(fp, td); /* release local reference */ + + if (resultfd != NULL) + *resultfd = fd; + + return (0); +} + +/* + * Create a new open file structure without allocating a file descriptor. + */ +int +falloc_noinstall(struct thread *td, struct file **resultfp) +{ + struct file *fp; int maxuserfiles = maxfiles - (maxfiles / 20); static struct timeval lastfail; static int curfail; - fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO); + KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__)); + if ((openfiles >= maxuserfiles && priv_check(td, PRIV_MAXFILES) != 0) || openfiles >= maxfiles) { if (ppsratecheck(&lastfail, &curfail, 1)) { - printf("kern.maxfiles limit exceeded by uid %i, please see tuning(7).\n", - td->td_ucred->cr_ruid); + printf("kern.maxfiles limit exceeded by uid %i, " + "please see tuning(7).\n", td->td_ucred->cr_ruid); } - uma_zfree(file_zone, fp); return (ENFILE); } atomic_add_int(&openfiles, 1); - - /* - * If the process has file descriptor zero open, add the new file - * descriptor to the list of open files at that point, otherwise - * put it at the front of the list of open files. 
- */ + fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO); refcount_init(&fp->f_count, 1); - if (resultfp) - fhold(fp); fp->f_cred = crhold(td->td_ucred); fp->f_ops = &badfileops; fp->f_data = NULL; fp->f_vnode = NULL; - FILEDESC_XLOCK(p->p_fd); - if ((error = fdalloc(td, 0, &i))) { - FILEDESC_XUNLOCK(p->p_fd); - fdrop(fp, td); - if (resultfp) - fdrop(fp, td); + *resultfp = fp; + return (0); +} + +/* + * Install a file in a file descriptor table. + */ +int +finstall(struct thread *td, struct file *fp, int *fd, int flags) +{ + struct filedesc *fdp = td->td_proc->p_fd; + int error; + + KASSERT(fd != NULL, ("%s: fd == NULL", __func__)); + KASSERT(fp != NULL, ("%s: fp == NULL", __func__)); + + FILEDESC_XLOCK(fdp); + if ((error = fdalloc(td, 0, fd))) { + FILEDESC_XUNLOCK(fdp); return (error); } - p->p_fd->fd_ofiles[i] = fp; + fhold(fp); + fdp->fd_ofiles[*fd] = fp; if ((flags & O_CLOEXEC) != 0) - p->p_fd->fd_ofileflags[i] |= UF_EXCLOSE; - FILEDESC_XUNLOCK(p->p_fd); - if (resultfp) - *resultfp = fp; - if (resultfd) - *resultfd = i; + fdp->fd_ofileflags[*fd] |= UF_EXCLOSE; + FILEDESC_XUNLOCK(fdp); return (0); } @@ -1739,11 +1786,11 @@ fdcopy(struct filedesc *fdp) FILEDESC_XUNLOCK(newfdp); FILEDESC_SLOCK(fdp); } - /* copy everything except kqueue descriptors */ + /* copy all passable descriptors (i.e. not kqueue) */ newfdp->fd_freefile = -1; for (i = 0; i <= fdp->fd_lastfile; ++i) { if (fdisused(fdp, i) && - fdp->fd_ofiles[i]->f_type != DTYPE_KQUEUE && + (fdp->fd_ofiles[i]->f_ops->fo_flags & DFLAG_PASSABLE) && fdp->fd_ofiles[i]->f_ops != &badfileops) { newfdp->fd_ofiles[i] = fdp->fd_ofiles[i]; newfdp->fd_ofileflags[i] = fdp->fd_ofileflags[i]; @@ -1785,9 +1832,11 @@ fdfree(struct thread *td) if (fdp == NULL) return; +#ifdef RACCT PROC_LOCK(td->td_proc); racct_set(td->td_proc, RACCT_NOFILE, 0); PROC_UNLOCK(td->td_proc); +#endif /* Check for special need to clear POSIX style locks */ fdtol = td->td_proc->p_fdtol; @@ -2103,6 +2152,7 @@ closef(struct file *fp, struct thread *td) struct flock lf; struct filedesc_to_leader *fdtol; struct filedesc *fdp; + struct file *fp_object; /* * POSIX record locking dictates that any close releases ALL @@ -2115,11 +2165,15 @@ closef(struct file *fp, struct thread *td) * NULL thread pointer when there really is no owning * context that might have locks, or the locks will be * leaked. + * + * If this is a capability, we do lock processing under the underlying + * node, not the capability itself. */ - if (fp->f_type == DTYPE_VNODE && td != NULL) { + (void)cap_funwrap(fp, 0, &fp_object); + if ((fp_object->f_type == DTYPE_VNODE) && (td != NULL)) { int vfslocked; - vp = fp->f_vnode; + vp = fp_object->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { lf.l_whence = SEEK_SET; @@ -2149,7 +2203,7 @@ closef(struct file *fp, struct thread *td) lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; - vp = fp->f_vnode; + vp = fp_object->f_vnode; (void) VOP_ADVLOCK(vp, (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf, F_POSIX); @@ -2228,15 +2282,27 @@ fget_unlocked(struct filedesc *fdp, int fd) * If the descriptor doesn't exist or doesn't match 'flags', EBADF is * returned. * + * If the FGET_GETCAP flag is set, the capability itself will be returned. + * Calling _fget() with FGET_GETCAP on a non-capability will return EINVAL. + * Otherwise, if the file is a capability, its rights will be checked against + * the capability rights mask, and if successful, the object will be unwrapped. 
+ * * If an error occured the non-zero error is returned and *fpp is set to * NULL. Otherwise *fpp is held and set and zero is returned. Caller is * responsible for fdrop(). */ +#define FGET_GETCAP 0x00000001 static __inline int -_fget(struct thread *td, int fd, struct file **fpp, int flags) +_fget(struct thread *td, int fd, struct file **fpp, int flags, + cap_rights_t needrights, cap_rights_t *haverights, u_char *maxprotp, + int fget_flags) { struct filedesc *fdp; struct file *fp; +#ifdef CAPABILITIES + struct file *fp_fromcap; + int error; +#endif *fpp = NULL; if (td == NULL || (fdp = td->td_proc->p_fd) == NULL) @@ -2247,6 +2313,47 @@ _fget(struct thread *td, int fd, struct file **fpp, int flags) fdrop(fp, td); return (EBADF); } + +#ifdef CAPABILITIES + /* + * If a capability has been requested, return the capability directly. + * Otherwise, check capability rights, extract the underlying object, + * and check its access flags. + */ + if (fget_flags & FGET_GETCAP) { + if (fp->f_type != DTYPE_CAPABILITY) { + fdrop(fp, td); + return (EINVAL); + } + } else { + if (maxprotp == NULL) + error = cap_funwrap(fp, needrights, &fp_fromcap); + else + error = cap_funwrap_mmap(fp, needrights, maxprotp, + &fp_fromcap); + if (error) { + fdrop(fp, td); + return (error); + } + + /* + * If we've unwrapped a file, drop the original capability + * and hold the new descriptor. fp after this point refers to + * the actual (unwrapped) object, not the capability. + */ + if (fp != fp_fromcap) { + fhold(fp_fromcap); + fdrop(fp, td); + fp = fp_fromcap; + } + } +#else /* !CAPABILITIES */ + KASSERT(fp->f_type != DTYPE_CAPABILITY, + ("%s: saw capability", __func__)); + if (maxprotp != NULL) + *maxprotp = VM_PROT_ALL; +#endif /* CAPABILITIES */ + /* * FREAD and FWRITE failure return EBADF as per POSIX. * @@ -2265,23 +2372,36 @@ int fget(struct thread *td, int fd, struct file **fpp) { - return(_fget(td, fd, fpp, 0)); + return(_fget(td, fd, fpp, 0, 0, NULL, NULL, 0)); } int fget_read(struct thread *td, int fd, struct file **fpp) { - return(_fget(td, fd, fpp, FREAD)); + return(_fget(td, fd, fpp, FREAD, 0, NULL, NULL, 0)); } int fget_write(struct thread *td, int fd, struct file **fpp) { - return(_fget(td, fd, fpp, FWRITE)); + return(_fget(td, fd, fpp, FWRITE, 0, NULL, NULL, 0)); +} + +/* + * Unlike the other fget() calls, which will accept and check capability rights + * but never return capabilities, fgetcap() returns the capability but doesn't + * check capability rights. + */ +int +fgetcap(struct thread *td, int fd, struct file **fpp) +{ + + return (_fget(td, fd, fpp, 0, 0, NULL, NULL, FGET_GETCAP)); } + /* * Like fget() but loads the underlying vnode, or returns an error if the * descriptor does not represent a vnode. Note that pipes use vnodes but @@ -2296,7 +2416,7 @@ _fgetvp(struct thread *td, int fd, struct vnode **vpp, int flags) int error; *vpp = NULL; - if ((error = _fget(td, fd, &fp, flags)) != 0) + if ((error = _fget(td, fd, &fp, flags, 0, NULL, NULL, 0)) != 0) return (error); if (fp->f_vnode == NULL) { error = EINVAL; @@ -2352,7 +2472,7 @@ fgetsock(struct thread *td, int fd, struct socket **spp, u_int *fflagp) *spp = NULL; if (fflagp != NULL) *fflagp = 0; - if ((error = _fget(td, fd, &fp, 0)) != 0) + if ((error = _fget(td, fd, &fp, 0, 0, NULL, NULL, 0)) != 0) return (error); if (fp->f_type != DTYPE_SOCKET) { error = ENOTSOCK; @@ -2388,6 +2508,9 @@ fputsock(struct socket *so) /* * Handle the last reference to a file being closed. 
+ * + * No special capability handling here, as the capability's fo_close will run + * instead of the object here, and perform any necessary drop on the object. */ int _fdrop(struct file *fp, struct thread *td) diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c index c51cc39..f30f89a 100644 --- a/sys/kern/kern_exec.c +++ b/sys/kern/kern_exec.c @@ -27,12 +27,14 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); +#include "opt_capsicum.h" #include "opt_hwpmc_hooks.h" #include "opt_kdtrace.h" #include "opt_ktrace.h" #include "opt_vm.h" #include <sys/param.h> +#include <sys/capability.h> #include <sys/systm.h> #include <sys/eventhandler.h> #include <sys/lock.h> @@ -415,6 +417,18 @@ do_execve(td, args, mac_p) interpret: if (args->fname != NULL) { +#ifdef CAPABILITY_MODE + /* + * While capability mode can't reach this point via direct + * path arguments to execve(), we also don't allow + * interpreters to be used in capability mode (for now). + * Catch indirect lookups and return a permissions error. + */ + if (IN_CAPABILITY_MODE(td)) { + error = ECAPMODE; + goto exec_fail; + } +#endif error = namei(&nd); if (error) goto exec_fail; @@ -631,6 +645,13 @@ interpret: * Don't honor setuid/setgid if the filesystem prohibits it or if * the process is being traced. * + * We disable setuid/setgid/etc in compatibility mode on the basis + * that most setugid applications are not written with that + * environment in mind, and will therefore almost certainly operate + * incorrectly. In principle there's no reason that setugid + * applications might not be useful in capability mode, so we may want + * to reconsider this conservative design choice in the future. + * * XXXMAC: For the time being, use NOSUID to also prohibit * transitions on the file system. */ @@ -646,6 +667,9 @@ interpret: #endif if (credential_changing && +#ifdef CAPABILITY_MODE + ((oldcred->cr_flags & CRED_FLAG_CAPMODE) == 0) && +#endif (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 && (p->p_flag & P_TRACED) == 0) { /* diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c index bb25d17..30b94b6 100644 --- a/sys/kern/kern_exit.c +++ b/sys/kern/kern_exit.c @@ -744,9 +744,11 @@ proc_reap(struct thread *td, struct proc *p, int *status, int options, * Destroy resource accounting information associated with the process. */ racct_proc_exit(p); +#ifdef RACCT PROC_LOCK(p->p_pptr); racct_sub(p->p_pptr, RACCT_NPROC, 1); PROC_UNLOCK(p->p_pptr); +#endif /* * Free credentials, arguments, and sigacts. 
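The kern_descrip.c changes above split the old falloc() into falloc_noinstall(), which only allocates and references the open-file structure, and finstall(), which publishes it in the descriptor table. A minimal sketch of how a consumer would use the pair; example_open() and its initialization step are illustrative assumptions, not code from this commit:

static int
example_open(struct thread *td, int *fdp)
{
	struct file *fp;
	int error, fd;

	error = falloc_noinstall(td, &fp);	/* we now hold the only reference */
	if (error != 0)
		return (error);

	/* Initialize fp (e.g. with finit()) before it becomes reachable. */

	error = finstall(td, fp, &fd, 0);	/* the table takes its own fhold() */
	if (error != 0) {
		fdrop(fp, td);
		return (error);
	}
	fdrop(fp, td);		/* drop ours; the table's reference survives */
	*fdp = fd;
	return (0);
}

The point of the split is that a half-constructed file is never visible through the descriptor table, which the old single-step falloc() could not guarantee to callers needing extra setup between allocation and installation.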
@@ -905,19 +907,23 @@ loop: void proc_reparent(struct proc *child, struct proc *parent) { +#ifdef RACCT int locked; +#endif sx_assert(&proctree_lock, SX_XLOCKED); PROC_LOCK_ASSERT(child, MA_OWNED); if (child->p_pptr == parent) return; +#ifdef RACCT locked = PROC_LOCKED(parent); if (!locked) PROC_LOCK(parent); racct_add_force(parent, RACCT_NPROC, 1); if (!locked) PROC_UNLOCK(parent); +#endif PROC_LOCK(child->p_pptr); racct_sub(child->p_pptr, RACCT_NPROC, 1); sigqueue_take(child->p_ksi); diff --git a/sys/kern/kern_fail.c b/sys/kern/kern_fail.c index e0fb32b..f192471 100644 --- a/sys/kern/kern_fail.c +++ b/sys/kern/kern_fail.c @@ -52,6 +52,7 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); +#include <sys/ctype.h> #include <sys/errno.h> #include <sys/fail.h> #include <sys/kernel.h> @@ -59,6 +60,7 @@ __FBSDID("$FreeBSD$"); #include <sys/lock.h> #include <sys/malloc.h> #include <sys/mutex.h> +#include <sys/proc.h> #include <sys/sbuf.h> #include <machine/stdarg.h> @@ -88,16 +90,20 @@ enum fail_point_t { FAIL_POINT_BREAK, /**< break into the debugger */ FAIL_POINT_PRINT, /**< print a message */ FAIL_POINT_SLEEP, /**< sleep for some msecs */ - FAIL_POINT_INVALID, /**< placeholder */ + FAIL_POINT_NUMTYPES }; -static const char *fail_type_strings[] = { - "off", - "panic", - "return", - "break", - "print", - "sleep", +static struct { + const char *name; + int nmlen; +} fail_type_strings[] = { +#define FP_TYPE_NM_LEN(s) { s, sizeof(s) - 1 } + [FAIL_POINT_OFF] = FP_TYPE_NM_LEN("off"), + [FAIL_POINT_PANIC] = FP_TYPE_NM_LEN("panic"), + [FAIL_POINT_RETURN] = FP_TYPE_NM_LEN("return"), + [FAIL_POINT_BREAK] = FP_TYPE_NM_LEN("break"), + [FAIL_POINT_PRINT] = FP_TYPE_NM_LEN("print"), + [FAIL_POINT_SLEEP] = FP_TYPE_NM_LEN("sleep"), }; /** @@ -109,7 +115,7 @@ struct fail_point_entry { int fe_arg; /**< argument to type (e.g. return value) */ int fe_prob; /**< likelihood of firing in millionths */ int fe_count; /**< number of times to fire, 0 means always */ - + pid_t fe_pid; /**< only fail for this process */ TAILQ_ENTRY(fail_point_entry) fe_entries; /**< next entry in fail point */ }; @@ -120,7 +126,7 @@ fail_point_sleep(struct fail_point *fp, struct fail_point_entry *ent, /* convert from millisecs to ticks, rounding up */ int timo = ((msecs * hz) + 999) / 1000; - if (timo) { + if (timo > 0) { if (fp->fp_sleep_fn == NULL) { msleep(fp, &g_fp_mtx, PWAIT, "failpt", timo); } else { @@ -191,19 +197,13 @@ fail_point_init(struct fail_point *fp, const char *fmt, ...) 
void fail_point_destroy(struct fail_point *fp) { - struct fail_point_entry *ent; - if (fp->fp_flags & FAIL_POINT_DYNAMIC_NAME && fp->fp_name != NULL) { - fp_free((void *)(intptr_t)fp->fp_name); + if ((fp->fp_flags & FAIL_POINT_DYNAMIC_NAME) != 0) { + fp_free(__DECONST(void *, fp->fp_name)); fp->fp_name = NULL; } fp->fp_flags = 0; - - while (!TAILQ_EMPTY(&fp->fp_entries)) { - ent = TAILQ_FIRST(&fp->fp_entries); - TAILQ_REMOVE(&fp->fp_entries, ent, fe_entries); - fp_free(ent); - } + clear_entries(&fp->fp_entries); } /** @@ -222,16 +222,14 @@ fail_point_eval_nontrivial(struct fail_point *fp, int *return_value) FP_LOCK(); - ent = TAILQ_FIRST(&fp->fp_entries); - while (ent) { + TAILQ_FOREACH_SAFE(ent, &fp->fp_entries, fe_entries, next) { int cont = 0; /* don't continue by default */ - next = TAILQ_NEXT(ent, fe_entries); if (ent->fe_prob < PROB_MAX && - ent->fe_prob < random() % PROB_MAX) { - cont = 1; - goto loop_end; - } + ent->fe_prob < random() % PROB_MAX) + continue; + if (ent->fe_pid != NO_PID && ent->fe_pid != curproc->p_pid) + continue; switch (ent->fe_type) { case FAIL_POINT_PANIC: @@ -239,13 +237,14 @@ fail_point_eval_nontrivial(struct fail_point *fp, int *return_value) /* NOTREACHED */ case FAIL_POINT_RETURN: - if (return_value) + if (return_value != NULL) *return_value = ent->fe_arg; ret = FAIL_POINT_RC_RETURN; break; case FAIL_POINT_BREAK: - printf("fail point %s breaking to debugger\n", fp->fp_name); + printf("fail point %s breaking to debugger\n", + fp->fp_name); breakpoint(); break; @@ -273,13 +272,9 @@ fail_point_eval_nontrivial(struct fail_point *fp, int *return_value) break; } - if (ent && ent->fe_count > 0 && --ent->fe_count == 0) + if (ent != NULL && ent->fe_count > 0 && --ent->fe_count == 0) free_entry(&fp->fp_entries, ent); - -loop_end: - if (cont) - ent = next; - else + if (cont == 0) break; } @@ -290,7 +285,7 @@ loop_end: FP_UNLOCK(); - return ret; + return (ret); } /** @@ -320,9 +315,11 @@ fail_point_get(struct fail_point *fp, struct sbuf *sb) } if (ent->fe_count > 0) sbuf_printf(sb, "%d*", ent->fe_count); - sbuf_printf(sb, "%s", fail_type_strings[ent->fe_type]); + sbuf_printf(sb, "%s", fail_type_strings[ent->fe_type].name); if (ent->fe_arg) sbuf_printf(sb, "(%d)", ent->fe_arg); + if (ent->fe_pid != NO_PID) + sbuf_printf(sb, "[pid %d]", ent->fe_pid); if (TAILQ_NEXT(ent, fe_entries)) sbuf_printf(sb, "->"); } @@ -380,7 +377,7 @@ fail_point_set(struct fail_point *fp, char *buf) fp->fp_name, fp->fp_location, buf); #endif /* IWARNING */ - return error; + return (error); } #define MAX_FAIL_POINT_BUF 1023 @@ -422,9 +419,8 @@ fail_point_sysctl(SYSCTL_HANDLER_ARGS) } out: - if (buf) - fp_free(buf); - return error; + fp_free(buf); + return (error); } /** @@ -437,12 +433,17 @@ parse_fail_point(struct fail_point_entries *ents, char *p) /* <fail_point> :: * <term> ( "->" <term> )* */ - if (!(p = parse_term(ents, p))) - return 0; - while (*p) - if (p[0] != '-' || p[1] != '>' || !(p = parse_term(ents, p+2))) - return 0; - return p; + p = parse_term(ents, p); + if (p == NULL) + return (NULL); + while (*p != '\0') { + if (p[0] != '-' || p[1] != '>') + return (NULL); + p = parse_term(ents, p + 2); + if (p == NULL) + return (NULL); + } + return (p); } /** @@ -455,6 +456,7 @@ parse_term(struct fail_point_entries *ents, char *p) ent = fp_malloc(sizeof *ent, M_WAITOK | M_ZERO); ent->fe_prob = PROB_MAX; + ent->fe_pid = NO_PID; TAILQ_INSERT_TAIL(ents, ent, fe_entries); /* @@ -462,14 +464,16 @@ parse_term(struct fail_point_entries *ents, char *p) * ( (<float> "%") | (<integer> "*" ) )* * <type> 
* [ "(" <integer> ")" ] + * [ "[pid " <integer> "]" ] */ /* ( (<float> "%") | (<integer> "*" ) )* */ - while (('0' <= *p && *p <= '9') || *p == '.') { + while (isdigit(*p) || *p == '.') { int units, decimal; - if (!(p = parse_number(&units, &decimal, p))) - return 0; + p = parse_number(&units, &decimal, p); + if (p == NULL) + return (NULL); if (*p == '%') { if (units > 100) /* prevent overflow early */ @@ -477,37 +481,44 @@ parse_term(struct fail_point_entries *ents, char *p) ent->fe_prob = units * (PROB_MAX / 100) + decimal; if (ent->fe_prob > PROB_MAX) ent->fe_prob = PROB_MAX; - } else if (*p == '*') { if (!units || decimal) - return 0; + return (NULL); ent->fe_count = units; - - } else { - return 0; - } - + } else + return (NULL); p++; } /* <type> */ - if (!(p = parse_type(ent, p))) - return 0; + p = parse_type(ent, p); + if (p == NULL) + return (NULL); if (*p == '\0') - return p; + return (p); /* [ "(" <integer> ")" ] */ if (*p != '(') return p; p++; - if (('0' <= *p && *p <= '9') || *p == '-') - ent->fe_arg = strtol(p, &p, 0); - else - return 0; + if (!isdigit(*p) && *p != '-') + return (NULL); + ent->fe_arg = strtol(p, &p, 0); if (*p++ != ')') - return 0; - - return p; + return (NULL); + + /* [ "[pid " <integer> "]" ] */ +#define PID_STRING "[pid " + if (strncmp(p, PID_STRING, sizeof(PID_STRING) - 1) != 0) + return (p); + p += sizeof(PID_STRING) - 1; + if (!isdigit(*p)) + return (NULL); + ent->fe_pid = strtol(p, &p, 0); + if (*p++ != ']') + return (NULL); + + return (p); } /** @@ -528,14 +539,14 @@ parse_number(int *out_units, int *out_decimal, char *p) old_p = p; *out_units = strtol(p, &p, 10); if (p == old_p && *p != '.') - return 0; + return (NULL); /* fractional part */ *out_decimal = 0; if (*p == '.') { int digits = 0; p++; - while ('0' <= *p && *p <= '9') { + while (isdigit(*p)) { int digit = *p - '0'; if (digits < PROB_DIGITS - 2) *out_decimal = *out_decimal * 10 + digit; @@ -545,12 +556,12 @@ parse_number(int *out_units, int *out_decimal, char *p) p++; } if (!digits) /* need at least one digit after '.' 
*/ - return 0; + return (NULL); while (digits++ < PROB_DIGITS - 2) /* add implicit zeros */ *out_decimal *= 10; } - return p; /* success */ + return (p); /* success */ } /** @@ -560,21 +571,16 @@ static char * parse_type(struct fail_point_entry *ent, char *beg) { enum fail_point_t type; - char *end = beg; - while ('a' <= *end && *end <= 'z') - end++; - if (beg == end) - return 0; - for (type = FAIL_POINT_OFF; type != FAIL_POINT_INVALID; type++) { - const char *p = fail_type_strings[type]; - const char *q = beg; - while (q < end && *p++ == *q++); - if (q == end && *p == '\0') { + int len; + + for (type = FAIL_POINT_OFF; type < FAIL_POINT_NUMTYPES; type++) { + len = fail_type_strings[type].nmlen; + if (strncmp(fail_type_strings[type].name, beg, len) == 0) { ent->fe_type = type; - return end; + return (beg + len); } } - return 0; + return (NULL); } /** @@ -595,6 +601,7 @@ static void clear_entries(struct fail_point_entries *ents) { struct fail_point_entry *ent, *ent_next; + TAILQ_FOREACH_SAFE(ent, ents, fe_entries, ent_next) fp_free(ent); TAILQ_INIT(ents); diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c index 04e635a..9d3e22d 100644 --- a/sys/kern/kern_fork.c +++ b/sys/kern/kern_fork.c @@ -476,7 +476,10 @@ do_fork(struct thread *td, int flags, struct proc *p2, struct thread *td2, sigacts_copy(newsigacts, p1->p_sigacts); p2->p_sigacts = newsigacts; } - if (flags & RFLINUXTHPN) + + if (flags & RFTSIGZMB) + p2->p_sigparent = RFTSIGNUM(flags); + else if (flags & RFLINUXTHPN) p2->p_sigparent = SIGUSR1; else p2->p_sigparent = SIGCHLD; @@ -719,10 +722,22 @@ fork1(struct thread *td, int flags, int pages, struct proc **procp) static int curfail; static struct timeval lastfail; + /* Check for the undefined or unimplemented flags. */ + if ((flags & ~(RFFLAGS | RFTSIGFLAGS(RFTSIGMASK))) != 0) + return (EINVAL); + + /* Signal value requires RFTSIGZMB. */ + if ((flags & RFTSIGFLAGS(RFTSIGMASK)) != 0 && (flags & RFTSIGZMB) == 0) + return (EINVAL); + /* Can't copy and clear. */ if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG)) return (EINVAL); + /* Check the validity of the signal number. */ + if ((flags & RFTSIGZMB) != 0 && (u_int)RFTSIGNUM(flags) > _SIG_MAXSIG) + return (EINVAL); + p1 = td->td_proc; /* @@ -734,11 +749,13 @@ fork1(struct thread *td, int flags, int pages, struct proc **procp) return (fork_norfproc(td, flags)); } +#ifdef RACCT PROC_LOCK(p1); error = racct_add(p1, RACCT_NPROC, 1); PROC_UNLOCK(p1); if (error != 0) return (EAGAIN); +#endif mem_charged = 0; vm2 = NULL; @@ -822,6 +839,7 @@ fork1(struct thread *td, int flags, int pages, struct proc **procp) goto fail; } +#ifdef RACCT /* * After fork, there is exactly one thread running. */ @@ -832,6 +850,7 @@ fork1(struct thread *td, int flags, int pages, struct proc **procp) error = EAGAIN; goto fail; } +#endif /* * Increment the count of procs running with this uid. Don't allow @@ -874,9 +893,11 @@ fail1: vmspace_free(vm2); uma_zfree(proc_zone, newproc); pause("fork", hz / 2); +#ifdef RACCT PROC_LOCK(p1); racct_sub(p1, RACCT_NPROC, 1); PROC_UNLOCK(p1); +#endif return (error); } diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c index 5850ad1..358d673 100644 --- a/sys/kern/kern_jail.c +++ b/sys/kern/kern_jail.c @@ -3639,6 +3639,7 @@ prison_priv_check(struct ucred *cred, int priv) case PRIV_NET_LAGG: case PRIV_NET_GIF: case PRIV_NET_SETIFVNET: + case PRIV_NET_SETIFFIB: /* * 802.11-related privileges. 
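For illustration, the kern_fail.c grammar above now accepts a per-term pid restriction after the optional argument, i.e. <prob%><count*><type>(<arg>)[pid <pid>]. A hedged sketch of a fail point such a term could drive, using the KFAIL_POINT_CODE() macro from sys/fail.h; the fail point name example_io is made up:

#include <sys/fail.h>

static int
example_io(void)
{

	/* RETURN_VALUE is the parsed "(<arg>)" of the matching term. */
	KFAIL_POINT_CODE(DEBUG_FP, example_io, return (RETURN_VALUE));
	/* ... normal I/O path ... */
	return (0);
}

Setting debug.fail_point.example_io to "5%return(35)[pid 1234]" via sysctl would then make the function return 35 (EAGAIN) on roughly 5% of evaluations, but only when evaluated by process 1234.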
diff --git a/sys/kern/kern_racct.c b/sys/kern/kern_racct.c index 01f7777..401ce1d 100644 --- a/sys/kern/kern_racct.c +++ b/sys/kern/kern_racct.c @@ -103,7 +103,7 @@ SDT_PROBE_DEFINE2(racct, kernel, racct, leave, leave, "struct racct *", int racct_types[] = { [RACCT_CPU] = - RACCT_IN_THOUSANDS, + RACCT_IN_MILLIONS, [RACCT_DATA] = RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, [RACCT_STACK] = @@ -141,7 +141,7 @@ int racct_types[] = { [RACCT_SHMSIZE] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_WALLCLOCK] = - RACCT_IN_THOUSANDS }; + RACCT_IN_MILLIONS }; static void racct_add_racct(struct racct *dest, const struct racct *src) @@ -173,7 +173,7 @@ racct_sub_racct(struct racct *dest, const struct racct *src) * Update resource usage in dest. */ for (i = 0; i <= RACCT_MAX; i++) { - if (!racct_is_sloppy(i)) { + if (!RACCT_IS_SLOPPY(i)) { KASSERT(dest->r_resources[i] >= 0, ("racct propagation meltdown: dest < 0")); KASSERT(src->r_resources[i] >= 0, @@ -181,10 +181,10 @@ racct_sub_racct(struct racct *dest, const struct racct *src) KASSERT(src->r_resources[i] <= dest->r_resources[i], ("racct propagation meltdown: src > dest")); } - if (racct_is_reclaimable(i)) { + if (RACCT_IS_RECLAIMABLE(i)) { dest->r_resources[i] -= src->r_resources[i]; if (dest->r_resources[i] < 0) { - KASSERT(racct_is_sloppy(i), + KASSERT(RACCT_IS_SLOPPY(i), ("racct_sub_racct: usage < 0")); dest->r_resources[i] = 0; } @@ -218,9 +218,9 @@ racct_destroy_locked(struct racct **racctp) racct = *racctp; for (i = 0; i <= RACCT_MAX; i++) { - if (racct_is_sloppy(i)) + if (RACCT_IS_SLOPPY(i)) continue; - if (!racct_is_reclaimable(i)) + if (!RACCT_IS_RECLAIMABLE(i)) continue; KASSERT(racct->r_resources[i] == 0, ("destroying non-empty racct: " @@ -255,7 +255,7 @@ racct_alloc_resource(struct racct *racct, int resource, racct->r_resources[resource] += amount; if (racct->r_resources[resource] < 0) { - KASSERT(racct_is_sloppy(resource), + KASSERT(RACCT_IS_SLOPPY(resource), ("racct_alloc_resource: usage < 0")); racct->r_resources[resource] = 0; } @@ -285,7 +285,7 @@ racct_add(struct proc *p, int resource, uint64_t amount) mtx_lock(&racct_lock); #ifdef RCTL error = rctl_enforce(p, resource, amount); - if (error && racct_is_deniable(resource)) { + if (error && RACCT_IS_DENIABLE(resource)) { SDT_PROBE(racct, kernel, rusage, add_failure, p, resource, amount, 0, 0); mtx_unlock(&racct_lock); @@ -373,14 +373,14 @@ racct_set_locked(struct proc *p, int resource, uint64_t amount) diff = amount - p->p_racct->r_resources[resource]; #ifdef notyet - KASSERT(diff >= 0 || racct_is_reclaimable(resource), + KASSERT(diff >= 0 || RACCT_IS_RECLAIMABLE(resource), ("racct_set: usage of non-reclaimable resource %d dropping", resource)); #endif #ifdef RCTL if (diff > 0) { error = rctl_enforce(p, resource, diff); - if (error && racct_is_deniable(resource)) { + if (error && RACCT_IS_DENIABLE(resource)) { SDT_PROBE(racct, kernel, rusage, set_failure, p, resource, amount, 0, 0); return (error); @@ -489,7 +489,7 @@ racct_sub(struct proc *p, int resource, uint64_t amount) * We need proc lock to dereference p->p_ucred. 
*/ PROC_LOCK_ASSERT(p, MA_OWNED); - KASSERT(racct_is_reclaimable(resource), + KASSERT(RACCT_IS_RECLAIMABLE(resource), ("racct_sub: called for non-reclaimable resource %d", resource)); mtx_lock(&racct_lock); @@ -512,7 +512,7 @@ racct_sub_cred_locked(struct ucred *cred, int resource, uint64_t amount) 0, 0); #ifdef notyet - KASSERT(racct_is_reclaimable(resource), + KASSERT(RACCT_IS_RECLAIMABLE(resource), ("racct_sub_cred: called for non-reclaimable resource %d", resource)); #endif @@ -564,7 +564,7 @@ racct_proc_fork(struct proc *parent, struct proc *child) */ for (i = 0; i <= RACCT_MAX; i++) { if (parent->p_racct->r_resources[i] == 0 || - !racct_is_inheritable(i)) + !RACCT_IS_INHERITABLE(i)) continue; error = racct_set_locked(child, i, diff --git a/sys/kern/kern_rctl.c b/sys/kern/kern_rctl.c index 3d0a478..a939758 100644 --- a/sys/kern/kern_rctl.c +++ b/sys/kern/kern_rctl.c @@ -99,17 +99,17 @@ static struct dict subjectnames[] = { { NULL, -1 }}; static struct dict resourcenames[] = { - { "cpu", RACCT_CPU }, - { "data", RACCT_DATA }, - { "stack", RACCT_STACK }, - { "core", RACCT_CORE }, - { "rss", RACCT_RSS }, - { "memlock", RACCT_MEMLOCK }, - { "nproc", RACCT_NPROC }, - { "nofile", RACCT_NOFILE }, - { "vmem", RACCT_VMEM }, - { "npts", RACCT_NPTS }, - { "swap", RACCT_SWAP }, + { "cputime", RACCT_CPU }, + { "datasize", RACCT_DATA }, + { "stacksize", RACCT_STACK }, + { "coredumpsize", RACCT_CORE }, + { "memoryuse", RACCT_RSS }, + { "memorylocked", RACCT_MEMLOCK }, + { "maxproc", RACCT_NPROC }, + { "openfiles", RACCT_NOFILE }, + { "vmemoryuse", RACCT_VMEM }, + { "pseudoterminals", RACCT_NPTS }, + { "swapuse", RACCT_SWAP }, { "nthr", RACCT_NTHR }, { "msgqqueued", RACCT_MSGQQUEUED }, { "msgqsize", RACCT_MSGQSIZE }, @@ -907,7 +907,7 @@ rctl_string_to_rule(char *rulestr, struct rctl_rule **rulep) error = str2int64(amountstr, &rule->rr_amount); if (error != 0) goto out; - if (racct_is_in_thousands(rule->rr_resource)) + if (RACCT_IS_IN_MILLIONS(rule->rr_resource)) rule->rr_amount *= 1000; } @@ -947,7 +947,7 @@ rctl_rule_add(struct rctl_rule *rule) /* * Some rules just don't make sense. Note that the one below - * cannot be rewritten using racct_is_deniable(); the RACCT_PCTCPU, + * cannot be rewritten using RACCT_IS_DENIABLE(); the RACCT_PCTCPU, * for example, is not deniable in the racct sense, but the * limit is enforced in a different way, so "deny" rules for %CPU * do make sense. 
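The kern_rctl.c renames above are user-visible in the rule strings parsed by rctl_string_to_rule(). A few hypothetical rules in the new spelling (the subjects and limits are invented for the example):

static const char *example_rules[] = {
	"process:1234:openfiles:deny=128",	/* formerly "nofile" */
	"user:1001:maxproc:deny=100",		/* formerly "nproc" */
	"jail:www:vmemoryuse:deny=1073741824",	/* formerly "vmem" */
};

Note also that CPU time accounting moves from RACCT_IN_THOUSANDS to RACCT_IN_MILLIONS, which is why rctl_rule_to_sbuf() now divides displayed amounts by 1000000.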
@@ -958,7 +958,7 @@ rctl_rule_add(struct rctl_rule *rule) return (EOPNOTSUPP); if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS && - racct_is_sloppy(rule->rr_resource)) + RACCT_IS_SLOPPY(rule->rr_resource)) return (EOPNOTSUPP); /* @@ -1152,8 +1152,8 @@ rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule) amount = rule->rr_amount; if (amount != RCTL_AMOUNT_UNDEFINED && - racct_is_in_thousands(rule->rr_resource)) - amount /= 1000; + RACCT_IS_IN_MILLIONS(rule->rr_resource)) + amount /= 1000000; sbuf_printf(sb, "%s:%s=%jd", rctl_resource_name(rule->rr_resource), @@ -1219,10 +1219,10 @@ rctl_racct_to_sbuf(struct racct *racct, int sloppy) sb = sbuf_new_auto(); for (i = 0; i <= RACCT_MAX; i++) { - if (sloppy == 0 && racct_is_sloppy(i)) + if (sloppy == 0 && RACCT_IS_SLOPPY(i)) continue; amount = racct->r_resources[i]; - if (racct_is_in_thousands(i)) + if (RACCT_IS_IN_MILLIONS(i)) amount /= 1000; sbuf_printf(sb, "%s=%jd,", rctl_resource_name(i), amount); } diff --git a/sys/kern/kern_rmlock.c b/sys/kern/kern_rmlock.c index 3214e1b..1c7337d 100644 --- a/sys/kern/kern_rmlock.c +++ b/sys/kern/kern_rmlock.c @@ -263,7 +263,7 @@ _rm_rlock_hard(struct rmlock *rm, struct rm_priotracker *tracker, int trylock) pc = pcpu_find(curcpu); /* Check if we just need to do a proper critical_exit. */ - if (!CPU_OVERLAP(&pc->pc_cpumask, &rm->rm_writecpus)) { + if (!CPU_ISSET(pc->pc_cpuid, &rm->rm_writecpus)) { critical_exit(); return (1); } @@ -325,7 +325,7 @@ _rm_rlock_hard(struct rmlock *rm, struct rm_priotracker *tracker, int trylock) critical_enter(); pc = pcpu_find(curcpu); - CPU_NAND(&rm->rm_writecpus, &pc->pc_cpumask); + CPU_CLR(pc->pc_cpuid, &rm->rm_writecpus); rm_tracker_add(pc, tracker); sched_pin(); critical_exit(); @@ -367,7 +367,7 @@ _rm_rlock(struct rmlock *rm, struct rm_priotracker *tracker, int trylock) * conditional jump. */ if (0 == (td->td_owepreempt | - CPU_OVERLAP(&rm->rm_writecpus, &pc->pc_cpumask))) + CPU_ISSET(pc->pc_cpuid, &rm->rm_writecpus))) return (1); /* We do not have a read token and need to acquire one. */ diff --git a/sys/kern/kern_tc.c b/sys/kern/kern_tc.c index 39d6f23..0c52071 100644 --- a/sys/kern/kern_tc.c +++ b/sys/kern/kern_tc.c @@ -492,6 +492,12 @@ tc_windup(void) /* Now is a good time to change timecounters. 
*/ if (th->th_counter != timecounter) { +#ifndef __arm__ + if ((timecounter->tc_flags & TC_FLAGS_C3STOP) != 0) + cpu_disable_deep_sleep++; + if ((th->th_counter->tc_flags & TC_FLAGS_C3STOP) != 0) + cpu_disable_deep_sleep--; +#endif th->th_counter = timecounter; th->th_offset_count = ncount; tc_min_ticktock_freq = max(1, timecounter->tc_frequency / diff --git a/sys/kern/kern_thr.c b/sys/kern/kern_thr.c index 7011a53..94e41e2 100644 --- a/sys/kern/kern_thr.c +++ b/sys/kern/kern_thr.c @@ -185,11 +185,13 @@ create_thread(struct thread *td, mcontext_t *ctx, } } +#ifdef RACCT PROC_LOCK(td->td_proc); error = racct_add(p, RACCT_NTHR, 1); PROC_UNLOCK(td->td_proc); if (error != 0) return (EPROCLIM); +#endif /* Initialize our td */ newtd = thread_alloc(0); @@ -277,9 +279,11 @@ create_thread(struct thread *td, mcontext_t *ctx, return (0); fail: +#ifdef RACCT PROC_LOCK(p); racct_sub(p, RACCT_NTHR, 1); PROC_UNLOCK(p); +#endif return (error); } diff --git a/sys/kern/link_elf.c b/sys/kern/link_elf.c index 38bf37f..2f9a1f6 100644 --- a/sys/kern/link_elf.c +++ b/sys/kern/link_elf.c @@ -950,11 +950,11 @@ link_elf_load_file(linker_class_t cls, const char* filename, ef->ddbstrcnt = strcnt; ef->ddbstrtab = ef->strbase; +nosyms: error = link_elf_link_common_finish(lf); if (error != 0) goto out; -nosyms: *result = lf; out: diff --git a/sys/kern/sched_4bsd.c b/sys/kern/sched_4bsd.c index 592bb80..574755f0 100644 --- a/sys/kern/sched_4bsd.c +++ b/sys/kern/sched_4bsd.c @@ -951,8 +951,7 @@ sched_switch(struct thread *td, struct thread *newtd, int flags) if (td->td_flags & TDF_IDLETD) { TD_SET_CAN_RUN(td); #ifdef SMP - /* Spinlock held here, assume no migration. */ - CPU_NAND(&idle_cpus_mask, PCPU_PTR(cpumask)); + CPU_CLR(PCPU_GET(cpuid), &idle_cpus_mask); #endif } else { if (TD_IS_RUNNING(td)) { @@ -1026,7 +1025,7 @@ sched_switch(struct thread *td, struct thread *newtd, int flags) #ifdef SMP if (td->td_flags & TDF_IDLETD) - CPU_OR(&idle_cpus_mask, PCPU_PTR(cpumask)); + CPU_SET(PCPU_GET(cpuid), &idle_cpus_mask); #endif sched_lock.mtx_lock = (uintptr_t)td; td->td_oncpu = PCPU_GET(cpuid); @@ -1055,7 +1054,8 @@ static int forward_wakeup(int cpunum) { struct pcpu *pc; - cpuset_t dontuse, id, map, map2, me; + cpuset_t dontuse, map, map2; + u_int id, me; int iscpuset; mtx_assert(&sched_lock, MA_OWNED); @@ -1073,27 +1073,24 @@ forward_wakeup(int cpunum) /* * Check the idle mask we received against what we calculated * before in the old version. - * - * Also note that sched_lock is held now, thus no migration is - * expected. */ - me = PCPU_GET(cpumask); + me = PCPU_GET(cpuid); /* Don't bother if we should be doing it ourself. 
*/ - if (CPU_OVERLAP(&me, &idle_cpus_mask) && - (cpunum == NOCPU || CPU_ISSET(cpunum, &me))) + if (CPU_ISSET(me, &idle_cpus_mask) && + (cpunum == NOCPU || me == cpunum)) return (0); - dontuse = me; + CPU_SETOF(me, &dontuse); CPU_OR(&dontuse, &stopped_cpus); CPU_OR(&dontuse, &hlt_cpus_mask); CPU_ZERO(&map2); if (forward_wakeup_use_loop) { STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { - id = pc->pc_cpumask; - if (!CPU_OVERLAP(&id, &dontuse) && + id = pc->pc_cpuid; + if (!CPU_ISSET(id, &dontuse) && pc->pc_curthread == pc->pc_idlethread) { - CPU_OR(&map2, &id); + CPU_SET(id, &map2); } } } @@ -1125,11 +1122,11 @@ forward_wakeup(int cpunum) if (!CPU_EMPTY(&map)) { forward_wakeups_delivered++; STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { - id = pc->pc_cpumask; - if (!CPU_OVERLAP(&map, &id)) + id = pc->pc_cpuid; + if (!CPU_ISSET(id, &map)) continue; if (cpu_idle_wakeup(pc->pc_cpuid)) - CPU_NAND(&map, &id); + CPU_CLR(id, &map); } if (!CPU_EMPTY(&map)) ipi_selected(map, IPI_AST); @@ -1147,7 +1144,7 @@ kick_other_cpu(int pri, int cpuid) int cpri; pcpu = pcpu_find(cpuid); - if (CPU_OVERLAP(&idle_cpus_mask, &pcpu->pc_cpumask)) { + if (CPU_ISSET(cpuid, &idle_cpus_mask)) { forward_wakeups_delivered++; if (!cpu_idle_wakeup(cpuid)) ipi_cpu(cpuid, IPI_AST); @@ -1205,10 +1202,10 @@ void sched_add(struct thread *td, int flags) #ifdef SMP { - cpuset_t idle, me, tidlemsk; + cpuset_t tidlemsk; struct td_sched *ts; + u_int cpu, cpuid; int forwarded = 0; - int cpu; int single_cpu = 0; ts = td->td_sched; @@ -1271,23 +1268,17 @@ sched_add(struct thread *td, int flags) ts->ts_runq = &runq; } - if (single_cpu && (cpu != PCPU_GET(cpuid))) { + cpuid = PCPU_GET(cpuid); + if (single_cpu && cpu != cpuid) { kick_other_cpu(td->td_priority, cpu); } else { if (!single_cpu) { + tidlemsk = idle_cpus_mask; + CPU_NAND(&tidlemsk, &hlt_cpus_mask); + CPU_CLR(cpuid, &tidlemsk); - /* - * Thread spinlock is held here, assume no - * migration is possible. - */ - me = PCPU_GET(cpumask); - idle = idle_cpus_mask; - tidlemsk = idle; - CPU_AND(&idle, &me); - CPU_OR(&me, &hlt_cpus_mask); - CPU_NAND(&tidlemsk, &me); - - if (CPU_EMPTY(&idle) && ((flags & SRQ_INTR) == 0) && + if (!CPU_ISSET(cpuid, &idle_cpus_mask) && + ((flags & SRQ_INTR) == 0) && !CPU_EMPTY(&tidlemsk)) forwarded = forward_wakeup(cpu); } diff --git a/sys/kern/subr_kdb.c b/sys/kern/subr_kdb.c index c2f6e99..f5cb31e 100644 --- a/sys/kern/subr_kdb.c +++ b/sys/kern/subr_kdb.c @@ -88,20 +88,6 @@ SYSCTL_PROC(_debug_kdb, OID_AUTO, trap_code, CTLTYPE_INT | CTLFLAG_RW, NULL, 0, kdb_sysctl_trap_code, "I", "set to cause a page fault via code access"); /* - * Flag indicating whether or not to IPI the other CPUs to stop them on - * entering the debugger. Sometimes, this will result in a deadlock as - * stop_cpus() waits for the other cpus to stop, so we allow it to be - * disabled. In order to maximize the chances of success, use a hard - * stop for that. - */ -#ifdef SMP -static int kdb_stop_cpus = 1; -SYSCTL_INT(_debug_kdb, OID_AUTO, stop_cpus, CTLFLAG_RW | CTLFLAG_TUN, - &kdb_stop_cpus, 0, "stop other CPUs when entering the debugger"); -TUNABLE_INT("debug.kdb.stop_cpus", &kdb_stop_cpus); -#endif - -/* * Flag to indicate to debuggers why the debugger was entered. 
*/ const char * volatile kdb_why = KDB_WHY_UNSET; @@ -211,9 +197,12 @@ kdb_sysctl_trap_code(SYSCTL_HANDLER_ARGS) void kdb_panic(const char *msg) { - #ifdef SMP - stop_cpus_hard(PCPU_GET(other_cpus)); + cpuset_t other_cpus; + + other_cpus = all_cpus; + CPU_CLR(PCPU_GET(cpuid), &other_cpus); + stop_cpus_hard(other_cpus); #endif printf("KDB: panic\n"); panic("%s", msg); @@ -429,7 +418,7 @@ kdb_thr_ctx(struct thread *thr) #if defined(SMP) && defined(KDB_STOPPEDPCB) STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { if (pc->pc_curthread == thr && - CPU_OVERLAP(&stopped_cpus, &pc->pc_cpumask)) + CPU_ISSET(pc->pc_cpuid, &stopped_cpus)) return (KDB_STOPPEDPCB(pc)); } #endif @@ -513,11 +502,11 @@ kdb_thr_select(struct thread *thr) int kdb_trap(int type, int code, struct trapframe *tf) { - struct kdb_dbbe *be; - register_t intr; #ifdef SMP - int did_stop_cpus; + cpuset_t other_cpus; #endif + struct kdb_dbbe *be; + register_t intr; int handled; be = kdb_dbbe; @@ -531,8 +520,9 @@ kdb_trap(int type, int code, struct trapframe *tf) intr = intr_disable(); #ifdef SMP - if ((did_stop_cpus = kdb_stop_cpus) != 0) - stop_cpus_hard(PCPU_GET(other_cpus)); + other_cpus = all_cpus; + CPU_CLR(PCPU_GET(cpuid), &other_cpus); + stop_cpus_hard(other_cpus); #endif kdb_active++; @@ -558,8 +548,7 @@ kdb_trap(int type, int code, struct trapframe *tf) kdb_active--; #ifdef SMP - if (did_stop_cpus) - restart_cpus(stopped_cpus); + restart_cpus(stopped_cpus); #endif intr_restore(intr); diff --git a/sys/kern/subr_pcpu.c b/sys/kern/subr_pcpu.c index a6b3ae0..ec6b590 100644 --- a/sys/kern/subr_pcpu.c +++ b/sys/kern/subr_pcpu.c @@ -87,7 +87,6 @@ pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) KASSERT(cpuid >= 0 && cpuid < MAXCPU, ("pcpu_init: invalid cpuid %d", cpuid)); pcpu->pc_cpuid = cpuid; - CPU_SETOF(cpuid, &pcpu->pc_cpumask); cpuid_to_pcpu[cpuid] = pcpu; STAILQ_INSERT_TAIL(&cpuhead, pcpu, pc_allcpu); cpu_pcpu_init(pcpu, cpuid, size); diff --git a/sys/kern/subr_smp.c b/sys/kern/subr_smp.c index c38177b..caec965 100644 --- a/sys/kern/subr_smp.c +++ b/sys/kern/subr_smp.c @@ -142,7 +142,7 @@ mp_start(void *dummy) /* Probe for MP hardware. */ if (smp_disabled != 0 || cpu_mp_probe() == 0) { mp_ncpus = 1; - all_cpus = PCPU_GET(cpumask); + CPU_SETOF(PCPU_GET(cpuid), &all_cpus); return; } @@ -236,12 +236,10 @@ generic_stop_cpus(cpuset_t map, u_int type) /* spin */ cpu_spinwait(); i++; -#ifdef DIAGNOSTIC - if (i == 100000) { + if (i == 100000000) { printf("timeout stopping cpus\n"); break; } -#endif } stopping_cpu = NOCPU; @@ -708,7 +706,7 @@ mp_setvariables_for_up(void *dummy) { mp_ncpus = 1; mp_maxid = PCPU_GET(cpuid); - all_cpus = PCPU_GET(cpumask); + CPU_SETOF(mp_maxid, &all_cpus); KASSERT(PCPU_GET(cpuid) == 0, ("UP must have a CPU ID of zero")); } SYSINIT(cpu_mp_setvariables, SI_SUB_TUNABLES, SI_ORDER_FIRST, diff --git a/sys/kern/subr_trap.c b/sys/kern/subr_trap.c index 0113d7b..3527ed1 100644 --- a/sys/kern/subr_trap.c +++ b/sys/kern/subr_trap.c @@ -44,7 +44,7 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); -#include "opt_capabilities.h" +#include "opt_capsicum.h" #include "opt_ktrace.h" #include "opt_kdtrace.h" #include "opt_sched.h" @@ -313,7 +313,7 @@ syscallenter(struct thread *td, struct syscall_args *sa) goto retval; } -#ifdef CAPABILITIES +#ifdef CAPABILITY_MODE /* * In capability mode, we only allow access to system calls * flagged with SYF_CAPENABLED. 
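The syscallenter() check that the renamed option guards can be paraphrased as the helper below; syscall_capcheck() is a hypothetical factoring, not how the commit structures it:

static int
syscall_capcheck(struct thread *td, struct sysent *callp)
{

	/* In capability mode, only SYF_CAPENABLED syscalls may proceed. */
	if (IN_CAPABILITY_MODE(td) &&
	    (callp->sy_flags & SYF_CAPENABLED) == 0)
		return (ECAPMODE);
	return (0);
}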
diff --git a/sys/kern/subr_uio.c b/sys/kern/subr_uio.c index 9385dc4..6e81328 100644 --- a/sys/kern/subr_uio.c +++ b/sys/kern/subr_uio.c @@ -64,6 +64,8 @@ __FBSDID("$FreeBSD$"); SYSCTL_INT(_kern, KERN_IOV_MAX, iov_max, CTLFLAG_RD, NULL, UIO_MAXIOV, "Maximum number of elements in an I/O vector; sysconf(_SC_IOV_MAX)"); +static int uiomove_faultflag(void *cp, int n, struct uio *uio, int nofault); + #ifdef ZERO_COPY_SOCKETS /* Declared in uipc_socket.c */ extern int so_zero_copy_receive; @@ -129,23 +131,65 @@ retry: #endif /* ZERO_COPY_SOCKETS */ int +copyin_nofault(const void *udaddr, void *kaddr, size_t len) +{ + int error, save; + + save = vm_fault_disable_pagefaults(); + error = copyin(udaddr, kaddr, len); + vm_fault_enable_pagefaults(save); + return (error); +} + +int +copyout_nofault(const void *kaddr, void *udaddr, size_t len) +{ + int error, save; + + save = vm_fault_disable_pagefaults(); + error = copyout(kaddr, udaddr, len); + vm_fault_enable_pagefaults(save); + return (error); +} + +int uiomove(void *cp, int n, struct uio *uio) { - struct thread *td = curthread; + + return (uiomove_faultflag(cp, n, uio, 0)); +} + +int +uiomove_nofault(void *cp, int n, struct uio *uio) +{ + + return (uiomove_faultflag(cp, n, uio, 1)); +} + +static int +uiomove_faultflag(void *cp, int n, struct uio *uio, int nofault) +{ + struct thread *td; struct iovec *iov; u_int cnt; - int error = 0; - int save = 0; + int error, newflags, save; + + td = curthread; + error = 0; KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE, ("uiomove: mode")); - KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread, + KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == td, ("uiomove proc")); - WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, - "Calling uiomove()"); + if (!nofault) + WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, + "Calling uiomove()"); - save = td->td_pflags & TDP_DEADLKTREAT; - td->td_pflags |= TDP_DEADLKTREAT; + /* XXX does it make a sense to set TDP_DEADLKTREAT for UIO_SYSSPACE ? */ + newflags = TDP_DEADLKTREAT; + if (uio->uio_segflg == UIO_USERSPACE && nofault) + newflags |= TDP_NOFAULTING; + save = curthread_pflags_set(newflags); while (n > 0 && uio->uio_resid) { iov = uio->uio_iov; @@ -187,8 +231,7 @@ uiomove(void *cp, int n, struct uio *uio) n -= cnt; } out: - if (save == 0) - td->td_pflags &= ~TDP_DEADLKTREAT; + curthread_pflags_restore(save); return (error); } diff --git a/sys/kern/sys_capability.c b/sys/kern/sys_capability.c index 89dc923..04f98d8 100644 --- a/sys/kern/sys_capability.c +++ b/sys/kern/sys_capability.c @@ -36,7 +36,7 @@ * */ -#include "opt_capabilities.h" +#include "opt_capsicum.h" #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); @@ -59,18 +59,11 @@ __FBSDID("$FreeBSD$"); #include <vm/uma.h> #include <vm/vm.h> -#ifdef CAPABILITIES +#ifdef CAPABILITY_MODE FEATURE(security_capabilities, "Capsicum Capability Mode"); /* - * We don't currently have any MIB entries for sysctls, but we do expose - * security.capabilities so that it's easy to tell if options CAPABILITIES is - * compiled into the kernel. - */ -SYSCTL_NODE(_security, OID_AUTO, capabilities, CTLFLAG_RW, 0, "Capsicum"); - -/* * System call to enter capability mode for the process. 
*/ int @@ -106,7 +99,7 @@ cap_getmode(struct thread *td, struct cap_getmode_args *uap) return (copyout(&i, uap->modep, sizeof(i))); } -#else /* !CAPABILITIES */ +#else /* !CAPABILITY_MODE */ int cap_enter(struct thread *td, struct cap_enter_args *uap) @@ -122,4 +115,403 @@ cap_getmode(struct thread *td, struct cap_getmode_args *uap) return (ENOSYS); } +#endif /* CAPABILITY_MODE */ + +#ifdef CAPABILITIES + +/* + * struct capability describes a capability, and is hung off of its struct + * file f_data field. cap_file and cap_rightss are static once hooked up, as + * neither the object it references nor the rights it encapsulates are + * permitted to change. + */ +struct capability { + struct file *cap_object; /* Underlying object's file. */ + struct file *cap_file; /* Back-pointer to cap's file. */ + cap_rights_t cap_rights; /* Mask of rights on object. */ +}; + +/* + * Capabilities have a fileops vector, but in practice none should ever be + * called except for fo_close, as the capability will normally not be + * returned during a file descriptor lookup in the system call code. + */ +static fo_rdwr_t capability_read; +static fo_rdwr_t capability_write; +static fo_truncate_t capability_truncate; +static fo_ioctl_t capability_ioctl; +static fo_poll_t capability_poll; +static fo_kqfilter_t capability_kqfilter; +static fo_stat_t capability_stat; +static fo_close_t capability_close; + +static struct fileops capability_ops = { + .fo_read = capability_read, + .fo_write = capability_write, + .fo_truncate = capability_truncate, + .fo_ioctl = capability_ioctl, + .fo_poll = capability_poll, + .fo_kqfilter = capability_kqfilter, + .fo_stat = capability_stat, + .fo_close = capability_close, + .fo_flags = DFLAG_PASSABLE, +}; + +static struct fileops capability_ops_unpassable = { + .fo_read = capability_read, + .fo_write = capability_write, + .fo_truncate = capability_truncate, + .fo_ioctl = capability_ioctl, + .fo_poll = capability_poll, + .fo_kqfilter = capability_kqfilter, + .fo_stat = capability_stat, + .fo_close = capability_close, + .fo_flags = 0, +}; + +static uma_zone_t capability_zone; + +static void +capability_init(void *dummy __unused) +{ + + capability_zone = uma_zcreate("capability", sizeof(struct capability), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); + if (capability_zone == NULL) + panic("capability_init: capability_zone not initialized"); +} +SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, capability_init, NULL); + +/* + * Test whether a capability grants the requested rights. + */ +static int +cap_check(struct capability *c, cap_rights_t rights) +{ + + if ((c->cap_rights | rights) != c->cap_rights) + return (ENOTCAPABLE); + return (0); +} + +/* + * Extract rights from a capability for monitoring purposes -- not for use in + * any other way, as we want to keep all capability permission evaluation in + * this one file. + */ +cap_rights_t +cap_rights(struct file *fp_cap) +{ + struct capability *c; + + KASSERT(fp_cap->f_type == DTYPE_CAPABILITY, + ("cap_rights: !capability")); + + c = fp_cap->f_data; + return (c->cap_rights); +} + +/* + * System call to create a new capability reference to either an existing + * file object or an an existing capability. 
+ */ +int +cap_new(struct thread *td, struct cap_new_args *uap) +{ + int error, capfd; + int fd = uap->fd; + struct file *fp, *fcapp; + cap_rights_t rights = uap->rights; + + AUDIT_ARG_FD(fd); +#ifdef notyet /* capability auditing will follow in a few commits */ + AUDIT_ARG_RIGHTS(rights); +#endif + error = fget(td, fd, &fp); + if (error) + return (error); + AUDIT_ARG_FILE(td->td_proc, fp); + error = kern_capwrap(td, fp, rights, &fcapp, &capfd); + if (error) + return (error); + + /* + * Release our reference to the file (kern_capwrap has held a reference + * for the filedesc array). + */ + fdrop(fp, td); + td->td_retval[0] = capfd; + return (0); +} + +/* + * System call to query the rights mask associated with a capability. + */ +int +cap_getrights(struct thread *td, struct cap_getrights_args *uap) +{ + struct capability *cp; + struct file *fp; + int error; + + AUDIT_ARG_FD(uap->fd); + error = fgetcap(td, uap->fd, &fp); + if (error) + return (error); + cp = fp->f_data; + error = copyout(&cp->cap_rights, uap->rightsp, sizeof(*uap->rightsp)); + fdrop(fp, td); + return (error); +} + +/* + * Create a capability to wrap around an existing file. + */ +int +kern_capwrap(struct thread *td, struct file *fp, cap_rights_t rights, + struct file **fcappp, int *capfdp) +{ + struct capability *cp, *cp_old; + struct file *fp_object; + int error; + + if ((rights | CAP_MASK_VALID) != CAP_MASK_VALID) + return (EINVAL); + + /* + * If a new capability is being derived from an existing capability, + * then the new capability rights must be a subset of the existing + * rights. + */ + if (fp->f_type == DTYPE_CAPABILITY) { + cp_old = fp->f_data; + if ((cp_old->cap_rights | rights) != cp_old->cap_rights) + return (ENOTCAPABLE); + } + + /* + * Allocate a new file descriptor to hang the capability off of. + */ + error = falloc(td, fcappp, capfdp, fp->f_flag); + if (error) + return (error); + + /* + * Rather than nesting capabilities, directly reference the object an + * existing capability references. There's nothing else interesting + * to preserve for future use, as we've incorporated the previous + * rights mask into the new one. This prevents us from having to + * deal with capability chains. + */ + if (fp->f_type == DTYPE_CAPABILITY) + fp_object = ((struct capability *)fp->f_data)->cap_object; + else + fp_object = fp; + fhold(fp_object); + cp = uma_zalloc(capability_zone, M_WAITOK | M_ZERO); + cp->cap_rights = rights; + cp->cap_object = fp_object; + cp->cap_file = *fcappp; + if (fp->f_flag & DFLAG_PASSABLE) + finit(*fcappp, fp->f_flag, DTYPE_CAPABILITY, cp, + &capability_ops); + else + finit(*fcappp, fp->f_flag, DTYPE_CAPABILITY, cp, + &capability_ops_unpassable); + + /* + * Release our private reference (the proc filedesc still has one). + */ + fdrop(*fcappp, td); + return (0); +} + +/* + * Given a file descriptor, test it against a capability rights mask and then + * return the file descriptor on which to actually perform the requested + * operation. As long as the reference to fp_cap remains valid, the returned + * pointer in *fp will remain valid, so no extra reference management is + * required, and the caller should fdrop() fp_cap as normal when done with + * both. 
+ */ +int +cap_funwrap(struct file *fp_cap, cap_rights_t rights, struct file **fpp) +{ + struct capability *c; + int error; + + if (fp_cap->f_type != DTYPE_CAPABILITY) { + *fpp = fp_cap; + return (0); + } + c = fp_cap->f_data; + error = cap_check(c, rights); + if (error) + return (error); + *fpp = c->cap_object; + return (0); +} + +/* + * Slightly different routine for memory mapping file descriptors: unwrap the + * capability and check CAP_MMAP, but also return a bitmask representing the + * maximum mapping rights the capability allows on the object. + */ +int +cap_funwrap_mmap(struct file *fp_cap, cap_rights_t rights, u_char *maxprotp, + struct file **fpp) +{ + struct capability *c; + u_char maxprot; + int error; + + if (fp_cap->f_type != DTYPE_CAPABILITY) { + *fpp = fp_cap; + *maxprotp = VM_PROT_ALL; + return (0); + } + c = fp_cap->f_data; + error = cap_check(c, rights | CAP_MMAP); + if (error) + return (error); + *fpp = c->cap_object; + maxprot = 0; + if (c->cap_rights & CAP_READ) + maxprot |= VM_PROT_READ; + if (c->cap_rights & CAP_WRITE) + maxprot |= VM_PROT_WRITE; + if (c->cap_rights & CAP_MAPEXEC) + maxprot |= VM_PROT_EXECUTE; + *maxprotp = maxprot; + return (0); +} + +/* + * When a capability is closed, simply drop the reference on the underlying + * object and free the capability. fdrop() will handle the case where the + * underlying object also needs to close, and the caller will have already + * performed any object-specific lock or mqueue handling. + */ +static int +capability_close(struct file *fp, struct thread *td) +{ + struct capability *c; + struct file *fp_object; + + KASSERT(fp->f_type == DTYPE_CAPABILITY, + ("capability_close: !capability")); + + c = fp->f_data; + fp->f_ops = &badfileops; + fp->f_data = NULL; + fp_object = c->cap_object; + uma_zfree(capability_zone, c); + return (fdrop(fp_object, td)); +} + +/* + * In general, file descriptor operations should never make it to the + * capability, only the underlying file descriptor operation vector, so panic + * if we do turn up here. + */ +static int +capability_read(struct file *fp, struct uio *uio, struct ucred *active_cred, + int flags, struct thread *td) +{ + + panic("capability_read"); +} + +static int +capability_write(struct file *fp, struct uio *uio, struct ucred *active_cred, + int flags, struct thread *td) +{ + + panic("capability_write"); +} + +static int +capability_truncate(struct file *fp, off_t length, struct ucred *active_cred, + struct thread *td) +{ + + panic("capability_truncate"); +} + +static int +capability_ioctl(struct file *fp, u_long com, void *data, + struct ucred *active_cred, struct thread *td) +{ + + panic("capability_ioctl"); +} + +static int +capability_poll(struct file *fp, int events, struct ucred *active_cred, + struct thread *td) +{ + + panic("capability_poll"); +} + +static int +capability_kqfilter(struct file *fp, struct knote *kn) +{ + + panic("capability_kqfilter"); +} + +static int +capability_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, + struct thread *td) +{ + + panic("capability_stat"); +} + +#else /* !CAPABILITIES */ + +/* + * Stub Capability functions for when options CAPABILITIES isn't compiled + * into the kernel. 
+ */ +int +cap_new(struct thread *td, struct cap_new_args *uap) +{ + + return (ENOSYS); +} + +int +cap_getrights(struct thread *td, struct cap_getrights_args *uap) +{ + + return (ENOSYS); +} + +int +cap_funwrap(struct file *fp_cap, cap_rights_t rights, struct file **fpp) +{ + + KASSERT(fp_cap->f_type != DTYPE_CAPABILITY, + ("cap_funwrap: saw capability")); + + *fpp = fp_cap; + return (0); +} + +int +cap_funwrap_mmap(struct file *fp_cap, cap_rights_t rights, u_char *maxprotp, + struct file **fpp) +{ + + KASSERT(fp_cap->f_type != DTYPE_CAPABILITY, + ("cap_funwrap_mmap: saw capability")); + + *fpp = fp_cap; + *maxprotp = VM_PROT_ALL; + return (0); +} + #endif /* CAPABILITIES */ + diff --git a/sys/kern/sys_process.c b/sys/kern/sys_process.c index a4c0069..ee36b35 100644 --- a/sys/kern/sys_process.c +++ b/sys/kern/sys_process.c @@ -829,6 +829,15 @@ kern_ptrace(struct thread *td, int req, pid_t pid, void *addr, int data) case PT_ATTACH: /* security check done above */ + /* + * It would be nice if the tracing relationship was separate + * from the parent relationship but that would require + * another set of links in the proc struct or for "wait" + * to scan the entire proc table. To make life easier, + * we just re-parent the process we're trying to trace. + * The old parent is remembered so we can put things back + * on a "detach". + */ p->p_flag |= P_TRACED; p->p_oppid = p->p_pptr->p_pid; if (p->p_pptr != td->td_proc) { diff --git a/sys/kern/syscalls.c b/sys/kern/syscalls.c index 29a6485..abd9484 100644 --- a/sys/kern/syscalls.c +++ b/sys/kern/syscalls.c @@ -521,8 +521,8 @@ const char *syscallnames[] = { "msgctl", /* 511 = msgctl */ "shmctl", /* 512 = shmctl */ "lpathconf", /* 513 = lpathconf */ - "#514", /* 514 = cap_new */ - "#515", /* 515 = cap_getrights */ + "cap_new", /* 514 = cap_new */ + "cap_getrights", /* 515 = cap_getrights */ "cap_enter", /* 516 = cap_enter */ "cap_getmode", /* 517 = cap_getmode */ "#518", /* 518 = pdfork */ diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master index af958c9..0b249a5 100644 --- a/sys/kern/syscalls.master +++ b/sys/kern/syscalls.master @@ -914,8 +914,9 @@ 512 AUE_SHMCTL NOSTD { int shmctl(int shmid, int cmd, \ struct shmid_ds *buf); } 513 AUE_LPATHCONF STD { int lpathconf(char *path, int name); } -514 AUE_CAP_NEW UNIMPL cap_new -515 AUE_CAP_GETRIGHTS UNIMPL cap_getrights +514 AUE_CAP_NEW STD { int cap_new(int fd, u_int64_t rights); } +515 AUE_CAP_GETRIGHTS STD { int cap_getrights(int fd, \ + u_int64_t *rightsp); } 516 AUE_CAP_ENTER STD { int cap_enter(void); } 517 AUE_CAP_GETMODE STD { int cap_getmode(u_int *modep); } 518 AUE_PDFORK UNIMPL pdfork diff --git a/sys/kern/systrace_args.c b/sys/kern/systrace_args.c index 880b46b..f57777f 100644 --- a/sys/kern/systrace_args.c +++ b/sys/kern/systrace_args.c @@ -3096,6 +3096,22 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args) *n_args = 2; break; } + /* cap_new */ + case 514: { + struct cap_new_args *p = params; + iarg[0] = p->fd; /* int */ + uarg[1] = p->rights; /* u_int64_t */ + *n_args = 2; + break; + } + /* cap_getrights */ + case 515: { + struct cap_getrights_args *p = params; + iarg[0] = p->fd; /* int */ + uarg[1] = (intptr_t) p->rightsp; /* u_int64_t * */ + *n_args = 2; + break; + } /* cap_enter */ case 516: { *n_args = 0; @@ -8326,6 +8342,32 @@ systrace_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) break; }; break; + /* cap_new */ + case 514: + switch(ndx) { + case 0: + p = "int"; + break; + case 1: + p = "u_int64_t"; + break; + default: + break; + }; + 
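+		/*
+		 * These type strings describe the syscall arguments to the
+		 * DTrace systrace provider; an illustrative one-liner that
+		 * exercises them:
+		 *
+		 *	dtrace -n 'syscall::cap_new:entry
+		 *	    { trace(arg0); trace(arg1); }'
+		 */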
break; + /* cap_getrights */ + case 515: + switch(ndx) { + case 0: + p = "int"; + break; + case 1: + p = "u_int64_t *"; + break; + default: + break; + }; + break; /* cap_enter */ case 516: break; diff --git a/sys/kern/sysv_msg.c b/sys/kern/sysv_msg.c index 87d479e..ffd8580 100644 --- a/sys/kern/sysv_msg.c +++ b/sys/kern/sysv_msg.c @@ -620,6 +620,7 @@ msgget(td, uap) error = ENOSPC; goto done2; } +#ifdef RACCT PROC_LOCK(td->td_proc); error = racct_add(td->td_proc, RACCT_NMSGQ, 1); PROC_UNLOCK(td->td_proc); @@ -627,6 +628,7 @@ msgget(td, uap) error = ENOSPC; goto done2; } +#endif DPRINTF(("msqid %d is available\n", msqid)); msqkptr->u.msg_perm.key = key; msqkptr->u.msg_perm.cuid = cred->cr_uid; @@ -685,7 +687,9 @@ kern_msgsnd(td, msqid, msgp, msgsz, msgflg, mtype) register struct msqid_kernel *msqkptr; register struct msg *msghdr; short next; +#ifdef RACCT size_t saved_msgsz; +#endif if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); @@ -723,6 +727,7 @@ kern_msgsnd(td, msqid, msgp, msgsz, msgflg, mtype) goto done2; #endif +#ifdef RACCT PROC_LOCK(td->td_proc); if (racct_add(td->td_proc, RACCT_MSGQQUEUED, 1)) { PROC_UNLOCK(td->td_proc); @@ -737,6 +742,7 @@ kern_msgsnd(td, msqid, msgp, msgsz, msgflg, mtype) goto done2; } PROC_UNLOCK(td->td_proc); +#endif segs_needed = (msgsz + msginfo.msgssz - 1) / msginfo.msgssz; DPRINTF(("msgsz=%zu, msgssz=%d, segs_needed=%d\n", msgsz, @@ -991,12 +997,14 @@ kern_msgsnd(td, msqid, msgp, msgsz, msgflg, mtype) wakeup(msqkptr); td->td_retval[0] = 0; done3: +#ifdef RACCT if (error != 0) { PROC_LOCK(td->td_proc); racct_sub(td->td_proc, RACCT_MSGQQUEUED, 1); racct_sub(td->td_proc, RACCT_MSGQSIZE, saved_msgsz); PROC_UNLOCK(td->td_proc); } +#endif done2: mtx_unlock(&msq_mtx); return (error); diff --git a/sys/kern/sysv_sem.c b/sys/kern/sysv_sem.c index ac53a8d..4a4c479 100644 --- a/sys/kern/sysv_sem.c +++ b/sys/kern/sysv_sem.c @@ -149,9 +149,6 @@ struct sem_undo { #endif /* shouldn't need tuning */ -#ifndef SEMMAP -#define SEMMAP 30 /* # of entries in semaphore map */ -#endif #ifndef SEMMSL #define SEMMSL SEMMNS /* max # of semaphores per id */ #endif @@ -182,7 +179,6 @@ struct sem_undo { * semaphore info struct */ struct seminfo seminfo = { - SEMMAP, /* # of entries in semaphore map */ SEMMNI, /* # of semaphore identifiers */ SEMMNS, /* # of semaphores in system */ SEMMNU, /* # of undo structures in system */ @@ -194,8 +190,6 @@ struct seminfo seminfo = { SEMAEM /* adjust on exit max value */ }; -SYSCTL_INT(_kern_ipc, OID_AUTO, semmap, CTLFLAG_RW, &seminfo.semmap, 0, - "Number of entries in the semaphore map"); SYSCTL_INT(_kern_ipc, OID_AUTO, semmni, CTLFLAG_RDTUN, &seminfo.semmni, 0, "Number of semaphore identifiers"); SYSCTL_INT(_kern_ipc, OID_AUTO, semmns, CTLFLAG_RDTUN, &seminfo.semmns, 0, @@ -255,7 +249,6 @@ seminit(void) { int i, error; - TUNABLE_INT_FETCH("kern.ipc.semmap", &seminfo.semmap); TUNABLE_INT_FETCH("kern.ipc.semmni", &seminfo.semmni); TUNABLE_INT_FETCH("kern.ipc.semmns", &seminfo.semmns); TUNABLE_INT_FETCH("kern.ipc.semmnu", &seminfo.semmnu); @@ -931,6 +924,7 @@ semget(struct thread *td, struct semget_args *uap) error = ENOSPC; goto done2; } +#ifdef RACCT PROC_LOCK(td->td_proc); error = racct_add(td->td_proc, RACCT_NSEM, nsems); PROC_UNLOCK(td->td_proc); @@ -938,6 +932,7 @@ semget(struct thread *td, struct semget_args *uap) error = ENOSPC; goto done2; } +#endif DPRINTF(("semid %d is available\n", semid)); mtx_lock(&sema_mtx[semid]); KASSERT((sema[semid].u.sem_perm.mode & SEM_ALLOC) == 0, @@ -1023,12 +1018,14 @@ semop(struct thread 
*td, struct semop_args *uap) nsops)); return (E2BIG); } else { +#ifdef RACCT PROC_LOCK(td->td_proc); if (nsops > racct_get_available(td->td_proc, RACCT_NSEMOP)) { PROC_UNLOCK(td->td_proc); return (E2BIG); } PROC_UNLOCK(td->td_proc); +#endif sops = malloc(nsops * sizeof(*sops), M_TEMP, M_WAITOK); } diff --git a/sys/kern/sysv_shm.c b/sys/kern/sysv_shm.c index f5a84ae..1741a21 100644 --- a/sys/kern/sysv_shm.c +++ b/sys/kern/sysv_shm.c @@ -672,6 +672,7 @@ shmget_allocate_segment(td, uap, mode) shm_last_free = -1; } shmseg = &shmsegs[segnum]; +#ifdef RACCT PROC_LOCK(td->td_proc); if (racct_add(td->td_proc, RACCT_NSHM, 1)) { PROC_UNLOCK(td->td_proc); @@ -683,6 +684,7 @@ shmget_allocate_segment(td, uap, mode) return (ENOMEM); } PROC_UNLOCK(td->td_proc); +#endif /* * In case we sleep in malloc(), mark the segment present but deleted * so that noone else tries to create the same key. @@ -699,10 +701,12 @@ shmget_allocate_segment(td, uap, mode) shm_object = vm_pager_allocate(shm_use_phys ? OBJT_PHYS : OBJT_SWAP, 0, size, VM_PROT_DEFAULT, 0, cred); if (shm_object == NULL) { +#ifdef RACCT PROC_LOCK(td->td_proc); racct_sub(td->td_proc, RACCT_NSHM, 1); racct_sub(td->td_proc, RACCT_SHMSIZE, size); PROC_UNLOCK(td->td_proc); +#endif return (ENOMEM); } VM_OBJECT_LOCK(shm_object); diff --git a/sys/kern/tty.c b/sys/kern/tty.c index 8aa3af2..187e635 100644 --- a/sys/kern/tty.c +++ b/sys/kern/tty.c @@ -91,7 +91,7 @@ static const char *dev_console_filename; HUPCL|CLOCAL|CCTS_OFLOW|CRTS_IFLOW|CDTR_IFLOW|\ CDSR_OFLOW|CCAR_OFLOW) -#define TTY_CALLOUT(tp,d) ((d) != (tp)->t_dev && (d) != dev_console) +#define TTY_CALLOUT(tp,d) (dev2unit(d) & TTYUNIT_CALLOUT) /* * Set TTY buffer sizes. @@ -470,10 +470,10 @@ ttydev_write(struct cdev *dev, struct uio *uio, int ioflag) if (error) goto done; } - - tp->t_flags |= TF_BUSY_OUT; + + tp->t_flags |= TF_BUSY_OUT; error = ttydisc_write(tp, uio, ioflag); - tp->t_flags &= ~TF_BUSY_OUT; + tp->t_flags &= ~TF_BUSY_OUT; cv_signal(&tp->t_outserwait); } @@ -772,6 +772,10 @@ ttyil_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, goto done; } + error = ttydevsw_cioctl(tp, dev2unit(dev), cmd, data, td); + if (error != ENOIOCTL) + goto done; + switch (cmd) { case TIOCGETA: /* Obtain terminal flags through tcgetattr(). */ @@ -878,6 +882,13 @@ ttydevsw_defioctl(struct tty *tp, u_long cmd, caddr_t data, struct thread *td) } static int +ttydevsw_defcioctl(struct tty *tp, int unit, u_long cmd, caddr_t data, struct thread *td) +{ + + return (ENOIOCTL); +} + +static int ttydevsw_defparam(struct tty *tp, struct termios *t) { @@ -955,6 +966,7 @@ tty_alloc_mutex(struct ttydevsw *tsw, void *sc, struct mtx *mutex) PATCH_FUNC(outwakeup); PATCH_FUNC(inwakeup); PATCH_FUNC(ioctl); + PATCH_FUNC(cioctl); PATCH_FUNC(param); PATCH_FUNC(modem); PATCH_FUNC(mmap); @@ -1054,7 +1066,7 @@ tty_rel_pgrp(struct tty *tp, struct pgrp *pg) if (tp->t_pgrp == pg) tp->t_pgrp = NULL; - + tty_unlock(tp); } @@ -1190,13 +1202,13 @@ tty_makedev(struct tty *tp, struct ucred *cred, const char *fmt, ...) /* Slave call-in devices. 
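	 * (Their unit numbers now carry TTYUNIT_INIT or TTYUNIT_LOCK, and
	 * ttyil_ioctl() passes dev2unit() to the driver's new cioctl hook,
	 * so drivers can tell the .init and .lock nodes apart.)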
*/ if (tp->t_flags & TF_INITLOCK) { - dev = make_dev_cred(&ttyil_cdevsw, 0, cred, + dev = make_dev_cred(&ttyil_cdevsw, TTYUNIT_INIT, cred, uid, gid, mode, "%s%s.init", prefix, name); dev_depends(tp->t_dev, dev); dev->si_drv1 = tp; dev->si_drv2 = &tp->t_termios_init_in; - dev = make_dev_cred(&ttyil_cdevsw, 0, cred, + dev = make_dev_cred(&ttyil_cdevsw, TTYUNIT_LOCK, cred, uid, gid, mode, "%s%s.lock", prefix, name); dev_depends(tp->t_dev, dev); dev->si_drv1 = tp; @@ -1205,20 +1217,22 @@ tty_makedev(struct tty *tp, struct ucred *cred, const char *fmt, ...) /* Call-out devices. */ if (tp->t_flags & TF_CALLOUT) { - dev = make_dev_cred(&ttydev_cdevsw, 0, cred, + dev = make_dev_cred(&ttydev_cdevsw, TTYUNIT_CALLOUT, cred, UID_UUCP, GID_DIALER, 0660, "cua%s", name); dev_depends(tp->t_dev, dev); dev->si_drv1 = tp; /* Slave call-out devices. */ if (tp->t_flags & TF_INITLOCK) { - dev = make_dev_cred(&ttyil_cdevsw, 0, cred, + dev = make_dev_cred(&ttyil_cdevsw, + TTYUNIT_CALLOUT | TTYUNIT_INIT, cred, UID_UUCP, GID_DIALER, 0660, "cua%s.init", name); dev_depends(tp->t_dev, dev); dev->si_drv1 = tp; dev->si_drv2 = &tp->t_termios_init_out; - dev = make_dev_cred(&ttyil_cdevsw, 0, cred, + dev = make_dev_cred(&ttyil_cdevsw, + TTYUNIT_CALLOUT | TTYUNIT_LOCK, cred, UID_UUCP, GID_DIALER, 0660, "cua%s.lock", name); dev_depends(tp->t_dev, dev); dev->si_drv1 = tp; @@ -1241,7 +1255,7 @@ tty_signal_sessleader(struct tty *tp, int sig) /* Make signals start output again. */ tp->t_flags &= ~TF_STOPPED; - + if (tp->t_session != NULL && tp->t_session->s_leader != NULL) { p = tp->t_session->s_leader; PROC_LOCK(p); @@ -1305,7 +1319,7 @@ tty_wait(struct tty *tp, struct cv *cv) /* Restart the system call when we may have been revoked. */ if (tp->t_revokecnt != revokecnt) return (ERESTART); - + /* Bail out when the device slipped away. */ if (tty_gone(tp)) return (ENXIO); @@ -1327,7 +1341,7 @@ tty_timedwait(struct tty *tp, struct cv *cv, int hz) /* Restart the system call when we may have been revoked. */ if (tp->t_revokecnt != revokecnt) return (ERESTART); - + /* Bail out when the device slipped away. */ if (tty_gone(tp)) return (ENXIO); @@ -1469,7 +1483,7 @@ tty_generic_ioctl(struct tty *tp, u_long cmd, void *data, int fflag, return (error); /* XXX: CLOCAL? */ - + tp->t_termios.c_cflag = t->c_cflag & ~CIGNORE; tp->t_termios.c_ispeed = t->c_ispeed; tp->t_termios.c_ospeed = t->c_ospeed; @@ -1708,7 +1722,7 @@ tty_ioctl(struct tty *tp, u_long cmd, void *data, int fflag, struct thread *td) if (tty_gone(tp)) return (ENXIO); - + error = ttydevsw_ioctl(tp, cmd, data, td); if (error == ENOIOCTL) error = tty_generic_ioctl(tp, cmd, data, fflag, td); @@ -1786,7 +1800,7 @@ ttyhook_defrint(struct tty *tp, char c, int flags) if (ttyhook_rint_bypass(tp, &c, 1) != 1) return (-1); - + return (0); } @@ -1812,7 +1826,7 @@ ttyhook_register(struct tty **rtp, struct proc *p, int fd, error = EBADF; goto done1; } - + /* * Make sure the vnode is bound to a character device. * Unlocked check for the vnode type is ok there, because we @@ -1910,7 +1924,7 @@ ttyconsdev_open(struct cdev *dev, int oflags, int devtype, struct thread *td) /* System console has no TTY associated. 
*/ if (dev_console->si_drv1 == NULL) return (ENXIO); - + return (ttydev_open(dev, oflags, devtype, td)); } diff --git a/sys/kern/tty_inq.c b/sys/kern/tty_inq.c index b0e9b18..0c39a29 100644 --- a/sys/kern/tty_inq.c +++ b/sys/kern/tty_inq.c @@ -142,7 +142,7 @@ void ttyinq_free(struct ttyinq *ti) { struct ttyinq_block *tib; - + ttyinq_flush(ti); ti->ti_quota = 0; @@ -276,7 +276,7 @@ ttyinq_write(struct ttyinq *ti, const void *buf, size_t nbytes, int quote) struct ttyinq_block *tib; unsigned int boff; size_t l; - + while (nbytes > 0) { boff = ti->ti_end % TTYINQ_DATASIZE; @@ -313,7 +313,7 @@ ttyinq_write(struct ttyinq *ti, const void *buf, size_t nbytes, int quote) nbytes -= l; ti->ti_end += l; } - + return (cbuf - (const char *)buf); } @@ -397,7 +397,7 @@ ttyinq_peekchar(struct ttyinq *ti, char *c, int *quote) *c = tib->tib_data[boff]; *quote = GETBIT(tib, boff); - + return (0); } diff --git a/sys/kern/tty_outq.c b/sys/kern/tty_outq.c index d5ed221..5d40abe 100644 --- a/sys/kern/tty_outq.c +++ b/sys/kern/tty_outq.c @@ -119,7 +119,7 @@ void ttyoutq_free(struct ttyoutq *to) { struct ttyoutq_block *tob; - + ttyoutq_flush(to); to->to_quota = 0; diff --git a/sys/kern/tty_pts.c b/sys/kern/tty_pts.c index d89c183..a3db59b 100644 --- a/sys/kern/tty_pts.c +++ b/sys/kern/tty_pts.c @@ -295,7 +295,7 @@ ptsdev_ioctl(struct file *fp, u_long cmd, void *data, return (EINVAL); return copyout(p, fgn->buf, i); } - + /* * We need to implement TIOCGPGRP and TIOCGSID here again. When * called on the pseudo-terminal master, it should not check if @@ -563,7 +563,7 @@ ptsdev_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, sb->st_uid = dev->si_uid; sb->st_gid = dev->si_gid; sb->st_mode = dev->si_mode | S_IFCHR; - + return (0); } @@ -823,7 +823,7 @@ posix_openpt(struct thread *td, struct posix_openpt_args *uap) */ if (uap->flags & ~(O_RDWR|O_NOCTTY)) return (EINVAL); - + error = falloc(td, &fp, &fd, 0); if (error) return (error); diff --git a/sys/kern/tty_ttydisc.c b/sys/kern/tty_ttydisc.c index 6afac8d..2a0bb4b 100644 --- a/sys/kern/tty_ttydisc.c +++ b/sys/kern/tty_ttydisc.c @@ -270,13 +270,13 @@ ttydisc_read_raw_interbyte_timer(struct tty *tp, struct uio *uio, int ioflag) MPASS(tp->t_termios.c_cc[VMIN] != 0); MPASS(tp->t_termios.c_cc[VTIME] != 0); - + /* * When using the interbyte timer, the timer should be started * after the first byte has been received. We just call into the * generic read timer code after we've received the first byte. */ - + for (;;) { error = ttyinq_read_uio(&tp->t_inq, tp, uio, uio->uio_resid, 0); @@ -331,7 +331,7 @@ ttydisc_read(struct tty *tp, struct uio *uio, int ioflag) /* Unset the input watermark when we've got enough space. */ tty_hiwat_in_unblock(tp); } - + return (error); } @@ -521,7 +521,7 @@ ttydisc_write(struct tty *tp, struct uio *uio, int ioflag) error = EWOULDBLOCK; goto done; } - + /* * The driver may write back the data * synchronously. 
Be sure to check the high @@ -567,7 +567,7 @@ ttydisc_optimize(struct tty *tp) } else if (!CMP_FLAG(i, ICRNL|IGNCR|IMAXBEL|INLCR|ISTRIP|IXON) && (!CMP_FLAG(i, BRKINT) || CMP_FLAG(i, IGNBRK)) && (!CMP_FLAG(i, PARMRK) || - CMP_FLAG(i, IGNPAR|IGNBRK) == (IGNPAR|IGNBRK)) && + CMP_FLAG(i, IGNPAR|IGNBRK) == (IGNPAR|IGNBRK)) && !CMP_FLAG(l, ECHO|ICANON|IEXTEN|ISIG|PENDIN)) { tp->t_flags |= TF_BYPASS; } else { @@ -583,7 +583,7 @@ ttydisc_modem(struct tty *tp, int open) if (open) cv_broadcast(&tp->t_dcdwait); - + /* * Ignore modem status lines when CLOCAL is turned on, but don't * enter the zombie state when the TTY isn't opened, because @@ -834,7 +834,7 @@ ttydisc_rint(struct tty *tp, char c, int flags) if (ttyhook_hashook(tp, rint)) return ttyhook_rint(tp, c, flags); - + if (tp->t_flags & TF_BYPASS) goto processed; @@ -1072,7 +1072,7 @@ ttydisc_rint_bypass(struct tty *tp, const void *buf, size_t len) size_t ret; tty_lock_assert(tp, MA_OWNED); - + MPASS(tp->t_flags & TF_BYPASS); atomic_add_long(&tty_nin, len); @@ -1122,7 +1122,7 @@ ttydisc_rint_poll(struct tty *tp) l = ttyinq_bytesleft(&tp->t_inq); if (l == 0 && (tp->t_flags & TF_HIWAT_IN) == 0) return (1); - + return (l); } @@ -1201,7 +1201,7 @@ ttydisc_getc_uio(struct tty *tp, struct uio *uio) tty_unlock(tp); error = uiomove(buf, len, uio); tty_lock(tp); - + if (error != 0) break; } diff --git a/sys/kern/uipc_shm.c b/sys/kern/uipc_shm.c index 00496af..0414f12 100644 --- a/sys/kern/uipc_shm.c +++ b/sys/kern/uipc_shm.c @@ -55,7 +55,10 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); +#include "opt_capsicum.h" + #include <sys/param.h> +#include <sys/capability.h> #include <sys/fcntl.h> #include <sys/file.h> #include <sys/filedesc.h> @@ -264,7 +267,7 @@ shm_dotruncate(struct shmfd *shmfd, off_t length) /* Toss in memory pages. */ if (nobjsize < object->size) vm_object_page_remove(object, nobjsize, object->size, - FALSE); + 0); /* Toss pages from swap. */ if (object->type == OBJT_SWAP) @@ -486,6 +489,14 @@ shm_open(struct thread *td, struct shm_open_args *uap) mode_t cmode; int fd, error; +#ifdef CAPABILITY_MODE + /* + * shm_open(2) is only allowed for anonymous objects. + */ + if (IN_CAPABILITY_MODE(td) && (uap->path != SHM_ANON)) + return (ECAPMODE); +#endif + if ((uap->flags & O_ACCMODE) != O_RDONLY && (uap->flags & O_ACCMODE) != O_RDWR) return (EINVAL); diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c index 3334fc2..990c6ba 100644 --- a/sys/kern/uipc_socket.c +++ b/sys/kern/uipc_socket.c @@ -1915,7 +1915,6 @@ release: /* * Optimized version of soreceive() for stream (TCP) sockets. */ -#ifdef TCP_SORECEIVE_STREAM int soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) @@ -1955,20 +1954,9 @@ soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio, } oresid = uio->uio_resid; - /* We will never ever get anything unless we are connected. */ + /* We will never ever get anything unless we are or were connected. */ if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) { - /* When disconnecting there may be still some data left. */ - if (sb->sb_cc > 0) - goto deliver; - if (!(so->so_state & SS_ISDISCONNECTED)) - error = ENOTCONN; - goto out; - } - - /* Socket buffer is empty and we shall not block. */ - if (sb->sb_cc == 0 && - ((sb->sb_flags & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) { - error = EAGAIN; + error = ENOTCONN; goto out; } @@ -1995,6 +1983,13 @@ restart: goto out; } + /* Socket buffer is empty and we shall not block. 
*/ + if (sb->sb_cc == 0 && + ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) { + error = EAGAIN; + goto out; + } + /* Socket buffer got some data that we shall deliver now. */ if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) && ((sb->sb_flags & SS_NBIO) || @@ -2109,7 +2104,6 @@ out: sbunlock(sb); return (error); } -#endif /* TCP_SORECEIVE_STREAM */ /* * Optimized version of soreceive() for simple datagram cases from userspace. diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c index 19aaee0..c434973 100644 --- a/sys/kern/uipc_syscalls.c +++ b/sys/kern/uipc_syscalls.c @@ -35,6 +35,7 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); +#include "opt_capsicum.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_sctp.h" @@ -43,6 +44,7 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/systm.h> +#include <sys/capability.h> #include <sys/kernel.h> #include <sys/lock.h> #include <sys/mutex.h> @@ -675,6 +677,11 @@ sendit(td, s, mp, flags) struct sockaddr *to; int error; +#ifdef CAPABILITY_MODE + if (IN_CAPABILITY_MODE(td) && (mp->msg_name != NULL)) + return (ECAPMODE); +#endif + if (mp->msg_name != NULL) { error = getsockaddr(&to, mp->msg_name, mp->msg_namelen); if (error) { diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index 2743089..a6ad81e 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -1625,6 +1625,7 @@ vfs_vmio_release(struct buf *bp) int i; vm_page_t m; + pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages); VM_OBJECT_LOCK(bp->b_bufobj->bo_object); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; @@ -1658,7 +1659,6 @@ vfs_vmio_release(struct buf *bp) vm_page_unlock(m); } VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object); - pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); if (bp->b_bufsize) { bufspacewakeup(); @@ -3012,6 +3012,10 @@ allocbuf(struct buf *bp, int size) if (desiredpages < bp->b_npages) { vm_page_t m; + pmap_qremove((vm_offset_t)trunc_page( + (vm_offset_t)bp->b_data) + + (desiredpages << PAGE_SHIFT), + (bp->b_npages - desiredpages)); VM_OBJECT_LOCK(bp->b_bufobj->bo_object); for (i = desiredpages; i < bp->b_npages; i++) { /* @@ -3032,8 +3036,6 @@ allocbuf(struct buf *bp, int size) vm_page_unlock(m); } VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object); - pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) + - (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages)); bp->b_npages = desiredpages; } } else if (size > bp->b_bcount) { diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c index 2e07cf1..5edf0f5 100644 --- a/sys/kern/vfs_mount.c +++ b/sys/kern/vfs_mount.c @@ -51,6 +51,7 @@ __FBSDID("$FreeBSD$"); #include <sys/proc.h> #include <sys/filedesc.h> #include <sys/reboot.h> +#include <sys/sbuf.h> #include <sys/syscallsubr.h> #include <sys/sysproto.h> #include <sys/sx.h> diff --git a/sys/kern/vfs_mountroot.c b/sys/kern/vfs_mountroot.c index 496ea70..ccbcb33 100644 --- a/sys/kern/vfs_mountroot.c +++ b/sys/kern/vfs_mountroot.c @@ -55,6 +55,7 @@ __FBSDID("$FreeBSD$"); #include <sys/proc.h> #include <sys/filedesc.h> #include <sys/reboot.h> +#include <sys/sbuf.h> #include <sys/stat.h> #include <sys/syscallsubr.h> #include <sys/sysproto.h> diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index 741061d..934745b 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -1190,8 +1190,8 @@ bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo) */ if (bo->bo_object != NULL && (flags & (V_ALT | V_NORMAL)) == 0) { VM_OBJECT_LOCK(bo->bo_object); - 
vm_object_page_remove(bo->bo_object, 0, 0,
-		    (flags & V_SAVE) ? TRUE : FALSE);
+		vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ?
+		    OBJPR_CLEANONLY : 0);
 		VM_OBJECT_UNLOCK(bo->bo_object);
 	}
 
@@ -3590,9 +3590,6 @@ vn_isdisk(struct vnode *vp, int *errp)
  * and optional call-by-reference privused argument allowing vaccess()
  * to indicate to the caller whether privilege was used to satisfy the
  * request (obsoleted).  Returns 0 on success, or an errno on failure.
- *
- * The ifdef'd CAPABILITIES version is here for reference, but is not
- * actually used.
  */
 int
 vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid,
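The bufobj_invalbuf() hunk above follows an interface change to
vm_object_page_remove(): the trailing "clean only" boolean is now a flags
word, as also seen in the shm_dotruncate() hunk earlier. A sketch of the
resulting idiom, assuming the OBJPR_CLEANONLY constant from vm/vm_object.h:

	/* Remove every page in the range, dirty or clean. */
	vm_object_page_remove(object, start, end, 0);

	/*
	 * Leave dirty pages in place, e.g. for a V_SAVE caller that still
	 * intends to flush them to disk.
	 */
	vm_object_page_remove(object, start, end, OBJPR_CLEANONLY);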