Diffstat (limited to 'sys/kern')
-rw-r--r--  sys/kern/Make.tags.inc       |    2
-rw-r--r--  sys/kern/imgact_aout.c       |    4
-rw-r--r--  sys/kern/imgact_elf.c        |   13
-rw-r--r--  sys/kern/init_sysent.c       |    4
-rw-r--r--  sys/kern/kern_clocksource.c  |    1
-rw-r--r--  sys/kern/kern_descrip.c      |  207
-rw-r--r--  sys/kern/kern_exec.c         |   24
-rw-r--r--  sys/kern/kern_exit.c         |    6
-rw-r--r--  sys/kern/kern_fail.c         |  171
-rw-r--r--  sys/kern/kern_fork.c         |   23
-rw-r--r--  sys/kern/kern_jail.c         |    1
-rw-r--r--  sys/kern/kern_racct.c        |   28
-rw-r--r--  sys/kern/kern_rctl.c         |   36
-rw-r--r--  sys/kern/kern_rmlock.c       |    6
-rw-r--r--  sys/kern/kern_tc.c           |    6
-rw-r--r--  sys/kern/kern_thr.c          |    4
-rw-r--r--  sys/kern/link_elf.c          |    2
-rw-r--r--  sys/kern/sched_4bsd.c        |   57
-rw-r--r--  sys/kern/subr_kdb.c          |   37
-rw-r--r--  sys/kern/subr_pcpu.c         |    1
-rw-r--r--  sys/kern/subr_smp.c          |    8
-rw-r--r--  sys/kern/subr_trap.c         |    4
-rw-r--r--  sys/kern/subr_uio.c          |   63
-rw-r--r--  sys/kern/sys_capability.c    |  412
-rw-r--r--  sys/kern/sys_process.c       |    9
-rw-r--r--  sys/kern/syscalls.c          |    4
-rw-r--r--  sys/kern/syscalls.master     |    5
-rw-r--r--  sys/kern/systrace_args.c     |   42
-rw-r--r--  sys/kern/sysv_msg.c          |    8
-rw-r--r--  sys/kern/sysv_sem.c          |   11
-rw-r--r--  sys/kern/sysv_shm.c          |    4
-rw-r--r--  sys/kern/tty.c               |   50
-rw-r--r--  sys/kern/tty_inq.c           |    8
-rw-r--r--  sys/kern/tty_outq.c          |    2
-rw-r--r--  sys/kern/tty_pts.c           |    6
-rw-r--r--  sys/kern/tty_ttydisc.c       |   20
-rw-r--r--  sys/kern/uipc_shm.c          |   13
-rw-r--r--  sys/kern/uipc_socket.c       |   24
-rw-r--r--  sys/kern/uipc_syscalls.c     |    7
-rw-r--r--  sys/kern/vfs_bio.c           |    8
-rw-r--r--  sys/kern/vfs_mount.c         |    1
-rw-r--r--  sys/kern/vfs_mountroot.c     |    1
-rw-r--r--  sys/kern/vfs_subr.c          |    7
43 files changed, 1029 insertions, 321 deletions
diff --git a/sys/kern/Make.tags.inc b/sys/kern/Make.tags.inc
index ad7ea11..6f5a13a 100644
--- a/sys/kern/Make.tags.inc
+++ b/sys/kern/Make.tags.inc
@@ -37,6 +37,7 @@ COMM= ${SYS}/dev/advansys/*.[ch] \
${SYS}/fs/smbfs/*.[ch] \
${SYS}/fs/udf/*.[ch] \
${SYS}/fs/unionfs/*.[ch] \
+ ${SYS}/geom/*.[ch] \
${SYS}/kern/*.[ch] \
${SYS}/net/*.[ch] \
${SYS}/netatalk/*.[ch] \
@@ -55,6 +56,7 @@ COMM= ${SYS}/dev/advansys/*.[ch] \
${SYS}/sys/*.[ch]
COMMDIR1= ${SYS}/conf \
+ ${SYS}/geom \
${SYS}/kern \
${SYS}/net \
${SYS}/netatalk \
diff --git a/sys/kern/imgact_aout.c b/sys/kern/imgact_aout.c
index 2f889ca..3908da7 100644
--- a/sys/kern/imgact_aout.c
+++ b/sys/kern/imgact_aout.c
@@ -103,7 +103,7 @@ struct sysentvec aout_sysvec = {
#elif defined(__amd64__)
-#define AOUT32_USRSTACK 0xbfc0000
+#define AOUT32_USRSTACK 0xbfc00000
#define AOUT32_PS_STRINGS \
(AOUT32_USRSTACK - sizeof(struct freebsd32_ps_strings))
@@ -152,7 +152,7 @@ aout_fixup(register_t **stack_base, struct image_params *imgp)
{
*(char **)stack_base -= sizeof(uint32_t);
- return (suword(*stack_base, imgp->args->argc));
+ return (suword32(*stack_base, imgp->args->argc));
}
static int
diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c
index b41741a..45f6d64 100644
--- a/sys/kern/imgact_elf.c
+++ b/sys/kern/imgact_elf.c
@@ -31,10 +31,12 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
+#include "opt_capsicum.h"
#include "opt_compat.h"
#include "opt_core.h"
#include <sys/param.h>
+#include <sys/capability.h>
#include <sys/exec.h>
#include <sys/fcntl.h>
#include <sys/imgact.h>
@@ -578,6 +580,15 @@ __elfN(load_file)(struct proc *p, const char *file, u_long *addr,
u_long base_addr = 0;
int vfslocked, error, i, numsegs;
+#ifdef CAPABILITY_MODE
+ /*
+ * XXXJA: This check can go away once we are sufficiently confident
+ * that the checks in namei() are correct.
+ */
+ if (IN_CAPABILITY_MODE(curthread))
+ return (ECAPMODE);
+#endif
+
tempdata = malloc(sizeof(*tempdata), M_TEMP, M_WAITOK);
nd = &tempdata->nd;
attr = &tempdata->attr;
@@ -1104,6 +1115,7 @@ __elfN(coredump)(struct thread *td, struct vnode *vp, off_t limit, int flags)
hdrsize = 0;
__elfN(puthdr)(td, (void *)NULL, &hdrsize, seginfo.count);
+#ifdef RACCT
PROC_LOCK(td->td_proc);
error = racct_add(td->td_proc, RACCT_CORE, hdrsize + seginfo.size);
PROC_UNLOCK(td->td_proc);
@@ -1111,6 +1123,7 @@ __elfN(coredump)(struct thread *td, struct vnode *vp, off_t limit, int flags)
error = EFAULT;
goto done;
}
+#endif
if (hdrsize + seginfo.size >= limit) {
error = EFAULT;
goto done;
diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c
index e45ffc5..004516b 100644
--- a/sys/kern/init_sysent.c
+++ b/sys/kern/init_sysent.c
@@ -548,8 +548,8 @@ struct sysent sysent[] = {
{ AS(msgctl_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 511 = msgctl */
{ AS(shmctl_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 512 = shmctl */
{ AS(lpathconf_args), (sy_call_t *)lpathconf, AUE_LPATHCONF, NULL, 0, 0, 0, SY_THR_STATIC }, /* 513 = lpathconf */
- { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 514 = cap_new */
- { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 515 = cap_getrights */
+ { AS(cap_new_args), (sy_call_t *)cap_new, AUE_CAP_NEW, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 514 = cap_new */
+ { AS(cap_getrights_args), (sy_call_t *)cap_getrights, AUE_CAP_GETRIGHTS, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 515 = cap_getrights */
{ 0, (sy_call_t *)cap_enter, AUE_CAP_ENTER, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 516 = cap_enter */
{ AS(cap_getmode_args), (sy_call_t *)cap_getmode, AUE_CAP_GETMODE, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 517 = cap_getmode */
{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 518 = pdfork */
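With cap_new() and cap_getrights() now wired into the syscall table, a userspace
consumer can derive a rights-limited descriptor and query its mask back. A
minimal sketch, assuming the <sys/capability.h> constants introduced by this
work (error handling omitted):

	#include <sys/capability.h>
	#include <fcntl.h>

	int fd, capfd;
	cap_rights_t rights;

	fd = open("/etc/passwd", O_RDONLY);
	capfd = cap_new(fd, CAP_READ | CAP_SEEK);	/* rights-limited view */
	(void)cap_getrights(capfd, &rights);	/* rights == CAP_READ | CAP_SEEK */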
diff --git a/sys/kern/kern_clocksource.c b/sys/kern/kern_clocksource.c
index dd8bab5..ecfd408 100644
--- a/sys/kern/kern_clocksource.c
+++ b/sys/kern/kern_clocksource.c
@@ -59,6 +59,7 @@ __FBSDID("$FreeBSD$");
cyclic_clock_func_t cyclic_clock_func = NULL;
#endif
+int cpu_can_deep_sleep = 0; /* C3 state is available. */
int cpu_disable_deep_sleep = 0; /* Timer dies in C3. */
static void setuptimer(void);
diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
index 180d598..829ece2 100644
--- a/sys/kern/kern_descrip.c
+++ b/sys/kern/kern_descrip.c
@@ -37,6 +37,7 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
+#include "opt_capsicum.h"
#include "opt_compat.h"
#include "opt_ddb.h"
#include "opt_ktrace.h"
@@ -44,6 +45,7 @@ __FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
+#include <sys/capability.h>
#include <sys/conf.h>
#include <sys/domain.h>
#include <sys/fcntl.h>
@@ -91,6 +93,7 @@ __FBSDID("$FreeBSD$");
#include <security/audit/audit.h>
#include <vm/uma.h>
+#include <vm/vm.h>
#include <ddb/ddb.h>
@@ -818,6 +821,7 @@ do_dup(struct thread *td, int flags, int old, int new,
* descriptors, just put the limit on the size of the file
* descriptor table.
*/
+#ifdef RACCT
PROC_LOCK(p);
error = racct_set(p, RACCT_NOFILE, new + 1);
PROC_UNLOCK(p);
@@ -826,6 +830,7 @@ do_dup(struct thread *td, int flags, int old, int new,
fdrop(fp, td);
return (EMFILE);
}
+#endif
fdgrowtable(fdp, new + 1);
}
if (fdp->fd_ofiles[new] == NULL)
@@ -1155,7 +1160,7 @@ kern_close(td, fd)
int fd;
{
struct filedesc *fdp;
- struct file *fp;
+ struct file *fp, *fp_object;
int error;
int holdleaders;
@@ -1190,8 +1195,14 @@ kern_close(td, fd)
* added, and deleteing a knote for the new fd.
*/
knote_fdclose(td, fd);
- if (fp->f_type == DTYPE_MQUEUE)
- mq_fdclose(td, fd, fp);
+
+ /*
+ * When we're closing an fd with a capability, we need to notify
+ * mqueue if the underlying object is of type mqueue.
+ */
+ (void)cap_funwrap(fp, 0, &fp_object);
+ if (fp_object->f_type == DTYPE_MQUEUE)
+ mq_fdclose(td, fd, fp_object);
FILEDESC_XUNLOCK(fdp);
error = closef(fp, td);
@@ -1473,7 +1484,10 @@ fdalloc(struct thread *td, int minfd, int *result)
{
struct proc *p = td->td_proc;
struct filedesc *fdp = p->p_fd;
- int fd = -1, maxfd, error;
+ int fd = -1, maxfd;
+#ifdef RACCT
+ int error;
+#endif
FILEDESC_XLOCK_ASSERT(fdp);
@@ -1496,11 +1510,13 @@ fdalloc(struct thread *td, int minfd, int *result)
return (EMFILE);
if (fd < fdp->fd_nfiles)
break;
+#ifdef RACCT
PROC_LOCK(p);
error = racct_set(p, RACCT_NOFILE, min(fdp->fd_nfiles * 2, maxfd));
PROC_UNLOCK(p);
if (error != 0)
return (EMFILE);
+#endif
fdgrowtable(fdp, min(fdp->fd_nfiles * 2, maxfd));
}
@@ -1561,54 +1577,85 @@ fdavail(struct thread *td, int n)
int
falloc(struct thread *td, struct file **resultfp, int *resultfd, int flags)
{
- struct proc *p = td->td_proc;
struct file *fp;
- int error, i;
+ int error, fd;
+
+ error = falloc_noinstall(td, &fp);
+ if (error)
+ return (error); /* no reference held on error */
+
+ error = finstall(td, fp, &fd, flags);
+ if (error) {
+ fdrop(fp, td); /* one reference (fp only) */
+ return (error);
+ }
+
+ if (resultfp != NULL)
+ *resultfp = fp; /* copy out result */
+ else
+ fdrop(fp, td); /* release local reference */
+
+ if (resultfd != NULL)
+ *resultfd = fd;
+
+ return (0);
+}
+
+/*
+ * Create a new open file structure without allocating a file descriptor.
+ */
+int
+falloc_noinstall(struct thread *td, struct file **resultfp)
+{
+ struct file *fp;
int maxuserfiles = maxfiles - (maxfiles / 20);
static struct timeval lastfail;
static int curfail;
- fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO);
+ KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__));
+
if ((openfiles >= maxuserfiles &&
priv_check(td, PRIV_MAXFILES) != 0) ||
openfiles >= maxfiles) {
if (ppsratecheck(&lastfail, &curfail, 1)) {
- printf("kern.maxfiles limit exceeded by uid %i, please see tuning(7).\n",
- td->td_ucred->cr_ruid);
+ printf("kern.maxfiles limit exceeded by uid %i, "
+ "please see tuning(7).\n", td->td_ucred->cr_ruid);
}
- uma_zfree(file_zone, fp);
return (ENFILE);
}
atomic_add_int(&openfiles, 1);
-
- /*
- * If the process has file descriptor zero open, add the new file
- * descriptor to the list of open files at that point, otherwise
- * put it at the front of the list of open files.
- */
+ fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO);
refcount_init(&fp->f_count, 1);
- if (resultfp)
- fhold(fp);
fp->f_cred = crhold(td->td_ucred);
fp->f_ops = &badfileops;
fp->f_data = NULL;
fp->f_vnode = NULL;
- FILEDESC_XLOCK(p->p_fd);
- if ((error = fdalloc(td, 0, &i))) {
- FILEDESC_XUNLOCK(p->p_fd);
- fdrop(fp, td);
- if (resultfp)
- fdrop(fp, td);
+ *resultfp = fp;
+ return (0);
+}
+
+/*
+ * Install a file in a file descriptor table.
+ */
+int
+finstall(struct thread *td, struct file *fp, int *fd, int flags)
+{
+ struct filedesc *fdp = td->td_proc->p_fd;
+ int error;
+
+ KASSERT(fd != NULL, ("%s: fd == NULL", __func__));
+ KASSERT(fp != NULL, ("%s: fp == NULL", __func__));
+
+ FILEDESC_XLOCK(fdp);
+ if ((error = fdalloc(td, 0, fd))) {
+ FILEDESC_XUNLOCK(fdp);
return (error);
}
- p->p_fd->fd_ofiles[i] = fp;
+ fhold(fp);
+ fdp->fd_ofiles[*fd] = fp;
if ((flags & O_CLOEXEC) != 0)
- p->p_fd->fd_ofileflags[i] |= UF_EXCLOSE;
- FILEDESC_XUNLOCK(p->p_fd);
- if (resultfp)
- *resultfp = fp;
- if (resultfd)
- *resultfd = i;
+ fdp->fd_ofileflags[*fd] |= UF_EXCLOSE;
+ FILEDESC_XUNLOCK(fdp);
return (0);
}
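The falloc() split above lets a caller allocate the open file and install it in
the descriptor table as two separate steps, so intermediate setup can fail
without leaving a half-visible descriptor. A sketch of the intended calling
pattern (hypothetical kernel caller):

	struct file *fp;
	int error, fd;

	error = falloc_noinstall(td, &fp);	/* one reference held on fp */
	if (error)
		return (error);
	/* ... finit() fp while it is not yet visible to the process ... */
	error = finstall(td, fp, &fd, 0);	/* table now holds a second ref */
	fdrop(fp, td);				/* drop the local reference */
	if (error)
		return (error);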
@@ -1739,11 +1786,11 @@ fdcopy(struct filedesc *fdp)
FILEDESC_XUNLOCK(newfdp);
FILEDESC_SLOCK(fdp);
}
- /* copy everything except kqueue descriptors */
+ /* copy all passable descriptors (i.e. not kqueue) */
newfdp->fd_freefile = -1;
for (i = 0; i <= fdp->fd_lastfile; ++i) {
if (fdisused(fdp, i) &&
- fdp->fd_ofiles[i]->f_type != DTYPE_KQUEUE &&
+ (fdp->fd_ofiles[i]->f_ops->fo_flags & DFLAG_PASSABLE) &&
fdp->fd_ofiles[i]->f_ops != &badfileops) {
newfdp->fd_ofiles[i] = fdp->fd_ofiles[i];
newfdp->fd_ofileflags[i] = fdp->fd_ofileflags[i];
@@ -1785,9 +1832,11 @@ fdfree(struct thread *td)
if (fdp == NULL)
return;
+#ifdef RACCT
PROC_LOCK(td->td_proc);
racct_set(td->td_proc, RACCT_NOFILE, 0);
PROC_UNLOCK(td->td_proc);
+#endif
/* Check for special need to clear POSIX style locks */
fdtol = td->td_proc->p_fdtol;
@@ -2103,6 +2152,7 @@ closef(struct file *fp, struct thread *td)
struct flock lf;
struct filedesc_to_leader *fdtol;
struct filedesc *fdp;
+ struct file *fp_object;
/*
* POSIX record locking dictates that any close releases ALL
@@ -2115,11 +2165,15 @@ closef(struct file *fp, struct thread *td)
* NULL thread pointer when there really is no owning
* context that might have locks, or the locks will be
* leaked.
+ *
+ * If this is a capability, we do lock processing under the underlying
+ * node, not the capability itself.
*/
- if (fp->f_type == DTYPE_VNODE && td != NULL) {
+ (void)cap_funwrap(fp, 0, &fp_object);
+ if ((fp_object->f_type == DTYPE_VNODE) && (td != NULL)) {
int vfslocked;
- vp = fp->f_vnode;
+ vp = fp_object->f_vnode;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
lf.l_whence = SEEK_SET;
@@ -2149,7 +2203,7 @@ closef(struct file *fp, struct thread *td)
lf.l_start = 0;
lf.l_len = 0;
lf.l_type = F_UNLCK;
- vp = fp->f_vnode;
+ vp = fp_object->f_vnode;
(void) VOP_ADVLOCK(vp,
(caddr_t)fdtol->fdl_leader,
F_UNLCK, &lf, F_POSIX);
@@ -2228,15 +2282,27 @@ fget_unlocked(struct filedesc *fdp, int fd)
* If the descriptor doesn't exist or doesn't match 'flags', EBADF is
* returned.
*
+ * If the FGET_GETCAP flag is set, the capability itself will be returned.
+ * Calling _fget() with FGET_GETCAP on a non-capability will return EINVAL.
+ * Otherwise, if the file is a capability, its rights will be checked against
+ * the capability rights mask, and if successful, the object will be unwrapped.
+ *
 * If an error occurred, the non-zero error is returned and *fpp is set to
* NULL. Otherwise *fpp is held and set and zero is returned. Caller is
* responsible for fdrop().
*/
+#define FGET_GETCAP 0x00000001
static __inline int
-_fget(struct thread *td, int fd, struct file **fpp, int flags)
+_fget(struct thread *td, int fd, struct file **fpp, int flags,
+ cap_rights_t needrights, cap_rights_t *haverights, u_char *maxprotp,
+ int fget_flags)
{
struct filedesc *fdp;
struct file *fp;
+#ifdef CAPABILITIES
+ struct file *fp_fromcap;
+ int error;
+#endif
*fpp = NULL;
if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
@@ -2247,6 +2313,47 @@ _fget(struct thread *td, int fd, struct file **fpp, int flags)
fdrop(fp, td);
return (EBADF);
}
+
+#ifdef CAPABILITIES
+ /*
+ * If a capability has been requested, return the capability directly.
+ * Otherwise, check capability rights, extract the underlying object,
+ * and check its access flags.
+ */
+ if (fget_flags & FGET_GETCAP) {
+ if (fp->f_type != DTYPE_CAPABILITY) {
+ fdrop(fp, td);
+ return (EINVAL);
+ }
+ } else {
+ if (maxprotp == NULL)
+ error = cap_funwrap(fp, needrights, &fp_fromcap);
+ else
+ error = cap_funwrap_mmap(fp, needrights, maxprotp,
+ &fp_fromcap);
+ if (error) {
+ fdrop(fp, td);
+ return (error);
+ }
+
+ /*
+ * If we've unwrapped a file, drop the original capability
+ * and hold the new descriptor. fp after this point refers to
+ * the actual (unwrapped) object, not the capability.
+ */
+ if (fp != fp_fromcap) {
+ fhold(fp_fromcap);
+ fdrop(fp, td);
+ fp = fp_fromcap;
+ }
+ }
+#else /* !CAPABILITIES */
+ KASSERT(fp->f_type != DTYPE_CAPABILITY,
+ ("%s: saw capability", __func__));
+ if (maxprotp != NULL)
+ *maxprotp = VM_PROT_ALL;
+#endif /* CAPABILITIES */
+
/*
* FREAD and FWRITE failure return EBADF as per POSIX.
*
@@ -2265,23 +2372,36 @@ int
fget(struct thread *td, int fd, struct file **fpp)
{
- return(_fget(td, fd, fpp, 0));
+ return(_fget(td, fd, fpp, 0, 0, NULL, NULL, 0));
}
int
fget_read(struct thread *td, int fd, struct file **fpp)
{
- return(_fget(td, fd, fpp, FREAD));
+ return(_fget(td, fd, fpp, FREAD, 0, NULL, NULL, 0));
}
int
fget_write(struct thread *td, int fd, struct file **fpp)
{
- return(_fget(td, fd, fpp, FWRITE));
+ return(_fget(td, fd, fpp, FWRITE, 0, NULL, NULL, 0));
+}
+
+/*
+ * Unlike the other fget() calls, which will accept and check capability rights
+ * but never return capabilities, fgetcap() returns the capability but doesn't
+ * check capability rights.
+ */
+int
+fgetcap(struct thread *td, int fd, struct file **fpp)
+{
+
+ return (_fget(td, fd, fpp, 0, 0, NULL, NULL, FGET_GETCAP));
}
+
/*
* Like fget() but loads the underlying vnode, or returns an error if the
* descriptor does not represent a vnode. Note that pipes use vnodes but
@@ -2296,7 +2416,7 @@ _fgetvp(struct thread *td, int fd, struct vnode **vpp, int flags)
int error;
*vpp = NULL;
- if ((error = _fget(td, fd, &fp, flags)) != 0)
+ if ((error = _fget(td, fd, &fp, flags, 0, NULL, NULL, 0)) != 0)
return (error);
if (fp->f_vnode == NULL) {
error = EINVAL;
@@ -2352,7 +2472,7 @@ fgetsock(struct thread *td, int fd, struct socket **spp, u_int *fflagp)
*spp = NULL;
if (fflagp != NULL)
*fflagp = 0;
- if ((error = _fget(td, fd, &fp, 0)) != 0)
+ if ((error = _fget(td, fd, &fp, 0, 0, NULL, NULL, 0)) != 0)
return (error);
if (fp->f_type != DTYPE_SOCKET) {
error = ENOTSOCK;
@@ -2388,6 +2508,9 @@ fputsock(struct socket *so)
/*
* Handle the last reference to a file being closed.
+ *
+ * No special capability handling here, as the capability's fo_close will run
+ * instead of the object's, and perform any necessary drop on the object.
*/
int
_fdrop(struct file *fp, struct thread *td)
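The upshot for in-kernel consumers is that fget() and friends now check rights
and return the unwrapped object, while fgetcap() is the only interface that
yields the capability file itself. Code paths that already hold a struct file
(as in kern_close() and closef() above) unwrap by hand; a minimal sketch:

	struct file *fp_object;
	int error;

	/* needrights == 0: unwrap unconditionally, no rights check */
	(void)cap_funwrap(fp, 0, &fp_object);

	/* or insist on specific rights; fails with ENOTCAPABLE otherwise */
	error = cap_funwrap(fp, CAP_WRITE, &fp_object);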
diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c
index c51cc39..f30f89a 100644
--- a/sys/kern/kern_exec.c
+++ b/sys/kern/kern_exec.c
@@ -27,12 +27,14 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
+#include "opt_capsicum.h"
#include "opt_hwpmc_hooks.h"
#include "opt_kdtrace.h"
#include "opt_ktrace.h"
#include "opt_vm.h"
#include <sys/param.h>
+#include <sys/capability.h>
#include <sys/systm.h>
#include <sys/eventhandler.h>
#include <sys/lock.h>
@@ -415,6 +417,18 @@ do_execve(td, args, mac_p)
interpret:
if (args->fname != NULL) {
+#ifdef CAPABILITY_MODE
+ /*
+ * While capability mode can't reach this point via direct
+ * path arguments to execve(), we also don't allow
+ * interpreters to be used in capability mode (for now).
+ * Catch indirect lookups and return a permissions error.
+ */
+ if (IN_CAPABILITY_MODE(td)) {
+ error = ECAPMODE;
+ goto exec_fail;
+ }
+#endif
error = namei(&nd);
if (error)
goto exec_fail;
@@ -631,6 +645,13 @@ interpret:
* Don't honor setuid/setgid if the filesystem prohibits it or if
* the process is being traced.
*
+ * We disable setuid/setgid/etc in capability mode on the basis
+ * that most setugid applications are not written with that
+ * environment in mind, and will therefore almost certainly operate
+ * incorrectly. In principle there's no reason that setugid
+ * applications might not be useful in capability mode, so we may want
+ * to reconsider this conservative design choice in the future.
+ *
* XXXMAC: For the time being, use NOSUID to also prohibit
* transitions on the file system.
*/
@@ -646,6 +667,9 @@ interpret:
#endif
if (credential_changing &&
+#ifdef CAPABILITY_MODE
+ ((oldcred->cr_flags & CRED_FLAG_CAPMODE) == 0) &&
+#endif
(imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 &&
(p->p_flag & P_TRACED) == 0) {
/*
diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c
index bb25d17..30b94b6 100644
--- a/sys/kern/kern_exit.c
+++ b/sys/kern/kern_exit.c
@@ -744,9 +744,11 @@ proc_reap(struct thread *td, struct proc *p, int *status, int options,
* Destroy resource accounting information associated with the process.
*/
racct_proc_exit(p);
+#ifdef RACCT
PROC_LOCK(p->p_pptr);
racct_sub(p->p_pptr, RACCT_NPROC, 1);
PROC_UNLOCK(p->p_pptr);
+#endif
/*
* Free credentials, arguments, and sigacts.
@@ -905,19 +907,23 @@ loop:
void
proc_reparent(struct proc *child, struct proc *parent)
{
+#ifdef RACCT
int locked;
+#endif
sx_assert(&proctree_lock, SX_XLOCKED);
PROC_LOCK_ASSERT(child, MA_OWNED);
if (child->p_pptr == parent)
return;
+#ifdef RACCT
locked = PROC_LOCKED(parent);
if (!locked)
PROC_LOCK(parent);
racct_add_force(parent, RACCT_NPROC, 1);
if (!locked)
PROC_UNLOCK(parent);
+#endif
PROC_LOCK(child->p_pptr);
racct_sub(child->p_pptr, RACCT_NPROC, 1);
sigqueue_take(child->p_ksi);
diff --git a/sys/kern/kern_fail.c b/sys/kern/kern_fail.c
index e0fb32b..f192471 100644
--- a/sys/kern/kern_fail.c
+++ b/sys/kern/kern_fail.c
@@ -52,6 +52,7 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
+#include <sys/ctype.h>
#include <sys/errno.h>
#include <sys/fail.h>
#include <sys/kernel.h>
@@ -59,6 +60,7 @@ __FBSDID("$FreeBSD$");
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
+#include <sys/proc.h>
#include <sys/sbuf.h>
#include <machine/stdarg.h>
@@ -88,16 +90,20 @@ enum fail_point_t {
FAIL_POINT_BREAK, /**< break into the debugger */
FAIL_POINT_PRINT, /**< print a message */
FAIL_POINT_SLEEP, /**< sleep for some msecs */
- FAIL_POINT_INVALID, /**< placeholder */
+ FAIL_POINT_NUMTYPES
};
-static const char *fail_type_strings[] = {
- "off",
- "panic",
- "return",
- "break",
- "print",
- "sleep",
+static struct {
+ const char *name;
+ int nmlen;
+} fail_type_strings[] = {
+#define FP_TYPE_NM_LEN(s) { s, sizeof(s) - 1 }
+ [FAIL_POINT_OFF] = FP_TYPE_NM_LEN("off"),
+ [FAIL_POINT_PANIC] = FP_TYPE_NM_LEN("panic"),
+ [FAIL_POINT_RETURN] = FP_TYPE_NM_LEN("return"),
+ [FAIL_POINT_BREAK] = FP_TYPE_NM_LEN("break"),
+ [FAIL_POINT_PRINT] = FP_TYPE_NM_LEN("print"),
+ [FAIL_POINT_SLEEP] = FP_TYPE_NM_LEN("sleep"),
};
/**
@@ -109,7 +115,7 @@ struct fail_point_entry {
int fe_arg; /**< argument to type (e.g. return value) */
int fe_prob; /**< likelihood of firing in millionths */
int fe_count; /**< number of times to fire, 0 means always */
-
+ pid_t fe_pid; /**< only fail for this process */
TAILQ_ENTRY(fail_point_entry) fe_entries; /**< next entry in fail point */
};
@@ -120,7 +126,7 @@ fail_point_sleep(struct fail_point *fp, struct fail_point_entry *ent,
/* convert from millisecs to ticks, rounding up */
int timo = ((msecs * hz) + 999) / 1000;
- if (timo) {
+ if (timo > 0) {
if (fp->fp_sleep_fn == NULL) {
msleep(fp, &g_fp_mtx, PWAIT, "failpt", timo);
} else {
@@ -191,19 +197,13 @@ fail_point_init(struct fail_point *fp, const char *fmt, ...)
void
fail_point_destroy(struct fail_point *fp)
{
- struct fail_point_entry *ent;
- if (fp->fp_flags & FAIL_POINT_DYNAMIC_NAME && fp->fp_name != NULL) {
- fp_free((void *)(intptr_t)fp->fp_name);
+ if ((fp->fp_flags & FAIL_POINT_DYNAMIC_NAME) != 0) {
+ fp_free(__DECONST(void *, fp->fp_name));
fp->fp_name = NULL;
}
fp->fp_flags = 0;
-
- while (!TAILQ_EMPTY(&fp->fp_entries)) {
- ent = TAILQ_FIRST(&fp->fp_entries);
- TAILQ_REMOVE(&fp->fp_entries, ent, fe_entries);
- fp_free(ent);
- }
+ clear_entries(&fp->fp_entries);
}
/**
@@ -222,16 +222,14 @@ fail_point_eval_nontrivial(struct fail_point *fp, int *return_value)
FP_LOCK();
- ent = TAILQ_FIRST(&fp->fp_entries);
- while (ent) {
+ TAILQ_FOREACH_SAFE(ent, &fp->fp_entries, fe_entries, next) {
int cont = 0; /* don't continue by default */
- next = TAILQ_NEXT(ent, fe_entries);
if (ent->fe_prob < PROB_MAX &&
- ent->fe_prob < random() % PROB_MAX) {
- cont = 1;
- goto loop_end;
- }
+ ent->fe_prob < random() % PROB_MAX)
+ continue;
+ if (ent->fe_pid != NO_PID && ent->fe_pid != curproc->p_pid)
+ continue;
switch (ent->fe_type) {
case FAIL_POINT_PANIC:
@@ -239,13 +237,14 @@ fail_point_eval_nontrivial(struct fail_point *fp, int *return_value)
/* NOTREACHED */
case FAIL_POINT_RETURN:
- if (return_value)
+ if (return_value != NULL)
*return_value = ent->fe_arg;
ret = FAIL_POINT_RC_RETURN;
break;
case FAIL_POINT_BREAK:
- printf("fail point %s breaking to debugger\n", fp->fp_name);
+ printf("fail point %s breaking to debugger\n",
+ fp->fp_name);
breakpoint();
break;
@@ -273,13 +272,9 @@ fail_point_eval_nontrivial(struct fail_point *fp, int *return_value)
break;
}
- if (ent && ent->fe_count > 0 && --ent->fe_count == 0)
+ if (ent != NULL && ent->fe_count > 0 && --ent->fe_count == 0)
free_entry(&fp->fp_entries, ent);
-
-loop_end:
- if (cont)
- ent = next;
- else
+ if (cont == 0)
break;
}
@@ -290,7 +285,7 @@ loop_end:
FP_UNLOCK();
- return ret;
+ return (ret);
}
/**
@@ -320,9 +315,11 @@ fail_point_get(struct fail_point *fp, struct sbuf *sb)
}
if (ent->fe_count > 0)
sbuf_printf(sb, "%d*", ent->fe_count);
- sbuf_printf(sb, "%s", fail_type_strings[ent->fe_type]);
+ sbuf_printf(sb, "%s", fail_type_strings[ent->fe_type].name);
if (ent->fe_arg)
sbuf_printf(sb, "(%d)", ent->fe_arg);
+ if (ent->fe_pid != NO_PID)
+ sbuf_printf(sb, "[pid %d]", ent->fe_pid);
if (TAILQ_NEXT(ent, fe_entries))
sbuf_printf(sb, "->");
}
@@ -380,7 +377,7 @@ fail_point_set(struct fail_point *fp, char *buf)
fp->fp_name, fp->fp_location, buf);
#endif /* IWARNING */
- return error;
+ return (error);
}
#define MAX_FAIL_POINT_BUF 1023
@@ -422,9 +419,8 @@ fail_point_sysctl(SYSCTL_HANDLER_ARGS)
}
out:
- if (buf)
- fp_free(buf);
- return error;
+ fp_free(buf);
+ return (error);
}
/**
@@ -437,12 +433,17 @@ parse_fail_point(struct fail_point_entries *ents, char *p)
/* <fail_point> ::
* <term> ( "->" <term> )*
*/
- if (!(p = parse_term(ents, p)))
- return 0;
- while (*p)
- if (p[0] != '-' || p[1] != '>' || !(p = parse_term(ents, p+2)))
- return 0;
- return p;
+ p = parse_term(ents, p);
+ if (p == NULL)
+ return (NULL);
+ while (*p != '\0') {
+ if (p[0] != '-' || p[1] != '>')
+ return (NULL);
+ p = parse_term(ents, p + 2);
+ if (p == NULL)
+ return (NULL);
+ }
+ return (p);
}
/**
@@ -455,6 +456,7 @@ parse_term(struct fail_point_entries *ents, char *p)
ent = fp_malloc(sizeof *ent, M_WAITOK | M_ZERO);
ent->fe_prob = PROB_MAX;
+ ent->fe_pid = NO_PID;
TAILQ_INSERT_TAIL(ents, ent, fe_entries);
/*
@@ -462,14 +464,16 @@ parse_term(struct fail_point_entries *ents, char *p)
* ( (<float> "%") | (<integer> "*" ) )*
* <type>
* [ "(" <integer> ")" ]
+ * [ "[pid " <integer> "]" ]
*/
/* ( (<float> "%") | (<integer> "*" ) )* */
- while (('0' <= *p && *p <= '9') || *p == '.') {
+ while (isdigit(*p) || *p == '.') {
int units, decimal;
- if (!(p = parse_number(&units, &decimal, p)))
- return 0;
+ p = parse_number(&units, &decimal, p);
+ if (p == NULL)
+ return (NULL);
if (*p == '%') {
if (units > 100) /* prevent overflow early */
@@ -477,37 +481,44 @@ parse_term(struct fail_point_entries *ents, char *p)
ent->fe_prob = units * (PROB_MAX / 100) + decimal;
if (ent->fe_prob > PROB_MAX)
ent->fe_prob = PROB_MAX;
-
} else if (*p == '*') {
if (!units || decimal)
- return 0;
+ return (NULL);
ent->fe_count = units;
-
- } else {
- return 0;
- }
-
+ } else
+ return (NULL);
p++;
}
/* <type> */
- if (!(p = parse_type(ent, p)))
- return 0;
+ p = parse_type(ent, p);
+ if (p == NULL)
+ return (NULL);
if (*p == '\0')
- return p;
+ return (p);
/* [ "(" <integer> ")" ] */
if (*p != '(')
return p;
p++;
- if (('0' <= *p && *p <= '9') || *p == '-')
- ent->fe_arg = strtol(p, &p, 0);
- else
- return 0;
+ if (!isdigit(*p) && *p != '-')
+ return (NULL);
+ ent->fe_arg = strtol(p, &p, 0);
if (*p++ != ')')
- return 0;
-
- return p;
+ return (NULL);
+
+ /* [ "[pid " <integer> "]" ] */
+#define PID_STRING "[pid "
+ if (strncmp(p, PID_STRING, sizeof(PID_STRING) - 1) != 0)
+ return (p);
+ p += sizeof(PID_STRING) - 1;
+ if (!isdigit(*p))
+ return (NULL);
+ ent->fe_pid = strtol(p, &p, 0);
+ if (*p++ != ']')
+ return (NULL);
+
+ return (p);
}
/**
@@ -528,14 +539,14 @@ parse_number(int *out_units, int *out_decimal, char *p)
old_p = p;
*out_units = strtol(p, &p, 10);
if (p == old_p && *p != '.')
- return 0;
+ return (NULL);
/* fractional part */
*out_decimal = 0;
if (*p == '.') {
int digits = 0;
p++;
- while ('0' <= *p && *p <= '9') {
+ while (isdigit(*p)) {
int digit = *p - '0';
if (digits < PROB_DIGITS - 2)
*out_decimal = *out_decimal * 10 + digit;
@@ -545,12 +556,12 @@ parse_number(int *out_units, int *out_decimal, char *p)
p++;
}
if (!digits) /* need at least one digit after '.' */
- return 0;
+ return (NULL);
while (digits++ < PROB_DIGITS - 2) /* add implicit zeros */
*out_decimal *= 10;
}
- return p; /* success */
+ return (p); /* success */
}
/**
@@ -560,21 +571,16 @@ static char *
parse_type(struct fail_point_entry *ent, char *beg)
{
enum fail_point_t type;
- char *end = beg;
- while ('a' <= *end && *end <= 'z')
- end++;
- if (beg == end)
- return 0;
- for (type = FAIL_POINT_OFF; type != FAIL_POINT_INVALID; type++) {
- const char *p = fail_type_strings[type];
- const char *q = beg;
- while (q < end && *p++ == *q++);
- if (q == end && *p == '\0') {
+ int len;
+
+ for (type = FAIL_POINT_OFF; type < FAIL_POINT_NUMTYPES; type++) {
+ len = fail_type_strings[type].nmlen;
+ if (strncmp(fail_type_strings[type].name, beg, len) == 0) {
ent->fe_type = type;
- return end;
+ return (beg + len);
}
}
- return 0;
+ return (NULL);
}
/**
@@ -595,6 +601,7 @@ static void
clear_entries(struct fail_point_entries *ents)
{
struct fail_point_entry *ent, *ent_next;
+
TAILQ_FOREACH_SAFE(ent, ents, fe_entries, ent_next)
fp_free(ent);
TAILQ_INIT(ents);
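The parser now accepts an optional trailing "[pid <n>]" term, so an entry fires
only in the named process. For context, a fail point is declared in kernel code
and driven through its sysctl; a minimal sketch with illustrative names (see
fail(9)):

	/* kernel: allow this spot to fail on demand */
	KFAIL_POINT_RETURN(DEBUG_FP, mysubsys_io);

	/*
	 * userland: fail with ENXIO (6) 5% of the time, only for pid 1234:
	 *   sysctl debug.fail_point.mysubsys_io='5%return(6)[pid 1234]'
	 */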
diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c
index 04e635a..9d3e22d 100644
--- a/sys/kern/kern_fork.c
+++ b/sys/kern/kern_fork.c
@@ -476,7 +476,10 @@ do_fork(struct thread *td, int flags, struct proc *p2, struct thread *td2,
sigacts_copy(newsigacts, p1->p_sigacts);
p2->p_sigacts = newsigacts;
}
- if (flags & RFLINUXTHPN)
+
+ if (flags & RFTSIGZMB)
+ p2->p_sigparent = RFTSIGNUM(flags);
+ else if (flags & RFLINUXTHPN)
p2->p_sigparent = SIGUSR1;
else
p2->p_sigparent = SIGCHLD;
@@ -719,10 +722,22 @@ fork1(struct thread *td, int flags, int pages, struct proc **procp)
static int curfail;
static struct timeval lastfail;
+ /* Check for the undefined or unimplemented flags. */
+ if ((flags & ~(RFFLAGS | RFTSIGFLAGS(RFTSIGMASK))) != 0)
+ return (EINVAL);
+
+ /* Signal value requires RFTSIGZMB. */
+ if ((flags & RFTSIGFLAGS(RFTSIGMASK)) != 0 && (flags & RFTSIGZMB) == 0)
+ return (EINVAL);
+
/* Can't copy and clear. */
if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
return (EINVAL);
+ /* Check the validity of the signal number. */
+ if ((flags & RFTSIGZMB) != 0 && (u_int)RFTSIGNUM(flags) > _SIG_MAXSIG)
+ return (EINVAL);
+
p1 = td->td_proc;
/*
@@ -734,11 +749,13 @@ fork1(struct thread *td, int flags, int pages, struct proc **procp)
return (fork_norfproc(td, flags));
}
+#ifdef RACCT
PROC_LOCK(p1);
error = racct_add(p1, RACCT_NPROC, 1);
PROC_UNLOCK(p1);
if (error != 0)
return (EAGAIN);
+#endif
mem_charged = 0;
vm2 = NULL;
@@ -822,6 +839,7 @@ fork1(struct thread *td, int flags, int pages, struct proc **procp)
goto fail;
}
+#ifdef RACCT
/*
* After fork, there is exactly one thread running.
*/
@@ -832,6 +850,7 @@ fork1(struct thread *td, int flags, int pages, struct proc **procp)
error = EAGAIN;
goto fail;
}
+#endif
/*
* Increment the count of procs running with this uid. Don't allow
@@ -874,9 +893,11 @@ fail1:
vmspace_free(vm2);
uma_zfree(proc_zone, newproc);
pause("fork", hz / 2);
+#ifdef RACCT
PROC_LOCK(p1);
racct_sub(p1, RACCT_NPROC, 1);
PROC_UNLOCK(p1);
+#endif
return (error);
}
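The new RFTSIGZMB flag lets an rfork() caller choose which signal the child
delivers to its parent on exit, instead of the fixed SIGCHLD (or SIGUSR1 for
RFLINUXTHPN). A hedged usage sketch:

	#include <signal.h>
	#include <unistd.h>

	pid_t pid;

	/* Child will post SIGUSR2 (not SIGCHLD) to us when it exits. */
	pid = rfork(RFPROC | RFTSIGZMB | RFTSIGFLAGS(SIGUSR2));
	if (pid == 0)
		_exit(0);	/* child */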
diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c
index 5850ad1..358d673 100644
--- a/sys/kern/kern_jail.c
+++ b/sys/kern/kern_jail.c
@@ -3639,6 +3639,7 @@ prison_priv_check(struct ucred *cred, int priv)
case PRIV_NET_LAGG:
case PRIV_NET_GIF:
case PRIV_NET_SETIFVNET:
+ case PRIV_NET_SETIFFIB:
/*
* 802.11-related privileges.
diff --git a/sys/kern/kern_racct.c b/sys/kern/kern_racct.c
index 01f7777..401ce1d 100644
--- a/sys/kern/kern_racct.c
+++ b/sys/kern/kern_racct.c
@@ -103,7 +103,7 @@ SDT_PROBE_DEFINE2(racct, kernel, racct, leave, leave, "struct racct *",
int racct_types[] = {
[RACCT_CPU] =
- RACCT_IN_THOUSANDS,
+ RACCT_IN_MILLIONS,
[RACCT_DATA] =
RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
[RACCT_STACK] =
@@ -141,7 +141,7 @@ int racct_types[] = {
[RACCT_SHMSIZE] =
RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
[RACCT_WALLCLOCK] =
- RACCT_IN_THOUSANDS };
+ RACCT_IN_MILLIONS };
static void
racct_add_racct(struct racct *dest, const struct racct *src)
@@ -173,7 +173,7 @@ racct_sub_racct(struct racct *dest, const struct racct *src)
* Update resource usage in dest.
*/
for (i = 0; i <= RACCT_MAX; i++) {
- if (!racct_is_sloppy(i)) {
+ if (!RACCT_IS_SLOPPY(i)) {
KASSERT(dest->r_resources[i] >= 0,
("racct propagation meltdown: dest < 0"));
KASSERT(src->r_resources[i] >= 0,
@@ -181,10 +181,10 @@ racct_sub_racct(struct racct *dest, const struct racct *src)
KASSERT(src->r_resources[i] <= dest->r_resources[i],
("racct propagation meltdown: src > dest"));
}
- if (racct_is_reclaimable(i)) {
+ if (RACCT_IS_RECLAIMABLE(i)) {
dest->r_resources[i] -= src->r_resources[i];
if (dest->r_resources[i] < 0) {
- KASSERT(racct_is_sloppy(i),
+ KASSERT(RACCT_IS_SLOPPY(i),
("racct_sub_racct: usage < 0"));
dest->r_resources[i] = 0;
}
@@ -218,9 +218,9 @@ racct_destroy_locked(struct racct **racctp)
racct = *racctp;
for (i = 0; i <= RACCT_MAX; i++) {
- if (racct_is_sloppy(i))
+ if (RACCT_IS_SLOPPY(i))
continue;
- if (!racct_is_reclaimable(i))
+ if (!RACCT_IS_RECLAIMABLE(i))
continue;
KASSERT(racct->r_resources[i] == 0,
("destroying non-empty racct: "
@@ -255,7 +255,7 @@ racct_alloc_resource(struct racct *racct, int resource,
racct->r_resources[resource] += amount;
if (racct->r_resources[resource] < 0) {
- KASSERT(racct_is_sloppy(resource),
+ KASSERT(RACCT_IS_SLOPPY(resource),
("racct_alloc_resource: usage < 0"));
racct->r_resources[resource] = 0;
}
@@ -285,7 +285,7 @@ racct_add(struct proc *p, int resource, uint64_t amount)
mtx_lock(&racct_lock);
#ifdef RCTL
error = rctl_enforce(p, resource, amount);
- if (error && racct_is_deniable(resource)) {
+ if (error && RACCT_IS_DENIABLE(resource)) {
SDT_PROBE(racct, kernel, rusage, add_failure, p, resource,
amount, 0, 0);
mtx_unlock(&racct_lock);
@@ -373,14 +373,14 @@ racct_set_locked(struct proc *p, int resource, uint64_t amount)
diff = amount - p->p_racct->r_resources[resource];
#ifdef notyet
- KASSERT(diff >= 0 || racct_is_reclaimable(resource),
+ KASSERT(diff >= 0 || RACCT_IS_RECLAIMABLE(resource),
("racct_set: usage of non-reclaimable resource %d dropping",
resource));
#endif
#ifdef RCTL
if (diff > 0) {
error = rctl_enforce(p, resource, diff);
- if (error && racct_is_deniable(resource)) {
+ if (error && RACCT_IS_DENIABLE(resource)) {
SDT_PROBE(racct, kernel, rusage, set_failure, p,
resource, amount, 0, 0);
return (error);
@@ -489,7 +489,7 @@ racct_sub(struct proc *p, int resource, uint64_t amount)
* We need proc lock to dereference p->p_ucred.
*/
PROC_LOCK_ASSERT(p, MA_OWNED);
- KASSERT(racct_is_reclaimable(resource),
+ KASSERT(RACCT_IS_RECLAIMABLE(resource),
("racct_sub: called for non-reclaimable resource %d", resource));
mtx_lock(&racct_lock);
@@ -512,7 +512,7 @@ racct_sub_cred_locked(struct ucred *cred, int resource, uint64_t amount)
0, 0);
#ifdef notyet
- KASSERT(racct_is_reclaimable(resource),
+ KASSERT(RACCT_IS_RECLAIMABLE(resource),
("racct_sub_cred: called for non-reclaimable resource %d",
resource));
#endif
@@ -564,7 +564,7 @@ racct_proc_fork(struct proc *parent, struct proc *child)
*/
for (i = 0; i <= RACCT_MAX; i++) {
if (parent->p_racct->r_resources[i] == 0 ||
- !racct_is_inheritable(i))
+ !RACCT_IS_INHERITABLE(i))
continue;
error = racct_set_locked(child, i,
diff --git a/sys/kern/kern_rctl.c b/sys/kern/kern_rctl.c
index 3d0a478..a939758 100644
--- a/sys/kern/kern_rctl.c
+++ b/sys/kern/kern_rctl.c
@@ -99,17 +99,17 @@ static struct dict subjectnames[] = {
{ NULL, -1 }};
static struct dict resourcenames[] = {
- { "cpu", RACCT_CPU },
- { "data", RACCT_DATA },
- { "stack", RACCT_STACK },
- { "core", RACCT_CORE },
- { "rss", RACCT_RSS },
- { "memlock", RACCT_MEMLOCK },
- { "nproc", RACCT_NPROC },
- { "nofile", RACCT_NOFILE },
- { "vmem", RACCT_VMEM },
- { "npts", RACCT_NPTS },
- { "swap", RACCT_SWAP },
+ { "cputime", RACCT_CPU },
+ { "datasize", RACCT_DATA },
+ { "stacksize", RACCT_STACK },
+ { "coredumpsize", RACCT_CORE },
+ { "memoryuse", RACCT_RSS },
+ { "memorylocked", RACCT_MEMLOCK },
+ { "maxproc", RACCT_NPROC },
+ { "openfiles", RACCT_NOFILE },
+ { "vmemoryuse", RACCT_VMEM },
+ { "pseudoterminals", RACCT_NPTS },
+ { "swapuse", RACCT_SWAP },
{ "nthr", RACCT_NTHR },
{ "msgqqueued", RACCT_MSGQQUEUED },
{ "msgqsize", RACCT_MSGQSIZE },
@@ -907,7 +907,7 @@ rctl_string_to_rule(char *rulestr, struct rctl_rule **rulep)
error = str2int64(amountstr, &rule->rr_amount);
if (error != 0)
goto out;
- if (racct_is_in_thousands(rule->rr_resource))
+ if (RACCT_IS_IN_MILLIONS(rule->rr_resource))
rule->rr_amount *= 1000;
}
@@ -947,7 +947,7 @@ rctl_rule_add(struct rctl_rule *rule)
/*
* Some rules just don't make sense. Note that the one below
- * cannot be rewritten using racct_is_deniable(); the RACCT_PCTCPU,
+ * cannot be rewritten using RACCT_IS_DENIABLE(); the RACCT_PCTCPU,
* for example, is not deniable in the racct sense, but the
* limit is enforced in a different way, so "deny" rules for %CPU
* do make sense.
@@ -958,7 +958,7 @@ rctl_rule_add(struct rctl_rule *rule)
return (EOPNOTSUPP);
if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS &&
- racct_is_sloppy(rule->rr_resource))
+ RACCT_IS_SLOPPY(rule->rr_resource))
return (EOPNOTSUPP);
/*
@@ -1152,8 +1152,8 @@ rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule)
amount = rule->rr_amount;
if (amount != RCTL_AMOUNT_UNDEFINED &&
- racct_is_in_thousands(rule->rr_resource))
- amount /= 1000;
+ RACCT_IS_IN_MILLIONS(rule->rr_resource))
+ amount /= 1000000;
sbuf_printf(sb, "%s:%s=%jd",
rctl_resource_name(rule->rr_resource),
@@ -1219,10 +1219,10 @@ rctl_racct_to_sbuf(struct racct *racct, int sloppy)
sb = sbuf_new_auto();
for (i = 0; i <= RACCT_MAX; i++) {
- if (sloppy == 0 && racct_is_sloppy(i))
+ if (sloppy == 0 && RACCT_IS_SLOPPY(i))
continue;
amount = racct->r_resources[i];
- if (racct_is_in_thousands(i))
+ if (RACCT_IS_IN_MILLIONS(i))
amount /= 1000;
sbuf_printf(sb, "%s=%jd,", rctl_resource_name(i), amount);
}
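With the rename, resource identifiers match the vocabulary used by rctl(8) and
login.conf, so rules read naturally; for example (illustrative subjects):

	rctl -a user:joe:openfiles:deny=128
	rctl -a process:1234:vmemoryuse:deny=1g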
diff --git a/sys/kern/kern_rmlock.c b/sys/kern/kern_rmlock.c
index 3214e1b..1c7337d 100644
--- a/sys/kern/kern_rmlock.c
+++ b/sys/kern/kern_rmlock.c
@@ -263,7 +263,7 @@ _rm_rlock_hard(struct rmlock *rm, struct rm_priotracker *tracker, int trylock)
pc = pcpu_find(curcpu);
/* Check if we just need to do a proper critical_exit. */
- if (!CPU_OVERLAP(&pc->pc_cpumask, &rm->rm_writecpus)) {
+ if (!CPU_ISSET(pc->pc_cpuid, &rm->rm_writecpus)) {
critical_exit();
return (1);
}
@@ -325,7 +325,7 @@ _rm_rlock_hard(struct rmlock *rm, struct rm_priotracker *tracker, int trylock)
critical_enter();
pc = pcpu_find(curcpu);
- CPU_NAND(&rm->rm_writecpus, &pc->pc_cpumask);
+ CPU_CLR(pc->pc_cpuid, &rm->rm_writecpus);
rm_tracker_add(pc, tracker);
sched_pin();
critical_exit();
@@ -367,7 +367,7 @@ _rm_rlock(struct rmlock *rm, struct rm_priotracker *tracker, int trylock)
* conditional jump.
*/
if (0 == (td->td_owepreempt |
- CPU_OVERLAP(&rm->rm_writecpus, &pc->pc_cpumask)))
+ CPU_ISSET(pc->pc_cpuid, &rm->rm_writecpus)))
return (1);
/* We do not have a read token and need to acquire one. */
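These rmlock changes are part of a tree-wide switch away from the per-CPU
pc_cpumask field (removed in subr_pcpu.c below) toward indexing cpuset_t masks
directly by CPU id. The idiom change, in brief:

	/* before: compare a one-bit per-CPU mask against the set */
	if (CPU_OVERLAP(&pc->pc_cpumask, &rm->rm_writecpus))

	/* after: test the CPU id in the set; no per-CPU mask needed */
	if (CPU_ISSET(pc->pc_cpuid, &rm->rm_writecpus))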
diff --git a/sys/kern/kern_tc.c b/sys/kern/kern_tc.c
index 39d6f23..0c52071 100644
--- a/sys/kern/kern_tc.c
+++ b/sys/kern/kern_tc.c
@@ -492,6 +492,12 @@ tc_windup(void)
/* Now is a good time to change timecounters. */
if (th->th_counter != timecounter) {
+#ifndef __arm__
+ if ((timecounter->tc_flags & TC_FLAGS_C3STOP) != 0)
+ cpu_disable_deep_sleep++;
+ if ((th->th_counter->tc_flags & TC_FLAGS_C3STOP) != 0)
+ cpu_disable_deep_sleep--;
+#endif
th->th_counter = timecounter;
th->th_offset_count = ncount;
tc_min_ticktock_freq = max(1, timecounter->tc_frequency /
diff --git a/sys/kern/kern_thr.c b/sys/kern/kern_thr.c
index 7011a53..94e41e2 100644
--- a/sys/kern/kern_thr.c
+++ b/sys/kern/kern_thr.c
@@ -185,11 +185,13 @@ create_thread(struct thread *td, mcontext_t *ctx,
}
}
+#ifdef RACCT
PROC_LOCK(td->td_proc);
error = racct_add(p, RACCT_NTHR, 1);
PROC_UNLOCK(td->td_proc);
if (error != 0)
return (EPROCLIM);
+#endif
/* Initialize our td */
newtd = thread_alloc(0);
@@ -277,9 +279,11 @@ create_thread(struct thread *td, mcontext_t *ctx,
return (0);
fail:
+#ifdef RACCT
PROC_LOCK(p);
racct_sub(p, RACCT_NTHR, 1);
PROC_UNLOCK(p);
+#endif
return (error);
}
diff --git a/sys/kern/link_elf.c b/sys/kern/link_elf.c
index 38bf37f..2f9a1f6 100644
--- a/sys/kern/link_elf.c
+++ b/sys/kern/link_elf.c
@@ -950,11 +950,11 @@ link_elf_load_file(linker_class_t cls, const char* filename,
ef->ddbstrcnt = strcnt;
ef->ddbstrtab = ef->strbase;
+nosyms:
error = link_elf_link_common_finish(lf);
if (error != 0)
goto out;
-nosyms:
*result = lf;
out:
diff --git a/sys/kern/sched_4bsd.c b/sys/kern/sched_4bsd.c
index 592bb80..574755f0 100644
--- a/sys/kern/sched_4bsd.c
+++ b/sys/kern/sched_4bsd.c
@@ -951,8 +951,7 @@ sched_switch(struct thread *td, struct thread *newtd, int flags)
if (td->td_flags & TDF_IDLETD) {
TD_SET_CAN_RUN(td);
#ifdef SMP
- /* Spinlock held here, assume no migration. */
- CPU_NAND(&idle_cpus_mask, PCPU_PTR(cpumask));
+ CPU_CLR(PCPU_GET(cpuid), &idle_cpus_mask);
#endif
} else {
if (TD_IS_RUNNING(td)) {
@@ -1026,7 +1025,7 @@ sched_switch(struct thread *td, struct thread *newtd, int flags)
#ifdef SMP
if (td->td_flags & TDF_IDLETD)
- CPU_OR(&idle_cpus_mask, PCPU_PTR(cpumask));
+ CPU_SET(PCPU_GET(cpuid), &idle_cpus_mask);
#endif
sched_lock.mtx_lock = (uintptr_t)td;
td->td_oncpu = PCPU_GET(cpuid);
@@ -1055,7 +1054,8 @@ static int
forward_wakeup(int cpunum)
{
struct pcpu *pc;
- cpuset_t dontuse, id, map, map2, me;
+ cpuset_t dontuse, map, map2;
+ u_int id, me;
int iscpuset;
mtx_assert(&sched_lock, MA_OWNED);
@@ -1073,27 +1073,24 @@ forward_wakeup(int cpunum)
/*
* Check the idle mask we received against what we calculated
* before in the old version.
- *
- * Also note that sched_lock is held now, thus no migration is
- * expected.
*/
- me = PCPU_GET(cpumask);
+ me = PCPU_GET(cpuid);
/* Don't bother if we should be doing it ourself. */
- if (CPU_OVERLAP(&me, &idle_cpus_mask) &&
- (cpunum == NOCPU || CPU_ISSET(cpunum, &me)))
+ if (CPU_ISSET(me, &idle_cpus_mask) &&
+ (cpunum == NOCPU || me == cpunum))
return (0);
- dontuse = me;
+ CPU_SETOF(me, &dontuse);
CPU_OR(&dontuse, &stopped_cpus);
CPU_OR(&dontuse, &hlt_cpus_mask);
CPU_ZERO(&map2);
if (forward_wakeup_use_loop) {
STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
- id = pc->pc_cpumask;
- if (!CPU_OVERLAP(&id, &dontuse) &&
+ id = pc->pc_cpuid;
+ if (!CPU_ISSET(id, &dontuse) &&
pc->pc_curthread == pc->pc_idlethread) {
- CPU_OR(&map2, &id);
+ CPU_SET(id, &map2);
}
}
}
@@ -1125,11 +1122,11 @@ forward_wakeup(int cpunum)
if (!CPU_EMPTY(&map)) {
forward_wakeups_delivered++;
STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
- id = pc->pc_cpumask;
- if (!CPU_OVERLAP(&map, &id))
+ id = pc->pc_cpuid;
+ if (!CPU_ISSET(id, &map))
continue;
if (cpu_idle_wakeup(pc->pc_cpuid))
- CPU_NAND(&map, &id);
+ CPU_CLR(id, &map);
}
if (!CPU_EMPTY(&map))
ipi_selected(map, IPI_AST);
@@ -1147,7 +1144,7 @@ kick_other_cpu(int pri, int cpuid)
int cpri;
pcpu = pcpu_find(cpuid);
- if (CPU_OVERLAP(&idle_cpus_mask, &pcpu->pc_cpumask)) {
+ if (CPU_ISSET(cpuid, &idle_cpus_mask)) {
forward_wakeups_delivered++;
if (!cpu_idle_wakeup(cpuid))
ipi_cpu(cpuid, IPI_AST);
@@ -1205,10 +1202,10 @@ void
sched_add(struct thread *td, int flags)
#ifdef SMP
{
- cpuset_t idle, me, tidlemsk;
+ cpuset_t tidlemsk;
struct td_sched *ts;
+ u_int cpu, cpuid;
int forwarded = 0;
- int cpu;
int single_cpu = 0;
ts = td->td_sched;
@@ -1271,23 +1268,17 @@ sched_add(struct thread *td, int flags)
ts->ts_runq = &runq;
}
- if (single_cpu && (cpu != PCPU_GET(cpuid))) {
+ cpuid = PCPU_GET(cpuid);
+ if (single_cpu && cpu != cpuid) {
kick_other_cpu(td->td_priority, cpu);
} else {
if (!single_cpu) {
+ tidlemsk = idle_cpus_mask;
+ CPU_NAND(&tidlemsk, &hlt_cpus_mask);
+ CPU_CLR(cpuid, &tidlemsk);
- /*
- * Thread spinlock is held here, assume no
- * migration is possible.
- */
- me = PCPU_GET(cpumask);
- idle = idle_cpus_mask;
- tidlemsk = idle;
- CPU_AND(&idle, &me);
- CPU_OR(&me, &hlt_cpus_mask);
- CPU_NAND(&tidlemsk, &me);
-
- if (CPU_EMPTY(&idle) && ((flags & SRQ_INTR) == 0) &&
+ if (!CPU_ISSET(cpuid, &idle_cpus_mask) &&
+ ((flags & SRQ_INTR) == 0) &&
!CPU_EMPTY(&tidlemsk))
forwarded = forward_wakeup(cpu);
}
diff --git a/sys/kern/subr_kdb.c b/sys/kern/subr_kdb.c
index c2f6e99..f5cb31e 100644
--- a/sys/kern/subr_kdb.c
+++ b/sys/kern/subr_kdb.c
@@ -88,20 +88,6 @@ SYSCTL_PROC(_debug_kdb, OID_AUTO, trap_code, CTLTYPE_INT | CTLFLAG_RW, NULL, 0,
kdb_sysctl_trap_code, "I", "set to cause a page fault via code access");
/*
- * Flag indicating whether or not to IPI the other CPUs to stop them on
- * entering the debugger. Sometimes, this will result in a deadlock as
- * stop_cpus() waits for the other cpus to stop, so we allow it to be
- * disabled. In order to maximize the chances of success, use a hard
- * stop for that.
- */
-#ifdef SMP
-static int kdb_stop_cpus = 1;
-SYSCTL_INT(_debug_kdb, OID_AUTO, stop_cpus, CTLFLAG_RW | CTLFLAG_TUN,
- &kdb_stop_cpus, 0, "stop other CPUs when entering the debugger");
-TUNABLE_INT("debug.kdb.stop_cpus", &kdb_stop_cpus);
-#endif
-
-/*
* Flag to indicate to debuggers why the debugger was entered.
*/
const char * volatile kdb_why = KDB_WHY_UNSET;
@@ -211,9 +197,12 @@ kdb_sysctl_trap_code(SYSCTL_HANDLER_ARGS)
void
kdb_panic(const char *msg)
{
-
#ifdef SMP
- stop_cpus_hard(PCPU_GET(other_cpus));
+ cpuset_t other_cpus;
+
+ other_cpus = all_cpus;
+ CPU_CLR(PCPU_GET(cpuid), &other_cpus);
+ stop_cpus_hard(other_cpus);
#endif
printf("KDB: panic\n");
panic("%s", msg);
@@ -429,7 +418,7 @@ kdb_thr_ctx(struct thread *thr)
#if defined(SMP) && defined(KDB_STOPPEDPCB)
STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
if (pc->pc_curthread == thr &&
- CPU_OVERLAP(&stopped_cpus, &pc->pc_cpumask))
+ CPU_ISSET(pc->pc_cpuid, &stopped_cpus))
return (KDB_STOPPEDPCB(pc));
}
#endif
@@ -513,11 +502,11 @@ kdb_thr_select(struct thread *thr)
int
kdb_trap(int type, int code, struct trapframe *tf)
{
- struct kdb_dbbe *be;
- register_t intr;
#ifdef SMP
- int did_stop_cpus;
+ cpuset_t other_cpus;
#endif
+ struct kdb_dbbe *be;
+ register_t intr;
int handled;
be = kdb_dbbe;
@@ -531,8 +520,9 @@ kdb_trap(int type, int code, struct trapframe *tf)
intr = intr_disable();
#ifdef SMP
- if ((did_stop_cpus = kdb_stop_cpus) != 0)
- stop_cpus_hard(PCPU_GET(other_cpus));
+ other_cpus = all_cpus;
+ CPU_CLR(PCPU_GET(cpuid), &other_cpus);
+ stop_cpus_hard(other_cpus);
#endif
kdb_active++;
@@ -558,8 +548,7 @@ kdb_trap(int type, int code, struct trapframe *tf)
kdb_active--;
#ifdef SMP
- if (did_stop_cpus)
- restart_cpus(stopped_cpus);
+ restart_cpus(stopped_cpus);
#endif
intr_restore(intr);
diff --git a/sys/kern/subr_pcpu.c b/sys/kern/subr_pcpu.c
index a6b3ae0..ec6b590 100644
--- a/sys/kern/subr_pcpu.c
+++ b/sys/kern/subr_pcpu.c
@@ -87,7 +87,6 @@ pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
KASSERT(cpuid >= 0 && cpuid < MAXCPU,
("pcpu_init: invalid cpuid %d", cpuid));
pcpu->pc_cpuid = cpuid;
- CPU_SETOF(cpuid, &pcpu->pc_cpumask);
cpuid_to_pcpu[cpuid] = pcpu;
STAILQ_INSERT_TAIL(&cpuhead, pcpu, pc_allcpu);
cpu_pcpu_init(pcpu, cpuid, size);
diff --git a/sys/kern/subr_smp.c b/sys/kern/subr_smp.c
index c38177b..caec965 100644
--- a/sys/kern/subr_smp.c
+++ b/sys/kern/subr_smp.c
@@ -142,7 +142,7 @@ mp_start(void *dummy)
/* Probe for MP hardware. */
if (smp_disabled != 0 || cpu_mp_probe() == 0) {
mp_ncpus = 1;
- all_cpus = PCPU_GET(cpumask);
+ CPU_SETOF(PCPU_GET(cpuid), &all_cpus);
return;
}
@@ -236,12 +236,10 @@ generic_stop_cpus(cpuset_t map, u_int type)
/* spin */
cpu_spinwait();
i++;
-#ifdef DIAGNOSTIC
- if (i == 100000) {
+ if (i == 100000000) {
printf("timeout stopping cpus\n");
break;
}
-#endif
}
stopping_cpu = NOCPU;
@@ -708,7 +706,7 @@ mp_setvariables_for_up(void *dummy)
{
mp_ncpus = 1;
mp_maxid = PCPU_GET(cpuid);
- all_cpus = PCPU_GET(cpumask);
+ CPU_SETOF(mp_maxid, &all_cpus);
KASSERT(PCPU_GET(cpuid) == 0, ("UP must have a CPU ID of zero"));
}
SYSINIT(cpu_mp_setvariables, SI_SUB_TUNABLES, SI_ORDER_FIRST,
diff --git a/sys/kern/subr_trap.c b/sys/kern/subr_trap.c
index 0113d7b..3527ed1 100644
--- a/sys/kern/subr_trap.c
+++ b/sys/kern/subr_trap.c
@@ -44,7 +44,7 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
-#include "opt_capabilities.h"
+#include "opt_capsicum.h"
#include "opt_ktrace.h"
#include "opt_kdtrace.h"
#include "opt_sched.h"
@@ -313,7 +313,7 @@ syscallenter(struct thread *td, struct syscall_args *sa)
goto retval;
}
-#ifdef CAPABILITIES
+#ifdef CAPABILITY_MODE
/*
* In capability mode, we only allow access to system calls
* flagged with SYF_CAPENABLED.
diff --git a/sys/kern/subr_uio.c b/sys/kern/subr_uio.c
index 9385dc4..6e81328 100644
--- a/sys/kern/subr_uio.c
+++ b/sys/kern/subr_uio.c
@@ -64,6 +64,8 @@ __FBSDID("$FreeBSD$");
SYSCTL_INT(_kern, KERN_IOV_MAX, iov_max, CTLFLAG_RD, NULL, UIO_MAXIOV,
"Maximum number of elements in an I/O vector; sysconf(_SC_IOV_MAX)");
+static int uiomove_faultflag(void *cp, int n, struct uio *uio, int nofault);
+
#ifdef ZERO_COPY_SOCKETS
/* Declared in uipc_socket.c */
extern int so_zero_copy_receive;
@@ -129,23 +131,65 @@ retry:
#endif /* ZERO_COPY_SOCKETS */
int
+copyin_nofault(const void *udaddr, void *kaddr, size_t len)
+{
+ int error, save;
+
+ save = vm_fault_disable_pagefaults();
+ error = copyin(udaddr, kaddr, len);
+ vm_fault_enable_pagefaults(save);
+ return (error);
+}
+
+int
+copyout_nofault(const void *kaddr, void *udaddr, size_t len)
+{
+ int error, save;
+
+ save = vm_fault_disable_pagefaults();
+ error = copyout(kaddr, udaddr, len);
+ vm_fault_enable_pagefaults(save);
+ return (error);
+}
+
+int
uiomove(void *cp, int n, struct uio *uio)
{
- struct thread *td = curthread;
+
+ return (uiomove_faultflag(cp, n, uio, 0));
+}
+
+int
+uiomove_nofault(void *cp, int n, struct uio *uio)
+{
+
+ return (uiomove_faultflag(cp, n, uio, 1));
+}
+
+static int
+uiomove_faultflag(void *cp, int n, struct uio *uio, int nofault)
+{
+ struct thread *td;
struct iovec *iov;
u_int cnt;
- int error = 0;
- int save = 0;
+ int error, newflags, save;
+
+ td = curthread;
+ error = 0;
KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
("uiomove: mode"));
- KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
+ KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == td,
("uiomove proc"));
- WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
- "Calling uiomove()");
+ if (!nofault)
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
+ "Calling uiomove()");
- save = td->td_pflags & TDP_DEADLKTREAT;
- td->td_pflags |= TDP_DEADLKTREAT;
+ /* XXX does it make sense to set TDP_DEADLKTREAT for UIO_SYSSPACE? */
+ newflags = TDP_DEADLKTREAT;
+ if (uio->uio_segflg == UIO_USERSPACE && nofault)
+ newflags |= TDP_NOFAULTING;
+ save = curthread_pflags_set(newflags);
while (n > 0 && uio->uio_resid) {
iov = uio->uio_iov;
@@ -187,8 +231,7 @@ uiomove(void *cp, int n, struct uio *uio)
n -= cnt;
}
out:
- if (save == 0)
- td->td_pflags &= ~TDP_DEADLKTREAT;
+ curthread_pflags_restore(save);
return (error);
}
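copyin_nofault(), copyout_nofault() and uiomove_nofault() let a caller that
must not sleep, typically because it holds a non-sleepable lock, fail with
EFAULT rather than take a page fault. A sketch of the expected pattern
(hypothetical driver; sc_mtx and sc_buf are assumptions):

	mtx_lock(&sc->sc_mtx);
	error = uiomove_nofault(sc->sc_buf, len, uio);
	mtx_unlock(&sc->sc_mtx);
	/* On EFAULT the caller can fault the user pages in outside the
	 * lock (e.g. plain uiomove() on a bounce buffer) and retry. */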
diff --git a/sys/kern/sys_capability.c b/sys/kern/sys_capability.c
index 89dc923..04f98d8 100644
--- a/sys/kern/sys_capability.c
+++ b/sys/kern/sys_capability.c
@@ -36,7 +36,7 @@
*
*/
-#include "opt_capabilities.h"
+#include "opt_capsicum.h"
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
@@ -59,18 +59,11 @@ __FBSDID("$FreeBSD$");
#include <vm/uma.h>
#include <vm/vm.h>
-#ifdef CAPABILITIES
+#ifdef CAPABILITY_MODE
FEATURE(security_capabilities, "Capsicum Capability Mode");
/*
- * We don't currently have any MIB entries for sysctls, but we do expose
- * security.capabilities so that it's easy to tell if options CAPABILITIES is
- * compiled into the kernel.
- */
-SYSCTL_NODE(_security, OID_AUTO, capabilities, CTLFLAG_RW, 0, "Capsicum");
-
-/*
* System call to enter capability mode for the process.
*/
int
@@ -106,7 +99,7 @@ cap_getmode(struct thread *td, struct cap_getmode_args *uap)
return (copyout(&i, uap->modep, sizeof(i)));
}
-#else /* !CAPABILITIES */
+#else /* !CAPABILITY_MODE */
int
cap_enter(struct thread *td, struct cap_enter_args *uap)
@@ -122,4 +115,403 @@ cap_getmode(struct thread *td, struct cap_getmode_args *uap)
return (ENOSYS);
}
+#endif /* CAPABILITY_MODE */
+
+#ifdef CAPABILITIES
+
+/*
+ * struct capability describes a capability, and is hung off of its struct
+ * file f_data field. cap_file and cap_rightss are static once hooked up, as
+ * neither the object it references nor the rights it encapsulates are
+ * permitted to change.
+ */
+struct capability {
+ struct file *cap_object; /* Underlying object's file. */
+ struct file *cap_file; /* Back-pointer to cap's file. */
+ cap_rights_t cap_rights; /* Mask of rights on object. */
+};
+
+/*
+ * Capabilities have a fileops vector, but in practice none should ever be
+ * called except for fo_close, as the capability will normally not be
+ * returned during a file descriptor lookup in the system call code.
+ */
+static fo_rdwr_t capability_read;
+static fo_rdwr_t capability_write;
+static fo_truncate_t capability_truncate;
+static fo_ioctl_t capability_ioctl;
+static fo_poll_t capability_poll;
+static fo_kqfilter_t capability_kqfilter;
+static fo_stat_t capability_stat;
+static fo_close_t capability_close;
+
+static struct fileops capability_ops = {
+ .fo_read = capability_read,
+ .fo_write = capability_write,
+ .fo_truncate = capability_truncate,
+ .fo_ioctl = capability_ioctl,
+ .fo_poll = capability_poll,
+ .fo_kqfilter = capability_kqfilter,
+ .fo_stat = capability_stat,
+ .fo_close = capability_close,
+ .fo_flags = DFLAG_PASSABLE,
+};
+
+static struct fileops capability_ops_unpassable = {
+ .fo_read = capability_read,
+ .fo_write = capability_write,
+ .fo_truncate = capability_truncate,
+ .fo_ioctl = capability_ioctl,
+ .fo_poll = capability_poll,
+ .fo_kqfilter = capability_kqfilter,
+ .fo_stat = capability_stat,
+ .fo_close = capability_close,
+ .fo_flags = 0,
+};
+
+static uma_zone_t capability_zone;
+
+static void
+capability_init(void *dummy __unused)
+{
+
+ capability_zone = uma_zcreate("capability", sizeof(struct capability),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+ if (capability_zone == NULL)
+ panic("capability_init: capability_zone not initialized");
+}
+SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, capability_init, NULL);
+
+/*
+ * Test whether a capability grants the requested rights.
+ */
+static int
+cap_check(struct capability *c, cap_rights_t rights)
+{
+
+ if ((c->cap_rights | rights) != c->cap_rights)
+ return (ENOTCAPABLE);
+ return (0);
+}
+
+/*
+ * Extract rights from a capability for monitoring purposes -- not for use in
+ * any other way, as we want to keep all capability permission evaluation in
+ * this one file.
+ */
+cap_rights_t
+cap_rights(struct file *fp_cap)
+{
+ struct capability *c;
+
+ KASSERT(fp_cap->f_type == DTYPE_CAPABILITY,
+ ("cap_rights: !capability"));
+
+ c = fp_cap->f_data;
+ return (c->cap_rights);
+}
+
+/*
+ * System call to create a new capability reference to either an existing
+ * file object or an existing capability.
+ */
+int
+cap_new(struct thread *td, struct cap_new_args *uap)
+{
+ int error, capfd;
+ int fd = uap->fd;
+ struct file *fp, *fcapp;
+ cap_rights_t rights = uap->rights;
+
+ AUDIT_ARG_FD(fd);
+#ifdef notyet /* capability auditing will follow in a few commits */
+ AUDIT_ARG_RIGHTS(rights);
+#endif
+ error = fget(td, fd, &fp);
+ if (error)
+ return (error);
+ AUDIT_ARG_FILE(td->td_proc, fp);
+ error = kern_capwrap(td, fp, rights, &fcapp, &capfd);
+ if (error)
+ return (error);
+
+ /*
+ * Release our reference to the file (kern_capwrap has held a reference
+ * for the filedesc array).
+ */
+ fdrop(fp, td);
+ td->td_retval[0] = capfd;
+ return (0);
+}
+
+/*
+ * System call to query the rights mask associated with a capability.
+ */
+int
+cap_getrights(struct thread *td, struct cap_getrights_args *uap)
+{
+ struct capability *cp;
+ struct file *fp;
+ int error;
+
+ AUDIT_ARG_FD(uap->fd);
+ error = fgetcap(td, uap->fd, &fp);
+ if (error)
+ return (error);
+ cp = fp->f_data;
+ error = copyout(&cp->cap_rights, uap->rightsp, sizeof(*uap->rightsp));
+ fdrop(fp, td);
+ return (error);
+}
+
+/*
+ * Create a capability to wrap around an existing file.
+ */
+int
+kern_capwrap(struct thread *td, struct file *fp, cap_rights_t rights,
+ struct file **fcappp, int *capfdp)
+{
+ struct capability *cp, *cp_old;
+ struct file *fp_object;
+ int error;
+
+ if ((rights | CAP_MASK_VALID) != CAP_MASK_VALID)
+ return (EINVAL);
+
+ /*
+ * If a new capability is being derived from an existing capability,
+ * then the new capability rights must be a subset of the existing
+ * rights.
+ */
+ if (fp->f_type == DTYPE_CAPABILITY) {
+ cp_old = fp->f_data;
+ if ((cp_old->cap_rights | rights) != cp_old->cap_rights)
+ return (ENOTCAPABLE);
+ }
+
+ /*
+ * Allocate a new file descriptor to hang the capability off of.
+ */
+ error = falloc(td, fcappp, capfdp, fp->f_flag);
+ if (error)
+ return (error);
+
+ /*
+ * Rather than nesting capabilities, directly reference the object an
+ * existing capability references. There's nothing else interesting
+ * to preserve for future use, as we've incorporated the previous
+ * rights mask into the new one. This prevents us from having to
+ * deal with capability chains.
+ */
+ if (fp->f_type == DTYPE_CAPABILITY)
+ fp_object = ((struct capability *)fp->f_data)->cap_object;
+ else
+ fp_object = fp;
+ fhold(fp_object);
+ cp = uma_zalloc(capability_zone, M_WAITOK | M_ZERO);
+ cp->cap_rights = rights;
+ cp->cap_object = fp_object;
+ cp->cap_file = *fcappp;
+ if (fp->f_flag & DFLAG_PASSABLE)
+ finit(*fcappp, fp->f_flag, DTYPE_CAPABILITY, cp,
+ &capability_ops);
+ else
+ finit(*fcappp, fp->f_flag, DTYPE_CAPABILITY, cp,
+ &capability_ops_unpassable);
+
+ /*
+ * Release our private reference (the proc filedesc still has one).
+ */
+ fdrop(*fcappp, td);
+ return (0);
+}
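
Because the function always wraps the underlying cap_object rather than the capability itself, derivation chains stay one level deep and a rights mask can only ever shrink. A hedged userspace sketch of that monotonicity, using the same assumed cap_new(2) interface as above:

    #include <sys/capability.h>
    #include <err.h>
    #include <fcntl.h>

    int
    main(void)
    {
            int fd, cap_rw, cap_ro;

            fd = open("/tmp/demo", O_RDWR | O_CREAT, 0600);
            if (fd < 0)
                    err(1, "open");
            cap_rw = cap_new(fd, CAP_READ | CAP_WRITE);     /* wraps fd */
            if (cap_rw < 0)
                    err(1, "cap_new");
            cap_ro = cap_new(cap_rw, CAP_READ);     /* also wraps fd, not cap_rw */
            if (cap_ro < 0)
                    err(1, "cap_new");
            /* Widening is refused: CAP_WRITE is outside cap_ro's mask. */
            if (cap_new(cap_ro, CAP_READ | CAP_WRITE) < 0)
                    warn("widening rejected as expected");  /* ENOTCAPABLE */
            return (0);
    }
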
+
+/*
+ * Given a file descriptor, test it against a capability rights mask and then
+ * return the file descriptor on which to actually perform the requested
+ * operation. As long as the reference to fp_cap remains valid, the returned
+ * pointer in *fp will remain valid, so no extra reference management is
+ * required, and the caller should fdrop() fp_cap as normal when done with
+ * both.
+ */
+int
+cap_funwrap(struct file *fp_cap, cap_rights_t rights, struct file **fpp)
+{
+ struct capability *c;
+ int error;
+
+ if (fp_cap->f_type != DTYPE_CAPABILITY) {
+ *fpp = fp_cap;
+ return (0);
+ }
+ c = fp_cap->f_data;
+ error = cap_check(c, rights);
+ if (error)
+ return (error);
+ *fpp = c->cap_object;
+ return (0);
+}
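
A schematic of the expected kernel calling pattern (not the real fget_read() implementation; locking and edge cases elided). The unwrapped pointer is borrowed from the capability's own reference, so one fdrop() at the end covers both:

    static int
    demo_read_fd(struct thread *td, int fd, struct uio *uio)
    {
            struct file *fp_cap, *fp;
            int error;

            error = fget(td, fd, &fp_cap);
            if (error)
                    return (error);
            error = cap_funwrap(fp_cap, CAP_READ, &fp);
            if (error == 0)
                    error = fo_read(fp, uio, td->td_ucred, 0, td);
            fdrop(fp_cap, td);      /* fp is only valid until this drop */
            return (error);
    }
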
+
+/*
+ * Slightly different routine for memory mapping file descriptors: unwrap the
+ * capability and check CAP_MMAP, but also return a bitmask representing the
+ * maximum mapping rights the capability allows on the object.
+ */
+int
+cap_funwrap_mmap(struct file *fp_cap, cap_rights_t rights, u_char *maxprotp,
+ struct file **fpp)
+{
+ struct capability *c;
+ u_char maxprot;
+ int error;
+
+ if (fp_cap->f_type != DTYPE_CAPABILITY) {
+ *fpp = fp_cap;
+ *maxprotp = VM_PROT_ALL;
+ return (0);
+ }
+ c = fp_cap->f_data;
+ error = cap_check(c, rights | CAP_MMAP);
+ if (error)
+ return (error);
+ *fpp = c->cap_object;
+ maxprot = 0;
+ if (c->cap_rights & CAP_READ)
+ maxprot |= VM_PROT_READ;
+ if (c->cap_rights & CAP_WRITE)
+ maxprot |= VM_PROT_WRITE;
+ if (c->cap_rights & CAP_MAPEXEC)
+ maxprot |= VM_PROT_EXECUTE;
+ *maxprotp = maxprot;
+ return (0);
+}
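
The rights-to-protection translation is a pure function of the mask and easy to restate outside the kernel; a sketch with hypothetical DEMO_* stand-ins for the CAP_* and VM_PROT_* values:

    #include <assert.h>
    #include <stdint.h>

    #define DEMO_CAP_READ           0x01ULL
    #define DEMO_CAP_WRITE          0x02ULL
    #define DEMO_CAP_MAPEXEC        0x04ULL
    #define DEMO_PROT_READ          0x1
    #define DEMO_PROT_WRITE         0x2
    #define DEMO_PROT_EXECUTE       0x4

    static unsigned char
    demo_maxprot(uint64_t rights)
    {
            unsigned char maxprot = 0;

            if (rights & DEMO_CAP_READ)
                    maxprot |= DEMO_PROT_READ;
            if (rights & DEMO_CAP_WRITE)
                    maxprot |= DEMO_PROT_WRITE;
            if (rights & DEMO_CAP_MAPEXEC)
                    maxprot |= DEMO_PROT_EXECUTE;
            return (maxprot);
    }

    int
    main(void)
    {

            assert(demo_maxprot(DEMO_CAP_READ) == DEMO_PROT_READ);
            assert(demo_maxprot(0) == 0);
            return (0);
    }
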
+
+/*
+ * When a capability is closed, simply drop the reference on the underlying
+ * object and free the capability. fdrop() will handle the case where the
+ * underlying object also needs to close, and the caller will have already
+ * performed any object-specific lock or mqueue handling.
+ */
+static int
+capability_close(struct file *fp, struct thread *td)
+{
+ struct capability *c;
+ struct file *fp_object;
+
+ KASSERT(fp->f_type == DTYPE_CAPABILITY,
+ ("capability_close: !capability"));
+
+ c = fp->f_data;
+ fp->f_ops = &badfileops;
+ fp->f_data = NULL;
+ fp_object = c->cap_object;
+ uma_zfree(capability_zone, c);
+ return (fdrop(fp_object, td));
+}
+
+/*
+ * In general, file descriptor operations should never reach the capability
+ * itself; they should be dispatched via the underlying object's operation
+ * vector.  Panic if one turns up here.
+ */
+static int
+capability_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
+{
+
+ panic("capability_read");
+}
+
+static int
+capability_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
+{
+
+ panic("capability_write");
+}
+
+static int
+capability_truncate(struct file *fp, off_t length, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ panic("capability_truncate");
+}
+
+static int
+capability_ioctl(struct file *fp, u_long com, void *data,
+ struct ucred *active_cred, struct thread *td)
+{
+
+ panic("capability_ioctl");
+}
+
+static int
+capability_poll(struct file *fp, int events, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ panic("capability_poll");
+}
+
+static int
+capability_kqfilter(struct file *fp, struct knote *kn)
+{
+
+ panic("capability_kqfilter");
+}
+
+static int
+capability_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ panic("capability_stat");
+}
+
+#else /* !CAPABILITIES */
+
+/*
+ * Stub capability functions for kernels built without "options
+ * CAPABILITIES".
+ */
+int
+cap_new(struct thread *td, struct cap_new_args *uap)
+{
+
+ return (ENOSYS);
+}
+
+int
+cap_getrights(struct thread *td, struct cap_getrights_args *uap)
+{
+
+ return (ENOSYS);
+}
+
+int
+cap_funwrap(struct file *fp_cap, cap_rights_t rights, struct file **fpp)
+{
+
+ KASSERT(fp_cap->f_type != DTYPE_CAPABILITY,
+ ("cap_funwrap: saw capability"));
+
+ *fpp = fp_cap;
+ return (0);
+}
+
+int
+cap_funwrap_mmap(struct file *fp_cap, cap_rights_t rights, u_char *maxprotp,
+ struct file **fpp)
+{
+
+ KASSERT(fp_cap->f_type != DTYPE_CAPABILITY,
+ ("cap_funwrap_mmap: saw capability"));
+
+ *fpp = fp_cap;
+ *maxprotp = VM_PROT_ALL;
+ return (0);
+}
+
#endif /* CAPABILITIES */
+
diff --git a/sys/kern/sys_process.c b/sys/kern/sys_process.c
index a4c0069..ee36b35 100644
--- a/sys/kern/sys_process.c
+++ b/sys/kern/sys_process.c
@@ -829,6 +829,15 @@ kern_ptrace(struct thread *td, int req, pid_t pid, void *addr, int data)
case PT_ATTACH:
/* security check done above */
+ /*
+ * It would be nice if the tracing relationship was separate
+ * from the parent relationship but that would require
+ * another set of links in the proc struct or for "wait"
+ * to scan the entire proc table. To make life easier,
+ * we just re-parent the process we're trying to trace.
+ * The old parent is remembered so we can put things back
+ * on a "detach".
+ */
p->p_flag |= P_TRACED;
p->p_oppid = p->p_pptr->p_pid;
if (p->p_pptr != td->td_proc) {
diff --git a/sys/kern/syscalls.c b/sys/kern/syscalls.c
index 29a6485..abd9484 100644
--- a/sys/kern/syscalls.c
+++ b/sys/kern/syscalls.c
@@ -521,8 +521,8 @@ const char *syscallnames[] = {
"msgctl", /* 511 = msgctl */
"shmctl", /* 512 = shmctl */
"lpathconf", /* 513 = lpathconf */
- "#514", /* 514 = cap_new */
- "#515", /* 515 = cap_getrights */
+ "cap_new", /* 514 = cap_new */
+ "cap_getrights", /* 515 = cap_getrights */
"cap_enter", /* 516 = cap_enter */
"cap_getmode", /* 517 = cap_getmode */
"#518", /* 518 = pdfork */
diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master
index af958c9..0b249a5 100644
--- a/sys/kern/syscalls.master
+++ b/sys/kern/syscalls.master
@@ -914,8 +914,9 @@
512 AUE_SHMCTL NOSTD { int shmctl(int shmid, int cmd, \
struct shmid_ds *buf); }
513 AUE_LPATHCONF STD { int lpathconf(char *path, int name); }
-514 AUE_CAP_NEW UNIMPL cap_new
-515 AUE_CAP_GETRIGHTS UNIMPL cap_getrights
+514 AUE_CAP_NEW STD { int cap_new(int fd, u_int64_t rights); }
+515 AUE_CAP_GETRIGHTS STD { int cap_getrights(int fd, \
+ u_int64_t *rightsp); }
516 AUE_CAP_ENTER STD { int cap_enter(void); }
517 AUE_CAP_GETMODE STD { int cap_getmode(u_int *modep); }
518 AUE_PDFORK UNIMPL pdfork
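
For reference, STD entries like these drive makesyscalls.sh to regenerate the argument structs and kernel prototypes, roughly along these lines (a sketch of the generated shape, not copied from the regenerated sysproto.h):

    struct cap_new_args {
            char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)];
            char rights_l_[PADL_(u_int64_t)]; u_int64_t rights;
            char rights_r_[PADR_(u_int64_t)];
    };
    struct cap_getrights_args {
            char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)];
            char rightsp_l_[PADL_(u_int64_t *)]; u_int64_t *rightsp;
            char rightsp_r_[PADR_(u_int64_t *)];
    };
    int     cap_new(struct thread *, struct cap_new_args *);
    int     cap_getrights(struct thread *, struct cap_getrights_args *);
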
diff --git a/sys/kern/systrace_args.c b/sys/kern/systrace_args.c
index 880b46b..f57777f 100644
--- a/sys/kern/systrace_args.c
+++ b/sys/kern/systrace_args.c
@@ -3096,6 +3096,22 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args)
*n_args = 2;
break;
}
+ /* cap_new */
+ case 514: {
+ struct cap_new_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = p->rights; /* u_int64_t */
+ *n_args = 2;
+ break;
+ }
+ /* cap_getrights */
+ case 515: {
+ struct cap_getrights_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->rightsp; /* u_int64_t * */
+ *n_args = 2;
+ break;
+ }
/* cap_enter */
case 516: {
*n_args = 0;
@@ -8326,6 +8342,32 @@ systrace_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
break;
};
break;
+ /* cap_new */
+ case 514:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "u_int64_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* cap_getrights */
+ case 515:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "u_int64_t *";
+ break;
+ default:
+ break;
+ };
+ break;
/* cap_enter */
case 516:
break;
diff --git a/sys/kern/sysv_msg.c b/sys/kern/sysv_msg.c
index 87d479e..ffd8580 100644
--- a/sys/kern/sysv_msg.c
+++ b/sys/kern/sysv_msg.c
@@ -620,6 +620,7 @@ msgget(td, uap)
error = ENOSPC;
goto done2;
}
+#ifdef RACCT
PROC_LOCK(td->td_proc);
error = racct_add(td->td_proc, RACCT_NMSGQ, 1);
PROC_UNLOCK(td->td_proc);
@@ -627,6 +628,7 @@ msgget(td, uap)
error = ENOSPC;
goto done2;
}
+#endif
DPRINTF(("msqid %d is available\n", msqid));
msqkptr->u.msg_perm.key = key;
msqkptr->u.msg_perm.cuid = cred->cr_uid;
@@ -685,7 +687,9 @@ kern_msgsnd(td, msqid, msgp, msgsz, msgflg, mtype)
register struct msqid_kernel *msqkptr;
register struct msg *msghdr;
short next;
+#ifdef RACCT
size_t saved_msgsz;
+#endif
if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
return (ENOSYS);
@@ -723,6 +727,7 @@ kern_msgsnd(td, msqid, msgp, msgsz, msgflg, mtype)
goto done2;
#endif
+#ifdef RACCT
PROC_LOCK(td->td_proc);
if (racct_add(td->td_proc, RACCT_MSGQQUEUED, 1)) {
PROC_UNLOCK(td->td_proc);
@@ -737,6 +742,7 @@ kern_msgsnd(td, msqid, msgp, msgsz, msgflg, mtype)
goto done2;
}
PROC_UNLOCK(td->td_proc);
+#endif
segs_needed = (msgsz + msginfo.msgssz - 1) / msginfo.msgssz;
DPRINTF(("msgsz=%zu, msgssz=%d, segs_needed=%d\n", msgsz,
@@ -991,12 +997,14 @@ kern_msgsnd(td, msqid, msgp, msgsz, msgflg, mtype)
wakeup(msqkptr);
td->td_retval[0] = 0;
done3:
+#ifdef RACCT
if (error != 0) {
PROC_LOCK(td->td_proc);
racct_sub(td->td_proc, RACCT_MSGQQUEUED, 1);
racct_sub(td->td_proc, RACCT_MSGQSIZE, saved_msgsz);
PROC_UNLOCK(td->td_proc);
}
+#endif
done2:
mtx_unlock(&msq_mtx);
return (error);
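
The same charge/refund pattern recurs in sysv_sem.c and sysv_shm.c below: charge the resource under the proc lock before committing, refund it on any failure path, and compile the whole thing away without "options RACCT". A condensed sketch, where demo_do_allocate() is a hypothetical stand-in for the real allocation step:

    static int
    demo_charged_alloc(struct proc *p)
    {
            int error;

    #ifdef RACCT
            PROC_LOCK(p);
            error = racct_add(p, RACCT_NMSGQ, 1);
            PROC_UNLOCK(p);
            if (error != 0)
                    return (ENOSPC);
    #endif
            error = demo_do_allocate();
    #ifdef RACCT
            if (error != 0) {
                    PROC_LOCK(p);
                    racct_sub(p, RACCT_NMSGQ, 1);   /* refund on failure */
                    PROC_UNLOCK(p);
            }
    #endif
            return (error);
    }
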
diff --git a/sys/kern/sysv_sem.c b/sys/kern/sysv_sem.c
index ac53a8d..4a4c479 100644
--- a/sys/kern/sysv_sem.c
+++ b/sys/kern/sysv_sem.c
@@ -149,9 +149,6 @@ struct sem_undo {
#endif
/* shouldn't need tuning */
-#ifndef SEMMAP
-#define SEMMAP 30 /* # of entries in semaphore map */
-#endif
#ifndef SEMMSL
#define SEMMSL SEMMNS /* max # of semaphores per id */
#endif
@@ -182,7 +179,6 @@ struct sem_undo {
* semaphore info struct
*/
struct seminfo seminfo = {
- SEMMAP, /* # of entries in semaphore map */
SEMMNI, /* # of semaphore identifiers */
SEMMNS, /* # of semaphores in system */
SEMMNU, /* # of undo structures in system */
@@ -194,8 +190,6 @@ struct seminfo seminfo = {
SEMAEM /* adjust on exit max value */
};
-SYSCTL_INT(_kern_ipc, OID_AUTO, semmap, CTLFLAG_RW, &seminfo.semmap, 0,
- "Number of entries in the semaphore map");
SYSCTL_INT(_kern_ipc, OID_AUTO, semmni, CTLFLAG_RDTUN, &seminfo.semmni, 0,
"Number of semaphore identifiers");
SYSCTL_INT(_kern_ipc, OID_AUTO, semmns, CTLFLAG_RDTUN, &seminfo.semmns, 0,
@@ -255,7 +249,6 @@ seminit(void)
{
int i, error;
- TUNABLE_INT_FETCH("kern.ipc.semmap", &seminfo.semmap);
TUNABLE_INT_FETCH("kern.ipc.semmni", &seminfo.semmni);
TUNABLE_INT_FETCH("kern.ipc.semmns", &seminfo.semmns);
TUNABLE_INT_FETCH("kern.ipc.semmnu", &seminfo.semmnu);
@@ -931,6 +924,7 @@ semget(struct thread *td, struct semget_args *uap)
error = ENOSPC;
goto done2;
}
+#ifdef RACCT
PROC_LOCK(td->td_proc);
error = racct_add(td->td_proc, RACCT_NSEM, nsems);
PROC_UNLOCK(td->td_proc);
@@ -938,6 +932,7 @@ semget(struct thread *td, struct semget_args *uap)
error = ENOSPC;
goto done2;
}
+#endif
DPRINTF(("semid %d is available\n", semid));
mtx_lock(&sema_mtx[semid]);
KASSERT((sema[semid].u.sem_perm.mode & SEM_ALLOC) == 0,
@@ -1023,12 +1018,14 @@ semop(struct thread *td, struct semop_args *uap)
nsops));
return (E2BIG);
} else {
+#ifdef RACCT
PROC_LOCK(td->td_proc);
if (nsops > racct_get_available(td->td_proc, RACCT_NSEMOP)) {
PROC_UNLOCK(td->td_proc);
return (E2BIG);
}
PROC_UNLOCK(td->td_proc);
+#endif
sops = malloc(nsops * sizeof(*sops), M_TEMP, M_WAITOK);
}
diff --git a/sys/kern/sysv_shm.c b/sys/kern/sysv_shm.c
index f5a84ae..1741a21 100644
--- a/sys/kern/sysv_shm.c
+++ b/sys/kern/sysv_shm.c
@@ -672,6 +672,7 @@ shmget_allocate_segment(td, uap, mode)
shm_last_free = -1;
}
shmseg = &shmsegs[segnum];
+#ifdef RACCT
PROC_LOCK(td->td_proc);
if (racct_add(td->td_proc, RACCT_NSHM, 1)) {
PROC_UNLOCK(td->td_proc);
@@ -683,6 +684,7 @@ shmget_allocate_segment(td, uap, mode)
return (ENOMEM);
}
PROC_UNLOCK(td->td_proc);
+#endif
/*
* In case we sleep in malloc(), mark the segment present but deleted
	 * so that no one else tries to create the same key.
@@ -699,10 +701,12 @@ shmget_allocate_segment(td, uap, mode)
shm_object = vm_pager_allocate(shm_use_phys ? OBJT_PHYS : OBJT_SWAP,
0, size, VM_PROT_DEFAULT, 0, cred);
if (shm_object == NULL) {
+#ifdef RACCT
PROC_LOCK(td->td_proc);
racct_sub(td->td_proc, RACCT_NSHM, 1);
racct_sub(td->td_proc, RACCT_SHMSIZE, size);
PROC_UNLOCK(td->td_proc);
+#endif
return (ENOMEM);
}
VM_OBJECT_LOCK(shm_object);
diff --git a/sys/kern/tty.c b/sys/kern/tty.c
index 8aa3af2..187e635 100644
--- a/sys/kern/tty.c
+++ b/sys/kern/tty.c
@@ -91,7 +91,7 @@ static const char *dev_console_filename;
HUPCL|CLOCAL|CCTS_OFLOW|CRTS_IFLOW|CDTR_IFLOW|\
CDSR_OFLOW|CCAR_OFLOW)
-#define TTY_CALLOUT(tp,d) ((d) != (tp)->t_dev && (d) != dev_console)
+#define TTY_CALLOUT(tp,d) (dev2unit(d) & TTYUNIT_CALLOUT)
/*
* Set TTY buffer sizes.
@@ -470,10 +470,10 @@ ttydev_write(struct cdev *dev, struct uio *uio, int ioflag)
if (error)
goto done;
}
-
- tp->t_flags |= TF_BUSY_OUT;
+
+ tp->t_flags |= TF_BUSY_OUT;
error = ttydisc_write(tp, uio, ioflag);
- tp->t_flags &= ~TF_BUSY_OUT;
+ tp->t_flags &= ~TF_BUSY_OUT;
cv_signal(&tp->t_outserwait);
}
@@ -772,6 +772,10 @@ ttyil_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag,
goto done;
}
+ error = ttydevsw_cioctl(tp, dev2unit(dev), cmd, data, td);
+ if (error != ENOIOCTL)
+ goto done;
+
switch (cmd) {
case TIOCGETA:
/* Obtain terminal flags through tcgetattr(). */
@@ -878,6 +882,13 @@ ttydevsw_defioctl(struct tty *tp, u_long cmd, caddr_t data, struct thread *td)
}
static int
+ttydevsw_defcioctl(struct tty *tp, int unit, u_long cmd, caddr_t data,
+    struct thread *td)
+{
+
+ return (ENOIOCTL);
+}
+
+static int
ttydevsw_defparam(struct tty *tp, struct termios *t)
{
@@ -955,6 +966,7 @@ tty_alloc_mutex(struct ttydevsw *tsw, void *sc, struct mtx *mutex)
PATCH_FUNC(outwakeup);
PATCH_FUNC(inwakeup);
PATCH_FUNC(ioctl);
+ PATCH_FUNC(cioctl);
PATCH_FUNC(param);
PATCH_FUNC(modem);
PATCH_FUNC(mmap);
@@ -1054,7 +1066,7 @@ tty_rel_pgrp(struct tty *tp, struct pgrp *pg)
if (tp->t_pgrp == pg)
tp->t_pgrp = NULL;
-
+
tty_unlock(tp);
}
@@ -1190,13 +1202,13 @@ tty_makedev(struct tty *tp, struct ucred *cred, const char *fmt, ...)
/* Slave call-in devices. */
if (tp->t_flags & TF_INITLOCK) {
- dev = make_dev_cred(&ttyil_cdevsw, 0, cred,
+ dev = make_dev_cred(&ttyil_cdevsw, TTYUNIT_INIT, cred,
uid, gid, mode, "%s%s.init", prefix, name);
dev_depends(tp->t_dev, dev);
dev->si_drv1 = tp;
dev->si_drv2 = &tp->t_termios_init_in;
- dev = make_dev_cred(&ttyil_cdevsw, 0, cred,
+ dev = make_dev_cred(&ttyil_cdevsw, TTYUNIT_LOCK, cred,
uid, gid, mode, "%s%s.lock", prefix, name);
dev_depends(tp->t_dev, dev);
dev->si_drv1 = tp;
@@ -1205,20 +1217,22 @@ tty_makedev(struct tty *tp, struct ucred *cred, const char *fmt, ...)
/* Call-out devices. */
if (tp->t_flags & TF_CALLOUT) {
- dev = make_dev_cred(&ttydev_cdevsw, 0, cred,
+ dev = make_dev_cred(&ttydev_cdevsw, TTYUNIT_CALLOUT, cred,
UID_UUCP, GID_DIALER, 0660, "cua%s", name);
dev_depends(tp->t_dev, dev);
dev->si_drv1 = tp;
/* Slave call-out devices. */
if (tp->t_flags & TF_INITLOCK) {
- dev = make_dev_cred(&ttyil_cdevsw, 0, cred,
+ dev = make_dev_cred(&ttyil_cdevsw,
+ TTYUNIT_CALLOUT | TTYUNIT_INIT, cred,
UID_UUCP, GID_DIALER, 0660, "cua%s.init", name);
dev_depends(tp->t_dev, dev);
dev->si_drv1 = tp;
dev->si_drv2 = &tp->t_termios_init_out;
- dev = make_dev_cred(&ttyil_cdevsw, 0, cred,
+ dev = make_dev_cred(&ttyil_cdevsw,
+ TTYUNIT_CALLOUT | TTYUNIT_LOCK, cred,
UID_UUCP, GID_DIALER, 0660, "cua%s.lock", name);
dev_depends(tp->t_dev, dev);
dev->si_drv1 = tp;
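
The TTYUNIT_* values passed to make_dev_cred() turn the otherwise-unused unit number into a small flag word, which is what lets the reworked TTY_CALLOUT() macro classify a device node with a single bit test. Assumed layout (the authoritative definitions accompany this change in tty.c):

    #define TTYUNIT_INIT            0x1     /* .init state device */
    #define TTYUNIT_LOCK            0x2     /* .lock state device */
    #define TTYUNIT_CALLOUT         0x4     /* cua* call-out node */

    /* dev2unit() recovers the flags, so classification is one test: */
    #define TTY_CALLOUT(tp, d)      (dev2unit(d) & TTYUNIT_CALLOUT)
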
@@ -1241,7 +1255,7 @@ tty_signal_sessleader(struct tty *tp, int sig)
/* Make signals start output again. */
tp->t_flags &= ~TF_STOPPED;
-
+
if (tp->t_session != NULL && tp->t_session->s_leader != NULL) {
p = tp->t_session->s_leader;
PROC_LOCK(p);
@@ -1305,7 +1319,7 @@ tty_wait(struct tty *tp, struct cv *cv)
/* Restart the system call when we may have been revoked. */
if (tp->t_revokecnt != revokecnt)
return (ERESTART);
-
+
/* Bail out when the device slipped away. */
if (tty_gone(tp))
return (ENXIO);
@@ -1327,7 +1341,7 @@ tty_timedwait(struct tty *tp, struct cv *cv, int hz)
/* Restart the system call when we may have been revoked. */
if (tp->t_revokecnt != revokecnt)
return (ERESTART);
-
+
/* Bail out when the device slipped away. */
if (tty_gone(tp))
return (ENXIO);
@@ -1469,7 +1483,7 @@ tty_generic_ioctl(struct tty *tp, u_long cmd, void *data, int fflag,
return (error);
/* XXX: CLOCAL? */
-
+
tp->t_termios.c_cflag = t->c_cflag & ~CIGNORE;
tp->t_termios.c_ispeed = t->c_ispeed;
tp->t_termios.c_ospeed = t->c_ospeed;
@@ -1708,7 +1722,7 @@ tty_ioctl(struct tty *tp, u_long cmd, void *data, int fflag, struct thread *td)
if (tty_gone(tp))
return (ENXIO);
-
+
error = ttydevsw_ioctl(tp, cmd, data, td);
if (error == ENOIOCTL)
error = tty_generic_ioctl(tp, cmd, data, fflag, td);
@@ -1786,7 +1800,7 @@ ttyhook_defrint(struct tty *tp, char c, int flags)
if (ttyhook_rint_bypass(tp, &c, 1) != 1)
return (-1);
-
+
return (0);
}
@@ -1812,7 +1826,7 @@ ttyhook_register(struct tty **rtp, struct proc *p, int fd,
error = EBADF;
goto done1;
}
-
+
/*
* Make sure the vnode is bound to a character device.
* Unlocked check for the vnode type is ok there, because we
@@ -1910,7 +1924,7 @@ ttyconsdev_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
/* System console has no TTY associated. */
if (dev_console->si_drv1 == NULL)
return (ENXIO);
-
+
return (ttydev_open(dev, oflags, devtype, td));
}
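
With the new hook in place, a driver can intercept ioctls on the .init and .lock state devices before generic handling runs; a hedged sketch of what that might look like (the foo_* names are illustrative, not from this patch):

    static int
    foo_cioctl(struct tty *tp, int unit, u_long cmd, caddr_t data,
        struct thread *td)
    {

            /* Example policy: refuse termios changes via the .lock node. */
            if ((unit & TTYUNIT_LOCK) && cmd == TIOCSETA)
                    return (EPERM);
            return (ENOIOCTL);      /* fall through to generic handling */
    }

    static struct ttydevsw foo_ttydevsw = {
            .tsw_flags      = TF_INITLOCK | TF_CALLOUT,
            .tsw_cioctl     = foo_cioctl,
            /* remaining methods omitted; unset ones get the defaults */
    };
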
diff --git a/sys/kern/tty_inq.c b/sys/kern/tty_inq.c
index b0e9b18..0c39a29 100644
--- a/sys/kern/tty_inq.c
+++ b/sys/kern/tty_inq.c
@@ -142,7 +142,7 @@ void
ttyinq_free(struct ttyinq *ti)
{
struct ttyinq_block *tib;
-
+
ttyinq_flush(ti);
ti->ti_quota = 0;
@@ -276,7 +276,7 @@ ttyinq_write(struct ttyinq *ti, const void *buf, size_t nbytes, int quote)
struct ttyinq_block *tib;
unsigned int boff;
size_t l;
-
+
while (nbytes > 0) {
boff = ti->ti_end % TTYINQ_DATASIZE;
@@ -313,7 +313,7 @@ ttyinq_write(struct ttyinq *ti, const void *buf, size_t nbytes, int quote)
nbytes -= l;
ti->ti_end += l;
}
-
+
return (cbuf - (const char *)buf);
}
@@ -397,7 +397,7 @@ ttyinq_peekchar(struct ttyinq *ti, char *c, int *quote)
*c = tib->tib_data[boff];
*quote = GETBIT(tib, boff);
-
+
return (0);
}
diff --git a/sys/kern/tty_outq.c b/sys/kern/tty_outq.c
index d5ed221..5d40abe 100644
--- a/sys/kern/tty_outq.c
+++ b/sys/kern/tty_outq.c
@@ -119,7 +119,7 @@ void
ttyoutq_free(struct ttyoutq *to)
{
struct ttyoutq_block *tob;
-
+
ttyoutq_flush(to);
to->to_quota = 0;
diff --git a/sys/kern/tty_pts.c b/sys/kern/tty_pts.c
index d89c183..a3db59b 100644
--- a/sys/kern/tty_pts.c
+++ b/sys/kern/tty_pts.c
@@ -295,7 +295,7 @@ ptsdev_ioctl(struct file *fp, u_long cmd, void *data,
return (EINVAL);
return copyout(p, fgn->buf, i);
}
-
+
/*
* We need to implement TIOCGPGRP and TIOCGSID here again. When
* called on the pseudo-terminal master, it should not check if
@@ -563,7 +563,7 @@ ptsdev_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
sb->st_uid = dev->si_uid;
sb->st_gid = dev->si_gid;
sb->st_mode = dev->si_mode | S_IFCHR;
-
+
return (0);
}
@@ -823,7 +823,7 @@ posix_openpt(struct thread *td, struct posix_openpt_args *uap)
*/
if (uap->flags & ~(O_RDWR|O_NOCTTY))
return (EINVAL);
-
+
error = falloc(td, &fp, &fd, 0);
if (error)
return (error);
diff --git a/sys/kern/tty_ttydisc.c b/sys/kern/tty_ttydisc.c
index 6afac8d..2a0bb4b 100644
--- a/sys/kern/tty_ttydisc.c
+++ b/sys/kern/tty_ttydisc.c
@@ -270,13 +270,13 @@ ttydisc_read_raw_interbyte_timer(struct tty *tp, struct uio *uio, int ioflag)
MPASS(tp->t_termios.c_cc[VMIN] != 0);
MPASS(tp->t_termios.c_cc[VTIME] != 0);
-
+
/*
* When using the interbyte timer, the timer should be started
* after the first byte has been received. We just call into the
* generic read timer code after we've received the first byte.
*/
-
+
for (;;) {
error = ttyinq_read_uio(&tp->t_inq, tp, uio,
uio->uio_resid, 0);
@@ -331,7 +331,7 @@ ttydisc_read(struct tty *tp, struct uio *uio, int ioflag)
/* Unset the input watermark when we've got enough space. */
tty_hiwat_in_unblock(tp);
}
-
+
return (error);
}
@@ -521,7 +521,7 @@ ttydisc_write(struct tty *tp, struct uio *uio, int ioflag)
error = EWOULDBLOCK;
goto done;
}
-
+
/*
* The driver may write back the data
* synchronously. Be sure to check the high
@@ -567,7 +567,7 @@ ttydisc_optimize(struct tty *tp)
} else if (!CMP_FLAG(i, ICRNL|IGNCR|IMAXBEL|INLCR|ISTRIP|IXON) &&
(!CMP_FLAG(i, BRKINT) || CMP_FLAG(i, IGNBRK)) &&
(!CMP_FLAG(i, PARMRK) ||
- CMP_FLAG(i, IGNPAR|IGNBRK) == (IGNPAR|IGNBRK)) &&
+ CMP_FLAG(i, IGNPAR|IGNBRK) == (IGNPAR|IGNBRK)) &&
!CMP_FLAG(l, ECHO|ICANON|IEXTEN|ISIG|PENDIN)) {
tp->t_flags |= TF_BYPASS;
} else {
@@ -583,7 +583,7 @@ ttydisc_modem(struct tty *tp, int open)
if (open)
cv_broadcast(&tp->t_dcdwait);
-
+
/*
* Ignore modem status lines when CLOCAL is turned on, but don't
* enter the zombie state when the TTY isn't opened, because
@@ -834,7 +834,7 @@ ttydisc_rint(struct tty *tp, char c, int flags)
if (ttyhook_hashook(tp, rint))
return ttyhook_rint(tp, c, flags);
-
+
if (tp->t_flags & TF_BYPASS)
goto processed;
@@ -1072,7 +1072,7 @@ ttydisc_rint_bypass(struct tty *tp, const void *buf, size_t len)
size_t ret;
tty_lock_assert(tp, MA_OWNED);
-
+
MPASS(tp->t_flags & TF_BYPASS);
atomic_add_long(&tty_nin, len);
@@ -1122,7 +1122,7 @@ ttydisc_rint_poll(struct tty *tp)
l = ttyinq_bytesleft(&tp->t_inq);
if (l == 0 && (tp->t_flags & TF_HIWAT_IN) == 0)
return (1);
-
+
return (l);
}
@@ -1201,7 +1201,7 @@ ttydisc_getc_uio(struct tty *tp, struct uio *uio)
tty_unlock(tp);
error = uiomove(buf, len, uio);
tty_lock(tp);
-
+
if (error != 0)
break;
}
diff --git a/sys/kern/uipc_shm.c b/sys/kern/uipc_shm.c
index 00496af..0414f12 100644
--- a/sys/kern/uipc_shm.c
+++ b/sys/kern/uipc_shm.c
@@ -55,7 +55,10 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
+#include "opt_capsicum.h"
+
#include <sys/param.h>
+#include <sys/capability.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
@@ -264,7 +267,7 @@ shm_dotruncate(struct shmfd *shmfd, off_t length)
/* Toss in memory pages. */
if (nobjsize < object->size)
vm_object_page_remove(object, nobjsize, object->size,
- FALSE);
+ 0);
/* Toss pages from swap. */
if (object->type == OBJT_SWAP)
@@ -486,6 +489,14 @@ shm_open(struct thread *td, struct shm_open_args *uap)
mode_t cmode;
int fd, error;
+#ifdef CAPABILITY_MODE
+ /*
+ * shm_open(2) is only allowed for anonymous objects.
+ */
+ if (IN_CAPABILITY_MODE(td) && (uap->path != SHM_ANON))
+ return (ECAPMODE);
+#endif
+
if ((uap->flags & O_ACCMODE) != O_RDONLY &&
(uap->flags & O_ACCMODE) != O_RDWR)
return (EINVAL);
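
In practice this means a sandboxed process can still create anonymous shared memory but not named objects; a hedged sketch, assuming cap_enter() is declared in <sys/capability.h> in this era:

    #include <sys/mman.h>
    #include <sys/capability.h>
    #include <err.h>
    #include <fcntl.h>
    #include <unistd.h>

    int
    main(void)
    {
            int fd;

            if (cap_enter() < 0)
                    err(1, "cap_enter");
            fd = shm_open(SHM_ANON, O_RDWR, 0600);  /* anonymous: allowed */
            if (fd < 0)
                    err(1, "shm_open");
            if (ftruncate(fd, 4096) < 0)
                    err(1, "ftruncate");
            /* shm_open("/named", O_RDWR | O_CREAT, 0600) => ECAPMODE here. */
            return (0);
    }
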
diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c
index 3334fc2..990c6ba 100644
--- a/sys/kern/uipc_socket.c
+++ b/sys/kern/uipc_socket.c
@@ -1915,7 +1915,6 @@ release:
/*
* Optimized version of soreceive() for stream (TCP) sockets.
*/
-#ifdef TCP_SORECEIVE_STREAM
int
soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio,
struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
@@ -1955,20 +1954,9 @@ soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio,
}
oresid = uio->uio_resid;
- /* We will never ever get anything unless we are connected. */
+ /* We will never ever get anything unless we are or were connected. */
if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
- /* When disconnecting there may be still some data left. */
- if (sb->sb_cc > 0)
- goto deliver;
- if (!(so->so_state & SS_ISDISCONNECTED))
- error = ENOTCONN;
- goto out;
- }
-
- /* Socket buffer is empty and we shall not block. */
- if (sb->sb_cc == 0 &&
- ((sb->sb_flags & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
- error = EAGAIN;
+ error = ENOTCONN;
goto out;
}
@@ -1995,6 +1983,13 @@ restart:
goto out;
}
+ /* Socket buffer is empty and we shall not block. */
+ if (sb->sb_cc == 0 &&
+ ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
+ error = EAGAIN;
+ goto out;
+ }
+
/* Socket buffer got some data that we shall deliver now. */
if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) &&
((sb->sb_flags & SS_NBIO) ||
@@ -2109,7 +2104,6 @@ out:
sbunlock(sb);
return (error);
}
-#endif /* TCP_SORECEIVE_STREAM */
/*
* Optimized version of soreceive() for simple datagram cases from userspace.
diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c
index 19aaee0..c434973 100644
--- a/sys/kern/uipc_syscalls.c
+++ b/sys/kern/uipc_syscalls.c
@@ -35,6 +35,7 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
+#include "opt_capsicum.h"
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_sctp.h"
@@ -43,6 +44,7 @@ __FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
+#include <sys/capability.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
@@ -675,6 +677,11 @@ sendit(td, s, mp, flags)
struct sockaddr *to;
int error;
+#ifdef CAPABILITY_MODE
+ if (IN_CAPABILITY_MODE(td) && (mp->msg_name != NULL))
+ return (ECAPMODE);
+#endif
+
if (mp->msg_name != NULL) {
error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
if (error) {
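
The effect of the sendit() check: inside a capability-mode sandbox, sends that name an explicit destination fail with ECAPMODE, while traffic on already-connected sockets is untouched. A hedged userspace sketch:

    #include <sys/socket.h>
    #include <sys/capability.h>
    #include <netinet/in.h>
    #include <arpa/inet.h>
    #include <err.h>
    #include <string.h>

    int
    main(void)
    {
            struct sockaddr_in sin;
            int s;

            s = socket(AF_INET, SOCK_DGRAM, 0);
            if (s < 0)
                    err(1, "socket");
            memset(&sin, 0, sizeof(sin));
            sin.sin_family = AF_INET;
            sin.sin_len = sizeof(sin);
            sin.sin_port = htons(7);
            sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
            if (connect(s, (struct sockaddr *)&sin, sizeof(sin)) < 0)
                    err(1, "connect");
            if (cap_enter() < 0)
                    err(1, "cap_enter");
            (void)send(s, "ok", 2, 0);      /* msg_name == NULL: allowed */
            if (sendto(s, "no", 2, 0, (struct sockaddr *)&sin,
                sizeof(sin)) < 0)
                    warn("sendto rejected in capability mode");  /* ECAPMODE */
            return (0);
    }
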
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 2743089..a6ad81e 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -1625,6 +1625,7 @@ vfs_vmio_release(struct buf *bp)
int i;
vm_page_t m;
+ pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages);
VM_OBJECT_LOCK(bp->b_bufobj->bo_object);
for (i = 0; i < bp->b_npages; i++) {
m = bp->b_pages[i];
@@ -1658,7 +1659,6 @@ vfs_vmio_release(struct buf *bp)
vm_page_unlock(m);
}
VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object);
- pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
if (bp->b_bufsize) {
bufspacewakeup();
@@ -3012,6 +3012,10 @@ allocbuf(struct buf *bp, int size)
if (desiredpages < bp->b_npages) {
vm_page_t m;
+ pmap_qremove((vm_offset_t)trunc_page(
+ (vm_offset_t)bp->b_data) +
+ (desiredpages << PAGE_SHIFT),
+ (bp->b_npages - desiredpages));
VM_OBJECT_LOCK(bp->b_bufobj->bo_object);
for (i = desiredpages; i < bp->b_npages; i++) {
/*
@@ -3032,8 +3036,6 @@ allocbuf(struct buf *bp, int size)
vm_page_unlock(m);
}
VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object);
- pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) +
- (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages));
bp->b_npages = desiredpages;
}
} else if (size > bp->b_bcount) {
diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c
index 2e07cf1..5edf0f5 100644
--- a/sys/kern/vfs_mount.c
+++ b/sys/kern/vfs_mount.c
@@ -51,6 +51,7 @@ __FBSDID("$FreeBSD$");
#include <sys/proc.h>
#include <sys/filedesc.h>
#include <sys/reboot.h>
+#include <sys/sbuf.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>
#include <sys/sx.h>
diff --git a/sys/kern/vfs_mountroot.c b/sys/kern/vfs_mountroot.c
index 496ea70..ccbcb33 100644
--- a/sys/kern/vfs_mountroot.c
+++ b/sys/kern/vfs_mountroot.c
@@ -55,6 +55,7 @@ __FBSDID("$FreeBSD$");
#include <sys/proc.h>
#include <sys/filedesc.h>
#include <sys/reboot.h>
+#include <sys/sbuf.h>
#include <sys/stat.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index 741061d..934745b 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -1190,8 +1190,8 @@ bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo)
*/
if (bo->bo_object != NULL && (flags & (V_ALT | V_NORMAL)) == 0) {
VM_OBJECT_LOCK(bo->bo_object);
- vm_object_page_remove(bo->bo_object, 0, 0,
- (flags & V_SAVE) ? TRUE : FALSE);
+ vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ?
+ OBJPR_CLEANONLY : 0);
VM_OBJECT_UNLOCK(bo->bo_object);
}
@@ -3590,9 +3590,6 @@ vn_isdisk(struct vnode *vp, int *errp)
* and optional call-by-reference privused argument allowing vaccess()
* to indicate to the caller whether privilege was used to satisfy the
* request (obsoleted). Returns 0 on success, or an errno on failure.
- *
- * The ifdef'd CAPABILITIES version is here for reference, but is not
- * actually used.
*/
int
vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid,