Diffstat (limited to 'sys/kern')
43 files changed, 1029 insertions, 321 deletions
diff --git a/sys/kern/Make.tags.inc b/sys/kern/Make.tags.inc index ad7ea11..6f5a13a 100644 --- a/sys/kern/Make.tags.inc +++ b/sys/kern/Make.tags.inc @@ -37,6 +37,7 @@ COMM= ${SYS}/dev/advansys/*.[ch] \ ${SYS}/fs/smbfs/*.[ch] \ ${SYS}/fs/udf/*.[ch] \ ${SYS}/fs/unionfs/*.[ch] \ + ${SYS}/geom/*.[ch] \ ${SYS}/kern/*.[ch] \ ${SYS}/net/*.[ch] \ ${SYS}/netatalk/*.[ch] \ @@ -55,6 +56,7 @@ COMM= ${SYS}/dev/advansys/*.[ch] \ ${SYS}/sys/*.[ch] COMMDIR1= ${SYS}/conf \ + ${SYS}/geom \ ${SYS}/kern \ ${SYS}/net \ ${SYS}/netatalk \ diff --git a/sys/kern/imgact_aout.c b/sys/kern/imgact_aout.c index 2f889ca..3908da7 100644 --- a/sys/kern/imgact_aout.c +++ b/sys/kern/imgact_aout.c @@ -103,7 +103,7 @@ struct sysentvec aout_sysvec = { #elif defined(__amd64__) -#define AOUT32_USRSTACK 0xbfc0000 +#define AOUT32_USRSTACK 0xbfc00000 #define AOUT32_PS_STRINGS \ (AOUT32_USRSTACK - sizeof(struct freebsd32_ps_strings)) @@ -152,7 +152,7 @@ aout_fixup(register_t **stack_base, struct image_params *imgp) { *(char **)stack_base -= sizeof(uint32_t); - return (suword(*stack_base, imgp->args->argc)); + return (suword32(*stack_base, imgp->args->argc)); } static int diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c index b41741a..45f6d64 100644 --- a/sys/kern/imgact_elf.c +++ b/sys/kern/imgact_elf.c @@ -31,10 +31,12 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); +#include "opt_capsicum.h" #include "opt_compat.h" #include "opt_core.h" #include <sys/param.h> +#include <sys/capability.h> #include <sys/exec.h> #include <sys/fcntl.h> #include <sys/imgact.h> @@ -578,6 +580,15 @@ __elfN(load_file)(struct proc *p, const char *file, u_long *addr, u_long base_addr = 0; int vfslocked, error, i, numsegs; +#ifdef CAPABILITY_MODE + /* + * XXXJA: This check can go away once we are sufficiently confident + * that the checks in namei() are correct. 
+ */ + if (IN_CAPABILITY_MODE(curthread)) + return (ECAPMODE); +#endif + tempdata = malloc(sizeof(*tempdata), M_TEMP, M_WAITOK); nd = &tempdata->nd; attr = &tempdata->attr; @@ -1104,6 +1115,7 @@ __elfN(coredump)(struct thread *td, struct vnode *vp, off_t limit, int flags) hdrsize = 0; __elfN(puthdr)(td, (void *)NULL, &hdrsize, seginfo.count); +#ifdef RACCT PROC_LOCK(td->td_proc); error = racct_add(td->td_proc, RACCT_CORE, hdrsize + seginfo.size); PROC_UNLOCK(td->td_proc); @@ -1111,6 +1123,7 @@ __elfN(coredump)(struct thread *td, struct vnode *vp, off_t limit, int flags) error = EFAULT; goto done; } +#endif if (hdrsize + seginfo.size >= limit) { error = EFAULT; goto done; diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c index e45ffc5..004516b 100644 --- a/sys/kern/init_sysent.c +++ b/sys/kern/init_sysent.c @@ -548,8 +548,8 @@ struct sysent sysent[] = { { AS(msgctl_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 511 = msgctl */ { AS(shmctl_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 512 = shmctl */ { AS(lpathconf_args), (sy_call_t *)lpathconf, AUE_LPATHCONF, NULL, 0, 0, 0, SY_THR_STATIC }, /* 513 = lpathconf */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 514 = cap_new */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 515 = cap_getrights */ + { AS(cap_new_args), (sy_call_t *)cap_new, AUE_CAP_NEW, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 514 = cap_new */ + { AS(cap_getrights_args), (sy_call_t *)cap_getrights, AUE_CAP_GETRIGHTS, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 515 = cap_getrights */ { 0, (sy_call_t *)cap_enter, AUE_CAP_ENTER, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 516 = cap_enter */ { AS(cap_getmode_args), (sy_call_t *)cap_getmode, AUE_CAP_GETMODE, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 517 = cap_getmode */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 518 = pdfork */ diff --git a/sys/kern/kern_clocksource.c b/sys/kern/kern_clocksource.c index dd8bab5..ecfd408 100644 --- a/sys/kern/kern_clocksource.c +++ b/sys/kern/kern_clocksource.c @@ -59,6 +59,7 @@ __FBSDID("$FreeBSD$"); cyclic_clock_func_t cyclic_clock_func = NULL; #endif +int cpu_can_deep_sleep = 0; /* C3 state is available. */ int cpu_disable_deep_sleep = 0; /* Timer dies in C3. */ static void setuptimer(void); diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c index 180d598..829ece2 100644 --- a/sys/kern/kern_descrip.c +++ b/sys/kern/kern_descrip.c @@ -37,6 +37,7 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); +#include "opt_capsicum.h" #include "opt_compat.h" #include "opt_ddb.h" #include "opt_ktrace.h" @@ -44,6 +45,7 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/systm.h> +#include <sys/capability.h> #include <sys/conf.h> #include <sys/domain.h> #include <sys/fcntl.h> @@ -91,6 +93,7 @@ __FBSDID("$FreeBSD$"); #include <security/audit/audit.h> #include <vm/uma.h> +#include <vm/vm.h> #include <ddb/ddb.h> @@ -818,6 +821,7 @@ do_dup(struct thread *td, int flags, int old, int new, * descriptors, just put the limit on the size of the file * descriptor table. 
*/ +#ifdef RACCT PROC_LOCK(p); error = racct_set(p, RACCT_NOFILE, new + 1); PROC_UNLOCK(p); @@ -826,6 +830,7 @@ do_dup(struct thread *td, int flags, int old, int new, fdrop(fp, td); return (EMFILE); } +#endif fdgrowtable(fdp, new + 1); } if (fdp->fd_ofiles[new] == NULL) @@ -1155,7 +1160,7 @@ kern_close(td, fd) int fd; { struct filedesc *fdp; - struct file *fp; + struct file *fp, *fp_object; int error; int holdleaders; @@ -1190,8 +1195,14 @@ kern_close(td, fd) * added, and deleteing a knote for the new fd. */ knote_fdclose(td, fd); - if (fp->f_type == DTYPE_MQUEUE) - mq_fdclose(td, fd, fp); + + /* + * When we're closing an fd with a capability, we need to notify + * mqueue if the underlying object is of type mqueue. + */ + (void)cap_funwrap(fp, 0, &fp_object); + if (fp_object->f_type == DTYPE_MQUEUE) + mq_fdclose(td, fd, fp_object); FILEDESC_XUNLOCK(fdp); error = closef(fp, td); @@ -1473,7 +1484,10 @@ fdalloc(struct thread *td, int minfd, int *result) { struct proc *p = td->td_proc; struct filedesc *fdp = p->p_fd; - int fd = -1, maxfd, error; + int fd = -1, maxfd; +#ifdef RACCT + int error; +#endif FILEDESC_XLOCK_ASSERT(fdp); @@ -1496,11 +1510,13 @@ fdalloc(struct thread *td, int minfd, int *result) return (EMFILE); if (fd < fdp->fd_nfiles) break; +#ifdef RACCT PROC_LOCK(p); error = racct_set(p, RACCT_NOFILE, min(fdp->fd_nfiles * 2, maxfd)); PROC_UNLOCK(p); if (error != 0) return (EMFILE); +#endif fdgrowtable(fdp, min(fdp->fd_nfiles * 2, maxfd)); } @@ -1561,54 +1577,85 @@ fdavail(struct thread *td, int n) int falloc(struct thread *td, struct file **resultfp, int *resultfd, int flags) { - struct proc *p = td->td_proc; struct file *fp; - int error, i; + int error, fd; + + error = falloc_noinstall(td, &fp); + if (error) + return (error); /* no reference held on error */ + + error = finstall(td, fp, &fd, flags); + if (error) { + fdrop(fp, td); /* one reference (fp only) */ + return (error); + } + + if (resultfp != NULL) + *resultfp = fp; /* copy out result */ + else + fdrop(fp, td); /* release local reference */ + + if (resultfd != NULL) + *resultfd = fd; + + return (0); +} + +/* + * Create a new open file structure without allocating a file descriptor. + */ +int +falloc_noinstall(struct thread *td, struct file **resultfp) +{ + struct file *fp; int maxuserfiles = maxfiles - (maxfiles / 20); static struct timeval lastfail; static int curfail; - fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO); + KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__)); + if ((openfiles >= maxuserfiles && priv_check(td, PRIV_MAXFILES) != 0) || openfiles >= maxfiles) { if (ppsratecheck(&lastfail, &curfail, 1)) { - printf("kern.maxfiles limit exceeded by uid %i, please see tuning(7).\n", - td->td_ucred->cr_ruid); + printf("kern.maxfiles limit exceeded by uid %i, " + "please see tuning(7).\n", td->td_ucred->cr_ruid); } - uma_zfree(file_zone, fp); return (ENFILE); } atomic_add_int(&openfiles, 1); - - /* - * If the process has file descriptor zero open, add the new file - * descriptor to the list of open files at that point, otherwise - * put it at the front of the list of open files. 
- */ + fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO); refcount_init(&fp->f_count, 1); - if (resultfp) - fhold(fp); fp->f_cred = crhold(td->td_ucred); fp->f_ops = &badfileops; fp->f_data = NULL; fp->f_vnode = NULL; - FILEDESC_XLOCK(p->p_fd); - if ((error = fdalloc(td, 0, &i))) { - FILEDESC_XUNLOCK(p->p_fd); - fdrop(fp, td); - if (resultfp) - fdrop(fp, td); + *resultfp = fp; + return (0); +} + +/* + * Install a file in a file descriptor table. + */ +int +finstall(struct thread *td, struct file *fp, int *fd, int flags) +{ + struct filedesc *fdp = td->td_proc->p_fd; + int error; + + KASSERT(fd != NULL, ("%s: fd == NULL", __func__)); + KASSERT(fp != NULL, ("%s: fp == NULL", __func__)); + + FILEDESC_XLOCK(fdp); + if ((error = fdalloc(td, 0, fd))) { + FILEDESC_XUNLOCK(fdp); return (error); } - p->p_fd->fd_ofiles[i] = fp; + fhold(fp); + fdp->fd_ofiles[*fd] = fp; if ((flags & O_CLOEXEC) != 0) - p->p_fd->fd_ofileflags[i] |= UF_EXCLOSE; - FILEDESC_XUNLOCK(p->p_fd); - if (resultfp) - *resultfp = fp; - if (resultfd) - *resultfd = i; + fdp->fd_ofileflags[*fd] |= UF_EXCLOSE; + FILEDESC_XUNLOCK(fdp); return (0); } @@ -1739,11 +1786,11 @@ fdcopy(struct filedesc *fdp) FILEDESC_XUNLOCK(newfdp); FILEDESC_SLOCK(fdp); } - /* copy everything except kqueue descriptors */ + /* copy all passable descriptors (i.e. not kqueue) */ newfdp->fd_freefile = -1; for (i = 0; i <= fdp->fd_lastfile; ++i) { if (fdisused(fdp, i) && - fdp->fd_ofiles[i]->f_type != DTYPE_KQUEUE && + (fdp->fd_ofiles[i]->f_ops->fo_flags & DFLAG_PASSABLE) && fdp->fd_ofiles[i]->f_ops != &badfileops) { newfdp->fd_ofiles[i] = fdp->fd_ofiles[i]; newfdp->fd_ofileflags[i] = fdp->fd_ofileflags[i]; @@ -1785,9 +1832,11 @@ fdfree(struct thread *td) if (fdp == NULL) return; +#ifdef RACCT PROC_LOCK(td->td_proc); racct_set(td->td_proc, RACCT_NOFILE, 0); PROC_UNLOCK(td->td_proc); +#endif /* Check for special need to clear POSIX style locks */ fdtol = td->td_proc->p_fdtol; @@ -2103,6 +2152,7 @@ closef(struct file *fp, struct thread *td) struct flock lf; struct filedesc_to_leader *fdtol; struct filedesc *fdp; + struct file *fp_object; /* * POSIX record locking dictates that any close releases ALL @@ -2115,11 +2165,15 @@ closef(struct file *fp, struct thread *td) * NULL thread pointer when there really is no owning * context that might have locks, or the locks will be * leaked. + * + * If this is a capability, we do lock processing under the underlying + * node, not the capability itself. */ - if (fp->f_type == DTYPE_VNODE && td != NULL) { + (void)cap_funwrap(fp, 0, &fp_object); + if ((fp_object->f_type == DTYPE_VNODE) && (td != NULL)) { int vfslocked; - vp = fp->f_vnode; + vp = fp_object->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { lf.l_whence = SEEK_SET; @@ -2149,7 +2203,7 @@ closef(struct file *fp, struct thread *td) lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; - vp = fp->f_vnode; + vp = fp_object->f_vnode; (void) VOP_ADVLOCK(vp, (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf, F_POSIX); @@ -2228,15 +2282,27 @@ fget_unlocked(struct filedesc *fdp, int fd) * If the descriptor doesn't exist or doesn't match 'flags', EBADF is * returned. * + * If the FGET_GETCAP flag is set, the capability itself will be returned. + * Calling _fget() with FGET_GETCAP on a non-capability will return EINVAL. + * Otherwise, if the file is a capability, its rights will be checked against + * the capability rights mask, and if successful, the object will be unwrapped. 
+ * * If an error occured the non-zero error is returned and *fpp is set to * NULL. Otherwise *fpp is held and set and zero is returned. Caller is * responsible for fdrop(). */ +#define FGET_GETCAP 0x00000001 static __inline int -_fget(struct thread *td, int fd, struct file **fpp, int flags) +_fget(struct thread *td, int fd, struct file **fpp, int flags, + cap_rights_t needrights, cap_rights_t *haverights, u_char *maxprotp, + int fget_flags) { struct filedesc *fdp; struct file *fp; +#ifdef CAPABILITIES + struct file *fp_fromcap; + int error; +#endif *fpp = NULL; if (td == NULL || (fdp = td->td_proc->p_fd) == NULL) @@ -2247,6 +2313,47 @@ _fget(struct thread *td, int fd, struct file **fpp, int flags) fdrop(fp, td); return (EBADF); } + +#ifdef CAPABILITIES + /* + * If a capability has been requested, return the capability directly. + * Otherwise, check capability rights, extract the underlying object, + * and check its access flags. + */ + if (fget_flags & FGET_GETCAP) { + if (fp->f_type != DTYPE_CAPABILITY) { + fdrop(fp, td); + return (EINVAL); + } + } else { + if (maxprotp == NULL) + error = cap_funwrap(fp, needrights, &fp_fromcap); + else + error = cap_funwrap_mmap(fp, needrights, maxprotp, + &fp_fromcap); + if (error) { + fdrop(fp, td); + return (error); + } + + /* + * If we've unwrapped a file, drop the original capability + * and hold the new descriptor. fp after this point refers to + * the actual (unwrapped) object, not the capability. + */ + if (fp != fp_fromcap) { + fhold(fp_fromcap); + fdrop(fp, td); + fp = fp_fromcap; + } + } +#else /* !CAPABILITIES */ + KASSERT(fp->f_type != DTYPE_CAPABILITY, + ("%s: saw capability", __func__)); + if (maxprotp != NULL) + *maxprotp = VM_PROT_ALL; +#endif /* CAPABILITIES */ + /* * FREAD and FWRITE failure return EBADF as per POSIX. * @@ -2265,23 +2372,36 @@ int fget(struct thread *td, int fd, struct file **fpp) { - return(_fget(td, fd, fpp, 0)); + return(_fget(td, fd, fpp, 0, 0, NULL, NULL, 0)); } int fget_read(struct thread *td, int fd, struct file **fpp) { - return(_fget(td, fd, fpp, FREAD)); + return(_fget(td, fd, fpp, FREAD, 0, NULL, NULL, 0)); } int fget_write(struct thread *td, int fd, struct file **fpp) { - return(_fget(td, fd, fpp, FWRITE)); + return(_fget(td, fd, fpp, FWRITE, 0, NULL, NULL, 0)); +} + +/* + * Unlike the other fget() calls, which will accept and check capability rights + * but never return capabilities, fgetcap() returns the capability but doesn't + * check capability rights. + */ +int +fgetcap(struct thread *td, int fd, struct file **fpp) +{ + + return (_fget(td, fd, fpp, 0, 0, NULL, NULL, FGET_GETCAP)); } + /* * Like fget() but loads the underlying vnode, or returns an error if the * descriptor does not represent a vnode. Note that pipes use vnodes but @@ -2296,7 +2416,7 @@ _fgetvp(struct thread *td, int fd, struct vnode **vpp, int flags) int error; *vpp = NULL; - if ((error = _fget(td, fd, &fp, flags)) != 0) + if ((error = _fget(td, fd, &fp, flags, 0, NULL, NULL, 0)) != 0) return (error); if (fp->f_vnode == NULL) { error = EINVAL; @@ -2352,7 +2472,7 @@ fgetsock(struct thread *td, int fd, struct socket **spp, u_int *fflagp) *spp = NULL; if (fflagp != NULL) *fflagp = 0; - if ((error = _fget(td, fd, &fp, 0)) != 0) + if ((error = _fget(td, fd, &fp, 0, 0, NULL, NULL, 0)) != 0) return (error); if (fp->f_type != DTYPE_SOCKET) { error = ENOTSOCK; @@ -2388,6 +2508,9 @@ fputsock(struct socket *so) /* * Handle the last reference to a file being closed. 
+ * + * No special capability handling here, as the capability's fo_close will run + * instead of the object here, and perform any necessary drop on the object. */ int _fdrop(struct file *fp, struct thread *td) diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c index c51cc39..f30f89a 100644 --- a/sys/kern/kern_exec.c +++ b/sys/kern/kern_exec.c @@ -27,12 +27,14 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); +#include "opt_capsicum.h" #include "opt_hwpmc_hooks.h" #include "opt_kdtrace.h" #include "opt_ktrace.h" #include "opt_vm.h" #include <sys/param.h> +#include <sys/capability.h> #include <sys/systm.h> #include <sys/eventhandler.h> #include <sys/lock.h> @@ -415,6 +417,18 @@ do_execve(td, args, mac_p) interpret: if (args->fname != NULL) { +#ifdef CAPABILITY_MODE + /* + * While capability mode can't reach this point via direct + * path arguments to execve(), we also don't allow + * interpreters to be used in capability mode (for now). + * Catch indirect lookups and return a permissions error. + */ + if (IN_CAPABILITY_MODE(td)) { + error = ECAPMODE; + goto exec_fail; + } +#endif error = namei(&nd); if (error) goto exec_fail; @@ -631,6 +645,13 @@ interpret: * Don't honor setuid/setgid if the filesystem prohibits it or if * the process is being traced. * + * We disable setuid/setgid/etc in compatibility mode on the basis + * that most setugid applications are not written with that + * environment in mind, and will therefore almost certainly operate + * incorrectly. In principle there's no reason that setugid + * applications might not be useful in capability mode, so we may want + * to reconsider this conservative design choice in the future. + * * XXXMAC: For the time being, use NOSUID to also prohibit * transitions on the file system. */ @@ -646,6 +667,9 @@ interpret: #endif if (credential_changing && +#ifdef CAPABILITY_MODE + ((oldcred->cr_flags & CRED_FLAG_CAPMODE) == 0) && +#endif (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 && (p->p_flag & P_TRACED) == 0) { /* diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c index bb25d17..30b94b6 100644 --- a/sys/kern/kern_exit.c +++ b/sys/kern/kern_exit.c @@ -744,9 +744,11 @@ proc_reap(struct thread *td, struct proc *p, int *status, int options, * Destroy resource accounting information associated with the process. */ racct_proc_exit(p); +#ifdef RACCT PROC_LOCK(p->p_pptr); racct_sub(p->p_pptr, RACCT_NPROC, 1); PROC_UNLOCK(p->p_pptr); +#endif /* * Free credentials, arguments, and sigacts. 
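The kern_descrip.c changes above split the old falloc() into falloc_noinstall(), which only allocates and references the open-file structure, and finstall(), which publishes it in the descriptor table. A minimal sketch of how a consumer would use the pair; example_open() and its initialization step are illustrative assumptions, not code from this commit:

static int
example_open(struct thread *td, int *fdp)
{
	struct file *fp;
	int error, fd;

	error = falloc_noinstall(td, &fp);	/* we now hold the only reference */
	if (error != 0)
		return (error);

	/* Initialize fp (e.g. with finit()) before it becomes reachable. */

	error = finstall(td, fp, &fd, 0);	/* the table takes its own fhold() */
	if (error != 0) {
		fdrop(fp, td);
		return (error);
	}
	fdrop(fp, td);		/* drop ours; the table's reference survives */
	*fdp = fd;
	return (0);
}

The point of the split is that a half-constructed file is never visible through the descriptor table, which the old single-step falloc() could not guarantee to callers needing extra setup between allocation and installation.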
@@ -905,19 +907,23 @@ loop: void proc_reparent(struct proc *child, struct proc *parent) { +#ifdef RACCT int locked; +#endif sx_assert(&proctree_lock, SX_XLOCKED); PROC_LOCK_ASSERT(child, MA_OWNED); if (child->p_pptr == parent) return; +#ifdef RACCT locked = PROC_LOCKED(parent); if (!locked) PROC_LOCK(parent); racct_add_force(parent, RACCT_NPROC, 1); if (!locked) PROC_UNLOCK(parent); +#endif PROC_LOCK(child->p_pptr); racct_sub(child->p_pptr, RACCT_NPROC, 1); sigqueue_take(child->p_ksi); diff --git a/sys/kern/kern_fail.c b/sys/kern/kern_fail.c index e0fb32b..f192471 100644 --- a/sys/kern/kern_fail.c +++ b/sys/kern/kern_fail.c @@ -52,6 +52,7 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); +#include <sys/ctype.h> #include <sys/errno.h> #include <sys/fail.h> #include <sys/kernel.h> @@ -59,6 +60,7 @@ __FBSDID("$FreeBSD$"); #include <sys/lock.h> #include <sys/malloc.h> #include <sys/mutex.h> +#include <sys/proc.h> #include <sys/sbuf.h> #include <machine/stdarg.h> @@ -88,16 +90,20 @@ enum fail_point_t { FAIL_POINT_BREAK, /**< break into the debugger */ FAIL_POINT_PRINT, /**< print a message */ FAIL_POINT_SLEEP, /**< sleep for some msecs */ - FAIL_POINT_INVALID, /**< placeholder */ + FAIL_POINT_NUMTYPES }; -static const char *fail_type_strings[] = { - "off", - "panic", - "return", - "break", - "print", - "sleep", +static struct { + const char *name; + int nmlen; +} fail_type_strings[] = { +#define FP_TYPE_NM_LEN(s) { s, sizeof(s) - 1 } + [FAIL_POINT_OFF] = FP_TYPE_NM_LEN("off"), + [FAIL_POINT_PANIC] = FP_TYPE_NM_LEN("panic"), + [FAIL_POINT_RETURN] = FP_TYPE_NM_LEN("return"), + [FAIL_POINT_BREAK] = FP_TYPE_NM_LEN("break"), + [FAIL_POINT_PRINT] = FP_TYPE_NM_LEN("print"), + [FAIL_POINT_SLEEP] = FP_TYPE_NM_LEN("sleep"), }; /** @@ -109,7 +115,7 @@ struct fail_point_entry { int fe_arg; /**< argument to type (e.g. return value) */ int fe_prob; /**< likelihood of firing in millionths */ int fe_count; /**< number of times to fire, 0 means always */ - + pid_t fe_pid; /**< only fail for this process */ TAILQ_ENTRY(fail_point_entry) fe_entries; /**< next entry in fail point */ }; @@ -120,7 +126,7 @@ fail_point_sleep(struct fail_point *fp, struct fail_point_entry *ent, /* convert from millisecs to ticks, rounding up */ int timo = ((msecs * hz) + 999) / 1000; - if (timo) { + if (timo > 0) { if (fp->fp_sleep_fn == NULL) { msleep(fp, &g_fp_mtx, PWAIT, "failpt", timo); } else { @@ -191,19 +197,13 @@ fail_point_init(struct fail_point *fp, const char *fmt, ...) 
void fail_point_destroy(struct fail_point *fp) { - struct fail_point_entry *ent; - if (fp->fp_flags & FAIL_POINT_DYNAMIC_NAME && fp->fp_name != NULL) { - fp_free((void *)(intptr_t)fp->fp_name); + if ((fp->fp_flags & FAIL_POINT_DYNAMIC_NAME) != 0) { + fp_free(__DECONST(void *, fp->fp_name)); fp->fp_name = NULL; } fp->fp_flags = 0; - - while (!TAILQ_EMPTY(&fp->fp_entries)) { - ent = TAILQ_FIRST(&fp->fp_entries); - TAILQ_REMOVE(&fp->fp_entries, ent, fe_entries); - fp_free(ent); - } + clear_entries(&fp->fp_entries); } /** @@ -222,16 +222,14 @@ fail_point_eval_nontrivial(struct fail_point *fp, int *return_value) FP_LOCK(); - ent = TAILQ_FIRST(&fp->fp_entries); - while (ent) { + TAILQ_FOREACH_SAFE(ent, &fp->fp_entries, fe_entries, next) { int cont = 0; /* don't continue by default */ - next = TAILQ_NEXT(ent, fe_entries); if (ent->fe_prob < PROB_MAX && - ent->fe_prob < random() % PROB_MAX) { - cont = 1; - goto loop_end; - } + ent->fe_prob < random() % PROB_MAX) + continue; + if (ent->fe_pid != NO_PID && ent->fe_pid != curproc->p_pid) + continue; switch (ent->fe_type) { case FAIL_POINT_PANIC: @@ -239,13 +237,14 @@ fail_point_eval_nontrivial(struct fail_point *fp, int *return_value) /* NOTREACHED */ case FAIL_POINT_RETURN: - if (return_value) + if (return_value != NULL) *return_value = ent->fe_arg; ret = FAIL_POINT_RC_RETURN; break; case FAIL_POINT_BREAK: - printf("fail point %s breaking to debugger\n", fp->fp_name); + printf("fail point %s breaking to debugger\n", + fp->fp_name); breakpoint(); break; @@ -273,13 +272,9 @@ fail_point_eval_nontrivial(struct fail_point *fp, int *return_value) break; } - if (ent && ent->fe_count > 0 && --ent->fe_count == 0) + if (ent != NULL && ent->fe_count > 0 && --ent->fe_count == 0) free_entry(&fp->fp_entries, ent); - -loop_end: - if (cont) - ent = next; - else + if (cont == 0) break; } @@ -290,7 +285,7 @@ loop_end: FP_UNLOCK(); - return ret; + return (ret); } /** @@ -320,9 +315,11 @@ fail_point_get(struct fail_point *fp, struct sbuf *sb) } if (ent->fe_count > 0) sbuf_printf(sb, "%d*", ent->fe_count); - sbuf_printf(sb, "%s", fail_type_strings[ent->fe_type]); + sbuf_printf(sb, "%s", fail_type_strings[ent->fe_type].name); if (ent->fe_arg) sbuf_printf(sb, "(%d)", ent->fe_arg); + if (ent->fe_pid != NO_PID) + sbuf_printf(sb, "[pid %d]", ent->fe_pid); if (TAILQ_NEXT(ent, fe_entries)) sbuf_printf(sb, "->"); } @@ -380,7 +377,7 @@ fail_point_set(struct fail_point *fp, char *buf) fp->fp_name, fp->fp_location, buf); #endif /* IWARNING */ - return error; + return (error); } #define MAX_FAIL_POINT_BUF 1023 @@ -422,9 +419,8 @@ fail_point_sysctl(SYSCTL_HANDLER_ARGS) } out: - if (buf) - fp_free(buf); - return error; + fp_free(buf); + return (error); } /** @@ -437,12 +433,17 @@ parse_fail_point(struct fail_point_entries *ents, char *p) /* <fail_point> :: * <term> ( "->" <term> )* */ - if (!(p = parse_term(ents, p))) - return 0; - while (*p) - if (p[0] != '-' || p[1] != '>' || !(p = parse_term(ents, p+2))) - return 0; - return p; + p = parse_term(ents, p); + if (p == NULL) + return (NULL); + while (*p != '\0') { + if (p[0] != '-' || p[1] != '>') + return (NULL); + p = parse_term(ents, p + 2); + if (p == NULL) + return (NULL); + } + return (p); } /** @@ -455,6 +456,7 @@ parse_term(struct fail_point_entries *ents, char *p) ent = fp_malloc(sizeof *ent, M_WAITOK | M_ZERO); ent->fe_prob = PROB_MAX; + ent->fe_pid = NO_PID; TAILQ_INSERT_TAIL(ents, ent, fe_entries); /* @@ -462,14 +464,16 @@ parse_term(struct fail_point_entries *ents, char *p) * ( (<float> "%") | (<integer> "*" ) )* * <type> 
* [ "(" <integer> ")" ] + * [ "[pid " <integer> "]" ] */ /* ( (<float> "%") | (<integer> "*" ) )* */ - while (('0' <= *p && *p <= '9') || *p == '.') { + while (isdigit(*p) || *p == '.') { int units, decimal; - if (!(p = parse_number(&units, &decimal, p))) - return 0; + p = parse_number(&units, &decimal, p); + if (p == NULL) + return (NULL); if (*p == '%') { if (units > 100) /* prevent overflow early */ @@ -477,37 +481,44 @@ parse_term(struct fail_point_entries *ents, char *p) ent->fe_prob = units * (PROB_MAX / 100) + decimal; if (ent->fe_prob > PROB_MAX) ent->fe_prob = PROB_MAX; - } else if (*p == '*') { if (!units || decimal) - return 0; + return (NULL); ent->fe_count = units; - - } else { - return 0; - } - + } else + return (NULL); p++; } /* <type> */ - if (!(p = parse_type(ent, p))) - return 0; + p = parse_type(ent, p); + if (p == NULL) + return (NULL); if (*p == '\0') - return p; + return (p); /* [ "(" <integer> ")" ] */ if (*p != '(') return p; p++; - if (('0' <= *p && *p <= '9') || *p == '-') - ent->fe_arg = strtol(p, &p, 0); - else - return 0; + if (!isdigit(*p) && *p != '-') + return (NULL); + ent->fe_arg = strtol(p, &p, 0); if (*p++ != ')') - return 0; - - return p; + return (NULL); + + /* [ "[pid " <integer> "]" ] */ +#define PID_STRING "[pid " + if (strncmp(p, PID_STRING, sizeof(PID_STRING) - 1) != 0) + return (p); + p += sizeof(PID_STRING) - 1; + if (!isdigit(*p)) + return (NULL); + ent->fe_pid = strtol(p, &p, 0); + if (*p++ != ']') + return (NULL); + + return (p); } /** @@ -528,14 +539,14 @@ parse_number(int *out_units, int *out_decimal, char *p) old_p = p; *out_units = strtol(p, &p, 10); if (p == old_p && *p != '.') - return 0; + return (NULL); /* fractional part */ *out_decimal = 0; if (*p == '.') { int digits = 0; p++; - while ('0' <= *p && *p <= '9') { + while (isdigit(*p)) { int digit = *p - '0'; if (digits < PROB_DIGITS - 2) *out_decimal = *out_decimal * 10 + digit; @@ -545,12 +556,12 @@ parse_number(int *out_units, int *out_decimal, char *p) p++; } if (!digits) /* need at least one digit after '.' 
*/ - return 0; + return (NULL); while (digits++ < PROB_DIGITS - 2) /* add implicit zeros */ *out_decimal *= 10; } - return p; /* success */ + return (p); /* success */ } /** @@ -560,21 +571,16 @@ static char * parse_type(struct fail_point_entry *ent, char *beg) { enum fail_point_t type; - char *end = beg; - while ('a' <= *end && *end <= 'z') - end++; - if (beg == end) - return 0; - for (type = FAIL_POINT_OFF; type != FAIL_POINT_INVALID; type++) { - const char *p = fail_type_strings[type]; - const char *q = beg; - while (q < end && *p++ == *q++); - if (q == end && *p == '\0') { + int len; + + for (type = FAIL_POINT_OFF; type < FAIL_POINT_NUMTYPES; type++) { + len = fail_type_strings[type].nmlen; + if (strncmp(fail_type_strings[type].name, beg, len) == 0) { ent->fe_type = type; - return end; + return (beg + len); } } - return 0; + return (NULL); } /** @@ -595,6 +601,7 @@ static void clear_entries(struct fail_point_entries *ents) { struct fail_point_entry *ent, *ent_next; + TAILQ_FOREACH_SAFE(ent, ents, fe_entries, ent_next) fp_free(ent); TAILQ_INIT(ents); diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c index 04e635a..9d3e22d 100644 --- a/sys/kern/kern_fork.c +++ b/sys/kern/kern_fork.c @@ -476,7 +476,10 @@ do_fork(struct thread *td, int flags, struct proc *p2, struct thread *td2, sigacts_copy(newsigacts, p1->p_sigacts); p2->p_sigacts = newsigacts; } - if (flags & RFLINUXTHPN) + + if (flags & RFTSIGZMB) + p2->p_sigparent = RFTSIGNUM(flags); + else if (flags & RFLINUXTHPN) p2->p_sigparent = SIGUSR1; else p2->p_sigparent = SIGCHLD; @@ -719,10 +722,22 @@ fork1(struct thread *td, int flags, int pages, struct proc **procp) static int curfail; static struct timeval lastfail; + /* Check for the undefined or unimplemented flags. */ + if ((flags & ~(RFFLAGS | RFTSIGFLAGS(RFTSIGMASK))) != 0) + return (EINVAL); + + /* Signal value requires RFTSIGZMB. */ + if ((flags & RFTSIGFLAGS(RFTSIGMASK)) != 0 && (flags & RFTSIGZMB) == 0) + return (EINVAL); + /* Can't copy and clear. */ if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG)) return (EINVAL); + /* Check the validity of the signal number. */ + if ((flags & RFTSIGZMB) != 0 && (u_int)RFTSIGNUM(flags) > _SIG_MAXSIG) + return (EINVAL); + p1 = td->td_proc; /* @@ -734,11 +749,13 @@ fork1(struct thread *td, int flags, int pages, struct proc **procp) return (fork_norfproc(td, flags)); } +#ifdef RACCT PROC_LOCK(p1); error = racct_add(p1, RACCT_NPROC, 1); PROC_UNLOCK(p1); if (error != 0) return (EAGAIN); +#endif mem_charged = 0; vm2 = NULL; @@ -822,6 +839,7 @@ fork1(struct thread *td, int flags, int pages, struct proc **procp) goto fail; } +#ifdef RACCT /* * After fork, there is exactly one thread running. */ @@ -832,6 +850,7 @@ fork1(struct thread *td, int flags, int pages, struct proc **procp) error = EAGAIN; goto fail; } +#endif /* * Increment the count of procs running with this uid. Don't allow @@ -874,9 +893,11 @@ fail1: vmspace_free(vm2); uma_zfree(proc_zone, newproc); pause("fork", hz / 2); +#ifdef RACCT PROC_LOCK(p1); racct_sub(p1, RACCT_NPROC, 1); PROC_UNLOCK(p1); +#endif return (error); } diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c index 5850ad1..358d673 100644 --- a/sys/kern/kern_jail.c +++ b/sys/kern/kern_jail.c @@ -3639,6 +3639,7 @@ prison_priv_check(struct ucred *cred, int priv) case PRIV_NET_LAGG: case PRIV_NET_GIF: case PRIV_NET_SETIFVNET: + case PRIV_NET_SETIFFIB: /* * 802.11-related privileges. 
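For illustration, the kern_fail.c grammar above now accepts a per-term pid restriction after the optional argument, i.e. <prob%><count*><type>(<arg>)[pid <pid>]. A hedged sketch of a fail point such a term could drive, using the KFAIL_POINT_CODE() macro from sys/fail.h; the fail point name example_io is made up:

#include <sys/fail.h>

static int
example_io(void)
{

	/* RETURN_VALUE is the parsed "(<arg>)" of the matching term. */
	KFAIL_POINT_CODE(DEBUG_FP, example_io, return (RETURN_VALUE));
	/* ... normal I/O path ... */
	return (0);
}

Setting debug.fail_point.example_io to "5%return(35)[pid 1234]" via sysctl would then make the function return 35 (EAGAIN) on roughly 5% of evaluations, but only when evaluated by process 1234.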
diff --git a/sys/kern/kern_racct.c b/sys/kern/kern_racct.c index 01f7777..401ce1d 100644 --- a/sys/kern/kern_racct.c +++ b/sys/kern/kern_racct.c @@ -103,7 +103,7 @@ SDT_PROBE_DEFINE2(racct, kernel, racct, leave, leave, "struct racct *", int racct_types[] = { [RACCT_CPU] = - RACCT_IN_THOUSANDS, + RACCT_IN_MILLIONS, [RACCT_DATA] = RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, [RACCT_STACK] = @@ -141,7 +141,7 @@ int racct_types[] = { [RACCT_SHMSIZE] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_WALLCLOCK] = - RACCT_IN_THOUSANDS }; + RACCT_IN_MILLIONS }; static void racct_add_racct(struct racct *dest, const struct racct *src) @@ -173,7 +173,7 @@ racct_sub_racct(struct racct *dest, const struct racct *src) * Update resource usage in dest. */ for (i = 0; i <= RACCT_MAX; i++) { - if (!racct_is_sloppy(i)) { + if (!RACCT_IS_SLOPPY(i)) { KASSERT(dest->r_resources[i] >= 0, ("racct propagation meltdown: dest < 0")); KASSERT(src->r_resources[i] >= 0, @@ -181,10 +181,10 @@ racct_sub_racct(struct racct *dest, const struct racct *src) KASSERT(src->r_resources[i] <= dest->r_resources[i], ("racct propagation meltdown: src > dest")); } - if (racct_is_reclaimable(i)) { + if (RACCT_IS_RECLAIMABLE(i)) { dest->r_resources[i] -= src->r_resources[i]; if (dest->r_resources[i] < 0) { - KASSERT(racct_is_sloppy(i), + KASSERT(RACCT_IS_SLOPPY(i), ("racct_sub_racct: usage < 0")); dest->r_resources[i] = 0; } @@ -218,9 +218,9 @@ racct_destroy_locked(struct racct **racctp) racct = *racctp; for (i = 0; i <= RACCT_MAX; i++) { - if (racct_is_sloppy(i)) + if (RACCT_IS_SLOPPY(i)) continue; - if (!racct_is_reclaimable(i)) + if (!RACCT_IS_RECLAIMABLE(i)) continue; KASSERT(racct->r_resources[i] == 0, ("destroying non-empty racct: " @@ -255,7 +255,7 @@ racct_alloc_resource(struct racct *racct, int resource, racct->r_resources[resource] += amount; if (racct->r_resources[resource] < 0) { - KASSERT(racct_is_sloppy(resource), + KASSERT(RACCT_IS_SLOPPY(resource), ("racct_alloc_resource: usage < 0")); racct->r_resources[resource] = 0; } @@ -285,7 +285,7 @@ racct_add(struct proc *p, int resource, uint64_t amount) mtx_lock(&racct_lock); #ifdef RCTL error = rctl_enforce(p, resource, amount); - if (error && racct_is_deniable(resource)) { + if (error && RACCT_IS_DENIABLE(resource)) { SDT_PROBE(racct, kernel, rusage, add_failure, p, resource, amount, 0, 0); mtx_unlock(&racct_lock); @@ -373,14 +373,14 @@ racct_set_locked(struct proc *p, int resource, uint64_t amount) diff = amount - p->p_racct->r_resources[resource]; #ifdef notyet - KASSERT(diff >= 0 || racct_is_reclaimable(resource), + KASSERT(diff >= 0 || RACCT_IS_RECLAIMABLE(resource), ("racct_set: usage of non-reclaimable resource %d dropping", resource)); #endif #ifdef RCTL if (diff > 0) { error = rctl_enforce(p, resource, diff); - if (error && racct_is_deniable(resource)) { + if (error && RACCT_IS_DENIABLE(resource)) { SDT_PROBE(racct, kernel, rusage, set_failure, p, resource, amount, 0, 0); return (error); @@ -489,7 +489,7 @@ racct_sub(struct proc *p, int resource, uint64_t amount) * We need proc lock to dereference p->p_ucred. 
*/ PROC_LOCK_ASSERT(p, MA_OWNED); - KASSERT(racct_is_reclaimable(resource), + KASSERT(RACCT_IS_RECLAIMABLE(resource), ("racct_sub: called for non-reclaimable resource %d", resource)); mtx_lock(&racct_lock); @@ -512,7 +512,7 @@ racct_sub_cred_locked(struct ucred *cred, int resource, uint64_t amount) 0, 0); #ifdef notyet - KASSERT(racct_is_reclaimable(resource), + KASSERT(RACCT_IS_RECLAIMABLE(resource), ("racct_sub_cred: called for non-reclaimable resource %d", resource)); #endif @@ -564,7 +564,7 @@ racct_proc_fork(struct proc *parent, struct proc *child) */ for (i = 0; i <= RACCT_MAX; i++) { if (parent->p_racct->r_resources[i] == 0 || - !racct_is_inheritable(i)) + !RACCT_IS_INHERITABLE(i)) continue; error = racct_set_locked(child, i, diff --git a/sys/kern/kern_rctl.c b/sys/kern/kern_rctl.c index 3d0a478..a939758 100644 --- a/sys/kern/kern_rctl.c +++ b/sys/kern/kern_rctl.c @@ -99,17 +99,17 @@ static struct dict subjectnames[] = { { NULL, -1 }}; static struct dict resourcenames[] = { - { "cpu", RACCT_CPU }, - { "data", RACCT_DATA }, - { "stack", RACCT_STACK }, - { "core", RACCT_CORE }, - { "rss", RACCT_RSS }, - { "memlock", RACCT_MEMLOCK }, - { "nproc", RACCT_NPROC }, - { "nofile", RACCT_NOFILE }, - { "vmem", RACCT_VMEM }, - { "npts", RACCT_NPTS }, - { "swap", RACCT_SWAP }, + { "cputime", RACCT_CPU }, + { "datasize", RACCT_DATA }, + { "stacksize", RACCT_STACK }, + { "coredumpsize", RACCT_CORE }, + { "memoryuse", RACCT_RSS }, + { "memorylocked", RACCT_MEMLOCK }, + { "maxproc", RACCT_NPROC }, + { "openfiles", RACCT_NOFILE }, + { "vmemoryuse", RACCT_VMEM }, + { "pseudoterminals", RACCT_NPTS }, + { "swapuse", RACCT_SWAP }, { "nthr", RACCT_NTHR }, { "msgqqueued", RACCT_MSGQQUEUED }, { "msgqsize", RACCT_MSGQSIZE }, @@ -907,7 +907,7 @@ rctl_string_to_rule(char *rulestr, struct rctl_rule **rulep) error = str2int64(amountstr, &rule->rr_amount); if (error != 0) goto out; - if (racct_is_in_thousands(rule->rr_resource)) + if (RACCT_IS_IN_MILLIONS(rule->rr_resource)) rule->rr_amount *= 1000; } @@ -947,7 +947,7 @@ rctl_rule_add(struct rctl_rule *rule) /* * Some rules just don't make sense. Note that the one below - * cannot be rewritten using racct_is_deniable(); the RACCT_PCTCPU, + * cannot be rewritten using RACCT_IS_DENIABLE(); the RACCT_PCTCPU, * for example, is not deniable in the racct sense, but the * limit is enforced in a different way, so "deny" rules for %CPU * do make sense. 
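The kern_rctl.c renames above are user-visible in the rule strings parsed by rctl_string_to_rule(). A few hypothetical rules in the new spelling (the subjects and limits are invented for the example):

static const char *example_rules[] = {
	"process:1234:openfiles:deny=128",	/* formerly "nofile" */
	"user:1001:maxproc:deny=100",		/* formerly "nproc" */
	"jail:www:vmemoryuse:deny=1073741824",	/* formerly "vmem" */
};

Note also that CPU time accounting moves from RACCT_IN_THOUSANDS to RACCT_IN_MILLIONS, which is why rctl_rule_to_sbuf() now divides displayed amounts by 1000000.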
@@ -958,7 +958,7 @@ rctl_rule_add(struct rctl_rule *rule) return (EOPNOTSUPP); if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS && - racct_is_sloppy(rule->rr_resource)) + RACCT_IS_SLOPPY(rule->rr_resource)) return (EOPNOTSUPP); /* @@ -1152,8 +1152,8 @@ rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule) amount = rule->rr_amount; if (amount != RCTL_AMOUNT_UNDEFINED && - racct_is_in_thousands(rule->rr_resource)) - amount /= 1000; + RACCT_IS_IN_MILLIONS(rule->rr_resource)) + amount /= 1000000; sbuf_printf(sb, "%s:%s=%jd", rctl_resource_name(rule->rr_resource), @@ -1219,10 +1219,10 @@ rctl_racct_to_sbuf(struct racct *racct, int sloppy) sb = sbuf_new_auto(); for (i = 0; i <= RACCT_MAX; i++) { - if (sloppy == 0 && racct_is_sloppy(i)) + if (sloppy == 0 && RACCT_IS_SLOPPY(i)) continue; amount = racct->r_resources[i]; - if (racct_is_in_thousands(i)) + if (RACCT_IS_IN_MILLIONS(i)) amount /= 1000; sbuf_printf(sb, "%s=%jd,", rctl_resource_name(i), amount); } diff --git a/sys/kern/kern_rmlock.c b/sys/kern/kern_rmlock.c index 3214e1b..1c7337d 100644 --- a/sys/kern/kern_rmlock.c +++ b/sys/kern/kern_rmlock.c @@ -263,7 +263,7 @@ _rm_rlock_hard(struct rmlock *rm, struct rm_priotracker *tracker, int trylock) pc = pcpu_find(curcpu); /* Check if we just need to do a proper critical_exit. */ - if (!CPU_OVERLAP(&pc->pc_cpumask, &rm->rm_writecpus)) { + if (!CPU_ISSET(pc->pc_cpuid, &rm->rm_writecpus)) { critical_exit(); return (1); } @@ -325,7 +325,7 @@ _rm_rlock_hard(struct rmlock *rm, struct rm_priotracker *tracker, int trylock) critical_enter(); pc = pcpu_find(curcpu); - CPU_NAND(&rm->rm_writecpus, &pc->pc_cpumask); + CPU_CLR(pc->pc_cpuid, &rm->rm_writecpus); rm_tracker_add(pc, tracker); sched_pin(); critical_exit(); @@ -367,7 +367,7 @@ _rm_rlock(struct rmlock *rm, struct rm_priotracker *tracker, int trylock) * conditional jump. */ if (0 == (td->td_owepreempt | - CPU_OVERLAP(&rm->rm_writecpus, &pc->pc_cpumask))) + CPU_ISSET(pc->pc_cpuid, &rm->rm_writecpus))) return (1); /* We do not have a read token and need to acquire one. */ diff --git a/sys/kern/kern_tc.c b/sys/kern/kern_tc.c index 39d6f23..0c52071 100644 --- a/sys/kern/kern_tc.c +++ b/sys/kern/kern_tc.c @@ -492,6 +492,12 @@ tc_windup(void) /* Now is a good time to change timecounters. 
*/ if (th->th_counter != timecounter) { +#ifndef __arm__ + if ((timecounter->tc_flags & TC_FLAGS_C3STOP) != 0) + cpu_disable_deep_sleep++; + if ((th->th_counter->tc_flags & TC_FLAGS_C3STOP) != 0) + cpu_disable_deep_sleep--; +#endif th->th_counter = timecounter; th->th_offset_count = ncount; tc_min_ticktock_freq = max(1, timecounter->tc_frequency / diff --git a/sys/kern/kern_thr.c b/sys/kern/kern_thr.c index 7011a53..94e41e2 100644 --- a/sys/kern/kern_thr.c +++ b/sys/kern/kern_thr.c @@ -185,11 +185,13 @@ create_thread(struct thread *td, mcontext_t *ctx, } } +#ifdef RACCT PROC_LOCK(td->td_proc); error = racct_add(p, RACCT_NTHR, 1); PROC_UNLOCK(td->td_proc); if (error != 0) return (EPROCLIM); +#endif /* Initialize our td */ newtd = thread_alloc(0); @@ -277,9 +279,11 @@ create_thread(struct thread *td, mcontext_t *ctx, return (0); fail: +#ifdef RACCT PROC_LOCK(p); racct_sub(p, RACCT_NTHR, 1); PROC_UNLOCK(p); +#endif return (error); } diff --git a/sys/kern/link_elf.c b/sys/kern/link_elf.c index 38bf37f..2f9a1f6 100644 --- a/sys/kern/link_elf.c +++ b/sys/kern/link_elf.c @@ -950,11 +950,11 @@ link_elf_load_file(linker_class_t cls, const char* filename, ef->ddbstrcnt = strcnt; ef->ddbstrtab = ef->strbase; +nosyms: error = link_elf_link_common_finish(lf); if (error != 0) goto out; -nosyms: *result = lf; out: diff --git a/sys/kern/sched_4bsd.c b/sys/kern/sched_4bsd.c index 592bb80..574755f0 100644 --- a/sys/kern/sched_4bsd.c +++ b/sys/kern/sched_4bsd.c @@ -951,8 +951,7 @@ sched_switch(struct thread *td, struct thread *newtd, int flags) if (td->td_flags & TDF_IDLETD) { TD_SET_CAN_RUN(td); #ifdef SMP - /* Spinlock held here, assume no migration. */ - CPU_NAND(&idle_cpus_mask, PCPU_PTR(cpumask)); + CPU_CLR(PCPU_GET(cpuid), &idle_cpus_mask); #endif } else { if (TD_IS_RUNNING(td)) { @@ -1026,7 +1025,7 @@ sched_switch(struct thread *td, struct thread *newtd, int flags) #ifdef SMP if (td->td_flags & TDF_IDLETD) - CPU_OR(&idle_cpus_mask, PCPU_PTR(cpumask)); + CPU_SET(PCPU_GET(cpuid), &idle_cpus_mask); #endif sched_lock.mtx_lock = (uintptr_t)td; td->td_oncpu = PCPU_GET(cpuid); @@ -1055,7 +1054,8 @@ static int forward_wakeup(int cpunum) { struct pcpu *pc; - cpuset_t dontuse, id, map, map2, me; + cpuset_t dontuse, map, map2; + u_int id, me; int iscpuset; mtx_assert(&sched_lock, MA_OWNED); @@ -1073,27 +1073,24 @@ forward_wakeup(int cpunum) /* * Check the idle mask we received against what we calculated * before in the old version. - * - * Also note that sched_lock is held now, thus no migration is - * expected. */ - me = PCPU_GET(cpumask); + me = PCPU_GET(cpuid); /* Don't bother if we should be doing it ourself. 
*/ - if (CPU_OVERLAP(&me, &idle_cpus_mask) && - (cpunum == NOCPU || CPU_ISSET(cpunum, &me))) + if (CPU_ISSET(me, &idle_cpus_mask) && + (cpunum == NOCPU || me == cpunum)) return (0); - dontuse = me; + CPU_SETOF(me, &dontuse); CPU_OR(&dontuse, &stopped_cpus); CPU_OR(&dontuse, &hlt_cpus_mask); CPU_ZERO(&map2); if (forward_wakeup_use_loop) { STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { - id = pc->pc_cpumask; - if (!CPU_OVERLAP(&id, &dontuse) && + id = pc->pc_cpuid; + if (!CPU_ISSET(id, &dontuse) && pc->pc_curthread == pc->pc_idlethread) { - CPU_OR(&map2, &id); + CPU_SET(id, &map2); } } } @@ -1125,11 +1122,11 @@ forward_wakeup(int cpunum) if (!CPU_EMPTY(&map)) { forward_wakeups_delivered++; STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { - id = pc->pc_cpumask; - if (!CPU_OVERLAP(&map, &id)) + id = pc->pc_cpuid; + if (!CPU_ISSET(id, &map)) continue; if (cpu_idle_wakeup(pc->pc_cpuid)) - CPU_NAND(&map, &id); + CPU_CLR(id, &map); } if (!CPU_EMPTY(&map)) ipi_selected(map, IPI_AST); @@ -1147,7 +1144,7 @@ kick_other_cpu(int pri, int cpuid) int cpri; pcpu = pcpu_find(cpuid); - if (CPU_OVERLAP(&idle_cpus_mask, &pcpu->pc_cpumask)) { + if (CPU_ISSET(cpuid, &idle_cpus_mask)) { forward_wakeups_delivered++; if (!cpu_idle_wakeup(cpuid)) ipi_cpu(cpuid, IPI_AST); @@ -1205,10 +1202,10 @@ void sched_add(struct thread *td, int flags) #ifdef SMP { - cpuset_t idle, me, tidlemsk; + cpuset_t tidlemsk; struct td_sched *ts; + u_int cpu, cpuid; int forwarded = 0; - int cpu; int single_cpu = 0; ts = td->td_sched; @@ -1271,23 +1268,17 @@ sched_add(struct thread *td, int flags) ts->ts_runq = &runq; } - if (single_cpu && (cpu != PCPU_GET(cpuid))) { + cpuid = PCPU_GET(cpuid); + if (single_cpu && cpu != cpuid) { kick_other_cpu(td->td_priority, cpu); } else { if (!single_cpu) { + tidlemsk = idle_cpus_mask; + CPU_NAND(&tidlemsk, &hlt_cpus_mask); + CPU_CLR(cpuid, &tidlemsk); - /* - * Thread spinlock is held here, assume no - * migration is possible. - */ - me = PCPU_GET(cpumask); - idle = idle_cpus_mask; - tidlemsk = idle; - CPU_AND(&idle, &me); - CPU_OR(&me, &hlt_cpus_mask); - CPU_NAND(&tidlemsk, &me); - - if (CPU_EMPTY(&idle) && ((flags & SRQ_INTR) == 0) && + if (!CPU_ISSET(cpuid, &idle_cpus_mask) && + ((flags & SRQ_INTR) == 0) && !CPU_EMPTY(&tidlemsk)) forwarded = forward_wakeup(cpu); } diff --git a/sys/kern/subr_kdb.c b/sys/kern/subr_kdb.c index c2f6e99..f5cb31e 100644 --- a/sys/kern/subr_kdb.c +++ b/sys/kern/subr_kdb.c @@ -88,20 +88,6 @@ SYSCTL_PROC(_debug_kdb, OID_AUTO, trap_code, CTLTYPE_INT | CTLFLAG_RW, NULL, 0, kdb_sysctl_trap_code, "I", "set to cause a page fault via code access"); /* - * Flag indicating whether or not to IPI the other CPUs to stop them on - * entering the debugger. Sometimes, this will result in a deadlock as - * stop_cpus() waits for the other cpus to stop, so we allow it to be - * disabled. In order to maximize the chances of success, use a hard - * stop for that. - */ -#ifdef SMP -static int kdb_stop_cpus = 1; -SYSCTL_INT(_debug_kdb, OID_AUTO, stop_cpus, CTLFLAG_RW | CTLFLAG_TUN, - &kdb_stop_cpus, 0, "stop other CPUs when entering the debugger"); -TUNABLE_INT("debug.kdb.stop_cpus", &kdb_stop_cpus); -#endif - -/* * Flag to indicate to debuggers why the debugger was entered. 
*/ const char * volatile kdb_why = KDB_WHY_UNSET; @@ -211,9 +197,12 @@ kdb_sysctl_trap_code(SYSCTL_HANDLER_ARGS) void kdb_panic(const char *msg) { - #ifdef SMP - stop_cpus_hard(PCPU_GET(other_cpus)); + cpuset_t other_cpus; + + other_cpus = all_cpus; + CPU_CLR(PCPU_GET(cpuid), &other_cpus); + stop_cpus_hard(other_cpus); #endif printf("KDB: panic\n"); panic("%s", msg); @@ -429,7 +418,7 @@ kdb_thr_ctx(struct thread *thr) #if defined(SMP) && defined(KDB_STOPPEDPCB) STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { if (pc->pc_curthread == thr && - CPU_OVERLAP(&stopped_cpus, &pc->pc_cpumask)) + CPU_ISSET(pc->pc_cpuid, &stopped_cpus)) return (KDB_STOPPEDPCB(pc)); } #endif @@ -513,11 +502,11 @@ kdb_thr_select(struct thread *thr) int kdb_trap(int type, int code, struct trapframe *tf) { - struct kdb_dbbe *be; - register_t intr; #ifdef SMP - int did_stop_cpus; + cpuset_t other_cpus; #endif + struct kdb_dbbe *be; + register_t intr; int handled; be = kdb_dbbe; @@ -531,8 +520,9 @@ kdb_trap(int type, int code, struct trapframe *tf) intr = intr_disable(); #ifdef SMP - if ((did_stop_cpus = kdb_stop_cpus) != 0) - stop_cpus_hard(PCPU_GET(other_cpus)); + other_cpus = all_cpus; + CPU_CLR(PCPU_GET(cpuid), &other_cpus); + stop_cpus_hard(other_cpus); #endif kdb_active++; @@ -558,8 +548,7 @@ kdb_trap(int type, int code, struct trapframe *tf) kdb_active--; #ifdef SMP - if (did_stop_cpus) - restart_cpus(stopped_cpus); + restart_cpus(stopped_cpus); #endif intr_restore(intr); diff --git a/sys/kern/subr_pcpu.c b/sys/kern/subr_pcpu.c index a6b3ae0..ec6b590 100644 --- a/sys/kern/subr_pcpu.c +++ b/sys/kern/subr_pcpu.c @@ -87,7 +87,6 @@ pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) KASSERT(cpuid >= 0 && cpuid < MAXCPU, ("pcpu_init: invalid cpuid %d", cpuid)); pcpu->pc_cpuid = cpuid; - CPU_SETOF(cpuid, &pcpu->pc_cpumask); cpuid_to_pcpu[cpuid] = pcpu; STAILQ_INSERT_TAIL(&cpuhead, pcpu, pc_allcpu); cpu_pcpu_init(pcpu, cpuid, size); diff --git a/sys/kern/subr_smp.c b/sys/kern/subr_smp.c index c38177b..caec965 100644 --- a/sys/kern/subr_smp.c +++ b/sys/kern/subr_smp.c @@ -142,7 +142,7 @@ mp_start(void *dummy) /* Probe for MP hardware. */ if (smp_disabled != 0 || cpu_mp_probe() == 0) { mp_ncpus = 1; - all_cpus = PCPU_GET(cpumask); + CPU_SETOF(PCPU_GET(cpuid), &all_cpus); return; } @@ -236,12 +236,10 @@ generic_stop_cpus(cpuset_t map, u_int type) /* spin */ cpu_spinwait(); i++; -#ifdef DIAGNOSTIC - if (i == 100000) { + if (i == 100000000) { printf("timeout stopping cpus\n"); break; } -#endif } stopping_cpu = NOCPU; @@ -708,7 +706,7 @@ mp_setvariables_for_up(void *dummy) { mp_ncpus = 1; mp_maxid = PCPU_GET(cpuid); - all_cpus = PCPU_GET(cpumask); + CPU_SETOF(mp_maxid, &all_cpus); KASSERT(PCPU_GET(cpuid) == 0, ("UP must have a CPU ID of zero")); } SYSINIT(cpu_mp_setvariables, SI_SUB_TUNABLES, SI_ORDER_FIRST, diff --git a/sys/kern/subr_trap.c b/sys/kern/subr_trap.c index 0113d7b..3527ed1 100644 --- a/sys/kern/subr_trap.c +++ b/sys/kern/subr_trap.c @@ -44,7 +44,7 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); -#include "opt_capabilities.h" +#include "opt_capsicum.h" #include "opt_ktrace.h" #include "opt_kdtrace.h" #include "opt_sched.h" @@ -313,7 +313,7 @@ syscallenter(struct thread *td, struct syscall_args *sa) goto retval; } -#ifdef CAPABILITIES +#ifdef CAPABILITY_MODE /* * In capability mode, we only allow access to system calls * flagged with SYF_CAPENABLED. 
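The syscallenter() check that the renamed option guards can be paraphrased as the helper below; syscall_capcheck() is a hypothetical factoring, not how the commit structures it:

static int
syscall_capcheck(struct thread *td, struct sysent *callp)
{

	/* In capability mode, only SYF_CAPENABLED syscalls may proceed. */
	if (IN_CAPABILITY_MODE(td) &&
	    (callp->sy_flags & SYF_CAPENABLED) == 0)
		return (ECAPMODE);
	return (0);
}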
diff --git a/sys/kern/subr_uio.c b/sys/kern/subr_uio.c index 9385dc4..6e81328 100644 --- a/sys/kern/subr_uio.c +++ b/sys/kern/subr_uio.c @@ -64,6 +64,8 @@ __FBSDID("$FreeBSD$"); SYSCTL_INT(_kern, KERN_IOV_MAX, iov_max, CTLFLAG_RD, NULL, UIO_MAXIOV, "Maximum number of elements in an I/O vector; sysconf(_SC_IOV_MAX)"); +static int uiomove_faultflag(void *cp, int n, struct uio *uio, int nofault); + #ifdef ZERO_COPY_SOCKETS /* Declared in uipc_socket.c */ extern int so_zero_copy_receive; @@ -129,23 +131,65 @@ retry: #endif /* ZERO_COPY_SOCKETS */ int +copyin_nofault(const void *udaddr, void *kaddr, size_t len) +{ + int error, save; + + save = vm_fault_disable_pagefaults(); + error = copyin(udaddr, kaddr, len); + vm_fault_enable_pagefaults(save); + return (error); +} + +int +copyout_nofault(const void *kaddr, void *udaddr, size_t len) +{ + int error, save; + + save = vm_fault_disable_pagefaults(); + error = copyout(kaddr, udaddr, len); + vm_fault_enable_pagefaults(save); + return (error); +} + +int uiomove(void *cp, int n, struct uio *uio) { - struct thread *td = curthread; + + return (uiomove_faultflag(cp, n, uio, 0)); +} + +int +uiomove_nofault(void *cp, int n, struct uio *uio) +{ + + return (uiomove_faultflag(cp, n, uio, 1)); +} + +static int +uiomove_faultflag(void *cp, int n, struct uio *uio, int nofault) +{ + struct thread *td; struct iovec *iov; u_int cnt; - int error = 0; - int save = 0; + int error, newflags, save; + + td = curthread; + error = 0; KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE, ("uiomove: mode")); - KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread, + KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == td, ("uiomove proc")); - WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, - "Calling uiomove()"); + if (!nofault) + WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, + "Calling uiomove()"); - save = td->td_pflags & TDP_DEADLKTREAT; - td->td_pflags |= TDP_DEADLKTREAT; + /* XXX does it make a sense to set TDP_DEADLKTREAT for UIO_SYSSPACE ? */ + newflags = TDP_DEADLKTREAT; + if (uio->uio_segflg == UIO_USERSPACE && nofault) + newflags |= TDP_NOFAULTING; + save = curthread_pflags_set(newflags); while (n > 0 && uio->uio_resid) { iov = uio->uio_iov; @@ -187,8 +231,7 @@ uiomove(void *cp, int n, struct uio *uio) n -= cnt; } out: - if (save == 0) - td->td_pflags &= ~TDP_DEADLKTREAT; + curthread_pflags_restore(save); return (error); } diff --git a/sys/kern/sys_capability.c b/sys/kern/sys_capability.c index 89dc923..04f98d8 100644 --- a/sys/kern/sys_capability.c +++ b/sys/kern/sys_capability.c @@ -36,7 +36,7 @@ * */ -#include "opt_capabilities.h" +#include "opt_capsicum.h" #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); @@ -59,18 +59,11 @@ __FBSDID("$FreeBSD$"); #include <vm/uma.h> #include <vm/vm.h> -#ifdef CAPABILITIES +#ifdef CAPABILITY_MODE FEATURE(security_capabilities, "Capsicum Capability Mode"); /* - * We don't currently have any MIB entries for sysctls, but we do expose - * security.capabilities so that it's easy to tell if options CAPABILITIES is - * compiled into the kernel. - */ -SYSCTL_NODE(_security, OID_AUTO, capabilities, CTLFLAG_RW, 0, "Capsicum"); - -/* * System call to enter capability mode for the process. 
*/ int @@ -106,7 +99,7 @@ cap_getmode(struct thread *td, struct cap_getmode_args *uap) return (copyout(&i, uap->modep, sizeof(i))); } -#else /* !CAPABILITIES */ +#else /* !CAPABILITY_MODE */ int cap_enter(struct thread *td, struct cap_enter_args *uap) @@ -122,4 +115,403 @@ cap_getmode(struct thread *td, struct cap_getmode_args *uap) return (ENOSYS); } +#endif /* CAPABILITY_MODE */ + +#ifdef CAPABILITIES + +/* + * struct capability describes a capability, and is hung off of its struct + * file f_data field. cap_file and cap_rightss are static once hooked up, as + * neither the object it references nor the rights it encapsulates are + * permitted to change. + */ +struct capability { + struct file *cap_object; /* Underlying object's file. */ + struct file *cap_file; /* Back-pointer to cap's file. */ + cap_rights_t cap_rights; /* Mask of rights on object. */ +}; + +/* + * Capabilities have a fileops vector, but in practice none should ever be + * called except for fo_close, as the capability will normally not be + * returned during a file descriptor lookup in the system call code. + */ +static fo_rdwr_t capability_read; +static fo_rdwr_t capability_write; +static fo_truncate_t capability_truncate; +static fo_ioctl_t capability_ioctl; +static fo_poll_t capability_poll; +static fo_kqfilter_t capability_kqfilter; +static fo_stat_t capability_stat; +static fo_close_t capability_close; + +static struct fileops capability_ops = { + .fo_read = capability_read, + .fo_write = capability_write, + .fo_truncate = capability_truncate, + .fo_ioctl = capability_ioctl, + .fo_poll = capability_poll, + .fo_kqfilter = capability_kqfilter, + .fo_stat = capability_stat, + .fo_close = capability_close, + .fo_flags = DFLAG_PASSABLE, +}; + +static struct fileops capability_ops_unpassable = { + .fo_read = capability_read, + .fo_write = capability_write, + .fo_truncate = capability_truncate, + .fo_ioctl = capability_ioctl, + .fo_poll = capability_poll, + .fo_kqfilter = capability_kqfilter, + .fo_stat = capability_stat, + .fo_close = capability_close, + .fo_flags = 0, +}; + +static uma_zone_t capability_zone; + +static void +capability_init(void *dummy __unused) +{ + + capability_zone = uma_zcreate("capability", sizeof(struct capability), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); + if (capability_zone == NULL) + panic("capability_init: capability_zone not initialized"); +} +SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, capability_init, NULL); + +/* + * Test whether a capability grants the requested rights. + */ +static int +cap_check(struct capability *c, cap_rights_t rights) +{ + + if ((c->cap_rights | rights) != c->cap_rights) + return (ENOTCAPABLE); + return (0); +} + +/* + * Extract rights from a capability for monitoring purposes -- not for use in + * any other way, as we want to keep all capability permission evaluation in + * this one file. + */ +cap_rights_t +cap_rights(struct file *fp_cap) +{ + struct capability *c; + + KASSERT(fp_cap->f_type == DTYPE_CAPABILITY, + ("cap_rights: !capability")); + + c = fp_cap->f_data; + return (c->cap_rights); +} + +/* + * System call to create a new capability reference to either an existing + * file object or an an existing capability. 
+ */ +int +cap_new(struct thread *td, struct cap_new_args *uap) +{ + int error, capfd; + int fd = uap->fd; + struct file *fp, *fcapp; + cap_rights_t rights = uap->rights; + + AUDIT_ARG_FD(fd); +#ifdef notyet /* capability auditing will follow in a few commits */ + AUDIT_ARG_RIGHTS(rights); +#endif + error = fget(td, fd, &fp); + if (error) + return (error); + AUDIT_ARG_FILE(td->td_proc, fp); + error = kern_capwrap(td, fp, rights, &fcapp, &capfd); + if (error) + return (error); + + /* + * Release our reference to the file (kern_capwrap has held a reference + * for the filedesc array). + */ + fdrop(fp, td); + td->td_retval[0] = capfd; + return (0); +} + +/* + * System call to query the rights mask associated with a capability. + */ +int +cap_getrights(struct thread *td, struct cap_getrights_args *uap) +{ + struct capability *cp; + struct file *fp; + int error; + + AUDIT_ARG_FD(uap->fd); + error = fgetcap(td, uap->fd, &fp); + if (error) + return (error); + cp = fp->f_data; + error = copyout(&cp->cap_rights, uap->rightsp, sizeof(*uap->rightsp)); + fdrop(fp, td); + return (error); +} + +/* + * Create a capability to wrap around an existing file. + */ +int +kern_capwrap(struct thread *td, struct file *fp, cap_rights_t rights, + struct file **fcappp, int *capfdp) +{ + struct capability *cp, *cp_old; + struct file *fp_object; + int error; + + if ((rights | CAP_MASK_VALID) != CAP_MASK_VALID) + return (EINVAL); + + /* + * If a new capability is being derived from an existing capability, + * then the new capability rights must be a subset of the existing + * rights. + */ + if (fp->f_type == DTYPE_CAPABILITY) { + cp_old = fp->f_data; + if ((cp_old->cap_rights | rights) != cp_old->cap_rights) + return (ENOTCAPABLE); + } + + /* + * Allocate a new file descriptor to hang the capability off of. + */ + error = falloc(td, fcappp, capfdp, fp->f_flag); + if (error) + return (error); + + /* + * Rather than nesting capabilities, directly reference the object an + * existing capability references. There's nothing else interesting + * to preserve for future use, as we've incorporated the previous + * rights mask into the new one. This prevents us from having to + * deal with capability chains. + */ + if (fp->f_type == DTYPE_CAPABILITY) + fp_object = ((struct capability *)fp->f_data)->cap_object; + else + fp_object = fp; + fhold(fp_object); + cp = uma_zalloc(capability_zone, M_WAITOK | M_ZERO); + cp->cap_rights = rights; + cp->cap_object = fp_object; + cp->cap_file = *fcappp; + if (fp->f_flag & DFLAG_PASSABLE) + finit(*fcappp, fp->f_flag, DTYPE_CAPABILITY, cp, + &capability_ops); + else + finit(*fcappp, fp->f_flag, DTYPE_CAPABILITY, cp, + &capability_ops_unpassable); + + /* + * Release our private reference (the proc filedesc still has one). + */ + fdrop(*fcappp, td); + return (0); +} + +/* + * Given a file descriptor, test it against a capability rights mask and then + * return the file descriptor on which to actually perform the requested + * operation. As long as the reference to fp_cap remains valid, the returned + * pointer in *fp will remain valid, so no extra reference management is + * required, and the caller should fdrop() fp_cap as normal when done with + * both. 
+ */ +int +cap_funwrap(struct file *fp_cap, cap_rights_t rights, struct file **fpp) +{ + struct capability *c; + int error; + + if (fp_cap->f_type != DTYPE_CAPABILITY) { + *fpp = fp_cap; + return (0); + } + c = fp_cap->f_data; + error = cap_check(c, rights); + if (error) + return (error); + *fpp = c->cap_object; + return (0); +} + +/* + * Slightly different routine for memory mapping file descriptors: unwrap the + * capability and check CAP_MMAP, but also return a bitmask representing the + * maximum mapping rights the capability allows on the object. + */ +int +cap_funwrap_mmap(struct file *fp_cap, cap_rights_t rights, u_char *maxprotp, + struct file **fpp) +{ + struct capability *c; + u_char maxprot; + int error; + + if (fp_cap->f_type != DTYPE_CAPABILITY) { + *fpp = fp_cap; + *maxprotp = VM_PROT_ALL; + return (0); + } + c = fp_cap->f_data; + error = cap_check(c, rights | CAP_MMAP); + if (error) + return (error); + *fpp = c->cap_object; + maxprot = 0; + if (c->cap_rights & CAP_READ) + maxprot |= VM_PROT_READ; + if (c->cap_rights & CAP_WRITE) + maxprot |= VM_PROT_WRITE; + if (c->cap_rights & CAP_MAPEXEC) + maxprot |= VM_PROT_EXECUTE; + *maxprotp = maxprot; + return (0); +} + +/* + * When a capability is closed, simply drop the reference on the underlying + * object and free the capability. fdrop() will handle the case where the + * underlying object also needs to close, and the caller will have already + * performed any object-specific lock or mqueue handling. + */ +static int +capability_close(struct file *fp, struct thread *td) +{ + struct capability *c; + struct file *fp_object; + + KASSERT(fp->f_type == DTYPE_CAPABILITY, + ("capability_close: !capability")); + + c = fp->f_data; + fp->f_ops = &badfileops; + fp->f_data = NULL; + fp_object = c->cap_object; + uma_zfree(capability_zone, c); + return (fdrop(fp_object, td)); +} + +/* + * In general, file descriptor operations should never make it to the + * capability, only the underlying file descriptor operation vector, so panic + * if we do turn up here. + */ +static int +capability_read(struct file *fp, struct uio *uio, struct ucred *active_cred, + int flags, struct thread *td) +{ + + panic("capability_read"); +} + +static int +capability_write(struct file *fp, struct uio *uio, struct ucred *active_cred, + int flags, struct thread *td) +{ + + panic("capability_write"); +} + +static int +capability_truncate(struct file *fp, off_t length, struct ucred *active_cred, + struct thread *td) +{ + + panic("capability_truncate"); +} + +static int +capability_ioctl(struct file *fp, u_long com, void *data, + struct ucred *active_cred, struct thread *td) +{ + + panic("capability_ioctl"); +} + +static int +capability_poll(struct file *fp, int events, struct ucred *active_cred, + struct thread *td) +{ + + panic("capability_poll"); +} + +static int +capability_kqfilter(struct file *fp, struct knote *kn) +{ + + panic("capability_kqfilter"); +} + +static int +capability_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, + struct thread *td) +{ + + panic("capability_stat"); +} + +#else /* !CAPABILITIES */ + +/* + * Stub Capability functions for when options CAPABILITIES isn't compiled + * into the kernel. 
+ */ +int +cap_new(struct thread *td, struct cap_new_args *uap) +{ + + return (ENOSYS); +} + +int +cap_getrights(struct thread *td, struct cap_getrights_args *uap) +{ + + return (ENOSYS); +} + +int +cap_funwrap(struct file *fp_cap, cap_rights_t rights, struct file **fpp) +{ + + KASSERT(fp_cap->f_type != DTYPE_CAPABILITY, + ("cap_funwrap: saw capability")); + + *fpp = fp_cap; + return (0); +} + +int +cap_funwrap_mmap(struct file *fp_cap, cap_rights_t rights, u_char *maxprotp, + struct file **fpp) +{ + + KASSERT(fp_cap->f_type != DTYPE_CAPABILITY, + ("cap_funwrap_mmap: saw capability")); + + *fpp = fp_cap; + *maxprotp = VM_PROT_ALL; + return (0); +} + #endif /* CAPABILITIES */ + diff --git a/sys/kern/sys_process.c b/sys/kern/sys_process.c index a4c0069..ee36b35 100644 --- a/sys/kern/sys_process.c +++ b/sys/kern/sys_process.c @@ -829,6 +829,15 @@ kern_ptrace(struct thread *td, int req, pid_t pid, void *addr, int data) case PT_ATTACH: /* security check done above */ + /* + * It would be nice if the tracing relationship was separate + * from the parent relationship but that would require + * another set of links in the proc struct or for "wait" + * to scan the entire proc table. To make life easier, + * we just re-parent the process we're trying to trace. + * The old parent is remembered so we can put things back + * on a "detach". + */ p->p_flag |= P_TRACED; p->p_oppid = p->p_pptr->p_pid; if (p->p_pptr != td->td_proc) { diff --git a/sys/kern/syscalls.c b/sys/kern/syscalls.c index 29a6485..abd9484 100644 --- a/sys/kern/syscalls.c +++ b/sys/kern/syscalls.c @@ -521,8 +521,8 @@ const char *syscallnames[] = { "msgctl", /* 511 = msgctl */ "shmctl", /* 512 = shmctl */ "lpathconf", /* 513 = lpathconf */ - "#514", /* 514 = cap_new */ - "#515", /* 515 = cap_getrights */ + "cap_new", /* 514 = cap_new */ + "cap_getrights", /* 515 = cap_getrights */ "cap_enter", /* 516 = cap_enter */ "cap_getmode", /* 517 = cap_getmode */ "#518", /* 518 = pdfork */ diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master index af958c9..0b249a5 100644 --- a/sys/kern/syscalls.master +++ b/sys/kern/syscalls.master @@ -914,8 +914,9 @@ 512 AUE_SHMCTL NOSTD { int shmctl(int shmid, int cmd, \ struct shmid_ds *buf); } 513 AUE_LPATHCONF STD { int lpathconf(char *path, int name); } -514 AUE_CAP_NEW UNIMPL cap_new -515 AUE_CAP_GETRIGHTS UNIMPL cap_getrights +514 AUE_CAP_NEW STD { int cap_new(int fd, u_int64_t rights); } +515 AUE_CAP_GETRIGHTS STD { int cap_getrights(int fd, \ + u_int64_t *rightsp); } 516 AUE_CAP_ENTER STD { int cap_enter(void); } 517 AUE_CAP_GETMODE STD { int cap_getmode(u_int *modep); } 518 AUE_PDFORK UNIMPL pdfork diff --git a/sys/kern/systrace_args.c b/sys/kern/systrace_args.c index 880b46b..f57777f 100644 --- a/sys/kern/systrace_args.c +++ b/sys/kern/systrace_args.c @@ -3096,6 +3096,22 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args) *n_args = 2; break; } + /* cap_new */ + case 514: { + struct cap_new_args *p = params; + iarg[0] = p->fd; /* int */ + uarg[1] = p->rights; /* u_int64_t */ + *n_args = 2; + break; + } + /* cap_getrights */ + case 515: { + struct cap_getrights_args *p = params; + iarg[0] = p->fd; /* int */ + uarg[1] = (intptr_t) p->rightsp; /* u_int64_t * */ + *n_args = 2; + break; + } /* cap_enter */ case 516: { *n_args = 0; @@ -8326,6 +8342,32 @@ systrace_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) break; }; break; + /* cap_new */ + case 514: + switch(ndx) { + case 0: + p = "int"; + break; + case 1: + p = "u_int64_t"; + break; + default: + break; + }; + 
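+		/*
+		 * These type strings describe the syscall arguments to the
+		 * DTrace systrace provider; an illustrative one-liner that
+		 * exercises them:
+		 *
+		 *	dtrace -n 'syscall::cap_new:entry
+		 *	    { trace(arg0); trace(arg1); }'
+		 */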
break; + /* cap_getrights */ + case 515: + switch(ndx) { + case 0: + p = "int"; + break; + case 1: + p = "u_int64_t *"; + break; + default: + break; + }; + break; /* cap_enter */ case 516: break; diff --git a/sys/kern/sysv_msg.c b/sys/kern/sysv_msg.c index 87d479e..ffd8580 100644 --- a/sys/kern/sysv_msg.c +++ b/sys/kern/sysv_msg.c @@ -620,6 +620,7 @@ msgget(td, uap) error = ENOSPC; goto done2; } +#ifdef RACCT PROC_LOCK(td->td_proc); error = racct_add(td->td_proc, RACCT_NMSGQ, 1); PROC_UNLOCK(td->td_proc); @@ -627,6 +628,7 @@ msgget(td, uap) error = ENOSPC; goto done2; } +#endif DPRINTF(("msqid %d is available\n", msqid)); msqkptr->u.msg_perm.key = key; msqkptr->u.msg_perm.cuid = cred->cr_uid; @@ -685,7 +687,9 @@ kern_msgsnd(td, msqid, msgp, msgsz, msgflg, mtype) register struct msqid_kernel *msqkptr; register struct msg *msghdr; short next; +#ifdef RACCT size_t saved_msgsz; +#endif if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); @@ -723,6 +727,7 @@ kern_msgsnd(td, msqid, msgp, msgsz, msgflg, mtype) goto done2; #endif +#ifdef RACCT PROC_LOCK(td->td_proc); if (racct_add(td->td_proc, RACCT_MSGQQUEUED, 1)) { PROC_UNLOCK(td->td_proc); @@ -737,6 +742,7 @@ kern_msgsnd(td, msqid, msgp, msgsz, msgflg, mtype) goto done2; } PROC_UNLOCK(td->td_proc); +#endif segs_needed = (msgsz + msginfo.msgssz - 1) / msginfo.msgssz; DPRINTF(("msgsz=%zu, msgssz=%d, segs_needed=%d\n", msgsz, @@ -991,12 +997,14 @@ kern_msgsnd(td, msqid, msgp, msgsz, msgflg, mtype) wakeup(msqkptr); td->td_retval[0] = 0; done3: +#ifdef RACCT if (error != 0) { PROC_LOCK(td->td_proc); racct_sub(td->td_proc, RACCT_MSGQQUEUED, 1); racct_sub(td->td_proc, RACCT_MSGQSIZE, saved_msgsz); PROC_UNLOCK(td->td_proc); } +#endif done2: mtx_unlock(&msq_mtx); return (error); diff --git a/sys/kern/sysv_sem.c b/sys/kern/sysv_sem.c index ac53a8d..4a4c479 100644 --- a/sys/kern/sysv_sem.c +++ b/sys/kern/sysv_sem.c @@ -149,9 +149,6 @@ struct sem_undo { #endif /* shouldn't need tuning */ -#ifndef SEMMAP -#define SEMMAP 30 /* # of entries in semaphore map */ -#endif #ifndef SEMMSL #define SEMMSL SEMMNS /* max # of semaphores per id */ #endif @@ -182,7 +179,6 @@ struct sem_undo { * semaphore info struct */ struct seminfo seminfo = { - SEMMAP, /* # of entries in semaphore map */ SEMMNI, /* # of semaphore identifiers */ SEMMNS, /* # of semaphores in system */ SEMMNU, /* # of undo structures in system */ @@ -194,8 +190,6 @@ struct seminfo seminfo = { SEMAEM /* adjust on exit max value */ }; -SYSCTL_INT(_kern_ipc, OID_AUTO, semmap, CTLFLAG_RW, &seminfo.semmap, 0, - "Number of entries in the semaphore map"); SYSCTL_INT(_kern_ipc, OID_AUTO, semmni, CTLFLAG_RDTUN, &seminfo.semmni, 0, "Number of semaphore identifiers"); SYSCTL_INT(_kern_ipc, OID_AUTO, semmns, CTLFLAG_RDTUN, &seminfo.semmns, 0, @@ -255,7 +249,6 @@ seminit(void) { int i, error; - TUNABLE_INT_FETCH("kern.ipc.semmap", &seminfo.semmap); TUNABLE_INT_FETCH("kern.ipc.semmni", &seminfo.semmni); TUNABLE_INT_FETCH("kern.ipc.semmns", &seminfo.semmns); TUNABLE_INT_FETCH("kern.ipc.semmnu", &seminfo.semmnu); @@ -931,6 +924,7 @@ semget(struct thread *td, struct semget_args *uap) error = ENOSPC; goto done2; } +#ifdef RACCT PROC_LOCK(td->td_proc); error = racct_add(td->td_proc, RACCT_NSEM, nsems); PROC_UNLOCK(td->td_proc); @@ -938,6 +932,7 @@ semget(struct thread *td, struct semget_args *uap) error = ENOSPC; goto done2; } +#endif DPRINTF(("semid %d is available\n", semid)); mtx_lock(&sema_mtx[semid]); KASSERT((sema[semid].u.sem_perm.mode & SEM_ALLOC) == 0, @@ -1023,12 +1018,14 @@ semop(struct thread 
*td, struct semop_args *uap) nsops)); return (E2BIG); } else { +#ifdef RACCT PROC_LOCK(td->td_proc); if (nsops > racct_get_available(td->td_proc, RACCT_NSEMOP)) { PROC_UNLOCK(td->td_proc); return (E2BIG); } PROC_UNLOCK(td->td_proc); +#endif sops = malloc(nsops * sizeof(*sops), M_TEMP, M_WAITOK); } diff --git a/sys/kern/sysv_shm.c b/sys/kern/sysv_shm.c index f5a84ae..1741a21 100644 --- a/sys/kern/sysv_shm.c +++ b/sys/kern/sysv_shm.c @@ -672,6 +672,7 @@ shmget_allocate_segment(td, uap, mode) shm_last_free = -1; } shmseg = &shmsegs[segnum]; +#ifdef RACCT PROC_LOCK(td->td_proc); if (racct_add(td->td_proc, RACCT_NSHM, 1)) { PROC_UNLOCK(td->td_proc); @@ -683,6 +684,7 @@ shmget_allocate_segment(td, uap, mode) return (ENOMEM); } PROC_UNLOCK(td->td_proc); +#endif /* * In case we sleep in malloc(), mark the segment present but deleted * so that noone else tries to create the same key. @@ -699,10 +701,12 @@ shmget_allocate_segment(td, uap, mode) shm_object = vm_pager_allocate(shm_use_phys ? OBJT_PHYS : OBJT_SWAP, 0, size, VM_PROT_DEFAULT, 0, cred); if (shm_object == NULL) { +#ifdef RACCT PROC_LOCK(td->td_proc); racct_sub(td->td_proc, RACCT_NSHM, 1); racct_sub(td->td_proc, RACCT_SHMSIZE, size); PROC_UNLOCK(td->td_proc); +#endif return (ENOMEM); } VM_OBJECT_LOCK(shm_object); diff --git a/sys/kern/tty.c b/sys/kern/tty.c index 8aa3af2..187e635 100644 --- a/sys/kern/tty.c +++ b/sys/kern/tty.c @@ -91,7 +91,7 @@ static const char *dev_console_filename; HUPCL|CLOCAL|CCTS_OFLOW|CRTS_IFLOW|CDTR_IFLOW|\ CDSR_OFLOW|CCAR_OFLOW) -#define TTY_CALLOUT(tp,d) ((d) != (tp)->t_dev && (d) != dev_console) +#define TTY_CALLOUT(tp,d) (dev2unit(d) & TTYUNIT_CALLOUT) /* * Set TTY buffer sizes. @@ -470,10 +470,10 @@ ttydev_write(struct cdev *dev, struct uio *uio, int ioflag) if (error) goto done; } - - tp->t_flags |= TF_BUSY_OUT; + + tp->t_flags |= TF_BUSY_OUT; error = ttydisc_write(tp, uio, ioflag); - tp->t_flags &= ~TF_BUSY_OUT; + tp->t_flags &= ~TF_BUSY_OUT; cv_signal(&tp->t_outserwait); } @@ -772,6 +772,10 @@ ttyil_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, goto done; } + error = ttydevsw_cioctl(tp, dev2unit(dev), cmd, data, td); + if (error != ENOIOCTL) + goto done; + switch (cmd) { case TIOCGETA: /* Obtain terminal flags through tcgetattr(). */ @@ -878,6 +882,13 @@ ttydevsw_defioctl(struct tty *tp, u_long cmd, caddr_t data, struct thread *td) } static int +ttydevsw_defcioctl(struct tty *tp, int unit, u_long cmd, caddr_t data, struct thread *td) +{ + + return (ENOIOCTL); +} + +static int ttydevsw_defparam(struct tty *tp, struct termios *t) { @@ -955,6 +966,7 @@ tty_alloc_mutex(struct ttydevsw *tsw, void *sc, struct mtx *mutex) PATCH_FUNC(outwakeup); PATCH_FUNC(inwakeup); PATCH_FUNC(ioctl); + PATCH_FUNC(cioctl); PATCH_FUNC(param); PATCH_FUNC(modem); PATCH_FUNC(mmap); @@ -1054,7 +1066,7 @@ tty_rel_pgrp(struct tty *tp, struct pgrp *pg) if (tp->t_pgrp == pg) tp->t_pgrp = NULL; - + tty_unlock(tp); } @@ -1190,13 +1202,13 @@ tty_makedev(struct tty *tp, struct ucred *cred, const char *fmt, ...) /* Slave call-in devices. 
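	 * (Their unit numbers now carry TTYUNIT_INIT or TTYUNIT_LOCK, and
	 * ttyil_ioctl() passes dev2unit() to the driver's new cioctl hook,
	 * so drivers can tell the .init and .lock nodes apart.)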
*/ if (tp->t_flags & TF_INITLOCK) { - dev = make_dev_cred(&ttyil_cdevsw, 0, cred, + dev = make_dev_cred(&ttyil_cdevsw, TTYUNIT_INIT, cred, uid, gid, mode, "%s%s.init", prefix, name); dev_depends(tp->t_dev, dev); dev->si_drv1 = tp; dev->si_drv2 = &tp->t_termios_init_in; - dev = make_dev_cred(&ttyil_cdevsw, 0, cred, + dev = make_dev_cred(&ttyil_cdevsw, TTYUNIT_LOCK, cred, uid, gid, mode, "%s%s.lock", prefix, name); dev_depends(tp->t_dev, dev); dev->si_drv1 = tp; @@ -1205,20 +1217,22 @@ tty_makedev(struct tty *tp, struct ucred *cred, const char *fmt, ...) /* Call-out devices. */ if (tp->t_flags & TF_CALLOUT) { - dev = make_dev_cred(&ttydev_cdevsw, 0, cred, + dev = make_dev_cred(&ttydev_cdevsw, TTYUNIT_CALLOUT, cred, UID_UUCP, GID_DIALER, 0660, "cua%s", name); dev_depends(tp->t_dev, dev); dev->si_drv1 = tp; /* Slave call-out devices. */ if (tp->t_flags & TF_INITLOCK) { - dev = make_dev_cred(&ttyil_cdevsw, 0, cred, + dev = make_dev_cred(&ttyil_cdevsw, + TTYUNIT_CALLOUT | TTYUNIT_INIT, cred, UID_UUCP, GID_DIALER, 0660, "cua%s.init", name); dev_depends(tp->t_dev, dev); dev->si_drv1 = tp; dev->si_drv2 = &tp->t_termios_init_out; - dev = make_dev_cred(&ttyil_cdevsw, 0, cred, + dev = make_dev_cred(&ttyil_cdevsw, + TTYUNIT_CALLOUT | TTYUNIT_LOCK, cred, UID_UUCP, GID_DIALER, 0660, "cua%s.lock", name); dev_depends(tp->t_dev, dev); dev->si_drv1 = tp; @@ -1241,7 +1255,7 @@ tty_signal_sessleader(struct tty *tp, int sig) /* Make signals start output again. */ tp->t_flags &= ~TF_STOPPED; - + if (tp->t_session != NULL && tp->t_session->s_leader != NULL) { p = tp->t_session->s_leader; PROC_LOCK(p); @@ -1305,7 +1319,7 @@ tty_wait(struct tty *tp, struct cv *cv) /* Restart the system call when we may have been revoked. */ if (tp->t_revokecnt != revokecnt) return (ERESTART); - + /* Bail out when the device slipped away. */ if (tty_gone(tp)) return (ENXIO); @@ -1327,7 +1341,7 @@ tty_timedwait(struct tty *tp, struct cv *cv, int hz) /* Restart the system call when we may have been revoked. */ if (tp->t_revokecnt != revokecnt) return (ERESTART); - + /* Bail out when the device slipped away. */ if (tty_gone(tp)) return (ENXIO); @@ -1469,7 +1483,7 @@ tty_generic_ioctl(struct tty *tp, u_long cmd, void *data, int fflag, return (error); /* XXX: CLOCAL? */ - + tp->t_termios.c_cflag = t->c_cflag & ~CIGNORE; tp->t_termios.c_ispeed = t->c_ispeed; tp->t_termios.c_ospeed = t->c_ospeed; @@ -1708,7 +1722,7 @@ tty_ioctl(struct tty *tp, u_long cmd, void *data, int fflag, struct thread *td) if (tty_gone(tp)) return (ENXIO); - + error = ttydevsw_ioctl(tp, cmd, data, td); if (error == ENOIOCTL) error = tty_generic_ioctl(tp, cmd, data, fflag, td); @@ -1786,7 +1800,7 @@ ttyhook_defrint(struct tty *tp, char c, int flags) if (ttyhook_rint_bypass(tp, &c, 1) != 1) return (-1); - + return (0); } @@ -1812,7 +1826,7 @@ ttyhook_register(struct tty **rtp, struct proc *p, int fd, error = EBADF; goto done1; } - + /* * Make sure the vnode is bound to a character device. * Unlocked check for the vnode type is ok there, because we @@ -1910,7 +1924,7 @@ ttyconsdev_open(struct cdev *dev, int oflags, int devtype, struct thread *td) /* System console has no TTY associated. 
*/ if (dev_console->si_drv1 == NULL) return (ENXIO); - + return (ttydev_open(dev, oflags, devtype, td)); } diff --git a/sys/kern/tty_inq.c b/sys/kern/tty_inq.c index b0e9b18..0c39a29 100644 --- a/sys/kern/tty_inq.c +++ b/sys/kern/tty_inq.c @@ -142,7 +142,7 @@ void ttyinq_free(struct ttyinq *ti) { struct ttyinq_block *tib; - + ttyinq_flush(ti); ti->ti_quota = 0; @@ -276,7 +276,7 @@ ttyinq_write(struct ttyinq *ti, const void *buf, size_t nbytes, int quote) struct ttyinq_block *tib; unsigned int boff; size_t l; - + while (nbytes > 0) { boff = ti->ti_end % TTYINQ_DATASIZE; @@ -313,7 +313,7 @@ ttyinq_write(struct ttyinq *ti, const void *buf, size_t nbytes, int quote) nbytes -= l; ti->ti_end += l; } - + return (cbuf - (const char *)buf); } @@ -397,7 +397,7 @@ ttyinq_peekchar(struct ttyinq *ti, char *c, int *quote) *c = tib->tib_data[boff]; *quote = GETBIT(tib, boff); - + return (0); } diff --git a/sys/kern/tty_outq.c b/sys/kern/tty_outq.c index d5ed221..5d40abe 100644 --- a/sys/kern/tty_outq.c +++ b/sys/kern/tty_outq.c @@ -119,7 +119,7 @@ void ttyoutq_free(struct ttyoutq *to) { struct ttyoutq_block *tob; - + ttyoutq_flush(to); to->to_quota = 0; diff --git a/sys/kern/tty_pts.c b/sys/kern/tty_pts.c index d89c183..a3db59b 100644 --- a/sys/kern/tty_pts.c +++ b/sys/kern/tty_pts.c @@ -295,7 +295,7 @@ ptsdev_ioctl(struct file *fp, u_long cmd, void *data, return (EINVAL); return copyout(p, fgn->buf, i); } - + /* * We need to implement TIOCGPGRP and TIOCGSID here again. When * called on the pseudo-terminal master, it should not check if @@ -563,7 +563,7 @@ ptsdev_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, sb->st_uid = dev->si_uid; sb->st_gid = dev->si_gid; sb->st_mode = dev->si_mode | S_IFCHR; - + return (0); } @@ -823,7 +823,7 @@ posix_openpt(struct thread *td, struct posix_openpt_args *uap) */ if (uap->flags & ~(O_RDWR|O_NOCTTY)) return (EINVAL); - + error = falloc(td, &fp, &fd, 0); if (error) return (error); diff --git a/sys/kern/tty_ttydisc.c b/sys/kern/tty_ttydisc.c index 6afac8d..2a0bb4b 100644 --- a/sys/kern/tty_ttydisc.c +++ b/sys/kern/tty_ttydisc.c @@ -270,13 +270,13 @@ ttydisc_read_raw_interbyte_timer(struct tty *tp, struct uio *uio, int ioflag) MPASS(tp->t_termios.c_cc[VMIN] != 0); MPASS(tp->t_termios.c_cc[VTIME] != 0); - + /* * When using the interbyte timer, the timer should be started * after the first byte has been received. We just call into the * generic read timer code after we've received the first byte. */ - + for (;;) { error = ttyinq_read_uio(&tp->t_inq, tp, uio, uio->uio_resid, 0); @@ -331,7 +331,7 @@ ttydisc_read(struct tty *tp, struct uio *uio, int ioflag) /* Unset the input watermark when we've got enough space. */ tty_hiwat_in_unblock(tp); } - + return (error); } @@ -521,7 +521,7 @@ ttydisc_write(struct tty *tp, struct uio *uio, int ioflag) error = EWOULDBLOCK; goto done; } - + /* * The driver may write back the data * synchronously. 
Be sure to check the high @@ -567,7 +567,7 @@ ttydisc_optimize(struct tty *tp) } else if (!CMP_FLAG(i, ICRNL|IGNCR|IMAXBEL|INLCR|ISTRIP|IXON) && (!CMP_FLAG(i, BRKINT) || CMP_FLAG(i, IGNBRK)) && (!CMP_FLAG(i, PARMRK) || - CMP_FLAG(i, IGNPAR|IGNBRK) == (IGNPAR|IGNBRK)) && + CMP_FLAG(i, IGNPAR|IGNBRK) == (IGNPAR|IGNBRK)) && !CMP_FLAG(l, ECHO|ICANON|IEXTEN|ISIG|PENDIN)) { tp->t_flags |= TF_BYPASS; } else { @@ -583,7 +583,7 @@ ttydisc_modem(struct tty *tp, int open) if (open) cv_broadcast(&tp->t_dcdwait); - + /* * Ignore modem status lines when CLOCAL is turned on, but don't * enter the zombie state when the TTY isn't opened, because @@ -834,7 +834,7 @@ ttydisc_rint(struct tty *tp, char c, int flags) if (ttyhook_hashook(tp, rint)) return ttyhook_rint(tp, c, flags); - + if (tp->t_flags & TF_BYPASS) goto processed; @@ -1072,7 +1072,7 @@ ttydisc_rint_bypass(struct tty *tp, const void *buf, size_t len) size_t ret; tty_lock_assert(tp, MA_OWNED); - + MPASS(tp->t_flags & TF_BYPASS); atomic_add_long(&tty_nin, len); @@ -1122,7 +1122,7 @@ ttydisc_rint_poll(struct tty *tp) l = ttyinq_bytesleft(&tp->t_inq); if (l == 0 && (tp->t_flags & TF_HIWAT_IN) == 0) return (1); - + return (l); } @@ -1201,7 +1201,7 @@ ttydisc_getc_uio(struct tty *tp, struct uio *uio) tty_unlock(tp); error = uiomove(buf, len, uio); tty_lock(tp); - + if (error != 0) break; } diff --git a/sys/kern/uipc_shm.c b/sys/kern/uipc_shm.c index 00496af..0414f12 100644 --- a/sys/kern/uipc_shm.c +++ b/sys/kern/uipc_shm.c @@ -55,7 +55,10 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); +#include "opt_capsicum.h" + #include <sys/param.h> +#include <sys/capability.h> #include <sys/fcntl.h> #include <sys/file.h> #include <sys/filedesc.h> @@ -264,7 +267,7 @@ shm_dotruncate(struct shmfd *shmfd, off_t length) /* Toss in memory pages. */ if (nobjsize < object->size) vm_object_page_remove(object, nobjsize, object->size, - FALSE); + 0); /* Toss pages from swap. */ if (object->type == OBJT_SWAP) @@ -486,6 +489,14 @@ shm_open(struct thread *td, struct shm_open_args *uap) mode_t cmode; int fd, error; +#ifdef CAPABILITY_MODE + /* + * shm_open(2) is only allowed for anonymous objects. + */ + if (IN_CAPABILITY_MODE(td) && (uap->path != SHM_ANON)) + return (ECAPMODE); +#endif + if ((uap->flags & O_ACCMODE) != O_RDONLY && (uap->flags & O_ACCMODE) != O_RDWR) return (EINVAL); diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c index 3334fc2..990c6ba 100644 --- a/sys/kern/uipc_socket.c +++ b/sys/kern/uipc_socket.c @@ -1915,7 +1915,6 @@ release: /* * Optimized version of soreceive() for stream (TCP) sockets. */ -#ifdef TCP_SORECEIVE_STREAM int soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) @@ -1955,20 +1954,9 @@ soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio, } oresid = uio->uio_resid; - /* We will never ever get anything unless we are connected. */ + /* We will never ever get anything unless we are or were connected. */ if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) { - /* When disconnecting there may be still some data left. */ - if (sb->sb_cc > 0) - goto deliver; - if (!(so->so_state & SS_ISDISCONNECTED)) - error = ENOTCONN; - goto out; - } - - /* Socket buffer is empty and we shall not block. */ - if (sb->sb_cc == 0 && - ((sb->sb_flags & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) { - error = EAGAIN; + error = ENOTCONN; goto out; } @@ -1995,6 +1983,13 @@ restart: goto out; } + /* Socket buffer is empty and we shall not block. 
*/ + if (sb->sb_cc == 0 && + ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) { + error = EAGAIN; + goto out; + } + /* Socket buffer got some data that we shall deliver now. */ if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) && ((sb->sb_flags & SS_NBIO) || @@ -2109,7 +2104,6 @@ out: sbunlock(sb); return (error); } -#endif /* TCP_SORECEIVE_STREAM */ /* * Optimized version of soreceive() for simple datagram cases from userspace. diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c index 19aaee0..c434973 100644 --- a/sys/kern/uipc_syscalls.c +++ b/sys/kern/uipc_syscalls.c @@ -35,6 +35,7 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); +#include "opt_capsicum.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_sctp.h" @@ -43,6 +44,7 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/systm.h> +#include <sys/capability.h> #include <sys/kernel.h> #include <sys/lock.h> #include <sys/mutex.h> @@ -675,6 +677,11 @@ sendit(td, s, mp, flags) struct sockaddr *to; int error; +#ifdef CAPABILITY_MODE + if (IN_CAPABILITY_MODE(td) && (mp->msg_name != NULL)) + return (ECAPMODE); +#endif + if (mp->msg_name != NULL) { error = getsockaddr(&to, mp->msg_name, mp->msg_namelen); if (error) { diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index 2743089..a6ad81e 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -1625,6 +1625,7 @@ vfs_vmio_release(struct buf *bp) int i; vm_page_t m; + pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages); VM_OBJECT_LOCK(bp->b_bufobj->bo_object); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; @@ -1658,7 +1659,6 @@ vfs_vmio_release(struct buf *bp) vm_page_unlock(m); } VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object); - pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); if (bp->b_bufsize) { bufspacewakeup(); @@ -3012,6 +3012,10 @@ allocbuf(struct buf *bp, int size) if (desiredpages < bp->b_npages) { vm_page_t m; + pmap_qremove((vm_offset_t)trunc_page( + (vm_offset_t)bp->b_data) + + (desiredpages << PAGE_SHIFT), + (bp->b_npages - desiredpages)); VM_OBJECT_LOCK(bp->b_bufobj->bo_object); for (i = desiredpages; i < bp->b_npages; i++) { /* @@ -3032,8 +3036,6 @@ allocbuf(struct buf *bp, int size) vm_page_unlock(m); } VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object); - pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) + - (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages)); bp->b_npages = desiredpages; } } else if (size > bp->b_bcount) { diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c index 2e07cf1..5edf0f5 100644 --- a/sys/kern/vfs_mount.c +++ b/sys/kern/vfs_mount.c @@ -51,6 +51,7 @@ __FBSDID("$FreeBSD$"); #include <sys/proc.h> #include <sys/filedesc.h> #include <sys/reboot.h> +#include <sys/sbuf.h> #include <sys/syscallsubr.h> #include <sys/sysproto.h> #include <sys/sx.h> diff --git a/sys/kern/vfs_mountroot.c b/sys/kern/vfs_mountroot.c index 496ea70..ccbcb33 100644 --- a/sys/kern/vfs_mountroot.c +++ b/sys/kern/vfs_mountroot.c @@ -55,6 +55,7 @@ __FBSDID("$FreeBSD$"); #include <sys/proc.h> #include <sys/filedesc.h> #include <sys/reboot.h> +#include <sys/sbuf.h> #include <sys/stat.h> #include <sys/syscallsubr.h> #include <sys/sysproto.h> diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index 741061d..934745b 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -1190,8 +1190,8 @@ bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo) */ if (bo->bo_object != NULL && (flags & (V_ALT | V_NORMAL)) == 0) { VM_OBJECT_LOCK(bo->bo_object); - 
vm_object_page_remove(bo->bo_object, 0, 0,
-		    (flags & V_SAVE) ? TRUE : FALSE);
+		vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ?
+		    OBJPR_CLEANONLY : 0);
 		VM_OBJECT_UNLOCK(bo->bo_object);
 	}
 
@@ -3590,9 +3590,6 @@ vn_isdisk(struct vnode *vp, int *errp)
  * and optional call-by-reference privused argument allowing vaccess()
  * to indicate to the caller whether privilege was used to satisfy the
  * request (obsoleted).  Returns 0 on success, or an errno on failure.
- *
- * The ifdef'd CAPABILITIES version is here for reference, but is not
- * actually used.
  */
 int
 vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid,
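The bufobj_invalbuf() hunk above follows an interface change to
vm_object_page_remove(): the trailing "clean only" boolean is now a flags
word, as also seen in the shm_dotruncate() hunk earlier. A sketch of the
resulting idiom, assuming the OBJPR_CLEANONLY constant from vm/vm_object.h:

	/* Remove every page in the range, dirty or clean. */
	vm_object_page_remove(object, start, end, 0);

	/*
	 * Leave dirty pages in place, e.g. for a V_SAVE caller that still
	 * intends to flush them to disk.
	 */
	vm_object_page_remove(object, start, end, OBJPR_CLEANONLY);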