diff options
author | Renato Botelho <renato@netgate.com> | 2016-05-03 08:53:59 -0300 |
---|---|---|
committer | Renato Botelho <renato@netgate.com> | 2016-05-03 08:53:59 -0300 |
commit | 501575fb1206644a3ea8c2cd64a81084745445cc (patch) | |
tree | e07e5ad3f3ff6f6cf2841dd2d2eb0dcb0e54521a /sys/kern | |
parent | 91f599cbc0d103dd112a2472b589573724b8d70a (diff) | |
parent | 04acf11bf47629b82fc88ce0e6d6dc642b1e641b (diff) | |
download | FreeBSD-src-501575fb1206644a3ea8c2cd64a81084745445cc.zip FreeBSD-src-501575fb1206644a3ea8c2cd64a81084745445cc.tar.gz |
Merge remote-tracking branch 'origin/stable/10' into devel
Diffstat (limited to 'sys/kern')
-rw-r--r-- | sys/kern/kern_jail.c | 214 | ||||
-rw-r--r-- | sys/kern/kern_osd.c | 263 | ||||
-rw-r--r-- | sys/kern/sysv_msg.c | 378 | ||||
-rw-r--r-- | sys/kern/sysv_sem.c | 390 | ||||
-rw-r--r-- | sys/kern/sysv_shm.c | 330 |
5 files changed, 1274 insertions, 301 deletions
diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c index 0d52c7b..0ea7276 100644 --- a/sys/kern/kern_jail.c +++ b/sys/kern/kern_jail.c @@ -560,8 +560,9 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) void *op; #endif unsigned long hid; - size_t namelen, onamelen; - int created, cuflags, descend, enforce, error, errmsg_len, errmsg_pos; + size_t namelen, onamelen, pnamelen; + int born, created, cuflags, descend, enforce; + int error, errmsg_len, errmsg_pos; int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel; int fi, jid, jsys, len, level; int childmax, osreldt, rsnum, slevel; @@ -584,7 +585,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) error = priv_check(td, PRIV_JAIL_ATTACH); if (error) return (error); - mypr = ppr = td->td_ucred->cr_prison; + mypr = td->td_ucred->cr_prison; if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0) return (EPERM); if (flags & ~JAIL_SET_MASK) @@ -611,6 +612,13 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) #endif g_path = NULL; + cuflags = flags & (JAIL_CREATE | JAIL_UPDATE); + if (!cuflags) { + error = EINVAL; + vfs_opterror(opts, "no valid operation (create or update)"); + goto done_errmsg; + } + error = vfs_copyopt(opts, "jid", &jid, sizeof(jid)); if (error == ENOENT) jid = 0; @@ -1020,42 +1028,18 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) } /* - * Grab the allprison lock before letting modules check their - * parameters. Once we have it, do not let go so we'll have a - * consistent view of the OSD list. - */ - sx_xlock(&allprison_lock); - error = osd_jail_call(NULL, PR_METHOD_CHECK, opts); - if (error) - goto done_unlock_list; - - /* By now, all parameters should have been noted. */ - TAILQ_FOREACH(opt, opts, link) { - if (!opt->seen && strcmp(opt->name, "errmsg")) { - error = EINVAL; - vfs_opterror(opts, "unknown parameter: %s", opt->name); - goto done_unlock_list; - } - } - - /* - * See if we are creating a new record or updating an existing one. + * Find the specified jail, or at least its parent. * This abuses the file error codes ENOENT and EEXIST. */ - cuflags = flags & (JAIL_CREATE | JAIL_UPDATE); - if (!cuflags) { - error = EINVAL; - vfs_opterror(opts, "no valid operation (create or update)"); - goto done_unlock_list; - } pr = NULL; - namelc = NULL; + ppr = mypr; if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) { namelc = strrchr(name, '.'); jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10); if (*p != '\0') jid = 0; } + sx_xlock(&allprison_lock); if (jid != 0) { /* * See if a requested jid already exists. There is an @@ -1121,6 +1105,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) * and updates keyed by the name itself (where the name must exist * because that is the jail being updated). */ + namelc = NULL; if (name != NULL) { namelc = strrchr(name, '.'); if (namelc == NULL) @@ -1131,7 +1116,6 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) * parent and child names, and make sure the parent * exists or matches an already found jail. */ - *namelc = '\0'; if (pr != NULL) { if (strncmp(name, ppr->pr_name, namelc - name) || ppr->pr_name[namelc - name] != '\0') { @@ -1142,6 +1126,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) goto done_unlock_list; } } else { + *namelc = '\0'; ppr = prison_find_name(mypr, name); if (ppr == NULL) { error = ENOENT; @@ -1150,17 +1135,18 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) goto done_unlock_list; } mtx_unlock(&ppr->pr_mtx); + *namelc = '.'; } - name = ++namelc; + namelc++; } - if (name[0] != '\0') { - namelen = + if (namelc[0] != '\0') { + pnamelen = (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1; name_again: deadpr = NULL; FOREACH_PRISON_CHILD(ppr, tpr) { if (tpr != pr && tpr->pr_ref > 0 && - !strcmp(tpr->pr_name + namelen, name)) { + !strcmp(tpr->pr_name + pnamelen, namelc)) { if (pr == NULL && cuflags != JAIL_CREATE) { mtx_lock(&tpr->pr_mtx); @@ -1234,10 +1220,11 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) } created = 1; mtx_lock(&ppr->pr_mtx); - if (ppr->pr_ref == 0 || (ppr->pr_flags & PR_REMOVE)) { + if (ppr->pr_ref == 0) { mtx_unlock(&ppr->pr_mtx); error = ENOENT; - vfs_opterror(opts, "parent jail went away!"); + vfs_opterror(opts, "jail \"%s\" not found", + prison_name(mypr, ppr)); goto done_unlock_list; } ppr->pr_ref++; @@ -1291,8 +1278,8 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) pr->pr_id = jid; /* Set some default values, and inherit some from the parent. */ - if (name == NULL) - name = ""; + if (namelc == NULL) + namelc = ""; if (path == NULL) { path = "/"; root = mypr->pr_root; @@ -1355,6 +1342,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) LIST_INIT(&pr->pr_children); mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK); + TASK_INIT(&pr->pr_task, 0, prison_complete, pr); #ifdef VIMAGE /* Allocate a new vnet if specified. */ @@ -1374,7 +1362,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) mtx_lock(&pr->pr_mtx); /* * New prisons do not yet have a reference, because we do not - * want other to see the incomplete prison once the + * want others to see the incomplete prison once the * allprison_lock is downgraded. */ } else { @@ -1588,13 +1576,13 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) } #endif onamelen = namelen = 0; - if (name != NULL) { + if (namelc != NULL) { /* Give a default name of the jid. Also allow the name to be * explicitly the jid - but not any other number, and only in * normal form (no leading zero/etc). */ - if (name[0] == '\0') - snprintf(name = numbuf, sizeof(numbuf), "%d", jid); + if (namelc[0] == '\0') + snprintf(namelc = numbuf, sizeof(numbuf), "%d", jid); else if ((strtoul(namelc, &p, 10) != jid || namelc[0] < '1' || namelc[0] > '9') && *p == '\0') { error = EINVAL; @@ -1606,9 +1594,10 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) * Make sure the name isn't too long for the prison or its * children. */ - onamelen = strlen(pr->pr_name); - namelen = strlen(name); - if (strlen(ppr->pr_name) + namelen + 2 > sizeof(pr->pr_name)) { + pnamelen = (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1; + onamelen = strlen(pr->pr_name + pnamelen); + namelen = strlen(namelc); + if (pnamelen + namelen + 1 > sizeof(pr->pr_name)) { error = ENAMETOOLONG; goto done_deref_locked; } @@ -1625,6 +1614,30 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) goto done_deref_locked; } + /* + * Let modules check their parameters. This requires unlocking and + * then re-locking the prison, but this is still a valid state as long + * as allprison_lock remains xlocked. + */ + mtx_unlock(&pr->pr_mtx); + error = osd_jail_call(pr, PR_METHOD_CHECK, opts); + if (error != 0) { + prison_deref(pr, created + ? PD_LIST_XLOCKED + : PD_DEREF | PD_LIST_XLOCKED); + goto done_releroot; + } + mtx_lock(&pr->pr_mtx); + + /* At this point, all valid parameters should have been noted. */ + TAILQ_FOREACH(opt, opts, link) { + if (!opt->seen && strcmp(opt->name, "errmsg")) { + error = EINVAL; + vfs_opterror(opts, "unknown parameter: %s", opt->name); + goto done_deref_locked; + } + } + /* Set the parameters of the prison. */ #ifdef INET redo_ip4 = 0; @@ -1698,12 +1711,12 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) tpr->pr_devfs_rsnum = rsnum; } - if (name != NULL) { + if (namelc != NULL) { if (ppr == &prison0) - strlcpy(pr->pr_name, name, sizeof(pr->pr_name)); + strlcpy(pr->pr_name, namelc, sizeof(pr->pr_name)); else snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s", - ppr->pr_name, name); + ppr->pr_name, namelc); /* Change this component of child names. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen, @@ -1781,6 +1794,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) * for now, so new ones will remain unseen until after the module * handlers have completed. */ + born = pr->pr_uref == 0; if (!created && (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags))) { if (pr_flags & PR_PERSIST) { pr->pr_ref++; @@ -1850,15 +1864,20 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) /* Let the modules do their work. */ sx_downgrade(&allprison_lock); - if (created) { + if (born) { error = osd_jail_call(pr, PR_METHOD_CREATE, opts); if (error) { - prison_deref(pr, PD_LIST_SLOCKED); + (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL); + prison_deref(pr, created + ? PD_LIST_SLOCKED + : PD_DEREF | PD_LIST_SLOCKED); goto done_errmsg; } } error = osd_jail_call(pr, PR_METHOD_SET, opts); if (error) { + if (born) + (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL); prison_deref(pr, created ? PD_LIST_SLOCKED : PD_DEREF | PD_LIST_SLOCKED); @@ -1910,7 +1929,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) sx_sunlock(&allprison_lock); } - goto done_errmsg; + goto done_free; done_deref_locked: prison_deref(pr, created @@ -2290,7 +2309,6 @@ sys_jail_remove(struct thread *td, struct jail_remove_args *uap) /* Remove all descendants of this prison, then remove this prison. */ pr->pr_ref++; - pr->pr_flags |= PR_REMOVE; if (!LIST_EMPTY(&pr->pr_children)) { mtx_unlock(&pr->pr_mtx); lpr = NULL; @@ -2299,7 +2317,6 @@ sys_jail_remove(struct thread *td, struct jail_remove_args *uap) if (cpr->pr_ref > 0) { tpr = cpr; cpr->pr_ref++; - cpr->pr_flags |= PR_REMOVE; } else { /* Already removed - do not do it again. */ tpr = NULL; @@ -2406,7 +2423,6 @@ sys_jail_attach(struct thread *td, struct jail_attach_args *uap) static int do_jail_attach(struct thread *td, struct prison *pr) { - struct prison *ppr; struct proc *p; struct ucred *newcred, *oldcred; int error; @@ -2434,7 +2450,6 @@ do_jail_attach(struct thread *td, struct prison *pr) /* * Reparent the newly attached process to this jail. */ - ppr = td->td_ucred->cr_prison; p = td->td_proc; error = cpuset_setproc_update_set(p, pr->pr_cpuset); if (error) @@ -2453,23 +2468,23 @@ do_jail_attach(struct thread *td, struct prison *pr) newcred = crget(); PROC_LOCK(p); - oldcred = p->p_ucred; - setsugid(p); - crcopy(newcred, oldcred); + oldcred = crcopysafe(p, newcred); newcred->cr_prison = pr; p->p_ucred = newcred; + setsugid(p); PROC_UNLOCK(p); #ifdef RACCT racct_proc_ucred_changed(p, oldcred, newcred); #endif + prison_deref(oldcred->cr_prison, PD_DEREF | PD_DEUREF); crfree(oldcred); - prison_deref(ppr, PD_DEREF | PD_DEUREF); return (0); + e_unlock: VOP_UNLOCK(pr->pr_root, 0); e_revert_osd: /* Tell modules this thread is still in its old jail after all. */ - (void)osd_jail_call(ppr, PR_METHOD_ATTACH, td); + (void)osd_jail_call(td->td_ucred->cr_prison, PR_METHOD_ATTACH, td); prison_deref(pr, PD_DEREF | PD_DEUREF); return (error); } @@ -2578,16 +2593,13 @@ prison_allow(struct ucred *cred, unsigned flag) void prison_free_locked(struct prison *pr) { + int ref; mtx_assert(&pr->pr_mtx, MA_OWNED); - pr->pr_ref--; - if (pr->pr_ref == 0) { - mtx_unlock(&pr->pr_mtx); - TASK_INIT(&pr->pr_task, 0, prison_complete, pr); - taskqueue_enqueue(taskqueue_thread, &pr->pr_task); - return; - } + ref = --pr->pr_ref; mtx_unlock(&pr->pr_mtx); + if (ref == 0) + taskqueue_enqueue(taskqueue_thread, &pr->pr_task); } void @@ -2598,11 +2610,17 @@ prison_free(struct prison *pr) prison_free_locked(pr); } +/* + * Complete a call to either prison_free or prison_proc_free. + */ static void prison_complete(void *context, int pending) { + struct prison *pr = context; - prison_deref((struct prison *)context, 0); + mtx_lock(&pr->pr_mtx); + prison_deref(pr, pr->pr_uref + ? PD_DEREF | PD_DEUREF | PD_LOCKED : PD_LOCKED); } /* @@ -2615,19 +2633,53 @@ static void prison_deref(struct prison *pr, int flags) { struct prison *ppr, *tpr; + int ref, lasturef; if (!(flags & PD_LOCKED)) mtx_lock(&pr->pr_mtx); for (;;) { if (flags & PD_DEUREF) { + KASSERT(pr->pr_uref > 0, + ("prison_deref PD_DEUREF on a dead prison (jid=%d)", + pr->pr_id)); pr->pr_uref--; + lasturef = pr->pr_uref == 0; + if (lasturef) + pr->pr_ref++; KASSERT(prison0.pr_uref != 0, ("prison0 pr_uref=0")); - } - if (flags & PD_DEREF) + } else + lasturef = 0; + if (flags & PD_DEREF) { + KASSERT(pr->pr_ref > 0, + ("prison_deref PD_DEREF on a dead prison (jid=%d)", + pr->pr_id)); pr->pr_ref--; - /* If the prison still has references, nothing else to do. */ - if (pr->pr_ref > 0) { + } + ref = pr->pr_ref; + mtx_unlock(&pr->pr_mtx); + + /* + * Tell the modules if the last user reference was removed + * (even it sticks around in dying state). + */ + if (lasturef) { + if (!(flags & (PD_LIST_SLOCKED | PD_LIST_XLOCKED))) { + if (ref > 1) { + sx_slock(&allprison_lock); + flags |= PD_LIST_SLOCKED; + } else { + sx_xlock(&allprison_lock); + flags |= PD_LIST_XLOCKED; + } + } + (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL); + mtx_lock(&pr->pr_mtx); + ref = --pr->pr_ref; mtx_unlock(&pr->pr_mtx); + } + + /* If the prison still has references, nothing else to do. */ + if (ref > 0) { if (flags & PD_LIST_SLOCKED) sx_sunlock(&allprison_lock); else if (flags & PD_LIST_XLOCKED) @@ -2635,7 +2687,6 @@ prison_deref(struct prison *pr, int flags) return; } - mtx_unlock(&pr->pr_mtx); if (flags & PD_LIST_SLOCKED) { if (!sx_try_upgrade(&allprison_lock)) { sx_sunlock(&allprison_lock); @@ -2717,7 +2768,20 @@ prison_proc_free(struct prison *pr) mtx_lock(&pr->pr_mtx); KASSERT(pr->pr_uref > 0, ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id)); - prison_deref(pr, PD_DEUREF | PD_LOCKED); + if (pr->pr_uref > 1) + pr->pr_uref--; + else { + /* + * Don't remove the last user reference in this context, which + * is expected to be a process that is not only locked, but + * also half dead. + */ + pr->pr_ref++; + mtx_unlock(&pr->pr_mtx); + taskqueue_enqueue(taskqueue_thread, &pr->pr_task); + return; + } + mtx_unlock(&pr->pr_mtx); } diff --git a/sys/kern/kern_osd.c b/sys/kern/kern_osd.c index 184c4f0..26831c3 100644 --- a/sys/kern/kern_osd.c +++ b/sys/kern/kern_osd.c @@ -44,6 +44,23 @@ __FBSDID("$FreeBSD$"); /* OSD (Object Specific Data) */ +/* + * Lock key: + * (m) osd_module_lock + * (o) osd_object_lock + * (l) osd_list_lock + */ +struct osd_master { + struct sx osd_module_lock; + struct rmlock osd_object_lock; + struct mtx osd_list_lock; + LIST_HEAD(, osd) osd_list; /* (l) */ + osd_destructor_t *osd_destructors; /* (o) */ + osd_method_t *osd_methods; /* (m) */ + u_int osd_ntslots; /* (m) */ + const u_int osd_nmethods; +}; + static MALLOC_DEFINE(M_OSD, "osd", "Object Specific Data"); static int osd_debug = 0; @@ -62,25 +79,12 @@ static void do_osd_del(u_int type, struct osd *osd, u_int slot, int list_locked); /* - * Lists of objects with OSD. - * - * Lock key: - * (m) osd_module_lock - * (o) osd_object_lock - * (l) osd_list_lock + * List of objects with OSD. */ -static LIST_HEAD(, osd) osd_list[OSD_LAST + 1]; /* (m) */ -static osd_method_t *osd_methods[OSD_LAST + 1]; /* (m) */ -static u_int osd_nslots[OSD_LAST + 1]; /* (m) */ -static osd_destructor_t *osd_destructors[OSD_LAST + 1]; /* (o) */ -static const u_int osd_nmethods[OSD_LAST + 1] = { - [OSD_JAIL] = PR_MAXMETHOD, +struct osd_master osdm[OSD_LAST + 1] = { + [OSD_JAIL] = { .osd_nmethods = PR_MAXMETHOD }, }; -static struct sx osd_module_lock[OSD_LAST + 1]; -static struct rmlock osd_object_lock[OSD_LAST + 1]; -static struct mtx osd_list_lock[OSD_LAST + 1]; - static void osd_default_destructor(void *value __unused) { @@ -102,12 +106,12 @@ osd_register(u_int type, osd_destructor_t destructor, osd_method_t *methods) if (destructor == NULL) destructor = osd_default_destructor; - sx_xlock(&osd_module_lock[type]); + sx_xlock(&osdm[type].osd_module_lock); /* * First, we try to find unused slot. */ - for (i = 0; i < osd_nslots[type]; i++) { - if (osd_destructors[type][i] == NULL) { + for (i = 0; i < osdm[type].osd_ntslots; i++) { + if (osdm[type].osd_destructors[i] == NULL) { OSD_DEBUG("Unused slot found (type=%u, slot=%u).", type, i); break; @@ -116,31 +120,31 @@ osd_register(u_int type, osd_destructor_t destructor, osd_method_t *methods) /* * If no unused slot was found, allocate one. */ - if (i == osd_nslots[type]) { - osd_nslots[type]++; - if (osd_nmethods[type] != 0) - osd_methods[type] = realloc(osd_methods[type], - sizeof(osd_method_t) * osd_nslots[type] * - osd_nmethods[type], M_OSD, M_WAITOK); - newptr = malloc(sizeof(osd_destructor_t) * osd_nslots[type], - M_OSD, M_WAITOK); - rm_wlock(&osd_object_lock[type]); - bcopy(osd_destructors[type], newptr, + if (i == osdm[type].osd_ntslots) { + osdm[type].osd_ntslots++; + if (osdm[type].osd_nmethods != 0) + osdm[type].osd_methods = realloc(osdm[type].osd_methods, + sizeof(osd_method_t) * osdm[type].osd_ntslots * + osdm[type].osd_nmethods, M_OSD, M_WAITOK); + newptr = malloc(sizeof(osd_destructor_t) * + osdm[type].osd_ntslots, M_OSD, M_WAITOK); + rm_wlock(&osdm[type].osd_object_lock); + bcopy(osdm[type].osd_destructors, newptr, sizeof(osd_destructor_t) * i); - free(osd_destructors[type], M_OSD); - osd_destructors[type] = newptr; - rm_wunlock(&osd_object_lock[type]); + free(osdm[type].osd_destructors, M_OSD); + osdm[type].osd_destructors = newptr; + rm_wunlock(&osdm[type].osd_object_lock); OSD_DEBUG("New slot allocated (type=%u, slot=%u).", type, i + 1); } - osd_destructors[type][i] = destructor; - if (osd_nmethods[type] != 0) { - for (m = 0; m < osd_nmethods[type]; m++) - osd_methods[type][i * osd_nmethods[type] + m] = - methods != NULL ? methods[m] : NULL; + osdm[type].osd_destructors[i] = destructor; + if (osdm[type].osd_nmethods != 0) { + for (m = 0; m < osdm[type].osd_nmethods; m++) + osdm[type].osd_methods[i * osdm[type].osd_nmethods + m] + = methods != NULL ? methods[m] : NULL; } - sx_xunlock(&osd_module_lock[type]); + sx_xunlock(&osdm[type].osd_module_lock); return (i + 1); } @@ -151,105 +155,142 @@ osd_deregister(u_int type, u_int slot) KASSERT(type >= OSD_FIRST && type <= OSD_LAST, ("Invalid type.")); KASSERT(slot > 0, ("Invalid slot.")); - KASSERT(osd_destructors[type][slot - 1] != NULL, ("Unused slot.")); + KASSERT(osdm[type].osd_destructors[slot - 1] != NULL, ("Unused slot.")); - sx_xlock(&osd_module_lock[type]); - rm_wlock(&osd_object_lock[type]); + sx_xlock(&osdm[type].osd_module_lock); + rm_wlock(&osdm[type].osd_object_lock); /* * Free all OSD for the given slot. */ - mtx_lock(&osd_list_lock[type]); - LIST_FOREACH_SAFE(osd, &osd_list[type], osd_next, tosd) + mtx_lock(&osdm[type].osd_list_lock); + LIST_FOREACH_SAFE(osd, &osdm[type].osd_list, osd_next, tosd) do_osd_del(type, osd, slot, 1); - mtx_unlock(&osd_list_lock[type]); + mtx_unlock(&osdm[type].osd_list_lock); /* * Set destructor to NULL to free the slot. */ - osd_destructors[type][slot - 1] = NULL; - if (slot == osd_nslots[type]) { - osd_nslots[type]--; - osd_destructors[type] = realloc(osd_destructors[type], - sizeof(osd_destructor_t) * osd_nslots[type], M_OSD, + osdm[type].osd_destructors[slot - 1] = NULL; + if (slot == osdm[type].osd_ntslots) { + osdm[type].osd_ntslots--; + osdm[type].osd_destructors = realloc(osdm[type].osd_destructors, + sizeof(osd_destructor_t) * osdm[type].osd_ntslots, M_OSD, M_NOWAIT | M_ZERO); - if (osd_nmethods[type] != 0) - osd_methods[type] = realloc(osd_methods[type], - sizeof(osd_method_t) * osd_nslots[type] * - osd_nmethods[type], M_OSD, M_NOWAIT | M_ZERO); + if (osdm[type].osd_nmethods != 0) + osdm[type].osd_methods = realloc(osdm[type].osd_methods, + sizeof(osd_method_t) * osdm[type].osd_ntslots * + osdm[type].osd_nmethods, M_OSD, M_NOWAIT | M_ZERO); /* * We always reallocate to smaller size, so we assume it will * always succeed. */ - KASSERT(osd_destructors[type] != NULL && - (osd_nmethods[type] == 0 || osd_methods[type] != NULL), - ("realloc() failed")); + KASSERT(osdm[type].osd_destructors != NULL && + (osdm[type].osd_nmethods == 0 || + osdm[type].osd_methods != NULL), ("realloc() failed")); OSD_DEBUG("Deregistration of the last slot (type=%u, slot=%u).", type, slot); } else { OSD_DEBUG("Slot deregistration (type=%u, slot=%u).", type, slot); } - rm_wunlock(&osd_object_lock[type]); - sx_xunlock(&osd_module_lock[type]); + rm_wunlock(&osdm[type].osd_object_lock); + sx_xunlock(&osdm[type].osd_module_lock); } int osd_set(u_int type, struct osd *osd, u_int slot, void *value) { + + return (osd_set_reserved(type, osd, slot, NULL, value)); +} + +void * +osd_reserve(u_int slot) +{ + + KASSERT(slot > 0, ("Invalid slot.")); + + OSD_DEBUG("Reserving slot array (slot=%u).", slot); + return (malloc(sizeof(void *) * slot, M_OSD, M_WAITOK | M_ZERO)); +} + +int +osd_set_reserved(u_int type, struct osd *osd, u_int slot, void *rsv, + void *value) +{ struct rm_priotracker tracker; KASSERT(type >= OSD_FIRST && type <= OSD_LAST, ("Invalid type.")); KASSERT(slot > 0, ("Invalid slot.")); - KASSERT(osd_destructors[type][slot - 1] != NULL, ("Unused slot.")); + KASSERT(osdm[type].osd_destructors[slot - 1] != NULL, ("Unused slot.")); - rm_rlock(&osd_object_lock[type], &tracker); + rm_rlock(&osdm[type].osd_object_lock, &tracker); if (slot > osd->osd_nslots) { + void *newptr; + if (value == NULL) { OSD_DEBUG( "Not allocating null slot (type=%u, slot=%u).", type, slot); - rm_runlock(&osd_object_lock[type], &tracker); + rm_runlock(&osdm[type].osd_object_lock, &tracker); + if (rsv) + osd_free_reserved(rsv); return (0); - } else if (osd->osd_nslots == 0) { + } + + /* + * Too few slots allocated here, so we need to extend or create + * the array. + */ + if (rsv) { /* - * First OSD for this object, so we need to allocate - * space and put it onto the list. + * Use the reserve passed in (assumed to be + * the right size). */ - osd->osd_slots = malloc(sizeof(void *) * slot, M_OSD, - M_NOWAIT | M_ZERO); - if (osd->osd_slots == NULL) { - rm_runlock(&osd_object_lock[type], &tracker); - return (ENOMEM); + newptr = rsv; + if (osd->osd_nslots != 0) { + memcpy(newptr, osd->osd_slots, + sizeof(void *) * osd->osd_nslots); + free(osd->osd_slots, M_OSD); } - osd->osd_nslots = slot; - mtx_lock(&osd_list_lock[type]); - LIST_INSERT_HEAD(&osd_list[type], osd, osd_next); - mtx_unlock(&osd_list_lock[type]); - OSD_DEBUG("Setting first slot (type=%u).", type); } else { - void *newptr; - - /* - * Too few slots allocated here, needs to extend - * the array. - */ newptr = realloc(osd->osd_slots, sizeof(void *) * slot, M_OSD, M_NOWAIT | M_ZERO); if (newptr == NULL) { - rm_runlock(&osd_object_lock[type], &tracker); + rm_runlock(&osdm[type].osd_object_lock, + &tracker); return (ENOMEM); } - osd->osd_slots = newptr; - osd->osd_nslots = slot; - OSD_DEBUG("Growing slots array (type=%u).", type); } - } + if (osd->osd_nslots == 0) { + /* + * First OSD for this object, so we need to put it + * onto the list. + */ + mtx_lock(&osdm[type].osd_list_lock); + LIST_INSERT_HEAD(&osdm[type].osd_list, osd, osd_next); + mtx_unlock(&osdm[type].osd_list_lock); + OSD_DEBUG("Setting first slot (type=%u).", type); + } else + OSD_DEBUG("Growing slots array (type=%u).", type); + osd->osd_slots = newptr; + osd->osd_nslots = slot; + } else if (rsv) + osd_free_reserved(rsv); OSD_DEBUG("Setting slot value (type=%u, slot=%u, value=%p).", type, slot, value); osd->osd_slots[slot - 1] = value; - rm_runlock(&osd_object_lock[type], &tracker); + rm_runlock(&osdm[type].osd_object_lock, &tracker); return (0); } +void +osd_free_reserved(void *rsv) +{ + + OSD_DEBUG("Discarding reserved slot array."); + free(rsv, M_OSD); +} + void * osd_get(u_int type, struct osd *osd, u_int slot) { @@ -258,9 +299,9 @@ osd_get(u_int type, struct osd *osd, u_int slot) KASSERT(type >= OSD_FIRST && type <= OSD_LAST, ("Invalid type.")); KASSERT(slot > 0, ("Invalid slot.")); - KASSERT(osd_destructors[type][slot - 1] != NULL, ("Unused slot.")); + KASSERT(osdm[type].osd_destructors[slot - 1] != NULL, ("Unused slot.")); - rm_rlock(&osd_object_lock[type], &tracker); + rm_rlock(&osdm[type].osd_object_lock, &tracker); if (slot > osd->osd_nslots) { value = NULL; OSD_DEBUG("Slot doesn't exist (type=%u, slot=%u).", type, slot); @@ -269,7 +310,7 @@ osd_get(u_int type, struct osd *osd, u_int slot) OSD_DEBUG("Returning slot value (type=%u, slot=%u, value=%p).", type, slot, value); } - rm_runlock(&osd_object_lock[type], &tracker); + rm_runlock(&osdm[type].osd_object_lock, &tracker); return (value); } @@ -278,9 +319,9 @@ osd_del(u_int type, struct osd *osd, u_int slot) { struct rm_priotracker tracker; - rm_rlock(&osd_object_lock[type], &tracker); + rm_rlock(&osdm[type].osd_object_lock, &tracker); do_osd_del(type, osd, slot, 0); - rm_runlock(&osd_object_lock[type], &tracker); + rm_runlock(&osdm[type].osd_object_lock, &tracker); } static void @@ -290,7 +331,7 @@ do_osd_del(u_int type, struct osd *osd, u_int slot, int list_locked) KASSERT(type >= OSD_FIRST && type <= OSD_LAST, ("Invalid type.")); KASSERT(slot > 0, ("Invalid slot.")); - KASSERT(osd_destructors[type][slot - 1] != NULL, ("Unused slot.")); + KASSERT(osdm[type].osd_destructors[slot - 1] != NULL, ("Unused slot.")); OSD_DEBUG("Deleting slot (type=%u, slot=%u).", type, slot); @@ -299,7 +340,7 @@ do_osd_del(u_int type, struct osd *osd, u_int slot, int list_locked) return; } if (osd->osd_slots[slot - 1] != NULL) { - osd_destructors[type][slot - 1](osd->osd_slots[slot - 1]); + osdm[type].osd_destructors[slot - 1](osd->osd_slots[slot - 1]); osd->osd_slots[slot - 1] = NULL; } for (i = osd->osd_nslots - 1; i >= 0; i--) { @@ -313,10 +354,10 @@ do_osd_del(u_int type, struct osd *osd, u_int slot, int list_locked) /* No values left for this object. */ OSD_DEBUG("No more slots left (type=%u).", type); if (!list_locked) - mtx_lock(&osd_list_lock[type]); + mtx_lock(&osdm[type].osd_list_lock); LIST_REMOVE(osd, osd_next); if (!list_locked) - mtx_unlock(&osd_list_lock[type]); + mtx_unlock(&osdm[type].osd_list_lock); free(osd->osd_slots, M_OSD); osd->osd_slots = NULL; osd->osd_nslots = 0; @@ -342,21 +383,21 @@ osd_call(u_int type, u_int method, void *obj, void *data) int error, i; KASSERT(type >= OSD_FIRST && type <= OSD_LAST, ("Invalid type.")); - KASSERT(method < osd_nmethods[type], ("Invalid method.")); + KASSERT(method < osdm[type].osd_nmethods, ("Invalid method.")); /* * Call this method for every slot that defines it, stopping if an * error is encountered. */ error = 0; - sx_slock(&osd_module_lock[type]); - for (i = 0; i < osd_nslots[type]; i++) { - methodfun = - osd_methods[type][i * osd_nmethods[type] + method]; + sx_slock(&osdm[type].osd_module_lock); + for (i = 0; i < osdm[type].osd_ntslots; i++) { + methodfun = osdm[type].osd_methods[i * osdm[type].osd_nmethods + + method]; if (methodfun != NULL && (error = methodfun(obj, data)) != 0) break; } - sx_sunlock(&osd_module_lock[type]); + sx_sunlock(&osdm[type].osd_module_lock); return (error); } @@ -374,14 +415,14 @@ osd_exit(u_int type, struct osd *osd) return; } - rm_rlock(&osd_object_lock[type], &tracker); + rm_rlock(&osdm[type].osd_object_lock, &tracker); for (i = 1; i <= osd->osd_nslots; i++) { - if (osd_destructors[type][i - 1] != NULL) + if (osdm[type].osd_destructors[i - 1] != NULL) do_osd_del(type, osd, i, 0); else OSD_DEBUG("Unused slot (type=%u, slot=%u).", type, i); } - rm_runlock(&osd_object_lock[type], &tracker); + rm_runlock(&osdm[type].osd_object_lock, &tracker); OSD_DEBUG("Object exit (type=%u).", type); } @@ -391,13 +432,13 @@ osd_init(void *arg __unused) u_int i; for (i = OSD_FIRST; i <= OSD_LAST; i++) { - osd_nslots[i] = 0; - LIST_INIT(&osd_list[i]); - sx_init(&osd_module_lock[i], "osd_module"); - rm_init(&osd_object_lock[i], "osd_object"); - mtx_init(&osd_list_lock[i], "osd_list", NULL, MTX_DEF); - osd_destructors[i] = NULL; - osd_methods[i] = NULL; + sx_init(&osdm[i].osd_module_lock, "osd_module"); + rm_init(&osdm[i].osd_object_lock, "osd_object"); + mtx_init(&osdm[i].osd_list_lock, "osd_list", NULL, MTX_DEF); + LIST_INIT(&osdm[i].osd_list); + osdm[i].osd_destructors = NULL; + osdm[i].osd_ntslots = 0; + osdm[i].osd_methods = NULL; } } SYSINIT(osd, SI_SUB_LOCK, SI_ORDER_ANY, osd_init, NULL); diff --git a/sys/kern/sysv_msg.c b/sys/kern/sysv_msg.c index 3248278..dcbe092 100644 --- a/sys/kern/sysv_msg.c +++ b/sys/kern/sysv_msg.c @@ -62,8 +62,10 @@ __FBSDID("$FreeBSD$"); #include <sys/lock.h> #include <sys/mutex.h> #include <sys/module.h> +#include <sys/mount.h> #include <sys/msg.h> #include <sys/racct.h> +#include <sys/sx.h> #include <sys/syscall.h> #include <sys/syscallsubr.h> #include <sys/sysent.h> @@ -80,6 +82,14 @@ static MALLOC_DEFINE(M_MSG, "msg", "SVID compatible message queues"); static int msginit(void); static int msgunload(void); static int sysvmsg_modload(struct module *, int, void *); +static void msq_remove(struct msqid_kernel *); +static struct prison *msg_find_prison(struct ucred *); +static int msq_prison_cansee(struct prison *, struct msqid_kernel *); +static int msg_prison_check(void *, void *); +static int msg_prison_set(void *, void *); +static int msg_prison_get(void *, void *); +static int msg_prison_remove(void *, void *); +static void msg_prison_cleanup(struct prison *); #ifdef MSG_DEBUG @@ -155,6 +165,7 @@ static struct msgmap *msgmaps; /* MSGSEG msgmap structures */ static struct msg *msghdrs; /* MSGTQL msg headers */ static struct msqid_kernel *msqids; /* MSGMNI msqid_kernel struct's */ static struct mtx msq_mtx; /* global mutex for message queues. */ +static unsigned msg_prison_slot;/* prison OSD slot */ static struct syscall_helper_data msg_syscalls[] = { SYSCALL_INIT_HELPER(msgctl), @@ -194,7 +205,15 @@ static struct syscall_helper_data msg32_syscalls[] = { static int msginit() { + struct prison *pr; + void *rsv; int i, error; + osd_method_t methods[PR_MAXMETHOD] = { + [PR_METHOD_CHECK] = msg_prison_check, + [PR_METHOD_SET] = msg_prison_set, + [PR_METHOD_GET] = msg_prison_get, + [PR_METHOD_REMOVE] = msg_prison_remove, + }; TUNABLE_INT_FETCH("kern.ipc.msgseg", &msginfo.msgseg); TUNABLE_INT_FETCH("kern.ipc.msgssz", &msginfo.msgssz); @@ -258,6 +277,29 @@ msginit() } mtx_init(&msq_mtx, "msq", NULL, MTX_DEF); + /* Set current prisons according to their allow.sysvipc. */ + msg_prison_slot = osd_jail_register(NULL, methods); + rsv = osd_reserve(msg_prison_slot); + prison_lock(&prison0); + (void)osd_jail_set_reserved(&prison0, msg_prison_slot, rsv, &prison0); + prison_unlock(&prison0); + rsv = NULL; + sx_slock(&allprison_lock); + TAILQ_FOREACH(pr, &allprison, pr_list) { + if (rsv == NULL) + rsv = osd_reserve(msg_prison_slot); + prison_lock(pr); + if ((pr->pr_allow & PR_ALLOW_SYSVIPC) && pr->pr_ref > 0) { + (void)osd_jail_set_reserved(pr, msg_prison_slot, rsv, + &prison0); + rsv = NULL; + } + prison_unlock(pr); + } + if (rsv != NULL) + osd_free_reserved(rsv); + sx_sunlock(&allprison_lock); + error = syscall_helper_register(msg_syscalls); if (error != 0) return (error); @@ -298,6 +340,8 @@ msgunload() if (msqid != msginfo.msgmni) return (EBUSY); + if (msg_prison_slot != 0) + osd_jail_deregister(msg_prison_slot); #ifdef MAC for (i = 0; i < msginfo.msgtql; i++) mac_sysvmsg_destroy(&msghdrs[i]); @@ -372,6 +416,67 @@ msg_freehdr(msghdr) #endif } +static void +msq_remove(struct msqid_kernel *msqkptr) +{ + struct msg *msghdr; + + racct_sub_cred(msqkptr->cred, RACCT_NMSGQ, 1); + racct_sub_cred(msqkptr->cred, RACCT_MSGQQUEUED, msqkptr->u.msg_qnum); + racct_sub_cred(msqkptr->cred, RACCT_MSGQSIZE, msqkptr->u.msg_cbytes); + crfree(msqkptr->cred); + msqkptr->cred = NULL; + + /* Free the message headers */ + msghdr = msqkptr->u.msg_first; + while (msghdr != NULL) { + struct msg *msghdr_tmp; + + /* Free the segments of each message */ + msqkptr->u.msg_cbytes -= msghdr->msg_ts; + msqkptr->u.msg_qnum--; + msghdr_tmp = msghdr; + msghdr = msghdr->msg_next; + msg_freehdr(msghdr_tmp); + } + + if (msqkptr->u.msg_cbytes != 0) + panic("msg_cbytes is screwed up"); + if (msqkptr->u.msg_qnum != 0) + panic("msg_qnum is screwed up"); + + msqkptr->u.msg_qbytes = 0; /* Mark it as free */ + +#ifdef MAC + mac_sysvmsq_cleanup(msqkptr); +#endif + + wakeup(msqkptr); +} + +static struct prison * +msg_find_prison(struct ucred *cred) +{ + struct prison *pr, *rpr; + + pr = cred->cr_prison; + prison_lock(pr); + rpr = osd_jail_get(pr, msg_prison_slot); + prison_unlock(pr); + return rpr; +} + +static int +msq_prison_cansee(struct prison *rpr, struct msqid_kernel *msqkptr) +{ + + if (msqkptr->cred == NULL || + !(rpr == msqkptr->cred->cr_prison || + prison_ischild(rpr, msqkptr->cred->cr_prison))) + return (EINVAL); + return (0); +} + #ifndef _SYS_SYSPROTO_H_ struct msgctl_args { int msqid; @@ -408,8 +513,10 @@ kern_msgctl(td, msqid, cmd, msqbuf) { int rval, error, msqix; register struct msqid_kernel *msqkptr; + struct prison *rpr; - if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) + rpr = msg_find_prison(td->td_ucred); + if (rpr == NULL) return (ENOSYS); msqix = IPCID_TO_IX(msqid); @@ -433,6 +540,13 @@ kern_msgctl(td, msqid, cmd, msqbuf) error = EINVAL; goto done2; } + + error = msq_prison_cansee(rpr, msqkptr); + if (error != 0) { + DPRINTF(("requester can't see prison\n")); + goto done2; + } + #ifdef MAC error = mac_sysvmsq_check_msqctl(td->td_ucred, msqkptr, cmd); if (error != 0) @@ -446,7 +560,9 @@ kern_msgctl(td, msqid, cmd, msqbuf) case IPC_RMID: { +#ifdef MAC struct msg *msghdr; +#endif if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_M))) goto done2; @@ -468,37 +584,7 @@ kern_msgctl(td, msqid, cmd, msqbuf) } #endif - racct_sub_cred(msqkptr->cred, RACCT_NMSGQ, 1); - racct_sub_cred(msqkptr->cred, RACCT_MSGQQUEUED, msqkptr->u.msg_qnum); - racct_sub_cred(msqkptr->cred, RACCT_MSGQSIZE, msqkptr->u.msg_cbytes); - crfree(msqkptr->cred); - msqkptr->cred = NULL; - - /* Free the message headers */ - msghdr = msqkptr->u.msg_first; - while (msghdr != NULL) { - struct msg *msghdr_tmp; - - /* Free the segments of each message */ - msqkptr->u.msg_cbytes -= msghdr->msg_ts; - msqkptr->u.msg_qnum--; - msghdr_tmp = msghdr; - msghdr = msghdr->msg_next; - msg_freehdr(msghdr_tmp); - } - - if (msqkptr->u.msg_cbytes != 0) - panic("msg_cbytes is screwed up"); - if (msqkptr->u.msg_qnum != 0) - panic("msg_qnum is screwed up"); - - msqkptr->u.msg_qbytes = 0; /* Mark it as free */ - -#ifdef MAC - mac_sysvmsq_cleanup(msqkptr); -#endif - - wakeup(msqkptr); + msq_remove(msqkptr); } break; @@ -535,6 +621,8 @@ kern_msgctl(td, msqid, cmd, msqbuf) goto done2; } *msqbuf = msqkptr->u; + if (td->td_ucred->cr_prison != msqkptr->cred->cr_prison) + msqbuf->msg_perm.key = IPC_PRIVATE; break; default: @@ -570,7 +658,7 @@ sys_msgget(td, uap) DPRINTF(("msgget(0x%x, 0%o)\n", key, msgflg)); - if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) + if (msg_find_prison(cred) == NULL) return (ENOSYS); mtx_lock(&msq_mtx); @@ -578,6 +666,8 @@ sys_msgget(td, uap) for (msqid = 0; msqid < msginfo.msgmni; msqid++) { msqkptr = &msqids[msqid]; if (msqkptr->u.msg_qbytes != 0 && + msqkptr->cred != NULL && + msqkptr->cred->cr_prison == cred->cr_prison && msqkptr->u.msg_perm.key == key) break; } @@ -690,12 +780,14 @@ kern_msgsnd(td, msqid, msgp, msgsz, msgflg, mtype) int msqix, segs_needed, error = 0; register struct msqid_kernel *msqkptr; register struct msg *msghdr; + struct prison *rpr; short next; #ifdef RACCT size_t saved_msgsz; #endif - if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) + rpr = msg_find_prison(td->td_ucred); + if (rpr == NULL) return (ENOSYS); mtx_lock(&msq_mtx); @@ -720,6 +812,11 @@ kern_msgsnd(td, msqid, msgp, msgsz, msgflg, mtype) goto done2; } + if ((error = msq_prison_cansee(rpr, msqkptr))) { + DPRINTF(("requester can't see prison\n")); + goto done2; + } + if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_W))) { DPRINTF(("requester doesn't have write access\n")); goto done2; @@ -1058,10 +1155,12 @@ kern_msgrcv(td, msqid, msgp, msgsz, msgtyp, msgflg, mtype) size_t len; register struct msqid_kernel *msqkptr; register struct msg *msghdr; + struct prison *rpr; int msqix, error = 0; short next; - if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) + rpr = msg_find_prison(td->td_ucred); + if (rpr == NULL) return (ENOSYS); msqix = IPCID_TO_IX(msqid); @@ -1085,6 +1184,11 @@ kern_msgrcv(td, msqid, msgp, msgsz, msgtyp, msgflg, mtype) goto done2; } + if ((error = msq_prison_cansee(rpr, msqkptr))) { + DPRINTF(("requester can't see prison\n")); + goto done2; + } + if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_R))) { DPRINTF(("requester doesn't have read access\n")); goto done2; @@ -1324,9 +1428,29 @@ sys_msgrcv(td, uap) static int sysctl_msqids(SYSCTL_HANDLER_ARGS) { + struct msqid_kernel tmsqk; + struct prison *pr, *rpr; + int error, i; - return (SYSCTL_OUT(req, msqids, - sizeof(struct msqid_kernel) * msginfo.msgmni)); + pr = req->td->td_ucred->cr_prison; + rpr = msg_find_prison(req->td->td_ucred); + error = 0; + for (i = 0; i < msginfo.msgmni; i++) { + mtx_lock(&msq_mtx); + if (msqids[i].u.msg_qbytes == 0 || rpr == NULL || + msq_prison_cansee(rpr, &msqids[i]) != 0) + bzero(&tmsqk, sizeof(tmsqk)); + else { + tmsqk = msqids[i]; + if (tmsqk.cred->cr_prison != pr) + tmsqk.u.msg_perm.key = IPC_PRIVATE; + } + mtx_unlock(&msq_mtx); + error = SYSCTL_OUT(req, &tmsqk, sizeof(tmsqk)); + if (error != 0) + break; + } + return (error); } SYSCTL_INT(_kern_ipc, OID_AUTO, msgmax, CTLFLAG_RD, &msginfo.msgmax, 0, @@ -1341,9 +1465,185 @@ SYSCTL_INT(_kern_ipc, OID_AUTO, msgssz, CTLFLAG_RDTUN, &msginfo.msgssz, 0, "Size of a message segment"); SYSCTL_INT(_kern_ipc, OID_AUTO, msgseg, CTLFLAG_RDTUN, &msginfo.msgseg, 0, "Number of message segments"); -SYSCTL_PROC(_kern_ipc, OID_AUTO, msqids, CTLTYPE_OPAQUE | CTLFLAG_RD, +SYSCTL_PROC(_kern_ipc, OID_AUTO, msqids, + CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_msqids, "", "Message queue IDs"); +static int +msg_prison_check(void *obj, void *data) +{ + struct prison *pr = obj; + struct prison *prpr; + struct vfsoptlist *opts = data; + int error, jsys; + + /* + * sysvmsg is a jailsys integer. + * It must be "disable" if the parent jail is disabled. + */ + error = vfs_copyopt(opts, "sysvmsg", &jsys, sizeof(jsys)); + if (error != ENOENT) { + if (error != 0) + return (error); + switch (jsys) { + case JAIL_SYS_DISABLE: + break; + case JAIL_SYS_NEW: + case JAIL_SYS_INHERIT: + prison_lock(pr->pr_parent); + prpr = osd_jail_get(pr->pr_parent, msg_prison_slot); + prison_unlock(pr->pr_parent); + if (prpr == NULL) + return (EPERM); + break; + default: + return (EINVAL); + } + } + + return (0); +} + +static int +msg_prison_set(void *obj, void *data) +{ + struct prison *pr = obj; + struct prison *tpr, *orpr, *nrpr, *trpr; + struct vfsoptlist *opts = data; + void *rsv; + int jsys, descend; + + /* + * sysvmsg controls which jail is the root of the associated msgs (this + * jail or same as the parent), or if the feature is available at all. + */ + if (vfs_copyopt(opts, "sysvmsg", &jsys, sizeof(jsys)) == ENOENT) + jsys = vfs_flagopt(opts, "allow.sysvipc", NULL, 0) + ? JAIL_SYS_INHERIT + : vfs_flagopt(opts, "allow.nosysvipc", NULL, 0) + ? JAIL_SYS_DISABLE + : -1; + if (jsys == JAIL_SYS_DISABLE) { + prison_lock(pr); + orpr = osd_jail_get(pr, msg_prison_slot); + if (orpr != NULL) + osd_jail_del(pr, msg_prison_slot); + prison_unlock(pr); + if (orpr != NULL) { + if (orpr == pr) + msg_prison_cleanup(pr); + /* Disable all child jails as well. */ + FOREACH_PRISON_DESCENDANT(pr, tpr, descend) { + prison_lock(tpr); + trpr = osd_jail_get(tpr, msg_prison_slot); + if (trpr != NULL) { + osd_jail_del(tpr, msg_prison_slot); + prison_unlock(tpr); + if (trpr == tpr) + msg_prison_cleanup(tpr); + } else { + prison_unlock(tpr); + descend = 0; + } + } + } + } else if (jsys != -1) { + if (jsys == JAIL_SYS_NEW) + nrpr = pr; + else { + prison_lock(pr->pr_parent); + nrpr = osd_jail_get(pr->pr_parent, msg_prison_slot); + prison_unlock(pr->pr_parent); + } + rsv = osd_reserve(msg_prison_slot); + prison_lock(pr); + orpr = osd_jail_get(pr, msg_prison_slot); + if (orpr != nrpr) + (void)osd_jail_set_reserved(pr, msg_prison_slot, rsv, + nrpr); + else + osd_free_reserved(rsv); + prison_unlock(pr); + if (orpr != nrpr) { + if (orpr == pr) + msg_prison_cleanup(pr); + if (orpr != NULL) { + /* Change child jails matching the old root, */ + FOREACH_PRISON_DESCENDANT(pr, tpr, descend) { + prison_lock(tpr); + trpr = osd_jail_get(tpr, + msg_prison_slot); + if (trpr == orpr) { + (void)osd_jail_set(tpr, + msg_prison_slot, nrpr); + prison_unlock(tpr); + if (trpr == tpr) + msg_prison_cleanup(tpr); + } else { + prison_unlock(tpr); + descend = 0; + } + } + } + } + } + + return (0); +} + +static int +msg_prison_get(void *obj, void *data) +{ + struct prison *pr = obj; + struct prison *rpr; + struct vfsoptlist *opts = data; + int error, jsys; + + /* Set sysvmsg based on the jail's root prison. */ + prison_lock(pr); + rpr = osd_jail_get(pr, msg_prison_slot); + prison_unlock(pr); + jsys = rpr == NULL ? JAIL_SYS_DISABLE + : rpr == pr ? JAIL_SYS_NEW : JAIL_SYS_INHERIT; + error = vfs_setopt(opts, "sysvmsg", &jsys, sizeof(jsys)); + if (error == ENOENT) + error = 0; + return (error); +} + +static int +msg_prison_remove(void *obj, void *data __unused) +{ + struct prison *pr = obj; + struct prison *rpr; + + prison_lock(pr); + rpr = osd_jail_get(pr, msg_prison_slot); + prison_unlock(pr); + if (rpr == pr) + msg_prison_cleanup(pr); + return (0); +} + +static void +msg_prison_cleanup(struct prison *pr) +{ + struct msqid_kernel *msqkptr; + int i; + + /* Remove any msqs that belong to this jail. */ + mtx_lock(&msq_mtx); + for (i = 0; i < msginfo.msgmni; i++) { + msqkptr = &msqids[i]; + if (msqkptr->u.msg_qbytes != 0 && + msqkptr->cred != NULL && msqkptr->cred->cr_prison == pr) + msq_remove(msqkptr); + } + mtx_unlock(&msq_mtx); +} + +SYSCTL_JAIL_PARAM_SYS_NODE(sysvmsg, CTLFLAG_RW, "SYSV message queues"); + #ifdef COMPAT_FREEBSD32 int freebsd32_msgsys(struct thread *td, struct freebsd32_msgsys_args *uap) @@ -1522,8 +1822,6 @@ sys_msgsys(td, uap) { int error; - if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) - return (ENOSYS); if (uap->which < 0 || uap->which >= sizeof(msgcalls)/sizeof(msgcalls[0])) return (EINVAL); diff --git a/sys/kern/sysv_sem.c b/sys/kern/sysv_sem.c index 4337d4d..c39d93d 100644 --- a/sys/kern/sysv_sem.c +++ b/sys/kern/sysv_sem.c @@ -53,6 +53,7 @@ __FBSDID("$FreeBSD$"); #include <sys/mutex.h> #include <sys/racct.h> #include <sys/sem.h> +#include <sys/sx.h> #include <sys/syscall.h> #include <sys/syscallsubr.h> #include <sys/sysent.h> @@ -78,7 +79,16 @@ static int sysvsem_modload(struct module *, int, void *); static int semunload(void); static void semexit_myhook(void *arg, struct proc *p); static int sysctl_sema(SYSCTL_HANDLER_ARGS); -static int semvalid(int semid, struct semid_kernel *semakptr); +static int semvalid(int semid, struct prison *rpr, + struct semid_kernel *semakptr); +static void sem_remove(int semidx, struct ucred *cred); +static struct prison *sem_find_prison(struct ucred *); +static int sem_prison_cansee(struct prison *, struct semid_kernel *); +static int sem_prison_check(void *, void *); +static int sem_prison_set(void *, void *); +static int sem_prison_get(void *, void *); +static int sem_prison_remove(void *, void *); +static void sem_prison_cleanup(struct prison *); #ifndef _SYS_SYSPROTO_H_ struct __semctl_args; @@ -104,6 +114,7 @@ LIST_HEAD(, sem_undo) semu_list; /* list of active undo structures */ LIST_HEAD(, sem_undo) semu_free_list; /* list of free undo structures */ static int *semu; /* undo structure pool */ static eventhandler_tag semexit_tag; +static unsigned sem_prison_slot; /* prison OSD slot */ #define SEMUNDO_MTX sem_undo_mtx #define SEMUNDO_LOCK() mtx_lock(&SEMUNDO_MTX); @@ -208,7 +219,8 @@ SYSCTL_INT(_kern_ipc, OID_AUTO, semvmx, CTLFLAG_RW, &seminfo.semvmx, 0, "Semaphore maximum value"); SYSCTL_INT(_kern_ipc, OID_AUTO, semaem, CTLFLAG_RW, &seminfo.semaem, 0, "Adjust on exit max value"); -SYSCTL_PROC(_kern_ipc, OID_AUTO, sema, CTLTYPE_OPAQUE | CTLFLAG_RD, +SYSCTL_PROC(_kern_ipc, OID_AUTO, sema, + CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_sema, "", "Semaphore id pool"); static struct syscall_helper_data sem_syscalls[] = { @@ -247,7 +259,15 @@ static struct syscall_helper_data sem32_syscalls[] = { static int seminit(void) { + struct prison *pr; + void *rsv; int i, error; + osd_method_t methods[PR_MAXMETHOD] = { + [PR_METHOD_CHECK] = sem_prison_check, + [PR_METHOD_SET] = sem_prison_set, + [PR_METHOD_GET] = sem_prison_get, + [PR_METHOD_REMOVE] = sem_prison_remove, + }; TUNABLE_INT_FETCH("kern.ipc.semmni", &seminfo.semmni); TUNABLE_INT_FETCH("kern.ipc.semmns", &seminfo.semmns); @@ -288,6 +308,29 @@ seminit(void) semexit_tag = EVENTHANDLER_REGISTER(process_exit, semexit_myhook, NULL, EVENTHANDLER_PRI_ANY); + /* Set current prisons according to their allow.sysvipc. */ + sem_prison_slot = osd_jail_register(NULL, methods); + rsv = osd_reserve(sem_prison_slot); + prison_lock(&prison0); + (void)osd_jail_set_reserved(&prison0, sem_prison_slot, rsv, &prison0); + prison_unlock(&prison0); + rsv = NULL; + sx_slock(&allprison_lock); + TAILQ_FOREACH(pr, &allprison, pr_list) { + if (rsv == NULL) + rsv = osd_reserve(sem_prison_slot); + prison_lock(pr); + if ((pr->pr_allow & PR_ALLOW_SYSVIPC) && pr->pr_ref > 0) { + (void)osd_jail_set_reserved(pr, sem_prison_slot, rsv, + &prison0); + rsv = NULL; + } + prison_unlock(pr); + } + if (rsv != NULL) + osd_free_reserved(rsv); + sx_sunlock(&allprison_lock); + error = syscall_helper_register(sem_syscalls); if (error != 0) return (error); @@ -313,6 +356,8 @@ semunload(void) #endif syscall_helper_unregister(sem_syscalls); EVENTHANDLER_DEREGISTER(process_exit, semexit_tag); + if (sem_prison_slot != 0) + osd_jail_deregister(sem_prison_slot); #ifdef MAC for (i = 0; i < seminfo.semmni; i++) mac_sysvsem_destroy(&sema[i]); @@ -499,11 +544,74 @@ semundo_clear(int semid, int semnum) } static int -semvalid(int semid, struct semid_kernel *semakptr) +semvalid(int semid, struct prison *rpr, struct semid_kernel *semakptr) { return ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0 || - semakptr->u.sem_perm.seq != IPCID_TO_SEQ(semid) ? EINVAL : 0); + semakptr->u.sem_perm.seq != IPCID_TO_SEQ(semid) || + sem_prison_cansee(rpr, semakptr) ? EINVAL : 0); +} + +static void +sem_remove(int semidx, struct ucred *cred) +{ + struct semid_kernel *semakptr; + int i; + + KASSERT(semidx >= 0 && semidx < seminfo.semmni, + ("semidx out of bounds")); + semakptr = &sema[semidx]; + semakptr->u.sem_perm.cuid = cred ? cred->cr_uid : 0; + semakptr->u.sem_perm.uid = cred ? cred->cr_uid : 0; + semakptr->u.sem_perm.mode = 0; + racct_sub_cred(semakptr->cred, RACCT_NSEM, semakptr->u.sem_nsems); + crfree(semakptr->cred); + semakptr->cred = NULL; + SEMUNDO_LOCK(); + semundo_clear(semidx, -1); + SEMUNDO_UNLOCK(); +#ifdef MAC + mac_sysvsem_cleanup(semakptr); +#endif + wakeup(semakptr); + for (i = 0; i < seminfo.semmni; i++) { + if ((sema[i].u.sem_perm.mode & SEM_ALLOC) && + sema[i].u.sem_base > semakptr->u.sem_base) + mtx_lock_flags(&sema_mtx[i], LOP_DUPOK); + } + for (i = semakptr->u.sem_base - sem; i < semtot; i++) + sem[i] = sem[i + semakptr->u.sem_nsems]; + for (i = 0; i < seminfo.semmni; i++) { + if ((sema[i].u.sem_perm.mode & SEM_ALLOC) && + sema[i].u.sem_base > semakptr->u.sem_base) { + sema[i].u.sem_base -= semakptr->u.sem_nsems; + mtx_unlock(&sema_mtx[i]); + } + } + semtot -= semakptr->u.sem_nsems; +} + +static struct prison * +sem_find_prison(struct ucred *cred) +{ + struct prison *pr, *rpr; + + pr = cred->cr_prison; + prison_lock(pr); + rpr = osd_jail_get(pr, sem_prison_slot); + prison_unlock(pr); + return rpr; +} + +static int +sem_prison_cansee(struct prison *rpr, struct semid_kernel *semakptr) +{ + + if (semakptr->cred == NULL || + !(rpr == semakptr->cred->cr_prison || + prison_ischild(rpr, semakptr->cred->cr_prison))) + return (EINVAL); + return (0); } /* @@ -582,6 +690,7 @@ kern_semctl(struct thread *td, int semid, int semnum, int cmd, u_short *array; struct ucred *cred = td->td_ucred; int i, error; + struct prison *rpr; struct semid_ds *sbuf; struct semid_kernel *semakptr; struct mtx *sema_mtxp; @@ -590,7 +699,9 @@ kern_semctl(struct thread *td, int semid, int semnum, int cmd, DPRINTF(("call to semctl(%d, %d, %d, 0x%p)\n", semid, semnum, cmd, arg)); - if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) + + rpr = sem_find_prison(td->td_ucred); + if (sem == NULL) return (ENOSYS); array = NULL; @@ -610,6 +721,8 @@ kern_semctl(struct thread *td, int semid, int semnum, int cmd, error = EINVAL; goto done2; } + if ((error = sem_prison_cansee(rpr, semakptr))) + goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; #ifdef MAC @@ -618,6 +731,8 @@ kern_semctl(struct thread *td, int semid, int semnum, int cmd, goto done2; #endif bcopy(&semakptr->u, arg->buf, sizeof(struct semid_ds)); + if (cred->cr_prison != semakptr->cred->cr_prison) + arg->buf->sem_perm.key = IPC_PRIVATE; *rval = IXSEQ_TO_IPCID(semid, semakptr->u.sem_perm); mtx_unlock(sema_mtxp); return (0); @@ -632,6 +747,7 @@ kern_semctl(struct thread *td, int semid, int semnum, int cmd, if (cmd == IPC_RMID) mtx_lock(&sem_mtx); mtx_lock(sema_mtxp); + #ifdef MAC error = mac_sysvsem_check_semctl(cred, semakptr, cmd); if (error != 0) @@ -643,42 +759,15 @@ kern_semctl(struct thread *td, int semid, int semnum, int cmd, switch (cmd) { case IPC_RMID: - if ((error = semvalid(semid, semakptr)) != 0) + if ((error = semvalid(semid, rpr, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_M))) goto done2; - semakptr->u.sem_perm.cuid = cred->cr_uid; - semakptr->u.sem_perm.uid = cred->cr_uid; - semakptr->u.sem_perm.mode = 0; - racct_sub_cred(semakptr->cred, RACCT_NSEM, semakptr->u.sem_nsems); - crfree(semakptr->cred); - semakptr->cred = NULL; - SEMUNDO_LOCK(); - semundo_clear(semidx, -1); - SEMUNDO_UNLOCK(); -#ifdef MAC - mac_sysvsem_cleanup(semakptr); -#endif - wakeup(semakptr); - for (i = 0; i < seminfo.semmni; i++) { - if ((sema[i].u.sem_perm.mode & SEM_ALLOC) && - sema[i].u.sem_base > semakptr->u.sem_base) - mtx_lock_flags(&sema_mtx[i], LOP_DUPOK); - } - for (i = semakptr->u.sem_base - sem; i < semtot; i++) - sem[i] = sem[i + semakptr->u.sem_nsems]; - for (i = 0; i < seminfo.semmni; i++) { - if ((sema[i].u.sem_perm.mode & SEM_ALLOC) && - sema[i].u.sem_base > semakptr->u.sem_base) { - sema[i].u.sem_base -= semakptr->u.sem_nsems; - mtx_unlock(&sema_mtx[i]); - } - } - semtot -= semakptr->u.sem_nsems; + sem_remove(semidx, cred); break; case IPC_SET: - if ((error = semvalid(semid, semakptr)) != 0) + if ((error = semvalid(semid, rpr, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_M))) goto done2; @@ -691,15 +780,17 @@ kern_semctl(struct thread *td, int semid, int semnum, int cmd, break; case IPC_STAT: - if ((error = semvalid(semid, semakptr)) != 0) + if ((error = semvalid(semid, rpr, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; bcopy(&semakptr->u, arg->buf, sizeof(struct semid_ds)); + if (cred->cr_prison != semakptr->cred->cr_prison) + arg->buf->sem_perm.key = IPC_PRIVATE; break; case GETNCNT: - if ((error = semvalid(semid, semakptr)) != 0) + if ((error = semvalid(semid, rpr, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; @@ -711,7 +802,7 @@ kern_semctl(struct thread *td, int semid, int semnum, int cmd, break; case GETPID: - if ((error = semvalid(semid, semakptr)) != 0) + if ((error = semvalid(semid, rpr, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; @@ -723,7 +814,7 @@ kern_semctl(struct thread *td, int semid, int semnum, int cmd, break; case GETVAL: - if ((error = semvalid(semid, semakptr)) != 0) + if ((error = semvalid(semid, rpr, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; @@ -759,7 +850,7 @@ kern_semctl(struct thread *td, int semid, int semnum, int cmd, mtx_unlock(sema_mtxp); array = malloc(sizeof(*array) * count, M_TEMP, M_WAITOK); mtx_lock(sema_mtxp); - if ((error = semvalid(semid, semakptr)) != 0) + if ((error = semvalid(semid, rpr, semakptr)) != 0) goto done2; KASSERT(count == semakptr->u.sem_nsems, ("nsems changed")); if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) @@ -772,7 +863,7 @@ kern_semctl(struct thread *td, int semid, int semnum, int cmd, break; case GETZCNT: - if ((error = semvalid(semid, semakptr)) != 0) + if ((error = semvalid(semid, rpr, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; @@ -784,7 +875,7 @@ kern_semctl(struct thread *td, int semid, int semnum, int cmd, break; case SETVAL: - if ((error = semvalid(semid, semakptr)) != 0) + if ((error = semvalid(semid, rpr, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_W))) goto done2; @@ -815,7 +906,7 @@ kern_semctl(struct thread *td, int semid, int semnum, int cmd, mtx_lock(sema_mtxp); if (error) break; - if ((error = semvalid(semid, semakptr)) != 0) + if ((error = semvalid(semid, rpr, semakptr)) != 0) goto done2; KASSERT(count == semakptr->u.sem_nsems, ("nsems changed")); if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_W))) @@ -865,13 +956,16 @@ sys_semget(struct thread *td, struct semget_args *uap) struct ucred *cred = td->td_ucred; DPRINTF(("semget(0x%x, %d, 0%o)\n", key, nsems, semflg)); - if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) + + if (sem_find_prison(cred) == NULL) return (ENOSYS); mtx_lock(&sem_mtx); if (key != IPC_PRIVATE) { for (semid = 0; semid < seminfo.semmni; semid++) { if ((sema[semid].u.sem_perm.mode & SEM_ALLOC) && + sema[semid].cred != NULL && + sema[semid].cred->cr_prison == cred->cr_prison && sema[semid].u.sem_perm.key == key) break; } @@ -988,6 +1082,7 @@ sys_semop(struct thread *td, struct semop_args *uap) struct sembuf small_sops[SMALL_SOPS]; int semid = uap->semid; size_t nsops = uap->nsops; + struct prison *rpr; struct sembuf *sops; struct semid_kernel *semakptr; struct sembuf *sopptr = 0; @@ -1004,7 +1099,8 @@ sys_semop(struct thread *td, struct semop_args *uap) #endif DPRINTF(("call to semop(%d, %p, %u)\n", semid, sops, nsops)); - if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) + rpr = sem_find_prison(td->td_ucred); + if (sem == NULL) return (ENOSYS); semid = IPCID_TO_IX(semid); /* Convert back to zero origin */ @@ -1054,6 +1150,8 @@ sys_semop(struct thread *td, struct semop_args *uap) error = EINVAL; goto done2; } + if ((error = sem_prison_cansee(rpr, semakptr)) != 0) + goto done2; /* * Initial pass thru sops to see what permissions are needed. * Also perform any checks that don't need repeating on each @@ -1377,11 +1475,207 @@ semexit_myhook(void *arg, struct proc *p) static int sysctl_sema(SYSCTL_HANDLER_ARGS) { + struct prison *pr, *rpr; + struct semid_kernel tsemak; + int error, i; + + pr = req->td->td_ucred->cr_prison; + rpr = sem_find_prison(req->td->td_ucred); + error = 0; + for (i = 0; i < seminfo.semmni; i++) { + mtx_lock(&sema_mtx[i]); + if ((sema[i].u.sem_perm.mode & SEM_ALLOC) == 0 || + rpr == NULL || sem_prison_cansee(rpr, &sema[i]) != 0) + bzero(&tsemak, sizeof(tsemak)); + else { + tsemak = sema[i]; + if (tsemak.cred->cr_prison != pr) + tsemak.u.sem_perm.key = IPC_PRIVATE; + } + mtx_unlock(&sema_mtx[i]); + error = SYSCTL_OUT(req, &tsemak, sizeof(tsemak)); + if (error != 0) + break; + } + return (error); +} + +static int +sem_prison_check(void *obj, void *data) +{ + struct prison *pr = obj; + struct prison *prpr; + struct vfsoptlist *opts = data; + int error, jsys; + + /* + * sysvsem is a jailsys integer. + * It must be "disable" if the parent jail is disabled. + */ + error = vfs_copyopt(opts, "sysvsem", &jsys, sizeof(jsys)); + if (error != ENOENT) { + if (error != 0) + return (error); + switch (jsys) { + case JAIL_SYS_DISABLE: + break; + case JAIL_SYS_NEW: + case JAIL_SYS_INHERIT: + prison_lock(pr->pr_parent); + prpr = osd_jail_get(pr->pr_parent, sem_prison_slot); + prison_unlock(pr->pr_parent); + if (prpr == NULL) + return (EPERM); + break; + default: + return (EINVAL); + } + } - return (SYSCTL_OUT(req, sema, - sizeof(struct semid_kernel) * seminfo.semmni)); + return (0); } +static int +sem_prison_set(void *obj, void *data) +{ + struct prison *pr = obj; + struct prison *tpr, *orpr, *nrpr, *trpr; + struct vfsoptlist *opts = data; + void *rsv; + int jsys, descend; + + /* + * sysvsem controls which jail is the root of the associated sems (this + * jail or same as the parent), or if the feature is available at all. + */ + if (vfs_copyopt(opts, "sysvsem", &jsys, sizeof(jsys)) == ENOENT) + jsys = vfs_flagopt(opts, "allow.sysvipc", NULL, 0) + ? JAIL_SYS_INHERIT + : vfs_flagopt(opts, "allow.nosysvipc", NULL, 0) + ? JAIL_SYS_DISABLE + : -1; + if (jsys == JAIL_SYS_DISABLE) { + prison_lock(pr); + orpr = osd_jail_get(pr, sem_prison_slot); + if (orpr != NULL) + osd_jail_del(pr, sem_prison_slot); + prison_unlock(pr); + if (orpr != NULL) { + if (orpr == pr) + sem_prison_cleanup(pr); + /* Disable all child jails as well. */ + FOREACH_PRISON_DESCENDANT(pr, tpr, descend) { + prison_lock(tpr); + trpr = osd_jail_get(tpr, sem_prison_slot); + if (trpr != NULL) { + osd_jail_del(tpr, sem_prison_slot); + prison_unlock(tpr); + if (trpr == tpr) + sem_prison_cleanup(tpr); + } else { + prison_unlock(tpr); + descend = 0; + } + } + } + } else if (jsys != -1) { + if (jsys == JAIL_SYS_NEW) + nrpr = pr; + else { + prison_lock(pr->pr_parent); + nrpr = osd_jail_get(pr->pr_parent, sem_prison_slot); + prison_unlock(pr->pr_parent); + } + rsv = osd_reserve(sem_prison_slot); + prison_lock(pr); + orpr = osd_jail_get(pr, sem_prison_slot); + if (orpr != nrpr) + (void)osd_jail_set_reserved(pr, sem_prison_slot, rsv, + nrpr); + else + osd_free_reserved(rsv); + prison_unlock(pr); + if (orpr != nrpr) { + if (orpr == pr) + sem_prison_cleanup(pr); + if (orpr != NULL) { + /* Change child jails matching the old root, */ + FOREACH_PRISON_DESCENDANT(pr, tpr, descend) { + prison_lock(tpr); + trpr = osd_jail_get(tpr, + sem_prison_slot); + if (trpr == orpr) { + (void)osd_jail_set(tpr, + sem_prison_slot, nrpr); + prison_unlock(tpr); + if (trpr == tpr) + sem_prison_cleanup(tpr); + } else { + prison_unlock(tpr); + descend = 0; + } + } + } + } + } + + return (0); +} + +static int +sem_prison_get(void *obj, void *data) +{ + struct prison *pr = obj; + struct prison *rpr; + struct vfsoptlist *opts = data; + int error, jsys; + + /* Set sysvsem based on the jail's root prison. */ + prison_lock(pr); + rpr = osd_jail_get(pr, sem_prison_slot); + prison_unlock(pr); + jsys = rpr == NULL ? JAIL_SYS_DISABLE + : rpr == pr ? JAIL_SYS_NEW : JAIL_SYS_INHERIT; + error = vfs_setopt(opts, "sysvsem", &jsys, sizeof(jsys)); + if (error == ENOENT) + error = 0; + return (error); +} + +static int +sem_prison_remove(void *obj, void *data __unused) +{ + struct prison *pr = obj; + struct prison *rpr; + + prison_lock(pr); + rpr = osd_jail_get(pr, sem_prison_slot); + prison_unlock(pr); + if (rpr == pr) + sem_prison_cleanup(pr); + return (0); +} + +static void +sem_prison_cleanup(struct prison *pr) +{ + int i; + + /* Remove any sems that belong to this jail. */ + mtx_lock(&sem_mtx); + for (i = 0; i < seminfo.semmni; i++) { + if ((sema[i].u.sem_perm.mode & SEM_ALLOC) && + sema[i].cred != NULL && sema[i].cred->cr_prison == pr) { + mtx_lock(&sema_mtx[i]); + sem_remove(i, NULL); + mtx_unlock(&sema_mtx[i]); + } + } + mtx_unlock(&sem_mtx); +} + +SYSCTL_JAIL_PARAM_SYS_NODE(sysvsem, CTLFLAG_RW, "SYSV semaphores"); + #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) @@ -1408,8 +1702,6 @@ sys_semsys(td, uap) { int error; - if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) - return (ENOSYS); if (uap->which < 0 || uap->which >= sizeof(semcalls)/sizeof(semcalls[0])) return (EINVAL); diff --git a/sys/kern/sysv_shm.c b/sys/kern/sysv_shm.c index 613a462..cced3e3 100644 --- a/sys/kern/sysv_shm.c +++ b/sys/kern/sysv_shm.c @@ -112,7 +112,8 @@ static int shmget_existing(struct thread *td, struct shmget_args *uap, static int shm_last_free, shm_nused, shmalloced; vm_size_t shm_committed; -static struct shmid_kernel *shmsegs; +static struct shmid_kernel *shmsegs; +static unsigned shm_prison_slot; struct shmmap_state { vm_offset_t va; @@ -120,8 +121,8 @@ struct shmmap_state { }; static void shm_deallocate_segment(struct shmid_kernel *); -static int shm_find_segment_by_key(key_t); -static struct shmid_kernel *shm_find_segment(int, bool); +static int shm_find_segment_by_key(struct prison *, key_t); +static struct shmid_kernel *shm_find_segment(struct prison *, int, bool); static int shm_delete_mapping(struct vmspace *vm, struct shmmap_state *); static void shmrealloc(void); static int shminit(void); @@ -130,6 +131,14 @@ static int shmunload(void); static void shmexit_myhook(struct vmspace *vm); static void shmfork_myhook(struct proc *p1, struct proc *p2); static int sysctl_shmsegs(SYSCTL_HANDLER_ARGS); +static void shm_remove(struct shmid_kernel *, int); +static struct prison *shm_find_prison(struct ucred *); +static int shm_prison_cansee(struct prison *, struct shmid_kernel *); +static int shm_prison_check(void *, void *); +static int shm_prison_set(void *, void *); +static int shm_prison_get(void *, void *); +static int shm_prison_remove(void *, void *); +static void shm_prison_cleanup(struct prison *); /* * Tuneable values. @@ -189,12 +198,14 @@ static struct sx sysvshmsx; #define SYSVSHM_ASSERT_LOCKED() sx_assert(&sysvshmsx, SA_XLOCKED) static int -shm_find_segment_by_key(key_t key) +shm_find_segment_by_key(struct prison *pr, key_t key) { int i; for (i = 0; i < shmalloced; i++) if ((shmsegs[i].u.shm_perm.mode & SHMSEG_ALLOCATED) && + shmsegs[i].cred != NULL && + shmsegs[i].cred->cr_prison == pr && shmsegs[i].u.shm_perm.key == key) return (i); return (-1); @@ -205,7 +216,7 @@ shm_find_segment_by_key(key_t key) * is_shmid is false. */ static struct shmid_kernel * -shm_find_segment(int arg, bool is_shmid) +shm_find_segment(struct prison *rpr, int arg, bool is_shmid) { struct shmid_kernel *shmseg; int segnum; @@ -217,7 +228,8 @@ shm_find_segment(int arg, bool is_shmid) if ((shmseg->u.shm_perm.mode & SHMSEG_ALLOCATED) == 0 || (!shm_allow_removed && (shmseg->u.shm_perm.mode & SHMSEG_REMOVED) != 0) || - (is_shmid && shmseg->u.shm_perm.seq != IPCID_TO_SEQ(arg))) + (is_shmid && shmseg->u.shm_perm.seq != IPCID_TO_SEQ(arg)) || + shm_prison_cansee(rpr, shmseg) != 0) return (NULL); return (shmseg); } @@ -271,6 +283,41 @@ shm_delete_mapping(struct vmspace *vm, struct shmmap_state *shmmap_s) return (0); } +static void +shm_remove(struct shmid_kernel *shmseg, int segnum) +{ + + shmseg->u.shm_perm.key = IPC_PRIVATE; + shmseg->u.shm_perm.mode |= SHMSEG_REMOVED; + if (shmseg->u.shm_nattch <= 0) { + shm_deallocate_segment(shmseg); + shm_last_free = segnum; + } +} + +static struct prison * +shm_find_prison(struct ucred *cred) +{ + struct prison *pr, *rpr; + + pr = cred->cr_prison; + prison_lock(pr); + rpr = osd_jail_get(pr, shm_prison_slot); + prison_unlock(pr); + return rpr; +} + +static int +shm_prison_cansee(struct prison *rpr, struct shmid_kernel *shmseg) +{ + + if (shmseg->cred == NULL || + !(rpr == shmseg->cred->cr_prison || + prison_ischild(rpr, shmseg->cred->cr_prison))) + return (EINVAL); + return (0); +} + static int kern_shmdt_locked(struct thread *td, const void *shmaddr) { @@ -282,7 +329,7 @@ kern_shmdt_locked(struct thread *td, const void *shmaddr) int error, i; SYSVSHM_ASSERT_LOCKED(); - if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) + if (shm_find_prison(td->td_ucred) == NULL) return (ENOSYS); shmmap_s = p->p_vmspace->vm_shm; if (shmmap_s == NULL) @@ -325,6 +372,7 @@ static int kern_shmat_locked(struct thread *td, int shmid, const void *shmaddr, int shmflg) { + struct prison *rpr; struct proc *p = td->td_proc; struct shmid_kernel *shmseg; struct shmmap_state *shmmap_s; @@ -334,7 +382,8 @@ kern_shmat_locked(struct thread *td, int shmid, const void *shmaddr, int error, i, rv; SYSVSHM_ASSERT_LOCKED(); - if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) + rpr = shm_find_prison(td->td_ucred); + if (rpr == NULL) return (ENOSYS); shmmap_s = p->p_vmspace->vm_shm; if (shmmap_s == NULL) { @@ -345,7 +394,7 @@ kern_shmat_locked(struct thread *td, int shmid, const void *shmaddr, KASSERT(p->p_vmspace->vm_shm == NULL, ("raced")); p->p_vmspace->vm_shm = shmmap_s; } - shmseg = shm_find_segment(shmid, true); + shmseg = shm_find_segment(rpr, shmid, true); if (shmseg == NULL) return (EINVAL); error = ipcperm(td, &shmseg->u.shm_perm, @@ -433,6 +482,7 @@ static int kern_shmctl_locked(struct thread *td, int shmid, int cmd, void *buf, size_t *bufsz) { + struct prison *rpr; struct shmid_kernel *shmseg; struct shmid_ds *shmidp; struct shm_info shm_info; @@ -440,7 +490,8 @@ kern_shmctl_locked(struct thread *td, int shmid, int cmd, void *buf, SYSVSHM_ASSERT_LOCKED(); - if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) + rpr = shm_find_prison(td->td_ucred); + if (rpr == NULL) return (ENOSYS); error = 0; @@ -474,7 +525,7 @@ kern_shmctl_locked(struct thread *td, int shmid, int cmd, void *buf, return (0); } } - shmseg = shm_find_segment(shmid, cmd != SHM_STAT); + shmseg = shm_find_segment(rpr, shmid, cmd != SHM_STAT); if (shmseg == NULL) return (EINVAL); #ifdef MAC @@ -485,10 +536,13 @@ kern_shmctl_locked(struct thread *td, int shmid, int cmd, void *buf, switch (cmd) { case SHM_STAT: case IPC_STAT: + shmidp = (struct shmid_ds *)buf; error = ipcperm(td, &shmseg->u.shm_perm, IPC_R); if (error != 0) return (error); - memcpy(buf, &shmseg->u, sizeof(struct shmid_ds)); + memcpy(shmidp, &shmseg->u, sizeof(struct shmid_ds)); + if (td->td_ucred->cr_prison != shmseg->cred->cr_prison) + shmidp->shm_perm.key = IPC_PRIVATE; if (bufsz != NULL) *bufsz = sizeof(struct shmid_ds); if (cmd == SHM_STAT) { @@ -512,12 +566,7 @@ kern_shmctl_locked(struct thread *td, int shmid, int cmd, void *buf, error = ipcperm(td, &shmseg->u.shm_perm, IPC_M); if (error != 0) return (error); - shmseg->u.shm_perm.key = IPC_PRIVATE; - shmseg->u.shm_perm.mode |= SHMSEG_REMOVED; - if (shmseg->u.shm_nattch <= 0) { - shm_deallocate_segment(shmseg); - shm_last_free = IPCID_TO_IX(shmid); - } + shm_remove(shmseg, IPCID_TO_IX(shmid)); break; #if 0 case SHM_LOCK: @@ -724,14 +773,15 @@ sys_shmget(struct thread *td, struct shmget_args *uap) int segnum, mode; int error; - if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) + if (shm_find_prison(td->td_ucred) == NULL) return (ENOSYS); mode = uap->shmflg & ACCESSPERMS; SYSVSHM_LOCK(); if (uap->key == IPC_PRIVATE) { error = shmget_allocate_segment(td, uap, mode); } else { - segnum = shm_find_segment_by_key(uap->key); + segnum = shm_find_segment_by_key(td->td_ucred->cr_prison, + uap->key); if (segnum >= 0) error = shmget_existing(td, uap, mode, segnum); else if ((uap->shmflg & IPC_CREAT) == 0) @@ -852,7 +902,15 @@ static struct syscall_helper_data shm32_syscalls[] = { static int shminit(void) { + struct prison *pr; + void *rsv; int i, error; + osd_method_t methods[PR_MAXMETHOD] = { + [PR_METHOD_CHECK] = shm_prison_check, + [PR_METHOD_SET] = shm_prison_set, + [PR_METHOD_GET] = shm_prison_get, + [PR_METHOD_REMOVE] = shm_prison_remove, + }; #ifndef BURN_BRIDGES if (TUNABLE_ULONG_FETCH("kern.ipc.shmmaxpgs", &shminfo.shmall) != 0) @@ -888,6 +946,29 @@ shminit(void) shmexit_hook = &shmexit_myhook; shmfork_hook = &shmfork_myhook; + /* Set current prisons according to their allow.sysvipc. */ + shm_prison_slot = osd_jail_register(NULL, methods); + rsv = osd_reserve(shm_prison_slot); + prison_lock(&prison0); + (void)osd_jail_set_reserved(&prison0, shm_prison_slot, rsv, &prison0); + prison_unlock(&prison0); + rsv = NULL; + sx_slock(&allprison_lock); + TAILQ_FOREACH(pr, &allprison, pr_list) { + if (rsv == NULL) + rsv = osd_reserve(shm_prison_slot); + prison_lock(pr); + if ((pr->pr_allow & PR_ALLOW_SYSVIPC) && pr->pr_ref > 0) { + (void)osd_jail_set_reserved(pr, shm_prison_slot, rsv, + &prison0); + rsv = NULL; + } + prison_unlock(pr); + } + if (rsv != NULL) + osd_free_reserved(rsv); + sx_sunlock(&allprison_lock); + error = syscall_helper_register(shm_syscalls); if (error != 0) return (error); @@ -911,6 +992,8 @@ shmunload(void) syscall32_helper_unregister(shm32_syscalls); #endif syscall_helper_unregister(shm_syscalls); + if (shm_prison_slot != 0) + osd_jail_deregister(shm_prison_slot); for (i = 0; i < shmalloced; i++) { #ifdef MAC @@ -934,14 +1017,209 @@ shmunload(void) static int sysctl_shmsegs(SYSCTL_HANDLER_ARGS) { - int error; + struct shmid_kernel tshmseg; + struct prison *pr, *rpr; + int error, i; SYSVSHM_LOCK(); - error = SYSCTL_OUT(req, shmsegs, shmalloced * sizeof(shmsegs[0])); + pr = req->td->td_ucred->cr_prison; + rpr = shm_find_prison(req->td->td_ucred); + error = 0; + for (i = 0; i < shmalloced; i++) { + if ((shmsegs[i].u.shm_perm.mode & SHMSEG_ALLOCATED) == 0 || + rpr == NULL || shm_prison_cansee(rpr, &shmsegs[i]) != 0) { + bzero(&tshmseg, sizeof(tshmseg)); + tshmseg.u.shm_perm.mode = SHMSEG_FREE; + } else { + tshmseg = shmsegs[i]; + if (tshmseg.cred->cr_prison != pr) + tshmseg.u.shm_perm.key = IPC_PRIVATE; + } + error = SYSCTL_OUT(req, &tshmseg, sizeof(tshmseg)); + if (error != 0) + break; + } SYSVSHM_UNLOCK(); return (error); } +static int +shm_prison_check(void *obj, void *data) +{ + struct prison *pr = obj; + struct prison *prpr; + struct vfsoptlist *opts = data; + int error, jsys; + + /* + * sysvshm is a jailsys integer. + * It must be "disable" if the parent jail is disabled. + */ + error = vfs_copyopt(opts, "sysvshm", &jsys, sizeof(jsys)); + if (error != ENOENT) { + if (error != 0) + return (error); + switch (jsys) { + case JAIL_SYS_DISABLE: + break; + case JAIL_SYS_NEW: + case JAIL_SYS_INHERIT: + prison_lock(pr->pr_parent); + prpr = osd_jail_get(pr->pr_parent, shm_prison_slot); + prison_unlock(pr->pr_parent); + if (prpr == NULL) + return (EPERM); + break; + default: + return (EINVAL); + } + } + + return (0); +} + +static int +shm_prison_set(void *obj, void *data) +{ + struct prison *pr = obj; + struct prison *tpr, *orpr, *nrpr, *trpr; + struct vfsoptlist *opts = data; + void *rsv; + int jsys, descend; + + /* + * sysvshm controls which jail is the root of the associated segments + * (this jail or same as the parent), or if the feature is available + * at all. + */ + if (vfs_copyopt(opts, "sysvshm", &jsys, sizeof(jsys)) == ENOENT) + jsys = vfs_flagopt(opts, "allow.sysvipc", NULL, 0) + ? JAIL_SYS_INHERIT + : vfs_flagopt(opts, "allow.nosysvipc", NULL, 0) + ? JAIL_SYS_DISABLE + : -1; + if (jsys == JAIL_SYS_DISABLE) { + prison_lock(pr); + orpr = osd_jail_get(pr, shm_prison_slot); + if (orpr != NULL) + osd_jail_del(pr, shm_prison_slot); + prison_unlock(pr); + if (orpr != NULL) { + if (orpr == pr) + shm_prison_cleanup(pr); + /* Disable all child jails as well. */ + FOREACH_PRISON_DESCENDANT(pr, tpr, descend) { + prison_lock(tpr); + trpr = osd_jail_get(tpr, shm_prison_slot); + if (trpr != NULL) { + osd_jail_del(tpr, shm_prison_slot); + prison_unlock(tpr); + if (trpr == tpr) + shm_prison_cleanup(tpr); + } else { + prison_unlock(tpr); + descend = 0; + } + } + } + } else if (jsys != -1) { + if (jsys == JAIL_SYS_NEW) + nrpr = pr; + else { + prison_lock(pr->pr_parent); + nrpr = osd_jail_get(pr->pr_parent, shm_prison_slot); + prison_unlock(pr->pr_parent); + } + rsv = osd_reserve(shm_prison_slot); + prison_lock(pr); + orpr = osd_jail_get(pr, shm_prison_slot); + if (orpr != nrpr) + (void)osd_jail_set_reserved(pr, shm_prison_slot, rsv, + nrpr); + else + osd_free_reserved(rsv); + prison_unlock(pr); + if (orpr != nrpr) { + if (orpr == pr) + shm_prison_cleanup(pr); + if (orpr != NULL) { + /* Change child jails matching the old root, */ + FOREACH_PRISON_DESCENDANT(pr, tpr, descend) { + prison_lock(tpr); + trpr = osd_jail_get(tpr, + shm_prison_slot); + if (trpr == orpr) { + (void)osd_jail_set(tpr, + shm_prison_slot, nrpr); + prison_unlock(tpr); + if (trpr == tpr) + shm_prison_cleanup(tpr); + } else { + prison_unlock(tpr); + descend = 0; + } + } + } + } + } + + return (0); +} + +static int +shm_prison_get(void *obj, void *data) +{ + struct prison *pr = obj; + struct prison *rpr; + struct vfsoptlist *opts = data; + int error, jsys; + + /* Set sysvshm based on the jail's root prison. */ + prison_lock(pr); + rpr = osd_jail_get(pr, shm_prison_slot); + prison_unlock(pr); + jsys = rpr == NULL ? JAIL_SYS_DISABLE + : rpr == pr ? JAIL_SYS_NEW : JAIL_SYS_INHERIT; + error = vfs_setopt(opts, "sysvshm", &jsys, sizeof(jsys)); + if (error == ENOENT) + error = 0; + return (error); +} + +static int +shm_prison_remove(void *obj, void *data __unused) +{ + struct prison *pr = obj; + struct prison *rpr; + + SYSVSHM_LOCK(); + prison_lock(pr); + rpr = osd_jail_get(pr, shm_prison_slot); + prison_unlock(pr); + if (rpr == pr) + shm_prison_cleanup(pr); + SYSVSHM_UNLOCK(); + return (0); +} + +static void +shm_prison_cleanup(struct prison *pr) +{ + struct shmid_kernel *shmseg; + int i; + + /* Remove any segments that belong to this jail. */ + for (i = 0; i < shmalloced; i++) { + shmseg = &shmsegs[i]; + if ((shmseg->u.shm_perm.mode & SHMSEG_ALLOCATED) && + shmseg->cred != NULL && shmseg->cred->cr_prison == pr) { + shm_remove(shmseg, i); + } + } +} + +SYSCTL_JAIL_PARAM_SYS_NODE(sysvshm, CTLFLAG_RW, "SYSV shared memory"); + #if defined(__i386__) && (defined(COMPAT_FREEBSD4) || defined(COMPAT_43)) struct oshmid_ds { struct ipc_perm_old shm_perm; /* operation perms */ @@ -966,17 +1244,19 @@ oshmctl(struct thread *td, struct oshmctl_args *uap) { #ifdef COMPAT_43 int error = 0; + struct prison *rpr; struct shmid_kernel *shmseg; struct oshmid_ds outbuf; - if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) + rpr = shm_find_prison(td->td_ucred); + if (rpr == NULL) return (ENOSYS); if (uap->cmd != IPC_STAT) { return (freebsd7_shmctl(td, (struct freebsd7_shmctl_args *)uap)); } SYSVSHM_LOCK(); - shmseg = shm_find_segment(uap->shmid, true); + shmseg = shm_find_segment(rpr, uap->shmid, true); if (shmseg == NULL) { SYSVSHM_UNLOCK(); return (EINVAL); @@ -1031,8 +1311,6 @@ sys_shmsys(struct thread *td, struct shmsys_args *uap) { int error; - if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) - return (ENOSYS); if (uap->which < 0 || uap->which >= nitems(shmcalls)) return (EINVAL); error = (*shmcalls[uap->which])(td, &uap->a2); |