From 923c7538236564c46ee80c253a416705321f13e3 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 27 Dec 2012 11:39:12 +0800 Subject: userns: Allow unprivileged reboot In a container with its own pid namespace and user namespace, rebooting the system won't reboot the host, but terminate all the processes in it and thus have the container shutdown, so it's safe. Signed-off-by: Li Zefan Signed-off-by: Eric W. Biederman --- kernel/sys.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 265b376..24d1ef5 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -433,11 +433,12 @@ static DEFINE_MUTEX(reboot_mutex); SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, void __user *, arg) { + struct pid_namespace *pid_ns = task_active_pid_ns(current); char buffer[256]; int ret = 0; /* We only trust the superuser with rebooting the system. */ - if (!capable(CAP_SYS_BOOT)) + if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT)) return -EPERM; /* For safety, we require "magic" arguments. */ @@ -453,7 +454,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, * pid_namespace, the command is handled by reboot_pid_ns() which will * call do_exit(). */ - ret = reboot_pid_ns(task_active_pid_ns(current), cmd); + ret = reboot_pid_ns(pid_ns, cmd); if (ret) return ret; -- cgit v1.1 From c61a2810a2161986353705b44d9503e6bb079f4f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 28 Dec 2012 18:58:39 -0800 Subject: userns: Avoid recursion in put_user_ns When freeing a deeply nested user namespace free_user_ns calls put_user_ns on it's parent which may in turn call free_user_ns again. When -fno-optimize-sibling-calls is passed to gcc one stack frame per user namespace is left on the stack, potentially overflowing the kernel stack. CONFIG_FRAME_POINTER forces -fno-optimize-sibling-calls so we can't count on gcc to optimize this code. Remove struct kref and use a plain atomic_t. Making the code more flexible and easier to comprehend. Make the loop in free_user_ns explict to guarantee that the stack does not overflow with CONFIG_FRAME_POINTER enabled. I have tested this fix with a simple program that uses unshare to create a deeply nested user namespace structure and then calls exit. With 1000 nesteuser namespaces before this change running my test program causes the kernel to die a horrible death. With 10,000,000 nested user namespaces after this change my test program runs to completion and causes no harm. Acked-by: Serge Hallyn Pointed-out-by: Vasily Kulikov Signed-off-by: "Eric W. Biederman" --- kernel/user.c | 4 +--- kernel/user_namespace.c | 17 +++++++++-------- 2 files changed, 10 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index 33acb5e..57ebfd4 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -47,9 +47,7 @@ struct user_namespace init_user_ns = { .count = 4294967295U, }, }, - .kref = { - .refcount = ATOMIC_INIT(3), - }, + .count = ATOMIC_INIT(3), .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, .proc_inum = PROC_USER_INIT_INO, diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 2b042c4..24f8ec3 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -78,7 +78,7 @@ int create_user_ns(struct cred *new) return ret; } - kref_init(&ns->kref); + atomic_set(&ns->count, 1); /* Leave the new->user_ns reference with the new user namespace. */ ns->parent = parent_ns; ns->owner = owner; @@ -104,15 +104,16 @@ int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) return create_user_ns(cred); } -void free_user_ns(struct kref *kref) +void free_user_ns(struct user_namespace *ns) { - struct user_namespace *parent, *ns = - container_of(kref, struct user_namespace, kref); + struct user_namespace *parent; - parent = ns->parent; - proc_free_inum(ns->proc_inum); - kmem_cache_free(user_ns_cachep, ns); - put_user_ns(parent); + do { + parent = ns->parent; + proc_free_inum(ns->proc_inum); + kmem_cache_free(user_ns_cachep, ns); + ns = parent; + } while (atomic_dec_and_test(&parent->count)); } EXPORT_SYMBOL(free_user_ns); -- cgit v1.1 From 0bd14b4fd72afd5df41e9fd59f356740f22fceba Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 27 Dec 2012 22:27:29 -0800 Subject: userns: Allow any uid or gid mappings that don't overlap. When I initially wrote the code for /proc//uid_map. I was lazy and avoided duplicate mappings by the simple expedient of ensuring the first number in a new extent was greater than any number in the previous extent. Unfortunately that precludes a number of valid mappings, and someone noticed and complained. So use a simple check to ensure that ranges in the mapping extents don't overlap. Acked-by: Serge Hallyn Signed-off-by: "Eric W. Biederman" --- kernel/user_namespace.c | 45 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 39 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 24f8ec3..8b65083 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -520,6 +520,42 @@ struct seq_operations proc_projid_seq_operations = { .show = projid_m_show, }; +static bool mappings_overlap(struct uid_gid_map *new_map, struct uid_gid_extent *extent) +{ + u32 upper_first, lower_first, upper_last, lower_last; + unsigned idx; + + upper_first = extent->first; + lower_first = extent->lower_first; + upper_last = upper_first + extent->count - 1; + lower_last = lower_first + extent->count - 1; + + for (idx = 0; idx < new_map->nr_extents; idx++) { + u32 prev_upper_first, prev_lower_first; + u32 prev_upper_last, prev_lower_last; + struct uid_gid_extent *prev; + + prev = &new_map->extent[idx]; + + prev_upper_first = prev->first; + prev_lower_first = prev->lower_first; + prev_upper_last = prev_upper_first + prev->count - 1; + prev_lower_last = prev_lower_first + prev->count - 1; + + /* Does the upper range intersect a previous extent? */ + if ((prev_upper_first <= upper_last) && + (prev_upper_last >= upper_first)) + return true; + + /* Does the lower range intersect a previous extent? */ + if ((prev_lower_first <= lower_last) && + (prev_lower_last >= lower_first)) + return true; + } + return false; +} + + static DEFINE_MUTEX(id_map_mutex); static ssize_t map_write(struct file *file, const char __user *buf, @@ -532,7 +568,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, struct user_namespace *ns = seq->private; struct uid_gid_map new_map; unsigned idx; - struct uid_gid_extent *extent, *last = NULL; + struct uid_gid_extent *extent = NULL; unsigned long page = 0; char *kbuf, *pos, *next_line; ssize_t ret = -EINVAL; @@ -635,14 +671,11 @@ static ssize_t map_write(struct file *file, const char __user *buf, if ((extent->lower_first + extent->count) <= extent->lower_first) goto out; - /* For now only accept extents that are strictly in order */ - if (last && - (((last->first + last->count) > extent->first) || - ((last->lower_first + last->count) > extent->lower_first))) + /* Do the ranges in extent overlap any previous extents? */ + if (mappings_overlap(&new_map, extent)) goto out; new_map.nr_extents++; - last = extent; /* Fail if the file contains too many extents */ if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) && -- cgit v1.1