From eab172294d5e24464f332dd8e94a57a9819c81c4 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 29 Oct 2008 17:03:22 +0800 Subject: sched: cleanup for alloc_rt/fair_sched_group() Impact: cleanup Remove checking parent == NULL. It won't be NULLL, because we dynamically create sub task_group only, and sub task_group always has its parent. (root task_group is statically defined) Also replace kmalloc_node(GFP_ZERO) with kzalloc_node(). Signed-off-by: Li Zefan Signed-off-by: Ingo Molnar --- kernel/sched.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index e8819bc..7dd6c86 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8472,7 +8472,7 @@ static int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { struct cfs_rq *cfs_rq; - struct sched_entity *se, *parent_se; + struct sched_entity *se; struct rq *rq; int i; @@ -8488,18 +8488,17 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) for_each_possible_cpu(i) { rq = cpu_rq(i); - cfs_rq = kmalloc_node(sizeof(struct cfs_rq), - GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); + cfs_rq = kzalloc_node(sizeof(struct cfs_rq), + GFP_KERNEL, cpu_to_node(i)); if (!cfs_rq) goto err; - se = kmalloc_node(sizeof(struct sched_entity), - GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); + se = kzalloc_node(sizeof(struct sched_entity), + GFP_KERNEL, cpu_to_node(i)); if (!se) goto err; - parent_se = parent ? parent->se[i] : NULL; - init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se); + init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); } return 1; @@ -8560,7 +8559,7 @@ static int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) { struct rt_rq *rt_rq; - struct sched_rt_entity *rt_se, *parent_se; + struct sched_rt_entity *rt_se; struct rq *rq; int i; @@ -8577,18 +8576,17 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) for_each_possible_cpu(i) { rq = cpu_rq(i); - rt_rq = kmalloc_node(sizeof(struct rt_rq), - GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); + rt_rq = kzalloc_node(sizeof(struct rt_rq), + GFP_KERNEL, cpu_to_node(i)); if (!rt_rq) goto err; - rt_se = kmalloc_node(sizeof(struct sched_rt_entity), - GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); + rt_se = kzalloc_node(sizeof(struct sched_rt_entity), + GFP_KERNEL, cpu_to_node(i)); if (!rt_se) goto err; - parent_se = parent ? parent->rt_se[i] : NULL; - init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se); + init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); } return 1; -- cgit v1.1 From 34f3a814eef8069a24e5b3ebcf27aba9dabac2ea Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 30 Oct 2008 15:23:32 +0800 Subject: sched: switch sched_features to seqfile Impact: cleanup So handling of sched_features read is simplified. Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 52 ++++++++++++++++------------------------------------ 1 file changed, 16 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 7dd6c86..5419df9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -703,45 +703,18 @@ static __read_mostly char *sched_feat_names[] = { #undef SCHED_FEAT -static int sched_feat_open(struct inode *inode, struct file *filp) +static int sched_feat_show(struct seq_file *m, void *v) { - filp->private_data = inode->i_private; - return 0; -} - -static ssize_t -sched_feat_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char *buf; - int r = 0; - int len = 0; int i; for (i = 0; sched_feat_names[i]; i++) { - len += strlen(sched_feat_names[i]); - len += 4; + if (!(sysctl_sched_features & (1UL << i))) + seq_puts(m, "NO_"); + seq_printf(m, "%s ", sched_feat_names[i]); } + seq_puts(m, "\n"); - buf = kmalloc(len + 2, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - for (i = 0; sched_feat_names[i]; i++) { - if (sysctl_sched_features & (1UL << i)) - r += sprintf(buf + r, "%s ", sched_feat_names[i]); - else - r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]); - } - - r += sprintf(buf + r, "\n"); - WARN_ON(r >= len + 2); - - r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); - - kfree(buf); - - return r; + return 0; } static ssize_t @@ -786,10 +759,17 @@ sched_feat_write(struct file *filp, const char __user *ubuf, return cnt; } +static int sched_feat_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_feat_show, NULL); +} + static struct file_operations sched_feat_fops = { - .open = sched_feat_open, - .read = sched_feat_read, - .write = sched_feat_write, + .open = sched_feat_open, + .write = sched_feat_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, }; static __init int sched_init_debug(void) -- cgit v1.1 From 8bb8c4386d08f2cc5d871d22f220d35032213f84 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 1 Nov 2008 00:13:49 +0100 Subject: sched, ftrace: trace sched.c Impact: allow function tracing within sched.c Its useful to see what happens in sched.c. Signed-off-by: Peter Zijlstra Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/Makefile | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 9a3ec66..e1af039 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -21,7 +21,6 @@ CFLAGS_REMOVE_mutex-debug.o = -pg CFLAGS_REMOVE_rtmutex-debug.o = -pg CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_sched_clock.o = -pg -CFLAGS_REMOVE_sched.o = -mno-spe -pg endif obj-$(CONFIG_FREEZER) += freezer.o -- cgit v1.1 From e113a745f693af196c8081b328bf42def086989b Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Fri, 31 Oct 2008 08:03:41 -0500 Subject: sched/rt: small optimization to update_curr_rt() Impact: micro-optimization to SCHED_FIFO/RR scheduling A very minor improvement, but might it be better to check sched_rt_runtime(rt_rq) before taking the rt_runtime_lock? Peter Zijlstra observes: > Yes, I think its ok to do so. > > Like pointed out in the other thread, there are two races: > > - sched_rt_runtime() going to RUNTIME_INF, and that will be handled > properly by sched_rt_runtime_exceeded() > > - sched_rt_runtime() going to !RUNTIME_INF, and here we can miss an > accounting cycle, but I don't think that is something to worry too > much about. Signed-off-by: Dimitri Sivanich Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar -- kernel/sched_rt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) --- kernel/sched_rt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index d9ba9d5..c7963d5 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -537,13 +537,13 @@ static void update_curr_rt(struct rq *rq) for_each_sched_rt_entity(rt_se) { rt_rq = rt_rq_of_se(rt_se); - spin_lock(&rt_rq->rt_runtime_lock); if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { + spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_time += delta_exec; if (sched_rt_runtime_exceeded(rt_rq)) resched_task(curr); + spin_unlock(&rt_rq->rt_runtime_lock); } - spin_unlock(&rt_rq->rt_runtime_lock); } } -- cgit v1.1 From eefd796a8e831408ce17e633d73d70430748c47a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 4 Nov 2008 16:15:37 +0800 Subject: sched debug: remove sd_level_to_string() Impact: cleanup Just use the newly introduced sd->name. Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 25 +------------------------ 1 file changed, 1 insertion(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 5419df9..7ac59ba 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6602,28 +6602,6 @@ early_initcall(migration_init); #ifdef CONFIG_SCHED_DEBUG -static inline const char *sd_level_to_string(enum sched_domain_level lvl) -{ - switch (lvl) { - case SD_LV_NONE: - return "NONE"; - case SD_LV_SIBLING: - return "SIBLING"; - case SD_LV_MC: - return "MC"; - case SD_LV_CPU: - return "CPU"; - case SD_LV_NODE: - return "NODE"; - case SD_LV_ALLNODES: - return "ALLNODES"; - case SD_LV_MAX: - return "MAX"; - - } - return "MAX"; -} - static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpumask_t *groupmask) { @@ -6643,8 +6621,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, return -1; } - printk(KERN_CONT "span %s level %s\n", - str, sd_level_to_string(sd->level)); + printk(KERN_CONT "span %s level %s\n", str, sd->name); if (!cpu_isset(cpu, sd->span)) { printk(KERN_ERR "ERROR: domain->span does not contain " -- cgit v1.1 From 0a0db8f5c9d4bbb9bbfcc2b6cb6bce2d0ef4d73d Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 4 Nov 2008 16:17:05 +0800 Subject: sched debug: remove NULL checking in print_cfs/rt_rq() Impact: cleanup cfs->tg is initialized in init_tg_cfs_entry() with tg != NULL, and will never be invalidated to NULL. And the underlying cgroup of a valid task_group is always valid. Same for rt->tg. Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_debug.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 5ae1776..d25cefe 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -121,14 +121,9 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) char path[128] = ""; - struct cgroup *cgroup = NULL; struct task_group *tg = cfs_rq->tg; - if (tg) - cgroup = tg->css.cgroup; - - if (cgroup) - cgroup_path(cgroup, path, sizeof(path)); + cgroup_path(tg->css.cgroup, path, sizeof(path)); SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); #else @@ -193,14 +188,9 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) { #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED) char path[128] = ""; - struct cgroup *cgroup = NULL; struct task_group *tg = rt_rq->tg; - if (tg) - cgroup = tg->css.cgroup; - - if (cgroup) - cgroup_path(cgroup, path, sizeof(path)); + cgroup_path(tg->css.cgroup, path, sizeof(path)); SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path); #else -- cgit v1.1 From a17e2260926f681a0eb983c1e3cb859ba2064bce Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 4 Nov 2008 16:19:13 +0800 Subject: sched: remove redundant call to unregister_sched_domain_sysctl() Impact: cleanup The sysctl has been unregistered by partition_sched_domains(). Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 7ac59ba..3cb94fa 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7691,8 +7691,6 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) cpumask_t tmpmask; int i; - unregister_sched_domain_sysctl(); - for_each_cpu_mask_nr(i, *cpu_map) cpu_attach_domain(NULL, &def_root_domain, i); synchronize_sched(); -- cgit v1.1 From faa2f98f856e89d1afb6e4a91707284d242e816e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 4 Nov 2008 16:20:23 +0800 Subject: sched: add sanity check in partition_sched_domains() Impact: cleanup, add debug check It's wrong to make dattr_new = NULL if doms_new == NULL, it introduces memory leak if dattr_new != NULL. Fortunately dattr_new is always NULL in this case. So remove the code and add a sanity check. Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 3cb94fa..213cad5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7767,7 +7767,7 @@ match1: ndoms_cur = 0; doms_new = &fallback_doms; cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); - dattr_new = NULL; + WARN_ON_ONCE(dattr_new); } /* Build new domains */ -- cgit v1.1 From 1f29fae29709b4668979e244c09b2fa78ff1ad59 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Wed, 5 Nov 2008 16:08:52 -0600 Subject: file capabilities: add no_file_caps switch (v4) Add a no_file_caps boot option when file capabilities are compiled into the kernel (CONFIG_SECURITY_FILE_CAPABILITIES=y). This allows distributions to ship a kernel with file capabilities compiled in, without forcing users to use (and understand and trust) them. When no_file_caps is specified at boot, then when a process executes a file, any file capabilities stored with that file will not be used in the calculation of the process' new capability sets. This means that booting with the no_file_caps boot option will not be the same as booting a kernel with file capabilities compiled out - in particular a task with CAP_SETPCAP will not have any chance of passing capabilities to another task (which isn't "really" possible anyway, and which may soon by killed altogether by David Howells in any case), and it will instead be able to put new capabilities in its pI. However since fI will always be empty and pI is masked with fI, it gains the task nothing. We also support the extra prctl options, setting securebits and dropping capabilities from the per-process bounding set. The other remaining difference is that killpriv, task_setscheduler, setioprio, and setnice will continue to be hooked. That will be noticable in the case where a root task changed its uid while keeping some caps, and another task owned by the new uid tries to change settings for the more privileged task. Changelog: Nov 05 2008: (v4) trivial port on top of always-start-\ with-clear-caps patch Sep 23 2008: nixed file_caps_enabled when file caps are not compiled in as it isn't used. Document no_file_caps in kernel-parameters.txt. Signed-off-by: Serge Hallyn Acked-by: Andrew G. Morgan Signed-off-by: James Morris --- kernel/capability.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index 33e51e7..e13a685 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -33,6 +33,17 @@ EXPORT_SYMBOL(__cap_empty_set); EXPORT_SYMBOL(__cap_full_set); EXPORT_SYMBOL(__cap_init_eff_set); +#ifdef CONFIG_SECURITY_FILE_CAPABILITIES +int file_caps_enabled = 1; + +static int __init file_caps_disable(char *str) +{ + file_caps_enabled = 0; + return 1; +} +__setup("no_file_caps", file_caps_disable); +#endif + /* * More recent versions of libcap are available from: * -- cgit v1.1 From cf7f8690e864c6fe11e77202dd847fa60f483418 Mon Sep 17 00:00:00 2001 From: Sripathi Kodi Date: Wed, 5 Nov 2008 18:57:14 +0530 Subject: sched, lockdep: inline double_unlock_balance() We have a test case which measures the variation in the amount of time needed to perform a fixed amount of work on the preempt_rt kernel. We started seeing deterioration in it's performance recently. The test should never take more than 10 microseconds, but we started 5-10% failure rate. Using elimination method, we traced the problem to commit 1b12bbc747560ea68bcc132c3d05699e52271da0 (lockdep: re-annotate scheduler runqueues). When LOCKDEP is disabled, this patch only adds an additional function call to double_unlock_balance(). Hence I inlined double_unlock_balance() and the problem went away. Here is a patch to make this change. Signed-off-by: Sripathi Kodi Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- kernel/sched_rt.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index e8819bc..ad10d0a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2825,7 +2825,7 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest) return ret; } -static void double_unlock_balance(struct rq *this_rq, struct rq *busiest) +static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) __releases(busiest->lock) { spin_unlock(&busiest->lock); diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index c7963d5..2bdd444 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -910,7 +910,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) #define RT_MAX_TRIES 3 static int double_lock_balance(struct rq *this_rq, struct rq *busiest); -static void double_unlock_balance(struct rq *this_rq, struct rq *busiest); +static inline void double_unlock_balance(struct rq *this_rq, + struct rq *busiest); static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); -- cgit v1.1 From 6d21cd62516a9697cb7ec33cc52e6b814fb65a13 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 7 Nov 2008 17:03:18 +0800 Subject: sched: clean up SCHED_CPUMASK_ALLOC Impact: cleanup The #if/#endif is ugly. Change SCHED_CPUMASK_ALLOC and SCHED_CPUMASK_FREE to static inline functions. Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index b24e57a..59db86c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7279,13 +7279,21 @@ struct allmasks { }; #if NR_CPUS > 128 -#define SCHED_CPUMASK_ALLOC 1 -#define SCHED_CPUMASK_FREE(v) kfree(v) -#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v +#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v +static inline void sched_cpumask_alloc(struct allmasks **masks) +{ + *masks = kmalloc(sizeof(**masks), GFP_KERNEL); +} +static inline void sched_cpumask_free(struct allmasks *masks) +{ + kfree(masks); +} #else -#define SCHED_CPUMASK_ALLOC 0 -#define SCHED_CPUMASK_FREE(v) -#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v +#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v +static inline void sched_cpumask_alloc(struct allmasks **masks) +{ } +static inline void sched_cpumask_free(struct allmasks *masks) +{ } #endif #define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \ @@ -7361,9 +7369,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map, return -ENOMEM; } -#if SCHED_CPUMASK_ALLOC /* get space for all scratch cpumask variables */ - allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL); + sched_cpumask_alloc(&allmasks); if (!allmasks) { printk(KERN_WARNING "Cannot alloc cpumask array\n"); kfree(rd); @@ -7372,7 +7379,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #endif return -ENOMEM; } -#endif + tmpmask = (cpumask_t *)allmasks; @@ -7626,13 +7633,13 @@ static int __build_sched_domains(const cpumask_t *cpu_map, cpu_attach_domain(sd, rd, i); } - SCHED_CPUMASK_FREE((void *)allmasks); + sched_cpumask_free(allmasks); return 0; #ifdef CONFIG_NUMA error: free_sched_groups(cpu_map, tmpmask); - SCHED_CPUMASK_FREE((void *)allmasks); + sched_cpumask_free(allmasks); kfree(rd); return -ENOMEM; #endif -- cgit v1.1 From ae1e9130bfb9ad55eb97ec3fb17a122b7a118f98 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Nov 2008 09:05:16 +0100 Subject: sched: rename SCHED_NO_NO_OMIT_FRAME_POINTER => SCHED_OMIT_FRAME_POINTER Impact: cleanup, change .config option name We had this ugly config name for a long time for hysteric raisons. Rename it to a saner name. We still cannot get rid of it completely, until /proc//stack usage replaces WCHAN usage for good. We'll be able to do that in the v2.6.29/v2.6.30 timeframe. Signed-off-by: Ingo Molnar --- kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index e1af039..46e67a3 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -91,7 +91,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o -ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) +ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is # needed for x86 only. Why this used to be enabled for all architectures is beyond # me. I suspect most platforms don't need this, but until we know that for sure -- cgit v1.1 From ff9b48c3598732926fa09afd7f526981c32a48cc Mon Sep 17 00:00:00 2001 From: Bharata B Rao Date: Mon, 10 Nov 2008 21:34:09 +0530 Subject: sched: include group statistics in /proc/sched_debug Impact: extend /proc/sched_debug info Since the statistics of a group entity isn't exported directly from the kernel, it becomes difficult to obtain some of the group statistics. For example, the current method to obtain exec time of a group entity is not always accurate. One has to read the exec times of all the tasks(/proc//sched) in the group and add them. This method fails (or becomes difficult) if we want to collect stats of a group over a duration where tasks get created and terminated. This patch makes it easier to obtain group stats by directly including them in /proc/sched_debug. Stats like group exec time would help user programs (like LTP) to accurately measure the group fairness. An example output of group stats from /proc/sched_debug: cfs_rq[3]:/3/a/1 .exec_clock : 89.598007 .MIN_vruntime : 0.000001 .min_vruntime : 256300.970506 .max_vruntime : 0.000001 .spread : 0.000000 .spread0 : -25373.372248 .nr_running : 0 .load : 0 .yld_exp_empty : 0 .yld_act_empty : 0 .yld_both_empty : 0 .yld_count : 4474 .sched_switch : 0 .sched_count : 40507 .sched_goidle : 12686 .ttwu_count : 15114 .ttwu_local : 11950 .bkl_count : 67 .nr_spread_over : 0 .shares : 0 .se->exec_start : 113676.727170 .se->vruntime : 1592.612714 .se->sum_exec_runtime : 89.598007 .se->wait_start : 0.000000 .se->sleep_start : 0.000000 .se->block_start : 0.000000 .se->sleep_max : 0.000000 .se->block_max : 0.000000 .se->exec_max : 1.000282 .se->slice_max : 1.999750 .se->wait_max : 54.981093 .se->wait_sum : 217.610521 .se->wait_count : 50 .se->load.weight : 2 Signed-off-by: Bharata B Rao Acked-by: Srivatsa Vaddagiri Acked-by: Dhaval Giani Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_debug.c | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index d25cefe..3625a65 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -53,6 +53,40 @@ static unsigned long nsec_low(unsigned long long nsec) #define SPLIT_NS(x) nsec_high(x), nsec_low(x) +#ifdef CONFIG_FAIR_GROUP_SCHED +static void print_cfs_group_stats(struct seq_file *m, int cpu, + struct task_group *tg) +{ + struct sched_entity *se = tg->se[cpu]; + if (!se) + return; + +#define P(F) \ + SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) +#define PN(F) \ + SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) + + PN(se->exec_start); + PN(se->vruntime); + PN(se->sum_exec_runtime); +#ifdef CONFIG_SCHEDSTATS + PN(se->wait_start); + PN(se->sleep_start); + PN(se->block_start); + PN(se->sleep_max); + PN(se->block_max); + PN(se->exec_max); + PN(se->slice_max); + PN(se->wait_max); + PN(se->wait_sum); + P(se->wait_count); +#endif + P(se->load.weight); +#undef PN +#undef P +} +#endif + static void print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) { @@ -181,6 +215,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) #ifdef CONFIG_SMP SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); #endif + print_cfs_group_stats(m, cpu, cfs_rq->tg); #endif } @@ -261,7 +296,7 @@ static int sched_debug_show(struct seq_file *m, void *v) u64 now = ktime_to_ns(ktime_get()); int cpu; - SEQ_printf(m, "Sched Debug Version: v0.07, %s %.*s\n", + SEQ_printf(m, "Sched Debug Version: v0.08, %s %.*s\n", init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); -- cgit v1.1 From 851f7ff56d9c21272f289dd85fb3f1b6cf7a6e10 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 11 Nov 2008 21:48:14 +1100 Subject: This patch will print cap_permitted and cap_inheritable data in the PATH records of any file that has file capabilities set. Files which do not have fcaps set will not have different PATH records. An example audit record if you run: setcap "cap_net_admin+pie" /bin/bash /bin/bash type=SYSCALL msg=audit(1225741937.363:230): arch=c000003e syscall=59 success=yes exit=0 a0=2119230 a1=210da30 a2=20ee290 a3=8 items=2 ppid=2149 pid=2923 auid=0 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=pts0 ses=3 comm="ping" exe="/bin/ping" subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key=(null) type=EXECVE msg=audit(1225741937.363:230): argc=2 a0="ping" a1="www.google.com" type=CWD msg=audit(1225741937.363:230): cwd="/root" type=PATH msg=audit(1225741937.363:230): item=0 name="/bin/ping" inode=49256 dev=fd:00 mode=0104755 ouid=0 ogid=0 rdev=00:00 obj=system_u:object_r:ping_exec_t:s0 cap_fp=0000000000002000 cap_fi=0000000000002000 cap_fe=1 cap_fver=2 type=PATH msg=audit(1225741937.363:230): item=1 name=(null) inode=507915 dev=fd:00 mode=0100755 ouid=0 ogid=0 rdev=00:00 obj=system_u:object_r:ld_so_t:s0 Signed-off-by: Eric Paris Acked-by: Serge Hallyn Signed-off-by: James Morris --- kernel/auditsc.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 77 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index cf5bc2f..de7e9bc 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -65,6 +65,7 @@ #include #include #include +#include #include "audit.h" @@ -84,6 +85,15 @@ int audit_n_rules; /* determines whether we collect data for signals sent */ int audit_signals; +struct audit_cap_data { + kernel_cap_t permitted; + kernel_cap_t inheritable; + union { + unsigned int fE; /* effective bit of a file capability */ + kernel_cap_t effective; /* effective set of a process */ + }; +}; + /* When fs/namei.c:getname() is called, we store the pointer in name and * we don't let putname() free it (instead we free all of the saved * pointers at syscall exit time). @@ -100,6 +110,8 @@ struct audit_names { gid_t gid; dev_t rdev; u32 osid; + struct audit_cap_data fcap; + unsigned int fcap_ver; }; struct audit_aux_data { @@ -1171,6 +1183,35 @@ static void audit_log_execve_info(struct audit_context *context, kfree(buf); } +static void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) +{ + int i; + + audit_log_format(ab, " %s=", prefix); + CAP_FOR_EACH_U32(i) { + audit_log_format(ab, "%08x", cap->cap[(_KERNEL_CAPABILITY_U32S-1) - i]); + } +} + +static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) +{ + kernel_cap_t *perm = &name->fcap.permitted; + kernel_cap_t *inh = &name->fcap.inheritable; + int log = 0; + + if (!cap_isclear(*perm)) { + audit_log_cap(ab, "cap_fp", perm); + log = 1; + } + if (!cap_isclear(*inh)) { + audit_log_cap(ab, "cap_fi", inh); + log = 1; + } + + if (log) + audit_log_format(ab, " cap_fe=%d cap_fver=%x", name->fcap.fE, name->fcap_ver); +} + static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) { int i, call_panic = 0; @@ -1421,6 +1462,8 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts } } + audit_log_fcaps(ab, n); + audit_log_end(ab); } @@ -1787,8 +1830,36 @@ static int audit_inc_name_count(struct audit_context *context, return 0; } + +static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry) +{ + struct cpu_vfs_cap_data caps; + int rc; + + memset(&name->fcap.permitted, 0, sizeof(kernel_cap_t)); + memset(&name->fcap.inheritable, 0, sizeof(kernel_cap_t)); + name->fcap.fE = 0; + name->fcap_ver = 0; + + if (!dentry) + return 0; + + rc = get_vfs_caps_from_disk(dentry, &caps); + if (rc) + return rc; + + name->fcap.permitted = caps.permitted; + name->fcap.inheritable = caps.inheritable; + name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE); + name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT; + + return 0; +} + + /* Copy inode data into an audit_names. */ -static void audit_copy_inode(struct audit_names *name, const struct inode *inode) +static void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, + const struct inode *inode) { name->ino = inode->i_ino; name->dev = inode->i_sb->s_dev; @@ -1797,6 +1868,7 @@ static void audit_copy_inode(struct audit_names *name, const struct inode *inode name->gid = inode->i_gid; name->rdev = inode->i_rdev; security_inode_getsecid(inode, &name->osid); + audit_copy_fcaps(name, dentry); } /** @@ -1831,7 +1903,7 @@ void __audit_inode(const char *name, const struct dentry *dentry) context->names[idx].name = NULL; } handle_path(dentry); - audit_copy_inode(&context->names[idx], inode); + audit_copy_inode(&context->names[idx], dentry, inode); } /** @@ -1892,7 +1964,7 @@ void __audit_inode_child(const char *dname, const struct dentry *dentry, if (!strcmp(dname, n->name) || !audit_compare_dname_path(dname, n->name, &dirlen)) { if (inode) - audit_copy_inode(n, inode); + audit_copy_inode(n, NULL, inode); else n->ino = (unsigned long)-1; found_child = n->name; @@ -1906,7 +1978,7 @@ add_names: return; idx = context->name_count - 1; context->names[idx].name = NULL; - audit_copy_inode(&context->names[idx], parent); + audit_copy_inode(&context->names[idx], NULL, parent); } if (!found_child) { @@ -1927,7 +1999,7 @@ add_names: } if (inode) - audit_copy_inode(&context->names[idx], inode); + audit_copy_inode(&context->names[idx], NULL, inode); else context->names[idx].ino = (unsigned long)-1; } -- cgit v1.1 From 3fc689e96c0c90b6fede5946d6c31075e9464f69 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 11 Nov 2008 21:48:18 +1100 Subject: Any time fcaps or a setuid app under SECURE_NOROOT is used to result in a non-zero pE we will crate a new audit record which contains the entire set of known information about the executable in question, fP, fI, fE, fversion and includes the process's pE, pI, pP. Before and after the bprm capability are applied. This record type will only be emitted from execve syscalls. an example of making ping use fcaps instead of setuid: setcap "cat_net_raw+pe" /bin/ping type=SYSCALL msg=audit(1225742021.015:236): arch=c000003e syscall=59 success=yes exit=0 a0=1457f30 a1=14606b0 a2=1463940 a3=321b770a70 items=2 ppid=2929 pid=2963 auid=0 uid=500 gid=500 euid=500 suid=500 fsuid=500 egid=500 sgid=500 fsgid=500 tty=pts0 ses=3 comm="ping" exe="/bin/ping" subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key=(null) type=UNKNOWN[1321] msg=audit(1225742021.015:236): fver=2 fp=0000000000002000 fi=0000000000000000 fe=1 old_pp=0000000000000000 old_pi=0000000000000000 old_pe=0000000000000000 new_pp=0000000000002000 new_pi=0000000000000000 new_pe=0000000000002000 type=EXECVE msg=audit(1225742021.015:236): argc=2 a0="ping" a1="127.0.0.1" type=CWD msg=audit(1225742021.015:236): cwd="/home/test" type=PATH msg=audit(1225742021.015:236): item=0 name="/bin/ping" inode=49256 dev=fd:00 mode=0100755 ouid=0 ogid=0 rdev=00:00 obj=system_u:object_r:ping_exec_t:s0 cap_fp=0000000000002000 cap_fe=1 cap_fver=2 type=PATH msg=audit(1225742021.015:236): item=1 name=(null) inode=507915 dev=fd:00 mode=0100755 ouid=0 ogid=0 rdev=00:00 obj=system_u:object_r:ld_so_t:s0 Signed-off-by: Eric Paris Acked-by: Serge Hallyn Signed-off-by: James Morris --- kernel/auditsc.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index de7e9bc..3229cd4 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -196,6 +196,14 @@ struct audit_aux_data_pids { int pid_count; }; +struct audit_aux_data_bprm_fcaps { + struct audit_aux_data d; + struct audit_cap_data fcap; + unsigned int fcap_ver; + struct audit_cap_data old_pcap; + struct audit_cap_data new_pcap; +}; + struct audit_tree_refs { struct audit_tree_refs *next; struct audit_chunk *c[31]; @@ -1375,6 +1383,20 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_format(ab, "fd0=%d fd1=%d", axs->fd[0], axs->fd[1]); break; } + case AUDIT_BPRM_FCAPS: { + struct audit_aux_data_bprm_fcaps *axs = (void *)aux; + audit_log_format(ab, "fver=%x", axs->fcap_ver); + audit_log_cap(ab, "fp", &axs->fcap.permitted); + audit_log_cap(ab, "fi", &axs->fcap.inheritable); + audit_log_format(ab, " fe=%d", axs->fcap.fE); + audit_log_cap(ab, "old_pp", &axs->old_pcap.permitted); + audit_log_cap(ab, "old_pi", &axs->old_pcap.inheritable); + audit_log_cap(ab, "old_pe", &axs->old_pcap.effective); + audit_log_cap(ab, "new_pp", &axs->new_pcap.permitted); + audit_log_cap(ab, "new_pi", &axs->new_pcap.inheritable); + audit_log_cap(ab, "new_pe", &axs->new_pcap.effective); + break; } + } audit_log_end(ab); } @@ -2502,6 +2524,52 @@ int __audit_signal_info(int sig, struct task_struct *t) } /** + * __audit_log_bprm_fcaps - store information about a loading bprm and relevant fcaps + * @bprm pointer to the bprm being processed + * @caps the caps read from the disk + * + * Simply check if the proc already has the caps given by the file and if not + * store the priv escalation info for later auditing at the end of the syscall + * + * this can fail and we don't care. See the note in audit.h for + * audit_log_bprm_fcaps() for my explaination.... + * + * -Eric + */ +void __audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_cap_t *pE) +{ + struct audit_aux_data_bprm_fcaps *ax; + struct audit_context *context = current->audit_context; + struct cpu_vfs_cap_data vcaps; + struct dentry *dentry; + + ax = kmalloc(sizeof(*ax), GFP_KERNEL); + if (!ax) + return; + + ax->d.type = AUDIT_BPRM_FCAPS; + ax->d.next = context->aux; + context->aux = (void *)ax; + + dentry = dget(bprm->file->f_dentry); + get_vfs_caps_from_disk(dentry, &vcaps); + dput(dentry); + + ax->fcap.permitted = vcaps.permitted; + ax->fcap.inheritable = vcaps.inheritable; + ax->fcap.fE = !!(vcaps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE); + ax->fcap_ver = (vcaps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT; + + ax->old_pcap.permitted = *pP; + ax->old_pcap.inheritable = current->cap_inheritable; + ax->old_pcap.effective = *pE; + + ax->new_pcap.permitted = current->cap_permitted; + ax->new_pcap.inheritable = current->cap_inheritable; + ax->new_pcap.effective = current->cap_effective; +} + +/** * audit_core_dumps - record information about processes that end abnormally * @signr: signal value * -- cgit v1.1 From e68b75a027bb94066576139ee33676264f867b87 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 11 Nov 2008 21:48:22 +1100 Subject: When the capset syscall is used it is not possible for audit to record the actual capbilities being added/removed. This patch adds a new record type which emits the target pid and the eff, inh, and perm cap sets. example output if you audit capset syscalls would be: type=SYSCALL msg=audit(1225743140.465:76): arch=c000003e syscall=126 success=yes exit=0 a0=17f2014 a1=17f201c a2=80000000 a3=7fff2ab7f060 items=0 ppid=2160 pid=2223 auid=0 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=pts0 ses=1 comm="setcap" exe="/usr/sbin/setcap" subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key=(null) type=UNKNOWN[1322] msg=audit(1225743140.465:76): pid=0 cap_pi=ffffffffffffffff cap_pp=ffffffffffffffff cap_pe=ffffffffffffffff Signed-off-by: Eric Paris Acked-by: Serge Hallyn Signed-off-by: James Morris --- kernel/auditsc.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/capability.c | 5 +++++ 2 files changed, 53 insertions(+) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 3229cd4..cef3423 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -204,6 +204,12 @@ struct audit_aux_data_bprm_fcaps { struct audit_cap_data new_pcap; }; +struct audit_aux_data_capset { + struct audit_aux_data d; + pid_t pid; + struct audit_cap_data cap; +}; + struct audit_tree_refs { struct audit_tree_refs *next; struct audit_chunk *c[31]; @@ -1397,6 +1403,14 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_cap(ab, "new_pe", &axs->new_pcap.effective); break; } + case AUDIT_CAPSET: { + struct audit_aux_data_capset *axs = (void *)aux; + audit_log_format(ab, "pid=%d", axs->pid); + audit_log_cap(ab, "cap_pi", &axs->cap.inheritable); + audit_log_cap(ab, "cap_pp", &axs->cap.permitted); + audit_log_cap(ab, "cap_pe", &axs->cap.effective); + break; } + } audit_log_end(ab); } @@ -2570,6 +2584,40 @@ void __audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_ } /** + * __audit_log_capset - store information about the arguments to the capset syscall + * @pid target pid of the capset call + * @eff effective cap set + * @inh inheritible cap set + * @perm permited cap set + * + * Record the aguments userspace sent to sys_capset for later printing by the + * audit system if applicable + */ +int __audit_log_capset(pid_t pid, kernel_cap_t *eff, kernel_cap_t *inh, kernel_cap_t *perm) +{ + struct audit_aux_data_capset *ax; + struct audit_context *context = current->audit_context; + + if (likely(!audit_enabled || !context || context->dummy)) + return 0; + + ax = kmalloc(sizeof(*ax), GFP_KERNEL); + if (!ax) + return -ENOMEM; + + ax->d.type = AUDIT_CAPSET; + ax->d.next = context->aux; + context->aux = (void *)ax; + + ax->pid = pid; + ax->cap.effective = *eff; + ax->cap.inheritable = *eff; + ax->cap.permitted = *perm; + + return 0; +} + +/** * audit_core_dumps - record information about processes that end abnormally * @signr: signal value * diff --git a/kernel/capability.c b/kernel/capability.c index e13a685..19f9eda 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -7,6 +7,7 @@ * 30 May 2002: Cleanup, Robert M. Love */ +#include #include #include #include @@ -468,6 +469,10 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) i++; } + ret = audit_log_capset(pid, &effective, &inheritable, &permitted); + if (ret) + return ret; + if (pid && (pid != task_pid_vnr(current))) ret = do_sys_capset_other_tasks(pid, &effective, &inheritable, &permitted); -- cgit v1.1 From 637d32dc720897616e8a1a4f9e9609e29d431800 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Wed, 29 Oct 2008 15:42:12 +1100 Subject: Capabilities: BUG when an invalid capability is requested If an invalid (large) capability is requested the capabilities system may panic as it is dereferencing an array of fixed (short) length. Its possible (and actually often happens) that the capability system accidentally stumbled into a valid memory region but it also regularly happens that it hits invalid memory and BUGs. If such an operation does get past cap_capable then the selinux system is sure to have problems as it already does a (simple) validity check and BUG. This is known to happen by the broken and buggy firegl driver. This patch cleanly checks all capable calls and BUG if a call is for an invalid capability. This will likely break the firegl driver for some situations, but it is the right thing to do. Garbage into a security system gets you killed/bugged Signed-off-by: Eric Paris Acked-by: Arjan van de Ven Acked-by: Serge Hallyn Acked-by: Andrew G. Morgan Signed-off-by: James Morris --- kernel/capability.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index 19f9eda..adb262f 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -514,6 +514,11 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) */ int capable(int cap) { + if (unlikely(!cap_valid(cap))) { + printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap); + BUG(); + } + if (has_capability(current, cap)) { current->flags |= PF_SUPERPRIV; return 1; -- cgit v1.1 From 934352f214b3251eb0793c1209d346595a661d80 Mon Sep 17 00:00:00 2001 From: Bharata B Rao Date: Mon, 10 Nov 2008 20:41:13 +0530 Subject: sched: add hierarchical accounting to cpu accounting controller Impact: improve CPU time accounting of tasks under the cpu accounting controller Add hierarchical accounting to cpu accounting controller and include cpuacct documentation. Currently, while charging the task's cputime to its accounting group, the accounting group hierarchy isn't updated. This patch charges the cputime of a task to its accounting group and all its parent accounting groups. Reported-by: Srivatsa Vaddagiri Signed-off-by: Bharata B Rao Reviewed-by: Paul Menage Acked-by: Balbir Singh Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 59db86c..ebaf432 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9196,11 +9196,12 @@ struct cgroup_subsys cpu_cgroup_subsys = { * (balbir@in.ibm.com). */ -/* track cpu usage of a group of tasks */ +/* track cpu usage of a group of tasks and its child groups */ struct cpuacct { struct cgroup_subsys_state css; /* cpuusage holds pointer to a u64-type object on every cpu */ u64 *cpuusage; + struct cpuacct *parent; }; struct cgroup_subsys cpuacct_subsys; @@ -9234,6 +9235,9 @@ static struct cgroup_subsys_state *cpuacct_create( return ERR_PTR(-ENOMEM); } + if (cgrp->parent) + ca->parent = cgroup_ca(cgrp->parent); + return &ca->css; } @@ -9313,14 +9317,16 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) static void cpuacct_charge(struct task_struct *tsk, u64 cputime) { struct cpuacct *ca; + int cpu; if (!cpuacct_subsys.active) return; + cpu = task_cpu(tsk); ca = task_ca(tsk); - if (ca) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk)); + for (; ca; ca = ca->parent) { + u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); *cpuusage += cputime; } } -- cgit v1.1 From 76aac0e9a17742e60d408be1a706e9aaad370891 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:12 +1100 Subject: CRED: Wrap task credential accesses in the core kernel Wrap access to task credentials so that they can be separated more easily from the task_struct during the introduction of COW creds. Change most current->(|e|s|fs)[ug]id to current_(|e|s|fs)[ug]id(). Change some task->e?[ug]id to task_e?[ug]id(). In some places it makes more sense to use RCU directly rather than a convenient wrapper; these will be addressed by later patches. Signed-off-by: David Howells Reviewed-by: James Morris Acked-by: Serge Hallyn Cc: Al Viro Cc: linux-audit@redhat.com Cc: containers@lists.linux-foundation.org Cc: linux-mm@kvack.org Signed-off-by: James Morris --- kernel/acct.c | 7 +++---- kernel/auditsc.c | 6 ++++-- kernel/cgroup.c | 9 +++++---- kernel/futex.c | 8 +++++--- kernel/futex_compat.c | 3 ++- kernel/ptrace.c | 15 +++++++++------ kernel/sched.c | 11 +++++++---- kernel/signal.c | 15 +++++++++------ kernel/sys.c | 16 ++++++++-------- kernel/sysctl.c | 2 +- kernel/timer.c | 8 ++++---- kernel/user_namespace.c | 2 +- 12 files changed, 58 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index f6006a6..d57b7cb 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -530,15 +530,14 @@ static void do_acct_process(struct bsd_acct_struct *acct, do_div(elapsed, AHZ); ac.ac_btime = get_seconds() - elapsed; /* we really need to bite the bullet and change layout */ - ac.ac_uid = current->uid; - ac.ac_gid = current->gid; + current_uid_gid(&ac.ac_uid, &ac.ac_gid); #if ACCT_VERSION==2 ac.ac_ahz = AHZ; #endif #if ACCT_VERSION==1 || ACCT_VERSION==2 /* backward-compatible 16 bit fields */ - ac.ac_uid16 = current->uid; - ac.ac_gid16 = current->gid; + ac.ac_uid16 = ac.ac_uid; + ac.ac_gid16 = ac.ac_gid; #endif #if ACCT_VERSION==3 ac.ac_pid = task_tgid_nr_ns(current, ns); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index cef3423..9c7e47a 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2628,7 +2628,8 @@ void audit_core_dumps(long signr) { struct audit_buffer *ab; u32 sid; - uid_t auid = audit_get_loginuid(current); + uid_t auid = audit_get_loginuid(current), uid; + gid_t gid; unsigned int sessionid = audit_get_sessionid(current); if (!audit_enabled) @@ -2638,8 +2639,9 @@ void audit_core_dumps(long signr) return; ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); + current_uid_gid(&uid, &gid); audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", - auid, current->uid, current->gid, sessionid); + auid, uid, gid, sessionid); security_task_getsecid(current, &sid); if (sid) { char *ctx = NULL; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 35eebd5..78f9b31 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -571,8 +571,8 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) if (inode) { inode->i_mode = mode; - inode->i_uid = current->fsuid; - inode->i_gid = current->fsgid; + inode->i_uid = current_fsuid(); + inode->i_gid = current_fsgid(); inode->i_blocks = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info; @@ -1279,6 +1279,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) static int attach_task_by_pid(struct cgroup *cgrp, u64 pid) { struct task_struct *tsk; + uid_t euid; int ret; if (pid) { @@ -1291,8 +1292,8 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid) get_task_struct(tsk); rcu_read_unlock(); - if ((current->euid) && (current->euid != tsk->uid) - && (current->euid != tsk->suid)) { + euid = current_euid(); + if (euid && euid != tsk->uid && euid != tsk->suid) { put_task_struct(tsk); return -EACCES; } diff --git a/kernel/futex.c b/kernel/futex.c index 8af1002..e069621 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -439,10 +439,11 @@ static void free_pi_state(struct futex_pi_state *pi_state) static struct task_struct * futex_find_get_task(pid_t pid) { struct task_struct *p; + uid_t euid = current_euid(); rcu_read_lock(); p = find_task_by_vpid(pid); - if (!p || ((current->euid != p->euid) && (current->euid != p->uid))) + if (!p || (euid != p->euid && euid != p->uid)) p = ERR_PTR(-ESRCH); else get_task_struct(p); @@ -1829,6 +1830,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr, { struct robust_list_head __user *head; unsigned long ret; + uid_t euid = current_euid(); if (!futex_cmpxchg_enabled) return -ENOSYS; @@ -1844,8 +1846,8 @@ sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr, if (!p) goto err_unlock; ret = -EPERM; - if ((current->euid != p->euid) && (current->euid != p->uid) && - !capable(CAP_SYS_PTRACE)) + if (euid != p->euid && euid != p->uid && + !capable(CAP_SYS_PTRACE)) goto err_unlock; head = p->robust_list; rcu_read_unlock(); diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 04ac3a9..3254d4e 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -135,6 +135,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, { struct compat_robust_list_head __user *head; unsigned long ret; + uid_t euid = current_euid(); if (!futex_cmpxchg_enabled) return -ENOSYS; @@ -150,7 +151,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, if (!p) goto err_unlock; ret = -EPERM; - if ((current->euid != p->euid) && (current->euid != p->uid) && + if (euid != p->euid && euid != p->uid && !capable(CAP_SYS_PTRACE)) goto err_unlock; head = p->compat_robust_list; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 1e68e4c..937f6b5 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -123,16 +123,19 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode) * because setting up the necessary parent/child relationship * or halting the specified task is impossible. */ + uid_t uid; + gid_t gid; int dumpable = 0; /* Don't let security modules deny introspection */ if (task == current) return 0; - if (((current->uid != task->euid) || - (current->uid != task->suid) || - (current->uid != task->uid) || - (current->gid != task->egid) || - (current->gid != task->sgid) || - (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) + current_uid_gid(&uid, &gid); + if ((uid != task->euid || + uid != task->suid || + uid != task->uid || + gid != task->egid || + gid != task->sgid || + gid != task->gid) && !capable(CAP_SYS_PTRACE)) return -EPERM; smp_rmb(); if (task->mm) diff --git a/kernel/sched.c b/kernel/sched.c index e8819bc..c3b8b1f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5128,6 +5128,7 @@ static int __sched_setscheduler(struct task_struct *p, int policy, unsigned long flags; const struct sched_class *prev_class = p->sched_class; struct rq *rq; + uid_t euid; /* may grab non-irq protected spin_locks */ BUG_ON(in_interrupt()); @@ -5180,8 +5181,9 @@ recheck: return -EPERM; /* can't change other user's priorities */ - if ((current->euid != p->euid) && - (current->euid != p->uid)) + euid = current_euid(); + if (euid != p->euid && + euid != p->uid) return -EPERM; } @@ -5392,6 +5394,7 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) cpumask_t cpus_allowed; cpumask_t new_mask = *in_mask; struct task_struct *p; + uid_t euid; int retval; get_online_cpus(); @@ -5412,9 +5415,9 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) get_task_struct(p); read_unlock(&tasklist_lock); + euid = current_euid(); retval = -EPERM; - if ((current->euid != p->euid) && (current->euid != p->uid) && - !capable(CAP_SYS_NICE)) + if (euid != p->euid && euid != p->uid && !capable(CAP_SYS_NICE)) goto out_unlock; retval = security_task_setscheduler(p, 0, NULL); diff --git a/kernel/signal.c b/kernel/signal.c index 4530fc6..167b535 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -567,6 +567,7 @@ static int check_kill_permission(int sig, struct siginfo *info, struct task_struct *t) { struct pid *sid; + uid_t uid, euid; int error; if (!valid_signal(sig)) @@ -579,8 +580,10 @@ static int check_kill_permission(int sig, struct siginfo *info, if (error) return error; - if ((current->euid ^ t->suid) && (current->euid ^ t->uid) && - (current->uid ^ t->suid) && (current->uid ^ t->uid) && + uid = current_uid(); + euid = current_euid(); + if ((euid ^ t->suid) && (euid ^ t->uid) && + (uid ^ t->suid) && (uid ^ t->uid) && !capable(CAP_KILL)) { switch (sig) { case SIGCONT: @@ -844,7 +847,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, q->info.si_errno = 0; q->info.si_code = SI_USER; q->info.si_pid = task_pid_vnr(current); - q->info.si_uid = current->uid; + q->info.si_uid = current_uid(); break; case (unsigned long) SEND_SIG_PRIV: q->info.si_signo = sig; @@ -1598,7 +1601,7 @@ void ptrace_notify(int exit_code) info.si_signo = SIGTRAP; info.si_code = exit_code; info.si_pid = task_pid_vnr(current); - info.si_uid = current->uid; + info.si_uid = current_uid(); /* Let the debugger run. */ spin_lock_irq(¤t->sighand->siglock); @@ -2211,7 +2214,7 @@ sys_kill(pid_t pid, int sig) info.si_errno = 0; info.si_code = SI_USER; info.si_pid = task_tgid_vnr(current); - info.si_uid = current->uid; + info.si_uid = current_uid(); return kill_something_info(sig, &info, pid); } @@ -2228,7 +2231,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig) info.si_errno = 0; info.si_code = SI_TKILL; info.si_pid = task_tgid_vnr(current); - info.si_uid = current->uid; + info.si_uid = current_uid(); rcu_read_lock(); p = find_task_by_vpid(pid); diff --git a/kernel/sys.c b/kernel/sys.c index 31deba8..ed5c29c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -114,10 +114,10 @@ void (*pm_power_off_prepare)(void); static int set_one_prio(struct task_struct *p, int niceval, int error) { + uid_t euid = current_euid(); int no_nice; - if (p->uid != current->euid && - p->euid != current->euid && !capable(CAP_SYS_NICE)) { + if (p->uid != euid && p->euid != euid && !capable(CAP_SYS_NICE)) { error = -EPERM; goto out; } @@ -176,16 +176,16 @@ asmlinkage long sys_setpriority(int which, int who, int niceval) case PRIO_USER: user = current->user; if (!who) - who = current->uid; + who = current_uid(); else - if ((who != current->uid) && !(user = find_user(who))) + if (who != current_uid() && !(user = find_user(who))) goto out_unlock; /* No processes for this user */ do_each_thread(g, p) if (p->uid == who) error = set_one_prio(p, niceval, error); while_each_thread(g, p); - if (who != current->uid) + if (who != current_uid()) free_uid(user); /* For find_user() */ break; } @@ -238,9 +238,9 @@ asmlinkage long sys_getpriority(int which, int who) case PRIO_USER: user = current->user; if (!who) - who = current->uid; + who = current_uid(); else - if ((who != current->uid) && !(user = find_user(who))) + if (who != current_uid() && !(user = find_user(who))) goto out_unlock; /* No processes for this user */ do_each_thread(g, p) @@ -250,7 +250,7 @@ asmlinkage long sys_getpriority(int which, int who) retval = niceval; } while_each_thread(g, p); - if (who != current->uid) + if (who != current_uid()) free_uid(user); /* for find_user() */ break; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9d048fa..5110313 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1641,7 +1641,7 @@ out: static int test_perm(int mode, int op) { - if (!current->euid) + if (!current_euid()) mode >>= 6; else if (in_egroup_p(0)) mode >>= 3; diff --git a/kernel/timer.c b/kernel/timer.c index 56becf3..b54e464 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1123,25 +1123,25 @@ asmlinkage long sys_getppid(void) asmlinkage long sys_getuid(void) { /* Only we change this so SMP safe */ - return current->uid; + return current_uid(); } asmlinkage long sys_geteuid(void) { /* Only we change this so SMP safe */ - return current->euid; + return current_euid(); } asmlinkage long sys_getgid(void) { /* Only we change this so SMP safe */ - return current->gid; + return current_gid(); } asmlinkage long sys_getegid(void) { /* Only we change this so SMP safe */ - return current->egid; + return current_egid(); } #endif diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 532858f..f82730a 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -38,7 +38,7 @@ static struct user_namespace *clone_user_ns(struct user_namespace *old_ns) } /* Reset current->user with a new one */ - new_user = alloc_uid(ns, current->uid); + new_user = alloc_uid(ns, current_uid()); if (!new_user) { free_uid(ns->root_user); kfree(ns); -- cgit v1.1 From 8bbf4976b59fc9fc2861e79cab7beb3f6d647640 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:14 +1100 Subject: KEYS: Alter use of key instantiation link-to-keyring argument Alter the use of the key instantiation and negation functions' link-to-keyring arguments. Currently this specifies a keyring in the target process to link the key into, creating the keyring if it doesn't exist. This, however, can be a problem for copy-on-write credentials as it means that the instantiating process can alter the credentials of the requesting process. This patch alters the behaviour such that: (1) If keyctl_instantiate_key() or keyctl_negate_key() are given a specific keyring by ID (ringid >= 0), then that keyring will be used. (2) If keyctl_instantiate_key() or keyctl_negate_key() are given one of the special constants that refer to the requesting process's keyrings (KEY_SPEC_*_KEYRING, all <= 0), then: (a) If sys_request_key() was given a keyring to use (destringid) then the key will be attached to that keyring. (b) If sys_request_key() was given a NULL keyring, then the key being instantiated will be attached to the default keyring as set by keyctl_set_reqkey_keyring(). (3) No extra link will be made. Decision point (1) follows current behaviour, and allows those instantiators who've searched for a specifically named keyring in the requestor's keyring so as to partition the keys by type to still have their named keyrings. Decision point (2) allows the requestor to make sure that the key or keys that get produced by request_key() go where they want, whilst allowing the instantiator to request that the key is retained. This is mainly useful for situations where the instantiator makes a secondary request, the key for which should be retained by the initial requestor: +-----------+ +--------------+ +--------------+ | | | | | | | Requestor |------->| Instantiator |------->| Instantiator | | | | | | | +-----------+ +--------------+ +--------------+ request_key() request_key() This might be useful, for example, in Kerberos, where the requestor requests a ticket, and then the ticket instantiator requests the TGT, which someone else then has to go and fetch. The TGT, however, should be retained in the keyrings of the requestor, not the first instantiator. To make this explict an extra special keyring constant is also added. Signed-off-by: David Howells Reviewed-by: James Morris Signed-off-by: James Morris --- kernel/kmod.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 3d3c3ea..f044f8f 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -140,7 +140,7 @@ static int ____call_usermodehelper(void *data) /* Unblock all signals and set the session keyring. */ new_session = key_get(sub_info->ring); spin_lock_irq(¤t->sighand->siglock); - old_session = __install_session_keyring(current, new_session); + old_session = __install_session_keyring(new_session); flush_signal_handlers(current, 1); sigemptyset(¤t->blocked); recalc_sigpending(); -- cgit v1.1 From 1cdcbec1a3372c0c49c59d292e708fd07b509f18 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:14 +1100 Subject: CRED: Neuter sys_capset() Take away the ability for sys_capset() to affect processes other than current. This means that current will not need to lock its own credentials when reading them against interference by other processes. This has effectively been the case for a while anyway, since: (1) Without LSM enabled, sys_capset() is disallowed. (2) With file-based capabilities, sys_capset() is neutered. Signed-off-by: David Howells Acked-by: Serge Hallyn Acked-by: Andrew G. Morgan Acked-by: James Morris Signed-off-by: James Morris --- kernel/capability.c | 227 ++++++---------------------------------------------- 1 file changed, 23 insertions(+), 204 deletions(-) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index adb262f..58b0051 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -127,160 +127,6 @@ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy) return 0; } -#ifndef CONFIG_SECURITY_FILE_CAPABILITIES - -/* - * Without filesystem capability support, we nominally support one process - * setting the capabilities of another - */ -static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, - kernel_cap_t *pIp, kernel_cap_t *pPp) -{ - struct task_struct *target; - int ret; - - spin_lock(&task_capability_lock); - read_lock(&tasklist_lock); - - if (pid && pid != task_pid_vnr(current)) { - target = find_task_by_vpid(pid); - if (!target) { - ret = -ESRCH; - goto out; - } - } else - target = current; - - ret = security_capget(target, pEp, pIp, pPp); - -out: - read_unlock(&tasklist_lock); - spin_unlock(&task_capability_lock); - - return ret; -} - -/* - * cap_set_pg - set capabilities for all processes in a given process - * group. We call this holding task_capability_lock and tasklist_lock. - */ -static inline int cap_set_pg(int pgrp_nr, kernel_cap_t *effective, - kernel_cap_t *inheritable, - kernel_cap_t *permitted) -{ - struct task_struct *g, *target; - int ret = -EPERM; - int found = 0; - struct pid *pgrp; - - spin_lock(&task_capability_lock); - read_lock(&tasklist_lock); - - pgrp = find_vpid(pgrp_nr); - do_each_pid_task(pgrp, PIDTYPE_PGID, g) { - target = g; - while_each_thread(g, target) { - if (!security_capset_check(target, effective, - inheritable, permitted)) { - security_capset_set(target, effective, - inheritable, permitted); - ret = 0; - } - found = 1; - } - } while_each_pid_task(pgrp, PIDTYPE_PGID, g); - - read_unlock(&tasklist_lock); - spin_unlock(&task_capability_lock); - - if (!found) - ret = 0; - return ret; -} - -/* - * cap_set_all - set capabilities for all processes other than init - * and self. We call this holding task_capability_lock and tasklist_lock. - */ -static inline int cap_set_all(kernel_cap_t *effective, - kernel_cap_t *inheritable, - kernel_cap_t *permitted) -{ - struct task_struct *g, *target; - int ret = -EPERM; - int found = 0; - - spin_lock(&task_capability_lock); - read_lock(&tasklist_lock); - - do_each_thread(g, target) { - if (target == current - || is_container_init(target->group_leader)) - continue; - found = 1; - if (security_capset_check(target, effective, inheritable, - permitted)) - continue; - ret = 0; - security_capset_set(target, effective, inheritable, permitted); - } while_each_thread(g, target); - - read_unlock(&tasklist_lock); - spin_unlock(&task_capability_lock); - - if (!found) - ret = 0; - - return ret; -} - -/* - * Given the target pid does not refer to the current process we - * need more elaborate support... (This support is not present when - * filesystem capabilities are configured.) - */ -static inline int do_sys_capset_other_tasks(pid_t pid, kernel_cap_t *effective, - kernel_cap_t *inheritable, - kernel_cap_t *permitted) -{ - struct task_struct *target; - int ret; - - if (!capable(CAP_SETPCAP)) - return -EPERM; - - if (pid == -1) /* all procs other than current and init */ - return cap_set_all(effective, inheritable, permitted); - - else if (pid < 0) /* all procs in process group */ - return cap_set_pg(-pid, effective, inheritable, permitted); - - /* target != current */ - spin_lock(&task_capability_lock); - read_lock(&tasklist_lock); - - target = find_task_by_vpid(pid); - if (!target) - ret = -ESRCH; - else { - ret = security_capset_check(target, effective, inheritable, - permitted); - - /* having verified that the proposed changes are legal, - we now put them into effect. */ - if (!ret) - security_capset_set(target, effective, inheritable, - permitted); - } - - read_unlock(&tasklist_lock); - spin_unlock(&task_capability_lock); - - return ret; -} - -#else /* ie., def CONFIG_SECURITY_FILE_CAPABILITIES */ - /* * If we have configured with filesystem capability support, then the * only thing that can change the capabilities of the current process @@ -315,22 +161,6 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, } /* - * With filesystem capability support configured, the kernel does not - * permit the changing of capabilities in one process by another - * process. (CAP_SETPCAP has much less broad semantics when configured - * this way.) - */ -static inline int do_sys_capset_other_tasks(pid_t pid, - kernel_cap_t *effective, - kernel_cap_t *inheritable, - kernel_cap_t *permitted) -{ - return -EPERM; -} - -#endif /* ie., ndef CONFIG_SECURITY_FILE_CAPABILITIES */ - -/* * Atomically modify the effective capabilities returning the original * value. No permission check is performed here - it is assumed that the * caller is permitted to set the desired effective capabilities. @@ -424,16 +254,14 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr) * @data: pointer to struct that contains the effective, permitted, * and inheritable capabilities * - * Set capabilities for a given process, all processes, or all - * processes in a given process group. + * Set capabilities for the current process only. The ability to any other + * process(es) has been deprecated and removed. * * The restrictions on setting capabilities are specified as: * - * [pid is for the 'target' task. 'current' is the calling task.] - * - * I: any raised capabilities must be a subset of the (old current) permitted - * P: any raised capabilities must be a subset of the (old current) permitted - * E: must be set to a subset of (new target) permitted + * I: any raised capabilities must be a subset of the old permitted + * P: any raised capabilities must be a subset of the old permitted + * E: must be set to a subset of new permitted * * Returns 0 on success and < 0 on error. */ @@ -452,10 +280,13 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) if (get_user(pid, &header->pid)) return -EFAULT; + /* may only affect current now */ + if (pid != 0 && pid != task_pid_vnr(current)) + return -EPERM; + if (copy_from_user(&kdata, data, tocopy - * sizeof(struct __user_cap_data_struct))) { + * sizeof(struct __user_cap_data_struct))) return -EFAULT; - } for (i = 0; i < tocopy; i++) { effective.cap[i] = kdata[i].effective; @@ -473,32 +304,20 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) if (ret) return ret; - if (pid && (pid != task_pid_vnr(current))) - ret = do_sys_capset_other_tasks(pid, &effective, &inheritable, - &permitted); - else { - /* - * This lock is required even when filesystem - * capability support is configured - it protects the - * sys_capget() call from returning incorrect data in - * the case that the targeted process is not the - * current one. - */ - spin_lock(&task_capability_lock); - - ret = security_capset_check(current, &effective, &inheritable, - &permitted); - /* - * Having verified that the proposed changes are - * legal, we now put them into effect. - */ - if (!ret) - security_capset_set(current, &effective, &inheritable, - &permitted); - spin_unlock(&task_capability_lock); - } - + /* This lock is required even when filesystem capability support is + * configured - it protects the sys_capget() call from returning + * incorrect data in the case that the targeted process is not the + * current one. + */ + spin_lock(&task_capability_lock); + ret = security_capset_check(&effective, &inheritable, &permitted); + /* Having verified that the proposed changes are legal, we now put them + * into effect. + */ + if (!ret) + security_capset_set(&effective, &inheritable, &permitted); + spin_unlock(&task_capability_lock); return ret; } -- cgit v1.1 From b6dff3ec5e116e3af6f537d4caedcad6b9e5082a Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:16 +1100 Subject: CRED: Separate task security context from task_struct Separate the task security context from task_struct. At this point, the security data is temporarily embedded in the task_struct with two pointers pointing to it. Note that the Alpha arch is altered as it refers to (E)UID and (E)GID in entry.S via asm-offsets. With comment fixes Signed-off-by: Marc Dionne Signed-off-by: David Howells Acked-by: James Morris Acked-by: Serge Hallyn Signed-off-by: James Morris --- kernel/auditsc.c | 52 +++++----- kernel/capability.c | 4 +- kernel/cgroup.c | 4 +- kernel/exit.c | 10 +- kernel/fork.c | 24 ++--- kernel/futex.c | 6 +- kernel/futex_compat.c | 5 +- kernel/ptrace.c | 19 ++-- kernel/sched.c | 10 +- kernel/signal.c | 16 +-- kernel/sys.c | 266 ++++++++++++++++++++++++++++---------------------- kernel/trace/trace.c | 2 +- kernel/tsacct.c | 4 +- kernel/uid16.c | 28 +++--- kernel/user.c | 4 +- 15 files changed, 250 insertions(+), 204 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 9c7e47a..2febf51 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -447,6 +447,7 @@ static int audit_filter_rules(struct task_struct *tsk, struct audit_names *name, enum audit_state *state) { + struct cred *cred = tsk->cred; int i, j, need_sid = 1; u32 sid; @@ -466,28 +467,28 @@ static int audit_filter_rules(struct task_struct *tsk, } break; case AUDIT_UID: - result = audit_comparator(tsk->uid, f->op, f->val); + result = audit_comparator(cred->uid, f->op, f->val); break; case AUDIT_EUID: - result = audit_comparator(tsk->euid, f->op, f->val); + result = audit_comparator(cred->euid, f->op, f->val); break; case AUDIT_SUID: - result = audit_comparator(tsk->suid, f->op, f->val); + result = audit_comparator(cred->suid, f->op, f->val); break; case AUDIT_FSUID: - result = audit_comparator(tsk->fsuid, f->op, f->val); + result = audit_comparator(cred->fsuid, f->op, f->val); break; case AUDIT_GID: - result = audit_comparator(tsk->gid, f->op, f->val); + result = audit_comparator(cred->gid, f->op, f->val); break; case AUDIT_EGID: - result = audit_comparator(tsk->egid, f->op, f->val); + result = audit_comparator(cred->egid, f->op, f->val); break; case AUDIT_SGID: - result = audit_comparator(tsk->sgid, f->op, f->val); + result = audit_comparator(cred->sgid, f->op, f->val); break; case AUDIT_FSGID: - result = audit_comparator(tsk->fsgid, f->op, f->val); + result = audit_comparator(cred->fsgid, f->op, f->val); break; case AUDIT_PERS: result = audit_comparator(tsk->personality, f->op, f->val); @@ -1228,6 +1229,7 @@ static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) { + struct cred *cred = tsk->cred; int i, call_panic = 0; struct audit_buffer *ab; struct audit_aux_data *aux; @@ -1237,14 +1239,14 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts context->pid = tsk->pid; if (!context->ppid) context->ppid = sys_getppid(); - context->uid = tsk->uid; - context->gid = tsk->gid; - context->euid = tsk->euid; - context->suid = tsk->suid; - context->fsuid = tsk->fsuid; - context->egid = tsk->egid; - context->sgid = tsk->sgid; - context->fsgid = tsk->fsgid; + context->uid = cred->uid; + context->gid = cred->gid; + context->euid = cred->euid; + context->suid = cred->suid; + context->fsuid = cred->fsuid; + context->egid = cred->egid; + context->sgid = cred->sgid; + context->fsgid = cred->fsgid; context->personality = tsk->personality; ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL); @@ -2086,7 +2088,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid) audit_log_format(ab, "login pid=%d uid=%u " "old auid=%u new auid=%u" " old ses=%u new ses=%u", - task->pid, task->uid, + task->pid, task->cred->uid, task->loginuid, loginuid, task->sessionid, sessionid); audit_log_end(ab); @@ -2469,7 +2471,7 @@ void __audit_ptrace(struct task_struct *t) context->target_pid = t->pid; context->target_auid = audit_get_loginuid(t); - context->target_uid = t->uid; + context->target_uid = t->cred->uid; context->target_sessionid = audit_get_sessionid(t); security_task_getsecid(t, &context->target_sid); memcpy(context->target_comm, t->comm, TASK_COMM_LEN); @@ -2495,7 +2497,7 @@ int __audit_signal_info(int sig, struct task_struct *t) if (tsk->loginuid != -1) audit_sig_uid = tsk->loginuid; else - audit_sig_uid = tsk->uid; + audit_sig_uid = tsk->cred->uid; security_task_getsecid(tsk, &audit_sig_sid); } if (!audit_signals || audit_dummy_context()) @@ -2507,7 +2509,7 @@ int __audit_signal_info(int sig, struct task_struct *t) if (!ctx->target_pid) { ctx->target_pid = t->tgid; ctx->target_auid = audit_get_loginuid(t); - ctx->target_uid = t->uid; + ctx->target_uid = t->cred->uid; ctx->target_sessionid = audit_get_sessionid(t); security_task_getsecid(t, &ctx->target_sid); memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN); @@ -2528,7 +2530,7 @@ int __audit_signal_info(int sig, struct task_struct *t) axp->target_pid[axp->pid_count] = t->tgid; axp->target_auid[axp->pid_count] = audit_get_loginuid(t); - axp->target_uid[axp->pid_count] = t->uid; + axp->target_uid[axp->pid_count] = t->cred->uid; axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t); security_task_getsecid(t, &axp->target_sid[axp->pid_count]); memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN); @@ -2575,12 +2577,12 @@ void __audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_ ax->fcap_ver = (vcaps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT; ax->old_pcap.permitted = *pP; - ax->old_pcap.inheritable = current->cap_inheritable; + ax->old_pcap.inheritable = current->cred->cap_inheritable; ax->old_pcap.effective = *pE; - ax->new_pcap.permitted = current->cap_permitted; - ax->new_pcap.inheritable = current->cap_inheritable; - ax->new_pcap.effective = current->cap_effective; + ax->new_pcap.permitted = current->cred->cap_permitted; + ax->new_pcap.inheritable = current->cred->cap_inheritable; + ax->new_pcap.effective = current->cred->cap_effective; } /** diff --git a/kernel/capability.c b/kernel/capability.c index 58b0051..a404b98 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -171,8 +171,8 @@ kernel_cap_t cap_set_effective(const kernel_cap_t pE_new) spin_lock(&task_capability_lock); - pE_old = current->cap_effective; - current->cap_effective = pE_new; + pE_old = current->cred->cap_effective; + current->cred->cap_effective = pE_new; spin_unlock(&task_capability_lock); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 78f9b31..e210526 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1293,7 +1293,9 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid) rcu_read_unlock(); euid = current_euid(); - if (euid && euid != tsk->uid && euid != tsk->suid) { + if (euid && + euid != tsk->cred->uid && + euid != tsk->cred->suid) { put_task_struct(tsk); return -EACCES; } diff --git a/kernel/exit.c b/kernel/exit.c index 80137a5..e0f6e18 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -160,7 +160,7 @@ void release_task(struct task_struct * p) int zap_leader; repeat: tracehook_prepare_release_task(p); - atomic_dec(&p->user->processes); + atomic_dec(&p->cred->user->processes); proc_flush_task(p); write_lock_irq(&tasklist_lock); tracehook_finish_release_task(p); @@ -1272,7 +1272,7 @@ static int wait_task_zombie(struct task_struct *p, int options, return 0; if (unlikely(options & WNOWAIT)) { - uid_t uid = p->uid; + uid_t uid = p->cred->uid; int exit_code = p->exit_code; int why, status; @@ -1393,7 +1393,7 @@ static int wait_task_zombie(struct task_struct *p, int options, if (!retval && infop) retval = put_user(pid, &infop->si_pid); if (!retval && infop) - retval = put_user(p->uid, &infop->si_uid); + retval = put_user(p->cred->uid, &infop->si_uid); if (!retval) retval = pid; @@ -1458,7 +1458,7 @@ static int wait_task_stopped(int ptrace, struct task_struct *p, if (!unlikely(options & WNOWAIT)) p->exit_code = 0; - uid = p->uid; + uid = p->cred->uid; unlock_sig: spin_unlock_irq(&p->sighand->siglock); if (!exit_code) @@ -1535,7 +1535,7 @@ static int wait_task_continued(struct task_struct *p, int options, spin_unlock_irq(&p->sighand->siglock); pid = task_pid_vnr(p); - uid = p->uid; + uid = p->cred->uid; get_task_struct(p); read_unlock(&tasklist_lock); diff --git a/kernel/fork.c b/kernel/fork.c index f608356..81fdc77 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -147,8 +147,8 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(tsk == current); security_task_free(tsk); - free_uid(tsk->user); - put_group_info(tsk->group_info); + free_uid(tsk->__temp_cred.user); + put_group_info(tsk->__temp_cred.group_info); delayacct_tsk_free(tsk); if (!profile_handoff_task(tsk)) @@ -969,17 +969,18 @@ static struct task_struct *copy_process(unsigned long clone_flags, DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif + p->cred = &p->__temp_cred; retval = -EAGAIN; - if (atomic_read(&p->user->processes) >= + if (atomic_read(&p->cred->user->processes) >= p->signal->rlim[RLIMIT_NPROC].rlim_cur) { if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && - p->user != current->nsproxy->user_ns->root_user) + p->cred->user != current->nsproxy->user_ns->root_user) goto bad_fork_free; } - atomic_inc(&p->user->__count); - atomic_inc(&p->user->processes); - get_group_info(p->group_info); + atomic_inc(&p->cred->user->__count); + atomic_inc(&p->cred->user->processes); + get_group_info(p->cred->group_info); /* * If multiple threads are within copy_process(), then this check @@ -1035,9 +1036,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->real_start_time = p->start_time; monotonic_to_bootbased(&p->real_start_time); #ifdef CONFIG_SECURITY - p->security = NULL; + p->cred->security = NULL; #endif - p->cap_bset = current->cap_bset; p->io_context = NULL; p->audit_context = NULL; cgroup_fork(p); @@ -1298,9 +1298,9 @@ bad_fork_cleanup_cgroup: bad_fork_cleanup_put_domain: module_put(task_thread_info(p)->exec_domain->module); bad_fork_cleanup_count: - put_group_info(p->group_info); - atomic_dec(&p->user->processes); - free_uid(p->user); + put_group_info(p->cred->group_info); + atomic_dec(&p->cred->user->processes); + free_uid(p->cred->user); bad_fork_free: free_task(p); fork_out: diff --git a/kernel/futex.c b/kernel/futex.c index e069621..28421d8 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -443,7 +443,8 @@ static struct task_struct * futex_find_get_task(pid_t pid) rcu_read_lock(); p = find_task_by_vpid(pid); - if (!p || (euid != p->euid && euid != p->uid)) + if (!p || (euid != p->cred->euid && + euid != p->cred->uid)) p = ERR_PTR(-ESRCH); else get_task_struct(p); @@ -1846,7 +1847,8 @@ sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr, if (!p) goto err_unlock; ret = -EPERM; - if (euid != p->euid && euid != p->uid && + if (euid != p->cred->euid && + euid != p->cred->uid && !capable(CAP_SYS_PTRACE)) goto err_unlock; head = p->robust_list; diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 3254d4e..2c3fd5e 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -151,8 +151,9 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, if (!p) goto err_unlock; ret = -EPERM; - if (euid != p->euid && euid != p->uid && - !capable(CAP_SYS_PTRACE)) + if (euid != p->cred->euid && + euid != p->cred->uid && + !capable(CAP_SYS_PTRACE)) goto err_unlock; head = p->compat_robust_list; read_unlock(&tasklist_lock); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 937f6b5..49849d1 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -115,6 +115,8 @@ int ptrace_check_attach(struct task_struct *child, int kill) int __ptrace_may_access(struct task_struct *task, unsigned int mode) { + struct cred *cred = current->cred, *tcred = task->cred; + /* May we inspect the given task? * This check is used both for attaching with ptrace * and for allowing access to sensitive information in /proc. @@ -123,19 +125,18 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode) * because setting up the necessary parent/child relationship * or halting the specified task is impossible. */ - uid_t uid; - gid_t gid; + uid_t uid = cred->uid; + gid_t gid = cred->gid; int dumpable = 0; /* Don't let security modules deny introspection */ if (task == current) return 0; - current_uid_gid(&uid, &gid); - if ((uid != task->euid || - uid != task->suid || - uid != task->uid || - gid != task->egid || - gid != task->sgid || - gid != task->gid) && !capable(CAP_SYS_PTRACE)) + if ((uid != tcred->euid || + uid != tcred->suid || + uid != tcred->uid || + gid != tcred->egid || + gid != tcred->sgid || + gid != tcred->gid) && !capable(CAP_SYS_PTRACE)) return -EPERM; smp_rmb(); if (task->mm) diff --git a/kernel/sched.c b/kernel/sched.c index c3b8b1f..733c59e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -345,7 +345,7 @@ static inline struct task_group *task_group(struct task_struct *p) struct task_group *tg; #ifdef CONFIG_USER_SCHED - tg = p->user->tg; + tg = p->cred->user->tg; #elif defined(CONFIG_CGROUP_SCHED) tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), struct task_group, css); @@ -5182,8 +5182,8 @@ recheck: /* can't change other user's priorities */ euid = current_euid(); - if (euid != p->euid && - euid != p->uid) + if (euid != p->cred->euid && + euid != p->cred->uid) return -EPERM; } @@ -5417,7 +5417,9 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) euid = current_euid(); retval = -EPERM; - if (euid != p->euid && euid != p->uid && !capable(CAP_SYS_NICE)) + if (euid != p->cred->euid && + euid != p->cred->uid && + !capable(CAP_SYS_NICE)) goto out_unlock; retval = security_task_setscheduler(p, 0, NULL); diff --git a/kernel/signal.c b/kernel/signal.c index 167b535..80e8a648 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -187,7 +187,7 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, * In order to avoid problems with "switch_user()", we want to make * sure that the compiler doesn't re-load "t->user" */ - user = t->user; + user = t->cred->user; barrier(); atomic_inc(&user->sigpending); if (override_rlimit || @@ -582,8 +582,8 @@ static int check_kill_permission(int sig, struct siginfo *info, uid = current_uid(); euid = current_euid(); - if ((euid ^ t->suid) && (euid ^ t->uid) && - (uid ^ t->suid) && (uid ^ t->uid) && + if ((euid ^ t->cred->suid) && (euid ^ t->cred->uid) && + (uid ^ t->cred->suid) && (uid ^ t->cred->uid) && !capable(CAP_KILL)) { switch (sig) { case SIGCONT: @@ -1100,8 +1100,8 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid, goto out_unlock; } if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) - && (euid != p->suid) && (euid != p->uid) - && (uid != p->suid) && (uid != p->uid)) { + && (euid != p->cred->suid) && (euid != p->cred->uid) + && (uid != p->cred->suid) && (uid != p->cred->uid)) { ret = -EPERM; goto out_unlock; } @@ -1374,7 +1374,7 @@ int do_notify_parent(struct task_struct *tsk, int sig) info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); rcu_read_unlock(); - info.si_uid = tsk->uid; + info.si_uid = tsk->cred->uid; thread_group_cputime(tsk, &cputime); info.si_utime = cputime_to_jiffies(cputime.utime); @@ -1445,7 +1445,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why) info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); rcu_read_unlock(); - info.si_uid = tsk->uid; + info.si_uid = tsk->cred->uid; info.si_utime = cputime_to_clock_t(tsk->utime); info.si_stime = cputime_to_clock_t(tsk->stime); @@ -1713,7 +1713,7 @@ static int ptrace_signal(int signr, siginfo_t *info, info->si_errno = 0; info->si_code = SI_USER; info->si_pid = task_pid_vnr(current->parent); - info->si_uid = current->parent->uid; + info->si_uid = current->parent->cred->uid; } /* If the (new) signal is now blocked, requeue it. */ diff --git a/kernel/sys.c b/kernel/sys.c index ed5c29c..5d81f07 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -117,7 +117,9 @@ static int set_one_prio(struct task_struct *p, int niceval, int error) uid_t euid = current_euid(); int no_nice; - if (p->uid != euid && p->euid != euid && !capable(CAP_SYS_NICE)) { + if (p->cred->uid != euid && + p->cred->euid != euid && + !capable(CAP_SYS_NICE)) { error = -EPERM; goto out; } @@ -174,7 +176,7 @@ asmlinkage long sys_setpriority(int which, int who, int niceval) } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case PRIO_USER: - user = current->user; + user = current->cred->user; if (!who) who = current_uid(); else @@ -182,7 +184,7 @@ asmlinkage long sys_setpriority(int which, int who, int niceval) goto out_unlock; /* No processes for this user */ do_each_thread(g, p) - if (p->uid == who) + if (p->cred->uid == who) error = set_one_prio(p, niceval, error); while_each_thread(g, p); if (who != current_uid()) @@ -236,7 +238,7 @@ asmlinkage long sys_getpriority(int which, int who) } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case PRIO_USER: - user = current->user; + user = current->cred->user; if (!who) who = current_uid(); else @@ -244,7 +246,7 @@ asmlinkage long sys_getpriority(int which, int who) goto out_unlock; /* No processes for this user */ do_each_thread(g, p) - if (p->uid == who) { + if (p->cred->uid == who) { niceval = 20 - task_nice(p); if (niceval > retval) retval = niceval; @@ -472,8 +474,9 @@ void ctrl_alt_del(void) */ asmlinkage long sys_setregid(gid_t rgid, gid_t egid) { - int old_rgid = current->gid; - int old_egid = current->egid; + struct cred *cred = current->cred; + int old_rgid = cred->gid; + int old_egid = cred->egid; int new_rgid = old_rgid; int new_egid = old_egid; int retval; @@ -484,7 +487,7 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid) if (rgid != (gid_t) -1) { if ((old_rgid == rgid) || - (current->egid==rgid) || + (cred->egid == rgid) || capable(CAP_SETGID)) new_rgid = rgid; else @@ -492,8 +495,8 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid) } if (egid != (gid_t) -1) { if ((old_rgid == egid) || - (current->egid == egid) || - (current->sgid == egid) || + (cred->egid == egid) || + (cred->sgid == egid) || capable(CAP_SETGID)) new_egid = egid; else @@ -505,10 +508,10 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid) } if (rgid != (gid_t) -1 || (egid != (gid_t) -1 && egid != old_rgid)) - current->sgid = new_egid; - current->fsgid = new_egid; - current->egid = new_egid; - current->gid = new_rgid; + cred->sgid = new_egid; + cred->fsgid = new_egid; + cred->egid = new_egid; + cred->gid = new_rgid; key_fsgid_changed(current); proc_id_connector(current, PROC_EVENT_GID); return 0; @@ -521,7 +524,8 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid) */ asmlinkage long sys_setgid(gid_t gid) { - int old_egid = current->egid; + struct cred *cred = current->cred; + int old_egid = cred->egid; int retval; retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID); @@ -533,13 +537,13 @@ asmlinkage long sys_setgid(gid_t gid) set_dumpable(current->mm, suid_dumpable); smp_wmb(); } - current->gid = current->egid = current->sgid = current->fsgid = gid; - } else if ((gid == current->gid) || (gid == current->sgid)) { + cred->gid = cred->egid = cred->sgid = cred->fsgid = gid; + } else if ((gid == cred->gid) || (gid == cred->sgid)) { if (old_egid != gid) { set_dumpable(current->mm, suid_dumpable); smp_wmb(); } - current->egid = current->fsgid = gid; + cred->egid = cred->fsgid = gid; } else return -EPERM; @@ -570,7 +574,7 @@ static int set_user(uid_t new_ruid, int dumpclear) set_dumpable(current->mm, suid_dumpable); smp_wmb(); } - current->uid = new_ruid; + current->cred->uid = new_ruid; return 0; } @@ -591,6 +595,7 @@ static int set_user(uid_t new_ruid, int dumpclear) */ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) { + struct cred *cred = current->cred; int old_ruid, old_euid, old_suid, new_ruid, new_euid; int retval; @@ -598,14 +603,14 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) if (retval) return retval; - new_ruid = old_ruid = current->uid; - new_euid = old_euid = current->euid; - old_suid = current->suid; + new_ruid = old_ruid = cred->uid; + new_euid = old_euid = cred->euid; + old_suid = cred->suid; if (ruid != (uid_t) -1) { new_ruid = ruid; if ((old_ruid != ruid) && - (current->euid != ruid) && + (cred->euid != ruid) && !capable(CAP_SETUID)) return -EPERM; } @@ -613,8 +618,8 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) if (euid != (uid_t) -1) { new_euid = euid; if ((old_ruid != euid) && - (current->euid != euid) && - (current->suid != euid) && + (cred->euid != euid) && + (cred->suid != euid) && !capable(CAP_SETUID)) return -EPERM; } @@ -626,11 +631,11 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) set_dumpable(current->mm, suid_dumpable); smp_wmb(); } - current->fsuid = current->euid = new_euid; + cred->fsuid = cred->euid = new_euid; if (ruid != (uid_t) -1 || (euid != (uid_t) -1 && euid != old_ruid)) - current->suid = current->euid; - current->fsuid = current->euid; + cred->suid = cred->euid; + cred->fsuid = cred->euid; key_fsuid_changed(current); proc_id_connector(current, PROC_EVENT_UID); @@ -653,7 +658,8 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) */ asmlinkage long sys_setuid(uid_t uid) { - int old_euid = current->euid; + struct cred *cred = current->cred; + int old_euid = cred->euid; int old_ruid, old_suid, new_suid; int retval; @@ -661,23 +667,23 @@ asmlinkage long sys_setuid(uid_t uid) if (retval) return retval; - old_ruid = current->uid; - old_suid = current->suid; + old_ruid = cred->uid; + old_suid = cred->suid; new_suid = old_suid; if (capable(CAP_SETUID)) { if (uid != old_ruid && set_user(uid, old_euid != uid) < 0) return -EAGAIN; new_suid = uid; - } else if ((uid != current->uid) && (uid != new_suid)) + } else if ((uid != cred->uid) && (uid != new_suid)) return -EPERM; if (old_euid != uid) { set_dumpable(current->mm, suid_dumpable); smp_wmb(); } - current->fsuid = current->euid = uid; - current->suid = new_suid; + cred->fsuid = cred->euid = uid; + cred->suid = new_suid; key_fsuid_changed(current); proc_id_connector(current, PROC_EVENT_UID); @@ -692,9 +698,10 @@ asmlinkage long sys_setuid(uid_t uid) */ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) { - int old_ruid = current->uid; - int old_euid = current->euid; - int old_suid = current->suid; + struct cred *cred = current->cred; + int old_ruid = cred->uid; + int old_euid = cred->euid; + int old_suid = cred->suid; int retval; retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES); @@ -702,30 +709,31 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) return retval; if (!capable(CAP_SETUID)) { - if ((ruid != (uid_t) -1) && (ruid != current->uid) && - (ruid != current->euid) && (ruid != current->suid)) + if ((ruid != (uid_t) -1) && (ruid != cred->uid) && + (ruid != cred->euid) && (ruid != cred->suid)) return -EPERM; - if ((euid != (uid_t) -1) && (euid != current->uid) && - (euid != current->euid) && (euid != current->suid)) + if ((euid != (uid_t) -1) && (euid != cred->uid) && + (euid != cred->euid) && (euid != cred->suid)) return -EPERM; - if ((suid != (uid_t) -1) && (suid != current->uid) && - (suid != current->euid) && (suid != current->suid)) + if ((suid != (uid_t) -1) && (suid != cred->uid) && + (suid != cred->euid) && (suid != cred->suid)) return -EPERM; } if (ruid != (uid_t) -1) { - if (ruid != current->uid && set_user(ruid, euid != current->euid) < 0) + if (ruid != cred->uid && + set_user(ruid, euid != cred->euid) < 0) return -EAGAIN; } if (euid != (uid_t) -1) { - if (euid != current->euid) { + if (euid != cred->euid) { set_dumpable(current->mm, suid_dumpable); smp_wmb(); } - current->euid = euid; + cred->euid = euid; } - current->fsuid = current->euid; + cred->fsuid = cred->euid; if (suid != (uid_t) -1) - current->suid = suid; + cred->suid = suid; key_fsuid_changed(current); proc_id_connector(current, PROC_EVENT_UID); @@ -735,11 +743,12 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid) { + struct cred *cred = current->cred; int retval; - if (!(retval = put_user(current->uid, ruid)) && - !(retval = put_user(current->euid, euid))) - retval = put_user(current->suid, suid); + if (!(retval = put_user(cred->uid, ruid)) && + !(retval = put_user(cred->euid, euid))) + retval = put_user(cred->suid, suid); return retval; } @@ -749,6 +758,7 @@ asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __us */ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) { + struct cred *cred = current->cred; int retval; retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES); @@ -756,28 +766,28 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) return retval; if (!capable(CAP_SETGID)) { - if ((rgid != (gid_t) -1) && (rgid != current->gid) && - (rgid != current->egid) && (rgid != current->sgid)) + if ((rgid != (gid_t) -1) && (rgid != cred->gid) && + (rgid != cred->egid) && (rgid != cred->sgid)) return -EPERM; - if ((egid != (gid_t) -1) && (egid != current->gid) && - (egid != current->egid) && (egid != current->sgid)) + if ((egid != (gid_t) -1) && (egid != cred->gid) && + (egid != cred->egid) && (egid != cred->sgid)) return -EPERM; - if ((sgid != (gid_t) -1) && (sgid != current->gid) && - (sgid != current->egid) && (sgid != current->sgid)) + if ((sgid != (gid_t) -1) && (sgid != cred->gid) && + (sgid != cred->egid) && (sgid != cred->sgid)) return -EPERM; } if (egid != (gid_t) -1) { - if (egid != current->egid) { + if (egid != cred->egid) { set_dumpable(current->mm, suid_dumpable); smp_wmb(); } - current->egid = egid; + cred->egid = egid; } - current->fsgid = current->egid; + cred->fsgid = cred->egid; if (rgid != (gid_t) -1) - current->gid = rgid; + cred->gid = rgid; if (sgid != (gid_t) -1) - current->sgid = sgid; + cred->sgid = sgid; key_fsgid_changed(current); proc_id_connector(current, PROC_EVENT_GID); @@ -786,11 +796,12 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid) { + struct cred *cred = current->cred; int retval; - if (!(retval = put_user(current->gid, rgid)) && - !(retval = put_user(current->egid, egid))) - retval = put_user(current->sgid, sgid); + if (!(retval = put_user(cred->gid, rgid)) && + !(retval = put_user(cred->egid, egid))) + retval = put_user(cred->sgid, sgid); return retval; } @@ -804,20 +815,21 @@ asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __us */ asmlinkage long sys_setfsuid(uid_t uid) { + struct cred *cred = current->cred; int old_fsuid; - old_fsuid = current->fsuid; + old_fsuid = cred->fsuid; if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS)) return old_fsuid; - if (uid == current->uid || uid == current->euid || - uid == current->suid || uid == current->fsuid || + if (uid == cred->uid || uid == cred->euid || + uid == cred->suid || uid == cred->fsuid || capable(CAP_SETUID)) { if (uid != old_fsuid) { set_dumpable(current->mm, suid_dumpable); smp_wmb(); } - current->fsuid = uid; + cred->fsuid = uid; } key_fsuid_changed(current); @@ -833,20 +845,21 @@ asmlinkage long sys_setfsuid(uid_t uid) */ asmlinkage long sys_setfsgid(gid_t gid) { + struct cred *cred = current->cred; int old_fsgid; - old_fsgid = current->fsgid; + old_fsgid = cred->fsgid; if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS)) return old_fsgid; - if (gid == current->gid || gid == current->egid || - gid == current->sgid || gid == current->fsgid || + if (gid == cred->gid || gid == cred->egid || + gid == cred->sgid || gid == cred->fsgid || capable(CAP_SETGID)) { if (gid != old_fsgid) { set_dumpable(current->mm, suid_dumpable); smp_wmb(); } - current->fsgid = gid; + cred->fsgid = gid; key_fsgid_changed(current); proc_id_connector(current, PROC_EVENT_GID); } @@ -1208,8 +1221,15 @@ int groups_search(struct group_info *group_info, gid_t grp) return 0; } -/* validate and set current->group_info */ -int set_current_groups(struct group_info *group_info) +/** + * set_groups - Change a group subscription in a security record + * @sec: The security record to alter + * @group_info: The group list to impose + * + * Validate a group subscription and, if valid, impose it upon a task security + * record. + */ +int set_groups(struct cred *cred, struct group_info *group_info) { int retval; struct group_info *old_info; @@ -1221,20 +1241,34 @@ int set_current_groups(struct group_info *group_info) groups_sort(group_info); get_group_info(group_info); - task_lock(current); - old_info = current->group_info; - current->group_info = group_info; - task_unlock(current); + spin_lock(&cred->lock); + old_info = cred->group_info; + cred->group_info = group_info; + spin_unlock(&cred->lock); put_group_info(old_info); - return 0; } +EXPORT_SYMBOL(set_groups); + +/** + * set_current_groups - Change current's group subscription + * @group_info: The group list to impose + * + * Validate a group subscription and, if valid, impose it upon current's task + * security record. + */ +int set_current_groups(struct group_info *group_info) +{ + return set_groups(current->cred, group_info); +} + EXPORT_SYMBOL(set_current_groups); asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist) { + struct cred *cred = current->cred; int i = 0; /* @@ -1246,13 +1280,13 @@ asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist) return -EINVAL; /* no need to grab task_lock here; it cannot change */ - i = current->group_info->ngroups; + i = cred->group_info->ngroups; if (gidsetsize) { if (i > gidsetsize) { i = -EINVAL; goto out; } - if (groups_to_user(grouplist, current->group_info)) { + if (groups_to_user(grouplist, cred->group_info)) { i = -EFAULT; goto out; } @@ -1296,9 +1330,10 @@ asmlinkage long sys_setgroups(int gidsetsize, gid_t __user *grouplist) */ int in_group_p(gid_t grp) { + struct cred *cred = current->cred; int retval = 1; - if (grp != current->fsgid) - retval = groups_search(current->group_info, grp); + if (grp != cred->fsgid) + retval = groups_search(cred->group_info, grp); return retval; } @@ -1306,9 +1341,10 @@ EXPORT_SYMBOL(in_group_p); int in_egroup_p(gid_t grp) { + struct cred *cred = current->cred; int retval = 1; - if (grp != current->egid) - retval = groups_search(current->group_info, grp); + if (grp != cred->egid) + retval = groups_search(cred->group_info, grp); return retval; } @@ -1624,7 +1660,9 @@ asmlinkage long sys_umask(int mask) asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { - long error = 0; + struct task_struct *me = current; + unsigned char comm[sizeof(me->comm)]; + long error; if (security_task_prctl(option, arg2, arg3, arg4, arg5, &error)) return error; @@ -1635,39 +1673,41 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, error = -EINVAL; break; } - current->pdeath_signal = arg2; + me->pdeath_signal = arg2; + error = 0; break; case PR_GET_PDEATHSIG: - error = put_user(current->pdeath_signal, (int __user *)arg2); + error = put_user(me->pdeath_signal, (int __user *)arg2); break; case PR_GET_DUMPABLE: - error = get_dumpable(current->mm); + error = get_dumpable(me->mm); break; case PR_SET_DUMPABLE: if (arg2 < 0 || arg2 > 1) { error = -EINVAL; break; } - set_dumpable(current->mm, arg2); + set_dumpable(me->mm, arg2); + error = 0; break; case PR_SET_UNALIGN: - error = SET_UNALIGN_CTL(current, arg2); + error = SET_UNALIGN_CTL(me, arg2); break; case PR_GET_UNALIGN: - error = GET_UNALIGN_CTL(current, arg2); + error = GET_UNALIGN_CTL(me, arg2); break; case PR_SET_FPEMU: - error = SET_FPEMU_CTL(current, arg2); + error = SET_FPEMU_CTL(me, arg2); break; case PR_GET_FPEMU: - error = GET_FPEMU_CTL(current, arg2); + error = GET_FPEMU_CTL(me, arg2); break; case PR_SET_FPEXC: - error = SET_FPEXC_CTL(current, arg2); + error = SET_FPEXC_CTL(me, arg2); break; case PR_GET_FPEXC: - error = GET_FPEXC_CTL(current, arg2); + error = GET_FPEXC_CTL(me, arg2); break; case PR_GET_TIMING: error = PR_TIMING_STATISTICAL; @@ -1675,33 +1715,28 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, case PR_SET_TIMING: if (arg2 != PR_TIMING_STATISTICAL) error = -EINVAL; + else + error = 0; break; - case PR_SET_NAME: { - struct task_struct *me = current; - unsigned char ncomm[sizeof(me->comm)]; - - ncomm[sizeof(me->comm)-1] = 0; - if (strncpy_from_user(ncomm, (char __user *)arg2, - sizeof(me->comm)-1) < 0) + case PR_SET_NAME: + comm[sizeof(me->comm)-1] = 0; + if (strncpy_from_user(comm, (char __user *)arg2, + sizeof(me->comm) - 1) < 0) return -EFAULT; - set_task_comm(me, ncomm); + set_task_comm(me, comm); return 0; - } - case PR_GET_NAME: { - struct task_struct *me = current; - unsigned char tcomm[sizeof(me->comm)]; - - get_task_comm(tcomm, me); - if (copy_to_user((char __user *)arg2, tcomm, sizeof(tcomm))) + case PR_GET_NAME: + get_task_comm(comm, me); + if (copy_to_user((char __user *)arg2, comm, + sizeof(comm))) return -EFAULT; return 0; - } case PR_GET_ENDIAN: - error = GET_ENDIAN(current, arg2); + error = GET_ENDIAN(me, arg2); break; case PR_SET_ENDIAN: - error = SET_ENDIAN(current, arg2); + error = SET_ENDIAN(me, arg2); break; case PR_GET_SECCOMP: @@ -1725,6 +1760,7 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, current->default_timer_slack_ns; else current->timer_slack_ns = arg2; + error = 0; break; default: error = -EINVAL; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9f3b478..5c97c5b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -246,7 +246,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) memcpy(data->comm, tsk->comm, TASK_COMM_LEN); data->pid = tsk->pid; - data->uid = tsk->uid; + data->uid = task_uid(tsk); data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; data->policy = tsk->policy; data->rt_priority = tsk->rt_priority; diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 8ebcd85..6d1ed07 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -53,8 +53,8 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) stats->ac_flag |= AXSIG; stats->ac_nice = task_nice(tsk); stats->ac_sched = tsk->policy; - stats->ac_uid = tsk->uid; - stats->ac_gid = tsk->gid; + stats->ac_uid = tsk->cred->uid; + stats->ac_gid = tsk->cred->gid; stats->ac_pid = tsk->pid; rcu_read_lock(); stats->ac_ppid = pid_alive(tsk) ? diff --git a/kernel/uid16.c b/kernel/uid16.c index 3e41c16..71f07fc 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -86,9 +86,9 @@ asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid, { int retval; - if (!(retval = put_user(high2lowuid(current->uid), ruid)) && - !(retval = put_user(high2lowuid(current->euid), euid))) - retval = put_user(high2lowuid(current->suid), suid); + if (!(retval = put_user(high2lowuid(current->cred->uid), ruid)) && + !(retval = put_user(high2lowuid(current->cred->euid), euid))) + retval = put_user(high2lowuid(current->cred->suid), suid); return retval; } @@ -106,9 +106,9 @@ asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid, { int retval; - if (!(retval = put_user(high2lowgid(current->gid), rgid)) && - !(retval = put_user(high2lowgid(current->egid), egid))) - retval = put_user(high2lowgid(current->sgid), sgid); + if (!(retval = put_user(high2lowgid(current->cred->gid), rgid)) && + !(retval = put_user(high2lowgid(current->cred->egid), egid))) + retval = put_user(high2lowgid(current->cred->sgid), sgid); return retval; } @@ -166,20 +166,20 @@ asmlinkage long sys_getgroups16(int gidsetsize, old_gid_t __user *grouplist) if (gidsetsize < 0) return -EINVAL; - get_group_info(current->group_info); - i = current->group_info->ngroups; + get_group_info(current->cred->group_info); + i = current->cred->group_info->ngroups; if (gidsetsize) { if (i > gidsetsize) { i = -EINVAL; goto out; } - if (groups16_to_user(grouplist, current->group_info)) { + if (groups16_to_user(grouplist, current->cred->group_info)) { i = -EFAULT; goto out; } } out: - put_group_info(current->group_info); + put_group_info(current->cred->group_info); return i; } @@ -210,20 +210,20 @@ asmlinkage long sys_setgroups16(int gidsetsize, old_gid_t __user *grouplist) asmlinkage long sys_getuid16(void) { - return high2lowuid(current->uid); + return high2lowuid(current->cred->uid); } asmlinkage long sys_geteuid16(void) { - return high2lowuid(current->euid); + return high2lowuid(current->cred->euid); } asmlinkage long sys_getgid16(void) { - return high2lowgid(current->gid); + return high2lowgid(current->cred->gid); } asmlinkage long sys_getegid16(void) { - return high2lowgid(current->egid); + return high2lowgid(current->cred->egid); } diff --git a/kernel/user.c b/kernel/user.c index 39d6159..104d22a 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -457,11 +457,11 @@ void switch_uid(struct user_struct *new_user) * cheaply with the new uid cache, so if it matters * we should be checking for it. -DaveM */ - old_user = current->user; + old_user = current->cred->user; atomic_inc(&new_user->processes); atomic_dec(&old_user->processes); switch_uid_keyring(new_user); - current->user = new_user; + current->cred->user = new_user; sched_switch_user(current); /* -- cgit v1.1 From f1752eec6145c97163dbce62d17cf5d928e28a27 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:17 +1100 Subject: CRED: Detach the credentials from task_struct Detach the credentials from task_struct, duplicating them in copy_process() and releasing them in __put_task_struct(). Signed-off-by: David Howells Acked-by: James Morris Acked-by: Serge Hallyn Signed-off-by: James Morris --- kernel/Makefile | 2 +- kernel/cred.c | 96 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/fork.c | 24 ++++----------- 3 files changed, 103 insertions(+), 19 deletions(-) create mode 100644 kernel/cred.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 9a3ec66..5a6a612 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -9,7 +9,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ - notifier.o ksysfs.o pm_qos_params.o sched_clock.o + notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o CFLAGS_REMOVE_sched.o = -mno-spe diff --git a/kernel/cred.c b/kernel/cred.c new file mode 100644 index 0000000..833244a --- /dev/null +++ b/kernel/cred.c @@ -0,0 +1,96 @@ +/* Task credentials management + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ +#include +#include +#include +#include +#include +#include +#include + +/* + * The initial credentials for the initial task + */ +struct cred init_cred = { + .usage = ATOMIC_INIT(3), + .securebits = SECUREBITS_DEFAULT, + .cap_inheritable = CAP_INIT_INH_SET, + .cap_permitted = CAP_FULL_SET, + .cap_effective = CAP_INIT_EFF_SET, + .cap_bset = CAP_INIT_BSET, + .user = INIT_USER, + .group_info = &init_groups, +}; + +/* + * The RCU callback to actually dispose of a set of credentials + */ +static void put_cred_rcu(struct rcu_head *rcu) +{ + struct cred *cred = container_of(rcu, struct cred, rcu); + + BUG_ON(atomic_read(&cred->usage) != 0); + + key_put(cred->thread_keyring); + key_put(cred->request_key_auth); + put_group_info(cred->group_info); + free_uid(cred->user); + security_cred_free(cred); + kfree(cred); +} + +/** + * __put_cred - Destroy a set of credentials + * @sec: The record to release + * + * Destroy a set of credentials on which no references remain. + */ +void __put_cred(struct cred *cred) +{ + call_rcu(&cred->rcu, put_cred_rcu); +} +EXPORT_SYMBOL(__put_cred); + +/* + * Copy credentials for the new process created by fork() + */ +int copy_creds(struct task_struct *p, unsigned long clone_flags) +{ + struct cred *pcred; + int ret; + + pcred = kmemdup(p->cred, sizeof(*p->cred), GFP_KERNEL); + if (!pcred) + return -ENOMEM; + +#ifdef CONFIG_SECURITY + pcred->security = NULL; +#endif + + ret = security_cred_alloc(pcred); + if (ret < 0) { + kfree(pcred); + return ret; + } + + atomic_set(&pcred->usage, 1); + get_group_info(pcred->group_info); + get_uid(pcred->user); + key_get(pcred->thread_keyring); + key_get(pcred->request_key_auth); + + atomic_inc(&pcred->user->processes); + + /* RCU assignment is unneeded here as no-one can have accessed this + * pointer yet, barring us */ + p->cred = pcred; + return 0; +} diff --git a/kernel/fork.c b/kernel/fork.c index 81fdc77..c932e28 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -146,9 +146,7 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); - security_task_free(tsk); - free_uid(tsk->__temp_cred.user); - put_group_info(tsk->__temp_cred.group_info); + put_cred(tsk->cred); delayacct_tsk_free(tsk); if (!profile_handoff_task(tsk)) @@ -969,7 +967,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif - p->cred = &p->__temp_cred; retval = -EAGAIN; if (atomic_read(&p->cred->user->processes) >= p->signal->rlim[RLIMIT_NPROC].rlim_cur) { @@ -978,9 +975,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_free; } - atomic_inc(&p->cred->user->__count); - atomic_inc(&p->cred->user->processes); - get_group_info(p->cred->group_info); + retval = copy_creds(p, clone_flags); + if (retval < 0) + goto bad_fork_free; /* * If multiple threads are within copy_process(), then this check @@ -1035,9 +1032,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, do_posix_clock_monotonic_gettime(&p->start_time); p->real_start_time = p->start_time; monotonic_to_bootbased(&p->real_start_time); -#ifdef CONFIG_SECURITY - p->cred->security = NULL; -#endif p->io_context = NULL; p->audit_context = NULL; cgroup_fork(p); @@ -1082,10 +1076,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, /* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p, clone_flags); - if ((retval = security_task_alloc(p))) - goto bad_fork_cleanup_policy; if ((retval = audit_alloc(p))) - goto bad_fork_cleanup_security; + goto bad_fork_cleanup_policy; /* copy all the process information */ if ((retval = copy_semundo(clone_flags, p))) goto bad_fork_cleanup_audit; @@ -1284,8 +1276,6 @@ bad_fork_cleanup_semundo: exit_sem(p); bad_fork_cleanup_audit: audit_free(p); -bad_fork_cleanup_security: - security_task_free(p); bad_fork_cleanup_policy: #ifdef CONFIG_NUMA mpol_put(p->mempolicy); @@ -1298,9 +1288,7 @@ bad_fork_cleanup_cgroup: bad_fork_cleanup_put_domain: module_put(task_thread_info(p)->exec_domain->module); bad_fork_cleanup_count: - put_group_info(p->cred->group_info); - atomic_dec(&p->cred->user->processes); - free_uid(p->cred->user); + put_cred(p->cred); bad_fork_free: free_task(p); fork_out: -- cgit v1.1 From 86a264abe542cfececb4df129bc45a0338d8cdb9 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:18 +1100 Subject: CRED: Wrap current->cred and a few other accessors Wrap current->cred and a few other accessors to hide their actual implementation. Signed-off-by: David Howells Acked-by: James Morris Acked-by: Serge Hallyn Signed-off-by: James Morris --- kernel/sys.c | 59 +++++++++++++++++++++++++++++----------------------------- kernel/uid16.c | 31 +++++++++++++++--------------- 2 files changed, 45 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 5d81f07..c4d6b59 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -143,6 +143,7 @@ asmlinkage long sys_setpriority(int which, int who, int niceval) { struct task_struct *g, *p; struct user_struct *user; + const struct cred *cred = current_cred(); int error = -EINVAL; struct pid *pgrp; @@ -176,18 +177,18 @@ asmlinkage long sys_setpriority(int which, int who, int niceval) } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case PRIO_USER: - user = current->cred->user; + user = cred->user; if (!who) - who = current_uid(); - else - if (who != current_uid() && !(user = find_user(who))) - goto out_unlock; /* No processes for this user */ + who = cred->uid; + else if ((who != cred->uid) && + !(user = find_user(who))) + goto out_unlock; /* No processes for this user */ do_each_thread(g, p) - if (p->cred->uid == who) + if (__task_cred(p)->uid == who) error = set_one_prio(p, niceval, error); while_each_thread(g, p); - if (who != current_uid()) + if (who != cred->uid) free_uid(user); /* For find_user() */ break; } @@ -207,6 +208,7 @@ asmlinkage long sys_getpriority(int which, int who) { struct task_struct *g, *p; struct user_struct *user; + const struct cred *cred = current_cred(); long niceval, retval = -ESRCH; struct pid *pgrp; @@ -238,21 +240,21 @@ asmlinkage long sys_getpriority(int which, int who) } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case PRIO_USER: - user = current->cred->user; + user = (struct user_struct *) cred->user; if (!who) - who = current_uid(); - else - if (who != current_uid() && !(user = find_user(who))) - goto out_unlock; /* No processes for this user */ + who = cred->uid; + else if ((who != cred->uid) && + !(user = find_user(who))) + goto out_unlock; /* No processes for this user */ do_each_thread(g, p) - if (p->cred->uid == who) { + if (__task_cred(p)->uid == who) { niceval = 20 - task_nice(p); if (niceval > retval) retval = niceval; } while_each_thread(g, p); - if (who != current_uid()) + if (who != cred->uid) free_uid(user); /* for find_user() */ break; } @@ -743,11 +745,11 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid) { - struct cred *cred = current->cred; + const struct cred *cred = current_cred(); int retval; - if (!(retval = put_user(cred->uid, ruid)) && - !(retval = put_user(cred->euid, euid))) + if (!(retval = put_user(cred->uid, ruid)) && + !(retval = put_user(cred->euid, euid))) retval = put_user(cred->suid, suid); return retval; @@ -796,11 +798,11 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid) { - struct cred *cred = current->cred; + const struct cred *cred = current_cred(); int retval; - if (!(retval = put_user(cred->gid, rgid)) && - !(retval = put_user(cred->egid, egid))) + if (!(retval = put_user(cred->gid, rgid)) && + !(retval = put_user(cred->egid, egid))) retval = put_user(cred->sgid, sgid); return retval; @@ -1199,7 +1201,7 @@ static void groups_sort(struct group_info *group_info) } /* a simple bsearch */ -int groups_search(struct group_info *group_info, gid_t grp) +int groups_search(const struct group_info *group_info, gid_t grp) { unsigned int left, right; @@ -1268,13 +1270,8 @@ EXPORT_SYMBOL(set_current_groups); asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist) { - struct cred *cred = current->cred; - int i = 0; - - /* - * SMP: Nobody else can change our grouplist. Thus we are - * safe. - */ + const struct cred *cred = current_cred(); + int i; if (gidsetsize < 0) return -EINVAL; @@ -1330,8 +1327,9 @@ asmlinkage long sys_setgroups(int gidsetsize, gid_t __user *grouplist) */ int in_group_p(gid_t grp) { - struct cred *cred = current->cred; + const struct cred *cred = current_cred(); int retval = 1; + if (grp != cred->fsgid) retval = groups_search(cred->group_info, grp); return retval; @@ -1341,8 +1339,9 @@ EXPORT_SYMBOL(in_group_p); int in_egroup_p(gid_t grp) { - struct cred *cred = current->cred; + const struct cred *cred = current_cred(); int retval = 1; + if (grp != cred->egid) retval = groups_search(cred->group_info, grp); return retval; diff --git a/kernel/uid16.c b/kernel/uid16.c index 71f07fc..2460c31 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -84,11 +84,12 @@ asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid) asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid, old_uid_t __user *suid) { + const struct cred *cred = current_cred(); int retval; - if (!(retval = put_user(high2lowuid(current->cred->uid), ruid)) && - !(retval = put_user(high2lowuid(current->cred->euid), euid))) - retval = put_user(high2lowuid(current->cred->suid), suid); + if (!(retval = put_user(high2lowuid(cred->uid), ruid)) && + !(retval = put_user(high2lowuid(cred->euid), euid))) + retval = put_user(high2lowuid(cred->suid), suid); return retval; } @@ -104,11 +105,12 @@ asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid) asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid, old_gid_t __user *sgid) { + const struct cred *cred = current_cred(); int retval; - if (!(retval = put_user(high2lowgid(current->cred->gid), rgid)) && - !(retval = put_user(high2lowgid(current->cred->egid), egid))) - retval = put_user(high2lowgid(current->cred->sgid), sgid); + if (!(retval = put_user(high2lowgid(cred->gid), rgid)) && + !(retval = put_user(high2lowgid(cred->egid), egid))) + retval = put_user(high2lowgid(cred->sgid), sgid); return retval; } @@ -161,25 +163,24 @@ static int groups16_from_user(struct group_info *group_info, asmlinkage long sys_getgroups16(int gidsetsize, old_gid_t __user *grouplist) { - int i = 0; + const struct cred *cred = current_cred(); + int i; if (gidsetsize < 0) return -EINVAL; - get_group_info(current->cred->group_info); - i = current->cred->group_info->ngroups; + i = cred->group_info->ngroups; if (gidsetsize) { if (i > gidsetsize) { i = -EINVAL; goto out; } - if (groups16_to_user(grouplist, current->cred->group_info)) { + if (groups16_to_user(grouplist, cred->group_info)) { i = -EFAULT; goto out; } } out: - put_group_info(current->cred->group_info); return i; } @@ -210,20 +211,20 @@ asmlinkage long sys_setgroups16(int gidsetsize, old_gid_t __user *grouplist) asmlinkage long sys_getuid16(void) { - return high2lowuid(current->cred->uid); + return high2lowuid(current_uid()); } asmlinkage long sys_geteuid16(void) { - return high2lowuid(current->cred->euid); + return high2lowuid(current_euid()); } asmlinkage long sys_getgid16(void) { - return high2lowgid(current->cred->gid); + return high2lowgid(current_gid()); } asmlinkage long sys_getegid16(void) { - return high2lowgid(current->cred->egid); + return high2lowgid(current_egid()); } -- cgit v1.1 From c69e8d9c01db2adc503464993c358901c9af9de4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:19 +1100 Subject: CRED: Use RCU to access another task's creds and to release a task's own creds Use RCU to access another task's creds and to release a task's own creds. This means that it will be possible for the credentials of a task to be replaced without another task (a) requiring a full lock to read them, and (b) seeing deallocated memory. Signed-off-by: David Howells Acked-by: James Morris Acked-by: Serge Hallyn Signed-off-by: James Morris --- kernel/auditsc.c | 33 +++++++++++++++++++-------------- kernel/cgroup.c | 16 ++++++++-------- kernel/exit.c | 14 +++++++++----- kernel/futex.c | 22 ++++++++++++++-------- kernel/futex_compat.c | 7 ++++--- kernel/ptrace.c | 22 +++++++++++++--------- kernel/sched.c | 31 +++++++++++++++++++++---------- kernel/signal.c | 49 +++++++++++++++++++++++++++++++------------------ kernel/sys.c | 11 +++++++---- kernel/tsacct.c | 6 ++++-- 10 files changed, 130 insertions(+), 81 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 2febf51..ae8ef88 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -447,7 +447,7 @@ static int audit_filter_rules(struct task_struct *tsk, struct audit_names *name, enum audit_state *state) { - struct cred *cred = tsk->cred; + const struct cred *cred = get_task_cred(tsk); int i, j, need_sid = 1; u32 sid; @@ -642,8 +642,10 @@ static int audit_filter_rules(struct task_struct *tsk, break; } - if (!result) + if (!result) { + put_cred(cred); return 0; + } } if (rule->filterkey && ctx) ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC); @@ -651,6 +653,7 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_NEVER: *state = AUDIT_DISABLED; break; case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; } + put_cred(cred); return 1; } @@ -1229,7 +1232,7 @@ static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) { - struct cred *cred = tsk->cred; + const struct cred *cred; int i, call_panic = 0; struct audit_buffer *ab; struct audit_aux_data *aux; @@ -1239,13 +1242,14 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts context->pid = tsk->pid; if (!context->ppid) context->ppid = sys_getppid(); - context->uid = cred->uid; - context->gid = cred->gid; - context->euid = cred->euid; - context->suid = cred->suid; + cred = current_cred(); + context->uid = cred->uid; + context->gid = cred->gid; + context->euid = cred->euid; + context->suid = cred->suid; context->fsuid = cred->fsuid; - context->egid = cred->egid; - context->sgid = cred->sgid; + context->egid = cred->egid; + context->sgid = cred->sgid; context->fsgid = cred->fsgid; context->personality = tsk->personality; @@ -2088,7 +2092,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid) audit_log_format(ab, "login pid=%d uid=%u " "old auid=%u new auid=%u" " old ses=%u new ses=%u", - task->pid, task->cred->uid, + task->pid, task_uid(task), task->loginuid, loginuid, task->sessionid, sessionid); audit_log_end(ab); @@ -2471,7 +2475,7 @@ void __audit_ptrace(struct task_struct *t) context->target_pid = t->pid; context->target_auid = audit_get_loginuid(t); - context->target_uid = t->cred->uid; + context->target_uid = task_uid(t); context->target_sessionid = audit_get_sessionid(t); security_task_getsecid(t, &context->target_sid); memcpy(context->target_comm, t->comm, TASK_COMM_LEN); @@ -2490,6 +2494,7 @@ int __audit_signal_info(int sig, struct task_struct *t) struct audit_aux_data_pids *axp; struct task_struct *tsk = current; struct audit_context *ctx = tsk->audit_context; + uid_t uid = current_uid(), t_uid = task_uid(t); if (audit_pid && t->tgid == audit_pid) { if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { @@ -2497,7 +2502,7 @@ int __audit_signal_info(int sig, struct task_struct *t) if (tsk->loginuid != -1) audit_sig_uid = tsk->loginuid; else - audit_sig_uid = tsk->cred->uid; + audit_sig_uid = uid; security_task_getsecid(tsk, &audit_sig_sid); } if (!audit_signals || audit_dummy_context()) @@ -2509,7 +2514,7 @@ int __audit_signal_info(int sig, struct task_struct *t) if (!ctx->target_pid) { ctx->target_pid = t->tgid; ctx->target_auid = audit_get_loginuid(t); - ctx->target_uid = t->cred->uid; + ctx->target_uid = t_uid; ctx->target_sessionid = audit_get_sessionid(t); security_task_getsecid(t, &ctx->target_sid); memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN); @@ -2530,7 +2535,7 @@ int __audit_signal_info(int sig, struct task_struct *t) axp->target_pid[axp->pid_count] = t->tgid; axp->target_auid[axp->pid_count] = audit_get_loginuid(t); - axp->target_uid[axp->pid_count] = t->cred->uid; + axp->target_uid[axp->pid_count] = t_uid; axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t); security_task_getsecid(t, &axp->target_sid[axp->pid_count]); memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e210526..a512a75 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1279,7 +1279,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) static int attach_task_by_pid(struct cgroup *cgrp, u64 pid) { struct task_struct *tsk; - uid_t euid; + const struct cred *cred = current_cred(), *tcred; int ret; if (pid) { @@ -1289,16 +1289,16 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid) rcu_read_unlock(); return -ESRCH; } - get_task_struct(tsk); - rcu_read_unlock(); - euid = current_euid(); - if (euid && - euid != tsk->cred->uid && - euid != tsk->cred->suid) { - put_task_struct(tsk); + tcred = __task_cred(tsk); + if (cred->euid && + cred->euid != tcred->uid && + cred->euid != tcred->suid) { + rcu_read_unlock(); return -EACCES; } + get_task_struct(tsk); + rcu_read_unlock(); } else { tsk = current; get_task_struct(tsk); diff --git a/kernel/exit.c b/kernel/exit.c index e0f6e18..bbc2253 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -160,7 +160,10 @@ void release_task(struct task_struct * p) int zap_leader; repeat: tracehook_prepare_release_task(p); - atomic_dec(&p->cred->user->processes); + /* don't need to get the RCU readlock here - the process is dead and + * can't be modifying its own credentials */ + atomic_dec(&__task_cred(p)->user->processes); + proc_flush_task(p); write_lock_irq(&tasklist_lock); tracehook_finish_release_task(p); @@ -1267,12 +1270,12 @@ static int wait_task_zombie(struct task_struct *p, int options, unsigned long state; int retval, status, traced; pid_t pid = task_pid_vnr(p); + uid_t uid = __task_cred(p)->uid; if (!likely(options & WEXITED)) return 0; if (unlikely(options & WNOWAIT)) { - uid_t uid = p->cred->uid; int exit_code = p->exit_code; int why, status; @@ -1393,7 +1396,7 @@ static int wait_task_zombie(struct task_struct *p, int options, if (!retval && infop) retval = put_user(pid, &infop->si_pid); if (!retval && infop) - retval = put_user(p->cred->uid, &infop->si_uid); + retval = put_user(uid, &infop->si_uid); if (!retval) retval = pid; @@ -1458,7 +1461,8 @@ static int wait_task_stopped(int ptrace, struct task_struct *p, if (!unlikely(options & WNOWAIT)) p->exit_code = 0; - uid = p->cred->uid; + /* don't need the RCU readlock here as we're holding a spinlock */ + uid = __task_cred(p)->uid; unlock_sig: spin_unlock_irq(&p->sighand->siglock); if (!exit_code) @@ -1532,10 +1536,10 @@ static int wait_task_continued(struct task_struct *p, int options, } if (!unlikely(options & WNOWAIT)) p->signal->flags &= ~SIGNAL_STOP_CONTINUED; + uid = __task_cred(p)->uid; spin_unlock_irq(&p->sighand->siglock); pid = task_pid_vnr(p); - uid = p->cred->uid; get_task_struct(p); read_unlock(&tasklist_lock); diff --git a/kernel/futex.c b/kernel/futex.c index 28421d8..4fe790e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -439,15 +439,20 @@ static void free_pi_state(struct futex_pi_state *pi_state) static struct task_struct * futex_find_get_task(pid_t pid) { struct task_struct *p; - uid_t euid = current_euid(); + const struct cred *cred = current_cred(), *pcred; rcu_read_lock(); p = find_task_by_vpid(pid); - if (!p || (euid != p->cred->euid && - euid != p->cred->uid)) + if (!p) { p = ERR_PTR(-ESRCH); - else - get_task_struct(p); + } else { + pcred = __task_cred(p); + if (cred->euid != pcred->euid && + cred->euid != pcred->uid) + p = ERR_PTR(-ESRCH); + else + get_task_struct(p); + } rcu_read_unlock(); @@ -1831,7 +1836,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr, { struct robust_list_head __user *head; unsigned long ret; - uid_t euid = current_euid(); + const struct cred *cred = current_cred(), *pcred; if (!futex_cmpxchg_enabled) return -ENOSYS; @@ -1847,8 +1852,9 @@ sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr, if (!p) goto err_unlock; ret = -EPERM; - if (euid != p->cred->euid && - euid != p->cred->uid && + pcred = __task_cred(p); + if (cred->euid != pcred->euid && + cred->euid != pcred->uid && !capable(CAP_SYS_PTRACE)) goto err_unlock; head = p->robust_list; diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 2c3fd5e..d607a5b 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -135,7 +135,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, { struct compat_robust_list_head __user *head; unsigned long ret; - uid_t euid = current_euid(); + const struct cred *cred = current_cred(), *pcred; if (!futex_cmpxchg_enabled) return -ENOSYS; @@ -151,8 +151,9 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, if (!p) goto err_unlock; ret = -EPERM; - if (euid != p->cred->euid && - euid != p->cred->uid && + pcred = __task_cred(p); + if (cred->euid != pcred->euid && + cred->euid != pcred->uid && !capable(CAP_SYS_PTRACE)) goto err_unlock; head = p->compat_robust_list; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 49849d1..b9d5f4e 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -115,7 +115,7 @@ int ptrace_check_attach(struct task_struct *child, int kill) int __ptrace_may_access(struct task_struct *task, unsigned int mode) { - struct cred *cred = current->cred, *tcred = task->cred; + const struct cred *cred = current_cred(), *tcred; /* May we inspect the given task? * This check is used both for attaching with ptrace @@ -125,19 +125,23 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode) * because setting up the necessary parent/child relationship * or halting the specified task is impossible. */ - uid_t uid = cred->uid; - gid_t gid = cred->gid; int dumpable = 0; /* Don't let security modules deny introspection */ if (task == current) return 0; - if ((uid != tcred->euid || - uid != tcred->suid || - uid != tcred->uid || - gid != tcred->egid || - gid != tcred->sgid || - gid != tcred->gid) && !capable(CAP_SYS_PTRACE)) + rcu_read_lock(); + tcred = __task_cred(task); + if ((cred->uid != tcred->euid || + cred->uid != tcred->suid || + cred->uid != tcred->uid || + cred->gid != tcred->egid || + cred->gid != tcred->sgid || + cred->gid != tcred->gid) && + !capable(CAP_SYS_PTRACE)) { + rcu_read_unlock(); return -EPERM; + } + rcu_read_unlock(); smp_rmb(); if (task->mm) dumpable = get_dumpable(task->mm); diff --git a/kernel/sched.c b/kernel/sched.c index 733c59e..92992e2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -345,7 +345,9 @@ static inline struct task_group *task_group(struct task_struct *p) struct task_group *tg; #ifdef CONFIG_USER_SCHED - tg = p->cred->user->tg; + rcu_read_lock(); + tg = __task_cred(p)->user->tg; + rcu_read_unlock(); #elif defined(CONFIG_CGROUP_SCHED) tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), struct task_group, css); @@ -5121,6 +5123,22 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) set_load_weight(p); } +/* + * check the target process has a UID that matches the current process's + */ +static bool check_same_owner(struct task_struct *p) +{ + const struct cred *cred = current_cred(), *pcred; + bool match; + + rcu_read_lock(); + pcred = __task_cred(p); + match = (cred->euid == pcred->euid || + cred->euid == pcred->uid); + rcu_read_unlock(); + return match; +} + static int __sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param, bool user) { @@ -5128,7 +5146,6 @@ static int __sched_setscheduler(struct task_struct *p, int policy, unsigned long flags; const struct sched_class *prev_class = p->sched_class; struct rq *rq; - uid_t euid; /* may grab non-irq protected spin_locks */ BUG_ON(in_interrupt()); @@ -5181,9 +5198,7 @@ recheck: return -EPERM; /* can't change other user's priorities */ - euid = current_euid(); - if (euid != p->cred->euid && - euid != p->cred->uid) + if (!check_same_owner(p)) return -EPERM; } @@ -5394,7 +5409,6 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) cpumask_t cpus_allowed; cpumask_t new_mask = *in_mask; struct task_struct *p; - uid_t euid; int retval; get_online_cpus(); @@ -5415,11 +5429,8 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) get_task_struct(p); read_unlock(&tasklist_lock); - euid = current_euid(); retval = -EPERM; - if (euid != p->cred->euid && - euid != p->cred->uid && - !capable(CAP_SYS_NICE)) + if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) goto out_unlock; retval = security_task_setscheduler(p, 0, NULL); diff --git a/kernel/signal.c b/kernel/signal.c index 80e8a648..8498912 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -177,6 +177,11 @@ int next_signal(struct sigpending *pending, sigset_t *mask) return sig; } +/* + * allocate a new signal queue record + * - this may be called without locks if and only if t == current, otherwise an + * appopriate lock must be held to protect t's user_struct + */ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, int override_rlimit) { @@ -184,11 +189,12 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, struct user_struct *user; /* - * In order to avoid problems with "switch_user()", we want to make - * sure that the compiler doesn't re-load "t->user" + * We won't get problems with the target's UID changing under us + * because changing it requires RCU be used, and if t != current, the + * caller must be holding the RCU readlock (by way of a spinlock) and + * we use RCU protection here */ - user = t->cred->user; - barrier(); + user = __task_cred(t)->user; atomic_inc(&user->sigpending); if (override_rlimit || atomic_read(&user->sigpending) <= @@ -562,12 +568,13 @@ static int rm_from_queue(unsigned long mask, struct sigpending *s) /* * Bad permissions for sending the signal + * - the caller must hold at least the RCU read lock */ static int check_kill_permission(int sig, struct siginfo *info, struct task_struct *t) { + const struct cred *cred = current_cred(), *tcred; struct pid *sid; - uid_t uid, euid; int error; if (!valid_signal(sig)) @@ -580,10 +587,11 @@ static int check_kill_permission(int sig, struct siginfo *info, if (error) return error; - uid = current_uid(); - euid = current_euid(); - if ((euid ^ t->cred->suid) && (euid ^ t->cred->uid) && - (uid ^ t->cred->suid) && (uid ^ t->cred->uid) && + tcred = __task_cred(t); + if ((cred->euid ^ tcred->suid) && + (cred->euid ^ tcred->uid) && + (cred->uid ^ tcred->suid) && + (cred->uid ^ tcred->uid) && !capable(CAP_KILL)) { switch (sig) { case SIGCONT: @@ -1011,6 +1019,10 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long return sighand; } +/* + * send signal info to all the members of a group + * - the caller must hold the RCU read lock at least + */ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) { unsigned long flags; @@ -1032,8 +1044,8 @@ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) /* * __kill_pgrp_info() sends a signal to a process group: this is what the tty * control characters do (^C, ^Z etc) + * - the caller must hold at least a readlock on tasklist_lock */ - int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp) { struct task_struct *p = NULL; @@ -1089,6 +1101,7 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid, { int ret = -EINVAL; struct task_struct *p; + const struct cred *pcred; if (!valid_signal(sig)) return ret; @@ -1099,9 +1112,11 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid, ret = -ESRCH; goto out_unlock; } - if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) - && (euid != p->cred->suid) && (euid != p->cred->uid) - && (uid != p->cred->suid) && (uid != p->cred->uid)) { + pcred = __task_cred(p); + if ((info == SEND_SIG_NOINFO || + (!is_si_special(info) && SI_FROMUSER(info))) && + euid != pcred->suid && euid != pcred->uid && + uid != pcred->suid && uid != pcred->uid) { ret = -EPERM; goto out_unlock; } @@ -1372,10 +1387,9 @@ int do_notify_parent(struct task_struct *tsk, int sig) */ rcu_read_lock(); info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); + info.si_uid = __task_cred(tsk)->uid; rcu_read_unlock(); - info.si_uid = tsk->cred->uid; - thread_group_cputime(tsk, &cputime); info.si_utime = cputime_to_jiffies(cputime.utime); info.si_stime = cputime_to_jiffies(cputime.stime); @@ -1443,10 +1457,9 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why) */ rcu_read_lock(); info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); + info.si_uid = __task_cred(tsk)->uid; rcu_read_unlock(); - info.si_uid = tsk->cred->uid; - info.si_utime = cputime_to_clock_t(tsk->utime); info.si_stime = cputime_to_clock_t(tsk->stime); @@ -1713,7 +1726,7 @@ static int ptrace_signal(int signr, siginfo_t *info, info->si_errno = 0; info->si_code = SI_USER; info->si_pid = task_pid_vnr(current->parent); - info->si_uid = current->parent->cred->uid; + info->si_uid = task_uid(current->parent); } /* If the (new) signal is now blocked, requeue it. */ diff --git a/kernel/sys.c b/kernel/sys.c index c4d6b59..ccc9eb7 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -112,14 +112,17 @@ EXPORT_SYMBOL(cad_pid); void (*pm_power_off_prepare)(void); +/* + * set the priority of a task + * - the caller must hold the RCU read lock + */ static int set_one_prio(struct task_struct *p, int niceval, int error) { - uid_t euid = current_euid(); + const struct cred *cred = current_cred(), *pcred = __task_cred(p); int no_nice; - if (p->cred->uid != euid && - p->cred->euid != euid && - !capable(CAP_SYS_NICE)) { + if (pcred->uid != cred->euid && + pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) { error = -EPERM; goto out; } diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 6d1ed07..2dc06ab 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -27,6 +27,7 @@ */ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) { + const struct cred *tcred; struct timespec uptime, ts; u64 ac_etime; @@ -53,10 +54,11 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) stats->ac_flag |= AXSIG; stats->ac_nice = task_nice(tsk); stats->ac_sched = tsk->policy; - stats->ac_uid = tsk->cred->uid; - stats->ac_gid = tsk->cred->gid; stats->ac_pid = tsk->pid; rcu_read_lock(); + tcred = __task_cred(tsk); + stats->ac_uid = tcred->uid; + stats->ac_gid = tcred->gid; stats->ac_ppid = pid_alive(tsk) ? rcu_dereference(tsk->real_parent)->tgid : 0; rcu_read_unlock(); -- cgit v1.1 From bb952bb98a7e479262c7eb25d5592545a3af147d Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:20 +1100 Subject: CRED: Separate per-task-group keyrings from signal_struct Separate per-task-group keyrings from signal_struct and dangle their anchor from the cred struct rather than the signal_struct. Signed-off-by: David Howells Reviewed-by: James Morris Signed-off-by: James Morris --- kernel/cred.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/fork.c | 7 ------- 2 files changed, 63 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index 833244a..ac73e36 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -17,6 +17,17 @@ #include /* + * The common credentials for the initial task's thread group + */ +#ifdef CONFIG_KEYS +static struct thread_group_cred init_tgcred = { + .usage = ATOMIC_INIT(2), + .tgid = 0, + .lock = SPIN_LOCK_UNLOCKED, +}; +#endif + +/* * The initial credentials for the initial task */ struct cred init_cred = { @@ -28,9 +39,42 @@ struct cred init_cred = { .cap_bset = CAP_INIT_BSET, .user = INIT_USER, .group_info = &init_groups, +#ifdef CONFIG_KEYS + .tgcred = &init_tgcred, +#endif }; /* + * Dispose of the shared task group credentials + */ +#ifdef CONFIG_KEYS +static void release_tgcred_rcu(struct rcu_head *rcu) +{ + struct thread_group_cred *tgcred = + container_of(rcu, struct thread_group_cred, rcu); + + BUG_ON(atomic_read(&tgcred->usage) != 0); + + key_put(tgcred->session_keyring); + key_put(tgcred->process_keyring); + kfree(tgcred); +} +#endif + +/* + * Release a set of thread group credentials. + */ +static void release_tgcred(struct cred *cred) +{ +#ifdef CONFIG_KEYS + struct thread_group_cred *tgcred = cred->tgcred; + + if (atomic_dec_and_test(&tgcred->usage)) + call_rcu(&tgcred->rcu, release_tgcred_rcu); +#endif +} + +/* * The RCU callback to actually dispose of a set of credentials */ static void put_cred_rcu(struct rcu_head *rcu) @@ -41,6 +85,7 @@ static void put_cred_rcu(struct rcu_head *rcu) key_put(cred->thread_keyring); key_put(cred->request_key_auth); + release_tgcred(cred); put_group_info(cred->group_info); free_uid(cred->user); security_cred_free(cred); @@ -71,12 +116,30 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) if (!pcred) return -ENOMEM; +#ifdef CONFIG_KEYS + if (clone_flags & CLONE_THREAD) { + atomic_inc(&pcred->tgcred->usage); + } else { + pcred->tgcred = kmalloc(sizeof(struct cred), GFP_KERNEL); + if (!pcred->tgcred) { + kfree(pcred); + return -ENOMEM; + } + atomic_set(&pcred->tgcred->usage, 1); + spin_lock_init(&pcred->tgcred->lock); + pcred->tgcred->process_keyring = NULL; + pcred->tgcred->session_keyring = + key_get(p->cred->tgcred->session_keyring); + } +#endif + #ifdef CONFIG_SECURITY pcred->security = NULL; #endif ret = security_cred_alloc(pcred); if (ret < 0) { + release_tgcred(pcred); kfree(pcred); return ret; } diff --git a/kernel/fork.c b/kernel/fork.c index c932e28..ded1972 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -802,12 +802,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) if (!sig) return -ENOMEM; - ret = copy_thread_group_keys(tsk); - if (ret < 0) { - kmem_cache_free(signal_cachep, sig); - return ret; - } - atomic_set(&sig->count, 1); atomic_set(&sig->live, 1); init_waitqueue_head(&sig->wait_chldexit); @@ -852,7 +846,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) void __cleanup_signal(struct signal_struct *sig) { thread_group_cputime_free(sig); - exit_thread_group_keys(sig); tty_kref_put(sig->tty); kmem_cache_free(signal_cachep, sig); } -- cgit v1.1 From 6cc88bc45ce8043171089c9592da223dfab91823 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:21 +1100 Subject: CRED: Rename is_single_threaded() to is_wq_single_threaded() Rename is_single_threaded() to is_wq_single_threaded() so that a new is_single_threaded() can be created that refers to tasks rather than waitqueues. Signed-off-by: David Howells Reviewed-by: James Morris Signed-off-by: James Morris --- kernel/workqueue.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f928f2a..f12ab5c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -84,21 +84,21 @@ static cpumask_t cpu_singlethread_map __read_mostly; static cpumask_t cpu_populated_map __read_mostly; /* If it's single threaded, it isn't in the list of workqueues. */ -static inline int is_single_threaded(struct workqueue_struct *wq) +static inline int is_wq_single_threaded(struct workqueue_struct *wq) { return wq->singlethread; } static const cpumask_t *wq_cpu_map(struct workqueue_struct *wq) { - return is_single_threaded(wq) + return is_wq_single_threaded(wq) ? &cpu_singlethread_map : &cpu_populated_map; } static struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu) { - if (unlikely(is_single_threaded(wq))) + if (unlikely(is_wq_single_threaded(wq))) cpu = singlethread_cpu; return per_cpu_ptr(wq->cpu_wq, cpu); } @@ -769,7 +769,7 @@ static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) { struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; struct workqueue_struct *wq = cwq->wq; - const char *fmt = is_single_threaded(wq) ? "%s" : "%s/%d"; + const char *fmt = is_wq_single_threaded(wq) ? "%s" : "%s/%d"; struct task_struct *p; p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu); -- cgit v1.1 From d84f4f992cbd76e8f39c488cf0c5d123843923b1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:23 +1100 Subject: CRED: Inaugurate COW credentials Inaugurate copy-on-write credentials management. This uses RCU to manage the credentials pointer in the task_struct with respect to accesses by other tasks. A process may only modify its own credentials, and so does not need locking to access or modify its own credentials. A mutex (cred_replace_mutex) is added to the task_struct to control the effect of PTRACE_ATTACHED on credential calculations, particularly with respect to execve(). With this patch, the contents of an active credentials struct may not be changed directly; rather a new set of credentials must be prepared, modified and committed using something like the following sequence of events: struct cred *new = prepare_creds(); int ret = blah(new); if (ret < 0) { abort_creds(new); return ret; } return commit_creds(new); There are some exceptions to this rule: the keyrings pointed to by the active credentials may be instantiated - keyrings violate the COW rule as managing COW keyrings is tricky, given that it is possible for a task to directly alter the keys in a keyring in use by another task. To help enforce this, various pointers to sets of credentials, such as those in the task_struct, are declared const. The purpose of this is compile-time discouragement of altering credentials through those pointers. Once a set of credentials has been made public through one of these pointers, it may not be modified, except under special circumstances: (1) Its reference count may incremented and decremented. (2) The keyrings to which it points may be modified, but not replaced. The only safe way to modify anything else is to create a replacement and commit using the functions described in Documentation/credentials.txt (which will be added by a later patch). This patch and the preceding patches have been tested with the LTP SELinux testsuite. This patch makes several logical sets of alteration: (1) execve(). This now prepares and commits credentials in various places in the security code rather than altering the current creds directly. (2) Temporary credential overrides. do_coredump() and sys_faccessat() now prepare their own credentials and temporarily override the ones currently on the acting thread, whilst preventing interference from other threads by holding cred_replace_mutex on the thread being dumped. This will be replaced in a future patch by something that hands down the credentials directly to the functions being called, rather than altering the task's objective credentials. (3) LSM interface. A number of functions have been changed, added or removed: (*) security_capset_check(), ->capset_check() (*) security_capset_set(), ->capset_set() Removed in favour of security_capset(). (*) security_capset(), ->capset() New. This is passed a pointer to the new creds, a pointer to the old creds and the proposed capability sets. It should fill in the new creds or return an error. All pointers, barring the pointer to the new creds, are now const. (*) security_bprm_apply_creds(), ->bprm_apply_creds() Changed; now returns a value, which will cause the process to be killed if it's an error. (*) security_task_alloc(), ->task_alloc_security() Removed in favour of security_prepare_creds(). (*) security_cred_free(), ->cred_free() New. Free security data attached to cred->security. (*) security_prepare_creds(), ->cred_prepare() New. Duplicate any security data attached to cred->security. (*) security_commit_creds(), ->cred_commit() New. Apply any security effects for the upcoming installation of new security by commit_creds(). (*) security_task_post_setuid(), ->task_post_setuid() Removed in favour of security_task_fix_setuid(). (*) security_task_fix_setuid(), ->task_fix_setuid() Fix up the proposed new credentials for setuid(). This is used by cap_set_fix_setuid() to implicitly adjust capabilities in line with setuid() changes. Changes are made to the new credentials, rather than the task itself as in security_task_post_setuid(). (*) security_task_reparent_to_init(), ->task_reparent_to_init() Removed. Instead the task being reparented to init is referred directly to init's credentials. NOTE! This results in the loss of some state: SELinux's osid no longer records the sid of the thread that forked it. (*) security_key_alloc(), ->key_alloc() (*) security_key_permission(), ->key_permission() Changed. These now take cred pointers rather than task pointers to refer to the security context. (4) sys_capset(). This has been simplified and uses less locking. The LSM functions it calls have been merged. (5) reparent_to_kthreadd(). This gives the current thread the same credentials as init by simply using commit_thread() to point that way. (6) __sigqueue_alloc() and switch_uid() __sigqueue_alloc() can't stop the target task from changing its creds beneath it, so this function gets a reference to the currently applicable user_struct which it then passes into the sigqueue struct it returns if successful. switch_uid() is now called from commit_creds(), and possibly should be folded into that. commit_creds() should take care of protecting __sigqueue_alloc(). (7) [sg]et[ug]id() and co and [sg]et_current_groups. The set functions now all use prepare_creds(), commit_creds() and abort_creds() to build and check a new set of credentials before applying it. security_task_set[ug]id() is called inside the prepared section. This guarantees that nothing else will affect the creds until we've finished. The calling of set_dumpable() has been moved into commit_creds(). Much of the functionality of set_user() has been moved into commit_creds(). The get functions all simply access the data directly. (8) security_task_prctl() and cap_task_prctl(). security_task_prctl() has been modified to return -ENOSYS if it doesn't want to handle a function, or otherwise return the return value directly rather than through an argument. Additionally, cap_task_prctl() now prepares a new set of credentials, even if it doesn't end up using it. (9) Keyrings. A number of changes have been made to the keyrings code: (a) switch_uid_keyring(), copy_keys(), exit_keys() and suid_keys() have all been dropped and built in to the credentials functions directly. They may want separating out again later. (b) key_alloc() and search_process_keyrings() now take a cred pointer rather than a task pointer to specify the security context. (c) copy_creds() gives a new thread within the same thread group a new thread keyring if its parent had one, otherwise it discards the thread keyring. (d) The authorisation key now points directly to the credentials to extend the search into rather pointing to the task that carries them. (e) Installing thread, process or session keyrings causes a new set of credentials to be created, even though it's not strictly necessary for process or session keyrings (they're shared). (10) Usermode helper. The usermode helper code now carries a cred struct pointer in its subprocess_info struct instead of a new session keyring pointer. This set of credentials is derived from init_cred and installed on the new process after it has been cloned. call_usermodehelper_setup() allocates the new credentials and call_usermodehelper_freeinfo() discards them if they haven't been used. A special cred function (prepare_usermodeinfo_creds()) is provided specifically for call_usermodehelper_setup() to call. call_usermodehelper_setkeys() adjusts the credentials to sport the supplied keyring as the new session keyring. (11) SELinux. SELinux has a number of changes, in addition to those to support the LSM interface changes mentioned above: (a) selinux_setprocattr() no longer does its check for whether the current ptracer can access processes with the new SID inside the lock that covers getting the ptracer's SID. Whilst this lock ensures that the check is done with the ptracer pinned, the result is only valid until the lock is released, so there's no point doing it inside the lock. (12) is_single_threaded(). This function has been extracted from selinux_setprocattr() and put into a file of its own in the lib/ directory as join_session_keyring() now wants to use it too. The code in SELinux just checked to see whether a task shared mm_structs with other tasks (CLONE_VM), but that isn't good enough. We really want to know if they're part of the same thread group (CLONE_THREAD). (13) nfsd. The NFS server daemon now has to use the COW credentials to set the credentials it is going to use. It really needs to pass the credentials down to the functions it calls, but it can't do that until other patches in this series have been applied. Signed-off-by: David Howells Acked-by: James Morris Signed-off-by: James Morris --- kernel/auditsc.c | 42 ++--- kernel/capability.c | 78 +++------ kernel/cred-internals.h | 21 +++ kernel/cred.c | 321 ++++++++++++++++++++++++++++++---- kernel/exit.c | 9 +- kernel/fork.c | 7 +- kernel/kmod.c | 30 +++- kernel/ptrace.c | 9 + kernel/signal.c | 10 +- kernel/sys.c | 450 ++++++++++++++++++++++++++---------------------- kernel/user.c | 37 +--- kernel/user_namespace.c | 12 +- 12 files changed, 652 insertions(+), 374 deletions(-) create mode 100644 kernel/cred-internals.h (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index ae8ef88..bc1e2d8 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2546,18 +2546,17 @@ int __audit_signal_info(int sig, struct task_struct *t) /** * __audit_log_bprm_fcaps - store information about a loading bprm and relevant fcaps - * @bprm pointer to the bprm being processed - * @caps the caps read from the disk + * @bprm: pointer to the bprm being processed + * @new: the proposed new credentials + * @old: the old credentials * * Simply check if the proc already has the caps given by the file and if not * store the priv escalation info for later auditing at the end of the syscall * - * this can fail and we don't care. See the note in audit.h for - * audit_log_bprm_fcaps() for my explaination.... - * * -Eric */ -void __audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_cap_t *pE) +int __audit_log_bprm_fcaps(struct linux_binprm *bprm, + const struct cred *new, const struct cred *old) { struct audit_aux_data_bprm_fcaps *ax; struct audit_context *context = current->audit_context; @@ -2566,7 +2565,7 @@ void __audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_ ax = kmalloc(sizeof(*ax), GFP_KERNEL); if (!ax) - return; + return -ENOMEM; ax->d.type = AUDIT_BPRM_FCAPS; ax->d.next = context->aux; @@ -2581,26 +2580,27 @@ void __audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_ ax->fcap.fE = !!(vcaps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE); ax->fcap_ver = (vcaps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT; - ax->old_pcap.permitted = *pP; - ax->old_pcap.inheritable = current->cred->cap_inheritable; - ax->old_pcap.effective = *pE; + ax->old_pcap.permitted = old->cap_permitted; + ax->old_pcap.inheritable = old->cap_inheritable; + ax->old_pcap.effective = old->cap_effective; - ax->new_pcap.permitted = current->cred->cap_permitted; - ax->new_pcap.inheritable = current->cred->cap_inheritable; - ax->new_pcap.effective = current->cred->cap_effective; + ax->new_pcap.permitted = new->cap_permitted; + ax->new_pcap.inheritable = new->cap_inheritable; + ax->new_pcap.effective = new->cap_effective; + return 0; } /** * __audit_log_capset - store information about the arguments to the capset syscall - * @pid target pid of the capset call - * @eff effective cap set - * @inh inheritible cap set - * @perm permited cap set + * @pid: target pid of the capset call + * @new: the new credentials + * @old: the old (current) credentials * * Record the aguments userspace sent to sys_capset for later printing by the * audit system if applicable */ -int __audit_log_capset(pid_t pid, kernel_cap_t *eff, kernel_cap_t *inh, kernel_cap_t *perm) +int __audit_log_capset(pid_t pid, + const struct cred *new, const struct cred *old) { struct audit_aux_data_capset *ax; struct audit_context *context = current->audit_context; @@ -2617,9 +2617,9 @@ int __audit_log_capset(pid_t pid, kernel_cap_t *eff, kernel_cap_t *inh, kernel_c context->aux = (void *)ax; ax->pid = pid; - ax->cap.effective = *eff; - ax->cap.inheritable = *eff; - ax->cap.permitted = *perm; + ax->cap.effective = new->cap_effective; + ax->cap.inheritable = new->cap_effective; + ax->cap.permitted = new->cap_permitted; return 0; } diff --git a/kernel/capability.c b/kernel/capability.c index a404b98..36b4b4d 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -15,12 +15,7 @@ #include #include #include - -/* - * This lock protects task->cap_* for all tasks including current. - * Locking rule: acquire this prior to tasklist_lock. - */ -static DEFINE_SPINLOCK(task_capability_lock); +#include "cred-internals.h" /* * Leveraged for setting/resetting capabilities @@ -128,12 +123,11 @@ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy) } /* - * If we have configured with filesystem capability support, then the - * only thing that can change the capabilities of the current process - * is the current process. As such, we can't be in this code at the - * same time as we are in the process of setting capabilities in this - * process. The net result is that we can limit our use of locks to - * when we are reading the caps of another process. + * The only thing that can change the capabilities of the current + * process is the current process. As such, we can't be in this code + * at the same time as we are in the process of setting capabilities + * in this process. The net result is that we can limit our use of + * locks to when we are reading the caps of another process. */ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, kernel_cap_t *pIp, kernel_cap_t *pPp) @@ -143,7 +137,6 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, if (pid && (pid != task_pid_vnr(current))) { struct task_struct *target; - spin_lock(&task_capability_lock); read_lock(&tasklist_lock); target = find_task_by_vpid(pid); @@ -153,34 +146,12 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, ret = security_capget(target, pEp, pIp, pPp); read_unlock(&tasklist_lock); - spin_unlock(&task_capability_lock); } else ret = security_capget(current, pEp, pIp, pPp); return ret; } -/* - * Atomically modify the effective capabilities returning the original - * value. No permission check is performed here - it is assumed that the - * caller is permitted to set the desired effective capabilities. - */ -kernel_cap_t cap_set_effective(const kernel_cap_t pE_new) -{ - kernel_cap_t pE_old; - - spin_lock(&task_capability_lock); - - pE_old = current->cred->cap_effective; - current->cred->cap_effective = pE_new; - - spin_unlock(&task_capability_lock); - - return pE_old; -} - -EXPORT_SYMBOL(cap_set_effective); - /** * sys_capget - get the capabilities of a given process. * @header: pointer to struct that contains capability version and @@ -208,7 +179,6 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr) return -EINVAL; ret = cap_get_target_pid(pid, &pE, &pI, &pP); - if (!ret) { struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; unsigned i; @@ -270,6 +240,7 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; unsigned i, tocopy; kernel_cap_t inheritable, permitted, effective; + struct cred *new; int ret; pid_t pid; @@ -284,8 +255,8 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) if (pid != 0 && pid != task_pid_vnr(current)) return -EPERM; - if (copy_from_user(&kdata, data, tocopy - * sizeof(struct __user_cap_data_struct))) + if (copy_from_user(&kdata, data, + tocopy * sizeof(struct __user_cap_data_struct))) return -EFAULT; for (i = 0; i < tocopy; i++) { @@ -300,24 +271,23 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) i++; } - ret = audit_log_capset(pid, &effective, &inheritable, &permitted); - if (ret) + new = prepare_creds(); + if (!new) + return -ENOMEM; + + ret = security_capset(new, current_cred(), + &effective, &inheritable, &permitted); + if (ret < 0) + goto error; + + ret = audit_log_capset(pid, new, current_cred()); + if (ret < 0) return ret; - /* This lock is required even when filesystem capability support is - * configured - it protects the sys_capget() call from returning - * incorrect data in the case that the targeted process is not the - * current one. - */ - spin_lock(&task_capability_lock); - - ret = security_capset_check(&effective, &inheritable, &permitted); - /* Having verified that the proposed changes are legal, we now put them - * into effect. - */ - if (!ret) - security_capset_set(&effective, &inheritable, &permitted); - spin_unlock(&task_capability_lock); + return commit_creds(new); + +error: + abort_creds(new); return ret; } diff --git a/kernel/cred-internals.h b/kernel/cred-internals.h new file mode 100644 index 0000000..2dc4fc2 --- /dev/null +++ b/kernel/cred-internals.h @@ -0,0 +1,21 @@ +/* Internal credentials stuff + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +/* + * user.c + */ +static inline void sched_switch_user(struct task_struct *p) +{ +#ifdef CONFIG_USER_SCHED + sched_move_task(p); +#endif /* CONFIG_USER_SCHED */ +} + diff --git a/kernel/cred.c b/kernel/cred.c index ac73e36..cb6b5ed 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -15,6 +15,10 @@ #include #include #include +#include +#include "cred-internals.h" + +static struct kmem_cache *cred_jar; /* * The common credentials for the initial task's thread group @@ -64,7 +68,7 @@ static void release_tgcred_rcu(struct rcu_head *rcu) /* * Release a set of thread group credentials. */ -static void release_tgcred(struct cred *cred) +void release_tgcred(struct cred *cred) { #ifdef CONFIG_KEYS struct thread_group_cred *tgcred = cred->tgcred; @@ -81,79 +85,322 @@ static void put_cred_rcu(struct rcu_head *rcu) { struct cred *cred = container_of(rcu, struct cred, rcu); - BUG_ON(atomic_read(&cred->usage) != 0); + if (atomic_read(&cred->usage) != 0) + panic("CRED: put_cred_rcu() sees %p with usage %d\n", + cred, atomic_read(&cred->usage)); + security_cred_free(cred); key_put(cred->thread_keyring); key_put(cred->request_key_auth); release_tgcred(cred); put_group_info(cred->group_info); free_uid(cred->user); - security_cred_free(cred); - kfree(cred); + kmem_cache_free(cred_jar, cred); } /** * __put_cred - Destroy a set of credentials - * @sec: The record to release + * @cred: The record to release * * Destroy a set of credentials on which no references remain. */ void __put_cred(struct cred *cred) { + BUG_ON(atomic_read(&cred->usage) != 0); + call_rcu(&cred->rcu, put_cred_rcu); } EXPORT_SYMBOL(__put_cred); +/** + * prepare_creds - Prepare a new set of credentials for modification + * + * Prepare a new set of task credentials for modification. A task's creds + * shouldn't generally be modified directly, therefore this function is used to + * prepare a new copy, which the caller then modifies and then commits by + * calling commit_creds(). + * + * Returns a pointer to the new creds-to-be if successful, NULL otherwise. + * + * Call commit_creds() or abort_creds() to clean up. + */ +struct cred *prepare_creds(void) +{ + struct task_struct *task = current; + const struct cred *old; + struct cred *new; + + BUG_ON(atomic_read(&task->cred->usage) < 1); + + new = kmem_cache_alloc(cred_jar, GFP_KERNEL); + if (!new) + return NULL; + + old = task->cred; + memcpy(new, old, sizeof(struct cred)); + + atomic_set(&new->usage, 1); + get_group_info(new->group_info); + get_uid(new->user); + +#ifdef CONFIG_KEYS + key_get(new->thread_keyring); + key_get(new->request_key_auth); + atomic_inc(&new->tgcred->usage); +#endif + +#ifdef CONFIG_SECURITY + new->security = NULL; +#endif + + if (security_prepare_creds(new, old, GFP_KERNEL) < 0) + goto error; + return new; + +error: + abort_creds(new); + return NULL; +} +EXPORT_SYMBOL(prepare_creds); + +/* + * prepare new credentials for the usermode helper dispatcher + */ +struct cred *prepare_usermodehelper_creds(void) +{ +#ifdef CONFIG_KEYS + struct thread_group_cred *tgcred = NULL; +#endif + struct cred *new; + +#ifdef CONFIG_KEYS + tgcred = kzalloc(sizeof(*new->tgcred), GFP_ATOMIC); + if (!tgcred) + return NULL; +#endif + + new = kmem_cache_alloc(cred_jar, GFP_ATOMIC); + if (!new) + return NULL; + + memcpy(new, &init_cred, sizeof(struct cred)); + + atomic_set(&new->usage, 1); + get_group_info(new->group_info); + get_uid(new->user); + +#ifdef CONFIG_KEYS + new->thread_keyring = NULL; + new->request_key_auth = NULL; + new->jit_keyring = KEY_REQKEY_DEFL_DEFAULT; + + atomic_set(&tgcred->usage, 1); + spin_lock_init(&tgcred->lock); + new->tgcred = tgcred; +#endif + +#ifdef CONFIG_SECURITY + new->security = NULL; +#endif + if (security_prepare_creds(new, &init_cred, GFP_ATOMIC) < 0) + goto error; + + BUG_ON(atomic_read(&new->usage) != 1); + return new; + +error: + put_cred(new); + return NULL; +} + /* * Copy credentials for the new process created by fork() + * + * We share if we can, but under some circumstances we have to generate a new + * set. */ int copy_creds(struct task_struct *p, unsigned long clone_flags) { - struct cred *pcred; - int ret; +#ifdef CONFIG_KEYS + struct thread_group_cred *tgcred; +#endif + struct cred *new; + + mutex_init(&p->cred_exec_mutex); - pcred = kmemdup(p->cred, sizeof(*p->cred), GFP_KERNEL); - if (!pcred) + if ( +#ifdef CONFIG_KEYS + !p->cred->thread_keyring && +#endif + clone_flags & CLONE_THREAD + ) { + get_cred(p->cred); + atomic_inc(&p->cred->user->processes); + return 0; + } + + new = prepare_creds(); + if (!new) return -ENOMEM; #ifdef CONFIG_KEYS - if (clone_flags & CLONE_THREAD) { - atomic_inc(&pcred->tgcred->usage); - } else { - pcred->tgcred = kmalloc(sizeof(struct cred), GFP_KERNEL); - if (!pcred->tgcred) { - kfree(pcred); + /* new threads get their own thread keyrings if their parent already + * had one */ + if (new->thread_keyring) { + key_put(new->thread_keyring); + new->thread_keyring = NULL; + if (clone_flags & CLONE_THREAD) + install_thread_keyring_to_cred(new); + } + + /* we share the process and session keyrings between all the threads in + * a process - this is slightly icky as we violate COW credentials a + * bit */ + if (!(clone_flags & CLONE_THREAD)) { + tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); + if (!tgcred) { + put_cred(new); return -ENOMEM; } - atomic_set(&pcred->tgcred->usage, 1); - spin_lock_init(&pcred->tgcred->lock); - pcred->tgcred->process_keyring = NULL; - pcred->tgcred->session_keyring = - key_get(p->cred->tgcred->session_keyring); + atomic_set(&tgcred->usage, 1); + spin_lock_init(&tgcred->lock); + tgcred->process_keyring = NULL; + tgcred->session_keyring = key_get(new->tgcred->session_keyring); + + release_tgcred(new); + new->tgcred = tgcred; } #endif -#ifdef CONFIG_SECURITY - pcred->security = NULL; -#endif + atomic_inc(&new->user->processes); + p->cred = new; + return 0; +} - ret = security_cred_alloc(pcred); - if (ret < 0) { - release_tgcred(pcred); - kfree(pcred); - return ret; +/** + * commit_creds - Install new credentials upon the current task + * @new: The credentials to be assigned + * + * Install a new set of credentials to the current task, using RCU to replace + * the old set. + * + * This function eats the caller's reference to the new credentials. + * + * Always returns 0 thus allowing this function to be tail-called at the end + * of, say, sys_setgid(). + */ +int commit_creds(struct cred *new) +{ + struct task_struct *task = current; + const struct cred *old; + + BUG_ON(atomic_read(&new->usage) < 1); + BUG_ON(atomic_read(&task->cred->usage) < 1); + + old = task->cred; + security_commit_creds(new, old); + + /* dumpability changes */ + if (old->euid != new->euid || + old->egid != new->egid || + old->fsuid != new->fsuid || + old->fsgid != new->fsgid || + !cap_issubset(new->cap_permitted, old->cap_permitted)) { + set_dumpable(task->mm, suid_dumpable); + task->pdeath_signal = 0; + smp_wmb(); } - atomic_set(&pcred->usage, 1); - get_group_info(pcred->group_info); - get_uid(pcred->user); - key_get(pcred->thread_keyring); - key_get(pcred->request_key_auth); + /* alter the thread keyring */ + if (new->fsuid != old->fsuid) + key_fsuid_changed(task); + if (new->fsgid != old->fsgid) + key_fsgid_changed(task); + + /* do it + * - What if a process setreuid()'s and this brings the + * new uid over his NPROC rlimit? We can check this now + * cheaply with the new uid cache, so if it matters + * we should be checking for it. -DaveM + */ + if (new->user != old->user) + atomic_inc(&new->user->processes); + rcu_assign_pointer(task->cred, new); + if (new->user != old->user) + atomic_dec(&old->user->processes); + + sched_switch_user(task); + + /* send notifications */ + if (new->uid != old->uid || + new->euid != old->euid || + new->suid != old->suid || + new->fsuid != old->fsuid) + proc_id_connector(task, PROC_EVENT_UID); - atomic_inc(&pcred->user->processes); + if (new->gid != old->gid || + new->egid != old->egid || + new->sgid != old->sgid || + new->fsgid != old->fsgid) + proc_id_connector(task, PROC_EVENT_GID); - /* RCU assignment is unneeded here as no-one can have accessed this - * pointer yet, barring us */ - p->cred = pcred; + put_cred(old); return 0; } +EXPORT_SYMBOL(commit_creds); + +/** + * abort_creds - Discard a set of credentials and unlock the current task + * @new: The credentials that were going to be applied + * + * Discard a set of credentials that were under construction and unlock the + * current task. + */ +void abort_creds(struct cred *new) +{ + BUG_ON(atomic_read(&new->usage) < 1); + put_cred(new); +} +EXPORT_SYMBOL(abort_creds); + +/** + * override_creds - Temporarily override the current process's credentials + * @new: The credentials to be assigned + * + * Install a set of temporary override credentials on the current process, + * returning the old set for later reversion. + */ +const struct cred *override_creds(const struct cred *new) +{ + const struct cred *old = current->cred; + + rcu_assign_pointer(current->cred, get_cred(new)); + return old; +} +EXPORT_SYMBOL(override_creds); + +/** + * revert_creds - Revert a temporary credentials override + * @old: The credentials to be restored + * + * Revert a temporary set of override credentials to an old set, discarding the + * override set. + */ +void revert_creds(const struct cred *old) +{ + const struct cred *override = current->cred; + + rcu_assign_pointer(current->cred, old); + put_cred(override); +} +EXPORT_SYMBOL(revert_creds); + +/* + * initialise the credentials stuff + */ +void __init cred_init(void) +{ + /* allocate a slab in which we can store credentials */ + cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred), + 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); +} diff --git a/kernel/exit.c b/kernel/exit.c index bbc2253..c0711da 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -47,12 +47,14 @@ #include #include #include +#include #include #include #include #include #include +#include "cred-internals.h" static void exit_mm(struct task_struct * tsk); @@ -338,12 +340,12 @@ static void reparent_to_kthreadd(void) /* cpus_allowed? */ /* rt_priority? */ /* signals? */ - security_task_reparent_to_init(current); memcpy(current->signal->rlim, init_task.signal->rlim, sizeof(current->signal->rlim)); - atomic_inc(&(INIT_USER->__count)); + + atomic_inc(&init_cred.usage); + commit_creds(&init_cred); write_unlock_irq(&tasklist_lock); - switch_uid(INIT_USER); } void __set_special_pids(struct pid *pid) @@ -1085,7 +1087,6 @@ NORET_TYPE void do_exit(long code) check_stack_usage(); exit_thread(); cgroup_exit(tsk, 1); - exit_keys(tsk); if (group_dead && tsk->signal->leader) disassociate_ctty(1); diff --git a/kernel/fork.c b/kernel/fork.c index ded1972..82a7948 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1084,10 +1084,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_cleanup_sighand; if ((retval = copy_mm(clone_flags, p))) goto bad_fork_cleanup_signal; - if ((retval = copy_keys(clone_flags, p))) - goto bad_fork_cleanup_mm; if ((retval = copy_namespaces(clone_flags, p))) - goto bad_fork_cleanup_keys; + goto bad_fork_cleanup_mm; if ((retval = copy_io(clone_flags, p))) goto bad_fork_cleanup_namespaces; retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); @@ -1252,8 +1250,6 @@ bad_fork_cleanup_io: put_io_context(p->io_context); bad_fork_cleanup_namespaces: exit_task_namespaces(p); -bad_fork_cleanup_keys: - exit_keys(p); bad_fork_cleanup_mm: if (p->mm) mmput(p->mm); @@ -1281,6 +1277,7 @@ bad_fork_cleanup_cgroup: bad_fork_cleanup_put_domain: module_put(task_thread_info(p)->exec_domain->module); bad_fork_cleanup_count: + atomic_dec(&p->cred->user->processes); put_cred(p->cred); bad_fork_free: free_task(p); diff --git a/kernel/kmod.c b/kernel/kmod.c index f044f8f..b46dbb9 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -118,10 +118,10 @@ EXPORT_SYMBOL(request_module); struct subprocess_info { struct work_struct work; struct completion *complete; + struct cred *cred; char *path; char **argv; char **envp; - struct key *ring; enum umh_wait wait; int retval; struct file *stdin; @@ -134,19 +134,20 @@ struct subprocess_info { static int ____call_usermodehelper(void *data) { struct subprocess_info *sub_info = data; - struct key *new_session, *old_session; int retval; - /* Unblock all signals and set the session keyring. */ - new_session = key_get(sub_info->ring); + BUG_ON(atomic_read(&sub_info->cred->usage) != 1); + + /* Unblock all signals */ spin_lock_irq(¤t->sighand->siglock); - old_session = __install_session_keyring(new_session); flush_signal_handlers(current, 1); sigemptyset(¤t->blocked); recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); - key_put(old_session); + /* Install the credentials */ + commit_creds(sub_info->cred); + sub_info->cred = NULL; /* Install input pipe when needed */ if (sub_info->stdin) { @@ -185,6 +186,8 @@ void call_usermodehelper_freeinfo(struct subprocess_info *info) { if (info->cleanup) (*info->cleanup)(info->argv, info->envp); + if (info->cred) + put_cred(info->cred); kfree(info); } EXPORT_SYMBOL(call_usermodehelper_freeinfo); @@ -240,6 +243,8 @@ static void __call_usermodehelper(struct work_struct *work) pid_t pid; enum umh_wait wait = sub_info->wait; + BUG_ON(atomic_read(&sub_info->cred->usage) != 1); + /* CLONE_VFORK: wait until the usermode helper has execve'd * successfully We need the data structures to stay around * until that is done. */ @@ -362,6 +367,9 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, sub_info->path = path; sub_info->argv = argv; sub_info->envp = envp; + sub_info->cred = prepare_usermodehelper_creds(); + if (!sub_info->cred) + return NULL; out: return sub_info; @@ -376,7 +384,13 @@ EXPORT_SYMBOL(call_usermodehelper_setup); void call_usermodehelper_setkeys(struct subprocess_info *info, struct key *session_keyring) { - info->ring = session_keyring; +#ifdef CONFIG_KEYS + struct thread_group_cred *tgcred = info->cred->tgcred; + key_put(tgcred->session_keyring); + tgcred->session_keyring = key_get(session_keyring); +#else + BUG(); +#endif } EXPORT_SYMBOL(call_usermodehelper_setkeys); @@ -444,6 +458,8 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, DECLARE_COMPLETION_ONSTACK(done); int retval = 0; + BUG_ON(atomic_read(&sub_info->cred->usage) != 1); + helper_lock(); if (sub_info->path[0] == '\0') goto out; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index b9d5f4e..f764b88 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -171,6 +171,14 @@ int ptrace_attach(struct task_struct *task) if (same_thread_group(task, current)) goto out; + /* Protect exec's credential calculations against our interference; + * SUID, SGID and LSM creds get determined differently under ptrace. + */ + retval = mutex_lock_interruptible(¤t->cred_exec_mutex); + if (retval < 0) + goto out; + + retval = -EPERM; repeat: /* * Nasty, nasty. @@ -210,6 +218,7 @@ repeat: bad: write_unlock_irqrestore(&tasklist_lock, flags); task_unlock(task); + mutex_unlock(¤t->cred_exec_mutex); out: return retval; } diff --git a/kernel/signal.c b/kernel/signal.c index 8498912..2a64304 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -180,7 +180,7 @@ int next_signal(struct sigpending *pending, sigset_t *mask) /* * allocate a new signal queue record * - this may be called without locks if and only if t == current, otherwise an - * appopriate lock must be held to protect t's user_struct + * appopriate lock must be held to stop the target task from exiting */ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, int override_rlimit) @@ -194,7 +194,7 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, * caller must be holding the RCU readlock (by way of a spinlock) and * we use RCU protection here */ - user = __task_cred(t)->user; + user = get_uid(__task_cred(t)->user); atomic_inc(&user->sigpending); if (override_rlimit || atomic_read(&user->sigpending) <= @@ -202,12 +202,14 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, q = kmem_cache_alloc(sigqueue_cachep, flags); if (unlikely(q == NULL)) { atomic_dec(&user->sigpending); + free_uid(user); } else { INIT_LIST_HEAD(&q->list); q->flags = 0; - q->user = get_uid(user); + q->user = user; } - return(q); + + return q; } static void __sigqueue_free(struct sigqueue *q) diff --git a/kernel/sys.c b/kernel/sys.c index ccc9eb7..ab73504 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -180,7 +180,7 @@ asmlinkage long sys_setpriority(int which, int who, int niceval) } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case PRIO_USER: - user = cred->user; + user = (struct user_struct *) cred->user; if (!who) who = cred->uid; else if ((who != cred->uid) && @@ -479,47 +479,48 @@ void ctrl_alt_del(void) */ asmlinkage long sys_setregid(gid_t rgid, gid_t egid) { - struct cred *cred = current->cred; - int old_rgid = cred->gid; - int old_egid = cred->egid; - int new_rgid = old_rgid; - int new_egid = old_egid; + const struct cred *old; + struct cred *new; int retval; + new = prepare_creds(); + if (!new) + return -ENOMEM; + old = current_cred(); + retval = security_task_setgid(rgid, egid, (gid_t)-1, LSM_SETID_RE); if (retval) - return retval; + goto error; + retval = -EPERM; if (rgid != (gid_t) -1) { - if ((old_rgid == rgid) || - (cred->egid == rgid) || + if (old->gid == rgid || + old->egid == rgid || capable(CAP_SETGID)) - new_rgid = rgid; + new->gid = rgid; else - return -EPERM; + goto error; } if (egid != (gid_t) -1) { - if ((old_rgid == egid) || - (cred->egid == egid) || - (cred->sgid == egid) || + if (old->gid == egid || + old->egid == egid || + old->sgid == egid || capable(CAP_SETGID)) - new_egid = egid; + new->egid = egid; else - return -EPERM; - } - if (new_egid != old_egid) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); + goto error; } + if (rgid != (gid_t) -1 || - (egid != (gid_t) -1 && egid != old_rgid)) - cred->sgid = new_egid; - cred->fsgid = new_egid; - cred->egid = new_egid; - cred->gid = new_rgid; - key_fsgid_changed(current); - proc_id_connector(current, PROC_EVENT_GID); - return 0; + (egid != (gid_t) -1 && egid != old->gid)) + new->sgid = new->egid; + new->fsgid = new->egid; + + return commit_creds(new); + +error: + abort_creds(new); + return retval; } /* @@ -529,40 +530,42 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid) */ asmlinkage long sys_setgid(gid_t gid) { - struct cred *cred = current->cred; - int old_egid = cred->egid; + const struct cred *old; + struct cred *new; int retval; + new = prepare_creds(); + if (!new) + return -ENOMEM; + old = current_cred(); + retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID); if (retval) - return retval; + goto error; - if (capable(CAP_SETGID)) { - if (old_egid != gid) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); - } - cred->gid = cred->egid = cred->sgid = cred->fsgid = gid; - } else if ((gid == cred->gid) || (gid == cred->sgid)) { - if (old_egid != gid) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); - } - cred->egid = cred->fsgid = gid; - } + retval = -EPERM; + if (capable(CAP_SETGID)) + new->gid = new->egid = new->sgid = new->fsgid = gid; + else if (gid == old->gid || gid == old->sgid) + new->egid = new->fsgid = gid; else - return -EPERM; + goto error; - key_fsgid_changed(current); - proc_id_connector(current, PROC_EVENT_GID); - return 0; + return commit_creds(new); + +error: + abort_creds(new); + return retval; } -static int set_user(uid_t new_ruid, int dumpclear) +/* + * change the user struct in a credentials set to match the new UID + */ +static int set_user(struct cred *new) { struct user_struct *new_user; - new_user = alloc_uid(current->nsproxy->user_ns, new_ruid); + new_user = alloc_uid(current->nsproxy->user_ns, new->uid); if (!new_user) return -EAGAIN; @@ -573,13 +576,8 @@ static int set_user(uid_t new_ruid, int dumpclear) return -EAGAIN; } - switch_uid(new_user); - - if (dumpclear) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); - } - current->cred->uid = new_ruid; + free_uid(new->user); + new->user = new_user; return 0; } @@ -600,55 +598,56 @@ static int set_user(uid_t new_ruid, int dumpclear) */ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) { - struct cred *cred = current->cred; - int old_ruid, old_euid, old_suid, new_ruid, new_euid; + const struct cred *old; + struct cred *new; int retval; + new = prepare_creds(); + if (!new) + return -ENOMEM; + old = current_cred(); + retval = security_task_setuid(ruid, euid, (uid_t)-1, LSM_SETID_RE); if (retval) - return retval; - - new_ruid = old_ruid = cred->uid; - new_euid = old_euid = cred->euid; - old_suid = cred->suid; + goto error; + retval = -EPERM; if (ruid != (uid_t) -1) { - new_ruid = ruid; - if ((old_ruid != ruid) && - (cred->euid != ruid) && + new->uid = ruid; + if (old->uid != ruid && + old->euid != ruid && !capable(CAP_SETUID)) - return -EPERM; + goto error; } if (euid != (uid_t) -1) { - new_euid = euid; - if ((old_ruid != euid) && - (cred->euid != euid) && - (cred->suid != euid) && + new->euid = euid; + if (old->uid != euid && + old->euid != euid && + old->suid != euid && !capable(CAP_SETUID)) - return -EPERM; + goto error; } - if (new_ruid != old_ruid && set_user(new_ruid, new_euid != old_euid) < 0) - return -EAGAIN; + retval = -EAGAIN; + if (new->uid != old->uid && set_user(new) < 0) + goto error; - if (new_euid != old_euid) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); - } - cred->fsuid = cred->euid = new_euid; if (ruid != (uid_t) -1 || - (euid != (uid_t) -1 && euid != old_ruid)) - cred->suid = cred->euid; - cred->fsuid = cred->euid; - - key_fsuid_changed(current); - proc_id_connector(current, PROC_EVENT_UID); + (euid != (uid_t) -1 && euid != old->uid)) + new->suid = new->euid; + new->fsuid = new->euid; - return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RE); -} + retval = security_task_fix_setuid(new, old, LSM_SETID_RE); + if (retval < 0) + goto error; + return commit_creds(new); +error: + abort_creds(new); + return retval; +} /* * setuid() is implemented like SysV with SAVED_IDS @@ -663,37 +662,41 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) */ asmlinkage long sys_setuid(uid_t uid) { - struct cred *cred = current->cred; - int old_euid = cred->euid; - int old_ruid, old_suid, new_suid; + const struct cred *old; + struct cred *new; int retval; + new = prepare_creds(); + if (!new) + return -ENOMEM; + old = current_cred(); + retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID); if (retval) - return retval; + goto error; - old_ruid = cred->uid; - old_suid = cred->suid; - new_suid = old_suid; - + retval = -EPERM; if (capable(CAP_SETUID)) { - if (uid != old_ruid && set_user(uid, old_euid != uid) < 0) - return -EAGAIN; - new_suid = uid; - } else if ((uid != cred->uid) && (uid != new_suid)) - return -EPERM; - - if (old_euid != uid) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); + new->suid = new->uid = uid; + if (uid != old->uid && set_user(new) < 0) { + retval = -EAGAIN; + goto error; + } + } else if (uid != old->uid && uid != new->suid) { + goto error; } - cred->fsuid = cred->euid = uid; - cred->suid = new_suid; - key_fsuid_changed(current); - proc_id_connector(current, PROC_EVENT_UID); + new->fsuid = new->euid = uid; + + retval = security_task_fix_setuid(new, old, LSM_SETID_ID); + if (retval < 0) + goto error; + + return commit_creds(new); - return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_ID); +error: + abort_creds(new); + return retval; } @@ -703,47 +706,53 @@ asmlinkage long sys_setuid(uid_t uid) */ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) { - struct cred *cred = current->cred; - int old_ruid = cred->uid; - int old_euid = cred->euid; - int old_suid = cred->suid; + const struct cred *old; + struct cred *new; int retval; + new = prepare_creds(); + if (!new) + return -ENOMEM; + retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES); if (retval) - return retval; + goto error; + old = current_cred(); + retval = -EPERM; if (!capable(CAP_SETUID)) { - if ((ruid != (uid_t) -1) && (ruid != cred->uid) && - (ruid != cred->euid) && (ruid != cred->suid)) - return -EPERM; - if ((euid != (uid_t) -1) && (euid != cred->uid) && - (euid != cred->euid) && (euid != cred->suid)) - return -EPERM; - if ((suid != (uid_t) -1) && (suid != cred->uid) && - (suid != cred->euid) && (suid != cred->suid)) - return -EPERM; + if (ruid != (uid_t) -1 && ruid != old->uid && + ruid != old->euid && ruid != old->suid) + goto error; + if (euid != (uid_t) -1 && euid != old->uid && + euid != old->euid && euid != old->suid) + goto error; + if (suid != (uid_t) -1 && suid != old->uid && + suid != old->euid && suid != old->suid) + goto error; } + + retval = -EAGAIN; if (ruid != (uid_t) -1) { - if (ruid != cred->uid && - set_user(ruid, euid != cred->euid) < 0) - return -EAGAIN; + new->uid = ruid; + if (ruid != old->uid && set_user(new) < 0) + goto error; } - if (euid != (uid_t) -1) { - if (euid != cred->euid) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); - } - cred->euid = euid; - } - cred->fsuid = cred->euid; + if (euid != (uid_t) -1) + new->euid = euid; if (suid != (uid_t) -1) - cred->suid = suid; + new->suid = suid; + new->fsuid = new->euid; - key_fsuid_changed(current); - proc_id_connector(current, PROC_EVENT_UID); + retval = security_task_fix_setuid(new, old, LSM_SETID_RES); + if (retval < 0) + goto error; - return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RES); + return commit_creds(new); + +error: + abort_creds(new); + return retval; } asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid) @@ -763,40 +772,45 @@ asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __us */ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) { - struct cred *cred = current->cred; + const struct cred *old; + struct cred *new; int retval; + new = prepare_creds(); + if (!new) + return -ENOMEM; + old = current_cred(); + retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES); if (retval) - return retval; + goto error; + retval = -EPERM; if (!capable(CAP_SETGID)) { - if ((rgid != (gid_t) -1) && (rgid != cred->gid) && - (rgid != cred->egid) && (rgid != cred->sgid)) - return -EPERM; - if ((egid != (gid_t) -1) && (egid != cred->gid) && - (egid != cred->egid) && (egid != cred->sgid)) - return -EPERM; - if ((sgid != (gid_t) -1) && (sgid != cred->gid) && - (sgid != cred->egid) && (sgid != cred->sgid)) - return -EPERM; + if (rgid != (gid_t) -1 && rgid != old->gid && + rgid != old->egid && rgid != old->sgid) + goto error; + if (egid != (gid_t) -1 && egid != old->gid && + egid != old->egid && egid != old->sgid) + goto error; + if (sgid != (gid_t) -1 && sgid != old->gid && + sgid != old->egid && sgid != old->sgid) + goto error; } - if (egid != (gid_t) -1) { - if (egid != cred->egid) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); - } - cred->egid = egid; - } - cred->fsgid = cred->egid; + if (rgid != (gid_t) -1) - cred->gid = rgid; + new->gid = rgid; + if (egid != (gid_t) -1) + new->egid = egid; if (sgid != (gid_t) -1) - cred->sgid = sgid; + new->sgid = sgid; + new->fsgid = new->egid; - key_fsgid_changed(current); - proc_id_connector(current, PROC_EVENT_GID); - return 0; + return commit_creds(new); + +error: + abort_creds(new); + return retval; } asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid) @@ -820,28 +834,35 @@ asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __us */ asmlinkage long sys_setfsuid(uid_t uid) { - struct cred *cred = current->cred; - int old_fsuid; + const struct cred *old; + struct cred *new; + uid_t old_fsuid; + + new = prepare_creds(); + if (!new) + return current_fsuid(); + old = current_cred(); + old_fsuid = old->fsuid; - old_fsuid = cred->fsuid; - if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS)) - return old_fsuid; + if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS) < 0) + goto error; - if (uid == cred->uid || uid == cred->euid || - uid == cred->suid || uid == cred->fsuid || + if (uid == old->uid || uid == old->euid || + uid == old->suid || uid == old->fsuid || capable(CAP_SETUID)) { if (uid != old_fsuid) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); + new->fsuid = uid; + if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) + goto change_okay; } - cred->fsuid = uid; } - key_fsuid_changed(current); - proc_id_connector(current, PROC_EVENT_UID); - - security_task_post_setuid(old_fsuid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS); +error: + abort_creds(new); + return old_fsuid; +change_okay: + commit_creds(new); return old_fsuid; } @@ -850,24 +871,34 @@ asmlinkage long sys_setfsuid(uid_t uid) */ asmlinkage long sys_setfsgid(gid_t gid) { - struct cred *cred = current->cred; - int old_fsgid; + const struct cred *old; + struct cred *new; + gid_t old_fsgid; + + new = prepare_creds(); + if (!new) + return current_fsgid(); + old = current_cred(); + old_fsgid = old->fsgid; - old_fsgid = cred->fsgid; if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS)) - return old_fsgid; + goto error; - if (gid == cred->gid || gid == cred->egid || - gid == cred->sgid || gid == cred->fsgid || + if (gid == old->gid || gid == old->egid || + gid == old->sgid || gid == old->fsgid || capable(CAP_SETGID)) { if (gid != old_fsgid) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); + new->fsgid = gid; + goto change_okay; } - cred->fsgid = gid; - key_fsgid_changed(current); - proc_id_connector(current, PROC_EVENT_GID); } + +error: + abort_creds(new); + return old_fsgid; + +change_okay: + commit_creds(new); return old_fsgid; } @@ -1136,7 +1167,7 @@ EXPORT_SYMBOL(groups_free); /* export the group_info to a user-space array */ static int groups_to_user(gid_t __user *grouplist, - struct group_info *group_info) + const struct group_info *group_info) { int i; unsigned int count = group_info->ngroups; @@ -1227,31 +1258,25 @@ int groups_search(const struct group_info *group_info, gid_t grp) } /** - * set_groups - Change a group subscription in a security record - * @sec: The security record to alter - * @group_info: The group list to impose + * set_groups - Change a group subscription in a set of credentials + * @new: The newly prepared set of credentials to alter + * @group_info: The group list to install * - * Validate a group subscription and, if valid, impose it upon a task security - * record. + * Validate a group subscription and, if valid, insert it into a set + * of credentials. */ -int set_groups(struct cred *cred, struct group_info *group_info) +int set_groups(struct cred *new, struct group_info *group_info) { int retval; - struct group_info *old_info; retval = security_task_setgroups(group_info); if (retval) return retval; + put_group_info(new->group_info); groups_sort(group_info); get_group_info(group_info); - - spin_lock(&cred->lock); - old_info = cred->group_info; - cred->group_info = group_info; - spin_unlock(&cred->lock); - - put_group_info(old_info); + new->group_info = group_info; return 0; } @@ -1266,7 +1291,20 @@ EXPORT_SYMBOL(set_groups); */ int set_current_groups(struct group_info *group_info) { - return set_groups(current->cred, group_info); + struct cred *new; + int ret; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + + ret = set_groups(new, group_info); + if (ret < 0) { + abort_creds(new); + return ret; + } + + return commit_creds(new); } EXPORT_SYMBOL(set_current_groups); @@ -1666,9 +1704,11 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned char comm[sizeof(me->comm)]; long error; - if (security_task_prctl(option, arg2, arg3, arg4, arg5, &error)) + error = security_task_prctl(option, arg2, arg3, arg4, arg5); + if (error != -ENOSYS) return error; + error = 0; switch (option) { case PR_SET_PDEATHSIG: if (!valid_signal(arg2)) { diff --git a/kernel/user.c b/kernel/user.c index 104d22a..d476307 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -16,6 +16,7 @@ #include #include #include +#include "cred-internals.h" struct user_namespace init_user_ns = { .kref = { @@ -104,16 +105,10 @@ static int sched_create_user(struct user_struct *up) return rc; } -static void sched_switch_user(struct task_struct *p) -{ - sched_move_task(p); -} - #else /* CONFIG_USER_SCHED */ static void sched_destroy_user(struct user_struct *up) { } static int sched_create_user(struct user_struct *up) { return 0; } -static void sched_switch_user(struct task_struct *p) { } #endif /* CONFIG_USER_SCHED */ @@ -448,36 +443,6 @@ out_unlock: return NULL; } -void switch_uid(struct user_struct *new_user) -{ - struct user_struct *old_user; - - /* What if a process setreuid()'s and this brings the - * new uid over his NPROC rlimit? We can check this now - * cheaply with the new uid cache, so if it matters - * we should be checking for it. -DaveM - */ - old_user = current->cred->user; - atomic_inc(&new_user->processes); - atomic_dec(&old_user->processes); - switch_uid_keyring(new_user); - current->cred->user = new_user; - sched_switch_user(current); - - /* - * We need to synchronize with __sigqueue_alloc() - * doing a get_uid(p->user).. If that saw the old - * user value, we need to wait until it has exited - * its critical region before we can free the old - * structure. - */ - smp_mb(); - spin_unlock_wait(¤t->sighand->siglock); - - free_uid(old_user); - suid_keys(current); -} - #ifdef CONFIG_USER_NS void release_uids(struct user_namespace *ns) { diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index f82730a..0d9c51d 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -19,6 +19,7 @@ static struct user_namespace *clone_user_ns(struct user_namespace *old_ns) { struct user_namespace *ns; struct user_struct *new_user; + struct cred *new; int n; ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL); @@ -45,7 +46,16 @@ static struct user_namespace *clone_user_ns(struct user_namespace *old_ns) return ERR_PTR(-ENOMEM); } - switch_uid(new_user); + /* Install the new user */ + new = prepare_creds(); + if (!new) { + free_uid(new_user); + free_uid(ns->root_user); + kfree(ns); + } + free_uid(new->user); + new->user = new_user; + commit_creds(new); return ns; } -- cgit v1.1 From a6f76f23d297f70e2a6b3ec607f7aeeea9e37e8d Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:24 +1100 Subject: CRED: Make execve() take advantage of copy-on-write credentials Make execve() take advantage of copy-on-write credentials, allowing it to set up the credentials in advance, and then commit the whole lot after the point of no return. This patch and the preceding patches have been tested with the LTP SELinux testsuite. This patch makes several logical sets of alteration: (1) execve(). The credential bits from struct linux_binprm are, for the most part, replaced with a single credentials pointer (bprm->cred). This means that all the creds can be calculated in advance and then applied at the point of no return with no possibility of failure. I would like to replace bprm->cap_effective with: cap_isclear(bprm->cap_effective) but this seems impossible due to special behaviour for processes of pid 1 (they always retain their parent's capability masks where normally they'd be changed - see cap_bprm_set_creds()). The following sequence of events now happens: (a) At the start of do_execve, the current task's cred_exec_mutex is locked to prevent PTRACE_ATTACH from obsoleting the calculation of creds that we make. (a) prepare_exec_creds() is then called to make a copy of the current task's credentials and prepare it. This copy is then assigned to bprm->cred. This renders security_bprm_alloc() and security_bprm_free() unnecessary, and so they've been removed. (b) The determination of unsafe execution is now performed immediately after (a) rather than later on in the code. The result is stored in bprm->unsafe for future reference. (c) prepare_binprm() is called, possibly multiple times. (i) This applies the result of set[ug]id binaries to the new creds attached to bprm->cred. Personality bit clearance is recorded, but now deferred on the basis that the exec procedure may yet fail. (ii) This then calls the new security_bprm_set_creds(). This should calculate the new LSM and capability credentials into *bprm->cred. This folds together security_bprm_set() and parts of security_bprm_apply_creds() (these two have been removed). Anything that might fail must be done at this point. (iii) bprm->cred_prepared is set to 1. bprm->cred_prepared is 0 on the first pass of the security calculations, and 1 on all subsequent passes. This allows SELinux in (ii) to base its calculations only on the initial script and not on the interpreter. (d) flush_old_exec() is called to commit the task to execution. This performs the following steps with regard to credentials: (i) Clear pdeath_signal and set dumpable on certain circumstances that may not be covered by commit_creds(). (ii) Clear any bits in current->personality that were deferred from (c.i). (e) install_exec_creds() [compute_creds() as was] is called to install the new credentials. This performs the following steps with regard to credentials: (i) Calls security_bprm_committing_creds() to apply any security requirements, such as flushing unauthorised files in SELinux, that must be done before the credentials are changed. This is made up of bits of security_bprm_apply_creds() and security_bprm_post_apply_creds(), both of which have been removed. This function is not allowed to fail; anything that might fail must have been done in (c.ii). (ii) Calls commit_creds() to apply the new credentials in a single assignment (more or less). Possibly pdeath_signal and dumpable should be part of struct creds. (iii) Unlocks the task's cred_replace_mutex, thus allowing PTRACE_ATTACH to take place. (iv) Clears The bprm->cred pointer as the credentials it was holding are now immutable. (v) Calls security_bprm_committed_creds() to apply any security alterations that must be done after the creds have been changed. SELinux uses this to flush signals and signal handlers. (f) If an error occurs before (d.i), bprm_free() will call abort_creds() to destroy the proposed new credentials and will then unlock cred_replace_mutex. No changes to the credentials will have been made. (2) LSM interface. A number of functions have been changed, added or removed: (*) security_bprm_alloc(), ->bprm_alloc_security() (*) security_bprm_free(), ->bprm_free_security() Removed in favour of preparing new credentials and modifying those. (*) security_bprm_apply_creds(), ->bprm_apply_creds() (*) security_bprm_post_apply_creds(), ->bprm_post_apply_creds() Removed; split between security_bprm_set_creds(), security_bprm_committing_creds() and security_bprm_committed_creds(). (*) security_bprm_set(), ->bprm_set_security() Removed; folded into security_bprm_set_creds(). (*) security_bprm_set_creds(), ->bprm_set_creds() New. The new credentials in bprm->creds should be checked and set up as appropriate. bprm->cred_prepared is 0 on the first call, 1 on the second and subsequent calls. (*) security_bprm_committing_creds(), ->bprm_committing_creds() (*) security_bprm_committed_creds(), ->bprm_committed_creds() New. Apply the security effects of the new credentials. This includes closing unauthorised files in SELinux. This function may not fail. When the former is called, the creds haven't yet been applied to the process; when the latter is called, they have. The former may access bprm->cred, the latter may not. (3) SELinux. SELinux has a number of changes, in addition to those to support the LSM interface changes mentioned above: (a) The bprm_security_struct struct has been removed in favour of using the credentials-under-construction approach. (c) flush_unauthorized_files() now takes a cred pointer and passes it on to inode_has_perm(), file_has_perm() and dentry_open(). Signed-off-by: David Howells Acked-by: James Morris Acked-by: Serge Hallyn Signed-off-by: James Morris --- kernel/cred.c | 46 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index cb6b5ed..e6fcdd6 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -68,7 +68,7 @@ static void release_tgcred_rcu(struct rcu_head *rcu) /* * Release a set of thread group credentials. */ -void release_tgcred(struct cred *cred) +static void release_tgcred(struct cred *cred) { #ifdef CONFIG_KEYS struct thread_group_cred *tgcred = cred->tgcred; @@ -164,6 +164,50 @@ error: EXPORT_SYMBOL(prepare_creds); /* + * Prepare credentials for current to perform an execve() + * - The caller must hold current->cred_exec_mutex + */ +struct cred *prepare_exec_creds(void) +{ + struct thread_group_cred *tgcred = NULL; + struct cred *new; + +#ifdef CONFIG_KEYS + tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); + if (!tgcred) + return NULL; +#endif + + new = prepare_creds(); + if (!new) { + kfree(tgcred); + return new; + } + +#ifdef CONFIG_KEYS + /* newly exec'd tasks don't get a thread keyring */ + key_put(new->thread_keyring); + new->thread_keyring = NULL; + + /* create a new per-thread-group creds for all this set of threads to + * share */ + memcpy(tgcred, new->tgcred, sizeof(struct thread_group_cred)); + + atomic_set(&tgcred->usage, 1); + spin_lock_init(&tgcred->lock); + + /* inherit the session keyring; new process keyring */ + key_get(tgcred->session_keyring); + tgcred->process_keyring = NULL; + + release_tgcred(new); + new->tgcred = tgcred; +#endif + + return new; +} + +/* * prepare new credentials for the usermode helper dispatcher */ struct cred *prepare_usermodehelper_creds(void) -- cgit v1.1 From 98870ab0a5a3f1822aee681d2997017e1c87d026 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:26 +1100 Subject: CRED: Documentation Document credentials and the new credentials API. Signed-off-by: David Howells Signed-off-by: James Morris --- kernel/cred.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index e6fcdd6..b8bd2f9 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -1,4 +1,4 @@ -/* Task credentials management +/* Task credentials management - see Documentation/credentials.txt * * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) -- cgit v1.1 From 3b11a1decef07c19443d24ae926982bc8ec9f4c0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:26 +1100 Subject: CRED: Differentiate objective and effective subjective credentials on a task Differentiate the objective and real subjective credentials from the effective subjective credentials on a task by introducing a second credentials pointer into the task_struct. task_struct::real_cred then refers to the objective and apparent real subjective credentials of a task, as perceived by the other tasks in the system. task_struct::cred then refers to the effective subjective credentials of a task, as used by that task when it's actually running. These are not visible to the other tasks in the system. __task_cred(task) then refers to the objective/real credentials of the task in question. current_cred() refers to the effective subjective credentials of the current task. prepare_creds() uses the objective creds as a base and commit_creds() changes both pointers in the task_struct (indeed commit_creds() requires them to be the same). override_creds() and revert_creds() change the subjective creds pointer only, and the former returns the old subjective creds. These are used by NFSD, faccessat() and do_coredump(), and will by used by CacheFiles. In SELinux, current_has_perm() is provided as an alternative to task_has_perm(). This uses the effective subjective context of current, whereas task_has_perm() uses the objective/real context of the subject. Signed-off-by: David Howells Signed-off-by: James Morris --- kernel/cred.c | 38 ++++++++++++++++++++++++++------------ kernel/fork.c | 6 ++++-- 2 files changed, 30 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index b8bd2f9..f3ca106 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -35,7 +35,7 @@ static struct thread_group_cred init_tgcred = { * The initial credentials for the initial task */ struct cred init_cred = { - .usage = ATOMIC_INIT(3), + .usage = ATOMIC_INIT(4), .securebits = SECUREBITS_DEFAULT, .cap_inheritable = CAP_INIT_INH_SET, .cap_permitted = CAP_FULL_SET, @@ -120,6 +120,8 @@ EXPORT_SYMBOL(__put_cred); * prepare a new copy, which the caller then modifies and then commits by * calling commit_creds(). * + * Preparation involves making a copy of the objective creds for modification. + * * Returns a pointer to the new creds-to-be if successful, NULL otherwise. * * Call commit_creds() or abort_creds() to clean up. @@ -130,7 +132,7 @@ struct cred *prepare_creds(void) const struct cred *old; struct cred *new; - BUG_ON(atomic_read(&task->cred->usage) < 1); + BUG_ON(atomic_read(&task->real_cred->usage) < 1); new = kmem_cache_alloc(cred_jar, GFP_KERNEL); if (!new) @@ -262,6 +264,9 @@ error: * * We share if we can, but under some circumstances we have to generate a new * set. + * + * The new process gets the current process's subjective credentials as its + * objective and subjective credentials */ int copy_creds(struct task_struct *p, unsigned long clone_flags) { @@ -278,6 +283,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) #endif clone_flags & CLONE_THREAD ) { + p->real_cred = get_cred(p->cred); get_cred(p->cred); atomic_inc(&p->cred->user->processes); return 0; @@ -317,7 +323,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) #endif atomic_inc(&new->user->processes); - p->cred = new; + p->cred = p->real_cred = get_cred(new); return 0; } @@ -326,7 +332,9 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) * @new: The credentials to be assigned * * Install a new set of credentials to the current task, using RCU to replace - * the old set. + * the old set. Both the objective and the subjective credentials pointers are + * updated. This function may not be called if the subjective credentials are + * in an overridden state. * * This function eats the caller's reference to the new credentials. * @@ -338,12 +346,15 @@ int commit_creds(struct cred *new) struct task_struct *task = current; const struct cred *old; + BUG_ON(task->cred != task->real_cred); + BUG_ON(atomic_read(&task->real_cred->usage) < 2); BUG_ON(atomic_read(&new->usage) < 1); - BUG_ON(atomic_read(&task->cred->usage) < 1); - old = task->cred; + old = task->real_cred; security_commit_creds(new, old); + get_cred(new); /* we will require a ref for the subj creds too */ + /* dumpability changes */ if (old->euid != new->euid || old->egid != new->egid || @@ -369,6 +380,7 @@ int commit_creds(struct cred *new) */ if (new->user != old->user) atomic_inc(&new->user->processes); + rcu_assign_pointer(task->real_cred, new); rcu_assign_pointer(task->cred, new); if (new->user != old->user) atomic_dec(&old->user->processes); @@ -388,6 +400,8 @@ int commit_creds(struct cred *new) new->fsgid != old->fsgid) proc_id_connector(task, PROC_EVENT_GID); + /* release the old obj and subj refs both */ + put_cred(old); put_cred(old); return 0; } @@ -408,11 +422,11 @@ void abort_creds(struct cred *new) EXPORT_SYMBOL(abort_creds); /** - * override_creds - Temporarily override the current process's credentials + * override_creds - Override the current process's subjective credentials * @new: The credentials to be assigned * - * Install a set of temporary override credentials on the current process, - * returning the old set for later reversion. + * Install a set of temporary override subjective credentials on the current + * process, returning the old set for later reversion. */ const struct cred *override_creds(const struct cred *new) { @@ -424,11 +438,11 @@ const struct cred *override_creds(const struct cred *new) EXPORT_SYMBOL(override_creds); /** - * revert_creds - Revert a temporary credentials override + * revert_creds - Revert a temporary subjective credentials override * @old: The credentials to be restored * - * Revert a temporary set of override credentials to an old set, discarding the - * override set. + * Revert a temporary set of override subjective credentials to an old set, + * discarding the override set. */ void revert_creds(const struct cred *old) { diff --git a/kernel/fork.c b/kernel/fork.c index 82a7948..af0d0f0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -146,6 +146,7 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); + put_cred(tsk->real_cred); put_cred(tsk->cred); delayacct_tsk_free(tsk); @@ -961,10 +962,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif retval = -EAGAIN; - if (atomic_read(&p->cred->user->processes) >= + if (atomic_read(&p->real_cred->user->processes) >= p->signal->rlim[RLIMIT_NPROC].rlim_cur) { if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && - p->cred->user != current->nsproxy->user_ns->root_user) + p->real_cred->user != current->nsproxy->user_ns->root_user) goto bad_fork_free; } @@ -1278,6 +1279,7 @@ bad_fork_cleanup_put_domain: module_put(task_thread_info(p)->exec_domain->module); bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); + put_cred(p->real_cred); put_cred(p->cred); bad_fork_free: free_task(p); -- cgit v1.1 From 3a3b7ce9336952ea7b9564d976d068a238976c9d Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:28 +1100 Subject: CRED: Allow kernel services to override LSM settings for task actions Allow kernel services to override LSM settings appropriate to the actions performed by a task by duplicating a set of credentials, modifying it and then using task_struct::cred to point to it when performing operations on behalf of a task. This is used, for example, by CacheFiles which has to transparently access the cache on behalf of a process that thinks it is doing, say, NFS accesses with a potentially inappropriate (with respect to accessing the cache) set of credentials. This patch provides two LSM hooks for modifying a task security record: (*) security_kernel_act_as() which allows modification of the security datum with which a task acts on other objects (most notably files). (*) security_kernel_create_files_as() which allows modification of the security datum that is used to initialise the security data on a file that a task creates. The patch also provides four new credentials handling functions, which wrap the LSM functions: (1) prepare_kernel_cred() Prepare a set of credentials for a kernel service to use, based either on a daemon's credentials or on init_cred. All the keyrings are cleared. (2) set_security_override() Set the LSM security ID in a set of credentials to a specific security context, assuming permission from the LSM policy. (3) set_security_override_from_ctx() As (2), but takes the security context as a string. (4) set_create_files_as() Set the file creation LSM security ID in a set of credentials to be the same as that on a particular inode. Signed-off-by: Casey Schaufler [Smack changes] Signed-off-by: David Howells Signed-off-by: James Morris --- kernel/cred.c | 113 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 113 insertions(+) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index f3ca106..13697ca 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -462,3 +462,116 @@ void __init cred_init(void) cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); } + +/** + * prepare_kernel_cred - Prepare a set of credentials for a kernel service + * @daemon: A userspace daemon to be used as a reference + * + * Prepare a set of credentials for a kernel service. This can then be used to + * override a task's own credentials so that work can be done on behalf of that + * task that requires a different subjective context. + * + * @daemon is used to provide a base for the security record, but can be NULL. + * If @daemon is supplied, then the security data will be derived from that; + * otherwise they'll be set to 0 and no groups, full capabilities and no keys. + * + * The caller may change these controls afterwards if desired. + * + * Returns the new credentials or NULL if out of memory. + * + * Does not take, and does not return holding current->cred_replace_mutex. + */ +struct cred *prepare_kernel_cred(struct task_struct *daemon) +{ + const struct cred *old; + struct cred *new; + + new = kmem_cache_alloc(cred_jar, GFP_KERNEL); + if (!new) + return NULL; + + if (daemon) + old = get_task_cred(daemon); + else + old = get_cred(&init_cred); + + get_uid(new->user); + get_group_info(new->group_info); + +#ifdef CONFIG_KEYS + atomic_inc(&init_tgcred.usage); + new->tgcred = &init_tgcred; + new->request_key_auth = NULL; + new->thread_keyring = NULL; + new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; +#endif + +#ifdef CONFIG_SECURITY + new->security = NULL; +#endif + if (security_prepare_creds(new, old, GFP_KERNEL) < 0) + goto error; + + atomic_set(&new->usage, 1); + put_cred(old); + return new; + +error: + put_cred(new); + return NULL; +} +EXPORT_SYMBOL(prepare_kernel_cred); + +/** + * set_security_override - Set the security ID in a set of credentials + * @new: The credentials to alter + * @secid: The LSM security ID to set + * + * Set the LSM security ID in a set of credentials so that the subjective + * security is overridden when an alternative set of credentials is used. + */ +int set_security_override(struct cred *new, u32 secid) +{ + return security_kernel_act_as(new, secid); +} +EXPORT_SYMBOL(set_security_override); + +/** + * set_security_override_from_ctx - Set the security ID in a set of credentials + * @new: The credentials to alter + * @secctx: The LSM security context to generate the security ID from. + * + * Set the LSM security ID in a set of credentials so that the subjective + * security is overridden when an alternative set of credentials is used. The + * security ID is specified in string form as a security context to be + * interpreted by the LSM. + */ +int set_security_override_from_ctx(struct cred *new, const char *secctx) +{ + u32 secid; + int ret; + + ret = security_secctx_to_secid(secctx, strlen(secctx), &secid); + if (ret < 0) + return ret; + + return set_security_override(new, secid); +} +EXPORT_SYMBOL(set_security_override_from_ctx); + +/** + * set_create_files_as - Set the LSM file create context in a set of credentials + * @new: The credentials to alter + * @inode: The inode to take the context from + * + * Change the LSM file creation context in a set of credentials to be the same + * as the object context of the specified inode, so that the new inodes have + * the same MAC context as that inode. + */ +int set_create_files_as(struct cred *new, struct inode *inode) +{ + new->fsuid = inode->i_uid; + new->fsgid = inode->i_gid; + return security_kernel_create_files_as(new, inode); +} +EXPORT_SYMBOL(set_create_files_as); -- cgit v1.1 From ec4e0e2fe018992d980910db901637c814575914 Mon Sep 17 00:00:00 2001 From: Ken Chen Date: Tue, 18 Nov 2008 22:41:57 -0800 Subject: sched: fix inconsistency when redistribute per-cpu tg->cfs_rq shares Impact: make load-balancing more consistent In the update_shares() path leading to tg_shares_up(), the calculation of per-cpu cfs_rq shares is rather erratic even under moderate task wake up rate. The problem is that the per-cpu tg->cfs_rq load weight used in the sd_rq_weight aggregation and actual redistribution of the cfs_rq->shares are collected at different time. Under moderate system load, we've seen quite a bit of variation on the cfs_rq->shares and ultimately wildly affects sched_entity's load weight. This patch caches the result of initial per-cpu load weight when doing the sum calculation, and then pass it down to update_group_shares_cpu() for redistributing per-cpu cfs_rq shares. This allows consistent total cfs_rq shares across all CPUs. It also simplifies the rounding and zero load weight check. Signed-off-by: Ken Chen Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 41 +++++++++++++++-------------------------- 1 file changed, 15 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index a4c156d..93bfb08 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1453,27 +1453,13 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu, unsigned long sd_shares, unsigned long sd_rq_weight) { - int boost = 0; unsigned long shares; unsigned long rq_weight; if (!tg->se[cpu]) return; - rq_weight = tg->cfs_rq[cpu]->load.weight; - - /* - * If there are currently no tasks on the cpu pretend there is one of - * average load so that when a new task gets to run here it will not - * get delayed by group starvation. - */ - if (!rq_weight) { - boost = 1; - rq_weight = NICE_0_LOAD; - } - - if (unlikely(rq_weight > sd_rq_weight)) - rq_weight = sd_rq_weight; + rq_weight = tg->cfs_rq[cpu]->rq_weight; /* * \Sum shares * rq_weight @@ -1481,7 +1467,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu, * \Sum rq_weight * */ - shares = (sd_shares * rq_weight) / (sd_rq_weight + 1); + shares = (sd_shares * rq_weight) / sd_rq_weight; shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); if (abs(shares - tg->se[cpu]->load.weight) > @@ -1490,11 +1476,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu, unsigned long flags; spin_lock_irqsave(&rq->lock, flags); - /* - * record the actual number of shares, not the boosted amount. - */ - tg->cfs_rq[cpu]->shares = boost ? 0 : shares; - tg->cfs_rq[cpu]->rq_weight = rq_weight; + tg->cfs_rq[cpu]->shares = shares; __set_se_shares(tg->se[cpu], shares); spin_unlock_irqrestore(&rq->lock, flags); @@ -1508,13 +1490,23 @@ update_group_shares_cpu(struct task_group *tg, int cpu, */ static int tg_shares_up(struct task_group *tg, void *data) { - unsigned long rq_weight = 0; + unsigned long weight, rq_weight = 0; unsigned long shares = 0; struct sched_domain *sd = data; int i; for_each_cpu_mask(i, sd->span) { - rq_weight += tg->cfs_rq[i]->load.weight; + /* + * If there are currently no tasks on the cpu pretend there + * is one of average load so that when a new task gets to + * run here it will not get delayed by group starvation. + */ + weight = tg->cfs_rq[i]->load.weight; + if (!weight) + weight = NICE_0_LOAD; + + tg->cfs_rq[i]->rq_weight = weight; + rq_weight += weight; shares += tg->cfs_rq[i]->shares; } @@ -1524,9 +1516,6 @@ static int tg_shares_up(struct task_group *tg, void *data) if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) shares = tg->shares; - if (!rq_weight) - rq_weight = cpus_weight(sd->span) * NICE_0_LOAD; - for_each_cpu_mask(i, sd->span) update_group_shares_cpu(tg, i, shares, rq_weight); -- cgit v1.1 From 957ad0166e9f76a8561dafa5e14ef5bd3f5e9a3b Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Fri, 21 Nov 2008 01:30:36 +0100 Subject: sched: update comment for move_task_off_dead_cpu Impact: cleanup This commit: commit f7b4cddcc5aca03e80e357360c9424dfba5056c2 Author: Oleg Nesterov Date: Tue Oct 16 23:30:56 2007 -0700 do CPU_DEAD migrating under read_lock(tasklist) instead of write_lock_irq(ta Currently move_task_off_dead_cpu() is called under write_lock_irq(tasklist). This means it can't use task_lock() which is needed to improve migrating to take task's ->cpuset into account. Change the code to call move_task_off_dead_cpu() with irqs enabled, and change migrate_live_tasks() to use read_lock(tasklist). ...forgot to update the comment in front of move_task_off_dead_cpu. Reference: http://lkml.org/lkml/2008/6/23/135 Signed-off-by: Vegard Nossum Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 93bfb08..a6085d5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6094,7 +6094,6 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu) /* * Figure out where task on dead CPU should go, use force if necessary. - * NOTE: interrupts should be disabled by the caller */ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) { -- cgit v1.1 From 18b6e0414e42d95183f07d8177e3ff0241abd825 Mon Sep 17 00:00:00 2001 From: Serge Hallyn Date: Wed, 15 Oct 2008 16:38:45 -0500 Subject: User namespaces: set of cleanups (v2) The user_ns is moved from nsproxy to user_struct, so that a struct cred by itself is sufficient to determine access (which it otherwise would not be). Corresponding ecryptfs fixes (by David Howells) are here as well. Fix refcounting. The following rules now apply: 1. The task pins the user struct. 2. The user struct pins its user namespace. 3. The user namespace pins the struct user which created it. User namespaces are cloned during copy_creds(). Unsharing a new user_ns is no longer possible. (We could re-add that, but it'll cause code duplication and doesn't seem useful if PAM doesn't need to clone user namespaces). When a user namespace is created, its first user (uid 0) gets empty keyrings and a clean group_info. This incorporates a previous patch by David Howells. Here is his original patch description: >I suggest adding the attached incremental patch. It makes the following >changes: > > (1) Provides a current_user_ns() macro to wrap accesses to current's user > namespace. > > (2) Fixes eCryptFS. > > (3) Renames create_new_userns() to create_user_ns() to be more consistent > with the other associated functions and because the 'new' in the name is > superfluous. > > (4) Moves the argument and permission checks made for CLONE_NEWUSER to the > beginning of do_fork() so that they're done prior to making any attempts > at allocation. > > (5) Calls create_user_ns() after prepare_creds(), and gives it the new creds > to fill in rather than have it return the new root user. I don't imagine > the new root user being used for anything other than filling in a cred > struct. > > This also permits me to get rid of a get_uid() and a free_uid(), as the > reference the creds were holding on the old user_struct can just be > transferred to the new namespace's creator pointer. > > (6) Makes create_user_ns() reset the UIDs and GIDs of the creds under > preparation rather than doing it in copy_creds(). > >David >Signed-off-by: David Howells Changelog: Oct 20: integrate dhowells comments 1. leave thread_keyring alone 2. use current_user_ns() in set_user() Signed-off-by: Serge Hallyn --- kernel/cred.c | 15 ++++++++-- kernel/fork.c | 19 +++++++++++-- kernel/nsproxy.c | 15 ++-------- kernel/sys.c | 4 +-- kernel/user.c | 47 +++++++++---------------------- kernel/user_namespace.c | 75 ++++++++++++++++++++----------------------------- 6 files changed, 76 insertions(+), 99 deletions(-) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index 13697ca..ff7bc07 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -274,6 +274,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) struct thread_group_cred *tgcred; #endif struct cred *new; + int ret; mutex_init(&p->cred_exec_mutex); @@ -293,6 +294,12 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) if (!new) return -ENOMEM; + if (clone_flags & CLONE_NEWUSER) { + ret = create_user_ns(new); + if (ret < 0) + goto error_put; + } + #ifdef CONFIG_KEYS /* new threads get their own thread keyrings if their parent already * had one */ @@ -309,8 +316,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) if (!(clone_flags & CLONE_THREAD)) { tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); if (!tgcred) { - put_cred(new); - return -ENOMEM; + ret = -ENOMEM; + goto error_put; } atomic_set(&tgcred->usage, 1); spin_lock_init(&tgcred->lock); @@ -325,6 +332,10 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) atomic_inc(&new->user->processes); p->cred = p->real_cred = get_cred(new); return 0; + +error_put: + put_cred(new); + return ret; } /** diff --git a/kernel/fork.c b/kernel/fork.c index 29c18c1..1dd8945 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -976,7 +976,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (atomic_read(&p->real_cred->user->processes) >= p->signal->rlim[RLIMIT_NPROC].rlim_cur) { if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && - p->real_cred->user != current->nsproxy->user_ns->root_user) + p->real_cred->user != INIT_USER) goto bad_fork_free; } @@ -1335,6 +1335,20 @@ long do_fork(unsigned long clone_flags, long nr; /* + * Do some preliminary argument and permissions checking before we + * actually start allocating stuff + */ + if (clone_flags & CLONE_NEWUSER) { + if (clone_flags & CLONE_THREAD) + return -EINVAL; + /* hopefully this check will go away when userns support is + * complete + */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + } + + /* * We hope to recycle these flags after 2.6.26 */ if (unlikely(clone_flags & CLONE_STOPPED)) { @@ -1581,8 +1595,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) err = -EINVAL; if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| - CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER| - CLONE_NEWNET)) + CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) goto bad_unshare_out; /* diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 1d3ef29..63598dc 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -80,12 +80,6 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, goto out_pid; } - new_nsp->user_ns = copy_user_ns(flags, tsk->nsproxy->user_ns); - if (IS_ERR(new_nsp->user_ns)) { - err = PTR_ERR(new_nsp->user_ns); - goto out_user; - } - new_nsp->net_ns = copy_net_ns(flags, tsk->nsproxy->net_ns); if (IS_ERR(new_nsp->net_ns)) { err = PTR_ERR(new_nsp->net_ns); @@ -95,9 +89,6 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, return new_nsp; out_net: - if (new_nsp->user_ns) - put_user_ns(new_nsp->user_ns); -out_user: if (new_nsp->pid_ns) put_pid_ns(new_nsp->pid_ns); out_pid: @@ -130,7 +121,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) get_nsproxy(old_ns); if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNET))) + CLONE_NEWPID | CLONE_NEWNET))) return 0; if (!capable(CAP_SYS_ADMIN)) { @@ -173,8 +164,6 @@ void free_nsproxy(struct nsproxy *ns) put_ipc_ns(ns->ipc_ns); if (ns->pid_ns) put_pid_ns(ns->pid_ns); - if (ns->user_ns) - put_user_ns(ns->user_ns); put_net(ns->net_ns); kmem_cache_free(nsproxy_cachep, ns); } @@ -189,7 +178,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, int err = 0; if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWUSER | CLONE_NEWNET))) + CLONE_NEWNET))) return 0; if (!capable(CAP_SYS_ADMIN)) diff --git a/kernel/sys.c b/kernel/sys.c index ab73504..ebe65c2 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -565,13 +565,13 @@ static int set_user(struct cred *new) { struct user_struct *new_user; - new_user = alloc_uid(current->nsproxy->user_ns, new->uid); + new_user = alloc_uid(current_user_ns(), new->uid); if (!new_user) return -EAGAIN; if (atomic_read(&new_user->processes) >= current->signal->rlim[RLIMIT_NPROC].rlim_cur && - new_user != current->nsproxy->user_ns->root_user) { + new_user != INIT_USER) { free_uid(new_user); return -EAGAIN; } diff --git a/kernel/user.c b/kernel/user.c index d476307..c0ef3a4 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -20,9 +20,9 @@ struct user_namespace init_user_ns = { .kref = { - .refcount = ATOMIC_INIT(2), + .refcount = ATOMIC_INIT(1), }, - .root_user = &root_user, + .creator = &root_user, }; EXPORT_SYMBOL_GPL(init_user_ns); @@ -48,12 +48,14 @@ static struct kmem_cache *uid_cachep; */ static DEFINE_SPINLOCK(uidhash_lock); +/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->creator */ struct user_struct root_user = { - .__count = ATOMIC_INIT(1), + .__count = ATOMIC_INIT(2), .processes = ATOMIC_INIT(1), .files = ATOMIC_INIT(0), .sigpending = ATOMIC_INIT(0), .locked_shm = 0, + .user_ns = &init_user_ns, #ifdef CONFIG_USER_SCHED .tg = &init_task_group, #endif @@ -314,12 +316,13 @@ done: * IRQ state (as stored in flags) is restored and uidhash_lock released * upon function exit. */ -static inline void free_user(struct user_struct *up, unsigned long flags) +static void free_user(struct user_struct *up, unsigned long flags) { /* restore back the count */ atomic_inc(&up->__count); spin_unlock_irqrestore(&uidhash_lock, flags); + put_user_ns(up->user_ns); INIT_WORK(&up->work, remove_user_sysfs_dir); schedule_work(&up->work); } @@ -335,13 +338,14 @@ static inline void uids_mutex_unlock(void) { } * IRQ state (as stored in flags) is restored and uidhash_lock released * upon function exit. */ -static inline void free_user(struct user_struct *up, unsigned long flags) +static void free_user(struct user_struct *up, unsigned long flags) { uid_hash_remove(up); spin_unlock_irqrestore(&uidhash_lock, flags); sched_destroy_user(up); key_put(up->uid_keyring); key_put(up->session_keyring); + put_user_ns(up->user_ns); kmem_cache_free(uid_cachep, up); } @@ -357,7 +361,7 @@ struct user_struct *find_user(uid_t uid) { struct user_struct *ret; unsigned long flags; - struct user_namespace *ns = current->nsproxy->user_ns; + struct user_namespace *ns = current_user()->user_ns; spin_lock_irqsave(&uidhash_lock, flags); ret = uid_hash_find(uid, uidhashentry(ns, uid)); @@ -404,6 +408,8 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) if (sched_create_user(new) < 0) goto out_free_user; + new->user_ns = get_user_ns(ns); + if (uids_user_create(new)) goto out_destoy_sched; @@ -427,7 +433,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) up = new; } spin_unlock_irq(&uidhash_lock); - } uids_mutex_unlock(); @@ -436,6 +441,7 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) out_destoy_sched: sched_destroy_user(new); + put_user_ns(new->user_ns); out_free_user: kmem_cache_free(uid_cachep, new); out_unlock: @@ -443,33 +449,6 @@ out_unlock: return NULL; } -#ifdef CONFIG_USER_NS -void release_uids(struct user_namespace *ns) -{ - int i; - unsigned long flags; - struct hlist_head *head; - struct hlist_node *nd; - - spin_lock_irqsave(&uidhash_lock, flags); - /* - * collapse the chains so that the user_struct-s will - * be still alive, but not in hashes. subsequent free_uid() - * will free them. - */ - for (i = 0; i < UIDHASH_SZ; i++) { - head = ns->uidhash_table + i; - while (!hlist_empty(head)) { - nd = head->first; - hlist_del_init(nd); - } - } - spin_unlock_irqrestore(&uidhash_lock, flags); - - free_uid(ns->root_user); -} -#endif - static int __init uid_cache_init(void) { int n; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 0d9c51d..7908431 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -9,70 +9,55 @@ #include #include #include +#include /* - * Clone a new ns copying an original user ns, setting refcount to 1 - * @old_ns: namespace to clone - * Return NULL on error (failure to kmalloc), new ns otherwise + * Create a new user namespace, deriving the creator from the user in the + * passed credentials, and replacing that user with the new root user for the + * new namespace. + * + * This is called by copy_creds(), which will finish setting the target task's + * credentials. */ -static struct user_namespace *clone_user_ns(struct user_namespace *old_ns) +int create_user_ns(struct cred *new) { struct user_namespace *ns; - struct user_struct *new_user; - struct cred *new; + struct user_struct *root_user; int n; ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL); if (!ns) - return ERR_PTR(-ENOMEM); + return -ENOMEM; kref_init(&ns->kref); for (n = 0; n < UIDHASH_SZ; ++n) INIT_HLIST_HEAD(ns->uidhash_table + n); - /* Insert new root user. */ - ns->root_user = alloc_uid(ns, 0); - if (!ns->root_user) { + /* Alloc new root user. */ + root_user = alloc_uid(ns, 0); + if (!root_user) { kfree(ns); - return ERR_PTR(-ENOMEM); + return -ENOMEM; } - /* Reset current->user with a new one */ - new_user = alloc_uid(ns, current_uid()); - if (!new_user) { - free_uid(ns->root_user); - kfree(ns); - return ERR_PTR(-ENOMEM); - } - - /* Install the new user */ - new = prepare_creds(); - if (!new) { - free_uid(new_user); - free_uid(ns->root_user); - kfree(ns); - } - free_uid(new->user); - new->user = new_user; - commit_creds(new); - return ns; -} - -struct user_namespace * copy_user_ns(int flags, struct user_namespace *old_ns) -{ - struct user_namespace *new_ns; - - BUG_ON(!old_ns); - get_user_ns(old_ns); - - if (!(flags & CLONE_NEWUSER)) - return old_ns; + /* set the new root user in the credentials under preparation */ + ns->creator = new->user; + new->user = root_user; + new->uid = new->euid = new->suid = new->fsuid = 0; + new->gid = new->egid = new->sgid = new->fsgid = 0; + put_group_info(new->group_info); + new->group_info = get_group_info(&init_groups); +#ifdef CONFIG_KEYS + key_put(new->request_key_auth); + new->request_key_auth = NULL; +#endif + /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ - new_ns = clone_user_ns(old_ns); + /* alloc_uid() incremented the userns refcount. Just set it to 1 */ + kref_set(&ns->kref, 1); - put_user_ns(old_ns); - return new_ns; + return 0; } void free_user_ns(struct kref *kref) @@ -80,7 +65,7 @@ void free_user_ns(struct kref *kref) struct user_namespace *ns; ns = container_of(kref, struct user_namespace, kref); - release_uids(ns); + free_uid(ns->creator); kfree(ns); } EXPORT_SYMBOL(free_user_ns); -- cgit v1.1 From 6ded6ab9be4f6164aef1c527407c1b94f0929799 Mon Sep 17 00:00:00 2001 From: Serge Hallyn Date: Mon, 24 Nov 2008 16:24:10 -0500 Subject: User namespaces: use the current_user_ns() macro Fix up the last current_user()->user_ns instance to use current_user_ns(). Signed-off-by: Serge E. Hallyn --- kernel/user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index c0ef3a4..97202cb 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -361,7 +361,7 @@ struct user_struct *find_user(uid_t uid) { struct user_struct *ret; unsigned long flags; - struct user_namespace *ns = current_user()->user_ns; + struct user_namespace *ns = current_user_ns(); spin_lock_irqsave(&uidhash_lock, flags); ret = uid_hash_find(uid, uidhashentry(ns, uid)); -- cgit v1.1 From 70574a996fc7a70c5586eb56bd92a544eccf18b6 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 28 Nov 2008 22:08:00 +0300 Subject: sched: move double_unlock_balance() higher Move double_lock_balance()/double_unlock_balance() higher to fix the following with gcc-3.4.6: CC kernel/sched.o In file included from kernel/sched.c:1605: kernel/sched_rt.c: In function `find_lock_lowest_rq': kernel/sched_rt.c:914: sorry, unimplemented: inlining failed in call to 'double_unlock_balance': function body not available kernel/sched_rt.c:1077: sorry, unimplemented: called from here make[2]: *** [kernel/sched.o] Error 1 Signed-off-by: Alexey Dobriyan Signed-off-by: Ingo Molnar --- kernel/sched.c | 67 +++++++++++++++++++++++++++---------------------------- kernel/sched_rt.c | 4 ---- 2 files changed, 33 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 3d1ee42..6a99703 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1581,6 +1581,39 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) #endif +/* + * double_lock_balance - lock the busiest runqueue, this_rq is locked already. + */ +static int double_lock_balance(struct rq *this_rq, struct rq *busiest) + __releases(this_rq->lock) + __acquires(busiest->lock) + __acquires(this_rq->lock) +{ + int ret = 0; + + if (unlikely(!irqs_disabled())) { + /* printk() doesn't work good under rq->lock */ + spin_unlock(&this_rq->lock); + BUG_ON(1); + } + if (unlikely(!spin_trylock(&busiest->lock))) { + if (busiest < this_rq) { + spin_unlock(&this_rq->lock); + spin_lock(&busiest->lock); + spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); + ret = 1; + } else + spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); + } + return ret; +} + +static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) + __releases(busiest->lock) +{ + spin_unlock(&busiest->lock); + lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); +} #endif #ifdef CONFIG_FAIR_GROUP_SCHED @@ -2781,40 +2814,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) } /* - * double_lock_balance - lock the busiest runqueue, this_rq is locked already. - */ -static int double_lock_balance(struct rq *this_rq, struct rq *busiest) - __releases(this_rq->lock) - __acquires(busiest->lock) - __acquires(this_rq->lock) -{ - int ret = 0; - - if (unlikely(!irqs_disabled())) { - /* printk() doesn't work good under rq->lock */ - spin_unlock(&this_rq->lock); - BUG_ON(1); - } - if (unlikely(!spin_trylock(&busiest->lock))) { - if (busiest < this_rq) { - spin_unlock(&this_rq->lock); - spin_lock(&busiest->lock); - spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); - ret = 1; - } else - spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); - } - return ret; -} - -static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) - __releases(busiest->lock) -{ - spin_unlock(&busiest->lock); - lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); -} - -/* * If dest_cpu is allowed for this process, migrate the task to it. * This is accomplished by forcing the cpu_allowed mask to only * allow dest_cpu, which will force the cpu onto dest_cpu. Then diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 2bdd444..587a16e 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -909,10 +909,6 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) /* Only try algorithms three times */ #define RT_MAX_TRIES 3 -static int double_lock_balance(struct rq *this_rq, struct rq *busiest); -static inline void double_unlock_balance(struct rq *this_rq, - struct rq *busiest); - static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) -- cgit v1.1 From 6c415b9234a8c71f290e5d4fddc467f103f32719 Mon Sep 17 00:00:00 2001 From: Arun R Bharadwaj Date: Mon, 1 Dec 2008 20:49:05 +0530 Subject: sched: add uid information to sched_debug for CONFIG_USER_SCHED Impact: extend information in /proc/sched_debug This patch adds uid information in sched_debug for CONFIG_USER_SCHED Signed-off-by: Arun R Bharadwaj Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 10 ++++++++++ kernel/sched_debug.c | 6 +++++- kernel/user.c | 2 ++ 3 files changed, 17 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6a99703..4c7388e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -261,6 +261,10 @@ struct task_group { struct cgroup_subsys_state css; #endif +#ifdef CONFIG_USER_SCHED + uid_t uid; +#endif + #ifdef CONFIG_FAIR_GROUP_SCHED /* schedulable entities of this group on each cpu */ struct sched_entity **se; @@ -286,6 +290,12 @@ struct task_group { #ifdef CONFIG_USER_SCHED +/* Helper function to pass uid information to create_sched_user() */ +void set_tg_uid(struct user_struct *user) +{ + user->tg->uid = user->uid; +} + /* * Root task group. * Every UID task group (including init_task_group aka UID-0) will diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index baf2f17..4293cfa 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -160,10 +160,14 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cgroup_path(tg->css.cgroup, path, sizeof(path)); SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); +#elif defined(CONFIG_USER_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) + { + uid_t uid = cfs_rq->tg->uid; + SEQ_printf(m, "\ncfs_rq[%d] for UID: %u\n", cpu, uid); + } #else SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); #endif - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", SPLIT_NS(cfs_rq->exec_clock)); diff --git a/kernel/user.c b/kernel/user.c index 39d6159..cec2224 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -101,6 +101,8 @@ static int sched_create_user(struct user_struct *up) if (IS_ERR(up->tg)) rc = -ENOMEM; + set_tg_uid(up); + return rc; } -- cgit v1.1 From c37bbb0fdcc01610fd55604eb6927210a1d20044 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Wed, 3 Dec 2008 13:17:06 -0600 Subject: user namespaces: let user_ns be cloned with fairsched (These two patches are in the next-unacked branch of git://git.kernel.org/pub/scm/linux/kernel/git/sergeh/userns-2.6. If they get some ACKs, then I hope to feed this into security-next. After these two, I think we're ready to tackle userns+capabilities) Fairsched creates a per-uid directory under /sys/kernel/uids/. So when you clone(CLONE_NEWUSER), it tries to create /sys/kernel/uids/0, which already exists, and you get back -ENOMEM. This was supposed to be fixed by sysfs tagging, but that was postponed (ok, rejected until sysfs locking is fixed). So, just as with network namespaces, we just don't create those directories for user namespaces other than the init. Signed-off-by: Serge E. Hallyn Signed-off-by: James Morris --- kernel/user.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index 97202cb..6c924bc 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -246,6 +246,8 @@ static int uids_user_create(struct user_struct *up) int error; memset(kobj, 0, sizeof(struct kobject)); + if (up->user_ns != &init_user_ns) + return 0; kobj->kset = uids_kset; error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid); if (error) { @@ -281,6 +283,8 @@ static void remove_user_sysfs_dir(struct work_struct *w) unsigned long flags; int remove_user = 0; + if (up->user_ns != &init_user_ns) + return; /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del() * atomic. */ -- cgit v1.1 From 7657d90497f98426af17f0ac633a9b335bb7a8fb Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Wed, 3 Dec 2008 13:17:33 -0600 Subject: user namespaces: require cap_set{ug}id for CLONE_NEWUSER While ideally CLONE_NEWUSER will eventually require no privilege, the required permission checks are currently not there. As a result, CLONE_NEWUSER has the same effect as a setuid(0)+setgroups(1,"0"). While we already require CAP_SYS_ADMIN, requiring CAP_SETUID and CAP_SETGID seems appropriate. Signed-off-by: Serge E. Hallyn Acked-by: "Eric W. Biederman" Signed-off-by: James Morris --- kernel/fork.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 1dd8945..e3a85b3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1344,7 +1344,8 @@ long do_fork(unsigned long clone_flags, /* hopefully this check will go away when userns support is * complete */ - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) || + !capable(CAP_SETGID)) return -EPERM; } -- cgit v1.1 From 5436499e6098759c2340f8b906ea52f993dc4efb Mon Sep 17 00:00:00 2001 From: Ken Chen Date: Sun, 7 Dec 2008 18:47:37 -0800 Subject: sched: fix sd_parent_degenerate on non-numa smp machine Impact: optimize the sched domains tree some more The addition of SD_SERIALIZE flag added to SD_NODE_INIT prevented top level dummy numa sched_domain to be properly degenerated on non-numa smp machine. The reason is that in sd_parent_degenerate(), it found that the child and parent does not have comon sched_domain flags due to SD_SERIALIZE. However, for non-numa smp box, the top level is a dummy with a single sched_group. Filter out SD_SERIALIZE if it is on non-numa machine to properly degenerate top level node sched_domain. this will cut back some of the sd domain walk in the load balancer code. Signed-off-by: Ken Chen Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 1528282..74498c8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6768,6 +6768,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) SD_BALANCE_EXEC | SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES); + if (nr_node_ids == 1) + pflags &= ~SD_SERIALIZE; } if (~cflags & pflags) return 0; -- cgit v1.1 From efbe027e95dc13ac343b6130948418d7ead7ddf1 Mon Sep 17 00:00:00 2001 From: Vaidyanathan Srinivasan Date: Mon, 8 Dec 2008 20:52:49 +0530 Subject: sched: idle_balance() does not call load_balance_newidle() Impact: fix SD_BALANCE_NEWIDLEand broaden its use load_balance_newidle() does not get called if SD_BALANCE_NEWIDLE is set at higher level domain (3-CPU) and not in low level domain (2-MC). pulled_task is initialised to -1 and checked for non-zero which is always true if the lowest level sched_domain does not have SD_BALANCE_NEWIDLE flag set. Signed-off-by: Vaidyanathan Srinivasan Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 74498c8..bb9c638 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3685,7 +3685,7 @@ out_balanced: static void idle_balance(int this_cpu, struct rq *this_rq) { struct sched_domain *sd; - int pulled_task = -1; + int pulled_task = 0; unsigned long next_balance = jiffies + HZ; cpumask_t tmpmask; -- cgit v1.1 From 94d6a5f7341ebaff53d4e41cc81fab37f0d9fbed Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Mon, 8 Dec 2008 15:52:21 -0600 Subject: user namespaces: document CFS behavior Documented the currently bogus state of support for CFS user groups with user namespaces. In particular, all users in a user namespace should be children of the user which created the user namespace. This is yet to be implemented. Signed-off-by: Serge E. Hallyn Acked-by: Dhaval Giani Signed-off-by: Serge E. Hallyn Signed-off-by: James Morris --- kernel/user.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index 6c924bc..6608a3d 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -239,7 +239,13 @@ static struct kobj_type uids_ktype = { .release = uids_release, }; -/* create /sys/kernel/uids//cpu_share file for this user */ +/* + * Create /sys/kernel/uids//cpu_share file for this user + * We do not create this file for users in a user namespace (until + * sysfs tagging is implemented). + * + * See Documentation/scheduler/sched-design-CFS.txt for ramifications. + */ static int uids_user_create(struct user_struct *up) { struct kobject *kobj = &up->kobj; -- cgit v1.1 From ee79d1bdb6a10499e53f80b1e8d14110215178ba Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 9 Dec 2008 18:49:50 +0100 Subject: sched: let arch_update_cpu_topology indicate if topology changed Change arch_update_cpu_topology so it returns 1 if the cpu topology changed and 0 if it didn't change. This will be useful for the next patch which adds a call to this function in partition_sched_domains. Signed-off-by: Heiko Carstens Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index ef212da..fcfbbd9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7675,8 +7675,14 @@ static struct sched_domain_attr *dattr_cur; */ static cpumask_t fallback_doms; -void __attribute__((weak)) arch_update_cpu_topology(void) +/* + * arch_update_cpu_topology lets virtualized architectures update the + * cpu core maps. It is supposed to return 1 if the topology changed + * or 0 if it stayed the same. + */ +int __attribute__((weak)) arch_update_cpu_topology(void) { + return 0; } /* -- cgit v1.1 From d65bd5ecb2bd166cea4952a59b7e16cc3ad6ef6c Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 9 Dec 2008 18:49:51 +0100 Subject: sched: add missing arch_update_cpu_topology() call arch_reinit_sched_domains() used to call arch_update_cpu_topology() via arch_init_sched_domains(). This call got lost with e761b7725234276a802322549cee5255305a0930 ("cpu hotplug, sched: Introduce cpu_active_map and redo sched domain managment (take 2)". So we might end up with outdated and missing cpus in the cpu core maps (architecture used to call arch_reinit_sched_domains if cpu topology changed). This adds a call to arch_update_cpu_topology in partition_sched_domains which gets called whenever scheduling domains get updated. Which is what is supposed to happen when cpu topology changes. Signed-off-by: Heiko Carstens Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index fcfbbd9..ad7b93b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7774,17 +7774,21 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, struct sched_domain_attr *dattr_new) { int i, j, n; + int new_topology; mutex_lock(&sched_domains_mutex); /* always unregister in case we don't destroy any domains */ unregister_sched_domain_sysctl(); + /* Let architecture update cpu core mappings. */ + new_topology = arch_update_cpu_topology(); + n = doms_new ? ndoms_new : 0; /* Destroy deleted domains */ for (i = 0; i < ndoms_cur; i++) { - for (j = 0; j < n; j++) { + for (j = 0; j < n && !new_topology; j++) { if (cpus_equal(doms_cur[i], doms_new[j]) && dattrs_equal(dattr_cur, i, dattr_new, j)) goto match1; @@ -7804,7 +7808,7 @@ match1: /* Build new domains */ for (i = 0; i < ndoms_new; i++) { - for (j = 0; j < ndoms_cur; j++) { + for (j = 0; j < ndoms_cur && !new_topology; j++) { if (cpus_equal(doms_new[i], doms_cur[j]) && dattrs_equal(dattr_new, i, dattr_cur, j)) goto match2; -- cgit v1.1 From 03e89e4574a680af15f59329b061f35d9813aff4 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 16 Dec 2008 08:45:30 +0100 Subject: sched: fix wakeup preemption clock Impact: sharpen the wakeup-granularity to always be against current scheduler time It was possible to do the preemption check against an old time stamp. Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- kernel/sched_fair.c | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index ad7b93b..8821506 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2266,6 +2266,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) smp_wmb(); rq = task_rq_lock(p, &flags); + update_rq_clock(rq); old_state = p->state; if (!(old_state & state)) goto out; @@ -2323,7 +2324,6 @@ out_activate: schedstat_inc(p, se.nr_wakeups_local); else schedstat_inc(p, se.nr_wakeups_remote); - update_rq_clock(rq); activate_task(rq, p, 1); success = 1; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 98345e4..928cd74 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1345,12 +1345,11 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) { struct task_struct *curr = rq->curr; struct sched_entity *se = &curr->se, *pse = &p->se; + struct cfs_rq *cfs_rq = task_cfs_rq(curr); - if (unlikely(rt_prio(p->prio))) { - struct cfs_rq *cfs_rq = task_cfs_rq(curr); + update_curr(cfs_rq); - update_rq_clock(rq); - update_curr(cfs_rq); + if (unlikely(rt_prio(p->prio))) { resched_task(curr); return; } -- cgit v1.1 From 34f28ecd0f4bdc733c681294d02d9fab5880591b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 16 Dec 2008 08:45:31 +0100 Subject: sched: optimize update_curr() Impact: micro-optimization Skip the hard work when there is none. Signed-off-by: Peter Zijlstra Acked-by: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 928cd74..5ad4440 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -492,6 +492,8 @@ static void update_curr(struct cfs_rq *cfs_rq) * overflow on 32 bits): */ delta_exec = (unsigned long)(now - curr->exec_start); + if (!delta_exec) + return; __update_curr(cfs_rq, curr, delta_exec); curr->exec_start = now; -- cgit v1.1 From 720f54988e17b33f3f477010b3a68ee872d20d5a Mon Sep 17 00:00:00 2001 From: Ken Chen Date: Mon, 15 Dec 2008 22:02:01 -0800 Subject: sched, cpuacct: refactoring cpuusage_read / cpuusage_write Impact: micro-optimize the code on 64-bit architectures In the thread regarding to 'export percpu cpuacct cgroup stats' http://lkml.org/lkml/2008/12/7/13 akpm pointed out that current cpuacct code is inefficient. This patch refactoring the following: * make cpu_rq locking only on 32-bit * change iterator to each_present_cpu instead of each_possible_cpu to make it hotplug friendly. It's a bit of code churn, but I was rewarded with 160 byte code size saving on x86-64 arch and zero code size change on i386. Signed-off-by: Ken Chen Cc: Paul Menage Cc: Li Zefan Signed-off-by: Ingo Molnar --- kernel/sched.c | 56 +++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 39 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 8821506..41b7e2d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9275,6 +9275,41 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) kfree(ca); } +static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) +{ + u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + u64 data; + +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit read safe on 32-bit platforms. + */ + spin_lock_irq(&cpu_rq(cpu)->lock); + data = *cpuusage; + spin_unlock_irq(&cpu_rq(cpu)->lock); +#else + data = *cpuusage; +#endif + + return data; +} + +static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) +{ + u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit write safe on 32-bit platforms. + */ + spin_lock_irq(&cpu_rq(cpu)->lock); + *cpuusage = val; + spin_unlock_irq(&cpu_rq(cpu)->lock); +#else + *cpuusage = val; +#endif +} + /* return total cpu usage (in nanoseconds) of a group */ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) { @@ -9282,17 +9317,8 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) u64 totalcpuusage = 0; int i; - for_each_possible_cpu(i) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, i); - - /* - * Take rq->lock to make 64-bit addition safe on 32-bit - * platforms. - */ - spin_lock_irq(&cpu_rq(i)->lock); - totalcpuusage += *cpuusage; - spin_unlock_irq(&cpu_rq(i)->lock); - } + for_each_present_cpu(i) + totalcpuusage += cpuacct_cpuusage_read(ca, i); return totalcpuusage; } @@ -9309,13 +9335,9 @@ static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, goto out; } - for_each_possible_cpu(i) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, i); + for_each_present_cpu(i) + cpuacct_cpuusage_write(ca, i, 0); - spin_lock_irq(&cpu_rq(i)->lock); - *cpuusage = 0; - spin_unlock_irq(&cpu_rq(i)->lock); - } out: return err; } -- cgit v1.1 From e9515c3c9feecd74174c2998add0db51e02abb8d Mon Sep 17 00:00:00 2001 From: Ken Chen Date: Mon, 15 Dec 2008 22:04:15 -0800 Subject: sched, cpuacct: export percpu cpuacct cgroup stats This patch export per-cpu CPU cycle usage for a given cpuacct cgroup. There is a need for a user space monitor daemon to track group CPU usage on per-cpu base. It is also useful for monitoring CFS load balancer behavior by tracking per CPU group usage. Signed-off-by: Ken Chen Reviewed-by: Li Zefan Reviewed-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 41b7e2d..f53e2b8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9342,12 +9342,32 @@ out: return err; } +static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, + struct seq_file *m) +{ + struct cpuacct *ca = cgroup_ca(cgroup); + u64 percpu; + int i; + + for_each_present_cpu(i) { + percpu = cpuacct_cpuusage_read(ca, i); + seq_printf(m, "%llu ", (unsigned long long) percpu); + } + seq_printf(m, "\n"); + return 0; +} + static struct cftype files[] = { { .name = "usage", .read_u64 = cpuusage_read, .write_u64 = cpuusage_write, }, + { + .name = "usage_percpu", + .read_seq_string = cpuacct_percpu_seq_read, + }, + }; static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) -- cgit v1.1 From 80f40ee4a07530cc3acbc239a9299ec47025825b Mon Sep 17 00:00:00 2001 From: Bharata B Rao Date: Mon, 15 Dec 2008 11:56:48 +0530 Subject: sched: use RCU variant of list traversal in for_each_leaf_rt_rq() Impact: fix potential of rare crash for_each_leaf_rt_rq() walks an RCU protected list (rq->leaf_rt_rq_list), but doesn't use list_for_each_entry_rcu(). Fix this. Signed-off-by: Bharata B Rao Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index d9ba9d5..7bdf84c 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -77,7 +77,7 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq) } #define for_each_leaf_rt_rq(rt_rq, rq) \ - list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) + list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) { -- cgit v1.1 From 9c2c48020ec0dd6ecd27e5a1298f73b40d85a595 Mon Sep 17 00:00:00 2001 From: Ken Chen Date: Tue, 16 Dec 2008 23:41:22 -0800 Subject: schedstat: consolidate per-task cpu runtime stats Impact: simplify code When we turn on CONFIG_SCHEDSTATS, per-task cpu runtime is accumulated twice. Once in task->se.sum_exec_runtime and once in sched_info.cpu_time. These two stats are exactly the same. Given that task->se.sum_exec_runtime is always accumulated by the core scheduler, sched_info can reuse that data instead of duplicate the accounting. Signed-off-by: Ken Chen Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/delayacct.c | 2 +- kernel/sched.c | 2 ++ kernel/sched_stats.h | 5 ++--- 3 files changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/delayacct.c b/kernel/delayacct.c index b3179da..abb6e17 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -127,7 +127,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) */ t1 = tsk->sched_info.pcount; t2 = tsk->sched_info.run_delay; - t3 = tsk->sched_info.cpu_time; + t3 = tsk->se.sum_exec_runtime; d->cpu_count += t1; diff --git a/kernel/sched.c b/kernel/sched.c index f53e2b8..fd835fc 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -596,6 +596,8 @@ struct rq { #ifdef CONFIG_SCHEDSTATS /* latency stats */ struct sched_info rq_sched_info; + unsigned long long rq_cpu_time; + /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ /* sys_sched_yield() stats */ unsigned int yld_exp_empty; diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 7dbf72a..3b01098 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -31,7 +31,7 @@ static int show_schedstat(struct seq_file *seq, void *v) rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count, rq->sched_switch, rq->sched_count, rq->sched_goidle, rq->ttwu_count, rq->ttwu_local, - rq->rq_sched_info.cpu_time, + rq->rq_cpu_time, rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); seq_printf(seq, "\n"); @@ -123,7 +123,7 @@ static inline void rq_sched_info_depart(struct rq *rq, unsigned long long delta) { if (rq) - rq->rq_sched_info.cpu_time += delta; + rq->rq_cpu_time += delta; } static inline void @@ -236,7 +236,6 @@ static inline void sched_info_depart(struct task_struct *t) unsigned long long delta = task_rq(t)->clock - t->sched_info.last_arrival; - t->sched_info.cpu_time += delta; rq_sched_info_depart(task_rq(t), delta); if (t->state == TASK_RUNNING) -- cgit v1.1 From 9bb482476c6c9d1ae033306440c51ceac93ea80c Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 16 Dec 2008 11:30:08 +0000 Subject: allow stripping of generated symbols under CONFIG_KALLSYMS_ALL Building upon parts of the module stripping patch, this patch introduces similar stripping for vmlinux when CONFIG_KALLSYMS_ALL=y. Using CONFIG_KALLSYMS_STRIP_GENERATED reduces the overhead of CONFIG_KALLSYMS_ALL from 245k/310k to 65k/80k for the (i386/x86-64) kernels I tested with. The patch also does away with the need to special case the kallsyms- internal symbols by making them available even in the first linking stage. While it is a generated file, the patch includes the changes to scripts/genksyms/keywords.c_shipped, as I'm unsure what the procedure here is. Signed-off-by: Jan Beulich Signed-off-by: Sam Ravnborg --- kernel/kallsyms.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 7b8b0f2..e694afa 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -30,20 +30,19 @@ #define all_var 0 #endif -/* These will be re-linked against their real values during the second link stage */ -extern const unsigned long kallsyms_addresses[] __attribute__((weak)); -extern const u8 kallsyms_names[] __attribute__((weak)); +extern const unsigned long kallsyms_addresses[]; +extern const u8 kallsyms_names[]; /* tell the compiler that the count isn't in the small data section if the arch * has one (eg: FRV) */ extern const unsigned long kallsyms_num_syms -__attribute__((weak, section(".rodata"))); + __attribute__((__section__(".rodata"))); -extern const u8 kallsyms_token_table[] __attribute__((weak)); -extern const u16 kallsyms_token_index[] __attribute__((weak)); +extern const u8 kallsyms_token_table[]; +extern const u16 kallsyms_token_index[]; -extern const unsigned long kallsyms_markers[] __attribute__((weak)); +extern const unsigned long kallsyms_markers[]; static inline int is_kernel_inittext(unsigned long addr) { @@ -168,9 +167,6 @@ static unsigned long get_symbol_pos(unsigned long addr, unsigned long symbol_start = 0, symbol_end = 0; unsigned long i, low, high, mid; - /* This kernel should never had been booted. */ - BUG_ON(!kallsyms_addresses); - /* do a binary search on the sorted kallsyms_addresses array */ low = 0; high = kallsyms_num_syms; -- cgit v1.1 From 412d0bb553c0227191f1bfd06100f561600bff22 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 24 Dec 2008 01:43:25 +0100 Subject: tracing/function-graph-tracer: strip ending newlines on comments Impact: tracer output improvement Ending newlines are appended automatically on comments by the function graph tracer because the newline needs to be placed after the "*/" comment characters. So if the user puts an ending newline, we want to strip it. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace_functions_graph.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 4bf39fc..bc7d908 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -592,6 +592,12 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s, if (ent->flags & TRACE_FLAG_CONT) trace_seq_print_cont(s, iter); + /* Strip ending newline */ + if (s->buffer[s->len - 1] == '\n') { + s->buffer[s->len - 1] = '\0'; + s->len--; + } + ret = trace_seq_printf(s, " */\n"); if (!ret) return TRACE_TYPE_PARTIAL_LINE; -- cgit v1.1