Diffstat (limited to 'kernel')
85 files changed, 3862 insertions, 2479 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index e1c5bf3..2aebc4c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,8 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
 	    rcupdate.o extable.o params.o posix-timers.o \
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
-	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o
+	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
+	    async.o
 
 ifdef CONFIG_FUNCTION_TRACER
 # Do not trace debug files and internal ftrace files
@@ -39,7 +40,11 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
 obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
 obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
-obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o
+ifeq ($(CONFIG_USE_GENERIC_SMP_HELPERS),y)
+obj-y += smp.o
+else
+obj-y += up.o
+endif
 obj-$(CONFIG_SMP) += spinlock.o
 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
 obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
diff --git a/kernel/async.c b/kernel/async.c
new file mode 100644
index 0000000..608b32b
--- /dev/null
+++ b/kernel/async.c
@@ -0,0 +1,346 @@
+/*
+ * async.c: Asynchronous function calls for boot performance
+ *
+ * (C) Copyright 2009 Intel Corporation
+ * Author: Arjan van de Ven <arjan@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+
+/*
+
+Goals and Theory of Operation
+
+The primary goal of this feature is to reduce the kernel boot time,
+by doing various independent hardware delays and discovery operations
+decoupled and not strictly serialized.
+
+More specifically, the asynchronous function call concept allows
+certain operations (primarily during system boot) to happen
+asynchronously, out of order, while these operations still
+have their externally visible parts happen sequentially and in-order.
+(not unlike how out-of-order CPUs retire their instructions in order)
+
+Key to the asynchronous function call implementation is the concept of
+a "sequence cookie" (which, although it has an abstracted type, can be
+thought of as a monotonically incrementing number).
+
+The async core will assign each scheduled event such a sequence cookie and
+pass this to the called functions.
+
+The asynchronously called function should before doing a globally visible
+operation, such as registering device numbers, call the
+async_synchronize_cookie() function and pass in its own cookie. The
+async_synchronize_cookie() function will make sure that all asynchronous
+operations that were scheduled prior to the operation corresponding with the
+cookie have completed.
+
+Subsystem/driver initialization code that scheduled asynchronous probe
+functions, but which shares global resources with other drivers/subsystems
+that do not use the asynchronous call feature, need to do a full
+synchronization with the async_synchronize_full() function, before returning
+from their init function. This is to maintain strict ordering between the
+asynchronous and synchronous parts of the kernel.
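As a concrete illustration of the cookie mechanism described above, a driver
might look roughly like the sketch below. This sketch is not part of the
patch; my_dev_probe(), my_dev_hw_init(), my_dev_register() and my_device are
hypothetical names, and only async_schedule() and async_synchronize_cookie()
come from this file:

	/* Illustrative sketch only -- not part of this patch. */
	static void my_dev_probe(void *data, async_cookie_t cookie)
	{
		/* slow, independent work: may run out of order vs. other probes */
		my_dev_hw_init(data);

		/*
		 * Wait until everything scheduled before this cookie has
		 * completed, then do the externally visible step in-order.
		 */
		async_synchronize_cookie(cookie);
		my_dev_register(data);
	}

	static int __init my_dev_init(void)
	{
		async_schedule(my_dev_probe, &my_device);
		return 0;
	}

If such an init path shared global resources with code that does not use the
async machinery, it would additionally call async_synchronize_full() before
returning from its init function, as the comment above describes.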
+ +*/ + +#include <linux/async.h> +#include <linux/module.h> +#include <linux/wait.h> +#include <linux/sched.h> +#include <linux/init.h> +#include <linux/kthread.h> +#include <asm/atomic.h> + +static async_cookie_t next_cookie = 1; + +#define MAX_THREADS 256 +#define MAX_WORK 32768 + +static LIST_HEAD(async_pending); +static LIST_HEAD(async_running); +static DEFINE_SPINLOCK(async_lock); + +static int async_enabled = 0; + +struct async_entry { + struct list_head list; + async_cookie_t cookie; + async_func_ptr *func; + void *data; + struct list_head *running; +}; + +static DECLARE_WAIT_QUEUE_HEAD(async_done); +static DECLARE_WAIT_QUEUE_HEAD(async_new); + +static atomic_t entry_count; +static atomic_t thread_count; + +extern int initcall_debug; + + +/* + * MUST be called with the lock held! + */ +static async_cookie_t __lowest_in_progress(struct list_head *running) +{ + struct async_entry *entry; + if (!list_empty(running)) { + entry = list_first_entry(running, + struct async_entry, list); + return entry->cookie; + } else if (!list_empty(&async_pending)) { + entry = list_first_entry(&async_pending, + struct async_entry, list); + return entry->cookie; + } else { + /* nothing in progress... next_cookie is "infinity" */ + return next_cookie; + } + +} + +static async_cookie_t lowest_in_progress(struct list_head *running) +{ + unsigned long flags; + async_cookie_t ret; + + spin_lock_irqsave(&async_lock, flags); + ret = __lowest_in_progress(running); + spin_unlock_irqrestore(&async_lock, flags); + return ret; +} +/* + * pick the first pending entry and run it + */ +static void run_one_entry(void) +{ + unsigned long flags; + struct async_entry *entry; + ktime_t calltime, delta, rettime; + + /* 1) pick one task from the pending queue */ + + spin_lock_irqsave(&async_lock, flags); + if (list_empty(&async_pending)) + goto out; + entry = list_first_entry(&async_pending, struct async_entry, list); + + /* 2) move it to the running queue */ + list_del(&entry->list); + list_add_tail(&entry->list, &async_running); + spin_unlock_irqrestore(&async_lock, flags); + + /* 3) run it (and print duration)*/ + if (initcall_debug && system_state == SYSTEM_BOOTING) { + printk("calling %lli_%pF @ %i\n", entry->cookie, entry->func, task_pid_nr(current)); + calltime = ktime_get(); + } + entry->func(entry->data, entry->cookie); + if (initcall_debug && system_state == SYSTEM_BOOTING) { + rettime = ktime_get(); + delta = ktime_sub(rettime, calltime); + printk("initcall %lli_%pF returned 0 after %lld usecs\n", entry->cookie, + entry->func, ktime_to_ns(delta) >> 10); + } + + /* 4) remove it from the running queue */ + spin_lock_irqsave(&async_lock, flags); + list_del(&entry->list); + + /* 5) free the entry */ + kfree(entry); + atomic_dec(&entry_count); + + spin_unlock_irqrestore(&async_lock, flags); + + /* 6) wake up any waiters. */ + wake_up(&async_done); + return; + +out: + spin_unlock_irqrestore(&async_lock, flags); +} + + +static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running) +{ + struct async_entry *entry; + unsigned long flags; + async_cookie_t newcookie; + + + /* allow irq-off callers */ + entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC); + + /* + * If we're out of memory or if there's too much work + * pending already, we execute synchronously. 
+ */ + if (!async_enabled || !entry || atomic_read(&entry_count) > MAX_WORK) { + kfree(entry); + spin_lock_irqsave(&async_lock, flags); + newcookie = next_cookie++; + spin_unlock_irqrestore(&async_lock, flags); + + /* low on memory.. run synchronously */ + ptr(data, newcookie); + return newcookie; + } + entry->func = ptr; + entry->data = data; + entry->running = running; + + spin_lock_irqsave(&async_lock, flags); + newcookie = entry->cookie = next_cookie++; + list_add_tail(&entry->list, &async_pending); + atomic_inc(&entry_count); + spin_unlock_irqrestore(&async_lock, flags); + wake_up(&async_new); + return newcookie; +} + +async_cookie_t async_schedule(async_func_ptr *ptr, void *data) +{ + return __async_schedule(ptr, data, &async_pending); +} +EXPORT_SYMBOL_GPL(async_schedule); + +async_cookie_t async_schedule_special(async_func_ptr *ptr, void *data, struct list_head *running) +{ + return __async_schedule(ptr, data, running); +} +EXPORT_SYMBOL_GPL(async_schedule_special); + +void async_synchronize_full(void) +{ + do { + async_synchronize_cookie(next_cookie); + } while (!list_empty(&async_running) || !list_empty(&async_pending)); +} +EXPORT_SYMBOL_GPL(async_synchronize_full); + +void async_synchronize_full_special(struct list_head *list) +{ + async_synchronize_cookie_special(next_cookie, list); +} +EXPORT_SYMBOL_GPL(async_synchronize_full_special); + +void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *running) +{ + ktime_t starttime, delta, endtime; + + if (initcall_debug && system_state == SYSTEM_BOOTING) { + printk("async_waiting @ %i\n", task_pid_nr(current)); + starttime = ktime_get(); + } + + wait_event(async_done, lowest_in_progress(running) >= cookie); + + if (initcall_debug && system_state == SYSTEM_BOOTING) { + endtime = ktime_get(); + delta = ktime_sub(endtime, starttime); + + printk("async_continuing @ %i after %lli usec\n", + task_pid_nr(current), ktime_to_ns(delta) >> 10); + } +} +EXPORT_SYMBOL_GPL(async_synchronize_cookie_special); + +void async_synchronize_cookie(async_cookie_t cookie) +{ + async_synchronize_cookie_special(cookie, &async_running); +} +EXPORT_SYMBOL_GPL(async_synchronize_cookie); + + +static int async_thread(void *unused) +{ + DECLARE_WAITQUEUE(wq, current); + add_wait_queue(&async_new, &wq); + + while (!kthread_should_stop()) { + int ret = HZ; + set_current_state(TASK_INTERRUPTIBLE); + /* + * check the list head without lock.. false positives + * are dealt with inside run_one_entry() while holding + * the lock. + */ + rmb(); + if (!list_empty(&async_pending)) + run_one_entry(); + else + ret = schedule_timeout(HZ); + + if (ret == 0) { + /* + * we timed out, this means we as thread are redundant. + * we sign off and die, but we to avoid any races there + * is a last-straw check to see if work snuck in. + */ + atomic_dec(&thread_count); + wmb(); /* manager must see our departure first */ + if (list_empty(&async_pending)) + break; + /* + * woops work came in between us timing out and us + * signing off; we need to stay alive and keep working. 
+ */ + atomic_inc(&thread_count); + } + } + remove_wait_queue(&async_new, &wq); + + return 0; +} + +static int async_manager_thread(void *unused) +{ + DECLARE_WAITQUEUE(wq, current); + add_wait_queue(&async_new, &wq); + + while (!kthread_should_stop()) { + int tc, ec; + + set_current_state(TASK_INTERRUPTIBLE); + + tc = atomic_read(&thread_count); + rmb(); + ec = atomic_read(&entry_count); + + while (tc < ec && tc < MAX_THREADS) { + kthread_run(async_thread, NULL, "async/%i", tc); + atomic_inc(&thread_count); + tc++; + } + + schedule(); + } + remove_wait_queue(&async_new, &wq); + + return 0; +} + +static int __init async_init(void) +{ + if (async_enabled) + kthread_run(async_manager_thread, NULL, "async/mgr"); + return 0; +} + +static int __init setup_async(char *str) +{ + async_enabled = 1; + return 1; +} + +__setup("fastboot", setup_async); + + +core_initcall(async_init); diff --git a/kernel/audit.h b/kernel/audit.h index 9d67174..16f18ca 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -159,11 +159,8 @@ static inline int audit_signal_info(int sig, struct task_struct *t) return __audit_signal_info(sig, t); return 0; } -extern enum audit_state audit_filter_inodes(struct task_struct *, - struct audit_context *); -extern void audit_set_auditable(struct audit_context *); +extern void audit_filter_inodes(struct task_struct *, struct audit_context *); #else #define audit_signal_info(s,t) AUDIT_DISABLED #define audit_filter_inodes(t,c) AUDIT_DISABLED -#define audit_set_auditable(c) #endif diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 8b50944..8ad9545 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -450,6 +450,7 @@ static void kill_rules(struct audit_tree *tree) audit_log_end(ab); rule->tree = NULL; list_del_rcu(&entry->list); + list_del(&entry->rule.list); call_rcu(&entry->rcu, audit_free_rule_rcu); } } @@ -617,7 +618,7 @@ int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op) if (pathname[0] != '/' || rule->listnr != AUDIT_FILTER_EXIT || - op & ~AUDIT_EQUAL || + op != Audit_equal || rule->inode_f || rule->watch || rule->tree) return -EINVAL; rule->tree = alloc_tree(pathname); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 9fd85a4..fbf24d1 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -86,6 +86,14 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { #error Fix audit_filter_list initialiser #endif }; +static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = { + LIST_HEAD_INIT(audit_rules_list[0]), + LIST_HEAD_INIT(audit_rules_list[1]), + LIST_HEAD_INIT(audit_rules_list[2]), + LIST_HEAD_INIT(audit_rules_list[3]), + LIST_HEAD_INIT(audit_rules_list[4]), + LIST_HEAD_INIT(audit_rules_list[5]), +}; DEFINE_MUTEX(audit_filter_mutex); @@ -244,7 +252,8 @@ static inline int audit_to_inode(struct audit_krule *krule, struct audit_field *f) { if (krule->listnr != AUDIT_FILTER_EXIT || - krule->watch || krule->inode_f || krule->tree) + krule->watch || krule->inode_f || krule->tree || + (f->op != Audit_equal && f->op != Audit_not_equal)) return -EINVAL; krule->inode_f = f; @@ -262,7 +271,7 @@ static int audit_to_watch(struct audit_krule *krule, char *path, int len, if (path[0] != '/' || path[len-1] == '/' || krule->listnr != AUDIT_FILTER_EXIT || - op & ~AUDIT_EQUAL || + op != Audit_equal || krule->inode_f || krule->watch || krule->tree) return -EINVAL; @@ -412,12 +421,32 @@ exit_err: return ERR_PTR(err); } +static u32 audit_ops[] = +{ + [Audit_equal] = AUDIT_EQUAL, + [Audit_not_equal] = AUDIT_NOT_EQUAL, + [Audit_bitmask] = 
AUDIT_BIT_MASK, + [Audit_bittest] = AUDIT_BIT_TEST, + [Audit_lt] = AUDIT_LESS_THAN, + [Audit_gt] = AUDIT_GREATER_THAN, + [Audit_le] = AUDIT_LESS_THAN_OR_EQUAL, + [Audit_ge] = AUDIT_GREATER_THAN_OR_EQUAL, +}; + +static u32 audit_to_op(u32 op) +{ + u32 n; + for (n = Audit_equal; n < Audit_bad && audit_ops[n] != op; n++) + ; + return n; +} + + /* Translate struct audit_rule to kernel's rule respresentation. * Exists for backward compatibility with userspace. */ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) { struct audit_entry *entry; - struct audit_field *ino_f; int err = 0; int i; @@ -427,12 +456,28 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) for (i = 0; i < rule->field_count; i++) { struct audit_field *f = &entry->rule.fields[i]; + u32 n; + + n = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS); + + /* Support for legacy operators where + * AUDIT_NEGATE bit signifies != and otherwise assumes == */ + if (n & AUDIT_NEGATE) + f->op = Audit_not_equal; + else if (!n) + f->op = Audit_equal; + else + f->op = audit_to_op(n); + + entry->rule.vers_ops = (n & AUDIT_OPERATORS) ? 2 : 1; - f->op = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS); f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS); f->val = rule->values[i]; err = -EINVAL; + if (f->op == Audit_bad) + goto exit_free; + switch(f->type) { default: goto exit_free; @@ -454,11 +499,8 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) case AUDIT_EXIT: case AUDIT_SUCCESS: /* bit ops are only useful on syscall args */ - if (f->op == AUDIT_BIT_MASK || - f->op == AUDIT_BIT_TEST) { - err = -EINVAL; + if (f->op == Audit_bitmask || f->op == Audit_bittest) goto exit_free; - } break; case AUDIT_ARG0: case AUDIT_ARG1: @@ -467,11 +509,8 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) break; /* arch is only allowed to be = or != */ case AUDIT_ARCH: - if ((f->op != AUDIT_NOT_EQUAL) && (f->op != AUDIT_EQUAL) - && (f->op != AUDIT_NEGATE) && (f->op)) { - err = -EINVAL; + if (f->op != Audit_not_equal && f->op != Audit_equal) goto exit_free; - } entry->rule.arch_f = f; break; case AUDIT_PERM: @@ -488,33 +527,10 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) goto exit_free; break; } - - entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 
2 : 1; - - /* Support for legacy operators where - * AUDIT_NEGATE bit signifies != and otherwise assumes == */ - if (f->op & AUDIT_NEGATE) - f->op = AUDIT_NOT_EQUAL; - else if (!f->op) - f->op = AUDIT_EQUAL; - else if (f->op == AUDIT_OPERATORS) { - err = -EINVAL; - goto exit_free; - } } - ino_f = entry->rule.inode_f; - if (ino_f) { - switch(ino_f->op) { - case AUDIT_NOT_EQUAL: - entry->rule.inode_f = NULL; - case AUDIT_EQUAL: - break; - default: - err = -EINVAL; - goto exit_free; - } - } + if (entry->rule.inode_f && entry->rule.inode_f->op == Audit_not_equal) + entry->rule.inode_f = NULL; exit_nofree: return entry; @@ -530,7 +546,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, { int err = 0; struct audit_entry *entry; - struct audit_field *ino_f; void *bufp; size_t remain = datasz - sizeof(struct audit_rule_data); int i; @@ -546,11 +561,11 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, struct audit_field *f = &entry->rule.fields[i]; err = -EINVAL; - if (!(data->fieldflags[i] & AUDIT_OPERATORS) || - data->fieldflags[i] & ~AUDIT_OPERATORS) + + f->op = audit_to_op(data->fieldflags[i]); + if (f->op == Audit_bad) goto exit_free; - f->op = data->fieldflags[i] & AUDIT_OPERATORS; f->type = data->fields[i]; f->val = data->values[i]; f->lsm_str = NULL; @@ -662,18 +677,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, } } - ino_f = entry->rule.inode_f; - if (ino_f) { - switch(ino_f->op) { - case AUDIT_NOT_EQUAL: - entry->rule.inode_f = NULL; - case AUDIT_EQUAL: - break; - default: - err = -EINVAL; - goto exit_free; - } - } + if (entry->rule.inode_f && entry->rule.inode_f->op == Audit_not_equal) + entry->rule.inode_f = NULL; exit_nofree: return entry; @@ -713,10 +718,10 @@ static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule) rule->fields[i] = krule->fields[i].type; if (krule->vers_ops == 1) { - if (krule->fields[i].op & AUDIT_NOT_EQUAL) + if (krule->fields[i].op == Audit_not_equal) rule->fields[i] |= AUDIT_NEGATE; } else { - rule->fields[i] |= krule->fields[i].op; + rule->fields[i] |= audit_ops[krule->fields[i].op]; } } for (i = 0; i < AUDIT_BITMASK_SIZE; i++) rule->mask[i] = krule->mask[i]; @@ -744,7 +749,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) struct audit_field *f = &krule->fields[i]; data->fields[i] = f->type; - data->fieldflags[i] = f->op; + data->fieldflags[i] = audit_ops[f->op]; switch(f->type) { case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: @@ -919,6 +924,7 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old, new->action = old->action; for (i = 0; i < AUDIT_BITMASK_SIZE; i++) new->mask[i] = old->mask[i]; + new->prio = old->prio; new->buflen = old->buflen; new->inode_f = old->inode_f; new->watch = NULL; @@ -987,9 +993,8 @@ static void audit_update_watch(struct audit_parent *parent, /* If the update involves invalidating rules, do the inode-based * filtering now, so we don't omit records. 
*/ - if (invalidating && current->audit_context && - audit_filter_inodes(current, current->audit_context) == AUDIT_RECORD_CONTEXT) - audit_set_auditable(current->audit_context); + if (invalidating && current->audit_context) + audit_filter_inodes(current, current->audit_context); nwatch = audit_dupe_watch(owatch); if (IS_ERR(nwatch)) { @@ -1007,12 +1012,15 @@ static void audit_update_watch(struct audit_parent *parent, list_del_rcu(&oentry->list); nentry = audit_dupe_rule(&oentry->rule, nwatch); - if (IS_ERR(nentry)) + if (IS_ERR(nentry)) { + list_del(&oentry->rule.list); audit_panic("error updating watch, removing"); - else { + } else { int h = audit_hash_ino((u32)ino); list_add(&nentry->rule.rlist, &nwatch->rules); list_add_rcu(&nentry->list, &audit_inode_hash[h]); + list_replace(&oentry->rule.list, + &nentry->rule.list); } call_rcu(&oentry->rcu, audit_free_rule_rcu); @@ -1077,6 +1085,7 @@ static void audit_remove_parent_watches(struct audit_parent *parent) audit_log_end(ab); } list_del(&r->rlist); + list_del(&r->list); list_del_rcu(&e->list); call_rcu(&e->rcu, audit_free_rule_rcu); } @@ -1102,12 +1111,16 @@ static void audit_inotify_unregister(struct list_head *in_list) /* Find an existing audit rule. * Caller must hold audit_filter_mutex to prevent stale rule data. */ static struct audit_entry *audit_find_rule(struct audit_entry *entry, - struct list_head *list) + struct list_head **p) { struct audit_entry *e, *found = NULL; + struct list_head *list; int h; - if (entry->rule.watch) { + if (entry->rule.inode_f) { + h = audit_hash_ino(entry->rule.inode_f->val); + *p = list = &audit_inode_hash[h]; + } else if (entry->rule.watch) { /* we don't know the inode number, so must walk entire hash */ for (h = 0; h < AUDIT_INODE_BUCKETS; h++) { list = &audit_inode_hash[h]; @@ -1118,6 +1131,8 @@ static struct audit_entry *audit_find_rule(struct audit_entry *entry, } } goto out; + } else { + *p = list = &audit_filter_list[entry->rule.listnr]; } list_for_each_entry(e, list, list) @@ -1258,15 +1273,17 @@ static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp, return ret; } +static u64 prio_low = ~0ULL/2; +static u64 prio_high = ~0ULL/2 - 1; + /* Add rule to given filterlist if not a duplicate. 
*/ -static inline int audit_add_rule(struct audit_entry *entry, - struct list_head *list) +static inline int audit_add_rule(struct audit_entry *entry) { struct audit_entry *e; - struct audit_field *inode_f = entry->rule.inode_f; struct audit_watch *watch = entry->rule.watch; struct audit_tree *tree = entry->rule.tree; struct nameidata *ndp = NULL, *ndw = NULL; + struct list_head *list; int h, err; #ifdef CONFIG_AUDITSYSCALL int dont_count = 0; @@ -1277,13 +1294,8 @@ static inline int audit_add_rule(struct audit_entry *entry, dont_count = 1; #endif - if (inode_f) { - h = audit_hash_ino(inode_f->val); - list = &audit_inode_hash[h]; - } - mutex_lock(&audit_filter_mutex); - e = audit_find_rule(entry, list); + e = audit_find_rule(entry, &list); mutex_unlock(&audit_filter_mutex); if (e) { err = -EEXIST; @@ -1319,10 +1331,22 @@ static inline int audit_add_rule(struct audit_entry *entry, } } + entry->rule.prio = ~0ULL; + if (entry->rule.listnr == AUDIT_FILTER_EXIT) { + if (entry->rule.flags & AUDIT_FILTER_PREPEND) + entry->rule.prio = ++prio_high; + else + entry->rule.prio = --prio_low; + } + if (entry->rule.flags & AUDIT_FILTER_PREPEND) { + list_add(&entry->rule.list, + &audit_rules_list[entry->rule.listnr]); list_add_rcu(&entry->list, list); entry->rule.flags &= ~AUDIT_FILTER_PREPEND; } else { + list_add_tail(&entry->rule.list, + &audit_rules_list[entry->rule.listnr]); list_add_tail_rcu(&entry->list, list); } #ifdef CONFIG_AUDITSYSCALL @@ -1345,15 +1369,14 @@ error: } /* Remove an existing rule from filterlist. */ -static inline int audit_del_rule(struct audit_entry *entry, - struct list_head *list) +static inline int audit_del_rule(struct audit_entry *entry) { struct audit_entry *e; - struct audit_field *inode_f = entry->rule.inode_f; struct audit_watch *watch, *tmp_watch = entry->rule.watch; struct audit_tree *tree = entry->rule.tree; + struct list_head *list; LIST_HEAD(inotify_list); - int h, ret = 0; + int ret = 0; #ifdef CONFIG_AUDITSYSCALL int dont_count = 0; @@ -1363,13 +1386,8 @@ static inline int audit_del_rule(struct audit_entry *entry, dont_count = 1; #endif - if (inode_f) { - h = audit_hash_ino(inode_f->val); - list = &audit_inode_hash[h]; - } - mutex_lock(&audit_filter_mutex); - e = audit_find_rule(entry, list); + e = audit_find_rule(entry, &list); if (!e) { mutex_unlock(&audit_filter_mutex); ret = -ENOENT; @@ -1404,6 +1422,7 @@ static inline int audit_del_rule(struct audit_entry *entry, audit_remove_tree_rule(&e->rule); list_del_rcu(&e->list); + list_del(&e->rule.list); call_rcu(&e->rcu, audit_free_rule_rcu); #ifdef CONFIG_AUDITSYSCALL @@ -1432,30 +1451,16 @@ out: static void audit_list(int pid, int seq, struct sk_buff_head *q) { struct sk_buff *skb; - struct audit_entry *entry; + struct audit_krule *r; int i; /* This is a blocking read, so use audit_filter_mutex instead of rcu * iterator to sync with list writers. 
*/ for (i=0; i<AUDIT_NR_FILTERS; i++) { - list_for_each_entry(entry, &audit_filter_list[i], list) { - struct audit_rule *rule; - - rule = audit_krule_to_rule(&entry->rule); - if (unlikely(!rule)) - break; - skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1, - rule, sizeof(*rule)); - if (skb) - skb_queue_tail(q, skb); - kfree(rule); - } - } - for (i = 0; i < AUDIT_INODE_BUCKETS; i++) { - list_for_each_entry(entry, &audit_inode_hash[i], list) { + list_for_each_entry(r, &audit_rules_list[i], list) { struct audit_rule *rule; - rule = audit_krule_to_rule(&entry->rule); + rule = audit_krule_to_rule(r); if (unlikely(!rule)) break; skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1, @@ -1474,30 +1479,16 @@ static void audit_list(int pid, int seq, struct sk_buff_head *q) static void audit_list_rules(int pid, int seq, struct sk_buff_head *q) { struct sk_buff *skb; - struct audit_entry *e; + struct audit_krule *r; int i; /* This is a blocking read, so use audit_filter_mutex instead of rcu * iterator to sync with list writers. */ for (i=0; i<AUDIT_NR_FILTERS; i++) { - list_for_each_entry(e, &audit_filter_list[i], list) { - struct audit_rule_data *data; - - data = audit_krule_to_data(&e->rule); - if (unlikely(!data)) - break; - skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1, - data, sizeof(*data) + data->buflen); - if (skb) - skb_queue_tail(q, skb); - kfree(data); - } - } - for (i=0; i< AUDIT_INODE_BUCKETS; i++) { - list_for_each_entry(e, &audit_inode_hash[i], list) { + list_for_each_entry(r, &audit_rules_list[i], list) { struct audit_rule_data *data; - data = audit_krule_to_data(&e->rule); + data = audit_krule_to_data(r); if (unlikely(!data)) break; skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1, @@ -1603,8 +1594,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, if (IS_ERR(entry)) return PTR_ERR(entry); - err = audit_add_rule(entry, - &audit_filter_list[entry->rule.listnr]); + err = audit_add_rule(entry); audit_log_rule_change(loginuid, sessionid, sid, "add", &entry->rule, !err); @@ -1620,8 +1610,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, if (IS_ERR(entry)) return PTR_ERR(entry); - err = audit_del_rule(entry, - &audit_filter_list[entry->rule.listnr]); + err = audit_del_rule(entry); audit_log_rule_change(loginuid, sessionid, sid, "remove", &entry->rule, !err); @@ -1634,28 +1623,29 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, return err; } -int audit_comparator(const u32 left, const u32 op, const u32 right) +int audit_comparator(u32 left, u32 op, u32 right) { switch (op) { - case AUDIT_EQUAL: + case Audit_equal: return (left == right); - case AUDIT_NOT_EQUAL: + case Audit_not_equal: return (left != right); - case AUDIT_LESS_THAN: + case Audit_lt: return (left < right); - case AUDIT_LESS_THAN_OR_EQUAL: + case Audit_le: return (left <= right); - case AUDIT_GREATER_THAN: + case Audit_gt: return (left > right); - case AUDIT_GREATER_THAN_OR_EQUAL: + case Audit_ge: return (left >= right); - case AUDIT_BIT_MASK: + case Audit_bitmask: return (left & right); - case AUDIT_BIT_TEST: + case Audit_bittest: return ((left & right) == right); + default: + BUG(); + return 0; } - BUG(); - return 0; } /* Compare given dentry name with last component in given path, @@ -1778,6 +1768,43 @@ unlock_and_return: return result; } +static int update_lsm_rule(struct audit_krule *r) +{ + struct audit_entry *entry = container_of(r, struct audit_entry, rule); + struct audit_entry *nentry; + struct audit_watch *watch; + 
struct audit_tree *tree; + int err = 0; + + if (!security_audit_rule_known(r)) + return 0; + + watch = r->watch; + tree = r->tree; + nentry = audit_dupe_rule(r, watch); + if (IS_ERR(nentry)) { + /* save the first error encountered for the + * return value */ + err = PTR_ERR(nentry); + audit_panic("error updating LSM filters"); + if (watch) + list_del(&r->rlist); + list_del_rcu(&entry->list); + list_del(&r->list); + } else { + if (watch) { + list_add(&nentry->rule.rlist, &watch->rules); + list_del(&r->rlist); + } else if (tree) + list_replace_init(&r->rlist, &nentry->rule.rlist); + list_replace_rcu(&entry->list, &nentry->list); + list_replace(&r->list, &nentry->rule.list); + } + call_rcu(&entry->rcu, audit_free_rule_rcu); + + return err; +} + /* This function will re-initialize the lsm_rule field of all applicable rules. * It will traverse the filter lists serarching for rules that contain LSM * specific filter fields. When such a rule is found, it is copied, the @@ -1785,45 +1812,19 @@ unlock_and_return: * updated rule. */ int audit_update_lsm_rules(void) { - struct audit_entry *entry, *n, *nentry; - struct audit_watch *watch; - struct audit_tree *tree; + struct audit_krule *r, *n; int i, err = 0; /* audit_filter_mutex synchronizes the writers */ mutex_lock(&audit_filter_mutex); for (i = 0; i < AUDIT_NR_FILTERS; i++) { - list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) { - if (!security_audit_rule_known(&entry->rule)) - continue; - - watch = entry->rule.watch; - tree = entry->rule.tree; - nentry = audit_dupe_rule(&entry->rule, watch); - if (IS_ERR(nentry)) { - /* save the first error encountered for the - * return value */ - if (!err) - err = PTR_ERR(nentry); - audit_panic("error updating LSM filters"); - if (watch) - list_del(&entry->rule.rlist); - list_del_rcu(&entry->list); - } else { - if (watch) { - list_add(&nentry->rule.rlist, - &watch->rules); - list_del(&entry->rule.rlist); - } else if (tree) - list_replace_init(&entry->rule.rlist, - &nentry->rule.rlist); - list_replace_rcu(&entry->list, &nentry->list); - } - call_rcu(&entry->rcu, audit_free_rule_rcu); + list_for_each_entry_safe(r, n, &audit_rules_list[i], list) { + int res = update_lsm_rule(r); + if (!err) + err = res; } } - mutex_unlock(&audit_filter_mutex); return err; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 4819f371..8cbddff 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -124,43 +124,6 @@ struct audit_aux_data { /* Number of target pids per aux struct. 
*/ #define AUDIT_AUX_PIDS 16 -struct audit_aux_data_mq_open { - struct audit_aux_data d; - int oflag; - mode_t mode; - struct mq_attr attr; -}; - -struct audit_aux_data_mq_sendrecv { - struct audit_aux_data d; - mqd_t mqdes; - size_t msg_len; - unsigned int msg_prio; - struct timespec abs_timeout; -}; - -struct audit_aux_data_mq_notify { - struct audit_aux_data d; - mqd_t mqdes; - struct sigevent notification; -}; - -struct audit_aux_data_mq_getsetattr { - struct audit_aux_data d; - mqd_t mqdes; - struct mq_attr mqstat; -}; - -struct audit_aux_data_ipcctl { - struct audit_aux_data d; - struct ipc_perm p; - unsigned long qbytes; - uid_t uid; - gid_t gid; - mode_t mode; - u32 osid; -}; - struct audit_aux_data_execve { struct audit_aux_data d; int argc; @@ -168,23 +131,6 @@ struct audit_aux_data_execve { struct mm_struct *mm; }; -struct audit_aux_data_socketcall { - struct audit_aux_data d; - int nargs; - unsigned long args[0]; -}; - -struct audit_aux_data_sockaddr { - struct audit_aux_data d; - int len; - char a[0]; -}; - -struct audit_aux_data_fd_pair { - struct audit_aux_data d; - int fd[2]; -}; - struct audit_aux_data_pids { struct audit_aux_data d; pid_t target_pid[AUDIT_AUX_PIDS]; @@ -219,14 +165,14 @@ struct audit_tree_refs { struct audit_context { int dummy; /* must be the first element */ int in_syscall; /* 1 if task is in a syscall */ - enum audit_state state; + enum audit_state state, current_state; unsigned int serial; /* serial number for record */ struct timespec ctime; /* time of syscall entry */ int major; /* syscall number */ unsigned long argv[4]; /* syscall arguments */ int return_valid; /* return code is valid */ long return_code;/* syscall return code */ - int auditable; /* 1 if record should be written */ + u64 prio; int name_count; struct audit_names names[AUDIT_NAMES]; char * filterkey; /* key for rule that triggered record */ @@ -234,7 +180,8 @@ struct audit_context { struct audit_context *previous; /* For nested syscalls */ struct audit_aux_data *aux; struct audit_aux_data *aux_pids; - + struct sockaddr_storage *sockaddr; + size_t sockaddr_len; /* Save things to print about task_struct */ pid_t pid, ppid; uid_t uid, euid, suid, fsuid; @@ -252,6 +199,49 @@ struct audit_context { struct audit_tree_refs *trees, *first_trees; int tree_count; + int type; + union { + struct { + int nargs; + long args[6]; + } socketcall; + struct { + uid_t uid; + gid_t gid; + mode_t mode; + u32 osid; + int has_perm; + uid_t perm_uid; + gid_t perm_gid; + mode_t perm_mode; + unsigned long qbytes; + } ipc; + struct { + mqd_t mqdes; + struct mq_attr mqstat; + } mq_getsetattr; + struct { + mqd_t mqdes; + int sigev_signo; + } mq_notify; + struct { + mqd_t mqdes; + size_t msg_len; + unsigned int msg_prio; + struct timespec abs_timeout; + } mq_sendrecv; + struct { + int oflag; + mode_t mode; + struct mq_attr attr; + } mq_open; + struct { + pid_t pid; + struct audit_cap_data cap; + } capset; + }; + int fds[2]; + #if AUDIT_DEBUG int put_count; int ino_count; @@ -608,19 +598,12 @@ static int audit_filter_rules(struct task_struct *tsk, } } /* Find ipc objects that match */ - if (ctx) { - struct audit_aux_data *aux; - for (aux = ctx->aux; aux; - aux = aux->next) { - if (aux->type == AUDIT_IPC) { - struct audit_aux_data_ipcctl *axi = (void *)aux; - if (security_audit_rule_match(axi->osid, f->type, f->op, f->lsm_rule, ctx)) { - ++result; - break; - } - } - } - } + if (!ctx || ctx->type != AUDIT_IPC) + break; + if (security_audit_rule_match(ctx->ipc.osid, + f->type, f->op, + f->lsm_rule, ctx)) + ++result; 
} break; case AUDIT_ARG0: @@ -647,8 +630,16 @@ static int audit_filter_rules(struct task_struct *tsk, return 0; } } - if (rule->filterkey && ctx) - ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC); + + if (ctx) { + if (rule->prio <= ctx->prio) + return 0; + if (rule->filterkey) { + kfree(ctx->filterkey); + ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC); + } + ctx->prio = rule->prio; + } switch (rule->action) { case AUDIT_NEVER: *state = AUDIT_DISABLED; break; case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; @@ -661,7 +652,7 @@ static int audit_filter_rules(struct task_struct *tsk, * completely disabled for this task. Since we only have the task * structure at this point, we can only check uid and gid. */ -static enum audit_state audit_filter_task(struct task_struct *tsk) +static enum audit_state audit_filter_task(struct task_struct *tsk, char **key) { struct audit_entry *e; enum audit_state state; @@ -669,6 +660,8 @@ static enum audit_state audit_filter_task(struct task_struct *tsk) rcu_read_lock(); list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) { if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) { + if (state == AUDIT_RECORD_CONTEXT) + *key = kstrdup(e->rule.filterkey, GFP_ATOMIC); rcu_read_unlock(); return state; } @@ -702,6 +695,7 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, audit_filter_rules(tsk, &e->rule, ctx, NULL, &state)) { rcu_read_unlock(); + ctx->current_state = state; return state; } } @@ -715,15 +709,14 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, * buckets applicable to the inode numbers in audit_names[]. * Regarding audit_state, same rules apply as for audit_filter_syscall(). */ -enum audit_state audit_filter_inodes(struct task_struct *tsk, - struct audit_context *ctx) +void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx) { int i; struct audit_entry *e; enum audit_state state; if (audit_pid && tsk->tgid == audit_pid) - return AUDIT_DISABLED; + return; rcu_read_lock(); for (i = 0; i < ctx->name_count; i++) { @@ -740,17 +733,20 @@ enum audit_state audit_filter_inodes(struct task_struct *tsk, if ((e->rule.mask[word] & bit) == bit && audit_filter_rules(tsk, &e->rule, ctx, n, &state)) { rcu_read_unlock(); - return state; + ctx->current_state = state; + return; } } } rcu_read_unlock(); - return AUDIT_BUILD_CONTEXT; } -void audit_set_auditable(struct audit_context *ctx) +static void audit_set_auditable(struct audit_context *ctx) { - ctx->auditable = 1; + if (!ctx->prio) { + ctx->prio = 1; + ctx->current_state = AUDIT_RECORD_CONTEXT; + } } static inline struct audit_context *audit_get_context(struct task_struct *tsk, @@ -781,23 +777,11 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, else context->return_code = return_code; - if (context->in_syscall && !context->dummy && !context->auditable) { - enum audit_state state; - - state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]); - if (state == AUDIT_RECORD_CONTEXT) { - context->auditable = 1; - goto get_context; - } - - state = audit_filter_inodes(tsk, context); - if (state == AUDIT_RECORD_CONTEXT) - context->auditable = 1; - + if (context->in_syscall && !context->dummy) { + audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]); + audit_filter_inodes(tsk, context); } -get_context: - tsk->audit_context = NULL; return context; } @@ -807,8 +791,7 @@ static inline void audit_free_names(struct audit_context *context) int i; #if 
AUDIT_DEBUG == 2 - if (context->auditable - ||context->put_count + context->ino_count != context->name_count) { + if (context->put_count + context->ino_count != context->name_count) { printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d" " name_count=%d put_count=%d" " ino_count=%d [NOT freeing]\n", @@ -859,6 +842,7 @@ static inline void audit_zero_context(struct audit_context *context, { memset(context, 0, sizeof(*context)); context->state = state; + context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; } static inline struct audit_context *audit_alloc_context(enum audit_state state) @@ -884,18 +868,21 @@ int audit_alloc(struct task_struct *tsk) { struct audit_context *context; enum audit_state state; + char *key = NULL; if (likely(!audit_ever_enabled)) return 0; /* Return if not auditing. */ - state = audit_filter_task(tsk); + state = audit_filter_task(tsk, &key); if (likely(state == AUDIT_DISABLED)) return 0; if (!(context = audit_alloc_context(state))) { + kfree(key); audit_log_lost("out of memory in audit_alloc"); return -ENOMEM; } + context->filterkey = key; tsk->audit_context = context; set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT); @@ -921,6 +908,7 @@ static inline void audit_free_context(struct audit_context *context) free_tree_refs(context); audit_free_aux(context); kfree(context->filterkey); + kfree(context->sockaddr); kfree(context); context = previous; } while (context); @@ -1230,6 +1218,97 @@ static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) audit_log_format(ab, " cap_fe=%d cap_fver=%x", name->fcap.fE, name->fcap_ver); } +static void show_special(struct audit_context *context, int *call_panic) +{ + struct audit_buffer *ab; + int i; + + ab = audit_log_start(context, GFP_KERNEL, context->type); + if (!ab) + return; + + switch (context->type) { + case AUDIT_SOCKETCALL: { + int nargs = context->socketcall.nargs; + audit_log_format(ab, "nargs=%d", nargs); + for (i = 0; i < nargs; i++) + audit_log_format(ab, " a%d=%lx", i, + context->socketcall.args[i]); + break; } + case AUDIT_IPC: { + u32 osid = context->ipc.osid; + + audit_log_format(ab, "ouid=%u ogid=%u mode=%#o", + context->ipc.uid, context->ipc.gid, context->ipc.mode); + if (osid) { + char *ctx = NULL; + u32 len; + if (security_secid_to_secctx(osid, &ctx, &len)) { + audit_log_format(ab, " osid=%u", osid); + *call_panic = 1; + } else { + audit_log_format(ab, " obj=%s", ctx); + security_release_secctx(ctx, len); + } + } + if (context->ipc.has_perm) { + audit_log_end(ab); + ab = audit_log_start(context, GFP_KERNEL, + AUDIT_IPC_SET_PERM); + audit_log_format(ab, + "qbytes=%lx ouid=%u ogid=%u mode=%#o", + context->ipc.qbytes, + context->ipc.perm_uid, + context->ipc.perm_gid, + context->ipc.perm_mode); + if (!ab) + return; + } + break; } + case AUDIT_MQ_OPEN: { + audit_log_format(ab, + "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld " + "mq_msgsize=%ld mq_curmsgs=%ld", + context->mq_open.oflag, context->mq_open.mode, + context->mq_open.attr.mq_flags, + context->mq_open.attr.mq_maxmsg, + context->mq_open.attr.mq_msgsize, + context->mq_open.attr.mq_curmsgs); + break; } + case AUDIT_MQ_SENDRECV: { + audit_log_format(ab, + "mqdes=%d msg_len=%zd msg_prio=%u " + "abs_timeout_sec=%ld abs_timeout_nsec=%ld", + context->mq_sendrecv.mqdes, + context->mq_sendrecv.msg_len, + context->mq_sendrecv.msg_prio, + context->mq_sendrecv.abs_timeout.tv_sec, + context->mq_sendrecv.abs_timeout.tv_nsec); + break; } + case AUDIT_MQ_NOTIFY: { + audit_log_format(ab, "mqdes=%d sigev_signo=%d", + context->mq_notify.mqdes, + 
context->mq_notify.sigev_signo); + break; } + case AUDIT_MQ_GETSETATTR: { + struct mq_attr *attr = &context->mq_getsetattr.mqstat; + audit_log_format(ab, + "mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld " + "mq_curmsgs=%ld ", + context->mq_getsetattr.mqdes, + attr->mq_flags, attr->mq_maxmsg, + attr->mq_msgsize, attr->mq_curmsgs); + break; } + case AUDIT_CAPSET: { + audit_log_format(ab, "pid=%d", context->capset.pid); + audit_log_cap(ab, "cap_pi", &context->capset.cap.inheritable); + audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted); + audit_log_cap(ab, "cap_pe", &context->capset.cap.effective); + break; } + } + audit_log_end(ab); +} + static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) { const struct cred *cred; @@ -1307,94 +1386,12 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts continue; /* audit_panic has been called */ switch (aux->type) { - case AUDIT_MQ_OPEN: { - struct audit_aux_data_mq_open *axi = (void *)aux; - audit_log_format(ab, - "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld " - "mq_msgsize=%ld mq_curmsgs=%ld", - axi->oflag, axi->mode, axi->attr.mq_flags, - axi->attr.mq_maxmsg, axi->attr.mq_msgsize, - axi->attr.mq_curmsgs); - break; } - - case AUDIT_MQ_SENDRECV: { - struct audit_aux_data_mq_sendrecv *axi = (void *)aux; - audit_log_format(ab, - "mqdes=%d msg_len=%zd msg_prio=%u " - "abs_timeout_sec=%ld abs_timeout_nsec=%ld", - axi->mqdes, axi->msg_len, axi->msg_prio, - axi->abs_timeout.tv_sec, axi->abs_timeout.tv_nsec); - break; } - - case AUDIT_MQ_NOTIFY: { - struct audit_aux_data_mq_notify *axi = (void *)aux; - audit_log_format(ab, - "mqdes=%d sigev_signo=%d", - axi->mqdes, - axi->notification.sigev_signo); - break; } - - case AUDIT_MQ_GETSETATTR: { - struct audit_aux_data_mq_getsetattr *axi = (void *)aux; - audit_log_format(ab, - "mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld " - "mq_curmsgs=%ld ", - axi->mqdes, - axi->mqstat.mq_flags, axi->mqstat.mq_maxmsg, - axi->mqstat.mq_msgsize, axi->mqstat.mq_curmsgs); - break; } - - case AUDIT_IPC: { - struct audit_aux_data_ipcctl *axi = (void *)aux; - audit_log_format(ab, - "ouid=%u ogid=%u mode=%#o", - axi->uid, axi->gid, axi->mode); - if (axi->osid != 0) { - char *ctx = NULL; - u32 len; - if (security_secid_to_secctx( - axi->osid, &ctx, &len)) { - audit_log_format(ab, " osid=%u", - axi->osid); - call_panic = 1; - } else { - audit_log_format(ab, " obj=%s", ctx); - security_release_secctx(ctx, len); - } - } - break; } - - case AUDIT_IPC_SET_PERM: { - struct audit_aux_data_ipcctl *axi = (void *)aux; - audit_log_format(ab, - "qbytes=%lx ouid=%u ogid=%u mode=%#o", - axi->qbytes, axi->uid, axi->gid, axi->mode); - break; } case AUDIT_EXECVE: { struct audit_aux_data_execve *axi = (void *)aux; audit_log_execve_info(context, &ab, axi); break; } - case AUDIT_SOCKETCALL: { - struct audit_aux_data_socketcall *axs = (void *)aux; - audit_log_format(ab, "nargs=%d", axs->nargs); - for (i=0; i<axs->nargs; i++) - audit_log_format(ab, " a%d=%lx", i, axs->args[i]); - break; } - - case AUDIT_SOCKADDR: { - struct audit_aux_data_sockaddr *axs = (void *)aux; - - audit_log_format(ab, "saddr="); - audit_log_n_hex(ab, axs->a, axs->len); - break; } - - case AUDIT_FD_PAIR: { - struct audit_aux_data_fd_pair *axs = (void *)aux; - audit_log_format(ab, "fd0=%d fd1=%d", axs->fd[0], axs->fd[1]); - break; } - case AUDIT_BPRM_FCAPS: { struct audit_aux_data_bprm_fcaps *axs = (void *)aux; audit_log_format(ab, "fver=%x", axs->fcap_ver); @@ -1409,18 +1406,32 @@ static void 
audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_cap(ab, "new_pe", &axs->new_pcap.effective); break; } - case AUDIT_CAPSET: { - struct audit_aux_data_capset *axs = (void *)aux; - audit_log_format(ab, "pid=%d", axs->pid); - audit_log_cap(ab, "cap_pi", &axs->cap.inheritable); - audit_log_cap(ab, "cap_pp", &axs->cap.permitted); - audit_log_cap(ab, "cap_pe", &axs->cap.effective); - break; } - } audit_log_end(ab); } + if (context->type) + show_special(context, &call_panic); + + if (context->fds[0] >= 0) { + ab = audit_log_start(context, GFP_KERNEL, AUDIT_FD_PAIR); + if (ab) { + audit_log_format(ab, "fd0=%d fd1=%d", + context->fds[0], context->fds[1]); + audit_log_end(ab); + } + } + + if (context->sockaddr_len) { + ab = audit_log_start(context, GFP_KERNEL, AUDIT_SOCKADDR); + if (ab) { + audit_log_format(ab, "saddr="); + audit_log_n_hex(ab, (void *)context->sockaddr, + context->sockaddr_len); + audit_log_end(ab); + } + } + for (aux = context->aux_pids; aux; aux = aux->next) { struct audit_aux_data_pids *axs = (void *)aux; @@ -1536,7 +1547,7 @@ void audit_free(struct task_struct *tsk) * We use GFP_ATOMIC here because we might be doing this * in the context of the idle thread */ /* that can happen only if we are called from do_exit() */ - if (context->in_syscall && context->auditable) + if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) audit_log_exit(context, tsk); audit_free_context(context); @@ -1620,15 +1631,17 @@ void audit_syscall_entry(int arch, int major, state = context->state; context->dummy = !audit_n_rules; - if (!context->dummy && (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT)) + if (!context->dummy && state == AUDIT_BUILD_CONTEXT) { + context->prio = 0; state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]); + } if (likely(state == AUDIT_DISABLED)) return; context->serial = 0; context->ctime = CURRENT_TIME; context->in_syscall = 1; - context->auditable = !!(state == AUDIT_RECORD_CONTEXT); + context->current_state = state; context->ppid = 0; } @@ -1636,17 +1649,20 @@ void audit_finish_fork(struct task_struct *child) { struct audit_context *ctx = current->audit_context; struct audit_context *p = child->audit_context; - if (!p || !ctx || !ctx->auditable) + if (!p || !ctx) + return; + if (!ctx->in_syscall || ctx->current_state != AUDIT_RECORD_CONTEXT) return; p->arch = ctx->arch; p->major = ctx->major; memcpy(p->argv, ctx->argv, sizeof(ctx->argv)); p->ctime = ctx->ctime; p->dummy = ctx->dummy; - p->auditable = ctx->auditable; p->in_syscall = ctx->in_syscall; p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL); p->ppid = current->pid; + p->prio = ctx->prio; + p->current_state = ctx->current_state; } /** @@ -1670,11 +1686,11 @@ void audit_syscall_exit(int valid, long return_code) if (likely(!context)) return; - if (context->in_syscall && context->auditable) + if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) audit_log_exit(context, tsk); context->in_syscall = 0; - context->auditable = 0; + context->prio = context->state == AUDIT_RECORD_CONTEXT ? 
~0ULL : 0; if (context->previous) { struct audit_context *new_context = context->previous; @@ -1689,8 +1705,13 @@ void audit_syscall_exit(int valid, long return_code) context->aux_pids = NULL; context->target_pid = 0; context->target_sid = 0; - kfree(context->filterkey); - context->filterkey = NULL; + context->sockaddr_len = 0; + context->type = 0; + context->fds[0] = -1; + if (context->state != AUDIT_RECORD_CONTEXT) { + kfree(context->filterkey); + context->filterkey = NULL; + } tsk->audit_context = context; } } @@ -2081,7 +2102,10 @@ int auditsc_get_stamp(struct audit_context *ctx, t->tv_sec = ctx->ctime.tv_sec; t->tv_nsec = ctx->ctime.tv_nsec; *serial = ctx->serial; - ctx->auditable = 1; + if (!ctx->prio) { + ctx->prio = 1; + ctx->current_state = AUDIT_RECORD_CONTEXT; + } return 1; } @@ -2127,132 +2151,46 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid) * @mode: mode bits * @u_attr: queue attributes * - * Returns 0 for success or NULL context or < 0 on error. */ -int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr) +void __audit_mq_open(int oflag, mode_t mode, struct mq_attr *attr) { - struct audit_aux_data_mq_open *ax; struct audit_context *context = current->audit_context; - if (!audit_enabled) - return 0; - - if (likely(!context)) - return 0; - - ax = kmalloc(sizeof(*ax), GFP_ATOMIC); - if (!ax) - return -ENOMEM; - - if (u_attr != NULL) { - if (copy_from_user(&ax->attr, u_attr, sizeof(ax->attr))) { - kfree(ax); - return -EFAULT; - } - } else - memset(&ax->attr, 0, sizeof(ax->attr)); + if (attr) + memcpy(&context->mq_open.attr, attr, sizeof(struct mq_attr)); + else + memset(&context->mq_open.attr, 0, sizeof(struct mq_attr)); - ax->oflag = oflag; - ax->mode = mode; + context->mq_open.oflag = oflag; + context->mq_open.mode = mode; - ax->d.type = AUDIT_MQ_OPEN; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; + context->type = AUDIT_MQ_OPEN; } /** - * __audit_mq_timedsend - record audit data for a POSIX MQ timed send + * __audit_mq_sendrecv - record audit data for a POSIX MQ timed send/receive * @mqdes: MQ descriptor * @msg_len: Message length * @msg_prio: Message priority - * @u_abs_timeout: Message timeout in absolute time + * @abs_timeout: Message timeout in absolute time * - * Returns 0 for success or NULL context or < 0 on error. 
*/ -int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, - const struct timespec __user *u_abs_timeout) +void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, + const struct timespec *abs_timeout) { - struct audit_aux_data_mq_sendrecv *ax; struct audit_context *context = current->audit_context; + struct timespec *p = &context->mq_sendrecv.abs_timeout; - if (!audit_enabled) - return 0; - - if (likely(!context)) - return 0; - - ax = kmalloc(sizeof(*ax), GFP_ATOMIC); - if (!ax) - return -ENOMEM; - - if (u_abs_timeout != NULL) { - if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) { - kfree(ax); - return -EFAULT; - } - } else - memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout)); - - ax->mqdes = mqdes; - ax->msg_len = msg_len; - ax->msg_prio = msg_prio; - - ax->d.type = AUDIT_MQ_SENDRECV; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; -} - -/** - * __audit_mq_timedreceive - record audit data for a POSIX MQ timed receive - * @mqdes: MQ descriptor - * @msg_len: Message length - * @u_msg_prio: Message priority - * @u_abs_timeout: Message timeout in absolute time - * - * Returns 0 for success or NULL context or < 0 on error. - */ -int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len, - unsigned int __user *u_msg_prio, - const struct timespec __user *u_abs_timeout) -{ - struct audit_aux_data_mq_sendrecv *ax; - struct audit_context *context = current->audit_context; - - if (!audit_enabled) - return 0; - - if (likely(!context)) - return 0; - - ax = kmalloc(sizeof(*ax), GFP_ATOMIC); - if (!ax) - return -ENOMEM; - - if (u_msg_prio != NULL) { - if (get_user(ax->msg_prio, u_msg_prio)) { - kfree(ax); - return -EFAULT; - } - } else - ax->msg_prio = 0; - - if (u_abs_timeout != NULL) { - if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) { - kfree(ax); - return -EFAULT; - } - } else - memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout)); + if (abs_timeout) + memcpy(p, abs_timeout, sizeof(struct timespec)); + else + memset(p, 0, sizeof(struct timespec)); - ax->mqdes = mqdes; - ax->msg_len = msg_len; + context->mq_sendrecv.mqdes = mqdes; + context->mq_sendrecv.msg_len = msg_len; + context->mq_sendrecv.msg_prio = msg_prio; - ax->d.type = AUDIT_MQ_SENDRECV; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; + context->type = AUDIT_MQ_SENDRECV; } /** @@ -2260,38 +2198,19 @@ int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len, * @mqdes: MQ descriptor * @u_notification: Notification event * - * Returns 0 for success or NULL context or < 0 on error. 
*/ -int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification) +void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification) { - struct audit_aux_data_mq_notify *ax; struct audit_context *context = current->audit_context; - if (!audit_enabled) - return 0; - - if (likely(!context)) - return 0; - - ax = kmalloc(sizeof(*ax), GFP_ATOMIC); - if (!ax) - return -ENOMEM; - - if (u_notification != NULL) { - if (copy_from_user(&ax->notification, u_notification, sizeof(ax->notification))) { - kfree(ax); - return -EFAULT; - } - } else - memset(&ax->notification, 0, sizeof(ax->notification)); - - ax->mqdes = mqdes; + if (notification) + context->mq_notify.sigev_signo = notification->sigev_signo; + else + context->mq_notify.sigev_signo = 0; - ax->d.type = AUDIT_MQ_NOTIFY; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; + context->mq_notify.mqdes = mqdes; + context->type = AUDIT_MQ_NOTIFY; } /** @@ -2299,55 +2218,29 @@ int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification) * @mqdes: MQ descriptor * @mqstat: MQ flags * - * Returns 0 for success or NULL context or < 0 on error. */ -int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) +void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) { - struct audit_aux_data_mq_getsetattr *ax; struct audit_context *context = current->audit_context; - - if (!audit_enabled) - return 0; - - if (likely(!context)) - return 0; - - ax = kmalloc(sizeof(*ax), GFP_ATOMIC); - if (!ax) - return -ENOMEM; - - ax->mqdes = mqdes; - ax->mqstat = *mqstat; - - ax->d.type = AUDIT_MQ_GETSETATTR; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; + context->mq_getsetattr.mqdes = mqdes; + context->mq_getsetattr.mqstat = *mqstat; + context->type = AUDIT_MQ_GETSETATTR; } /** * audit_ipc_obj - record audit data for ipc object * @ipcp: ipc permissions * - * Returns 0 for success or NULL context or < 0 on error. */ -int __audit_ipc_obj(struct kern_ipc_perm *ipcp) +void __audit_ipc_obj(struct kern_ipc_perm *ipcp) { - struct audit_aux_data_ipcctl *ax; struct audit_context *context = current->audit_context; - - ax = kmalloc(sizeof(*ax), GFP_ATOMIC); - if (!ax) - return -ENOMEM; - - ax->uid = ipcp->uid; - ax->gid = ipcp->gid; - ax->mode = ipcp->mode; - security_ipc_getsecid(ipcp, &ax->osid); - ax->d.type = AUDIT_IPC; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; + context->ipc.uid = ipcp->uid; + context->ipc.gid = ipcp->gid; + context->ipc.mode = ipcp->mode; + context->ipc.has_perm = 0; + security_ipc_getsecid(ipcp, &context->ipc.osid); + context->type = AUDIT_IPC; } /** @@ -2357,26 +2250,17 @@ int __audit_ipc_obj(struct kern_ipc_perm *ipcp) * @gid: msgq group id * @mode: msgq mode (permissions) * - * Returns 0 for success or NULL context or < 0 on error. + * Called only after audit_ipc_obj(). 
*/ -int __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) +void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) { - struct audit_aux_data_ipcctl *ax; struct audit_context *context = current->audit_context; - ax = kmalloc(sizeof(*ax), GFP_ATOMIC); - if (!ax) - return -ENOMEM; - - ax->qbytes = qbytes; - ax->uid = uid; - ax->gid = gid; - ax->mode = mode; - - ax->d.type = AUDIT_IPC_SET_PERM; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; + context->ipc.qbytes = qbytes; + context->ipc.perm_uid = uid; + context->ipc.perm_gid = gid; + context->ipc.perm_mode = mode; + context->ipc.has_perm = 1; } int audit_bprm(struct linux_binprm *bprm) @@ -2406,27 +2290,17 @@ int audit_bprm(struct linux_binprm *bprm) * @nargs: number of args * @args: args array * - * Returns 0 for success or NULL context or < 0 on error. */ -int audit_socketcall(int nargs, unsigned long *args) +void audit_socketcall(int nargs, unsigned long *args) { - struct audit_aux_data_socketcall *ax; struct audit_context *context = current->audit_context; if (likely(!context || context->dummy)) - return 0; - - ax = kmalloc(sizeof(*ax) + nargs * sizeof(unsigned long), GFP_KERNEL); - if (!ax) - return -ENOMEM; - - ax->nargs = nargs; - memcpy(ax->args, args, nargs * sizeof(unsigned long)); + return; - ax->d.type = AUDIT_SOCKETCALL; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; + context->type = AUDIT_SOCKETCALL; + context->socketcall.nargs = nargs; + memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long)); } /** @@ -2434,29 +2308,12 @@ int audit_socketcall(int nargs, unsigned long *args) * @fd1: the first file descriptor * @fd2: the second file descriptor * - * Returns 0 for success or NULL context or < 0 on error. 
*/ -int __audit_fd_pair(int fd1, int fd2) +void __audit_fd_pair(int fd1, int fd2) { struct audit_context *context = current->audit_context; - struct audit_aux_data_fd_pair *ax; - - if (likely(!context)) { - return 0; - } - - ax = kmalloc(sizeof(*ax), GFP_KERNEL); - if (!ax) { - return -ENOMEM; - } - - ax->fd[0] = fd1; - ax->fd[1] = fd2; - - ax->d.type = AUDIT_FD_PAIR; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; + context->fds[0] = fd1; + context->fds[1] = fd2; } /** @@ -2468,22 +2325,20 @@ int __audit_fd_pair(int fd1, int fd2) */ int audit_sockaddr(int len, void *a) { - struct audit_aux_data_sockaddr *ax; struct audit_context *context = current->audit_context; if (likely(!context || context->dummy)) return 0; - ax = kmalloc(sizeof(*ax) + len, GFP_KERNEL); - if (!ax) - return -ENOMEM; - - ax->len = len; - memcpy(ax->a, a, len); + if (!context->sockaddr) { + void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL); + if (!p) + return -ENOMEM; + context->sockaddr = p; + } - ax->d.type = AUDIT_SOCKADDR; - ax->d.next = context->aux; - context->aux = (void *)ax; + context->sockaddr_len = len; + memcpy(context->sockaddr, a, len); return 0; } @@ -2617,29 +2472,15 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, * Record the aguments userspace sent to sys_capset for later printing by the * audit system if applicable */ -int __audit_log_capset(pid_t pid, +void __audit_log_capset(pid_t pid, const struct cred *new, const struct cred *old) { - struct audit_aux_data_capset *ax; struct audit_context *context = current->audit_context; - - if (likely(!audit_enabled || !context || context->dummy)) - return 0; - - ax = kmalloc(sizeof(*ax), GFP_KERNEL); - if (!ax) - return -ENOMEM; - - ax->d.type = AUDIT_CAPSET; - ax->d.next = context->aux; - context->aux = (void *)ax; - - ax->pid = pid; - ax->cap.effective = new->cap_effective; - ax->cap.inheritable = new->cap_effective; - ax->cap.permitted = new->cap_permitted; - - return 0; + context->capset.pid = pid; + context->capset.cap.effective = new->cap_effective; + context->capset.cap.inheritable = new->cap_effective; + context->capset.cap.permitted = new->cap_permitted; + context->type = AUDIT_CAPSET; } /** diff --git a/kernel/capability.c b/kernel/capability.c index 36b4b4d..688926e 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -280,9 +280,7 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) if (ret < 0) goto error; - ret = audit_log_capset(pid, new, current_cred()); - if (ret < 0) - return ret; + audit_log_capset(pid, new, current_cred()); return commit_creds(new); @@ -308,7 +306,7 @@ int capable(int cap) BUG(); } - if (has_capability(current, cap)) { + if (security_capable(cap) == 0) { current->flags |= PF_SUPERPRIV; return 1; } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 48348dd..c298310 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -84,7 +84,7 @@ struct cgroupfs_root { /* Tracks how many cgroups are currently defined in hierarchy.*/ int number_of_cgroups; - /* A list running through the mounted hierarchies */ + /* A list running through the active hierarchies */ struct list_head root_list; /* Hierarchy-specific flags */ @@ -116,7 +116,6 @@ static int root_count; * be called. 
*/ static int need_forkexit_callback __read_mostly; -static int need_mm_owner_callback __read_mostly; /* convenient tests for these bits */ inline int cgroup_is_removed(const struct cgroup *cgrp) @@ -149,8 +148,8 @@ static int notify_on_release(const struct cgroup *cgrp) #define for_each_subsys(_root, _ss) \ list_for_each_entry(_ss, &_root->subsys_list, sibling) -/* for_each_root() allows you to iterate across the active hierarchies */ -#define for_each_root(_root) \ +/* for_each_active_root() allows you to iterate across the active hierarchies */ +#define for_each_active_root(_root) \ list_for_each_entry(_root, &roots, root_list) /* the list of cgroups eligible for automatic release. Protected by @@ -272,7 +271,7 @@ static void __put_css_set(struct css_set *cg, int taskexit) rcu_read_lock(); for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup *cgrp = cg->subsys[i]->cgroup; + struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup); if (atomic_dec_and_test(&cgrp->count) && notify_on_release(cgrp)) { if (taskexit) @@ -385,6 +384,25 @@ static int allocate_cg_links(int count, struct list_head *tmp) return 0; } +/** + * link_css_set - a helper function to link a css_set to a cgroup + * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links() + * @cg: the css_set to be linked + * @cgrp: the destination cgroup + */ +static void link_css_set(struct list_head *tmp_cg_links, + struct css_set *cg, struct cgroup *cgrp) +{ + struct cg_cgroup_link *link; + + BUG_ON(list_empty(tmp_cg_links)); + link = list_first_entry(tmp_cg_links, struct cg_cgroup_link, + cgrp_link_list); + link->cg = cg; + list_move(&link->cgrp_link_list, &cgrp->css_sets); + list_add(&link->cg_link_list, &cg->cg_links); +} + /* * find_css_set() takes an existing cgroup group and a * cgroup object, and returns a css_set object that's @@ -400,7 +418,6 @@ static struct css_set *find_css_set( int i; struct list_head tmp_cg_links; - struct cg_cgroup_link *link; struct hlist_head *hhead; @@ -445,26 +462,11 @@ static struct css_set *find_css_set( * only do it for the first subsystem in each * hierarchy */ - if (ss->root->subsys_list.next == &ss->sibling) { - BUG_ON(list_empty(&tmp_cg_links)); - link = list_entry(tmp_cg_links.next, - struct cg_cgroup_link, - cgrp_link_list); - list_del(&link->cgrp_link_list); - list_add(&link->cgrp_link_list, &cgrp->css_sets); - link->cg = res; - list_add(&link->cg_link_list, &res->cg_links); - } - } - if (list_empty(&rootnode.subsys_list)) { - link = list_entry(tmp_cg_links.next, - struct cg_cgroup_link, - cgrp_link_list); - list_del(&link->cgrp_link_list); - list_add(&link->cgrp_link_list, &dummytop->css_sets); - link->cg = res; - list_add(&link->cg_link_list, &res->cg_links); + if (ss->root->subsys_list.next == &ss->sibling) + link_css_set(&tmp_cg_links, res, cgrp); } + if (list_empty(&rootnode.subsys_list)) + link_css_set(&tmp_cg_links, res, dummytop); BUG_ON(!list_empty(&tmp_cg_links)); @@ -573,7 +575,6 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); - inode->i_blocks = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info; } @@ -588,11 +589,18 @@ static void cgroup_call_pre_destroy(struct cgroup *cgrp) { struct cgroup_subsys *ss; for_each_subsys(cgrp->root, ss) - if (ss->pre_destroy && cgrp->subsys[ss->subsys_id]) + if (ss->pre_destroy) ss->pre_destroy(ss, cgrp); return; } +static void 
free_cgroup_rcu(struct rcu_head *obj) +{ + struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head); + + kfree(cgrp); +} + static void cgroup_diput(struct dentry *dentry, struct inode *inode) { /* is dentry a directory ? if so, kfree() associated cgroup */ @@ -612,19 +620,19 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) /* * Release the subsystem state objects. */ - for_each_subsys(cgrp->root, ss) { - if (cgrp->subsys[ss->subsys_id]) - ss->destroy(ss, cgrp); - } + for_each_subsys(cgrp->root, ss) + ss->destroy(ss, cgrp); cgrp->root->number_of_cgroups--; mutex_unlock(&cgroup_mutex); - /* Drop the active superblock reference that we took when we - * created the cgroup */ + /* + * Drop the active superblock reference that we took when we + * created the cgroup + */ deactivate_super(cgrp->root->sb); - kfree(cgrp); + call_rcu(&cgrp->rcu_head, free_cgroup_rcu); } iput(inode); } @@ -714,23 +722,26 @@ static int rebind_subsystems(struct cgroupfs_root *root, BUG_ON(cgrp->subsys[i]); BUG_ON(!dummytop->subsys[i]); BUG_ON(dummytop->subsys[i]->cgroup != dummytop); + mutex_lock(&ss->hierarchy_mutex); cgrp->subsys[i] = dummytop->subsys[i]; cgrp->subsys[i]->cgroup = cgrp; - list_add(&ss->sibling, &root->subsys_list); - rcu_assign_pointer(ss->root, root); + list_move(&ss->sibling, &root->subsys_list); + ss->root = root; if (ss->bind) ss->bind(ss, cgrp); - + mutex_unlock(&ss->hierarchy_mutex); } else if (bit & removed_bits) { /* We're removing this subsystem */ BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); BUG_ON(cgrp->subsys[i]->cgroup != cgrp); + mutex_lock(&ss->hierarchy_mutex); if (ss->bind) ss->bind(ss, dummytop); dummytop->subsys[i]->cgroup = dummytop; cgrp->subsys[i] = NULL; - rcu_assign_pointer(subsys[i]->root, &rootnode); - list_del(&ss->sibling); + subsys[i]->root = &rootnode; + list_move(&ss->sibling, &rootnode.subsys_list); + mutex_unlock(&ss->hierarchy_mutex); } else if (bit & final_bits) { /* Subsystem state should already exist */ BUG_ON(!cgrp->subsys[i]); @@ -992,7 +1003,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, root = NULL; } else { /* New superblock */ - struct cgroup *cgrp = &root->top_cgroup; + struct cgroup *root_cgrp = &root->top_cgroup; struct inode *inode; int i; @@ -1033,7 +1044,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, list_add(&root->root_list, &roots); root_count++; - sb->s_root->d_fsdata = &root->top_cgroup; + sb->s_root->d_fsdata = root_cgrp; root->top_cgroup.dentry = sb->s_root; /* Link the top cgroup in this hierarchy into all @@ -1044,29 +1055,18 @@ static int cgroup_get_sb(struct file_system_type *fs_type, struct hlist_node *node; struct css_set *cg; - hlist_for_each_entry(cg, node, hhead, hlist) { - struct cg_cgroup_link *link; - - BUG_ON(list_empty(&tmp_cg_links)); - link = list_entry(tmp_cg_links.next, - struct cg_cgroup_link, - cgrp_link_list); - list_del(&link->cgrp_link_list); - link->cg = cg; - list_add(&link->cgrp_link_list, - &root->top_cgroup.css_sets); - list_add(&link->cg_link_list, &cg->cg_links); - } + hlist_for_each_entry(cg, node, hhead, hlist) + link_css_set(&tmp_cg_links, cg, root_cgrp); } write_unlock(&css_set_lock); free_cg_links(&tmp_cg_links); - BUG_ON(!list_empty(&cgrp->sibling)); - BUG_ON(!list_empty(&cgrp->children)); + BUG_ON(!list_empty(&root_cgrp->sibling)); + BUG_ON(!list_empty(&root_cgrp->children)); BUG_ON(root->number_of_cgroups != 1); - cgroup_populate_dir(cgrp); + cgroup_populate_dir(root_cgrp); mutex_unlock(&inode->i_mutex); mutex_unlock(&cgroup_mutex); } @@ 
-1115,10 +1115,9 @@ static void cgroup_kill_sb(struct super_block *sb) { } write_unlock(&css_set_lock); - if (!list_empty(&root->root_list)) { - list_del(&root->root_list); - root_count--; - } + list_del(&root->root_list); + root_count--; + mutex_unlock(&cgroup_mutex); kfree(root); @@ -1147,14 +1146,16 @@ static inline struct cftype *__d_cft(struct dentry *dentry) * @buf: the buffer to write the path into * @buflen: the length of the buffer * - * Called with cgroup_mutex held. Writes path of cgroup into buf. - * Returns 0 on success, -errno on error. + * Called with cgroup_mutex held or else with an RCU-protected cgroup + * reference. Writes path of cgroup into buf. Returns 0 on success, + * -errno on error. */ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) { char *start; + struct dentry *dentry = rcu_dereference(cgrp->dentry); - if (cgrp == dummytop) { + if (!dentry || cgrp == dummytop) { /* * Inactive subsystems have no dentry for their root * cgroup @@ -1167,13 +1168,14 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) *--start = '\0'; for (;;) { - int len = cgrp->dentry->d_name.len; + int len = dentry->d_name.len; if ((start -= len) < buf) return -ENAMETOOLONG; memcpy(start, cgrp->dentry->d_name.name, len); cgrp = cgrp->parent; if (!cgrp) break; + dentry = rcu_dereference(cgrp->dentry); if (!cgrp->parent) continue; if (--start < buf) @@ -1218,7 +1220,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) int retval = 0; struct cgroup_subsys *ss; struct cgroup *oldcgrp; - struct css_set *cg = tsk->cgroups; + struct css_set *cg; struct css_set *newcg; struct cgroupfs_root *root = cgrp->root; int subsys_id; @@ -1238,11 +1240,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) } } + task_lock(tsk); + cg = tsk->cgroups; + get_css_set(cg); + task_unlock(tsk); /* * Locate or allocate a new css_set for this task, * based on its final set of cgroups */ newcg = find_css_set(cg, cgrp); + put_css_set(cg); if (!newcg) return -ENOMEM; @@ -1447,7 +1454,7 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf, struct cftype *cft = __d_cft(file->f_dentry); struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - if (!cft || cgroup_is_removed(cgrp)) + if (cgroup_is_removed(cgrp)) return -ENODEV; if (cft->write) return cft->write(cgrp, cft, file, buf, nbytes, ppos); @@ -1492,7 +1499,7 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf, struct cftype *cft = __d_cft(file->f_dentry); struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - if (!cft || cgroup_is_removed(cgrp)) + if (cgroup_is_removed(cgrp)) return -ENODEV; if (cft->read) @@ -1556,10 +1563,8 @@ static int cgroup_file_open(struct inode *inode, struct file *file) err = generic_file_open(inode, file); if (err) return err; - cft = __d_cft(file->f_dentry); - if (!cft) - return -ENODEV; + if (cft->read_map || cft->read_seq_string) { struct cgroup_seqfile_state *state = kzalloc(sizeof(*state), GFP_USER); @@ -1673,7 +1678,7 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, if (!error) { dentry->d_fsdata = cgrp; inc_nlink(parent->d_inode); - cgrp->dentry = dentry; + rcu_assign_pointer(cgrp->dentry, dentry); dget(dentry); } dput(dentry); @@ -1814,6 +1819,7 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp, { struct task_struct *res; struct list_head *l = it->task; + struct cg_cgroup_link *link; /* If the iterator cg is NULL, we have no tasks */ if (!it->cg_link) @@ -1821,7 +1827,8 @@ struct 
task_struct *cgroup_iter_next(struct cgroup *cgrp, res = list_entry(l, struct task_struct, cg_list); /* Advance iterator to find next entry */ l = l->next; - if (l == &res->cgroups->tasks) { + link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list); + if (l == &link->cg->tasks) { /* We reached the end of this task list - move on to * the next cg_cgroup_link */ cgroup_advance_iter(cgrp, it); @@ -2015,14 +2022,16 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) */ static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp) { - int n = 0; + int n = 0, pid; struct cgroup_iter it; struct task_struct *tsk; cgroup_iter_start(cgrp, &it); while ((tsk = cgroup_iter_next(cgrp, &it))) { if (unlikely(n == npids)) break; - pidarray[n++] = task_pid_vnr(tsk); + pid = task_pid_vnr(tsk); + if (pid > 0) + pidarray[n++] = pid; } cgroup_iter_end(cgrp, &it); return n; @@ -2054,7 +2063,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) ret = 0; cgrp = dentry->d_fsdata; - rcu_read_lock(); cgroup_iter_start(cgrp, &it); while ((tsk = cgroup_iter_next(cgrp, &it))) { @@ -2079,7 +2087,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) } cgroup_iter_end(cgrp, &it); - rcu_read_unlock(); err: return ret; } @@ -2326,7 +2333,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, struct cgroup *cgrp) { css->cgroup = cgrp; - atomic_set(&css->refcnt, 0); + atomic_set(&css->refcnt, 1); css->flags = 0; if (cgrp == dummytop) set_bit(CSS_ROOT, &css->flags); @@ -2334,6 +2341,29 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, cgrp->subsys[ss->subsys_id] = css; } +static void cgroup_lock_hierarchy(struct cgroupfs_root *root) +{ + /* We need to take each hierarchy_mutex in a consistent order */ + int i; + + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + if (ss->root == root) + mutex_lock_nested(&ss->hierarchy_mutex, i); + } +} + +static void cgroup_unlock_hierarchy(struct cgroupfs_root *root) +{ + int i; + + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + if (ss->root == root) + mutex_unlock(&ss->hierarchy_mutex); + } +} + /* * cgroup_create - create a cgroup * @parent: cgroup that will be parent of the new cgroup @@ -2382,7 +2412,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, init_cgroup_css(css, ss, cgrp); } + cgroup_lock_hierarchy(root); list_add(&cgrp->sibling, &cgrp->parent->children); + cgroup_unlock_hierarchy(root); root->number_of_cgroups++; err = cgroup_create_dir(cgrp, dentry, mode); @@ -2433,7 +2465,7 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) { /* Check the reference count on each subsystem. Since we * already established that there are no tasks in the - * cgroup, if the css refcount is also 0, then there should + * cgroup, if the css refcount is also 1, then there should * be no outstanding references, so the subsystem is safe to * destroy. We scan across all subsystems rather than using * the per-hierarchy linked list of mounted subsystems since @@ -2454,19 +2486,67 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) * matter, since it can only happen if the cgroup * has been deleted and hence no longer needs the * release agent to be called anyway. */ - if (css && atomic_read(&css->refcnt)) + if (css && (atomic_read(&css->refcnt) > 1)) return 1; } return 0; } +/* + * Atomically mark all (or else none) of the cgroup's CSS objects as + * CSS_REMOVED. 
Return true on success, or false if the cgroup has + * busy subsystems. Call with cgroup_mutex held + */ + +static int cgroup_clear_css_refs(struct cgroup *cgrp) +{ + struct cgroup_subsys *ss; + unsigned long flags; + bool failed = false; + local_irq_save(flags); + for_each_subsys(cgrp->root, ss) { + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + int refcnt; + do { + /* We can only remove a CSS with a refcnt==1 */ + refcnt = atomic_read(&css->refcnt); + if (refcnt > 1) { + failed = true; + goto done; + } + BUG_ON(!refcnt); + /* + * Drop the refcnt to 0 while we check other + * subsystems. This will cause any racing + * css_tryget() to spin until we set the + * CSS_REMOVED bits or abort + */ + } while (atomic_cmpxchg(&css->refcnt, refcnt, 0) != refcnt); + } + done: + for_each_subsys(cgrp->root, ss) { + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + if (failed) { + /* + * Restore old refcnt if we previously managed + * to clear it from 1 to 0 + */ + if (!atomic_read(&css->refcnt)) + atomic_set(&css->refcnt, 1); + } else { + /* Commit the fact that the CSS is removed */ + set_bit(CSS_REMOVED, &css->flags); + } + } + local_irq_restore(flags); + return !failed; +} + static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) { struct cgroup *cgrp = dentry->d_fsdata; struct dentry *d; struct cgroup *parent; - struct super_block *sb; - struct cgroupfs_root *root; /* the vfs holds both inode->i_mutex already */ @@ -2489,12 +2569,10 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) mutex_lock(&cgroup_mutex); parent = cgrp->parent; - root = cgrp->root; - sb = root->sb; if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children) - || cgroup_has_css_refs(cgrp)) { + || !cgroup_clear_css_refs(cgrp)) { mutex_unlock(&cgroup_mutex); return -EBUSY; } @@ -2504,8 +2582,12 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) if (!list_empty(&cgrp->release_list)) list_del(&cgrp->release_list); spin_unlock(&release_list_lock); - /* delete my sibling from parent->children */ + + cgroup_lock_hierarchy(cgrp->root); + /* delete this cgroup from parent->children */ list_del(&cgrp->sibling); + cgroup_unlock_hierarchy(cgrp->root); + spin_lock(&cgrp->dentry->d_lock); d = dget(cgrp->dentry); spin_unlock(&d->d_lock); @@ -2527,6 +2609,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); /* Create the top cgroup state for this subsystem */ + list_add(&ss->sibling, &rootnode.subsys_list); ss->root = &rootnode; css = ss->create(ss, dummytop); /* We don't handle early failures gracefully */ @@ -2540,13 +2623,13 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; need_forkexit_callback |= ss->fork || ss->exit; - need_mm_owner_callback |= !!ss->mm_owner_changed; /* At system boot, before all subsystems have been * registered, no tasks have been forked, so we don't * need to invoke fork callbacks here. 
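cgroup_clear_css_refs() above relies on atomic_cmpxchg() to make removal all-or-nothing: each css refcount is dropped from 1 to 0 in turn, and if any subsystem turns out to be busy the already-frozen counters are restored. A simplified sketch of that idiom (no CSS_REMOVED bit, no irq handling; names are made up):

static bool freeze_all(atomic_t *counters, int n)
{
        int i, frozen;

        for (frozen = 0; frozen < n; frozen++) {
                /* Only an otherwise-unreferenced object (count == 1) may
                 * be frozen; atomic_cmpxchg() returns the old value. */
                if (atomic_cmpxchg(&counters[frozen], 1, 0) != 1)
                        break;
        }
        if (frozen == n)
                return true;            /* all frozen: commit the removal */

        /* Roll back the counters we already dropped to 0. */
        for (i = 0; i < frozen; i++)
                atomic_set(&counters[i], 1);
        return false;
}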
*/ BUG_ON(!list_empty(&init_task.tasks)); + mutex_init(&ss->hierarchy_mutex); ss->active = 1; } @@ -2565,7 +2648,6 @@ int __init cgroup_init_early(void) INIT_HLIST_NODE(&init_css_set.hlist); css_set_count = 1; init_cgroup_root(&rootnode); - list_add(&rootnode.root_list, &roots); root_count = 1; init_task.cgroups = &init_css_set; @@ -2672,15 +2754,12 @@ static int proc_cgroup_show(struct seq_file *m, void *v) mutex_lock(&cgroup_mutex); - for_each_root(root) { + for_each_active_root(root) { struct cgroup_subsys *ss; struct cgroup *cgrp; int subsys_id; int count = 0; - /* Skip this hierarchy if it has no active subsystems */ - if (!root->actual_subsys_bits) - continue; seq_printf(m, "%lu:", root->subsys_bits); for_each_subsys(root, ss) seq_printf(m, "%s%s", count++ ? "," : "", ss->name); @@ -2790,37 +2869,6 @@ void cgroup_fork_callbacks(struct task_struct *child) } } -#ifdef CONFIG_MM_OWNER -/** - * cgroup_mm_owner_callbacks - run callbacks when the mm->owner changes - * @p: the new owner - * - * Called on every change to mm->owner. mm_init_owner() does not - * invoke this routine, since it assigns the mm->owner the first time - * and does not change it. - * - * The callbacks are invoked with mmap_sem held in read mode. - */ -void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new) -{ - struct cgroup *oldcgrp, *newcgrp = NULL; - - if (need_mm_owner_callback) { - int i; - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - oldcgrp = task_cgroup(old, ss->subsys_id); - if (new) - newcgrp = task_cgroup(new, ss->subsys_id); - if (oldcgrp == newcgrp) - continue; - if (ss->mm_owner_changed) - ss->mm_owner_changed(ss, oldcgrp, newcgrp, new); - } - } -} -#endif /* CONFIG_MM_OWNER */ - /** * cgroup_post_fork - called on a new task after adding it to the task list * @child: the task in question @@ -2834,8 +2882,10 @@ void cgroup_post_fork(struct task_struct *child) { if (use_task_css_set_links) { write_lock(&css_set_lock); + task_lock(child); if (list_empty(&child->cg_list)) list_add(&child->cg_list, &child->cgroups->tasks); + task_unlock(child); write_unlock(&css_set_lock); } } @@ -2941,14 +2991,20 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, mutex_unlock(&cgroup_mutex); return 0; } + task_lock(tsk); cg = tsk->cgroups; parent = task_cgroup(tsk, subsys->subsys_id); /* Pin the hierarchy */ - atomic_inc(&parent->root->sb->s_active); + if (!atomic_inc_not_zero(&parent->root->sb->s_active)) { + /* We race with the final deactivate_super() */ + mutex_unlock(&cgroup_mutex); + return 0; + } /* Keep the cgroup alive */ get_css_set(cg); + task_unlock(tsk); mutex_unlock(&cgroup_mutex); /* Now do the VFS work to create a cgroup */ @@ -2967,7 +3023,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, } /* Create the cgroup directory, which also creates the cgroup */ - ret = vfs_mkdir(inode, dentry, S_IFDIR | 0755); + ret = vfs_mkdir(inode, dentry, 0755); child = __d_cgrp(dentry); dput(dentry); if (ret) { @@ -2977,13 +3033,6 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, goto out_release; } - if (!child) { - printk(KERN_INFO - "Couldn't find new cgroup %s\n", nodename); - ret = -ENOMEM; - goto out_release; - } - /* The cgroup now exists. Retake cgroup_mutex and check * that we're still in the same state that we thought we * were. 
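cgroup_clone() now pins the hierarchy with atomic_inc_not_zero() so it backs off instead of resurrecting a superblock whose active count has already reached zero. The underlying try-get idiom, reduced to a stand-alone sketch (struct and function names are hypothetical):

struct pinned_obj {
        atomic_t ref;
};

/* Take a reference only if the object is not already being torn down. */
static bool obj_tryget(struct pinned_obj *obj)
{
        return atomic_inc_not_zero(&obj->ref) != 0;
}

static void obj_put(struct pinned_obj *obj)
{
        if (atomic_dec_and_test(&obj->ref))
                kfree(obj);             /* last reference: release it */
}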
*/ @@ -3079,7 +3128,8 @@ void __css_put(struct cgroup_subsys_state *css) { struct cgroup *cgrp = css->cgroup; rcu_read_lock(); - if (atomic_dec_and_test(&css->refcnt) && notify_on_release(cgrp)) { + if ((atomic_dec_return(&css->refcnt) == 1) && + notify_on_release(cgrp)) { set_bit(CGRP_RELEASABLE, &cgrp->flags); check_for_release(cgrp); } diff --git a/kernel/compat.c b/kernel/compat.c index 8eafe3e..42d5654 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -24,6 +24,7 @@ #include <linux/migrate.h> #include <linux/posix-timers.h> #include <linux/times.h> +#include <linux/ptrace.h> #include <asm/uaccess.h> @@ -229,6 +230,7 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf) if (copy_to_user(tbuf, &tmp, sizeof(tmp))) return -EFAULT; } + force_successful_syscall_return(); return compat_jiffies_to_clock_t(jiffies); } @@ -454,16 +456,16 @@ asmlinkage long compat_sys_waitid(int which, compat_pid_t pid, } static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr, - unsigned len, cpumask_t *new_mask) + unsigned len, struct cpumask *new_mask) { unsigned long *k; - if (len < sizeof(cpumask_t)) - memset(new_mask, 0, sizeof(cpumask_t)); - else if (len > sizeof(cpumask_t)) - len = sizeof(cpumask_t); + if (len < cpumask_size()) + memset(new_mask, 0, cpumask_size()); + else if (len > cpumask_size()) + len = cpumask_size(); - k = cpus_addr(*new_mask); + k = cpumask_bits(new_mask); return compat_get_bitmap(k, user_mask_ptr, len * 8); } @@ -471,40 +473,51 @@ asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid, unsigned int len, compat_ulong_t __user *user_mask_ptr) { - cpumask_t new_mask; + cpumask_var_t new_mask; int retval; - retval = compat_get_user_cpu_mask(user_mask_ptr, len, &new_mask); + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) + return -ENOMEM; + + retval = compat_get_user_cpu_mask(user_mask_ptr, len, new_mask); if (retval) - return retval; + goto out; - return sched_setaffinity(pid, &new_mask); + retval = sched_setaffinity(pid, new_mask); +out: + free_cpumask_var(new_mask); + return retval; } asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, compat_ulong_t __user *user_mask_ptr) { int ret; - cpumask_t mask; + cpumask_var_t mask; unsigned long *k; - unsigned int min_length = sizeof(cpumask_t); + unsigned int min_length = cpumask_size(); - if (NR_CPUS <= BITS_PER_COMPAT_LONG) + if (nr_cpu_ids <= BITS_PER_COMPAT_LONG) min_length = sizeof(compat_ulong_t); if (len < min_length) return -EINVAL; - ret = sched_getaffinity(pid, &mask); + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + ret = sched_getaffinity(pid, mask); if (ret < 0) - return ret; + goto out; - k = cpus_addr(mask); + k = cpumask_bits(mask); ret = compat_put_bitmap(user_mask_ptr, k, min_length * 8); - if (ret) - return ret; + if (ret == 0) + ret = min_length; - return min_length; +out: + free_cpumask_var(mask); + return ret; } int get_compat_itimerspec(struct itimerspec *dst, @@ -883,8 +896,9 @@ asmlinkage long compat_sys_time(compat_time_t __user * tloc) if (tloc) { if (put_user(i,tloc)) - i = -EFAULT; + return -EFAULT; } + force_successful_syscall_return(); return i; } diff --git a/kernel/cpu.c b/kernel/cpu.c index 8ea32e8..79e40f0 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -15,29 +15,8 @@ #include <linux/stop_machine.h> #include <linux/mutex.h> -/* - * Represents all cpu's present in the system - * In systems capable of hotplug, this map could dynamically grow - * as new cpu's are detected in the system via any platform specific - * 
method, such as ACPI for e.g. - */ -cpumask_t cpu_present_map __read_mostly; -EXPORT_SYMBOL(cpu_present_map); - -#ifndef CONFIG_SMP - -/* - * Represents all cpu's that are currently online. - */ -cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; -EXPORT_SYMBOL(cpu_online_map); - -cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; -EXPORT_SYMBOL(cpu_possible_map); - -#else /* CONFIG_SMP */ - -/* Serializes the updates to cpu_online_map, cpu_present_map */ +#ifdef CONFIG_SMP +/* Serializes the updates to cpu_online_mask, cpu_present_mask */ static DEFINE_MUTEX(cpu_add_remove_lock); static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); @@ -64,8 +43,6 @@ void __init cpu_hotplug_init(void) cpu_hotplug.refcount = 0; } -cpumask_t cpu_active_map; - #ifdef CONFIG_HOTPLUG_CPU void get_online_cpus(void) @@ -96,7 +73,7 @@ EXPORT_SYMBOL_GPL(put_online_cpus); /* * The following two API's must be used when attempting - * to serialize the updates to cpu_online_map, cpu_present_map. + * to serialize the updates to cpu_online_mask, cpu_present_mask. */ void cpu_maps_update_begin(void) { @@ -217,7 +194,7 @@ static int __ref take_cpu_down(void *_param) static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) { int err, nr_calls = 0; - cpumask_t old_allowed, tmp; + cpumask_var_t old_allowed; void *hcpu = (void *)(long)cpu; unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; struct take_cpu_down_param tcd_param = { @@ -231,6 +208,9 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) if (!cpu_online(cpu)) return -EINVAL; + if (!alloc_cpumask_var(&old_allowed, GFP_KERNEL)) + return -ENOMEM; + cpu_hotplug_begin(); err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); @@ -245,13 +225,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) } /* Ensure that we are not runnable on dying cpu */ - old_allowed = current->cpus_allowed; - cpus_setall(tmp); - cpu_clear(cpu, tmp); - set_cpus_allowed_ptr(current, &tmp); - tmp = cpumask_of_cpu(cpu); + cpumask_copy(old_allowed, ¤t->cpus_allowed); + set_cpus_allowed_ptr(current, + cpumask_of(cpumask_any_but(cpu_online_mask, cpu))); - err = __stop_machine(take_cpu_down, &tcd_param, &tmp); + err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); if (err) { /* CPU didn't die: tell everyone. Can't complain. */ if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, @@ -277,7 +255,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) check_for_tasks(cpu); out_allowed: - set_cpus_allowed_ptr(current, &old_allowed); + set_cpus_allowed_ptr(current, old_allowed); out_release: cpu_hotplug_done(); if (!err) { @@ -285,13 +263,17 @@ out_release: hcpu) == NOTIFY_BAD) BUG(); } + free_cpumask_var(old_allowed); return err; } int __ref cpu_down(unsigned int cpu) { - int err = 0; + int err; + err = stop_machine_create(); + if (err) + return err; cpu_maps_update_begin(); if (cpu_hotplug_disabled) { @@ -303,7 +285,7 @@ int __ref cpu_down(unsigned int cpu) /* * Make sure the all cpus did the reschedule and are not - * using stale version of the cpu_active_map. + * using stale version of the cpu_active_mask. * This is not strictly necessary becuase stop_machine() * that we run down the line already provides the required * synchronization. 
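The _cpu_down() hunk above shows the general shape of the cpumask conversion in this series: on-stack cpumask_t locals become cpumask_var_t, which must be explicitly allocated and freed. A minimal sketch of that calling convention (the function itself is hypothetical):

static int example_with_cpumask(void)
{
        cpumask_var_t mask;

        if (!alloc_cpumask_var(&mask, GFP_KERNEL))
                return -ENOMEM;

        cpumask_copy(mask, cpu_online_mask);
        cpumask_clear_cpu(0, mask);     /* e.g. exclude the boot CPU */
        /* ... use the mask ... */

        free_cpumask_var(mask);
        return 0;
}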
But it's really a side effect and we do not @@ -318,6 +300,7 @@ int __ref cpu_down(unsigned int cpu) out: cpu_maps_update_done(); + stop_machine_destroy(); return err; } EXPORT_SYMBOL(cpu_down); @@ -367,7 +350,7 @@ out_notify: int __cpuinit cpu_up(unsigned int cpu) { int err = 0; - if (!cpu_isset(cpu, cpu_possible_map)) { + if (!cpu_possible(cpu)) { printk(KERN_ERR "can't online cpu %d because it is not " "configured as may-hotadd at boot time\n", cpu); #if defined(CONFIG_IA64) || defined(CONFIG_X86_64) @@ -392,25 +375,28 @@ out: } #ifdef CONFIG_PM_SLEEP_SMP -static cpumask_t frozen_cpus; +static cpumask_var_t frozen_cpus; int disable_nonboot_cpus(void) { - int cpu, first_cpu, error = 0; + int cpu, first_cpu, error; + error = stop_machine_create(); + if (error) + return error; cpu_maps_update_begin(); - first_cpu = first_cpu(cpu_online_map); + first_cpu = cpumask_first(cpu_online_mask); /* We take down all of the non-boot CPUs in one shot to avoid races * with the userspace trying to use the CPU hotplug at the same time */ - cpus_clear(frozen_cpus); + cpumask_clear(frozen_cpus); printk("Disabling non-boot CPUs ...\n"); for_each_online_cpu(cpu) { if (cpu == first_cpu) continue; error = _cpu_down(cpu, 1); if (!error) { - cpu_set(cpu, frozen_cpus); + cpumask_set_cpu(cpu, frozen_cpus); printk("CPU%d is down\n", cpu); } else { printk(KERN_ERR "Error taking CPU%d down: %d\n", @@ -426,6 +412,7 @@ int disable_nonboot_cpus(void) printk(KERN_ERR "Non-boot CPUs are not disabled\n"); } cpu_maps_update_done(); + stop_machine_destroy(); return error; } @@ -436,11 +423,11 @@ void __ref enable_nonboot_cpus(void) /* Allow everyone to use the CPU hotplug again */ cpu_maps_update_begin(); cpu_hotplug_disabled = 0; - if (cpus_empty(frozen_cpus)) + if (cpumask_empty(frozen_cpus)) goto out; printk("Enabling non-boot CPUs ...\n"); - for_each_cpu_mask_nr(cpu, frozen_cpus) { + for_each_cpu(cpu, frozen_cpus) { error = _cpu_up(cpu, 1); if (!error) { printk("CPU%d is up\n", cpu); @@ -448,10 +435,18 @@ void __ref enable_nonboot_cpus(void) } printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); } - cpus_clear(frozen_cpus); + cpumask_clear(frozen_cpus); out: cpu_maps_update_done(); } + +static int alloc_frozen_cpus(void) +{ + if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO)) + return -ENOMEM; + return 0; +} +core_initcall(alloc_frozen_cpus); #endif /* CONFIG_PM_SLEEP_SMP */ /** @@ -467,7 +462,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu) unsigned long val = CPU_STARTING; #ifdef CONFIG_PM_SLEEP_SMP - if (cpu_isset(cpu, frozen_cpus)) + if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus)) val = CPU_STARTING_FROZEN; #endif /* CONFIG_PM_SLEEP_SMP */ raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu); @@ -479,7 +474,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu) * cpu_bit_bitmap[] is a special, "compressed" data structure that * represents all NR_CPUS bits binary values of 1<<nr. * - * It is used by cpumask_of_cpu() to get a constant address to a CPU + * It is used by cpumask_of() to get a constant address to a CPU * mask value that has a single bit set only. 
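frozen_cpus above shows the companion pattern for file-scope masks: what used to be a static cpumask_t is now a cpumask_var_t allocated once at boot from an initcall, cleared via __GFP_ZERO. Sketch of that setup with a stand-in variable name:

static cpumask_var_t example_mask;

static int __init example_alloc_mask(void)
{
        /* __GFP_ZERO gives a cleared mask, like the old static storage. */
        if (!alloc_cpumask_var(&example_mask, GFP_KERNEL | __GFP_ZERO))
                return -ENOMEM;
        return 0;
}
core_initcall(example_alloc_mask);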
*/ @@ -502,3 +497,71 @@ EXPORT_SYMBOL_GPL(cpu_bit_bitmap); const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL; EXPORT_SYMBOL(cpu_all_bits); + +#ifdef CONFIG_INIT_ALL_POSSIBLE +static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly + = CPU_BITS_ALL; +#else +static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly; +#endif +const struct cpumask *const cpu_possible_mask = to_cpumask(cpu_possible_bits); +EXPORT_SYMBOL(cpu_possible_mask); + +static DECLARE_BITMAP(cpu_online_bits, CONFIG_NR_CPUS) __read_mostly; +const struct cpumask *const cpu_online_mask = to_cpumask(cpu_online_bits); +EXPORT_SYMBOL(cpu_online_mask); + +static DECLARE_BITMAP(cpu_present_bits, CONFIG_NR_CPUS) __read_mostly; +const struct cpumask *const cpu_present_mask = to_cpumask(cpu_present_bits); +EXPORT_SYMBOL(cpu_present_mask); + +static DECLARE_BITMAP(cpu_active_bits, CONFIG_NR_CPUS) __read_mostly; +const struct cpumask *const cpu_active_mask = to_cpumask(cpu_active_bits); +EXPORT_SYMBOL(cpu_active_mask); + +void set_cpu_possible(unsigned int cpu, bool possible) +{ + if (possible) + cpumask_set_cpu(cpu, to_cpumask(cpu_possible_bits)); + else + cpumask_clear_cpu(cpu, to_cpumask(cpu_possible_bits)); +} + +void set_cpu_present(unsigned int cpu, bool present) +{ + if (present) + cpumask_set_cpu(cpu, to_cpumask(cpu_present_bits)); + else + cpumask_clear_cpu(cpu, to_cpumask(cpu_present_bits)); +} + +void set_cpu_online(unsigned int cpu, bool online) +{ + if (online) + cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits)); + else + cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits)); +} + +void set_cpu_active(unsigned int cpu, bool active) +{ + if (active) + cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits)); + else + cpumask_clear_cpu(cpu, to_cpumask(cpu_active_bits)); +} + +void init_cpu_present(const struct cpumask *src) +{ + cpumask_copy(to_cpumask(cpu_present_bits), src); +} + +void init_cpu_possible(const struct cpumask *src) +{ + cpumask_copy(to_cpumask(cpu_possible_bits), src); +} + +void init_cpu_online(const struct cpumask *src) +{ + cpumask_copy(to_cpumask(cpu_online_bits), src); +} diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 96c0ba1..647c77a 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -84,7 +84,7 @@ struct cpuset { struct cgroup_subsys_state css; unsigned long flags; /* "unsigned long" so bitops work */ - cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ + cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ struct cpuset *parent; /* my parent */ @@ -195,8 +195,6 @@ static int cpuset_mems_generation; static struct cpuset top_cpuset = { .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), - .cpus_allowed = CPU_MASK_ALL, - .mems_allowed = NODE_MASK_ALL, }; /* @@ -240,6 +238,17 @@ static struct cpuset top_cpuset = { static DEFINE_MUTEX(callback_mutex); /* + * cpuset_buffer_lock protects both the cpuset_name and cpuset_nodelist + * buffers. They are statically allocated to prevent using excess stack + * when calling cpuset_print_task_mems_allowed(). + */ +#define CPUSET_NAME_LEN (128) +#define CPUSET_NODELIST_LEN (256) +static char cpuset_name[CPUSET_NAME_LEN]; +static char cpuset_nodelist[CPUSET_NODELIST_LEN]; +static DEFINE_SPINLOCK(cpuset_buffer_lock); + +/* * This is ugly, but preserves the userspace API for existing cpuset * users. 
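With cpu_possible_bits and friends now private to kernel/cpu.c, architecture and early-boot code is expected to go through the new set_cpu_*()/init_cpu_*() accessors instead of writing the old cpu_*_map variables directly. A hypothetical bring-up snippet:

static void __cpuinit example_mark_cpu_up(unsigned int cpu)
{
        set_cpu_present(cpu, true);
        set_cpu_online(cpu, true);      /* rather than cpu_set(cpu, cpu_online_map) */
}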
If someone tries to mount the "cpuset" filesystem, we * silently switch it to mount "cgroup" instead @@ -267,7 +276,7 @@ static struct file_system_type cpuset_fs_type = { }; /* - * Return in *pmask the portion of a cpusets's cpus_allowed that + * Return in pmask the portion of a cpusets's cpus_allowed that * are online. If none are online, walk up the cpuset hierarchy * until we find one that does have some online cpus. If we get * all the way to the top and still haven't found any online cpus, @@ -280,15 +289,16 @@ static struct file_system_type cpuset_fs_type = { * Call with callback_mutex held. */ -static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) +static void guarantee_online_cpus(const struct cpuset *cs, + struct cpumask *pmask) { - while (cs && !cpus_intersects(cs->cpus_allowed, cpu_online_map)) + while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) cs = cs->parent; if (cs) - cpus_and(*pmask, cs->cpus_allowed, cpu_online_map); + cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); else - *pmask = cpu_online_map; - BUG_ON(!cpus_intersects(*pmask, cpu_online_map)); + cpumask_copy(pmask, cpu_online_mask); + BUG_ON(!cpumask_intersects(pmask, cpu_online_mask)); } /* @@ -364,14 +374,9 @@ void cpuset_update_task_memory_state(void) struct task_struct *tsk = current; struct cpuset *cs; - if (task_cs(tsk) == &top_cpuset) { - /* Don't need rcu for top_cpuset. It's never freed. */ - my_cpusets_mem_gen = top_cpuset.mems_generation; - } else { - rcu_read_lock(); - my_cpusets_mem_gen = task_cs(tsk)->mems_generation; - rcu_read_unlock(); - } + rcu_read_lock(); + my_cpusets_mem_gen = task_cs(tsk)->mems_generation; + rcu_read_unlock(); if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { mutex_lock(&callback_mutex); @@ -403,12 +408,43 @@ void cpuset_update_task_memory_state(void) static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) { - return cpus_subset(p->cpus_allowed, q->cpus_allowed) && + return cpumask_subset(p->cpus_allowed, q->cpus_allowed) && nodes_subset(p->mems_allowed, q->mems_allowed) && is_cpu_exclusive(p) <= is_cpu_exclusive(q) && is_mem_exclusive(p) <= is_mem_exclusive(q); } +/** + * alloc_trial_cpuset - allocate a trial cpuset + * @cs: the cpuset that the trial cpuset duplicates + */ +static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs) +{ + struct cpuset *trial; + + trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL); + if (!trial) + return NULL; + + if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) { + kfree(trial); + return NULL; + } + cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); + + return trial; +} + +/** + * free_trial_cpuset - free the trial cpuset + * @trial: the trial cpuset to be freed + */ +static void free_trial_cpuset(struct cpuset *trial) +{ + free_cpumask_var(trial->cpus_allowed); + kfree(trial); +} + /* * validate_change() - Used to validate that any proposed cpuset change * follows the structural rules for cpusets. 
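alloc_trial_cpuset()/free_trial_cpuset() above replace the old on-stack "struct cpuset trialcs" copy, which no longer works once cpus_allowed is a cpumask_var_t. Callers take the shape below (sketch only, with error handling trimmed and the copy-back step reduced to a comment):

static int example_modify(struct cpuset *cs)
{
        struct cpuset *trialcs;
        int err;

        trialcs = alloc_trial_cpuset(cs);
        if (!trialcs)
                return -ENOMEM;

        /* ... mutate trialcs->flags or trialcs->cpus_allowed ... */

        err = validate_change(cs, trialcs);
        /* on success, copy the validated state back under callback_mutex */

        free_trial_cpuset(trialcs);
        return err;
}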
@@ -458,7 +494,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) c = cgroup_cs(cont); if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && c != cur && - cpus_intersects(trial->cpus_allowed, c->cpus_allowed)) + cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) return -EINVAL; if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && c != cur && @@ -468,7 +504,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */ if (cgroup_task_count(cur->css.cgroup)) { - if (cpus_empty(trial->cpus_allowed) || + if (cpumask_empty(trial->cpus_allowed) || nodes_empty(trial->mems_allowed)) { return -ENOSPC; } @@ -483,7 +519,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) */ static int cpusets_overlap(struct cpuset *a, struct cpuset *b) { - return cpus_intersects(a->cpus_allowed, b->cpus_allowed); + return cpumask_intersects(a->cpus_allowed, b->cpus_allowed); } static void @@ -508,7 +544,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) cp = list_first_entry(&q, struct cpuset, stack_list); list_del(q.next); - if (cpus_empty(cp->cpus_allowed)) + if (cpumask_empty(cp->cpus_allowed)) continue; if (is_sched_load_balance(cp)) @@ -575,7 +611,8 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) * element of the partition (one sched domain) to be passed to * partition_sched_domains(). */ -static int generate_sched_domains(cpumask_t **domains, +/* FIXME: see the FIXME in partition_sched_domains() */ +static int generate_sched_domains(struct cpumask **domains, struct sched_domain_attr **attributes) { LIST_HEAD(q); /* queue of cpusets to be scanned */ @@ -583,10 +620,10 @@ static int generate_sched_domains(cpumask_t **domains, struct cpuset **csa; /* array of all cpuset ptrs */ int csn; /* how many cpuset ptrs in csa so far */ int i, j, k; /* indices for partition finding loops */ - cpumask_t *doms; /* resulting partition; i.e. sched domains */ + struct cpumask *doms; /* resulting partition; i.e. sched domains */ struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ - int nslot; /* next empty doms[] cpumask_t slot */ + int nslot; /* next empty doms[] struct cpumask slot */ doms = NULL; dattr = NULL; @@ -594,7 +631,7 @@ static int generate_sched_domains(cpumask_t **domains, /* Special case for the 99% of systems with one, full, sched domain */ if (is_sched_load_balance(&top_cpuset)) { - doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); + doms = kmalloc(cpumask_size(), GFP_KERNEL); if (!doms) goto done; @@ -603,7 +640,7 @@ static int generate_sched_domains(cpumask_t **domains, *dattr = SD_ATTR_INIT; update_domain_attr_tree(dattr, &top_cpuset); } - *doms = top_cpuset.cpus_allowed; + cpumask_copy(doms, top_cpuset.cpus_allowed); ndoms = 1; goto done; @@ -622,7 +659,7 @@ static int generate_sched_domains(cpumask_t **domains, cp = list_first_entry(&q, struct cpuset, stack_list); list_del(q.next); - if (cpus_empty(cp->cpus_allowed)) + if (cpumask_empty(cp->cpus_allowed)) continue; /* @@ -673,7 +710,7 @@ restart: * Now we know how many domains to create. * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. 
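The single-domain fast path in generate_sched_domains() also illustrates how heap-allocated masks are sized now: kmalloc(cpumask_size(), ...) rather than sizeof(cpumask_t), with cpumask_copy() to fill the result. Reduced to a stand-alone sketch (the function name is made up):

static struct cpumask *dup_online_mask(void)
{
        struct cpumask *mask = kmalloc(cpumask_size(), GFP_KERNEL);

        if (mask)
                cpumask_copy(mask, cpu_online_mask);
        return mask;                    /* caller kfree()s it when done */
}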
*/ - doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); + doms = kmalloc(ndoms * cpumask_size(), GFP_KERNEL); if (!doms) goto done; @@ -685,7 +722,7 @@ restart: for (nslot = 0, i = 0; i < csn; i++) { struct cpuset *a = csa[i]; - cpumask_t *dp; + struct cpumask *dp; int apn = a->pn; if (apn < 0) { @@ -708,14 +745,14 @@ restart: continue; } - cpus_clear(*dp); + cpumask_clear(dp); if (dattr) *(dattr + nslot) = SD_ATTR_INIT; for (j = i; j < csn; j++) { struct cpuset *b = csa[j]; if (apn == b->pn) { - cpus_or(*dp, *dp, b->cpus_allowed); + cpumask_or(dp, dp, b->cpus_allowed); if (dattr) update_domain_attr_tree(dattr + nslot, b); @@ -755,7 +792,7 @@ done: static void do_rebuild_sched_domains(struct work_struct *unused) { struct sched_domain_attr *attr; - cpumask_t *doms; + struct cpumask *doms; int ndoms; get_online_cpus(); @@ -824,7 +861,7 @@ void rebuild_sched_domains(void) static int cpuset_test_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) { - return !cpus_equal(tsk->cpus_allowed, + return !cpumask_equal(&tsk->cpus_allowed, (cgroup_cs(scan->cg))->cpus_allowed); } @@ -842,7 +879,7 @@ static int cpuset_test_cpumask(struct task_struct *tsk, static void cpuset_change_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) { - set_cpus_allowed_ptr(tsk, &((cgroup_cs(scan->cg))->cpus_allowed)); + set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed)); } /** @@ -874,10 +911,10 @@ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) * @cs: the cpuset to consider * @buf: buffer of cpu numbers written to this cpuset */ -static int update_cpumask(struct cpuset *cs, const char *buf) +static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, + const char *buf) { struct ptr_heap heap; - struct cpuset trialcs; int retval; int is_load_balanced; @@ -885,8 +922,6 @@ static int update_cpumask(struct cpuset *cs, const char *buf) if (cs == &top_cpuset) return -EACCES; - trialcs = *cs; - /* * An empty cpus_allowed is ok only if the cpuset has no tasks. * Since cpulist_parse() fails on an empty mask, we special case @@ -894,31 +929,31 @@ static int update_cpumask(struct cpuset *cs, const char *buf) * with tasks have cpus. 
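The comment above describes the parse step that follows; with the struct cpumask helpers it boils down to the check below (hypothetical helper name):

static int parse_user_cpulist(const char *buf, struct cpumask *mask)
{
        int err;

        if (!*buf) {                    /* empty list: clear the mask */
                cpumask_clear(mask);
                return 0;
        }

        err = cpulist_parse(buf, mask);
        if (err < 0)
                return err;

        /* Only CPUs that are currently online may be requested. */
        if (!cpumask_subset(mask, cpu_online_mask))
                return -EINVAL;
        return 0;
}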
*/ if (!*buf) { - cpus_clear(trialcs.cpus_allowed); + cpumask_clear(trialcs->cpus_allowed); } else { - retval = cpulist_parse(buf, trialcs.cpus_allowed); + retval = cpulist_parse(buf, trialcs->cpus_allowed); if (retval < 0) return retval; - if (!cpus_subset(trialcs.cpus_allowed, cpu_online_map)) + if (!cpumask_subset(trialcs->cpus_allowed, cpu_online_mask)) return -EINVAL; } - retval = validate_change(cs, &trialcs); + retval = validate_change(cs, trialcs); if (retval < 0) return retval; /* Nothing to do if the cpus didn't change */ - if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) + if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) return 0; retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); if (retval) return retval; - is_load_balanced = is_sched_load_balance(&trialcs); + is_load_balanced = is_sched_load_balance(trialcs); mutex_lock(&callback_mutex); - cs->cpus_allowed = trialcs.cpus_allowed; + cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); mutex_unlock(&callback_mutex); /* @@ -1006,7 +1041,7 @@ static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem) cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ fudge = 10; /* spare mmarray[] slots */ - fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */ + fudge += cpumask_weight(cs->cpus_allowed);/* imagine 1 fork-bomb/cpu */ retval = -ENOMEM; /* @@ -1093,9 +1128,9 @@ done: * lock each such tasks mm->mmap_sem, scan its vma's and rebind * their mempolicies to the cpusets new mems_allowed. */ -static int update_nodemask(struct cpuset *cs, const char *buf) +static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, + const char *buf) { - struct cpuset trialcs; nodemask_t oldmem; int retval; @@ -1106,8 +1141,6 @@ static int update_nodemask(struct cpuset *cs, const char *buf) if (cs == &top_cpuset) return -EACCES; - trialcs = *cs; - /* * An empty mems_allowed is ok iff there are no tasks in the cpuset. * Since nodelist_parse() fails on an empty mask, we special case @@ -1115,27 +1148,27 @@ static int update_nodemask(struct cpuset *cs, const char *buf) * with tasks have memory. 
*/ if (!*buf) { - nodes_clear(trialcs.mems_allowed); + nodes_clear(trialcs->mems_allowed); } else { - retval = nodelist_parse(buf, trialcs.mems_allowed); + retval = nodelist_parse(buf, trialcs->mems_allowed); if (retval < 0) goto done; - if (!nodes_subset(trialcs.mems_allowed, + if (!nodes_subset(trialcs->mems_allowed, node_states[N_HIGH_MEMORY])) return -EINVAL; } oldmem = cs->mems_allowed; - if (nodes_equal(oldmem, trialcs.mems_allowed)) { + if (nodes_equal(oldmem, trialcs->mems_allowed)) { retval = 0; /* Too easy - nothing to do */ goto done; } - retval = validate_change(cs, &trialcs); + retval = validate_change(cs, trialcs); if (retval < 0) goto done; mutex_lock(&callback_mutex); - cs->mems_allowed = trialcs.mems_allowed; + cs->mems_allowed = trialcs->mems_allowed; cs->mems_generation = cpuset_mems_generation++; mutex_unlock(&callback_mutex); @@ -1156,7 +1189,8 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) if (val != cs->relax_domain_level) { cs->relax_domain_level = val; - if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) + if (!cpumask_empty(cs->cpus_allowed) && + is_sched_load_balance(cs)) async_rebuild_sched_domains(); } @@ -1175,31 +1209,36 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on) { - struct cpuset trialcs; + struct cpuset *trialcs; int err; int balance_flag_changed; - trialcs = *cs; + trialcs = alloc_trial_cpuset(cs); + if (!trialcs) + return -ENOMEM; + if (turning_on) - set_bit(bit, &trialcs.flags); + set_bit(bit, &trialcs->flags); else - clear_bit(bit, &trialcs.flags); + clear_bit(bit, &trialcs->flags); - err = validate_change(cs, &trialcs); + err = validate_change(cs, trialcs); if (err < 0) - return err; + goto out; balance_flag_changed = (is_sched_load_balance(cs) != - is_sched_load_balance(&trialcs)); + is_sched_load_balance(trialcs)); mutex_lock(&callback_mutex); - cs->flags = trialcs.flags; + cs->flags = trialcs->flags; mutex_unlock(&callback_mutex); - if (!cpus_empty(trialcs.cpus_allowed) && balance_flag_changed) + if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) async_rebuild_sched_domains(); - return 0; +out: + free_trial_cpuset(trialcs); + return err; } /* @@ -1300,42 +1339,47 @@ static int fmeter_getrate(struct fmeter *fmp) return val; } +/* Protected by cgroup_lock */ +static cpumask_var_t cpus_attach; + /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, struct task_struct *tsk) { struct cpuset *cs = cgroup_cs(cont); + int ret = 0; - if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) + if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) return -ENOSPC; - if (tsk->flags & PF_THREAD_BOUND) { - cpumask_t mask; + if (tsk->flags & PF_THREAD_BOUND) { mutex_lock(&callback_mutex); - mask = cs->cpus_allowed; + if (!cpumask_equal(&tsk->cpus_allowed, cs->cpus_allowed)) + ret = -EINVAL; mutex_unlock(&callback_mutex); - if (!cpus_equal(tsk->cpus_allowed, mask)) - return -EINVAL; } - return security_task_setscheduler(tsk, 0, NULL); + return ret < 0 ? 
ret : security_task_setscheduler(tsk, 0, NULL); } static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, struct cgroup *oldcont, struct task_struct *tsk) { - cpumask_t cpus; nodemask_t from, to; struct mm_struct *mm; struct cpuset *cs = cgroup_cs(cont); struct cpuset *oldcs = cgroup_cs(oldcont); int err; - mutex_lock(&callback_mutex); - guarantee_online_cpus(cs, &cpus); - err = set_cpus_allowed_ptr(tsk, &cpus); - mutex_unlock(&callback_mutex); + if (cs == &top_cpuset) { + cpumask_copy(cpus_attach, cpu_possible_mask); + } else { + mutex_lock(&callback_mutex); + guarantee_online_cpus(cs, cpus_attach); + mutex_unlock(&callback_mutex); + } + err = set_cpus_allowed_ptr(tsk, cpus_attach); if (err) return; @@ -1348,7 +1392,6 @@ static void cpuset_attach(struct cgroup_subsys *ss, cpuset_migrate_mm(mm, &from, &to); mmput(mm); } - } /* The various types of files and directories in a cpuset file system */ @@ -1443,21 +1486,29 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, const char *buf) { int retval = 0; + struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *trialcs; if (!cgroup_lock_live_group(cgrp)) return -ENODEV; + trialcs = alloc_trial_cpuset(cs); + if (!trialcs) + return -ENOMEM; + switch (cft->private) { case FILE_CPULIST: - retval = update_cpumask(cgroup_cs(cgrp), buf); + retval = update_cpumask(cs, trialcs, buf); break; case FILE_MEMLIST: - retval = update_nodemask(cgroup_cs(cgrp), buf); + retval = update_nodemask(cs, trialcs, buf); break; default: retval = -EINVAL; break; } + + free_trial_cpuset(trialcs); cgroup_unlock(); return retval; } @@ -1476,13 +1527,13 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) { - cpumask_t mask; + int ret; mutex_lock(&callback_mutex); - mask = cs->cpus_allowed; + ret = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed); mutex_unlock(&callback_mutex); - return cpulist_scnprintf(page, PAGE_SIZE, mask); + return ret; } static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) @@ -1718,7 +1769,7 @@ static void cpuset_post_clone(struct cgroup_subsys *ss, parent_cs = cgroup_cs(parent); cs->mems_allowed = parent_cs->mems_allowed; - cs->cpus_allowed = parent_cs->cpus_allowed; + cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed); return; } @@ -1744,6 +1795,10 @@ static struct cgroup_subsys_state *cpuset_create( cs = kmalloc(sizeof(*cs), GFP_KERNEL); if (!cs) return ERR_PTR(-ENOMEM); + if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) { + kfree(cs); + return ERR_PTR(-ENOMEM); + } cpuset_update_task_memory_state(); cs->flags = 0; @@ -1752,7 +1807,7 @@ static struct cgroup_subsys_state *cpuset_create( if (is_spread_slab(parent)) set_bit(CS_SPREAD_SLAB, &cs->flags); set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); - cpus_clear(cs->cpus_allowed); + cpumask_clear(cs->cpus_allowed); nodes_clear(cs->mems_allowed); cs->mems_generation = cpuset_mems_generation++; fmeter_init(&cs->fmeter); @@ -1779,6 +1834,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); number_of_cpusets--; + free_cpumask_var(cs->cpus_allowed); kfree(cs); } @@ -1802,6 +1858,8 @@ struct cgroup_subsys cpuset_subsys = { int __init cpuset_init_early(void) { + alloc_bootmem_cpumask_var(&top_cpuset.cpus_allowed); + top_cpuset.mems_generation = cpuset_mems_generation++; return 0; } @@ -1817,7 +1875,7 @@ int __init cpuset_init(void) { int err = 0; - cpus_setall(top_cpuset.cpus_allowed); + 
cpumask_setall(top_cpuset.cpus_allowed); nodes_setall(top_cpuset.mems_allowed); fmeter_init(&top_cpuset.fmeter); @@ -1829,6 +1887,9 @@ int __init cpuset_init(void) if (err < 0) return err; + if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)) + BUG(); + number_of_cpusets = 1; return 0; } @@ -1903,7 +1964,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) * has online cpus, so can't be empty). */ parent = cs->parent; - while (cpus_empty(parent->cpus_allowed) || + while (cpumask_empty(parent->cpus_allowed) || nodes_empty(parent->mems_allowed)) parent = parent->parent; @@ -1944,7 +2005,7 @@ static void scan_for_empty_cpusets(struct cpuset *root) } /* Continue past cpusets with all cpus, mems online */ - if (cpus_subset(cp->cpus_allowed, cpu_online_map) && + if (cpumask_subset(cp->cpus_allowed, cpu_online_mask) && nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) continue; @@ -1952,13 +2013,14 @@ static void scan_for_empty_cpusets(struct cpuset *root) /* Remove offline cpus and mems from this cpuset. */ mutex_lock(&callback_mutex); - cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map); + cpumask_and(cp->cpus_allowed, cp->cpus_allowed, + cpu_online_mask); nodes_and(cp->mems_allowed, cp->mems_allowed, node_states[N_HIGH_MEMORY]); mutex_unlock(&callback_mutex); /* Move tasks from the empty cpuset to a parent */ - if (cpus_empty(cp->cpus_allowed) || + if (cpumask_empty(cp->cpus_allowed) || nodes_empty(cp->mems_allowed)) remove_tasks_in_empty_cpuset(cp); else { @@ -1984,7 +2046,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, unsigned long phase, void *unused_cpu) { struct sched_domain_attr *attr; - cpumask_t *doms; + struct cpumask *doms; int ndoms; switch (phase) { @@ -1999,7 +2061,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, } cgroup_lock(); - top_cpuset.cpus_allowed = cpu_online_map; + cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask); scan_for_empty_cpusets(&top_cpuset); ndoms = generate_sched_domains(&doms, &attr); cgroup_unlock(); @@ -2044,7 +2106,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self, void __init cpuset_init_smp(void) { - top_cpuset.cpus_allowed = cpu_online_map; + cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask); top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; hotcpu_notifier(cpuset_track_online_cpus, 0); @@ -2054,15 +2116,15 @@ void __init cpuset_init_smp(void) /** * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. - * @pmask: pointer to cpumask_t variable to receive cpus_allowed set. + * @pmask: pointer to struct cpumask variable to receive cpus_allowed set. * - * Description: Returns the cpumask_t cpus_allowed of the cpuset + * Description: Returns the cpumask_var_t cpus_allowed of the cpuset * attached to the specified @tsk. Guaranteed to return some non-empty * subset of cpu_online_map, even if this means going outside the * tasks cpuset. **/ -void cpuset_cpus_allowed(struct task_struct *tsk, cpumask_t *pmask) +void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) { mutex_lock(&callback_mutex); cpuset_cpus_allowed_locked(tsk, pmask); @@ -2073,7 +2135,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, cpumask_t *pmask) * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset. * Must be called with callback_mutex held. 
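cpuset_track_online_cpus() above keeps the cached top-level mask in step with hotplug by re-copying cpu_online_mask after each event. The callback shape, stripped of the locking and sched-domain rebuild (illustrative only, not the full set of hotplug cases):

static int example_cpu_callback(struct notifier_block *nb,
                                unsigned long action, void *hcpu)
{
        switch (action) {
        case CPU_ONLINE:
        case CPU_DEAD:
                cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
                break;
        default:
                break;
        }
        return NOTIFY_OK;
}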
**/ -void cpuset_cpus_allowed_locked(struct task_struct *tsk, cpumask_t *pmask) +void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask) { task_lock(tsk); guarantee_online_cpus(task_cs(tsk), pmask); @@ -2356,6 +2418,29 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed); } +/** + * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed + * @task: pointer to task_struct of some task. + * + * Description: Prints @task's name, cpuset name, and cached copy of its + * mems_allowed to the kernel log. Must hold task_lock(task) to allow + * dereferencing task_cs(task). + */ +void cpuset_print_task_mems_allowed(struct task_struct *tsk) +{ + struct dentry *dentry; + + dentry = task_cs(tsk)->css.cgroup->dentry; + spin_lock(&cpuset_buffer_lock); + snprintf(cpuset_name, CPUSET_NAME_LEN, + dentry ? (const char *)dentry->d_name.name : "/"); + nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, + tsk->mems_allowed); + printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", + tsk->comm, cpuset_name, cpuset_nodelist); + spin_unlock(&cpuset_buffer_lock); +} + /* * Collection of memory_pressure is suppressed unless * this flag is enabled by writing "1" to the special diff --git a/kernel/cred.c b/kernel/cred.c index ff7bc07..3a03918 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -372,7 +372,8 @@ int commit_creds(struct cred *new) old->fsuid != new->fsuid || old->fsgid != new->fsgid || !cap_issubset(new->cap_permitted, old->cap_permitted)) { - set_dumpable(task->mm, suid_dumpable); + if (task->mm) + set_dumpable(task->mm, suid_dumpable); task->pdeath_signal = 0; smp_wmb(); } @@ -506,6 +507,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) else old = get_cred(&init_cred); + *new = *old; get_uid(new->user); get_group_info(new->group_info); @@ -529,6 +531,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) error: put_cred(new); + put_cred(old); return NULL; } EXPORT_SYMBOL(prepare_kernel_cred); diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c index f013a0c..0387074 100644 --- a/kernel/dma-coherent.c +++ b/kernel/dma-coherent.c @@ -109,20 +109,40 @@ EXPORT_SYMBOL(dma_mark_declared_memory_occupied); int dma_alloc_from_coherent(struct device *dev, ssize_t size, dma_addr_t *dma_handle, void **ret) { - struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; + struct dma_coherent_mem *mem; int order = get_order(size); + int pageno; - if (mem) { - int page = bitmap_find_free_region(mem->bitmap, mem->size, - order); - if (page >= 0) { - *dma_handle = mem->device_base + (page << PAGE_SHIFT); - *ret = mem->virt_base + (page << PAGE_SHIFT); - memset(*ret, 0, size); - } else if (mem->flags & DMA_MEMORY_EXCLUSIVE) - *ret = NULL; + if (!dev) + return 0; + mem = dev->dma_mem; + if (!mem) + return 0; + if (unlikely(size > mem->size)) + return 0; + + pageno = bitmap_find_free_region(mem->bitmap, mem->size, order); + if (pageno >= 0) { + /* + * Memory was found in the per-device arena. + */ + *dma_handle = mem->device_base + (pageno << PAGE_SHIFT); + *ret = mem->virt_base + (pageno << PAGE_SHIFT); + memset(*ret, 0, size); + } else if (mem->flags & DMA_MEMORY_EXCLUSIVE) { + /* + * The per-device arena is exhausted and we are not + * permitted to fall back to generic memory. + */ + *ret = NULL; + } else { + /* + * The per-device arena is exhausted and we are + * permitted to fall back to generic memory. 
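The rewritten dma_alloc_from_coherent() keeps the same contract: a non-zero return means the per-device pool handled the request (possibly with *ret == NULL when an exclusive pool is exhausted), while 0 tells the caller to fall back to the generic allocator. How an architecture's dma_alloc_coherent() is expected to consume that return value (sketch; the generic path is stubbed out):

void *example_dma_alloc_coherent(struct device *dev, size_t size,
                                 dma_addr_t *dma_handle, gfp_t gfp)
{
        void *ret;

        if (dma_alloc_from_coherent(dev, size, dma_handle, &ret))
                return ret;             /* handled by the pool; may be NULL */

        /* Fall back to the generic page allocator here. */
        return NULL;
}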
+ */ + return 0; } - return (mem != NULL); + return 1; } EXPORT_SYMBOL(dma_alloc_from_coherent); diff --git a/kernel/exit.c b/kernel/exit.c index e69edc74..2a803c2 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -642,35 +642,31 @@ retry: /* * We found no owner yet mm_users > 1: this implies that we are * most likely racing with swapoff (try_to_unuse()) or /proc or - * ptrace or page migration (get_task_mm()). Mark owner as NULL, - * so that subsystems can understand the callback and take action. + * ptrace or page migration (get_task_mm()). Mark owner as NULL. */ - down_write(&mm->mmap_sem); - cgroup_mm_owner_callbacks(mm->owner, NULL); mm->owner = NULL; - up_write(&mm->mmap_sem); return; assign_new_owner: BUG_ON(c == p); get_task_struct(c); - read_unlock(&tasklist_lock); - down_write(&mm->mmap_sem); /* * The task_lock protects c->mm from changing. * We always want mm->owner->mm == mm */ task_lock(c); + /* + * Delay read_unlock() till we have the task_lock() + * to ensure that c does not slip away underneath us + */ + read_unlock(&tasklist_lock); if (c->mm != mm) { task_unlock(c); - up_write(&mm->mmap_sem); put_task_struct(c); goto retry; } - cgroup_mm_owner_callbacks(mm->owner, c); mm->owner = c; task_unlock(c); - up_write(&mm->mmap_sem); put_task_struct(c); } #endif /* CONFIG_MM_OWNER */ @@ -1052,10 +1048,7 @@ NORET_TYPE void do_exit(long code) preempt_count()); acct_update_integrals(tsk); - if (tsk->mm) { - update_hiwater_rss(tsk->mm); - update_hiwater_vm(tsk->mm); - } + group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); diff --git a/kernel/fork.c b/kernel/fork.c index 913284e..4a9b318 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -405,6 +405,18 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); #define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) #define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) +static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT; + +static int __init coredump_filter_setup(char *s) +{ + default_dump_filter = + (simple_strtoul(s, NULL, 0) << MMF_DUMP_FILTER_SHIFT) & + MMF_DUMP_FILTER_MASK; + return 1; +} + +__setup("coredump_filter=", coredump_filter_setup); + #include <linux/init_task.h> static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) @@ -413,8 +425,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) atomic_set(&mm->mm_count, 1); init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); - mm->flags = (current->mm) ? current->mm->flags - : MMF_DUMP_FILTER_DEFAULT; + mm->flags = (current->mm) ? 
current->mm->flags : default_dump_filter; mm->core_state = NULL; mm->nr_ptes = 0; set_mm_counter(mm, file_rss, 0); @@ -763,7 +774,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) { struct sighand_struct *sig; - if (clone_flags & (CLONE_SIGHAND | CLONE_THREAD)) { + if (clone_flags & CLONE_SIGHAND) { atomic_inc(¤t->sighand->count); return 0; } @@ -1120,12 +1131,12 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (pid != &init_struct_pid) { retval = -ENOMEM; - pid = alloc_pid(task_active_pid_ns(p)); + pid = alloc_pid(p->nsproxy->pid_ns); if (!pid) goto bad_fork_cleanup_io; if (clone_flags & CLONE_NEWPID) { - retval = pid_ns_prepare_proc(task_active_pid_ns(p)); + retval = pid_ns_prepare_proc(p->nsproxy->pid_ns); if (retval < 0) goto bad_fork_free_pid; } @@ -1475,12 +1486,10 @@ void __init proc_caches_init(void) fs_cachep = kmem_cache_create("fs_cache", sizeof(struct fs_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); - vm_area_cachep = kmem_cache_create("vm_area_struct", - sizeof(struct vm_area_struct), 0, - SLAB_PANIC, NULL); mm_cachep = kmem_cache_create("mm_struct", sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + mmap_init(); } /* diff --git a/kernel/futex.c b/kernel/futex.c index 7c6cbab..002aa18 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -170,8 +170,11 @@ static void get_futex_key_refs(union futex_key *key) */ static void drop_futex_key_refs(union futex_key *key) { - if (!key->both.ptr) + if (!key->both.ptr) { + /* If we're here then we tried to put a key we failed to get */ + WARN_ON_ONCE(1); return; + } switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: @@ -730,8 +733,8 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) } spin_unlock(&hb->lock); -out: put_futex_key(fshared, &key); +out: return ret; } @@ -755,7 +758,7 @@ retryfull: goto out; ret = get_futex_key(uaddr2, fshared, &key2); if (unlikely(ret != 0)) - goto out; + goto out_put_key1; hb1 = hash_futex(&key1); hb2 = hash_futex(&key2); @@ -777,12 +780,12 @@ retry: * but we might get them from range checking */ ret = op_ret; - goto out; + goto out_put_keys; #endif if (unlikely(op_ret != -EFAULT)) { ret = op_ret; - goto out; + goto out_put_keys; } /* @@ -796,7 +799,7 @@ retry: ret = futex_handle_fault((unsigned long)uaddr2, attempt); if (ret) - goto out; + goto out_put_keys; goto retry; } @@ -834,10 +837,11 @@ retry: spin_unlock(&hb1->lock); if (hb1 != hb2) spin_unlock(&hb2->lock); -out: +out_put_keys: put_futex_key(fshared, &key2); +out_put_key1: put_futex_key(fshared, &key1); - +out: return ret; } @@ -854,13 +858,13 @@ static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, struct futex_q *this, *next; int ret, drop_count = 0; - retry: +retry: ret = get_futex_key(uaddr1, fshared, &key1); if (unlikely(ret != 0)) goto out; ret = get_futex_key(uaddr2, fshared, &key2); if (unlikely(ret != 0)) - goto out; + goto out_put_key1; hb1 = hash_futex(&key1); hb2 = hash_futex(&key2); @@ -882,7 +886,7 @@ static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, if (!ret) goto retry; - return ret; + goto out_put_keys; } if (curval != *cmpval) { ret = -EAGAIN; @@ -927,9 +931,11 @@ out_unlock: while (--drop_count >= 0) drop_futex_key_refs(&key1); -out: +out_put_keys: put_futex_key(fshared, &key2); +out_put_key1: put_futex_key(fshared, &key1); +out: return ret; } @@ -990,7 +996,7 @@ static int unqueue_me(struct futex_q *q) int ret = 0; /* In 
the common case we don't take the spinlock, which is nice. */ - retry: +retry: lock_ptr = q->lock_ptr; barrier(); if (lock_ptr != NULL) { @@ -1172,11 +1178,11 @@ static int futex_wait(u32 __user *uaddr, int fshared, q.pi_state = NULL; q.bitset = bitset; - retry: +retry: q.key = FUTEX_KEY_INIT; ret = get_futex_key(uaddr, fshared, &q.key); if (unlikely(ret != 0)) - goto out_release_sem; + goto out; hb = queue_lock(&q); @@ -1204,6 +1210,7 @@ static int futex_wait(u32 __user *uaddr, int fshared, if (unlikely(ret)) { queue_unlock(&q, hb); + put_futex_key(fshared, &q.key); ret = get_user(uval, uaddr); @@ -1213,7 +1220,7 @@ static int futex_wait(u32 __user *uaddr, int fshared, } ret = -EWOULDBLOCK; if (uval != val) - goto out_unlock_release_sem; + goto out_unlock_put_key; /* Only actually queue if *uaddr contained val. */ queue_me(&q, hb); @@ -1305,11 +1312,11 @@ static int futex_wait(u32 __user *uaddr, int fshared, return -ERESTART_RESTARTBLOCK; } - out_unlock_release_sem: +out_unlock_put_key: queue_unlock(&q, hb); - - out_release_sem: put_futex_key(fshared, &q.key); + +out: return ret; } @@ -1358,16 +1365,16 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, } q.pi_state = NULL; - retry: +retry: q.key = FUTEX_KEY_INIT; ret = get_futex_key(uaddr, fshared, &q.key); if (unlikely(ret != 0)) - goto out_release_sem; + goto out; - retry_unlocked: +retry_unlocked: hb = queue_lock(&q); - retry_locked: +retry_locked: ret = lock_taken = 0; /* @@ -1388,14 +1395,14 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, */ if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) { ret = -EDEADLK; - goto out_unlock_release_sem; + goto out_unlock_put_key; } /* * Surprise - we got the lock. Just return to userspace: */ if (unlikely(!curval)) - goto out_unlock_release_sem; + goto out_unlock_put_key; uval = curval; @@ -1431,7 +1438,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, * We took the lock due to owner died take over. */ if (unlikely(lock_taken)) - goto out_unlock_release_sem; + goto out_unlock_put_key; /* * We dont have the lock. Look up the PI state (or create it if @@ -1470,7 +1477,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, goto retry_locked; } default: - goto out_unlock_release_sem; + goto out_unlock_put_key; } } @@ -1561,16 +1568,17 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, destroy_hrtimer_on_stack(&to->timer); return ret != -EINTR ? ret : -ERESTARTNOINTR; - out_unlock_release_sem: +out_unlock_put_key: queue_unlock(&q, hb); - out_release_sem: +out_put_key: put_futex_key(fshared, &q.key); +out: if (to) destroy_hrtimer_on_stack(&to->timer); return ret; - uaddr_faulted: +uaddr_faulted: /* * We have to r/w *(int __user *)uaddr, and we have to modify it * atomically. 
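/*
 * Illustrative sketch, not part of the patch, written as if it lived
 * inside kernel/futex.c (get_futex_key()/put_futex_key() are static
 * there).  It condenses the error-unwind shape the relabelled exits in
 * these hunks establish: each out_* label drops exactly the references
 * taken before the failure point, so no path can put a key it never
 * got.  example_with_two_keys() is a hypothetical stand-in.
 */
static int example_with_two_keys(u32 __user *uaddr1, u32 __user *uaddr2,
                                 int fshared)
{
        union futex_key key1 = FUTEX_KEY_INIT;
        union futex_key key2 = FUTEX_KEY_INIT;
        int ret;

        ret = get_futex_key(uaddr1, fshared, &key1);
        if (unlikely(ret != 0))
                goto out;
        ret = get_futex_key(uaddr2, fshared, &key2);
        if (unlikely(ret != 0))
                goto out_put_key1;

        /* ... operate on the hash buckets for key1/key2 ... */

        put_futex_key(fshared, &key2);
out_put_key1:
        put_futex_key(fshared, &key1);
out:
        return ret;
}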
Therefore, if we continue to fault after get_user() @@ -1583,7 +1591,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, if (attempt++) { ret = futex_handle_fault((unsigned long)uaddr, attempt); if (ret) - goto out_release_sem; + goto out_put_key; goto retry_unlocked; } @@ -1675,9 +1683,9 @@ retry_unlocked: out_unlock: spin_unlock(&hb->lock); -out: put_futex_key(fshared, &key); +out: return ret; pi_faulted: diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index bda9cb9..1455b76 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -32,7 +32,6 @@ */ #include <linux/cpu.h> -#include <linux/irq.h> #include <linux/module.h> #include <linux/percpu.h> #include <linux/hrtimer.h> @@ -635,7 +634,6 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } -static void __run_hrtimer(struct hrtimer *timer); /* * When High resolution timers are active, try to reprogram. Note, that in case @@ -647,13 +645,9 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, struct hrtimer_clock_base *base) { if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { - /* - * XXX: recursion check? - * hrtimer_forward() should round up with timer granularity - * so that we never get into inf recursion here, - * it doesn't do that though - */ - __run_hrtimer(timer); + spin_unlock(&base->cpu_base->lock); + raise_softirq_irqoff(HRTIMER_SOFTIRQ); + spin_lock(&base->cpu_base->lock); return 1; } return 0; @@ -706,11 +700,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, } static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } -static inline int hrtimer_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) -{ - return 0; -} #endif /* CONFIG_HIGH_RES_TIMERS */ @@ -781,9 +770,11 @@ EXPORT_SYMBOL_GPL(hrtimer_forward); * * The timer is inserted in expiry order. Insertion into the * red black tree is O(log(n)). Must hold the base lock. + * + * Returns 1 when the new timer is the leftmost timer in the tree. */ -static void enqueue_hrtimer(struct hrtimer *timer, - struct hrtimer_clock_base *base, int reprogram) +static int enqueue_hrtimer(struct hrtimer *timer, + struct hrtimer_clock_base *base) { struct rb_node **link = &base->active.rb_node; struct rb_node *parent = NULL; @@ -815,20 +806,8 @@ static void enqueue_hrtimer(struct hrtimer *timer, * Insert the timer to the rbtree and check whether it * replaces the first pending timer */ - if (leftmost) { - /* - * Reprogram the clock event device. When the timer is already - * expired hrtimer_enqueue_reprogram has either called the - * callback or added it to the pending list and raised the - * softirq. - * - * This is a NOP for !HIGHRES - */ - if (reprogram && hrtimer_enqueue_reprogram(timer, base)) - return; - + if (leftmost) base->first = &timer->node; - } rb_link_node(&timer->node, parent, link); rb_insert_color(&timer->node, &base->active); @@ -837,6 +816,8 @@ static void enqueue_hrtimer(struct hrtimer *timer, * state of a possibly running callback. 
*/ timer->state |= HRTIMER_STATE_ENQUEUED; + + return leftmost; } /* @@ -913,7 +894,7 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n { struct hrtimer_clock_base *base, *new_base; unsigned long flags; - int ret; + int ret, leftmost; base = lock_hrtimer_base(timer, &flags); @@ -941,12 +922,16 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n timer_stats_hrtimer_set_start_info(timer); + leftmost = enqueue_hrtimer(timer, new_base); + /* * Only allow reprogramming if the new base is on this CPU. * (it might still be on another CPU if the timer was pending) + * + * XXX send_remote_softirq() ? */ - enqueue_hrtimer(timer, new_base, - new_base->cpu_base == &__get_cpu_var(hrtimer_bases)); + if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)) + hrtimer_enqueue_reprogram(timer, new_base); unlock_hrtimer_base(timer, &flags); @@ -1158,13 +1143,13 @@ static void __run_hrtimer(struct hrtimer *timer) spin_lock(&cpu_base->lock); /* - * Note: We clear the CALLBACK bit after enqueue_hrtimer to avoid - * reprogramming of the event hardware. This happens at the end of this - * function anyway. + * Note: We clear the CALLBACK bit after enqueue_hrtimer and + * we do not reprogramm the event hardware. Happens either in + * hrtimer_start_range_ns() or in hrtimer_interrupt() */ if (restart != HRTIMER_NORESTART) { BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); - enqueue_hrtimer(timer, base, 0); + enqueue_hrtimer(timer, base); } timer->state &= ~HRTIMER_STATE_CALLBACK; } @@ -1244,6 +1229,22 @@ void hrtimer_interrupt(struct clock_event_device *dev) } } +/* + * local version of hrtimer_peek_ahead_timers() called with interrupts + * disabled. + */ +static void __hrtimer_peek_ahead_timers(void) +{ + struct tick_device *td; + + if (!hrtimer_hres_active()) + return; + + td = &__get_cpu_var(tick_cpu_device); + if (td && td->evtdev) + hrtimer_interrupt(td->evtdev); +} + /** * hrtimer_peek_ahead_timers -- run soft-expired timers now * @@ -1255,20 +1256,23 @@ void hrtimer_interrupt(struct clock_event_device *dev) */ void hrtimer_peek_ahead_timers(void) { - struct tick_device *td; unsigned long flags; - if (!hrtimer_hres_active()) - return; - local_irq_save(flags); - td = &__get_cpu_var(tick_cpu_device); - if (td && td->evtdev) - hrtimer_interrupt(td->evtdev); + __hrtimer_peek_ahead_timers(); local_irq_restore(flags); } -#endif /* CONFIG_HIGH_RES_TIMERS */ +static void run_hrtimer_softirq(struct softirq_action *h) +{ + hrtimer_peek_ahead_timers(); +} + +#else /* CONFIG_HIGH_RES_TIMERS */ + +static inline void __hrtimer_peek_ahead_timers(void) { } + +#endif /* !CONFIG_HIGH_RES_TIMERS */ /* * Called from timer softirq every jiffy, expire hrtimers: @@ -1514,39 +1518,36 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0); timer->base = new_base; /* - * Enqueue the timers on the new cpu, but do not reprogram - * the timer as that would enable a deadlock between - * hrtimer_enqueue_reprogramm() running the timer and us still - * holding a nested base lock. - * - * Instead we tickle the hrtimer interrupt after the migration - * is done, which will run all expired timers and re-programm - * the timer device. + * Enqueue the timers on the new cpu. 
This does not + * reprogram the event device in case the timer + * expires before the earliest on this CPU, but we run + * hrtimer_interrupt after we migrated everything to + * sort out already expired timers and reprogram the + * event device. */ - enqueue_hrtimer(timer, new_base, 0); + enqueue_hrtimer(timer, new_base); /* Clear the migration state bit */ timer->state &= ~HRTIMER_STATE_MIGRATE; } } -static int migrate_hrtimers(int scpu) +static void migrate_hrtimers(int scpu) { struct hrtimer_cpu_base *old_base, *new_base; - int dcpu, i; + int i; BUG_ON(cpu_online(scpu)); - old_base = &per_cpu(hrtimer_bases, scpu); - new_base = &get_cpu_var(hrtimer_bases); - - dcpu = smp_processor_id(); - tick_cancel_sched_timer(scpu); + + local_irq_disable(); + old_base = &per_cpu(hrtimer_bases, scpu); + new_base = &__get_cpu_var(hrtimer_bases); /* * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. */ - spin_lock_irq(&new_base->lock); + spin_lock(&new_base->lock); spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { @@ -1555,15 +1556,11 @@ static int migrate_hrtimers(int scpu) } spin_unlock(&old_base->lock); - spin_unlock_irq(&new_base->lock); - put_cpu_var(hrtimer_bases); + spin_unlock(&new_base->lock); - return dcpu; -} - -static void tickle_timers(void *arg) -{ - hrtimer_peek_ahead_timers(); + /* Check, if we got expired work to do */ + __hrtimer_peek_ahead_timers(); + local_irq_enable(); } #endif /* CONFIG_HOTPLUG_CPU */ @@ -1584,11 +1581,8 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, case CPU_DEAD: case CPU_DEAD_FROZEN: { - int dcpu; - clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu); - dcpu = migrate_hrtimers(scpu); - smp_call_function_single(dcpu, tickle_timers, NULL, 0); + migrate_hrtimers(scpu); break; } #endif @@ -1609,6 +1603,9 @@ void __init hrtimers_init(void) hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); register_cpu_notifier(&hrtimers_nb); +#ifdef CONFIG_HIGH_RES_TIMERS + open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq); +#endif } /** diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 650ce41..1de9700 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -10,6 +10,7 @@ #include <linux/module.h> #include <linux/interrupt.h> #include <linux/delay.h> +#include <linux/async.h> #include "internals.h" @@ -34,15 +35,16 @@ unsigned long probe_irq_on(void) unsigned int status; int i; + /* + * quiesce the kernel, or at least the asynchronous portion + */ + async_synchronize_full(); mutex_lock(&probing_active); /* * something may have generated an irq long ago and we want to * flush such a longstanding irq before considering it as spurious. 
*/ for_each_irq_desc_reverse(i, desc) { - if (!desc) - continue; - spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { /* @@ -71,9 +73,6 @@ unsigned long probe_irq_on(void) * happened in the previous stage, it may have masked itself) */ for_each_irq_desc_reverse(i, desc) { - if (!desc) - continue; - spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { desc->status |= IRQ_AUTODETECT | IRQ_WAITING; @@ -92,9 +91,6 @@ unsigned long probe_irq_on(void) * Now filter out any obviously spurious interrupts */ for_each_irq_desc(i, desc) { - if (!desc) - continue; - spin_lock_irq(&desc->lock); status = desc->status; @@ -133,9 +129,6 @@ unsigned int probe_irq_mask(unsigned long val) int i; for_each_irq_desc(i, desc) { - if (!desc) - continue; - spin_lock_irq(&desc->lock); status = desc->status; @@ -178,9 +171,6 @@ int probe_irq_off(unsigned long val) unsigned int status; for_each_irq_desc(i, desc) { - if (!desc) - continue; - spin_lock_irq(&desc->lock); status = desc->status; diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 6eb3c79..c248eba 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -46,7 +46,10 @@ void dynamic_irq_init(unsigned int irq) desc->irq_count = 0; desc->irqs_unhandled = 0; #ifdef CONFIG_SMP - cpus_setall(desc->affinity); + cpumask_setall(desc->affinity); +#ifdef CONFIG_GENERIC_PENDING_IRQ + cpumask_clear(desc->pending_mask); +#endif #endif spin_unlock_irqrestore(&desc->lock, flags); } diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 6492400..375d68c 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -17,6 +17,7 @@ #include <linux/kernel_stat.h> #include <linux/rculist.h> #include <linux/hash.h> +#include <linux/bootmem.h> #include "internals.h" @@ -56,11 +57,8 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc) int nr_irqs = NR_IRQS; EXPORT_SYMBOL_GPL(nr_irqs); -void __init __attribute__((weak)) arch_early_irq_init(void) -{ -} - #ifdef CONFIG_SPARSE_IRQ + static struct irq_desc irq_desc_init = { .irq = -1, .status = IRQ_DISABLED, @@ -68,9 +66,6 @@ static struct irq_desc irq_desc_init = { .handle_irq = handle_bad_irq, .depth = 1, .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), -#ifdef CONFIG_SMP - .affinity = CPU_MASK_ALL -#endif }; void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr) @@ -90,13 +85,11 @@ void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr) desc->kstat_irqs = (unsigned int *)ptr; } -void __attribute__((weak)) arch_init_chip_data(struct irq_desc *desc, int cpu) -{ -} - static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) { memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); + + spin_lock_init(&desc->lock); desc->irq = irq; #ifdef CONFIG_SMP desc->cpu = cpu; @@ -107,6 +100,10 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) printk(KERN_ERR "can not alloc kstat_irqs\n"); BUG_ON(1); } + if (!init_alloc_desc_masks(desc, cpu, false)) { + printk(KERN_ERR "can not alloc irq_desc cpumasks\n"); + BUG_ON(1); + } arch_init_chip_data(desc, cpu); } @@ -115,7 +112,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) */ DEFINE_SPINLOCK(sparse_irq_lock); -struct irq_desc *irq_desc_ptrs[NR_IRQS] __read_mostly; +struct irq_desc **irq_desc_ptrs __read_mostly; static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { [0 ... 
NR_IRQS_LEGACY-1] = { @@ -125,40 +122,52 @@ static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_sm .handle_irq = handle_bad_irq, .depth = 1, .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), -#ifdef CONFIG_SMP - .affinity = CPU_MASK_ALL -#endif } }; -/* FIXME: use bootmem alloc ...*/ -static unsigned int kstat_irqs_legacy[NR_IRQS_LEGACY][NR_CPUS]; +static unsigned int *kstat_irqs_legacy; -void __init early_irq_init(void) +int __init early_irq_init(void) { struct irq_desc *desc; int legacy_count; int i; + /* initialize nr_irqs based on nr_cpu_ids */ + arch_probe_nr_irqs(); + printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs); + desc = irq_desc_legacy; legacy_count = ARRAY_SIZE(irq_desc_legacy); + /* allocate irq_desc_ptrs array based on nr_irqs */ + irq_desc_ptrs = alloc_bootmem(nr_irqs * sizeof(void *)); + + /* allocate based on nr_cpu_ids */ + /* FIXME: invert kstat_irgs, and it'd be a per_cpu_alloc'd thing */ + kstat_irqs_legacy = alloc_bootmem(NR_IRQS_LEGACY * nr_cpu_ids * + sizeof(int)); + for (i = 0; i < legacy_count; i++) { desc[i].irq = i; - desc[i].kstat_irqs = kstat_irqs_legacy[i]; - + desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids; + lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); + init_alloc_desc_masks(&desc[i], 0, true); irq_desc_ptrs[i] = desc + i; } - for (i = legacy_count; i < NR_IRQS; i++) + for (i = legacy_count; i < nr_irqs; i++) irq_desc_ptrs[i] = NULL; - arch_early_irq_init(); + return arch_early_irq_init(); } struct irq_desc *irq_to_desc(unsigned int irq) { - return (irq < NR_IRQS) ? irq_desc_ptrs[irq] : NULL; + if (irq_desc_ptrs && irq < nr_irqs) + return irq_desc_ptrs[irq]; + + return NULL; } struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) @@ -167,10 +176,9 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) unsigned long flags; int node; - if (irq >= NR_IRQS) { - printk(KERN_WARNING "irq >= NR_IRQS in irq_to_desc_alloc: %d %d\n", - irq, NR_IRQS); - WARN_ON(1); + if (irq >= nr_irqs) { + WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n", + irq, nr_irqs); return NULL; } @@ -203,7 +211,7 @@ out_unlock: return desc; } -#else +#else /* !CONFIG_SPARSE_IRQ */ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { [0 ... NR_IRQS-1] = { @@ -212,13 +220,37 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { .handle_irq = handle_bad_irq, .depth = 1, .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock), -#ifdef CONFIG_SMP - .affinity = CPU_MASK_ALL -#endif } }; -#endif +int __init early_irq_init(void) +{ + struct irq_desc *desc; + int count; + int i; + + printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS); + + desc = irq_desc; + count = ARRAY_SIZE(irq_desc); + + for (i = 0; i < count; i++) { + desc[i].irq = i; + init_alloc_desc_masks(&desc[i], 0, true); + } + return arch_early_irq_init(); +} + +struct irq_desc *irq_to_desc(unsigned int irq) +{ + return (irq < NR_IRQS) ? irq_desc + irq : NULL; +} + +struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) +{ + return irq_to_desc(irq); +} +#endif /* !CONFIG_SPARSE_IRQ */ /* * What should we do if we get a hw irq event on an illegal vector? @@ -428,9 +460,6 @@ void early_init_irq_lock_class(void) int i; for_each_irq_desc(i, desc) { - if (!desc) - continue; - lockdep_set_class(&desc->lock, &irq_desc_lock_class); } } @@ -439,7 +468,7 @@ void early_init_irq_lock_class(void) unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) { struct irq_desc *desc = irq_to_desc(irq); - return desc->kstat_irqs[cpu]; + return desc ? 
desc->kstat_irqs[cpu] : 0; } #endif EXPORT_SYMBOL(kstat_irqs_cpu); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index e6d0a43..40416a8 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -16,7 +16,14 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, extern struct lock_class_key irq_desc_lock_class; extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr); extern spinlock_t sparse_irq_lock; + +#ifdef CONFIG_SPARSE_IRQ +/* irq_desc_ptrs allocated at boot time */ +extern struct irq_desc **irq_desc_ptrs; +#else +/* irq_desc_ptrs is a fixed size array */ extern struct irq_desc *irq_desc_ptrs[NR_IRQS]; +#endif #ifdef CONFIG_PROC_FS extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 540f6c4..b98739a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -16,8 +16,15 @@ #include "internals.h" #ifdef CONFIG_SMP +cpumask_var_t irq_default_affinity; -cpumask_t irq_default_affinity = CPU_MASK_ALL; +static int init_irq_default_affinity(void) +{ + alloc_cpumask_var(&irq_default_affinity, GFP_KERNEL); + cpumask_setall(irq_default_affinity); + return 0; +} +core_initcall(init_irq_default_affinity); /** * synchronize_irq - wait for pending IRQ handlers (on other CPUs) @@ -79,7 +86,7 @@ int irq_can_set_affinity(unsigned int irq) * @cpumask: cpumask * */ -int irq_set_affinity(unsigned int irq, cpumask_t cpumask) +int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; @@ -91,14 +98,14 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask) #ifdef CONFIG_GENERIC_PENDING_IRQ if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) { - desc->affinity = cpumask; + cpumask_copy(desc->affinity, cpumask); desc->chip->set_affinity(irq, cpumask); } else { desc->status |= IRQ_MOVE_PENDING; - desc->pending_mask = cpumask; + cpumask_copy(desc->pending_mask, cpumask); } #else - desc->affinity = cpumask; + cpumask_copy(desc->affinity, cpumask); desc->chip->set_affinity(irq, cpumask); #endif desc->status |= IRQ_AFFINITY_SET; @@ -112,26 +119,24 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask) */ int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc) { - cpumask_t mask; - if (!irq_can_set_affinity(irq)) return 0; - cpus_and(mask, cpu_online_map, irq_default_affinity); - /* * Preserve an userspace affinity setup, but make sure that * one of the targets is online. 
*/ if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { - if (cpus_intersects(desc->affinity, cpu_online_map)) - mask = desc->affinity; + if (cpumask_any_and(desc->affinity, cpu_online_mask) + < nr_cpu_ids) + goto set_affinity; else desc->status &= ~IRQ_AFFINITY_SET; } - desc->affinity = mask; - desc->chip->set_affinity(irq, mask); + cpumask_and(desc->affinity, cpu_online_mask, irq_default_affinity); +set_affinity: + desc->chip->set_affinity(irq, desc->affinity); return 0; } diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 9db681d..e05ad9b 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -4,7 +4,6 @@ void move_masked_irq(int irq) { struct irq_desc *desc = irq_to_desc(irq); - cpumask_t tmp; if (likely(!(desc->status & IRQ_MOVE_PENDING))) return; @@ -19,7 +18,7 @@ void move_masked_irq(int irq) desc->status &= ~IRQ_MOVE_PENDING; - if (unlikely(cpus_empty(desc->pending_mask))) + if (unlikely(cpumask_empty(desc->pending_mask))) return; if (!desc->chip->set_affinity) @@ -27,8 +26,6 @@ void move_masked_irq(int irq) assert_spin_locked(&desc->lock); - cpus_and(tmp, desc->pending_mask, cpu_online_map); - /* * If there was a valid mask to work with, please * do the disable, re-program, enable sequence. @@ -41,10 +38,13 @@ void move_masked_irq(int irq) * For correct operation this depends on the caller * masking the irqs. */ - if (likely(!cpus_empty(tmp))) { - desc->chip->set_affinity(irq,tmp); + if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) + < nr_cpu_ids)) { + cpumask_and(desc->affinity, + desc->pending_mask, cpu_online_mask); + desc->chip->set_affinity(irq, desc->affinity); } - cpus_clear(desc->pending_mask); + cpumask_clear(desc->pending_mask); } void move_native_irq(int irq) diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index 089c3746..666260e 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -38,14 +38,22 @@ static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc) old_desc->kstat_irqs = NULL; } -static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, +static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, struct irq_desc *desc, int cpu) { memcpy(desc, old_desc, sizeof(struct irq_desc)); + if (!init_alloc_desc_masks(desc, cpu, false)) { + printk(KERN_ERR "irq %d: can not get new irq_desc cpumask " + "for migration.\n", irq); + return false; + } + spin_lock_init(&desc->lock); desc->cpu = cpu; lockdep_set_class(&desc->lock, &irq_desc_lock_class); init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids); + init_copy_desc_masks(old_desc, desc); arch_init_copy_chip_data(old_desc, desc, cpu); + return true; } static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc) @@ -74,15 +82,19 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, node = cpu_to_node(cpu); desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); - printk(KERN_DEBUG " move irq_desc for %d to cpu %d node %d\n", - irq, cpu, node); if (!desc) { - printk(KERN_ERR "can not get new irq_desc for moving\n"); + printk(KERN_ERR "irq %d: can not get new irq_desc " + "for migration.\n", irq); + /* still use old one */ + desc = old_desc; + goto out_unlock; + } + if (!init_copy_one_irq_desc(irq, old_desc, desc, cpu)) { /* still use old one */ + kfree(desc); desc = old_desc; goto out_unlock; } - init_copy_one_irq_desc(irq, old_desc, desc, cpu); irq_desc_ptrs[irq] = desc; @@ -106,8 +118,6 @@ struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu) 
return desc; old_cpu = desc->cpu; - printk(KERN_DEBUG - "try to move irq_desc from cpu %d to %d\n", old_cpu, cpu); if (old_cpu != cpu) { node = cpu_to_node(cpu); old_node = cpu_to_node(old_cpu); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index f6b3440..692363d 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -20,11 +20,11 @@ static struct proc_dir_entry *root_irq_dir; static int irq_affinity_proc_show(struct seq_file *m, void *v) { struct irq_desc *desc = irq_to_desc((long)m->private); - cpumask_t *mask = &desc->affinity; + const struct cpumask *mask = desc->affinity; #ifdef CONFIG_GENERIC_PENDING_IRQ if (desc->status & IRQ_MOVE_PENDING) - mask = &desc->pending_mask; + mask = desc->pending_mask; #endif seq_cpumask(m, mask); seq_putc(m, '\n'); @@ -40,33 +40,42 @@ static ssize_t irq_affinity_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *pos) { unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data; - cpumask_t new_value; + cpumask_var_t new_value; int err; if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity || irq_balancing_disabled(irq)) return -EIO; + if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) + return -ENOMEM; + err = cpumask_parse_user(buffer, count, new_value); if (err) - return err; + goto free_cpumask; - if (!is_affinity_mask_valid(new_value)) - return -EINVAL; + if (!is_affinity_mask_valid(new_value)) { + err = -EINVAL; + goto free_cpumask; + } /* * Do not allow disabling IRQs completely - it's a too easy * way to make the system unusable accidentally :-) At least * one online CPU still has to be targeted. */ - if (!cpus_intersects(new_value, cpu_online_map)) + if (!cpumask_intersects(new_value, cpu_online_mask)) { /* Special case for empty set - allow the architecture code to set default SMP affinity. */ - return irq_select_affinity_usr(irq) ? -EINVAL : count; - - irq_set_affinity(irq, new_value); + err = irq_select_affinity_usr(irq) ? -EINVAL : count; + } else { + irq_set_affinity(irq, new_value); + err = count; + } - return count; +free_cpumask: + free_cpumask_var(new_value); + return err; } static int irq_affinity_proc_open(struct inode *inode, struct file *file) @@ -84,7 +93,7 @@ static const struct file_operations irq_affinity_proc_fops = { static int default_affinity_show(struct seq_file *m, void *v) { - seq_cpumask(m, &irq_default_affinity); + seq_cpumask(m, irq_default_affinity); seq_putc(m, '\n'); return 0; } @@ -92,27 +101,37 @@ static int default_affinity_show(struct seq_file *m, void *v) static ssize_t default_affinity_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { - cpumask_t new_value; + cpumask_var_t new_value; int err; + if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) + return -ENOMEM; + err = cpumask_parse_user(buffer, count, new_value); if (err) - return err; + goto out; - if (!is_affinity_mask_valid(new_value)) - return -EINVAL; + if (!is_affinity_mask_valid(new_value)) { + err = -EINVAL; + goto out; + } /* * Do not allow disabling IRQs completely - it's a too easy * way to make the system unusable accidentally :-) At least * one online CPU still has to be targeted. 
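/*
 * Illustrative sketch, not part of the patch.  The irq affinity proc
 * write handlers being converted here follow the same
 * allocate/parse/validate/free shape, so the (possibly off-stack)
 * cpumask is released on every exit path.  The handler below is a
 * hypothetical condensation of that shape, not the actual irq code.
 */
#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/gfp.h>

static ssize_t example_write_mask(const char __user *buffer, size_t count)
{
        cpumask_var_t new_value;
        int err;

        if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
                return -ENOMEM;

        err = cpumask_parse_user(buffer, count, new_value);
        if (err)
                goto out;

        if (!cpumask_intersects(new_value, cpu_online_mask)) {
                err = -EINVAL;
                goto out;
        }

        /* ... apply new_value ... */
        err = count;
out:
        free_cpumask_var(new_value);
        return err;
}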
*/ - if (!cpus_intersects(new_value, cpu_online_map)) - return -EINVAL; + if (!cpumask_intersects(new_value, cpu_online_mask)) { + err = -EINVAL; + goto out; + } - irq_default_affinity = new_value; + cpumask_copy(irq_default_affinity, new_value); + err = count; - return count; +out: + free_cpumask_var(new_value); + return err; } static int default_affinity_open(struct inode *inode, struct file *file) diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 3738107..dd364c1 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -91,9 +91,6 @@ static int misrouted_irq(int irq) int i, ok = 0; for_each_irq_desc(i, desc) { - if (!desc) - continue; - if (!i) continue; @@ -115,8 +112,6 @@ static void poll_spurious_irqs(unsigned long dummy) for_each_irq_desc(i, desc) { unsigned int status; - if (!desc) - continue; if (!i) continue; diff --git a/kernel/kexec.c b/kernel/kexec.c index ac0fde7..3fb855a 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1116,7 +1116,7 @@ void crash_save_cpu(struct pt_regs *regs, int cpu) struct elf_prstatus prstatus; u32 *buf; - if ((cpu < 0) || (cpu >= NR_CPUS)) + if ((cpu < 0) || (cpu >= nr_cpu_ids)) return; /* Using ELF notes here is opportunistic. diff --git a/kernel/kmod.c b/kernel/kmod.c index b46dbb9..a27a5f6 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -51,8 +51,8 @@ char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; /** * request_module - try to load a kernel module - * @fmt: printf style format string for the name of the module - * @varargs: arguements as specified in the format string + * @fmt: printf style format string for the name of the module + * @...: arguments as specified in the format string * * Load a module using the user mode module loader. The function returns * zero on success or a negative errno code on failure. Note that a diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 9f8a3f2..1b9cbdc 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -69,7 +69,7 @@ static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; /* NOTE: change this value only with kprobe_mutex held */ static bool kprobe_enabled; -DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ +static DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; static struct { spinlock_t lock ____cacheline_aligned_in_smp; @@ -115,6 +115,7 @@ enum kprobe_slot_state { SLOT_USED = 2, }; +static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */ static struct hlist_head kprobe_insn_pages; static int kprobe_garbage_slots; static int collect_garbage_slots(void); @@ -144,10 +145,10 @@ loop_end: } /** - * get_insn_slot() - Find a slot on an executable page for an instruction. + * __get_insn_slot() - Find a slot on an executable page for an instruction. * We allocate an executable page if there's no room on existing ones. */ -kprobe_opcode_t __kprobes *get_insn_slot(void) +static kprobe_opcode_t __kprobes *__get_insn_slot(void) { struct kprobe_insn_page *kip; struct hlist_node *pos; @@ -196,6 +197,15 @@ kprobe_opcode_t __kprobes *get_insn_slot(void) return kip->insns; } +kprobe_opcode_t __kprobes *get_insn_slot(void) +{ + kprobe_opcode_t *ret; + mutex_lock(&kprobe_insn_mutex); + ret = __get_insn_slot(); + mutex_unlock(&kprobe_insn_mutex); + return ret; +} + /* Return 1 if all garbages are collected, otherwise 0. 
*/ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) { @@ -226,9 +236,13 @@ static int __kprobes collect_garbage_slots(void) { struct kprobe_insn_page *kip; struct hlist_node *pos, *next; + int safety; /* Ensure no-one is preepmted on the garbages */ - if (check_safety() != 0) + mutex_unlock(&kprobe_insn_mutex); + safety = check_safety(); + mutex_lock(&kprobe_insn_mutex); + if (safety != 0) return -EAGAIN; hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) { @@ -251,6 +265,7 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) struct kprobe_insn_page *kip; struct hlist_node *pos; + mutex_lock(&kprobe_insn_mutex); hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { if (kip->insns <= slot && slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { @@ -267,6 +282,8 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE) collect_garbage_slots(); + + mutex_unlock(&kprobe_insn_mutex); } #endif @@ -310,7 +327,7 @@ static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) struct kprobe *kp; list_for_each_entry_rcu(kp, &p->list, list) { - if (kp->pre_handler) { + if (kp->pre_handler && !kprobe_gone(kp)) { set_kprobe_instance(kp); if (kp->pre_handler(kp, regs)) return 1; @@ -326,7 +343,7 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, struct kprobe *kp; list_for_each_entry_rcu(kp, &p->list, list) { - if (kp->post_handler) { + if (kp->post_handler && !kprobe_gone(kp)) { set_kprobe_instance(kp); kp->post_handler(kp, regs, flags); reset_kprobe_instance(); @@ -393,7 +410,7 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, hlist_add_head(&ri->hlist, head); } -void kretprobe_hash_lock(struct task_struct *tsk, +void __kprobes kretprobe_hash_lock(struct task_struct *tsk, struct hlist_head **head, unsigned long *flags) { unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); @@ -404,13 +421,15 @@ void kretprobe_hash_lock(struct task_struct *tsk, spin_lock_irqsave(hlist_lock, *flags); } -static void kretprobe_table_lock(unsigned long hash, unsigned long *flags) +static void __kprobes kretprobe_table_lock(unsigned long hash, + unsigned long *flags) { spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); spin_lock_irqsave(hlist_lock, *flags); } -void kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags) +void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, + unsigned long *flags) { unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); spinlock_t *hlist_lock; @@ -419,7 +438,7 @@ void kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags) spin_unlock_irqrestore(hlist_lock, *flags); } -void kretprobe_table_unlock(unsigned long hash, unsigned long *flags) +void __kprobes kretprobe_table_unlock(unsigned long hash, unsigned long *flags) { spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); spin_unlock_irqrestore(hlist_lock, *flags); @@ -526,9 +545,10 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) ap->addr = p->addr; ap->pre_handler = aggr_pre_handler; ap->fault_handler = aggr_fault_handler; - if (p->post_handler) + /* We don't care the kprobe which has gone. 
*/ + if (p->post_handler && !kprobe_gone(p)) ap->post_handler = aggr_post_handler; - if (p->break_handler) + if (p->break_handler && !kprobe_gone(p)) ap->break_handler = aggr_break_handler; INIT_LIST_HEAD(&ap->list); @@ -547,17 +567,41 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, int ret = 0; struct kprobe *ap; + if (kprobe_gone(old_p)) { + /* + * Attempting to insert new probe at the same location that + * had a probe in the module vaddr area which already + * freed. So, the instruction slot has already been + * released. We need a new slot for the new probe. + */ + ret = arch_prepare_kprobe(old_p); + if (ret) + return ret; + } if (old_p->pre_handler == aggr_pre_handler) { copy_kprobe(old_p, p); ret = add_new_kprobe(old_p, p); + ap = old_p; } else { ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL); - if (!ap) + if (!ap) { + if (kprobe_gone(old_p)) + arch_remove_kprobe(old_p); return -ENOMEM; + } add_aggr_kprobe(ap, old_p); copy_kprobe(ap, p); ret = add_new_kprobe(ap, p); } + if (kprobe_gone(old_p)) { + /* + * If the old_p has gone, its breakpoint has been disarmed. + * We have to arm it again after preparing real kprobes. + */ + ap->flags &= ~KPROBE_FLAG_GONE; + if (kprobe_enabled) + arch_arm_kprobe(ap); + } return ret; } @@ -600,8 +644,7 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p) return (kprobe_opcode_t *)(((char *)addr) + p->offset); } -static int __kprobes __register_kprobe(struct kprobe *p, - unsigned long called_from) +int __kprobes register_kprobe(struct kprobe *p) { int ret = 0; struct kprobe *old_p; @@ -620,28 +663,30 @@ static int __kprobes __register_kprobe(struct kprobe *p, return -EINVAL; } - p->mod_refcounted = 0; - + p->flags = 0; /* * Check if are we probing a module. */ probed_mod = __module_text_address((unsigned long) p->addr); if (probed_mod) { - struct module *calling_mod; - calling_mod = __module_text_address(called_from); /* - * We must allow modules to probe themself and in this case - * avoid incrementing the module refcount, so as to allow - * unloading of self probing modules. + * We must hold a refcount of the probed module while updating + * its code to prohibit unexpected unloading. */ - if (calling_mod && calling_mod != probed_mod) { - if (unlikely(!try_module_get(probed_mod))) { - preempt_enable(); - return -EINVAL; - } - p->mod_refcounted = 1; - } else - probed_mod = NULL; + if (unlikely(!try_module_get(probed_mod))) { + preempt_enable(); + return -EINVAL; + } + /* + * If the module freed .init.text, we couldn't insert + * kprobes in there. + */ + if (within_module_init((unsigned long)p->addr, probed_mod) && + probed_mod->state != MODULE_STATE_COMING) { + module_put(probed_mod); + preempt_enable(); + return -EINVAL; + } } preempt_enable(); @@ -668,8 +713,9 @@ static int __kprobes __register_kprobe(struct kprobe *p, out: mutex_unlock(&kprobe_mutex); - if (ret && probed_mod) + if (probed_mod) module_put(probed_mod); + return ret; } @@ -697,16 +743,16 @@ valid_p: list_is_singular(&old_p->list))) { /* * Only probe on the hash list. Disarm only if kprobes are - * enabled - otherwise, the breakpoint would already have - * been removed. We save on flushing icache. + * enabled and not gone - otherwise, the breakpoint would + * already have been removed. We save on flushing icache. 
*/ - if (kprobe_enabled) + if (kprobe_enabled && !kprobe_gone(old_p)) arch_disarm_kprobe(p); hlist_del_rcu(&old_p->hlist); } else { - if (p->break_handler) + if (p->break_handler && !kprobe_gone(p)) old_p->break_handler = NULL; - if (p->post_handler) { + if (p->post_handler && !kprobe_gone(p)) { list_for_each_entry_rcu(list_p, &old_p->list, list) { if ((list_p != p) && (list_p->post_handler)) goto noclean; @@ -721,39 +767,27 @@ noclean: static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) { - struct module *mod; struct kprobe *old_p; - if (p->mod_refcounted) { - /* - * Since we've already incremented refcount, - * we don't need to disable preemption. - */ - mod = module_text_address((unsigned long)p->addr); - if (mod) - module_put(mod); - } - - if (list_empty(&p->list) || list_is_singular(&p->list)) { - if (!list_empty(&p->list)) { - /* "p" is the last child of an aggr_kprobe */ - old_p = list_entry(p->list.next, struct kprobe, list); - list_del(&p->list); - kfree(old_p); - } + if (list_empty(&p->list)) arch_remove_kprobe(p); + else if (list_is_singular(&p->list)) { + /* "p" is the last child of an aggr_kprobe */ + old_p = list_entry(p->list.next, struct kprobe, list); + list_del(&p->list); + arch_remove_kprobe(old_p); + kfree(old_p); } } -static int __register_kprobes(struct kprobe **kps, int num, - unsigned long called_from) +int __kprobes register_kprobes(struct kprobe **kps, int num) { int i, ret = 0; if (num <= 0) return -EINVAL; for (i = 0; i < num; i++) { - ret = __register_kprobe(kps[i], called_from); + ret = register_kprobe(kps[i]); if (ret < 0) { if (i > 0) unregister_kprobes(kps, i); @@ -763,26 +797,11 @@ static int __register_kprobes(struct kprobe **kps, int num, return ret; } -/* - * Registration and unregistration functions for kprobe. 
- */ -int __kprobes register_kprobe(struct kprobe *p) -{ - return __register_kprobes(&p, 1, - (unsigned long)__builtin_return_address(0)); -} - void __kprobes unregister_kprobe(struct kprobe *p) { unregister_kprobes(&p, 1); } -int __kprobes register_kprobes(struct kprobe **kps, int num) -{ - return __register_kprobes(kps, num, - (unsigned long)__builtin_return_address(0)); -} - void __kprobes unregister_kprobes(struct kprobe **kps, int num) { int i; @@ -811,8 +830,7 @@ unsigned long __weak arch_deref_entry_point(void *entry) return (unsigned long)entry; } -static int __register_jprobes(struct jprobe **jps, int num, - unsigned long called_from) +int __kprobes register_jprobes(struct jprobe **jps, int num) { struct jprobe *jp; int ret = 0, i; @@ -830,7 +848,7 @@ static int __register_jprobes(struct jprobe **jps, int num, /* Todo: Verify probepoint is a function entry point */ jp->kp.pre_handler = setjmp_pre_handler; jp->kp.break_handler = longjmp_break_handler; - ret = __register_kprobe(&jp->kp, called_from); + ret = register_kprobe(&jp->kp); } if (ret < 0) { if (i > 0) @@ -843,8 +861,7 @@ static int __register_jprobes(struct jprobe **jps, int num, int __kprobes register_jprobe(struct jprobe *jp) { - return __register_jprobes(&jp, 1, - (unsigned long)__builtin_return_address(0)); + return register_jprobes(&jp, 1); } void __kprobes unregister_jprobe(struct jprobe *jp) @@ -852,12 +869,6 @@ void __kprobes unregister_jprobe(struct jprobe *jp) unregister_jprobes(&jp, 1); } -int __kprobes register_jprobes(struct jprobe **jps, int num) -{ - return __register_jprobes(jps, num, - (unsigned long)__builtin_return_address(0)); -} - void __kprobes unregister_jprobes(struct jprobe **jps, int num) { int i; @@ -920,8 +931,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, return 0; } -static int __kprobes __register_kretprobe(struct kretprobe *rp, - unsigned long called_from) +int __kprobes register_kretprobe(struct kretprobe *rp) { int ret = 0; struct kretprobe_instance *inst; @@ -967,21 +977,20 @@ static int __kprobes __register_kretprobe(struct kretprobe *rp, rp->nmissed = 0; /* Establish function entry probe point */ - ret = __register_kprobe(&rp->kp, called_from); + ret = register_kprobe(&rp->kp); if (ret != 0) free_rp_inst(rp); return ret; } -static int __register_kretprobes(struct kretprobe **rps, int num, - unsigned long called_from) +int __kprobes register_kretprobes(struct kretprobe **rps, int num) { int ret = 0, i; if (num <= 0) return -EINVAL; for (i = 0; i < num; i++) { - ret = __register_kretprobe(rps[i], called_from); + ret = register_kretprobe(rps[i]); if (ret < 0) { if (i > 0) unregister_kretprobes(rps, i); @@ -991,23 +1000,11 @@ static int __register_kretprobes(struct kretprobe **rps, int num, return ret; } -int __kprobes register_kretprobe(struct kretprobe *rp) -{ - return __register_kretprobes(&rp, 1, - (unsigned long)__builtin_return_address(0)); -} - void __kprobes unregister_kretprobe(struct kretprobe *rp) { unregister_kretprobes(&rp, 1); } -int __kprobes register_kretprobes(struct kretprobe **rps, int num) -{ - return __register_kretprobes(rps, num, - (unsigned long)__builtin_return_address(0)); -} - void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) { int i; @@ -1055,6 +1052,72 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, #endif /* CONFIG_KRETPROBES */ +/* Set the kprobe gone and remove its instruction buffer. 
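/*
 * Illustrative sketch, not part of the patch.  The kprobes rework here
 * hangs off a module notifier so probes sitting in a module's freed
 * text can be marked gone; the skeleton below shows that general
 * mechanism.  MODULE_STATE_LIVE is delivered after .init.text has been
 * freed and MODULE_STATE_GOING before the core text goes away, and
 * within_module_init()/within_module_core() classify an address
 * against those regions.  The example_* names are hypothetical.
 */
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/notifier.h>

static int example_module_cb(struct notifier_block *nb,
                             unsigned long val, void *data)
{
        struct module *mod = data;

        if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE)
                return NOTIFY_DONE;

        /*
         * Invalidate anything we cached that points into mod's
         * soon-to-be-freed sections, e.g. addresses matching
         * within_module_init(addr, mod).
         */
        pr_debug("module %s changed state to %lu\n", mod->name, val);
        return NOTIFY_DONE;
}

static struct notifier_block example_module_nb = {
        .notifier_call = example_module_cb,
};

static int __init example_init(void)
{
        return register_module_notifier(&example_module_nb);
}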
*/ +static void __kprobes kill_kprobe(struct kprobe *p) +{ + struct kprobe *kp; + p->flags |= KPROBE_FLAG_GONE; + if (p->pre_handler == aggr_pre_handler) { + /* + * If this is an aggr_kprobe, we have to list all the + * chained probes and mark them GONE. + */ + list_for_each_entry_rcu(kp, &p->list, list) + kp->flags |= KPROBE_FLAG_GONE; + p->post_handler = NULL; + p->break_handler = NULL; + } + /* + * Here, we can remove insn_slot safely, because no thread calls + * the original probed function (which will be freed soon) any more. + */ + arch_remove_kprobe(p); +} + +/* Module notifier call back, checking kprobes on the module */ +static int __kprobes kprobes_module_callback(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct module *mod = data; + struct hlist_head *head; + struct hlist_node *node; + struct kprobe *p; + unsigned int i; + int checkcore = (val == MODULE_STATE_GOING); + + if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE) + return NOTIFY_DONE; + + /* + * When MODULE_STATE_GOING was notified, both of module .text and + * .init.text sections would be freed. When MODULE_STATE_LIVE was + * notified, only .init.text section would be freed. We need to + * disable kprobes which have been inserted in the sections. + */ + mutex_lock(&kprobe_mutex); + for (i = 0; i < KPROBE_TABLE_SIZE; i++) { + head = &kprobe_table[i]; + hlist_for_each_entry_rcu(p, node, head, hlist) + if (within_module_init((unsigned long)p->addr, mod) || + (checkcore && + within_module_core((unsigned long)p->addr, mod))) { + /* + * The vaddr this probe is installed will soon + * be vfreed buy not synced to disk. Hence, + * disarming the breakpoint isn't needed. + */ + kill_kprobe(p); + } + } + mutex_unlock(&kprobe_mutex); + return NOTIFY_DONE; +} + +static struct notifier_block kprobe_module_nb = { + .notifier_call = kprobes_module_callback, + .priority = 0 +}; + static int __init init_kprobes(void) { int i, err = 0; @@ -1111,6 +1174,9 @@ static int __init init_kprobes(void) err = arch_init_kprobes(); if (!err) err = register_die_notifier(&kprobe_exceptions_nb); + if (!err) + err = register_module_notifier(&kprobe_module_nb); + kprobes_initialized = (err == 0); if (!err) @@ -1131,10 +1197,12 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, else kprobe_type = "k"; if (sym) - seq_printf(pi, "%p %s %s+0x%x %s\n", p->addr, kprobe_type, - sym, offset, (modname ? modname : " ")); + seq_printf(pi, "%p %s %s+0x%x %s %s\n", p->addr, kprobe_type, + sym, offset, (modname ? modname : " "), + (kprobe_gone(p) ? "[GONE]" : "")); else - seq_printf(pi, "%p %s %p\n", p->addr, kprobe_type, p->addr); + seq_printf(pi, "%p %s %p %s\n", p->addr, kprobe_type, p->addr, + (kprobe_gone(p) ? 
"[GONE]" : "")); } static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) @@ -1215,7 +1283,8 @@ static void __kprobes enable_all_kprobes(void) for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) - arch_arm_kprobe(p); + if (!kprobe_gone(p)) + arch_arm_kprobe(p); } kprobe_enabled = true; @@ -1244,7 +1313,7 @@ static void __kprobes disable_all_kprobes(void) for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) { - if (!arch_trampoline_kprobe(p)) + if (!arch_trampoline_kprobe(p) && !kprobe_gone(p)) arch_disarm_kprobe(p); } } diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 08dd8ed..528dd78 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -24,7 +24,7 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) static struct kobj_attribute _name##_attr = \ __ATTR(_name, 0644, _name##_show, _name##_store) -#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) +#if defined(CONFIG_HOTPLUG) /* current uevent sequence number */ static ssize_t uevent_seqnum_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -137,7 +137,7 @@ struct kobject *kernel_kobj; EXPORT_SYMBOL_GPL(kernel_kobj); static struct attribute * kernel_attrs[] = { -#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) +#if defined(CONFIG_HOTPLUG) &uevent_seqnum_attr.attr, &uevent_helper_attr.attr, #endif diff --git a/kernel/module.c b/kernel/module.c index dd2a541..c9332c9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -43,7 +43,6 @@ #include <linux/device.h> #include <linux/string.h> #include <linux/mutex.h> -#include <linux/unwind.h> #include <linux/rculist.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> @@ -51,6 +50,7 @@ #include <asm/sections.h> #include <linux/tracepoint.h> #include <linux/ftrace.h> +#include <linux/async.h> #if 0 #define DEBUGP printk @@ -757,8 +757,16 @@ sys_delete_module(const char __user *name_user, unsigned int flags) return -EFAULT; name[MODULE_NAME_LEN-1] = '\0'; - if (mutex_lock_interruptible(&module_mutex) != 0) - return -EINTR; + /* Create stop_machine threads since free_module relies on + * a non-failing stop_machine call. */ + ret = stop_machine_create(); + if (ret) + return ret; + + if (mutex_lock_interruptible(&module_mutex) != 0) { + ret = -EINTR; + goto out_stop; + } mod = find_module(name); if (!mod) { @@ -809,6 +817,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags) mod->exit(); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); + async_synchronize_full(); mutex_lock(&module_mutex); /* Store the name of the last unloaded module for diagnostic purposes */ strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); @@ -817,10 +826,12 @@ sys_delete_module(const char __user *name_user, unsigned int flags) out: mutex_unlock(&module_mutex); +out_stop: + stop_machine_destroy(); return ret; } -static void print_unload_info(struct seq_file *m, struct module *mod) +static inline void print_unload_info(struct seq_file *m, struct module *mod) { struct module_use *use; int printed_something = 0; @@ -893,7 +904,7 @@ void module_put(struct module *module) EXPORT_SYMBOL(module_put); #else /* !CONFIG_MODULE_UNLOAD */ -static void print_unload_info(struct seq_file *m, struct module *mod) +static inline void print_unload_info(struct seq_file *m, struct module *mod) { /* We don't know the usage count, or what modules are using. 
*/ seq_printf(m, " - -"); @@ -1439,8 +1450,6 @@ static void free_module(struct module *mod) remove_sect_attrs(mod); mod_kobject_remove(mod); - unwind_remove_table(mod->unwind_info, 0); - /* Arch-specific cleanup. */ module_arch_cleanup(mod); @@ -1578,11 +1587,21 @@ static int simplify_symbols(Elf_Shdr *sechdrs, return ret; } +/* Additional bytes needed by arch in front of individual sections */ +unsigned int __weak arch_mod_section_prepend(struct module *mod, + unsigned int section) +{ + /* default implementation just returns zero */ + return 0; +} + /* Update size with this section: return offset. */ -static long get_offset(unsigned int *size, Elf_Shdr *sechdr) +static long get_offset(struct module *mod, unsigned int *size, + Elf_Shdr *sechdr, unsigned int section) { long ret; + *size += arch_mod_section_prepend(mod, section); ret = ALIGN(*size, sechdr->sh_addralign ?: 1); *size = ret + sechdr->sh_size; return ret; @@ -1622,7 +1641,7 @@ static void layout_sections(struct module *mod, || strncmp(secstrings + s->sh_name, ".init", 5) == 0) continue; - s->sh_entsize = get_offset(&mod->core_size, s); + s->sh_entsize = get_offset(mod, &mod->core_size, s, i); DEBUGP("\t%s\n", secstrings + s->sh_name); } if (m == 0) @@ -1640,7 +1659,7 @@ static void layout_sections(struct module *mod, || strncmp(secstrings + s->sh_name, ".init", 5) != 0) continue; - s->sh_entsize = (get_offset(&mod->init_size, s) + s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) | INIT_OFFSET_MASK); DEBUGP("\t%s\n", secstrings + s->sh_name); } @@ -1725,15 +1744,15 @@ static const struct kernel_symbol *lookup_symbol(const char *name, return NULL; } -static int is_exported(const char *name, const struct module *mod) +static int is_exported(const char *name, unsigned long value, + const struct module *mod) { - if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab)) - return 1; + const struct kernel_symbol *ks; + if (!mod) + ks = lookup_symbol(name, __start___ksymtab, __stop___ksymtab); else - if (mod && lookup_symbol(name, mod->syms, mod->syms + mod->num_syms)) - return 1; - else - return 0; + ks = lookup_symbol(name, mod->syms, mod->syms + mod->num_syms); + return ks != NULL && ks->value == value; } /* As per nm */ @@ -1847,7 +1866,6 @@ static noinline struct module *load_module(void __user *umod, unsigned int symindex = 0; unsigned int strindex = 0; unsigned int modindex, versindex, infoindex, pcpuindex; - unsigned int unwindex = 0; unsigned int num_kp, num_mcount; struct kernel_param *kp; struct module *mod; @@ -1865,6 +1883,13 @@ static noinline struct module *load_module(void __user *umod, /* vmalloc barfs on "unusual" numbers. Check here */ if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL) return ERR_PTR(-ENOMEM); + + /* Create stop_machine threads since the error path relies on + * a non-failing stop_machine call. */ + err = stop_machine_create(); + if (err) + goto free_hdr; + if (copy_from_user(hdr, umod, len) != 0) { err = -EFAULT; goto free_hdr; @@ -1930,9 +1955,6 @@ static noinline struct module *load_module(void __user *umod, versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); -#ifdef ARCH_UNWIND_SECTION_NAME - unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME); -#endif /* Don't keep modinfo and version sections. 
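/*
 * Illustrative sketch, not part of the patch.  The new
 * arch_mod_section_prepend() hook is declared __weak with a default of
 * zero; an architecture that needs scratch space in front of each
 * module section overrides it roughly like this.  The 8-byte
 * reservation is purely illustrative.
 */
#include <linux/module.h>
#include <linux/moduleloader.h>

unsigned int arch_mod_section_prepend(struct module *mod,
                                      unsigned int section)
{
        /* reserve illustrative per-section scratch space */
        return 8;
}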
*/ sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; @@ -1942,8 +1964,6 @@ static noinline struct module *load_module(void __user *umod, sechdrs[symindex].sh_flags |= SHF_ALLOC; sechdrs[strindex].sh_flags |= SHF_ALLOC; #endif - if (unwindex) - sechdrs[unwindex].sh_flags |= SHF_ALLOC; /* Check module struct version now, before we try to use module. */ if (!check_modstruct_version(sechdrs, versindex, mod)) { @@ -2240,14 +2260,10 @@ static noinline struct module *load_module(void __user *umod, add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); - /* Size of section 0 is 0, so this works well if no unwind info. */ - mod->unwind_info = unwind_add_table(mod, - (void *)sechdrs[unwindex].sh_addr, - sechdrs[unwindex].sh_size); - /* Get rid of temporary copy */ vfree(hdr); + stop_machine_destroy(); /* Done! */ return mod; @@ -2270,6 +2286,7 @@ static noinline struct module *load_module(void __user *umod, kfree(args); free_hdr: vfree(hdr); + stop_machine_destroy(); return ERR_PTR(err); truncated: @@ -2337,11 +2354,12 @@ sys_init_module(void __user *umod, /* Now it's a first class citizen! Wake up anyone waiting for it. */ mod->state = MODULE_STATE_LIVE; wake_up(&module_wq); + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_LIVE, mod); mutex_lock(&module_mutex); /* Drop initial reference. */ module_put(mod); - unwind_remove_table(mod->unwind_info, 1); module_free(mod, mod->module_init); mod->module_init = NULL; mod->init_size = 0; @@ -2376,7 +2394,7 @@ static const char *get_ksymbol(struct module *mod, unsigned long nextval; /* At worse, next value is at end of module */ - if (within(addr, mod->module_init, mod->init_size)) + if (within_module_init(addr, mod)) nextval = (unsigned long)mod->module_init+mod->init_text_size; else nextval = (unsigned long)mod->module_core+mod->core_text_size; @@ -2424,8 +2442,8 @@ const char *module_address_lookup(unsigned long addr, preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { - if (within(addr, mod->module_init, mod->init_size) - || within(addr, mod->module_core, mod->core_size)) { + if (within_module_init(addr, mod) || + within_module_core(addr, mod)) { if (modname) *modname = mod->name; ret = get_ksymbol(mod, addr, size, offset); @@ -2447,8 +2465,8 @@ int lookup_module_symbol_name(unsigned long addr, char *symname) preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { - if (within(addr, mod->module_init, mod->init_size) || - within(addr, mod->module_core, mod->core_size)) { + if (within_module_init(addr, mod) || + within_module_core(addr, mod)) { const char *sym; sym = get_ksymbol(mod, addr, NULL, NULL); @@ -2471,8 +2489,8 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { - if (within(addr, mod->module_init, mod->init_size) || - within(addr, mod->module_core, mod->core_size)) { + if (within_module_init(addr, mod) || + within_module_core(addr, mod)) { const char *sym; sym = get_ksymbol(mod, addr, size, offset); @@ -2504,7 +2522,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, strlcpy(name, mod->strtab + mod->symtab[symnum].st_name, KSYM_NAME_LEN); strlcpy(module_name, mod->name, MODULE_NAME_LEN); - *exported = is_exported(name, mod); + *exported = is_exported(name, *value, mod); preempt_enable(); return 0; } @@ -2691,7 +2709,7 @@ int is_module_address(unsigned long addr) preempt_disable(); list_for_each_entry_rcu(mod, &modules, 
list) { - if (within(addr, mod->module_core, mod->core_size)) { + if (within_module_core(addr, mod)) { preempt_enable(); return 1; } diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c index 43c2111..78bc3fd 100644 --- a/kernel/ns_cgroup.c +++ b/kernel/ns_cgroup.c @@ -13,7 +13,6 @@ struct ns_cgroup { struct cgroup_subsys_state css; - spinlock_t lock; }; struct cgroup_subsys ns_subsys; @@ -84,7 +83,6 @@ static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss, ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL); if (!ns_cgroup) return ERR_PTR(-ENOMEM); - spin_lock_init(&ns_cgroup->lock); return &ns_cgroup->css; } diff --git a/kernel/panic.c b/kernel/panic.c index 3a0b089..33cab3d 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -302,6 +302,8 @@ static int init_oops_id(void) { if (!oops_id) get_random_bytes(&oops_id, sizeof(oops_id)); + else + oops_id++; return 0; } diff --git a/kernel/pid.c b/kernel/pid.c index 064e76a..1b3586f 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -474,8 +474,14 @@ pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) } EXPORT_SYMBOL(task_session_nr_ns); +struct pid_namespace *task_active_pid_ns(struct task_struct *tsk) +{ + return ns_of_pid(task_pid(tsk)); +} +EXPORT_SYMBOL_GPL(task_active_pid_ns); + /* - * Used by proc to find the first pid that is greater then or equal to nr. + * Used by proc to find the first pid that is greater than or equal to nr. * * If there is a pid at nr this function is exactly the same as find_pid_ns. */ diff --git a/kernel/power/disk.c b/kernel/power/disk.c index f77d381..45e8541 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -258,12 +258,12 @@ int hibernation_snapshot(int platform_mode) { int error; - /* Free memory before shutting down devices. */ - error = swsusp_shrink_memory(); + error = platform_begin(platform_mode); if (error) return error; - error = platform_begin(platform_mode); + /* Free memory before shutting down devices. 
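A small usage sketch for the task_active_pid_ns() helper exported in kernel/pid.c above. The wrapper function is hypothetical, and the assumption that the declaration and init_pid_ns come from <linux/pid_namespace.h> is mine, not stated in this patch.

#include <linux/sched.h>
#include <linux/pid_namespace.h>

/* Hypothetical helper: is the task visible in the initial pid namespace? */
static bool demo_in_init_pid_ns(struct task_struct *tsk)
{
	return task_active_pid_ns(tsk) == &init_pid_ns;
}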
*/ + error = swsusp_shrink_memory(); if (error) goto Close; diff --git a/kernel/power/main.c b/kernel/power/main.c index 613f169..2399888 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -615,7 +615,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) /* this may fail if the RTC hasn't been initialized */ status = rtc_read_time(rtc, &alm.time); if (status < 0) { - printk(err_readtime, rtc->dev.bus_id, status); + printk(err_readtime, dev_name(&rtc->dev), status); return; } rtc_tm_to_time(&alm.time, &now); @@ -626,7 +626,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) status = rtc_set_alarm(rtc, &alm); if (status < 0) { - printk(err_wakealarm, rtc->dev.bus_id, status); + printk(err_wakealarm, dev_name(&rtc->dev), status); return; } @@ -660,7 +660,7 @@ static int __init has_wakealarm(struct device *dev, void *name_ptr) if (!device_may_wakeup(candidate->dev.parent)) return 0; - *(char **)name_ptr = dev->bus_id; + *(const char **)name_ptr = dev_name(dev); return 1; } diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index 72016f0..9789083 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c @@ -27,7 +27,7 @@ static DECLARE_WORK(poweroff_work, do_poweroff); static void handle_poweroff(int key, struct tty_struct *tty) { /* run sysrq poweroff on boot cpu */ - schedule_work_on(first_cpu(cpu_online_map), &poweroff_work); + schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work); } static struct sysrq_key_op sysrq_poweroff_op = { diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 5d2ab83..f5fc2d7 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -25,6 +25,7 @@ #include <linux/syscalls.h> #include <linux/console.h> #include <linux/highmem.h> +#include <linux/list.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> @@ -192,12 +193,6 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size) return ret; } -static void chain_free(struct chain_allocator *ca, int clear_page_nosave) -{ - free_list_of_pages(ca->chain, clear_page_nosave); - memset(ca, 0, sizeof(struct chain_allocator)); -} - /** * Data types related to memory bitmaps. 
* @@ -233,7 +228,7 @@ static void chain_free(struct chain_allocator *ca, int clear_page_nosave) #define BM_BITS_PER_BLOCK (PAGE_SIZE << 3) struct bm_block { - struct bm_block *next; /* next element of the list */ + struct list_head hook; /* hook into a list of bitmap blocks */ unsigned long start_pfn; /* pfn represented by the first bit */ unsigned long end_pfn; /* pfn represented by the last bit plus 1 */ unsigned long *data; /* bitmap representing pages */ @@ -244,24 +239,15 @@ static inline unsigned long bm_block_bits(struct bm_block *bb) return bb->end_pfn - bb->start_pfn; } -struct zone_bitmap { - struct zone_bitmap *next; /* next element of the list */ - unsigned long start_pfn; /* minimal pfn in this zone */ - unsigned long end_pfn; /* maximal pfn in this zone plus 1 */ - struct bm_block *bm_blocks; /* list of bitmap blocks */ - struct bm_block *cur_block; /* recently used bitmap block */ -}; - /* strcut bm_position is used for browsing memory bitmaps */ struct bm_position { - struct zone_bitmap *zone_bm; struct bm_block *block; int bit; }; struct memory_bitmap { - struct zone_bitmap *zone_bm_list; /* list of zone bitmaps */ + struct list_head blocks; /* list of bitmap blocks */ struct linked_page *p_list; /* list of pages used to store zone * bitmap objects and bitmap block * objects @@ -273,11 +259,7 @@ struct memory_bitmap { static void memory_bm_position_reset(struct memory_bitmap *bm) { - struct zone_bitmap *zone_bm; - - zone_bm = bm->zone_bm_list; - bm->cur.zone_bm = zone_bm; - bm->cur.block = zone_bm->bm_blocks; + bm->cur.block = list_entry(bm->blocks.next, struct bm_block, hook); bm->cur.bit = 0; } @@ -285,151 +267,184 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); /** * create_bm_block_list - create a list of block bitmap objects + * @nr_blocks - number of blocks to allocate + * @list - list to put the allocated blocks into + * @ca - chain allocator to be used for allocating memory */ - -static inline struct bm_block * -create_bm_block_list(unsigned int nr_blocks, struct chain_allocator *ca) +static int create_bm_block_list(unsigned long pages, + struct list_head *list, + struct chain_allocator *ca) { - struct bm_block *bblist = NULL; + unsigned int nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK); while (nr_blocks-- > 0) { struct bm_block *bb; bb = chain_alloc(ca, sizeof(struct bm_block)); if (!bb) - return NULL; - - bb->next = bblist; - bblist = bb; + return -ENOMEM; + list_add(&bb->hook, list); } - return bblist; + + return 0; } +struct mem_extent { + struct list_head hook; + unsigned long start; + unsigned long end; +}; + /** - * create_zone_bm_list - create a list of zone bitmap objects + * free_mem_extents - free a list of memory extents + * @list - list of extents to empty */ +static void free_mem_extents(struct list_head *list) +{ + struct mem_extent *ext, *aux; -static inline struct zone_bitmap * -create_zone_bm_list(unsigned int nr_zones, struct chain_allocator *ca) + list_for_each_entry_safe(ext, aux, list, hook) { + list_del(&ext->hook); + kfree(ext); + } +} + +/** + * create_mem_extents - create a list of memory extents representing + * contiguous ranges of PFNs + * @list - list to put the extents into + * @gfp_mask - mask to use for memory allocations + */ +static int create_mem_extents(struct list_head *list, gfp_t gfp_mask) { - struct zone_bitmap *zbmlist = NULL; + struct zone *zone; - while (nr_zones-- > 0) { - struct zone_bitmap *zbm; + INIT_LIST_HEAD(list); - zbm = chain_alloc(ca, sizeof(struct zone_bitmap)); - if (!zbm) 
- return NULL; + for_each_zone(zone) { + unsigned long zone_start, zone_end; + struct mem_extent *ext, *cur, *aux; + + if (!populated_zone(zone)) + continue; - zbm->next = zbmlist; - zbmlist = zbm; + zone_start = zone->zone_start_pfn; + zone_end = zone->zone_start_pfn + zone->spanned_pages; + + list_for_each_entry(ext, list, hook) + if (zone_start <= ext->end) + break; + + if (&ext->hook == list || zone_end < ext->start) { + /* New extent is necessary */ + struct mem_extent *new_ext; + + new_ext = kzalloc(sizeof(struct mem_extent), gfp_mask); + if (!new_ext) { + free_mem_extents(list); + return -ENOMEM; + } + new_ext->start = zone_start; + new_ext->end = zone_end; + list_add_tail(&new_ext->hook, &ext->hook); + continue; + } + + /* Merge this zone's range of PFNs with the existing one */ + if (zone_start < ext->start) + ext->start = zone_start; + if (zone_end > ext->end) + ext->end = zone_end; + + /* More merging may be possible */ + cur = ext; + list_for_each_entry_safe_continue(cur, aux, list, hook) { + if (zone_end < cur->start) + break; + if (zone_end < cur->end) + ext->end = cur->end; + list_del(&cur->hook); + kfree(cur); + } } - return zbmlist; + + return 0; } /** * memory_bm_create - allocate memory for a memory bitmap */ - static int memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) { struct chain_allocator ca; - struct zone *zone; - struct zone_bitmap *zone_bm; - struct bm_block *bb; - unsigned int nr; + struct list_head mem_extents; + struct mem_extent *ext; + int error; chain_init(&ca, gfp_mask, safe_needed); + INIT_LIST_HEAD(&bm->blocks); - /* Compute the number of zones */ - nr = 0; - for_each_zone(zone) - if (populated_zone(zone)) - nr++; - - /* Allocate the list of zones bitmap objects */ - zone_bm = create_zone_bm_list(nr, &ca); - bm->zone_bm_list = zone_bm; - if (!zone_bm) { - chain_free(&ca, PG_UNSAFE_CLEAR); - return -ENOMEM; - } - - /* Initialize the zone bitmap objects */ - for_each_zone(zone) { - unsigned long pfn; + error = create_mem_extents(&mem_extents, gfp_mask); + if (error) + return error; - if (!populated_zone(zone)) - continue; + list_for_each_entry(ext, &mem_extents, hook) { + struct bm_block *bb; + unsigned long pfn = ext->start; + unsigned long pages = ext->end - ext->start; - zone_bm->start_pfn = zone->zone_start_pfn; - zone_bm->end_pfn = zone->zone_start_pfn + zone->spanned_pages; - /* Allocate the list of bitmap block objects */ - nr = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); - bb = create_bm_block_list(nr, &ca); - zone_bm->bm_blocks = bb; - zone_bm->cur_block = bb; - if (!bb) - goto Free; + bb = list_entry(bm->blocks.prev, struct bm_block, hook); - nr = zone->spanned_pages; - pfn = zone->zone_start_pfn; - /* Initialize the bitmap block objects */ - while (bb) { - unsigned long *ptr; + error = create_bm_block_list(pages, bm->blocks.prev, &ca); + if (error) + goto Error; - ptr = get_image_page(gfp_mask, safe_needed); - bb->data = ptr; - if (!ptr) - goto Free; + list_for_each_entry_continue(bb, &bm->blocks, hook) { + bb->data = get_image_page(gfp_mask, safe_needed); + if (!bb->data) { + error = -ENOMEM; + goto Error; + } bb->start_pfn = pfn; - if (nr >= BM_BITS_PER_BLOCK) { + if (pages >= BM_BITS_PER_BLOCK) { pfn += BM_BITS_PER_BLOCK; - nr -= BM_BITS_PER_BLOCK; + pages -= BM_BITS_PER_BLOCK; } else { /* This is executed only once in the loop */ - pfn += nr; + pfn += pages; } bb->end_pfn = pfn; - bb = bb->next; } - zone_bm = zone_bm->next; } + bm->p_list = ca.chain; memory_bm_position_reset(bm); - return 0; + Exit: + 
free_mem_extents(&mem_extents); + return error; - Free: + Error: bm->p_list = ca.chain; memory_bm_free(bm, PG_UNSAFE_CLEAR); - return -ENOMEM; + goto Exit; } /** * memory_bm_free - free memory occupied by the memory bitmap @bm */ - static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) { - struct zone_bitmap *zone_bm; + struct bm_block *bb; - /* Free the list of bit blocks for each zone_bitmap object */ - zone_bm = bm->zone_bm_list; - while (zone_bm) { - struct bm_block *bb; + list_for_each_entry(bb, &bm->blocks, hook) + if (bb->data) + free_image_page(bb->data, clear_nosave_free); - bb = zone_bm->bm_blocks; - while (bb) { - if (bb->data) - free_image_page(bb->data, clear_nosave_free); - bb = bb->next; - } - zone_bm = zone_bm->next; - } free_list_of_pages(bm->p_list, clear_nosave_free); - bm->zone_bm_list = NULL; + + INIT_LIST_HEAD(&bm->blocks); } /** @@ -437,38 +452,33 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) * to given pfn. The cur_zone_bm member of @bm and the cur_block member * of @bm->cur_zone_bm are updated. */ - static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, void **addr, unsigned int *bit_nr) { - struct zone_bitmap *zone_bm; struct bm_block *bb; - /* Check if the pfn is from the current zone */ - zone_bm = bm->cur.zone_bm; - if (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) { - zone_bm = bm->zone_bm_list; - /* We don't assume that the zones are sorted by pfns */ - while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) { - zone_bm = zone_bm->next; - - if (!zone_bm) - return -EFAULT; - } - bm->cur.zone_bm = zone_bm; - } - /* Check if the pfn corresponds to the current bitmap block */ - bb = zone_bm->cur_block; + /* + * Check if the pfn corresponds to the current bitmap block and find + * the block where it fits if this is not the case. + */ + bb = bm->cur.block; if (pfn < bb->start_pfn) - bb = zone_bm->bm_blocks; + list_for_each_entry_continue_reverse(bb, &bm->blocks, hook) + if (pfn >= bb->start_pfn) + break; - while (pfn >= bb->end_pfn) { - bb = bb->next; + if (pfn >= bb->end_pfn) + list_for_each_entry_continue(bb, &bm->blocks, hook) + if (pfn >= bb->start_pfn && pfn < bb->end_pfn) + break; - BUG_ON(!bb); - } - zone_bm->cur_block = bb; + if (&bb->hook == &bm->blocks) + return -EFAULT; + + /* The block has been found */ + bm->cur.block = bb; pfn -= bb->start_pfn; + bm->cur.bit = pfn + 1; *bit_nr = pfn; *addr = bb->data; return 0; @@ -519,6 +529,14 @@ static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) return test_bit(bit, addr); } +static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn) +{ + void *addr; + unsigned int bit; + + return !memory_bm_find_bit(bm, pfn, &addr, &bit); +} + /** * memory_bm_next_pfn - find the pfn that corresponds to the next set bit * in the bitmap @bm. 
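To make the create_mem_extents() logic above easier to follow, here is a standalone user-space sketch of the same merging idea: overlapping or touching zone PFN ranges collapse into a sorted list of disjoint extents. This is plain demo C, not the kernel code, and the array-based representation is a simplification of the kernel's list_head version.

#include <stdio.h>

struct ext {
	unsigned long start, end;	/* [start, end), like struct mem_extent */
};

/* Insert [s, e) into the sorted array exts[0..n), merging overlaps; returns new n. */
static int add_range(struct ext *exts, int n, unsigned long s, unsigned long e)
{
	int i, j, removed;

	for (i = 0; i < n && exts[i].end < s; i++)
		;				/* first extent ending at or after s */

	if (i == n || e < exts[i].start) {	/* touches nothing: plain insert */
		for (j = n; j > i; j--)
			exts[j] = exts[j - 1];
		exts[i].start = s;
		exts[i].end = e;
		return n + 1;
	}

	/* overlaps extent i: widen it, then swallow later extents it now covers */
	if (s < exts[i].start)
		exts[i].start = s;
	if (e > exts[i].end)
		exts[i].end = e;
	for (j = i + 1; j < n && exts[j].start <= exts[i].end; j++)
		if (exts[j].end > exts[i].end)
			exts[i].end = exts[j].end;

	removed = j - (i + 1);
	for (; j < n; j++)
		exts[j - removed] = exts[j];
	return n - removed;
}

int main(void)
{
	struct ext exts[8];
	int i, n = 0;

	n = add_range(exts, n, 0x00000, 0x01000);	/* first zone         */
	n = add_range(exts, n, 0x00800, 0x02000);	/* overlaps the first */
	n = add_range(exts, n, 0x100000, 0x140000);	/* disjoint zone      */

	for (i = 0; i < n; i++)
		printf("extent %d: [%#lx, %#lx)\n", i, exts[i].start, exts[i].end);
	return 0;	/* prints [0, 0x2000) and [0x100000, 0x140000) */
}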
If the pfn cannot be found, BM_END_OF_MAP is @@ -530,29 +548,21 @@ static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) { - struct zone_bitmap *zone_bm; struct bm_block *bb; int bit; + bb = bm->cur.block; do { - bb = bm->cur.block; - do { - bit = bm->cur.bit; - bit = find_next_bit(bb->data, bm_block_bits(bb), bit); - if (bit < bm_block_bits(bb)) - goto Return_pfn; - - bb = bb->next; - bm->cur.block = bb; - bm->cur.bit = 0; - } while (bb); - zone_bm = bm->cur.zone_bm->next; - if (zone_bm) { - bm->cur.zone_bm = zone_bm; - bm->cur.block = zone_bm->bm_blocks; - bm->cur.bit = 0; - } - } while (zone_bm); + bit = bm->cur.bit; + bit = find_next_bit(bb->data, bm_block_bits(bb), bit); + if (bit < bm_block_bits(bb)) + goto Return_pfn; + + bb = list_entry(bb->hook.next, struct bm_block, hook); + bm->cur.block = bb; + bm->cur.bit = 0; + } while (&bb->hook != &bm->blocks); + memory_bm_position_reset(bm); return BM_END_OF_MAP; @@ -808,8 +818,7 @@ static unsigned int count_free_highmem_pages(void) * We should save the page if it isn't Nosave or NosaveFree, or Reserved, * and it isn't a part of a free chunk of pages. */ - -static struct page *saveable_highmem_page(unsigned long pfn) +static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) { struct page *page; @@ -817,6 +826,8 @@ static struct page *saveable_highmem_page(unsigned long pfn) return NULL; page = pfn_to_page(pfn); + if (page_zone(page) != zone) + return NULL; BUG_ON(!PageHighMem(page)); @@ -846,13 +857,16 @@ unsigned int count_highmem_pages(void) mark_free_pages(zone); max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if (saveable_highmem_page(pfn)) + if (saveable_highmem_page(zone, pfn)) n++; } return n; } #else -static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; } +static inline void *saveable_highmem_page(struct zone *z, unsigned long p) +{ + return NULL; +} #endif /* CONFIG_HIGHMEM */ /** @@ -863,8 +877,7 @@ static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; } * of pages statically defined as 'unsaveable', and it isn't a part of * a free chunk of pages. */ - -static struct page *saveable_page(unsigned long pfn) +static struct page *saveable_page(struct zone *zone, unsigned long pfn) { struct page *page; @@ -872,6 +885,8 @@ static struct page *saveable_page(unsigned long pfn) return NULL; page = pfn_to_page(pfn); + if (page_zone(page) != zone) + return NULL; BUG_ON(PageHighMem(page)); @@ -903,7 +918,7 @@ unsigned int count_data_pages(void) mark_free_pages(zone); max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if(saveable_page(pfn)) + if (saveable_page(zone, pfn)) n++; } return n; @@ -944,7 +959,7 @@ static inline struct page * page_is_saveable(struct zone *zone, unsigned long pfn) { return is_highmem(zone) ? 
- saveable_highmem_page(pfn) : saveable_page(pfn); + saveable_highmem_page(zone, pfn) : saveable_page(zone, pfn); } static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) @@ -966,7 +981,7 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) * data modified by kmap_atomic() */ safe_copy_page(buffer, s_page); - dst = kmap_atomic(pfn_to_page(dst_pfn), KM_USER0); + dst = kmap_atomic(d_page, KM_USER0); memcpy(dst, buffer, PAGE_SIZE); kunmap_atomic(dst, KM_USER0); } else { @@ -975,7 +990,7 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) } } #else -#define page_is_saveable(zone, pfn) saveable_page(pfn) +#define page_is_saveable(zone, pfn) saveable_page(zone, pfn) static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) { @@ -1459,9 +1474,7 @@ load_header(struct swsusp_info *info) * unpack_orig_pfns - for each element of @buf[] (1 page at a time) set * the corresponding bit in the memory bitmap @bm */ - -static inline void -unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) +static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) { int j; @@ -1469,8 +1482,13 @@ unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) if (unlikely(buf[j] == BM_END_OF_MAP)) break; - memory_bm_set_bit(bm, buf[j]); + if (memory_bm_pfn_present(bm, buf[j])) + memory_bm_set_bit(bm, buf[j]); + else + return -EFAULT; } + + return 0; } /* List of "safe" pages that may be used to store data loaded from the suspend @@ -1608,7 +1626,7 @@ get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) pbe = chain_alloc(ca, sizeof(struct highmem_pbe)); if (!pbe) { swsusp_free(); - return NULL; + return ERR_PTR(-ENOMEM); } pbe->orig_page = page; if (safe_highmem_pages > 0) { @@ -1677,7 +1695,7 @@ prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) static inline void * get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) { - return NULL; + return ERR_PTR(-EINVAL); } static inline void copy_last_highmem_page(void) {} @@ -1788,8 +1806,13 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) { struct pbe *pbe; - struct page *page = pfn_to_page(memory_bm_next_pfn(bm)); + struct page *page; + unsigned long pfn = memory_bm_next_pfn(bm); + if (pfn == BM_END_OF_MAP) + return ERR_PTR(-EFAULT); + + page = pfn_to_page(pfn); if (PageHighMem(page)) return get_highmem_page_buffer(page, ca); @@ -1805,7 +1828,7 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) pbe = chain_alloc(ca, sizeof(struct pbe)); if (!pbe) { swsusp_free(); - return NULL; + return ERR_PTR(-ENOMEM); } pbe->orig_address = page_address(page); pbe->address = safe_pages_list; @@ -1868,7 +1891,10 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) return error; } else if (handle->prev <= nr_meta_pages) { - unpack_orig_pfns(buffer, ©_bm); + error = unpack_orig_pfns(buffer, ©_bm); + if (error) + return error; + if (handle->prev == nr_meta_pages) { error = prepare_image(&orig_bm, ©_bm); if (error) @@ -1879,12 +1905,14 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) restore_pblist = NULL; handle->buffer = get_buffer(&orig_bm, &ca); handle->sync_read = 0; - if (!handle->buffer) - return -ENOMEM; + if (IS_ERR(handle->buffer)) + return PTR_ERR(handle->buffer); } } else { copy_last_highmem_page(); handle->buffer = get_buffer(&orig_bm, &ca); + 
if (IS_ERR(handle->buffer)) + return PTR_ERR(handle->buffer); if (handle->buffer != buffer) handle->sync_read = 0; } diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 023ff2a..a92c914 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -262,3 +262,125 @@ int swsusp_shrink_memory(void) return 0; } + +/* + * Platforms, like ACPI, may want us to save some memory used by them during + * hibernation and to restore the contents of this memory during the subsequent + * resume. The code below implements a mechanism allowing us to do that. + */ + +struct nvs_page { + unsigned long phys_start; + unsigned int size; + void *kaddr; + void *data; + struct list_head node; +}; + +static LIST_HEAD(nvs_list); + +/** + * hibernate_nvs_register - register platform NVS memory region to save + * @start - physical address of the region + * @size - size of the region + * + * The NVS region need not be page-aligned (both ends) and we arrange + * things so that the data from page-aligned addresses in this region will + * be copied into separate RAM pages. + */ +int hibernate_nvs_register(unsigned long start, unsigned long size) +{ + struct nvs_page *entry, *next; + + while (size > 0) { + unsigned int nr_bytes; + + entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL); + if (!entry) + goto Error; + + list_add_tail(&entry->node, &nvs_list); + entry->phys_start = start; + nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK); + entry->size = (size < nr_bytes) ? size : nr_bytes; + + start += entry->size; + size -= entry->size; + } + return 0; + + Error: + list_for_each_entry_safe(entry, next, &nvs_list, node) { + list_del(&entry->node); + kfree(entry); + } + return -ENOMEM; +} + +/** + * hibernate_nvs_free - free data pages allocated for saving NVS regions + */ +void hibernate_nvs_free(void) +{ + struct nvs_page *entry; + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) { + free_page((unsigned long)entry->data); + entry->data = NULL; + if (entry->kaddr) { + iounmap(entry->kaddr); + entry->kaddr = NULL; + } + } +} + +/** + * hibernate_nvs_alloc - allocate memory necessary for saving NVS regions + */ +int hibernate_nvs_alloc(void) +{ + struct nvs_page *entry; + + list_for_each_entry(entry, &nvs_list, node) { + entry->data = (void *)__get_free_page(GFP_KERNEL); + if (!entry->data) { + hibernate_nvs_free(); + return -ENOMEM; + } + } + return 0; +} + +/** + * hibernate_nvs_save - save NVS memory regions + */ +void hibernate_nvs_save(void) +{ + struct nvs_page *entry; + + printk(KERN_INFO "PM: Saving platform NVS memory\n"); + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) { + entry->kaddr = ioremap(entry->phys_start, entry->size); + memcpy(entry->data, entry->kaddr, entry->size); + } +} + +/** + * hibernate_nvs_restore - restore NVS memory regions + * + * This function is going to be called with interrupts disabled, so it + * cannot iounmap the virtual addresses used to access the NVS region. 
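A plausible calling sequence for the new hibernate_nvs_* helpers introduced in kernel/power/swsusp.c above. The region address and size are invented, and the exact call sites in the hibernation core are not part of this patch; on real systems the platform code (e.g. ACPI) discovers the NVS range.

/* boot time: remember where the platform NVS area lives */
static int __init demo_platform_init(void)
{
	return hibernate_nvs_register(0x7f000000UL, 64 * 1024);
}

/* before creating the image: grab RAM pages and copy the NVS data into them */
static int demo_pre_hibernate(void)
{
	int error;

	error = hibernate_nvs_alloc();
	if (error)
		return error;
	hibernate_nvs_save();
	return 0;
}

/* after the image has been restored: copy the saved contents back, clean up */
static void demo_post_resume(void)
{
	hibernate_nvs_restore();	/* runs with interrupts disabled */
	hibernate_nvs_free();		/* frees the pages and iounmaps the region */
}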
+ */ +void hibernate_nvs_restore(void) +{ + struct nvs_page *entry; + + printk(KERN_INFO "PM: Restoring platform NVS memory\n"); + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) + memcpy(entry->kaddr, entry->data, entry->size); +} diff --git a/kernel/printk.c b/kernel/printk.c index e651ab0..7015733 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -619,7 +619,7 @@ static int acquire_console_semaphore_for_printk(unsigned int cpu) static const char recursion_bug_msg [] = KERN_CRIT "BUG: recent printk recursion!\n"; static int recursion_bug; - static int new_text_line = 1; +static int new_text_line = 1; static char printk_buf[1024]; asmlinkage int vprintk(const char *fmt, va_list args) diff --git a/kernel/profile.c b/kernel/profile.c index 60adefb..784933ac 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -45,7 +45,7 @@ static unsigned long prof_len, prof_shift; int prof_on __read_mostly; EXPORT_SYMBOL_GPL(prof_on); -static cpumask_t prof_cpu_mask = CPU_MASK_ALL; +static cpumask_var_t prof_cpu_mask; #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); static DEFINE_PER_CPU(int, cpu_profile_flip); @@ -113,9 +113,13 @@ int __ref profile_init(void) buffer_bytes = prof_len*sizeof(atomic_t); if (!slab_is_available()) { prof_buffer = alloc_bootmem(buffer_bytes); + alloc_bootmem_cpumask_var(&prof_cpu_mask); return 0; } + if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL)) + return -ENOMEM; + prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL); if (prof_buffer) return 0; @@ -128,6 +132,7 @@ int __ref profile_init(void) if (prof_buffer) return 0; + free_cpumask_var(prof_cpu_mask); return -ENOMEM; } @@ -386,13 +391,15 @@ out_free: return NOTIFY_BAD; case CPU_ONLINE: case CPU_ONLINE_FROZEN: - cpu_set(cpu, prof_cpu_mask); + if (prof_cpu_mask != NULL) + cpumask_set_cpu(cpu, prof_cpu_mask); break; case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: - cpu_clear(cpu, prof_cpu_mask); + if (prof_cpu_mask != NULL) + cpumask_clear_cpu(cpu, prof_cpu_mask); if (per_cpu(cpu_profile_hits, cpu)[0]) { page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]); per_cpu(cpu_profile_hits, cpu)[0] = NULL; @@ -430,19 +437,19 @@ void profile_tick(int type) if (type == CPU_PROFILING && timer_hook) timer_hook(regs); - if (!user_mode(regs) && cpu_isset(smp_processor_id(), prof_cpu_mask)) + if (!user_mode(regs) && prof_cpu_mask != NULL && + cpumask_test_cpu(smp_processor_id(), prof_cpu_mask)) profile_hit(type, (void *)profile_pc(regs)); } #ifdef CONFIG_PROC_FS #include <linux/proc_fs.h> #include <asm/uaccess.h> -#include <asm/ptrace.h> static int prof_cpu_mask_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { - int len = cpumask_scnprintf(page, count, *(cpumask_t *)data); + int len = cpumask_scnprintf(page, count, data); if (count - len < 2) return -EINVAL; len += sprintf(page + len, "\n"); @@ -452,16 +459,20 @@ static int prof_cpu_mask_read_proc(char *page, char **start, off_t off, static int prof_cpu_mask_write_proc(struct file *file, const char __user *buffer, unsigned long count, void *data) { - cpumask_t *mask = (cpumask_t *)data; + struct cpumask *mask = data; unsigned long full_count = count, err; - cpumask_t new_value; + cpumask_var_t new_value; - err = cpumask_parse_user(buffer, count, new_value); - if (err) - return err; + if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) + return -ENOMEM; - *mask = new_value; - return full_count; + err = cpumask_parse_user(buffer, count, new_value); + if (!err) { + 
cpumask_copy(mask, new_value); + err = full_count; + } + free_cpumask_var(new_value); + return err; } void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir) @@ -472,7 +483,7 @@ void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir) entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir); if (!entry) return; - entry->data = (void *)&prof_cpu_mask; + entry->data = prof_cpu_mask; entry->read_proc = prof_cpu_mask_read_proc; entry->write_proc = prof_cpu_mask_write_proc; } diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index e503a00..490934f 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -63,14 +63,14 @@ static struct rcu_ctrlblk rcu_ctrlblk = { .completed = -300, .pending = -300, .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), - .cpumask = CPU_MASK_NONE, + .cpumask = CPU_BITS_NONE, }; static struct rcu_ctrlblk rcu_bh_ctrlblk = { .cur = -300, .completed = -300, .pending = -300, .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), - .cpumask = CPU_MASK_NONE, + .cpumask = CPU_BITS_NONE, }; DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; @@ -85,7 +85,6 @@ static void force_quiescent_state(struct rcu_data *rdp, struct rcu_ctrlblk *rcp) { int cpu; - cpumask_t cpumask; unsigned long flags; set_need_resched(); @@ -96,10 +95,10 @@ static void force_quiescent_state(struct rcu_data *rdp, * Don't send IPI to itself. With irqs disabled, * rdp->cpu is the current cpu. * - * cpu_online_map is updated by the _cpu_down() + * cpu_online_mask is updated by the _cpu_down() * using __stop_machine(). Since we're in irqs disabled * section, __stop_machine() is not exectuting, hence - * the cpu_online_map is stable. + * the cpu_online_mask is stable. * * However, a cpu might have been offlined _just_ before * we disabled irqs while entering here. @@ -107,13 +106,14 @@ static void force_quiescent_state(struct rcu_data *rdp, * notification, leading to the offlined cpu's bit * being set in the rcp->cpumask. * - * Hence cpumask = (rcp->cpumask & cpu_online_map) to prevent + * Hence cpumask = (rcp->cpumask & cpu_online_mask) to prevent * sending smp_reschedule() to an offlined CPU. */ - cpus_and(cpumask, rcp->cpumask, cpu_online_map); - cpu_clear(rdp->cpu, cpumask); - for_each_cpu_mask_nr(cpu, cpumask) - smp_send_reschedule(cpu); + for_each_cpu_and(cpu, + to_cpumask(rcp->cpumask), cpu_online_mask) { + if (cpu != rdp->cpu) + smp_send_reschedule(cpu); + } } spin_unlock_irqrestore(&rcp->lock, flags); } @@ -193,7 +193,7 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) printk(KERN_ERR "INFO: RCU detected CPU stalls:"); for_each_possible_cpu(cpu) { - if (cpu_isset(cpu, rcp->cpumask)) + if (cpumask_test_cpu(cpu, to_cpumask(rcp->cpumask))) printk(" %d", cpu); } printk(" (detected by %d, t=%ld jiffies)\n", @@ -221,7 +221,8 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp) long delta; delta = jiffies - rcp->jiffies_stall; - if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) { + if (cpumask_test_cpu(smp_processor_id(), to_cpumask(rcp->cpumask)) && + delta >= 0) { /* We haven't checked in, so go dump stack. */ print_cpu_stall(rcp); @@ -393,7 +394,8 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp) * unnecessarily. 
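The profile.c hunk above shows the conversion pattern that recurs throughout this patch: on-stack cpumask_t variables become cpumask_var_t, which is heap-allocated when CONFIG_CPUMASK_OFFSTACK=y. A minimal sketch of the idiom, with a hypothetical caller assumed to run in process context:

static int demo_use_cpumask(void)
{
	cpumask_var_t mask;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_copy(mask, cpu_online_mask);	/* operate only via cpumask_* helpers */
	/* ... */

	free_cpumask_var(mask);
	return 0;
}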
*/ smp_mb(); - cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask); + cpumask_andnot(to_cpumask(rcp->cpumask), + cpu_online_mask, nohz_cpu_mask); rcp->signaled = 0; } @@ -406,8 +408,8 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp) */ static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp) { - cpu_clear(cpu, rcp->cpumask); - if (cpus_empty(rcp->cpumask)) { + cpumask_clear_cpu(cpu, to_cpumask(rcp->cpumask)); + if (cpumask_empty(to_cpumask(rcp->cpumask))) { /* batch completed ! */ rcp->completed = rcp->cur; rcu_start_batch(rcp); diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index ad63af8..d92a76a 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -77,8 +77,15 @@ void wakeme_after_rcu(struct rcu_head *head) * sections are delimited by rcu_read_lock() and rcu_read_unlock(), * and may be nested. */ -void synchronize_rcu(void); /* Makes kernel-doc tools happy */ -synchronize_rcu_xxx(synchronize_rcu, call_rcu) +void synchronize_rcu(void) +{ + struct rcu_synchronize rcu; + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + call_rcu(&rcu.head, wakeme_after_rcu); + /* Wait for it. */ + wait_for_completion(&rcu.completion); +} EXPORT_SYMBOL_GPL(synchronize_rcu); static void rcu_barrier_callback(struct rcu_head *notused) diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index 0498265..33cfc50 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -164,7 +164,8 @@ static char *rcu_try_flip_state_names[] = { "idle", "waitack", "waitzero", "waitmb" }; #endif /* #ifdef CONFIG_RCU_TRACE */ -static cpumask_t rcu_cpu_online_map __read_mostly = CPU_MASK_NONE; +static DECLARE_BITMAP(rcu_cpu_online_map, NR_CPUS) __read_mostly + = CPU_BITS_NONE; /* * Enum and per-CPU flag to determine when each CPU has seen @@ -758,7 +759,7 @@ rcu_try_flip_idle(void) /* Now ask each CPU for acknowledgement of the flip. */ - for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) { + for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) { per_cpu(rcu_flip_flag, cpu) = rcu_flipped; dyntick_save_progress_counter(cpu); } @@ -776,7 +777,7 @@ rcu_try_flip_waitack(void) int cpu; RCU_TRACE_ME(rcupreempt_trace_try_flip_a1); - for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) + for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) if (rcu_try_flip_waitack_needed(cpu) && per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) { RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1); @@ -808,7 +809,7 @@ rcu_try_flip_waitzero(void) /* Check to see if the sum of the "last" counters is zero. */ RCU_TRACE_ME(rcupreempt_trace_try_flip_z1); - for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) + for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx]; if (sum != 0) { RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1); @@ -823,7 +824,7 @@ rcu_try_flip_waitzero(void) smp_mb(); /* ^^^^^^^^^^^^ */ /* Call for a memory barrier from each CPU. 
*/ - for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) { + for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) { per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed; dyntick_save_progress_counter(cpu); } @@ -843,7 +844,7 @@ rcu_try_flip_waitmb(void) int cpu; RCU_TRACE_ME(rcupreempt_trace_try_flip_m1); - for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) + for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) if (rcu_try_flip_waitmb_needed(cpu) && per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) { RCU_TRACE_ME(rcupreempt_trace_try_flip_me1); @@ -1032,7 +1033,7 @@ void rcu_offline_cpu(int cpu) RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0; RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0; - cpu_clear(cpu, rcu_cpu_online_map); + cpumask_clear_cpu(cpu, to_cpumask(rcu_cpu_online_map)); spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); @@ -1072,7 +1073,7 @@ void __cpuinit rcu_online_cpu(int cpu) struct rcu_data *rdp; spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); - cpu_set(cpu, rcu_cpu_online_map); + cpumask_set_cpu(cpu, to_cpumask(rcu_cpu_online_map)); spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); /* @@ -1176,7 +1177,16 @@ EXPORT_SYMBOL_GPL(call_rcu_sched); * in -rt this does -not- necessarily result in all currently executing * interrupt -handlers- having completed. */ -synchronize_rcu_xxx(__synchronize_sched, call_rcu_sched) +void __synchronize_sched(void) +{ + struct rcu_synchronize rcu; + + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + call_rcu_sched(&rcu.head, wakeme_after_rcu); + /* Wait for it. */ + wait_for_completion(&rcu.completion); +} EXPORT_SYMBOL_GPL(__synchronize_sched); /* @@ -1430,7 +1440,7 @@ void __init __rcu_init(void) * We don't need protection against CPU-Hotplug here * since * a) If a CPU comes online while we are iterating over the - * cpu_online_map below, we would only end up making a + * cpu_online_mask below, we would only end up making a * duplicate call to rcu_online_cpu() which sets the corresponding * CPU's mask in the rcu_cpu_online_map. * diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index b310655..7c4142a 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -136,31 +136,47 @@ static int stutter_pause_test = 0; #endif int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; -#define FULLSTOP_SIGNALED 1 /* Bail due to signal. */ -#define FULLSTOP_CLEANUP 2 /* Orderly shutdown. */ -static int fullstop; /* stop generating callbacks at test end. */ -DEFINE_MUTEX(fullstop_mutex); /* protect fullstop transitions and */ - /* spawning of kthreads. */ +/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ + +#define FULLSTOP_DONTSTOP 0 /* Normal operation. */ +#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ +#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ +static int fullstop = FULLSTOP_RMMOD; +DEFINE_MUTEX(fullstop_mutex); /* Protect fullstop transitions and spawning */ + /* of kthreads. */ /* - * Detect and respond to a signal-based shutdown. + * Detect and respond to a system shutdown. */ static int rcutorture_shutdown_notify(struct notifier_block *unused1, unsigned long unused2, void *unused3) { - if (fullstop) - return NOTIFY_DONE; - if (signal_pending(current)) { - mutex_lock(&fullstop_mutex); - if (!ACCESS_ONCE(fullstop)) - fullstop = FULLSTOP_SIGNALED; - mutex_unlock(&fullstop_mutex); - } + mutex_lock(&fullstop_mutex); + if (fullstop == FULLSTOP_DONTSTOP) + fullstop = FULLSTOP_SHUTDOWN; + else + printk(KERN_WARNING /* but going down anyway, so... 
*/ + "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); + mutex_unlock(&fullstop_mutex); return NOTIFY_DONE; } /* + * Absorb kthreads into a kernel function that won't return, so that + * they won't ever access module text or data again. + */ +static void rcutorture_shutdown_absorb(char *title) +{ + if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { + printk(KERN_NOTICE + "rcutorture thread %s parking due to system shutdown\n", + title); + schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT); + } +} + +/* * Allocate an element from the rcu_tortures pool. */ static struct rcu_torture * @@ -221,13 +237,14 @@ rcu_random(struct rcu_random_state *rrsp) } static void -rcu_stutter_wait(void) +rcu_stutter_wait(char *title) { - while ((stutter_pause_test || !rcutorture_runnable) && !fullstop) { + while (stutter_pause_test || !rcutorture_runnable) { if (rcutorture_runnable) schedule_timeout_interruptible(1); else schedule_timeout_interruptible(round_jiffies_relative(HZ)); + rcutorture_shutdown_absorb(title); } } @@ -289,7 +306,7 @@ rcu_torture_cb(struct rcu_head *p) int i; struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); - if (fullstop) { + if (fullstop != FULLSTOP_DONTSTOP) { /* Test is ending, just drop callbacks on the floor. */ /* The next initialization will pick up the pieces. */ return; @@ -621,10 +638,11 @@ rcu_torture_writer(void *arg) } rcu_torture_current_version++; oldbatch = cur_ops->completed(); - rcu_stutter_wait(); - } while (!kthread_should_stop() && !fullstop); + rcu_stutter_wait("rcu_torture_writer"); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); - while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED) + rcutorture_shutdown_absorb("rcu_torture_writer"); + while (!kthread_should_stop()) schedule_timeout_uninterruptible(1); return 0; } @@ -645,11 +663,12 @@ rcu_torture_fakewriter(void *arg) schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); udelay(rcu_random(&rand) & 0x3ff); cur_ops->sync(); - rcu_stutter_wait(); - } while (!kthread_should_stop() && !fullstop); + rcu_stutter_wait("rcu_torture_fakewriter"); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); - while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED) + rcutorture_shutdown_absorb("rcu_torture_fakewriter"); + while (!kthread_should_stop()) schedule_timeout_uninterruptible(1); return 0; } @@ -754,12 +773,13 @@ rcu_torture_reader(void *arg) preempt_enable(); cur_ops->readunlock(idx); schedule(); - rcu_stutter_wait(); - } while (!kthread_should_stop() && !fullstop); + rcu_stutter_wait("rcu_torture_reader"); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); + rcutorture_shutdown_absorb("rcu_torture_reader"); if (irqreader && cur_ops->irqcapable) del_timer_sync(&t); - while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED) + while (!kthread_should_stop()) schedule_timeout_uninterruptible(1); return 0; } @@ -856,7 +876,8 @@ rcu_torture_stats(void *arg) do { schedule_timeout_interruptible(stat_interval * HZ); rcu_torture_stats_print(); - } while (!kthread_should_stop() && !fullstop); + rcutorture_shutdown_absorb("rcu_torture_stats"); + } while (!kthread_should_stop()); VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); return 0; } @@ -924,7 +945,8 @@ rcu_torture_shuffle(void *arg) do { 
schedule_timeout_interruptible(shuffle_interval * HZ); rcu_torture_shuffle_tasks(); - } while (!kthread_should_stop() && !fullstop); + rcutorture_shutdown_absorb("rcu_torture_shuffle"); + } while (!kthread_should_stop()); VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping"); return 0; } @@ -939,10 +961,11 @@ rcu_torture_stutter(void *arg) do { schedule_timeout_interruptible(stutter * HZ); stutter_pause_test = 1; - if (!kthread_should_stop() && !fullstop) + if (!kthread_should_stop()) schedule_timeout_interruptible(stutter * HZ); stutter_pause_test = 0; - } while (!kthread_should_stop() && !fullstop); + rcutorture_shutdown_absorb("rcu_torture_stutter"); + } while (!kthread_should_stop()); VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping"); return 0; } @@ -969,15 +992,16 @@ rcu_torture_cleanup(void) int i; mutex_lock(&fullstop_mutex); - if (!fullstop) { - /* If being signaled, let it happen, then exit. */ + if (fullstop == FULLSTOP_SHUTDOWN) { + printk(KERN_WARNING /* but going down anyway, so... */ + "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); mutex_unlock(&fullstop_mutex); - schedule_timeout_interruptible(10 * HZ); + schedule_timeout_uninterruptible(10); if (cur_ops->cb_barrier != NULL) cur_ops->cb_barrier(); return; } - fullstop = FULLSTOP_CLEANUP; + fullstop = FULLSTOP_RMMOD; mutex_unlock(&fullstop_mutex); unregister_reboot_notifier(&rcutorture_nb); if (stutter_task) { @@ -1077,7 +1101,7 @@ rcu_torture_init(void) else nrealreaders = 2 * num_online_cpus(); rcu_torture_print_module_parms("Start of test"); - fullstop = 0; + fullstop = FULLSTOP_DONTSTOP; /* Set up the freelist. */ diff --git a/kernel/rcutree.c b/kernel/rcutree.c index a342b03..f2d8638 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -79,7 +79,10 @@ struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); #ifdef CONFIG_NO_HZ -DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks); +DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { + .dynticks_nesting = 1, + .dynticks = 1, +}; #endif /* #ifdef CONFIG_NO_HZ */ static int blimit = 10; /* Maximum callbacks per softirq. */ @@ -572,6 +575,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) /* Special-case the common single-level case. */ if (NUM_RCU_NODES == 1) { rnp->qsmask = rnp->qsmaskinit; + rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ spin_unlock_irqrestore(&rnp->lock, flags); return; } @@ -1379,13 +1383,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) static void __cpuinit rcu_online_cpu(int cpu) { -#ifdef CONFIG_NO_HZ - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - - rdtp->dynticks_nesting = 1; - rdtp->dynticks |= 1; /* need consecutive #s even for hotplug. 
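For the rcutorture shutdown handling above: rcutorture_shutdown_notify() is a reboot-notifier callback, so it has to be hooked up through a notifier_block. The registration sketch below is the standard reboot-notifier pattern with a hypothetical block name, not code copied from this patch.

static struct notifier_block demo_nb = {
	.notifier_call = rcutorture_shutdown_notify,
};

static int __init demo_register(void)
{
	/* called once at module init; the module's exit path unregisters it */
	return register_reboot_notifier(&demo_nb);
}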
*/ - rdtp->dynticks_nmi = (rdtp->dynticks_nmi + 1) & ~0x1; -#endif /* #ifdef CONFIG_NO_HZ */ rcu_init_percpu_data(cpu, &rcu_state); rcu_init_percpu_data(cpu, &rcu_bh_state); open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); diff --git a/kernel/res_counter.c b/kernel/res_counter.c index f275c8e..bf8e753 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -15,10 +15,11 @@ #include <linux/uaccess.h> #include <linux/mm.h> -void res_counter_init(struct res_counter *counter) +void res_counter_init(struct res_counter *counter, struct res_counter *parent) { spin_lock_init(&counter->lock); counter->limit = (unsigned long long)LLONG_MAX; + counter->parent = parent; } int res_counter_charge_locked(struct res_counter *counter, unsigned long val) @@ -34,14 +35,34 @@ int res_counter_charge_locked(struct res_counter *counter, unsigned long val) return 0; } -int res_counter_charge(struct res_counter *counter, unsigned long val) +int res_counter_charge(struct res_counter *counter, unsigned long val, + struct res_counter **limit_fail_at) { int ret; unsigned long flags; - - spin_lock_irqsave(&counter->lock, flags); - ret = res_counter_charge_locked(counter, val); - spin_unlock_irqrestore(&counter->lock, flags); + struct res_counter *c, *u; + + *limit_fail_at = NULL; + local_irq_save(flags); + for (c = counter; c != NULL; c = c->parent) { + spin_lock(&c->lock); + ret = res_counter_charge_locked(c, val); + spin_unlock(&c->lock); + if (ret < 0) { + *limit_fail_at = c; + goto undo; + } + } + ret = 0; + goto done; +undo: + for (u = counter; u != c; u = u->parent) { + spin_lock(&u->lock); + res_counter_uncharge_locked(u, val); + spin_unlock(&u->lock); + } +done: + local_irq_restore(flags); return ret; } @@ -56,10 +77,15 @@ void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) void res_counter_uncharge(struct res_counter *counter, unsigned long val) { unsigned long flags; + struct res_counter *c; - spin_lock_irqsave(&counter->lock, flags); - res_counter_uncharge_locked(counter, val); - spin_unlock_irqrestore(&counter->lock, flags); + local_irq_save(flags); + for (c = counter; c != NULL; c = c->parent) { + spin_lock(&c->lock); + res_counter_uncharge_locked(c, val); + spin_unlock(&c->lock); + } + local_irq_restore(flags); } diff --git a/kernel/resource.c b/kernel/resource.c index e633106..ca6a153 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -623,7 +623,7 @@ resource_size_t resource_alignment(struct resource *res) */ struct resource * __request_region(struct resource *parent, resource_size_t start, resource_size_t n, - const char *name) + const char *name, int flags) { struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); @@ -634,6 +634,7 @@ struct resource * __request_region(struct resource *parent, res->start = start; res->end = start + n - 1; res->flags = IORESOURCE_BUSY; + res->flags |= flags; write_lock(&resource_lock); @@ -679,7 +680,7 @@ int __check_region(struct resource *parent, resource_size_t start, { struct resource * res; - res = __request_region(parent, start, n, "check-region"); + res = __request_region(parent, start, n, "check-region", 0); if (!res) return -EBUSY; @@ -776,7 +777,7 @@ struct resource * __devm_request_region(struct device *dev, dr->start = start; dr->n = n; - res = __request_region(parent, start, n, name); + res = __request_region(parent, start, n, name, 0); if (res) devres_add(dev, dr); else @@ -876,3 +877,57 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size) return err; } + +#ifdef CONFIG_STRICT_DEVMEM +static 
int strict_iomem_checks = 1; +#else +static int strict_iomem_checks; +#endif + +/* + * check if an address is reserved in the iomem resource tree + * returns 1 if reserved, 0 if not reserved. + */ +int iomem_is_exclusive(u64 addr) +{ + struct resource *p = &iomem_resource; + int err = 0; + loff_t l; + int size = PAGE_SIZE; + + if (!strict_iomem_checks) + return 0; + + addr = addr & PAGE_MASK; + + read_lock(&resource_lock); + for (p = p->child; p ; p = r_next(NULL, p, &l)) { + /* + * We can probably skip the resources without + * IORESOURCE_IO attribute? + */ + if (p->start >= addr + size) + break; + if (p->end < addr) + continue; + if (p->flags & IORESOURCE_BUSY && + p->flags & IORESOURCE_EXCLUSIVE) { + err = 1; + break; + } + } + read_unlock(&resource_lock); + + return err; +} + +static int __init strict_iomem(char *str) +{ + if (strstr(str, "relaxed")) + strict_iomem_checks = 0; + if (strstr(str, "strict")) + strict_iomem_checks = 1; + return 1; +} + +__setup("iomem=", strict_iomem); diff --git a/kernel/sched.c b/kernel/sched.c index c731dd8..1d290906 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -125,6 +125,9 @@ DEFINE_TRACE(sched_switch); DEFINE_TRACE(sched_migrate_task); #ifdef CONFIG_SMP + +static void double_rq_lock(struct rq *rq1, struct rq *rq2); + /* * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) * Since cpu_power is a 'constant', we can use a reciprocal divide. @@ -498,18 +501,26 @@ struct rt_rq { */ struct root_domain { atomic_t refcount; - cpumask_t span; - cpumask_t online; + cpumask_var_t span; + cpumask_var_t online; /* * The "RT overload" flag: it gets set if a CPU has more than * one runnable RT task. */ - cpumask_t rto_mask; + cpumask_var_t rto_mask; atomic_t rto_count; #ifdef CONFIG_SMP struct cpupri cpupri; #endif +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) + /* + * Preferred wake up cpu nominated by sched_mc balance that will be + * used when most cpus are idle in the system indicating overall very + * low system utilisation. 
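A usage sketch for the hierarchical res_counter API introduced above. The two counters, the wrapper function and the PAGE_SIZE charge are invented for illustration; the rollback behaviour described in the comments follows the res_counter_charge() implementation in this patch.

static struct res_counter parent_cnt, child_cnt;

static int demo_charge_page(void)
{
	struct res_counter *fail;
	int ret;

	res_counter_init(&parent_cnt, NULL);		/* root of the hierarchy */
	res_counter_init(&child_cnt, &parent_cnt);	/* charges propagate upward */

	ret = res_counter_charge(&child_cnt, PAGE_SIZE, &fail);
	if (ret) {
		/*
		 * 'fail' points at the ancestor whose limit was hit; every
		 * counter charged before it has already been rolled back.
		 */
		return ret;
	}

	/* ... use the resource ... */

	res_counter_uncharge(&child_cnt, PAGE_SIZE);	/* walks up to the root too */
	return 0;
}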
Triggered at POWERSAVINGS_BALANCE_WAKEUP(2) + */ + unsigned int sched_mc_preferred_wakeup_cpu; +#endif }; /* @@ -1514,7 +1525,7 @@ static int tg_shares_up(struct task_group *tg, void *data) struct sched_domain *sd = data; int i; - for_each_cpu_mask(i, sd->span) { + for_each_cpu(i, sched_domain_span(sd)) { /* * If there are currently no tasks on the cpu pretend there * is one of average load so that when a new task gets to @@ -1535,7 +1546,7 @@ static int tg_shares_up(struct task_group *tg, void *data) if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) shares = tg->shares; - for_each_cpu_mask(i, sd->span) + for_each_cpu(i, sched_domain_span(sd)) update_group_shares_cpu(tg, i, shares, rq_weight); return 0; @@ -2101,15 +2112,17 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) int i; /* Skip over this group if it has no CPUs allowed */ - if (!cpus_intersects(group->cpumask, p->cpus_allowed)) + if (!cpumask_intersects(sched_group_cpus(group), + &p->cpus_allowed)) continue; - local_group = cpu_isset(this_cpu, group->cpumask); + local_group = cpumask_test_cpu(this_cpu, + sched_group_cpus(group)); /* Tally up the load of all CPUs in the group */ avg_load = 0; - for_each_cpu_mask_nr(i, group->cpumask) { + for_each_cpu(i, sched_group_cpus(group)) { /* Bias balancing toward cpus of our domain */ if (local_group) load = source_load(i, load_idx); @@ -2141,17 +2154,14 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) * find_idlest_cpu - find the idlest cpu among the cpus in group. */ static int -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu, - cpumask_t *tmp) +find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) { unsigned long load, min_load = ULONG_MAX; int idlest = -1; int i; /* Traverse only the allowed CPUs */ - cpus_and(*tmp, group->cpumask, p->cpus_allowed); - - for_each_cpu_mask_nr(i, *tmp) { + for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { load = weighted_cpuload(i); if (load < min_load || (load == min_load && i == this_cpu)) { @@ -2193,7 +2203,6 @@ static int sched_balance_self(int cpu, int flag) update_shares(sd); while (sd) { - cpumask_t span, tmpmask; struct sched_group *group; int new_cpu, weight; @@ -2202,14 +2211,13 @@ static int sched_balance_self(int cpu, int flag) continue; } - span = sd->span; group = find_idlest_group(sd, t, cpu); if (!group) { sd = sd->child; continue; } - new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask); + new_cpu = find_idlest_cpu(group, t, cpu); if (new_cpu == -1 || new_cpu == cpu) { /* Now try balancing at a lower domain level of cpu */ sd = sd->child; @@ -2218,10 +2226,10 @@ static int sched_balance_self(int cpu, int flag) /* Now try balancing at a lower domain level of new_cpu */ cpu = new_cpu; + weight = cpumask_weight(sched_domain_span(sd)); sd = NULL; - weight = cpus_weight(span); for_each_domain(cpu, tmp) { - if (weight <= cpus_weight(tmp->span)) + if (weight <= cpumask_weight(sched_domain_span(tmp))) break; if (tmp->flags & flag) sd = tmp; @@ -2266,7 +2274,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) cpu = task_cpu(p); for_each_domain(this_cpu, sd) { - if (cpu_isset(cpu, sd->span)) { + if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { update_shares(sd); break; } @@ -2315,7 +2323,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) else { struct sched_domain *sd; for_each_domain(this_cpu, sd) { - if (cpu_isset(cpu, sd->span)) { + if 
(cpumask_test_cpu(cpu, sched_domain_span(sd))) { schedstat_inc(sd, ttwu_wake_remote); break; } @@ -2846,7 +2854,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu) struct rq *rq; rq = task_rq_lock(p, &flags); - if (!cpu_isset(dest_cpu, p->cpus_allowed) + if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) || unlikely(!cpu_active(dest_cpu))) goto out; @@ -2911,7 +2919,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, * 2) cannot be migrated to this CPU due to cpus_allowed, or * 3) are cache-hot on their current CPU. */ - if (!cpu_isset(this_cpu, p->cpus_allowed)) { + if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { schedstat_inc(p, se.nr_failed_migrations_affine); return 0; } @@ -3086,7 +3094,7 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long *imbalance, enum cpu_idle_type idle, - int *sd_idle, const cpumask_t *cpus, int *balance) + int *sd_idle, const struct cpumask *cpus, int *balance) { struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; @@ -3122,10 +3130,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long sum_avg_load_per_task; unsigned long avg_load_per_task; - local_group = cpu_isset(this_cpu, group->cpumask); + local_group = cpumask_test_cpu(this_cpu, + sched_group_cpus(group)); if (local_group) - balance_cpu = first_cpu(group->cpumask); + balance_cpu = cpumask_first(sched_group_cpus(group)); /* Tally up the load of all CPUs in the group */ sum_weighted_load = sum_nr_running = avg_load = 0; @@ -3134,13 +3143,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, max_cpu_load = 0; min_cpu_load = ~0UL; - for_each_cpu_mask_nr(i, group->cpumask) { - struct rq *rq; - - if (!cpu_isset(i, *cpus)) - continue; - - rq = cpu_rq(i); + for_each_cpu_and(i, sched_group_cpus(group), cpus) { + struct rq *rq = cpu_rq(i); if (*sd_idle && rq->nr_running) *sd_idle = 0; @@ -3251,8 +3255,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, */ if ((sum_nr_running < min_nr_running) || (sum_nr_running == min_nr_running && - first_cpu(group->cpumask) < - first_cpu(group_min->cpumask))) { + cpumask_first(sched_group_cpus(group)) > + cpumask_first(sched_group_cpus(group_min)))) { group_min = group; min_nr_running = sum_nr_running; min_load_per_task = sum_weighted_load / @@ -3267,8 +3271,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (sum_nr_running <= group_capacity - 1) { if (sum_nr_running > leader_nr_running || (sum_nr_running == leader_nr_running && - first_cpu(group->cpumask) > - first_cpu(group_leader->cpumask))) { + cpumask_first(sched_group_cpus(group)) < + cpumask_first(sched_group_cpus(group_leader)))) { group_leader = group; leader_nr_running = sum_nr_running; } @@ -3394,6 +3398,10 @@ out_balanced: if (this == group_leader && group_leader != group_min) { *imbalance = min_load_per_task; + if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { + cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = + cpumask_first(sched_group_cpus(group_leader)); + } return group_min; } #endif @@ -3407,16 +3415,16 @@ ret: */ static struct rq * find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, - unsigned long imbalance, const cpumask_t *cpus) + unsigned long imbalance, const struct cpumask *cpus) { struct rq *busiest = NULL, *rq; unsigned long max_load = 0; int i; - 
for_each_cpu_mask_nr(i, group->cpumask) { + for_each_cpu(i, sched_group_cpus(group)) { unsigned long wl; - if (!cpu_isset(i, *cpus)) + if (!cpumask_test_cpu(i, cpus)) continue; rq = cpu_rq(i); @@ -3446,7 +3454,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, */ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, - int *balance, cpumask_t *cpus) + int *balance, struct cpumask *cpus) { int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; struct sched_group *group; @@ -3454,7 +3462,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct rq *busiest; unsigned long flags; - cpus_setall(*cpus); + cpumask_setall(cpus); /* * When power savings policy is enabled for the parent domain, idle @@ -3514,8 +3522,8 @@ redo: /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(all_pinned)) { - cpu_clear(cpu_of(busiest), *cpus); - if (!cpus_empty(*cpus)) + cpumask_clear_cpu(cpu_of(busiest), cpus); + if (!cpumask_empty(cpus)) goto redo; goto out_balanced; } @@ -3532,7 +3540,8 @@ redo: /* don't kick the migration_thread, if the curr * task on busiest cpu can't be moved to this_cpu */ - if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { + if (!cpumask_test_cpu(this_cpu, + &busiest->curr->cpus_allowed)) { spin_unlock_irqrestore(&busiest->lock, flags); all_pinned = 1; goto out_one_pinned; @@ -3607,7 +3616,7 @@ out: */ static int load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, - cpumask_t *cpus) + struct cpumask *cpus) { struct sched_group *group; struct rq *busiest = NULL; @@ -3616,7 +3625,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, int sd_idle = 0; int all_pinned = 0; - cpus_setall(*cpus); + cpumask_setall(cpus); /* * When power savings policy is enabled for the parent domain, idle @@ -3660,17 +3669,76 @@ redo: double_unlock_balance(this_rq, busiest); if (unlikely(all_pinned)) { - cpu_clear(cpu_of(busiest), *cpus); - if (!cpus_empty(*cpus)) + cpumask_clear_cpu(cpu_of(busiest), cpus); + if (!cpumask_empty(cpus)) goto redo; } } if (!ld_moved) { + int active_balance = 0; + schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) return -1; + + if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) + return -1; + + if (sd->nr_balance_failed++ < 2) + return -1; + + /* + * The only task running in a non-idle cpu can be moved to this + * cpu in an attempt to completely freeup the other CPU + * package. The same method used to move task in load_balance() + * have been extended for load_balance_newidle() to speedup + * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2) + * + * The package power saving logic comes from + * find_busiest_group(). If there are no imbalance, then + * f_b_g() will return NULL. However when sched_mc={1,2} then + * f_b_g() will select a group from which a running task may be + * pulled to this cpu in order to make the other package idle. + * If there is no opportunity to make a package idle and if + * there are no imbalance, then f_b_g() will return NULL and no + * action will be taken in load_balance_newidle(). + * + * Under normal task pull operation due to imbalance, there + * will be more than one task in the source run queue and + * move_tasks() will succeed. ld_moved will be true and this + * active balance code will not be triggered. 
+ */ + + /* Lock busiest in correct order while this_rq is held */ + double_lock_balance(this_rq, busiest); + + /* + * don't kick the migration_thread, if the curr + * task on busiest cpu can't be moved to this_cpu + */ + if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { + double_unlock_balance(this_rq, busiest); + all_pinned = 1; + return ld_moved; + } + + if (!busiest->active_balance) { + busiest->active_balance = 1; + busiest->push_cpu = this_cpu; + active_balance = 1; + } + + double_unlock_balance(this_rq, busiest); + /* + * Should not call ttwu while holding a rq->lock + */ + spin_unlock(&this_rq->lock); + if (active_balance) + wake_up_process(busiest->migration_thread); + spin_lock(&this_rq->lock); + } else sd->nr_balance_failed = 0; @@ -3696,7 +3764,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq) struct sched_domain *sd; int pulled_task = 0; unsigned long next_balance = jiffies + HZ; - cpumask_t tmpmask; + cpumask_var_t tmpmask; + + if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC)) + return; for_each_domain(this_cpu, sd) { unsigned long interval; @@ -3707,7 +3778,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) if (sd->flags & SD_BALANCE_NEWIDLE) /* If we've pulled tasks over stop searching: */ pulled_task = load_balance_newidle(this_cpu, this_rq, - sd, &tmpmask); + sd, tmpmask); interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) @@ -3722,6 +3793,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) */ this_rq->next_balance = next_balance; } + free_cpumask_var(tmpmask); } /* @@ -3759,7 +3831,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) /* Search for an sd spanning us and the target CPU. */ for_each_domain(target_cpu, sd) { if ((sd->flags & SD_LOAD_BALANCE) && - cpu_isset(busiest_cpu, sd->span)) + cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) break; } @@ -3778,10 +3850,9 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) #ifdef CONFIG_NO_HZ static struct { atomic_t load_balancer; - cpumask_t cpu_mask; + cpumask_var_t cpu_mask; } nohz ____cacheline_aligned = { .load_balancer = ATOMIC_INIT(-1), - .cpu_mask = CPU_MASK_NONE, }; /* @@ -3809,7 +3880,7 @@ int select_nohz_load_balancer(int stop_tick) int cpu = smp_processor_id(); if (stop_tick) { - cpu_set(cpu, nohz.cpu_mask); + cpumask_set_cpu(cpu, nohz.cpu_mask); cpu_rq(cpu)->in_nohz_recently = 1; /* @@ -3823,7 +3894,7 @@ int select_nohz_load_balancer(int stop_tick) } /* time for ilb owner also to sleep */ - if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) { + if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { if (atomic_read(&nohz.load_balancer) == cpu) atomic_set(&nohz.load_balancer, -1); return 0; @@ -3836,10 +3907,10 @@ int select_nohz_load_balancer(int stop_tick) } else if (atomic_read(&nohz.load_balancer) == cpu) return 1; } else { - if (!cpu_isset(cpu, nohz.cpu_mask)) + if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) return 0; - cpu_clear(cpu, nohz.cpu_mask); + cpumask_clear_cpu(cpu, nohz.cpu_mask); if (atomic_read(&nohz.load_balancer) == cpu) if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) @@ -3867,7 +3938,11 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; int need_serialize; - cpumask_t tmp; + cpumask_var_t tmp; + + /* Fails alloc? Rebalancing probably not a priority right now. 
*/ + if (!alloc_cpumask_var(&tmp, GFP_ATOMIC)) + return; for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) @@ -3892,7 +3967,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) } if (time_after_eq(jiffies, sd->last_balance + interval)) { - if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) { + if (load_balance(cpu, rq, sd, idle, &balance, tmp)) { /* * We've pulled tasks over so either we're no * longer idle, or one of our SMT siblings is @@ -3926,6 +4001,8 @@ out: */ if (likely(update_next_balance)) rq->next_balance = next_balance; + + free_cpumask_var(tmp); } /* @@ -3950,12 +4027,13 @@ static void run_rebalance_domains(struct softirq_action *h) */ if (this_rq->idle_at_tick && atomic_read(&nohz.load_balancer) == this_cpu) { - cpumask_t cpus = nohz.cpu_mask; struct rq *rq; int balance_cpu; - cpu_clear(this_cpu, cpus); - for_each_cpu_mask_nr(balance_cpu, cpus) { + for_each_cpu(balance_cpu, nohz.cpu_mask) { + if (balance_cpu == this_cpu) + continue; + /* * If this cpu gets work to do, stop the load balancing * work being done for other cpus. Next load @@ -3993,7 +4071,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) rq->in_nohz_recently = 0; if (atomic_read(&nohz.load_balancer) == cpu) { - cpu_clear(cpu, nohz.cpu_mask); + cpumask_clear_cpu(cpu, nohz.cpu_mask); atomic_set(&nohz.load_balancer, -1); } @@ -4006,7 +4084,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) * TBD: Traverse the sched domains and nominate * the nearest cpu in the nohz.cpu_mask. */ - int ilb = first_cpu(nohz.cpu_mask); + int ilb = cpumask_first(nohz.cpu_mask); if (ilb < nr_cpu_ids) resched_cpu(ilb); @@ -4018,7 +4096,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) * cpus with ticks stopped, is it time for that to stop? */ if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && - cpus_weight(nohz.cpu_mask) == num_online_cpus()) { + cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { resched_cpu(cpu); return; } @@ -4028,7 +4106,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) * someone else, then no need raise the SCHED_SOFTIRQ */ if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && - cpu_isset(cpu, nohz.cpu_mask)) + cpumask_test_cpu(cpu, nohz.cpu_mask)) return; #endif if (time_after_eq(jiffies, rq->next_balance)) @@ -4080,13 +4158,17 @@ unsigned long long task_delta_exec(struct task_struct *p) * Account user cpu time to a process. * @p: the process that the cpu time gets accounted to * @cputime: the cpu time spent in user space since the last update + * @cputime_scaled: cputime scaled by cpu frequency */ -void account_user_time(struct task_struct *p, cputime_t cputime) +void account_user_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled) { struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; cputime64_t tmp; + /* Add user time to process. */ p->utime = cputime_add(p->utime, cputime); + p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); account_group_user_time(p, cputime); /* Add user time to cpustat. */ @@ -4103,51 +4185,48 @@ void account_user_time(struct task_struct *p, cputime_t cputime) * Account guest cpu time to a process. 
* @p: the process that the cpu time gets accounted to * @cputime: the cpu time spent in virtual machine since the last update + * @cputime_scaled: cputime scaled by cpu frequency */ -static void account_guest_time(struct task_struct *p, cputime_t cputime) +static void account_guest_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled) { cputime64_t tmp; struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; tmp = cputime_to_cputime64(cputime); + /* Add guest time to process. */ p->utime = cputime_add(p->utime, cputime); + p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); account_group_user_time(p, cputime); p->gtime = cputime_add(p->gtime, cputime); + /* Add guest time to cpustat. */ cpustat->user = cputime64_add(cpustat->user, tmp); cpustat->guest = cputime64_add(cpustat->guest, tmp); } /* - * Account scaled user cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in user space since the last update - */ -void account_user_time_scaled(struct task_struct *p, cputime_t cputime) -{ - p->utimescaled = cputime_add(p->utimescaled, cputime); -} - -/* * Account system cpu time to a process. * @p: the process that the cpu time gets accounted to * @hardirq_offset: the offset to subtract from hardirq_count() * @cputime: the cpu time spent in kernel space since the last update + * @cputime_scaled: cputime scaled by cpu frequency */ void account_system_time(struct task_struct *p, int hardirq_offset, - cputime_t cputime) + cputime_t cputime, cputime_t cputime_scaled) { struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - struct rq *rq = this_rq(); cputime64_t tmp; if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { - account_guest_time(p, cputime); + account_guest_time(p, cputime, cputime_scaled); return; } + /* Add system time to process. */ p->stime = cputime_add(p->stime, cputime); + p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); account_group_system_time(p, cputime); /* Add system time to cpustat. */ @@ -4156,49 +4235,85 @@ void account_system_time(struct task_struct *p, int hardirq_offset, cpustat->irq = cputime64_add(cpustat->irq, tmp); else if (softirq_count()) cpustat->softirq = cputime64_add(cpustat->softirq, tmp); - else if (p != rq->idle) - cpustat->system = cputime64_add(cpustat->system, tmp); - else if (atomic_read(&rq->nr_iowait) > 0) - cpustat->iowait = cputime64_add(cpustat->iowait, tmp); else - cpustat->idle = cputime64_add(cpustat->idle, tmp); + cpustat->system = cputime64_add(cpustat->system, tmp); + /* Account for system time used */ acct_update_integrals(p); } /* - * Account scaled system cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @hardirq_offset: the offset to subtract from hardirq_count() - * @cputime: the cpu time spent in kernel space since the last update + * Account for involuntary wait time. + * @steal: the cpu time spent in involuntary wait */ -void account_system_time_scaled(struct task_struct *p, cputime_t cputime) +void account_steal_time(cputime_t cputime) { - p->stimescaled = cputime_add(p->stimescaled, cputime); + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t cputime64 = cputime_to_cputime64(cputime); + + cpustat->steal = cputime64_add(cpustat->steal, cputime64); } /* - * Account for involuntary wait time. - * @p: the process from which the cpu time has been stolen - * @steal: the cpu time spent in involuntary wait + * Account for idle time. 
+ * @cputime: the cpu time spent in idle wait */ -void account_steal_time(struct task_struct *p, cputime_t steal) +void account_idle_time(cputime_t cputime) { struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - cputime64_t tmp = cputime_to_cputime64(steal); + cputime64_t cputime64 = cputime_to_cputime64(cputime); struct rq *rq = this_rq(); - if (p == rq->idle) { - p->stime = cputime_add(p->stime, steal); - if (atomic_read(&rq->nr_iowait) > 0) - cpustat->iowait = cputime64_add(cpustat->iowait, tmp); - else - cpustat->idle = cputime64_add(cpustat->idle, tmp); - } else - cpustat->steal = cputime64_add(cpustat->steal, tmp); + if (atomic_read(&rq->nr_iowait) > 0) + cpustat->iowait = cputime64_add(cpustat->iowait, cputime64); + else + cpustat->idle = cputime64_add(cpustat->idle, cputime64); +} + +#ifndef CONFIG_VIRT_CPU_ACCOUNTING + +/* + * Account a single tick of cpu time. + * @p: the process that the cpu time gets accounted to + * @user_tick: indicates if the tick is a user or a system tick + */ +void account_process_tick(struct task_struct *p, int user_tick) +{ + cputime_t one_jiffy = jiffies_to_cputime(1); + cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy); + struct rq *rq = this_rq(); + + if (user_tick) + account_user_time(p, one_jiffy, one_jiffy_scaled); + else if (p != rq->idle) + account_system_time(p, HARDIRQ_OFFSET, one_jiffy, + one_jiffy_scaled); + else + account_idle_time(one_jiffy); } /* + * Account multiple ticks of steal time. + * @p: the process from which the cpu time has been stolen + * @ticks: number of stolen ticks + */ +void account_steal_ticks(unsigned long ticks) +{ + account_steal_time(jiffies_to_cputime(ticks)); +} + +/* + * Account multiple ticks of idle time. + * @ticks: number of stolen ticks + */ +void account_idle_ticks(unsigned long ticks) +{ + account_idle_time(jiffies_to_cputime(ticks)); +} + +#endif + +/* * Use precise platform statistics if available: */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING @@ -5401,10 +5516,9 @@ out_unlock: return retval; } -long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) +long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) { - cpumask_t cpus_allowed; - cpumask_t new_mask = *in_mask; + cpumask_var_t cpus_allowed, new_mask; struct task_struct *p; int retval; @@ -5426,6 +5540,14 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) get_task_struct(p); read_unlock(&tasklist_lock); + if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { + retval = -ENOMEM; + goto out_put_task; + } + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { + retval = -ENOMEM; + goto out_free_cpus_allowed; + } retval = -EPERM; if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) goto out_unlock; @@ -5434,37 +5556,41 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) if (retval) goto out_unlock; - cpuset_cpus_allowed(p, &cpus_allowed); - cpus_and(new_mask, new_mask, cpus_allowed); + cpuset_cpus_allowed(p, cpus_allowed); + cpumask_and(new_mask, in_mask, cpus_allowed); again: - retval = set_cpus_allowed_ptr(p, &new_mask); + retval = set_cpus_allowed_ptr(p, new_mask); if (!retval) { - cpuset_cpus_allowed(p, &cpus_allowed); - if (!cpus_subset(new_mask, cpus_allowed)) { + cpuset_cpus_allowed(p, cpus_allowed); + if (!cpumask_subset(new_mask, cpus_allowed)) { /* * We must have raced with a concurrent cpuset * update. 
Just reset the cpus_allowed to the * cpuset's cpus_allowed */ - new_mask = cpus_allowed; + cpumask_copy(new_mask, cpus_allowed); goto again; } } out_unlock: + free_cpumask_var(new_mask); +out_free_cpus_allowed: + free_cpumask_var(cpus_allowed); +out_put_task: put_task_struct(p); put_online_cpus(); return retval; } static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, - cpumask_t *new_mask) + struct cpumask *new_mask) { - if (len < sizeof(cpumask_t)) { - memset(new_mask, 0, sizeof(cpumask_t)); - } else if (len > sizeof(cpumask_t)) { - len = sizeof(cpumask_t); - } + if (len < cpumask_size()) + cpumask_clear(new_mask); + else if (len > cpumask_size()) + len = cpumask_size(); + return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; } @@ -5477,17 +5603,20 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, unsigned long __user *user_mask_ptr) { - cpumask_t new_mask; + cpumask_var_t new_mask; int retval; - retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); - if (retval) - return retval; + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) + return -ENOMEM; - return sched_setaffinity(pid, &new_mask); + retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); + if (retval == 0) + retval = sched_setaffinity(pid, new_mask); + free_cpumask_var(new_mask); + return retval; } -long sched_getaffinity(pid_t pid, cpumask_t *mask) +long sched_getaffinity(pid_t pid, struct cpumask *mask) { struct task_struct *p; int retval; @@ -5504,7 +5633,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) if (retval) goto out_unlock; - cpus_and(*mask, p->cpus_allowed, cpu_online_map); + cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); out_unlock: read_unlock(&tasklist_lock); @@ -5523,19 +5652,24 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, unsigned long __user *user_mask_ptr) { int ret; - cpumask_t mask; + cpumask_var_t mask; - if (len < sizeof(cpumask_t)) + if (len < cpumask_size()) return -EINVAL; - ret = sched_getaffinity(pid, &mask); - if (ret < 0) - return ret; + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; - if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) - return -EFAULT; + ret = sched_getaffinity(pid, mask); + if (ret == 0) { + if (copy_to_user(user_mask_ptr, mask, cpumask_size())) + ret = -EFAULT; + else + ret = cpumask_size(); + } + free_cpumask_var(mask); - return sizeof(cpumask_t); + return ret; } /** @@ -5872,7 +6006,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) idle->se.exec_start = sched_clock(); idle->prio = idle->normal_prio = MAX_PRIO; - idle->cpus_allowed = cpumask_of_cpu(cpu); + cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); __set_task_cpu(idle, cpu); rq->curr = rq->idle = idle; @@ -5899,9 +6033,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) * indicates which cpus entered this state. This is used * in the rcu update to wait only for active cpus. For system * which do not switch off the HZ timer nohz_cpu_mask should - * always be CPU_MASK_NONE. + * always be CPU_BITS_NONE. */ -cpumask_t nohz_cpu_mask = CPU_MASK_NONE; +cpumask_var_t nohz_cpu_mask; /* * Increase the granularity value when there are more CPUs, @@ -5956,7 +6090,7 @@ static inline void sched_init_granularity(void) * task must not exit() & deallocate itself prematurely. The * call is not atomic; no spinlocks may be held. 
*/ -int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) +int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) { struct migration_req req; unsigned long flags; @@ -5964,13 +6098,13 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) int ret = 0; rq = task_rq_lock(p, &flags); - if (!cpus_intersects(*new_mask, cpu_online_map)) { + if (!cpumask_intersects(new_mask, cpu_online_mask)) { ret = -EINVAL; goto out; } if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && - !cpus_equal(p->cpus_allowed, *new_mask))) { + !cpumask_equal(&p->cpus_allowed, new_mask))) { ret = -EINVAL; goto out; } @@ -5978,15 +6112,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) if (p->sched_class->set_cpus_allowed) p->sched_class->set_cpus_allowed(p, new_mask); else { - p->cpus_allowed = *new_mask; - p->rt.nr_cpus_allowed = cpus_weight(*new_mask); + cpumask_copy(&p->cpus_allowed, new_mask); + p->rt.nr_cpus_allowed = cpumask_weight(new_mask); } /* Can the task run on the task's current CPU? If so, we're done */ - if (cpu_isset(task_cpu(p), *new_mask)) + if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; - if (migrate_task(p, any_online_cpu(*new_mask), &req)) { + if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { /* Need help from migration thread: drop lock and wait. */ task_rq_unlock(rq, &flags); wake_up_process(rq->migration_thread); @@ -6028,7 +6162,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) if (task_cpu(p) != src_cpu) goto done; /* Affinity changed (again). */ - if (!cpu_isset(dest_cpu, p->cpus_allowed)) + if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) goto fail; on_rq = p->se.on_rq; @@ -6125,50 +6259,41 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu) */ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) { - unsigned long flags; - cpumask_t mask; - struct rq *rq; int dest_cpu; + const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu)); - do { - /* On same node? */ - mask = node_to_cpumask(cpu_to_node(dead_cpu)); - cpus_and(mask, mask, p->cpus_allowed); - dest_cpu = any_online_cpu(mask); +again: + /* Look for allowed, online CPU in same node. */ + for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask) + if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) + goto move; - /* On any allowed CPU? */ - if (dest_cpu >= nr_cpu_ids) - dest_cpu = any_online_cpu(p->cpus_allowed); + /* Any allowed, online CPU? */ + dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask); + if (dest_cpu < nr_cpu_ids) + goto move; - /* No more Mr. Nice Guy. */ - if (dest_cpu >= nr_cpu_ids) { - cpumask_t cpus_allowed; - - cpuset_cpus_allowed_locked(p, &cpus_allowed); - /* - * Try to stay on the same cpuset, where the - * current cpuset may be a subset of all cpus. - * The cpuset_cpus_allowed_locked() variant of - * cpuset_cpus_allowed() will not block. It must be - * called within calls to cpuset_lock/cpuset_unlock. - */ - rq = task_rq_lock(p, &flags); - p->cpus_allowed = cpus_allowed; - dest_cpu = any_online_cpu(p->cpus_allowed); - task_rq_unlock(rq, &flags); + /* No more Mr. Nice Guy. */ + if (dest_cpu >= nr_cpu_ids) { + cpuset_cpus_allowed_locked(p, &p->cpus_allowed); + dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed); - /* - * Don't tell them about moving exiting tasks or - * kernel threads (both mm NULL), since they never - * leave kernel. 
- */ - if (p->mm && printk_ratelimit()) { - printk(KERN_INFO "process %d (%s) no " - "longer affine to cpu%d\n", - task_pid_nr(p), p->comm, dead_cpu); - } + /* + * Don't tell them about moving exiting tasks or + * kernel threads (both mm NULL), since they never + * leave kernel. + */ + if (p->mm && printk_ratelimit()) { + printk(KERN_INFO "process %d (%s) no " + "longer affine to cpu%d\n", + task_pid_nr(p), p->comm, dead_cpu); } - } while (!__migrate_task_irq(p, dead_cpu, dest_cpu)); + } + +move: + /* It can have affinity changed while we were choosing. */ + if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) + goto again; } /* @@ -6180,7 +6305,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) */ static void migrate_nr_uninterruptible(struct rq *rq_src) { - struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR)); + struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask)); unsigned long flags; local_irq_save(flags); @@ -6470,7 +6595,7 @@ static void set_rq_online(struct rq *rq) if (!rq->online) { const struct sched_class *class; - cpu_set(rq->cpu, rq->rd->online); + cpumask_set_cpu(rq->cpu, rq->rd->online); rq->online = 1; for_each_class(class) { @@ -6490,7 +6615,7 @@ static void set_rq_offline(struct rq *rq) class->rq_offline(rq); } - cpu_clear(rq->cpu, rq->rd->online); + cpumask_clear_cpu(rq->cpu, rq->rd->online); rq->online = 0; } } @@ -6531,7 +6656,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) rq = cpu_rq(cpu); spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { - BUG_ON(!cpu_isset(cpu, rq->rd->span)); + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_online(rq); } @@ -6545,7 +6670,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) break; /* Unbind it from offline cpu so it can run. Fall thru. 
*/ kthread_bind(cpu_rq(cpu)->migration_thread, - any_online_cpu(cpu_online_map)); + cpumask_any(cpu_online_mask)); kthread_stop(cpu_rq(cpu)->migration_thread); cpu_rq(cpu)->migration_thread = NULL; break; @@ -6595,7 +6720,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) rq = cpu_rq(cpu); spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { - BUG_ON(!cpu_isset(cpu, rq->rd->span)); + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } spin_unlock_irqrestore(&rq->lock, flags); @@ -6634,13 +6759,13 @@ early_initcall(migration_init); #ifdef CONFIG_SCHED_DEBUG static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, - cpumask_t *groupmask) + struct cpumask *groupmask) { struct sched_group *group = sd->groups; char str[256]; - cpulist_scnprintf(str, sizeof(str), sd->span); - cpus_clear(*groupmask); + cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd)); + cpumask_clear(groupmask); printk(KERN_DEBUG "%*s domain %d: ", level, "", level); @@ -6654,11 +6779,11 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, printk(KERN_CONT "span %s level %s\n", str, sd->name); - if (!cpu_isset(cpu, sd->span)) { + if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { printk(KERN_ERR "ERROR: domain->span does not contain " "CPU%d\n", cpu); } - if (!cpu_isset(cpu, group->cpumask)) { + if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { printk(KERN_ERR "ERROR: domain->groups does not contain" " CPU%d\n", cpu); } @@ -6678,31 +6803,32 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - if (!cpus_weight(group->cpumask)) { + if (!cpumask_weight(sched_group_cpus(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: empty group\n"); break; } - if (cpus_intersects(*groupmask, group->cpumask)) { + if (cpumask_intersects(groupmask, sched_group_cpus(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: repeated CPUs\n"); break; } - cpus_or(*groupmask, *groupmask, group->cpumask); + cpumask_or(groupmask, groupmask, sched_group_cpus(group)); - cpulist_scnprintf(str, sizeof(str), group->cpumask); + cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); printk(KERN_CONT " %s", str); group = group->next; } while (group != sd->groups); printk(KERN_CONT "\n"); - if (!cpus_equal(sd->span, *groupmask)) + if (!cpumask_equal(sched_domain_span(sd), groupmask)) printk(KERN_ERR "ERROR: groups don't span domain->span\n"); - if (sd->parent && !cpus_subset(*groupmask, sd->parent->span)) + if (sd->parent && + !cpumask_subset(groupmask, sched_domain_span(sd->parent))) printk(KERN_ERR "ERROR: parent span is not a superset " "of domain->span\n"); return 0; @@ -6710,7 +6836,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, static void sched_domain_debug(struct sched_domain *sd, int cpu) { - cpumask_t *groupmask; + cpumask_var_t groupmask; int level = 0; if (!sd) { @@ -6720,8 +6846,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); - groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL); - if (!groupmask) { + if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) { printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); return; } @@ -6734,7 +6859,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) if (!sd) break; } - kfree(groupmask); + free_cpumask_var(groupmask); } #else /* !CONFIG_SCHED_DEBUG */ # define sched_domain_debug(sd, cpu) do { } while (0) @@ -6742,7 
+6867,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) static int sd_degenerate(struct sched_domain *sd) { - if (cpus_weight(sd->span) == 1) + if (cpumask_weight(sched_domain_span(sd)) == 1) return 1; /* Following flags need at least 2 groups */ @@ -6773,7 +6898,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) if (sd_degenerate(parent)) return 1; - if (!cpus_equal(sd->span, parent->span)) + if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) return 0; /* Does parent contain flags not in child? */ @@ -6797,6 +6922,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) return 1; } +static void free_rootdomain(struct root_domain *rd) +{ + cpupri_cleanup(&rd->cpupri); + + free_cpumask_var(rd->rto_mask); + free_cpumask_var(rd->online); + free_cpumask_var(rd->span); + kfree(rd); +} + static void rq_attach_root(struct rq *rq, struct root_domain *rd) { unsigned long flags; @@ -6806,38 +6941,62 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) if (rq->rd) { struct root_domain *old_rd = rq->rd; - if (cpu_isset(rq->cpu, old_rd->online)) + if (cpumask_test_cpu(rq->cpu, old_rd->online)) set_rq_offline(rq); - cpu_clear(rq->cpu, old_rd->span); + cpumask_clear_cpu(rq->cpu, old_rd->span); if (atomic_dec_and_test(&old_rd->refcount)) - kfree(old_rd); + free_rootdomain(old_rd); } atomic_inc(&rd->refcount); rq->rd = rd; - cpu_set(rq->cpu, rd->span); - if (cpu_isset(rq->cpu, cpu_online_map)) + cpumask_set_cpu(rq->cpu, rd->span); + if (cpumask_test_cpu(rq->cpu, cpu_online_mask)) set_rq_online(rq); spin_unlock_irqrestore(&rq->lock, flags); } -static void init_rootdomain(struct root_domain *rd) +static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem) { memset(rd, 0, sizeof(*rd)); - cpus_clear(rd->span); - cpus_clear(rd->online); + if (bootmem) { + alloc_bootmem_cpumask_var(&def_root_domain.span); + alloc_bootmem_cpumask_var(&def_root_domain.online); + alloc_bootmem_cpumask_var(&def_root_domain.rto_mask); + cpupri_init(&rd->cpupri, true); + return 0; + } + + if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) + goto out; + if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) + goto free_span; + if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) + goto free_online; - cpupri_init(&rd->cpupri); + if (cpupri_init(&rd->cpupri, false) != 0) + goto free_rto_mask; + return 0; + +free_rto_mask: + free_cpumask_var(rd->rto_mask); +free_online: + free_cpumask_var(rd->online); +free_span: + free_cpumask_var(rd->span); +out: + return -ENOMEM; } static void init_defrootdomain(void) { - init_rootdomain(&def_root_domain); + init_rootdomain(&def_root_domain, true); + atomic_set(&def_root_domain.refcount, 1); } @@ -6849,7 +7008,10 @@ static struct root_domain *alloc_rootdomain(void) if (!rd) return NULL; - init_rootdomain(rd); + if (init_rootdomain(rd, false) != 0) { + kfree(rd); + return NULL; + } return rd; } @@ -6891,19 +7053,12 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) } /* cpus with isolated domains */ -static cpumask_t cpu_isolated_map = CPU_MASK_NONE; +static cpumask_var_t cpu_isolated_map; /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) { - static int __initdata ints[NR_CPUS]; - int i; - - str = get_options(str, ARRAY_SIZE(ints), ints); - cpus_clear(cpu_isolated_map); - for (i = 1; i <= ints[0]; i++) - if (ints[i] < NR_CPUS) - cpu_set(ints[i], cpu_isolated_map); + cpulist_parse(str, cpu_isolated_map); 
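
The isolated_cpu_setup() hunk just above is a small, self-contained instance of the conversion this series applies everywhere: the fixed NR_CPUS-sized cpumask_t and the hand-rolled get_options() loop become a cpumask_var_t parsed with cpulist_parse(). A minimal sketch of the same shape, using only helpers that appear in this patch; the demo_* names are illustrative, and the mask is assumed to have been allocated beforehand (the patch does that for cpu_isolated_map with alloc_bootmem_cpumask_var() in sched_init()):

#include <linux/cpumask.h>
#include <linux/init.h>

/* a struct cpumask * when CONFIG_CPUMASK_OFFSTACK=y, a one-element array otherwise */
static cpumask_var_t demo_isolated_map;

static int __init demo_isolcpus_setup(char *str)
{
        /* e.g. str = "1,3-5"; writes the parsed CPU list into the mask */
        cpulist_parse(str, demo_isolated_map);
        return 1;
}
__setup("demo_isolcpus=", demo_isolcpus_setup);
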
return 1; } @@ -6912,42 +7067,43 @@ __setup("isolcpus=", isolated_cpu_setup); /* * init_sched_build_groups takes the cpumask we wish to span, and a pointer * to a function which identifies what group(along with sched group) a CPU - * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS - * (due to the fact that we keep track of groups covered with a cpumask_t). + * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids + * (due to the fact that we keep track of groups covered with a struct cpumask). * * init_sched_build_groups will build a circular linked list of the groups * covered by the given span, and will set each group's ->cpumask correctly, * and ->cpu_power to 0. */ static void -init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, - int (*group_fn)(int cpu, const cpumask_t *cpu_map, +init_sched_build_groups(const struct cpumask *span, + const struct cpumask *cpu_map, + int (*group_fn)(int cpu, const struct cpumask *cpu_map, struct sched_group **sg, - cpumask_t *tmpmask), - cpumask_t *covered, cpumask_t *tmpmask) + struct cpumask *tmpmask), + struct cpumask *covered, struct cpumask *tmpmask) { struct sched_group *first = NULL, *last = NULL; int i; - cpus_clear(*covered); + cpumask_clear(covered); - for_each_cpu_mask_nr(i, *span) { + for_each_cpu(i, span) { struct sched_group *sg; int group = group_fn(i, cpu_map, &sg, tmpmask); int j; - if (cpu_isset(i, *covered)) + if (cpumask_test_cpu(i, covered)) continue; - cpus_clear(sg->cpumask); + cpumask_clear(sched_group_cpus(sg)); sg->__cpu_power = 0; - for_each_cpu_mask_nr(j, *span) { + for_each_cpu(j, span) { if (group_fn(j, cpu_map, NULL, tmpmask) != group) continue; - cpu_set(j, *covered); - cpu_set(j, sg->cpumask); + cpumask_set_cpu(j, covered); + cpumask_set_cpu(j, sched_group_cpus(sg)); } if (!first) first = sg; @@ -7011,23 +7167,21 @@ static int find_next_best_node(int node, nodemask_t *used_nodes) * should be one that prevents unnecessary balancing, but also spreads tasks * out optimally. */ -static void sched_domain_node_span(int node, cpumask_t *span) +static void sched_domain_node_span(int node, struct cpumask *span) { nodemask_t used_nodes; - node_to_cpumask_ptr(nodemask, node); int i; - cpus_clear(*span); + cpumask_clear(span); nodes_clear(used_nodes); - cpus_or(*span, *span, *nodemask); + cpumask_or(span, span, cpumask_of_node(node)); node_set(node, used_nodes); for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { int next_node = find_next_best_node(node, &used_nodes); - node_to_cpumask_ptr_next(nodemask, next_node); - cpus_or(*span, *span, *nodemask); + cpumask_or(span, span, cpumask_of_node(next_node)); } } #endif /* CONFIG_NUMA */ @@ -7035,18 +7189,33 @@ static void sched_domain_node_span(int node, cpumask_t *span) int sched_smt_power_savings = 0, sched_mc_power_savings = 0; /* + * The cpus mask in sched_group and sched_domain hangs off the end. + * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space + * for nr_cpu_ids < CONFIG_NR_CPUS. 
+ */ +struct static_sched_group { + struct sched_group sg; + DECLARE_BITMAP(cpus, CONFIG_NR_CPUS); +}; + +struct static_sched_domain { + struct sched_domain sd; + DECLARE_BITMAP(span, CONFIG_NR_CPUS); +}; + +/* * SMT sched-domains: */ #ifdef CONFIG_SCHED_SMT -static DEFINE_PER_CPU(struct sched_domain, cpu_domains); -static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); +static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); +static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus); static int -cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, - cpumask_t *unused) +cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *unused) { if (sg) - *sg = &per_cpu(sched_group_cpus, cpu); + *sg = &per_cpu(sched_group_cpus, cpu).sg; return cpu; } #endif /* CONFIG_SCHED_SMT */ @@ -7055,56 +7224,53 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, * multi-core sched-domains: */ #ifdef CONFIG_SCHED_MC -static DEFINE_PER_CPU(struct sched_domain, core_domains); -static DEFINE_PER_CPU(struct sched_group, sched_group_core); +static DEFINE_PER_CPU(struct static_sched_domain, core_domains); +static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); #endif /* CONFIG_SCHED_MC */ #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) static int -cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, - cpumask_t *mask) +cpu_to_core_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *mask) { int group; - *mask = per_cpu(cpu_sibling_map, cpu); - cpus_and(*mask, *mask, *cpu_map); - group = first_cpu(*mask); + cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); + group = cpumask_first(mask); if (sg) - *sg = &per_cpu(sched_group_core, group); + *sg = &per_cpu(sched_group_core, group).sg; return group; } #elif defined(CONFIG_SCHED_MC) static int -cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, - cpumask_t *unused) +cpu_to_core_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *unused) { if (sg) - *sg = &per_cpu(sched_group_core, cpu); + *sg = &per_cpu(sched_group_core, cpu).sg; return cpu; } #endif -static DEFINE_PER_CPU(struct sched_domain, phys_domains); -static DEFINE_PER_CPU(struct sched_group, sched_group_phys); +static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); +static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); static int -cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, - cpumask_t *mask) +cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *mask) { int group; #ifdef CONFIG_SCHED_MC - *mask = cpu_coregroup_map(cpu); - cpus_and(*mask, *mask, *cpu_map); - group = first_cpu(*mask); + cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); + group = cpumask_first(mask); #elif defined(CONFIG_SCHED_SMT) - *mask = per_cpu(cpu_sibling_map, cpu); - cpus_and(*mask, *mask, *cpu_map); - group = first_cpu(*mask); + cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); + group = cpumask_first(mask); #else group = cpu; #endif if (sg) - *sg = &per_cpu(sched_group_phys, group); + *sg = &per_cpu(sched_group_phys, group).sg; return group; } @@ -7114,23 +7280,23 @@ cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, * groups, so roll our own. Now each node has its own list of groups which * gets dynamically allocated. 
*/ -static DEFINE_PER_CPU(struct sched_domain, node_domains); +static DEFINE_PER_CPU(struct static_sched_domain, node_domains); static struct sched_group ***sched_group_nodes_bycpu; -static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); -static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); +static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains); +static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); -static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, - struct sched_group **sg, cpumask_t *nodemask) +static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, + struct cpumask *nodemask) { int group; - *nodemask = node_to_cpumask(cpu_to_node(cpu)); - cpus_and(*nodemask, *nodemask, *cpu_map); - group = first_cpu(*nodemask); + cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map); + group = cpumask_first(nodemask); if (sg) - *sg = &per_cpu(sched_group_allnodes, group); + *sg = &per_cpu(sched_group_allnodes, group).sg; return group; } @@ -7142,11 +7308,11 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) if (!sg) return; do { - for_each_cpu_mask_nr(j, sg->cpumask) { + for_each_cpu(j, sched_group_cpus(sg)) { struct sched_domain *sd; - sd = &per_cpu(phys_domains, j); - if (j != first_cpu(sd->groups->cpumask)) { + sd = &per_cpu(phys_domains, j).sd; + if (j != cpumask_first(sched_group_cpus(sd->groups))) { /* * Only add "power" once for each * physical package. @@ -7163,11 +7329,12 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) #ifdef CONFIG_NUMA /* Free memory allocated for various sched_group structures */ -static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) +static void free_sched_groups(const struct cpumask *cpu_map, + struct cpumask *nodemask) { int cpu, i; - for_each_cpu_mask_nr(cpu, *cpu_map) { + for_each_cpu(cpu, cpu_map) { struct sched_group **sched_group_nodes = sched_group_nodes_bycpu[cpu]; @@ -7177,9 +7344,8 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) for (i = 0; i < nr_node_ids; i++) { struct sched_group *oldsg, *sg = sched_group_nodes[i]; - *nodemask = node_to_cpumask(i); - cpus_and(*nodemask, *nodemask, *cpu_map); - if (cpus_empty(*nodemask)) + cpumask_and(nodemask, cpumask_of_node(i), cpu_map); + if (cpumask_empty(nodemask)) continue; if (sg == NULL) @@ -7197,7 +7363,8 @@ next_sg: } } #else /* !CONFIG_NUMA */ -static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) +static void free_sched_groups(const struct cpumask *cpu_map, + struct cpumask *nodemask) { } #endif /* CONFIG_NUMA */ @@ -7223,7 +7390,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) WARN_ON(!sd || !sd->groups); - if (cpu != first_cpu(sd->groups->cpumask)) + if (cpu != cpumask_first(sched_group_cpus(sd->groups))) return; child = sd->child; @@ -7288,48 +7455,6 @@ SD_INIT_FUNC(CPU) SD_INIT_FUNC(MC) #endif -/* - * To minimize stack usage kmalloc room for cpumasks and share the - * space as the usage in build_sched_domains() dictates. Used only - * if the amount of space is significant. 
- */ -struct allmasks { - cpumask_t tmpmask; /* make this one first */ - union { - cpumask_t nodemask; - cpumask_t this_sibling_map; - cpumask_t this_core_map; - }; - cpumask_t send_covered; - -#ifdef CONFIG_NUMA - cpumask_t domainspan; - cpumask_t covered; - cpumask_t notcovered; -#endif -}; - -#if NR_CPUS > 128 -#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v -static inline void sched_cpumask_alloc(struct allmasks **masks) -{ - *masks = kmalloc(sizeof(**masks), GFP_KERNEL); -} -static inline void sched_cpumask_free(struct allmasks *masks) -{ - kfree(masks); -} -#else -#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v -static inline void sched_cpumask_alloc(struct allmasks **masks) -{ } -static inline void sched_cpumask_free(struct allmasks *masks) -{ } -#endif - -#define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \ - ((unsigned long)(a) + offsetof(struct allmasks, v)) - static int default_relax_domain_level = -1; static int __init setup_relax_domain_level(char *str) @@ -7369,17 +7494,38 @@ static void set_domain_attribute(struct sched_domain *sd, * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus */ -static int __build_sched_domains(const cpumask_t *cpu_map, +static int __build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr) { - int i; + int i, err = -ENOMEM; struct root_domain *rd; - SCHED_CPUMASK_DECLARE(allmasks); - cpumask_t *tmpmask; + cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered, + tmpmask; #ifdef CONFIG_NUMA + cpumask_var_t domainspan, covered, notcovered; struct sched_group **sched_group_nodes = NULL; int sd_allnodes = 0; + if (!alloc_cpumask_var(&domainspan, GFP_KERNEL)) + goto out; + if (!alloc_cpumask_var(&covered, GFP_KERNEL)) + goto free_domainspan; + if (!alloc_cpumask_var(¬covered, GFP_KERNEL)) + goto free_covered; +#endif + + if (!alloc_cpumask_var(&nodemask, GFP_KERNEL)) + goto free_notcovered; + if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL)) + goto free_nodemask; + if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL)) + goto free_this_sibling_map; + if (!alloc_cpumask_var(&send_covered, GFP_KERNEL)) + goto free_this_core_map; + if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL)) + goto free_send_covered; + +#ifdef CONFIG_NUMA /* * Allocate the per-node list of sched groups */ @@ -7387,75 +7533,57 @@ static int __build_sched_domains(const cpumask_t *cpu_map, GFP_KERNEL); if (!sched_group_nodes) { printk(KERN_WARNING "Can not alloc sched group node list\n"); - return -ENOMEM; + goto free_tmpmask; } #endif rd = alloc_rootdomain(); if (!rd) { printk(KERN_WARNING "Cannot alloc root domain\n"); -#ifdef CONFIG_NUMA - kfree(sched_group_nodes); -#endif - return -ENOMEM; - } - - /* get space for all scratch cpumask variables */ - sched_cpumask_alloc(&allmasks); - if (!allmasks) { - printk(KERN_WARNING "Cannot alloc cpumask array\n"); - kfree(rd); -#ifdef CONFIG_NUMA - kfree(sched_group_nodes); -#endif - return -ENOMEM; + goto free_sched_groups; } - tmpmask = (cpumask_t *)allmasks; - - #ifdef CONFIG_NUMA - sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; + sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes; #endif /* * Set up domains for cpus specified by the cpu_map. 
*/ - for_each_cpu_mask_nr(i, *cpu_map) { + for_each_cpu(i, cpu_map) { struct sched_domain *sd = NULL, *p; - SCHED_CPUMASK_VAR(nodemask, allmasks); - *nodemask = node_to_cpumask(cpu_to_node(i)); - cpus_and(*nodemask, *nodemask, *cpu_map); + cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map); #ifdef CONFIG_NUMA - if (cpus_weight(*cpu_map) > - SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) { - sd = &per_cpu(allnodes_domains, i); + if (cpumask_weight(cpu_map) > + SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) { + sd = &per_cpu(allnodes_domains, i).sd; SD_INIT(sd, ALLNODES); set_domain_attribute(sd, attr); - sd->span = *cpu_map; + cpumask_copy(sched_domain_span(sd), cpu_map); cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); p = sd; sd_allnodes = 1; } else p = NULL; - sd = &per_cpu(node_domains, i); + sd = &per_cpu(node_domains, i).sd; SD_INIT(sd, NODE); set_domain_attribute(sd, attr); - sched_domain_node_span(cpu_to_node(i), &sd->span); + sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); sd->parent = p; if (p) p->child = sd; - cpus_and(sd->span, sd->span, *cpu_map); + cpumask_and(sched_domain_span(sd), + sched_domain_span(sd), cpu_map); #endif p = sd; - sd = &per_cpu(phys_domains, i); + sd = &per_cpu(phys_domains, i).sd; SD_INIT(sd, CPU); set_domain_attribute(sd, attr); - sd->span = *nodemask; + cpumask_copy(sched_domain_span(sd), nodemask); sd->parent = p; if (p) p->child = sd; @@ -7463,11 +7591,11 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #ifdef CONFIG_SCHED_MC p = sd; - sd = &per_cpu(core_domains, i); + sd = &per_cpu(core_domains, i).sd; SD_INIT(sd, MC); set_domain_attribute(sd, attr); - sd->span = cpu_coregroup_map(i); - cpus_and(sd->span, sd->span, *cpu_map); + cpumask_and(sched_domain_span(sd), cpu_map, + cpu_coregroup_mask(i)); sd->parent = p; p->child = sd; cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); @@ -7475,11 +7603,11 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #ifdef CONFIG_SCHED_SMT p = sd; - sd = &per_cpu(cpu_domains, i); + sd = &per_cpu(cpu_domains, i).sd; SD_INIT(sd, SIBLING); set_domain_attribute(sd, attr); - sd->span = per_cpu(cpu_sibling_map, i); - cpus_and(sd->span, sd->span, *cpu_map); + cpumask_and(sched_domain_span(sd), + &per_cpu(cpu_sibling_map, i), cpu_map); sd->parent = p; p->child = sd; cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); @@ -7488,13 +7616,10 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #ifdef CONFIG_SCHED_SMT /* Set up CPU (sibling) groups */ - for_each_cpu_mask_nr(i, *cpu_map) { - SCHED_CPUMASK_VAR(this_sibling_map, allmasks); - SCHED_CPUMASK_VAR(send_covered, allmasks); - - *this_sibling_map = per_cpu(cpu_sibling_map, i); - cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map); - if (i != first_cpu(*this_sibling_map)) + for_each_cpu(i, cpu_map) { + cpumask_and(this_sibling_map, + &per_cpu(cpu_sibling_map, i), cpu_map); + if (i != cpumask_first(this_sibling_map)) continue; init_sched_build_groups(this_sibling_map, cpu_map, @@ -7505,13 +7630,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #ifdef CONFIG_SCHED_MC /* Set up multi-core groups */ - for_each_cpu_mask_nr(i, *cpu_map) { - SCHED_CPUMASK_VAR(this_core_map, allmasks); - SCHED_CPUMASK_VAR(send_covered, allmasks); - - *this_core_map = cpu_coregroup_map(i); - cpus_and(*this_core_map, *this_core_map, *cpu_map); - if (i != first_cpu(*this_core_map)) + for_each_cpu(i, cpu_map) { + cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map); + if (i != cpumask_first(this_core_map)) continue; 
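
The CPU-sibling group setup just above, and the multi-core variant it mirrors, rely on one idiom: every CPU in cpu_map writes its local mask into preallocated scratch storage, and only the numerically first CPU of that mask proceeds, so each group is initialized exactly once and no cpumask ever lands on the stack. A stand-alone sketch of the idiom, restricted to helpers visible in this patch; demo_setup_once_per_group() is a made-up illustration, not part of the change:

#include <linux/cpumask.h>
#include <linux/kernel.h>

static void demo_setup_once_per_group(const struct cpumask *cpu_map,
                                      struct cpumask *scratch)
{
        char buf[64];
        int i;

        for_each_cpu(i, cpu_map) {
                /* CPUs sharing a core with i, restricted to the map being built */
                cpumask_and(scratch, cpu_coregroup_mask(i), cpu_map);

                /* only the first CPU of each group does the work */
                if (i != cpumask_first(scratch))
                        continue;

                cpulist_scnprintf(buf, sizeof(buf), scratch);
                printk(KERN_DEBUG "group led by CPU%d: %s\n", i, buf);
        }
}
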
init_sched_build_groups(this_core_map, cpu_map, @@ -7522,12 +7643,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map, /* Set up physical groups */ for (i = 0; i < nr_node_ids; i++) { - SCHED_CPUMASK_VAR(nodemask, allmasks); - SCHED_CPUMASK_VAR(send_covered, allmasks); - - *nodemask = node_to_cpumask(i); - cpus_and(*nodemask, *nodemask, *cpu_map); - if (cpus_empty(*nodemask)) + cpumask_and(nodemask, cpumask_of_node(i), cpu_map); + if (cpumask_empty(nodemask)) continue; init_sched_build_groups(nodemask, cpu_map, @@ -7538,8 +7655,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #ifdef CONFIG_NUMA /* Set up node groups */ if (sd_allnodes) { - SCHED_CPUMASK_VAR(send_covered, allmasks); - init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, send_covered, tmpmask); @@ -7548,58 +7663,53 @@ static int __build_sched_domains(const cpumask_t *cpu_map, for (i = 0; i < nr_node_ids; i++) { /* Set up node groups */ struct sched_group *sg, *prev; - SCHED_CPUMASK_VAR(nodemask, allmasks); - SCHED_CPUMASK_VAR(domainspan, allmasks); - SCHED_CPUMASK_VAR(covered, allmasks); int j; - *nodemask = node_to_cpumask(i); - cpus_clear(*covered); - - cpus_and(*nodemask, *nodemask, *cpu_map); - if (cpus_empty(*nodemask)) { + cpumask_clear(covered); + cpumask_and(nodemask, cpumask_of_node(i), cpu_map); + if (cpumask_empty(nodemask)) { sched_group_nodes[i] = NULL; continue; } sched_domain_node_span(i, domainspan); - cpus_and(*domainspan, *domainspan, *cpu_map); + cpumask_and(domainspan, domainspan, cpu_map); - sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); + sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, i); if (!sg) { printk(KERN_WARNING "Can not alloc domain group for " "node %d\n", i); goto error; } sched_group_nodes[i] = sg; - for_each_cpu_mask_nr(j, *nodemask) { + for_each_cpu(j, nodemask) { struct sched_domain *sd; - sd = &per_cpu(node_domains, j); + sd = &per_cpu(node_domains, j).sd; sd->groups = sg; } sg->__cpu_power = 0; - sg->cpumask = *nodemask; + cpumask_copy(sched_group_cpus(sg), nodemask); sg->next = sg; - cpus_or(*covered, *covered, *nodemask); + cpumask_or(covered, covered, nodemask); prev = sg; for (j = 0; j < nr_node_ids; j++) { - SCHED_CPUMASK_VAR(notcovered, allmasks); int n = (i + j) % nr_node_ids; - node_to_cpumask_ptr(pnodemask, n); - cpus_complement(*notcovered, *covered); - cpus_and(*tmpmask, *notcovered, *cpu_map); - cpus_and(*tmpmask, *tmpmask, *domainspan); - if (cpus_empty(*tmpmask)) + cpumask_complement(notcovered, covered); + cpumask_and(tmpmask, notcovered, cpu_map); + cpumask_and(tmpmask, tmpmask, domainspan); + if (cpumask_empty(tmpmask)) break; - cpus_and(*tmpmask, *tmpmask, *pnodemask); - if (cpus_empty(*tmpmask)) + cpumask_and(tmpmask, tmpmask, cpumask_of_node(n)); + if (cpumask_empty(tmpmask)) continue; - sg = kmalloc_node(sizeof(struct sched_group), + sg = kmalloc_node(sizeof(struct sched_group) + + cpumask_size(), GFP_KERNEL, i); if (!sg) { printk(KERN_WARNING @@ -7607,9 +7717,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map, goto error; } sg->__cpu_power = 0; - sg->cpumask = *tmpmask; + cpumask_copy(sched_group_cpus(sg), tmpmask); sg->next = prev->next; - cpus_or(*covered, *covered, *tmpmask); + cpumask_or(covered, covered, tmpmask); prev->next = sg; prev = sg; } @@ -7618,22 +7728,22 @@ static int __build_sched_domains(const cpumask_t *cpu_map, /* Calculate CPU power for physical packages and nodes */ #ifdef CONFIG_SCHED_SMT - for_each_cpu_mask_nr(i, *cpu_map) { - struct sched_domain 
*sd = &per_cpu(cpu_domains, i); + for_each_cpu(i, cpu_map) { + struct sched_domain *sd = &per_cpu(cpu_domains, i).sd; init_sched_groups_power(i, sd); } #endif #ifdef CONFIG_SCHED_MC - for_each_cpu_mask_nr(i, *cpu_map) { - struct sched_domain *sd = &per_cpu(core_domains, i); + for_each_cpu(i, cpu_map) { + struct sched_domain *sd = &per_cpu(core_domains, i).sd; init_sched_groups_power(i, sd); } #endif - for_each_cpu_mask_nr(i, *cpu_map) { - struct sched_domain *sd = &per_cpu(phys_domains, i); + for_each_cpu(i, cpu_map) { + struct sched_domain *sd = &per_cpu(phys_domains, i).sd; init_sched_groups_power(i, sd); } @@ -7645,53 +7755,78 @@ static int __build_sched_domains(const cpumask_t *cpu_map, if (sd_allnodes) { struct sched_group *sg; - cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg, + cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, tmpmask); init_numa_sched_groups_power(sg); } #endif /* Attach the domains */ - for_each_cpu_mask_nr(i, *cpu_map) { + for_each_cpu(i, cpu_map) { struct sched_domain *sd; #ifdef CONFIG_SCHED_SMT - sd = &per_cpu(cpu_domains, i); + sd = &per_cpu(cpu_domains, i).sd; #elif defined(CONFIG_SCHED_MC) - sd = &per_cpu(core_domains, i); + sd = &per_cpu(core_domains, i).sd; #else - sd = &per_cpu(phys_domains, i); + sd = &per_cpu(phys_domains, i).sd; #endif cpu_attach_domain(sd, rd, i); } - sched_cpumask_free(allmasks); - return 0; + err = 0; + +free_tmpmask: + free_cpumask_var(tmpmask); +free_send_covered: + free_cpumask_var(send_covered); +free_this_core_map: + free_cpumask_var(this_core_map); +free_this_sibling_map: + free_cpumask_var(this_sibling_map); +free_nodemask: + free_cpumask_var(nodemask); +free_notcovered: +#ifdef CONFIG_NUMA + free_cpumask_var(notcovered); +free_covered: + free_cpumask_var(covered); +free_domainspan: + free_cpumask_var(domainspan); +out: +#endif + return err; + +free_sched_groups: +#ifdef CONFIG_NUMA + kfree(sched_group_nodes); +#endif + goto free_tmpmask; #ifdef CONFIG_NUMA error: free_sched_groups(cpu_map, tmpmask); - sched_cpumask_free(allmasks); - kfree(rd); - return -ENOMEM; + free_rootdomain(rd); + goto free_tmpmask; #endif } -static int build_sched_domains(const cpumask_t *cpu_map) +static int build_sched_domains(const struct cpumask *cpu_map) { return __build_sched_domains(cpu_map, NULL); } -static cpumask_t *doms_cur; /* current sched domains */ +static struct cpumask *doms_cur; /* current sched domains */ static int ndoms_cur; /* number of sched domains in 'doms_cur' */ static struct sched_domain_attr *dattr_cur; /* attribues of custom domains in 'doms_cur' */ /* * Special case: If a kmalloc of a doms_cur partition (array of - * cpumask_t) fails, then fallback to a single sched domain, - * as determined by the single cpumask_t fallback_doms. + * cpumask) fails, then fallback to a single sched domain, + * as determined by the single cpumask fallback_doms. */ -static cpumask_t fallback_doms; +static cpumask_var_t fallback_doms; /* * arch_update_cpu_topology lets virtualized architectures update the @@ -7708,16 +7843,16 @@ int __attribute__((weak)) arch_update_cpu_topology(void) * For now this just excludes isolated cpus, but could be used to * exclude other special cases in the future. 
*/ -static int arch_init_sched_domains(const cpumask_t *cpu_map) +static int arch_init_sched_domains(const struct cpumask *cpu_map) { int err; arch_update_cpu_topology(); ndoms_cur = 1; - doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL); + doms_cur = kmalloc(cpumask_size(), GFP_KERNEL); if (!doms_cur) - doms_cur = &fallback_doms; - cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map); + doms_cur = fallback_doms; + cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map); dattr_cur = NULL; err = build_sched_domains(doms_cur); register_sched_domain_sysctl(); @@ -7725,8 +7860,8 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map) return err; } -static void arch_destroy_sched_domains(const cpumask_t *cpu_map, - cpumask_t *tmpmask) +static void arch_destroy_sched_domains(const struct cpumask *cpu_map, + struct cpumask *tmpmask) { free_sched_groups(cpu_map, tmpmask); } @@ -7735,15 +7870,16 @@ static void arch_destroy_sched_domains(const cpumask_t *cpu_map, * Detach sched domains from a group of cpus specified in cpu_map * These cpus will now be attached to the NULL domain */ -static void detach_destroy_domains(const cpumask_t *cpu_map) +static void detach_destroy_domains(const struct cpumask *cpu_map) { - cpumask_t tmpmask; + /* Save because hotplug lock held. */ + static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS); int i; - for_each_cpu_mask_nr(i, *cpu_map) + for_each_cpu(i, cpu_map) cpu_attach_domain(NULL, &def_root_domain, i); synchronize_sched(); - arch_destroy_sched_domains(cpu_map, &tmpmask); + arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask)); } /* handle null as "default" */ @@ -7768,7 +7904,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * doms_new[] to the current sched domain partitioning, doms_cur[]. * It destroys each deleted domain and builds each new domain. * - * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'. + * 'doms_new' is an array of cpumask's of length 'ndoms_new'. * The masks don't intersect (don't overlap.) We should setup one * sched domain for each mask. CPUs not in any of the cpumasks will * not be load balanced. If the same cpumask appears both in the @@ -7782,13 +7918,14 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * the single partition 'fallback_doms', it also forces the domains * to be rebuilt. * - * If doms_new == NULL it will be replaced with cpu_online_map. + * If doms_new == NULL it will be replaced with cpu_online_mask. * ndoms_new == 0 is a special case for destroying existing domains, * and it will not create the default domain. 
* * Call with hotplug lock held */ -void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, +/* FIXME: Change to struct cpumask *doms_new[] */ +void partition_sched_domains(int ndoms_new, struct cpumask *doms_new, struct sched_domain_attr *dattr_new) { int i, j, n; @@ -7807,7 +7944,7 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, /* Destroy deleted domains */ for (i = 0; i < ndoms_cur; i++) { for (j = 0; j < n && !new_topology; j++) { - if (cpus_equal(doms_cur[i], doms_new[j]) + if (cpumask_equal(&doms_cur[i], &doms_new[j]) && dattrs_equal(dattr_cur, i, dattr_new, j)) goto match1; } @@ -7819,15 +7956,15 @@ match1: if (doms_new == NULL) { ndoms_cur = 0; - doms_new = &fallback_doms; - cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); + doms_new = fallback_doms; + cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map); WARN_ON_ONCE(dattr_new); } /* Build new domains */ for (i = 0; i < ndoms_new; i++) { for (j = 0; j < ndoms_cur && !new_topology; j++) { - if (cpus_equal(doms_new[i], doms_cur[j]) + if (cpumask_equal(&doms_new[i], &doms_cur[j]) && dattrs_equal(dattr_new, i, dattr_cur, j)) goto match2; } @@ -7839,7 +7976,7 @@ match2: } /* Remember the new sched domains */ - if (doms_cur != &fallback_doms) + if (doms_cur != fallback_doms) kfree(doms_cur); kfree(dattr_cur); /* kfree(NULL) is safe */ doms_cur = doms_new; @@ -7852,7 +7989,7 @@ match2: } #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -int arch_reinit_sched_domains(void) +static void arch_reinit_sched_domains(void) { get_online_cpus(); @@ -7861,25 +7998,33 @@ int arch_reinit_sched_domains(void) rebuild_sched_domains(); put_online_cpus(); - - return 0; } static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) { - int ret; + unsigned int level = 0; + + if (sscanf(buf, "%u", &level) != 1) + return -EINVAL; + + /* + * level is always be positive so don't check for + * level < POWERSAVINGS_BALANCE_NONE which is 0 + * What happens on 0 or 1 byte write, + * need to check for count as well? + */ - if (buf[0] != '0' && buf[0] != '1') + if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS) return -EINVAL; if (smt) - sched_smt_power_savings = (buf[0] == '1'); + sched_smt_power_savings = level; else - sched_mc_power_savings = (buf[0] == '1'); + sched_mc_power_savings = level; - ret = arch_reinit_sched_domains(); + arch_reinit_sched_domains(); - return ret ? 
ret : count; + return count; } #ifdef CONFIG_SCHED_MC @@ -7914,7 +8059,7 @@ static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_store); #endif -int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) +int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) { int err = 0; @@ -7979,7 +8124,9 @@ static int update_runtime(struct notifier_block *nfb, void __init sched_init_smp(void) { - cpumask_t non_isolated_cpus; + cpumask_var_t non_isolated_cpus; + + alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); #if defined(CONFIG_NUMA) sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), @@ -7988,10 +8135,10 @@ void __init sched_init_smp(void) #endif get_online_cpus(); mutex_lock(&sched_domains_mutex); - arch_init_sched_domains(&cpu_online_map); - cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); - if (cpus_empty(non_isolated_cpus)) - cpu_set(smp_processor_id(), non_isolated_cpus); + arch_init_sched_domains(cpu_online_mask); + cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); + if (cpumask_empty(non_isolated_cpus)) + cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); mutex_unlock(&sched_domains_mutex); put_online_cpus(); @@ -8006,9 +8153,13 @@ void __init sched_init_smp(void) init_hrtick(); /* Move init over to a non-isolated CPU */ - if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0) + if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) BUG(); sched_init_granularity(); + free_cpumask_var(non_isolated_cpus); + + alloc_cpumask_var(&fallback_doms, GFP_KERNEL); + init_sched_rt_class(); } #else void __init sched_init_smp(void) @@ -8323,6 +8474,15 @@ void __init sched_init(void) */ current->sched_class = &fair_sched_class; + /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ + alloc_bootmem_cpumask_var(&nohz_cpu_mask); +#ifdef CONFIG_SMP +#ifdef CONFIG_NO_HZ + alloc_bootmem_cpumask_var(&nohz.cpu_mask); +#endif + alloc_bootmem_cpumask_var(&cpu_isolated_map); +#endif /* SMP */ + scheduler_running = 1; } diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index e8ab096..a0b0852 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -124,7 +124,7 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now) clock = scd->tick_gtod + delta; min_clock = wrap_max(scd->tick_gtod, scd->clock); - max_clock = scd->tick_gtod + TICK_NSEC; + max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC); clock = wrap_max(clock, min_clock); clock = wrap_min(clock, max_clock); @@ -227,6 +227,9 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); */ void sched_clock_idle_wakeup_event(u64 delta_ns) { + if (timekeeping_suspended) + return; + sched_clock_tick(); touch_softlockup_watchdog(); } diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index 52154fe..1e00bfa 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -67,24 +67,21 @@ static int convert_prio(int prio) * Returns: (int)bool - CPUs were found */ int cpupri_find(struct cpupri *cp, struct task_struct *p, - cpumask_t *lowest_mask) + struct cpumask *lowest_mask) { int idx = 0; int task_pri = convert_prio(p->prio); for_each_cpupri_active(cp->pri_active, idx) { struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; - cpumask_t mask; if (idx >= task_pri) break; - cpus_and(mask, p->cpus_allowed, vec->mask); - - if (cpus_empty(mask)) + if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) continue; - *lowest_mask = mask; + cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); return 1; 
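	/*
	 * (Aside, shown only for illustration: the cpumask_any_and() check
	 *  above is the allocation-free idiom for "is the intersection
	 *  empty?", i.e. roughly
	 *
	 *	if (cpumask_any_and(a, b) >= nr_cpu_ids)
	 *		... no cpu is set in both a and b ...
	 *
	 *  which replaces the old on-stack cpumask_t plus cpus_and() pair.)
	 */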
} @@ -126,7 +123,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) vec->count--; if (!vec->count) clear_bit(oldpri, cp->pri_active); - cpu_clear(cpu, vec->mask); + cpumask_clear_cpu(cpu, vec->mask); spin_unlock_irqrestore(&vec->lock, flags); } @@ -136,7 +133,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) spin_lock_irqsave(&vec->lock, flags); - cpu_set(cpu, vec->mask); + cpumask_set_cpu(cpu, vec->mask); vec->count++; if (vec->count == 1) set_bit(newpri, cp->pri_active); @@ -150,10 +147,11 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) /** * cpupri_init - initialize the cpupri structure * @cp: The cpupri context + * @bootmem: true if allocations need to use bootmem * - * Returns: (void) + * Returns: -ENOMEM if memory fails. */ -void cpupri_init(struct cpupri *cp) +int __init_refok cpupri_init(struct cpupri *cp, bool bootmem) { int i; @@ -164,11 +162,30 @@ void cpupri_init(struct cpupri *cp) spin_lock_init(&vec->lock); vec->count = 0; - cpus_clear(vec->mask); + if (bootmem) + alloc_bootmem_cpumask_var(&vec->mask); + else if (!alloc_cpumask_var(&vec->mask, GFP_KERNEL)) + goto cleanup; } for_each_possible_cpu(i) cp->cpu_to_pri[i] = CPUPRI_INVALID; + return 0; + +cleanup: + for (i--; i >= 0; i--) + free_cpumask_var(cp->pri_to_cpu[i].mask); + return -ENOMEM; } +/** + * cpupri_cleanup - clean up the cpupri structure + * @cp: The cpupri context + */ +void cpupri_cleanup(struct cpupri *cp) +{ + int i; + for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) + free_cpumask_var(cp->pri_to_cpu[i].mask); +} diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h index f25811b0..642a94e 100644 --- a/kernel/sched_cpupri.h +++ b/kernel/sched_cpupri.h @@ -14,7 +14,7 @@ struct cpupri_vec { spinlock_t lock; int count; - cpumask_t mask; + cpumask_var_t mask; }; struct cpupri { @@ -27,7 +27,8 @@ struct cpupri { int cpupri_find(struct cpupri *cp, struct task_struct *p, cpumask_t *lowest_mask); void cpupri_set(struct cpupri *cp, int cpu, int pri); -void cpupri_init(struct cpupri *cp); +int cpupri_init(struct cpupri *cp, bool bootmem); +void cpupri_cleanup(struct cpupri *cp); #else #define cpupri_set(cp, cpu, pri) do { } while (0) #define cpupri_init() do { } while (0) diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 4293cfa..16eeba4e 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -145,6 +145,19 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) read_unlock_irqrestore(&tasklist_lock, flags); } +#if defined(CONFIG_CGROUP_SCHED) && \ + (defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)) +static void task_group_path(struct task_group *tg, char *buf, int buflen) +{ + /* may be NULL if the underlying cgroup isn't fully-created yet */ + if (!tg->css.cgroup) { + buf[0] = '\0'; + return; + } + cgroup_path(tg->css.cgroup, buf, buflen); +} +#endif + void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, @@ -154,10 +167,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) unsigned long flags; #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) - char path[128] = ""; + char path[128]; struct task_group *tg = cfs_rq->tg; - cgroup_path(tg->css.cgroup, path, sizeof(path)); + task_group_path(tg, path, sizeof(path)); SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); #elif defined(CONFIG_USER_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) @@ -208,10 +221,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) void 
print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) { #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED) - char path[128] = ""; + char path[128]; struct task_group *tg = rt_rq->tg; - cgroup_path(tg->css.cgroup, path, sizeof(path)); + task_group_path(tg, path, sizeof(path)); SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path); #else diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 5ad4440..8e1352c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -386,20 +386,6 @@ int sched_nr_latency_handler(struct ctl_table *table, int write, #endif /* - * delta *= P[w / rw] - */ -static inline unsigned long -calc_delta_weight(unsigned long delta, struct sched_entity *se) -{ - for_each_sched_entity(se) { - delta = calc_delta_mine(delta, - se->load.weight, &cfs_rq_of(se)->load); - } - - return delta; -} - -/* * delta /= w */ static inline unsigned long @@ -440,12 +426,20 @@ static u64 __sched_period(unsigned long nr_running) */ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { - unsigned long nr_running = cfs_rq->nr_running; + u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq); - if (unlikely(!se->on_rq)) - nr_running++; + for_each_sched_entity(se) { + struct load_weight *load = &cfs_rq->load; + + if (unlikely(!se->on_rq)) { + struct load_weight lw = cfs_rq->load; - return calc_delta_weight(__sched_period(nr_running), se); + update_load_add(&lw, se->load.weight); + load = &lw; + } + slice = calc_delta_mine(slice, se->load.weight, load); + } + return slice; } /* @@ -1019,16 +1013,33 @@ static void yield_task_fair(struct rq *rq) * search starts with cpus closest then further out as needed, * so we always favor a closer, idle cpu. * Domains may include CPUs that are not usable for migration, - * hence we need to mask them out (cpu_active_map) + * hence we need to mask them out (cpu_active_mask) * * Returns the CPU we should wake onto. */ #if defined(ARCH_HAS_SCHED_WAKE_IDLE) static int wake_idle(int cpu, struct task_struct *p) { - cpumask_t tmp; struct sched_domain *sd; int i; + unsigned int chosen_wakeup_cpu; + int this_cpu; + + /* + * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu + * are idle and this is not a kernel thread and this task's affinity + * allows it to be moved to preferred cpu, then just move! + */ + + this_cpu = smp_processor_id(); + chosen_wakeup_cpu = + cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu; + + if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP && + idle_cpu(cpu) && idle_cpu(this_cpu) && + p->mm && !(p->flags & PF_KTHREAD) && + cpu_isset(chosen_wakeup_cpu, p->cpus_allowed)) + return chosen_wakeup_cpu; /* * If it is idle, then it is the best cpu to run this task. 
@@ -1046,10 +1057,9 @@ static int wake_idle(int cpu, struct task_struct *p) if ((sd->flags & SD_WAKE_IDLE) || ((sd->flags & SD_WAKE_IDLE_FAR) && !task_hot(p, task_rq(p)->clock, sd))) { - cpus_and(tmp, sd->span, p->cpus_allowed); - cpus_and(tmp, tmp, cpu_active_map); - for_each_cpu_mask_nr(i, tmp) { - if (idle_cpu(i)) { + for_each_cpu_and(i, sched_domain_span(sd), + &p->cpus_allowed) { + if (cpu_active(i) && idle_cpu(i)) { if (i != task_cpu(p)) { schedstat_inc(p, se.nr_wakeups_idle); @@ -1242,13 +1252,13 @@ static int select_task_rq_fair(struct task_struct *p, int sync) * this_cpu and prev_cpu are present in: */ for_each_domain(this_cpu, sd) { - if (cpu_isset(prev_cpu, sd->span)) { + if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) { this_sd = sd; break; } } - if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) + if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed))) goto out; /* @@ -1607,8 +1617,6 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) } } -#define swap(a, b) do { typeof(a) tmp = (a); (a) = (b); (b) = tmp; } while (0) - /* * Share the fairness runtime between parent and child, thus the * total amount of pressure for CPU stays equal - new tasks diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 51d2af3..da932f4 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -15,7 +15,7 @@ static inline void rt_set_overload(struct rq *rq) if (!rq->online) return; - cpu_set(rq->cpu, rq->rd->rto_mask); + cpumask_set_cpu(rq->cpu, rq->rd->rto_mask); /* * Make sure the mask is visible before we set * the overload count. That is checked to determine @@ -34,7 +34,7 @@ static inline void rt_clear_overload(struct rq *rq) /* the order here really doesn't matter */ atomic_dec(&rq->rd->rto_count); - cpu_clear(rq->cpu, rq->rd->rto_mask); + cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask); } static void update_rt_migration(struct rq *rq) @@ -139,14 +139,14 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se) } #ifdef CONFIG_SMP -static inline cpumask_t sched_rt_period_mask(void) +static inline const struct cpumask *sched_rt_period_mask(void) { return cpu_rq(smp_processor_id())->rd->span; } #else -static inline cpumask_t sched_rt_period_mask(void) +static inline const struct cpumask *sched_rt_period_mask(void) { - return cpu_online_map; + return cpu_online_mask; } #endif @@ -212,9 +212,9 @@ static inline int rt_rq_throttled(struct rt_rq *rt_rq) return rt_rq->rt_throttled; } -static inline cpumask_t sched_rt_period_mask(void) +static inline const struct cpumask *sched_rt_period_mask(void) { - return cpu_online_map; + return cpu_online_mask; } static inline @@ -241,11 +241,11 @@ static int do_balance_runtime(struct rt_rq *rt_rq) int i, weight, more = 0; u64 rt_period; - weight = cpus_weight(rd->span); + weight = cpumask_weight(rd->span); spin_lock(&rt_b->rt_runtime_lock); rt_period = ktime_to_ns(rt_b->rt_period); - for_each_cpu_mask_nr(i, rd->span) { + for_each_cpu(i, rd->span) { struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); s64 diff; @@ -324,7 +324,7 @@ static void __disable_runtime(struct rq *rq) /* * Greedy reclaim, take back as much as we can. 
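 *
 * (Aside, shown only for illustration: the pointer-based iterator used
 *  below takes a const struct cpumask *, roughly
 *
 *	int i;
 *	for_each_cpu(i, rd->span)
 *		...;
 *
 *  while the old for_each_cpu_mask() walked a cpumask_t object directly.)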
*/ - for_each_cpu_mask(i, rd->span) { + for_each_cpu(i, rd->span) { struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); s64 diff; @@ -429,13 +429,13 @@ static inline int balance_runtime(struct rt_rq *rt_rq) static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) { int i, idle = 1; - cpumask_t span; + const struct cpumask *span; if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) return 1; span = sched_rt_period_mask(); - for_each_cpu_mask(i, span) { + for_each_cpu(i, span) { int enqueue = 0; struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); struct rq *rq = rq_of_rt_rq(rt_rq); @@ -805,17 +805,20 @@ static int select_task_rq_rt(struct task_struct *p, int sync) static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) { - cpumask_t mask; + cpumask_var_t mask; if (rq->curr->rt.nr_cpus_allowed == 1) return; - if (p->rt.nr_cpus_allowed != 1 - && cpupri_find(&rq->rd->cpupri, p, &mask)) + if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) return; - if (!cpupri_find(&rq->rd->cpupri, rq->curr, &mask)) - return; + if (p->rt.nr_cpus_allowed != 1 + && cpupri_find(&rq->rd->cpupri, p, mask)) + goto free; + + if (!cpupri_find(&rq->rd->cpupri, rq->curr, mask)) + goto free; /* * There appears to be other cpus that can accept @@ -824,6 +827,8 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) */ requeue_task_rt(rq, p, 1); resched_task(rq->curr); +free: + free_cpumask_var(mask); } #endif /* CONFIG_SMP */ @@ -914,7 +919,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) && + (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && (p->rt.nr_cpus_allowed > 1)) return 1; return 0; @@ -953,18 +958,19 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) return next; } -static DEFINE_PER_CPU(cpumask_t, local_cpu_mask); +static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); -static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) +static inline int pick_optimal_cpu(int this_cpu, + const struct cpumask *mask) { int first; /* "this_cpu" is cheaper to preempt than a remote processor */ - if ((this_cpu != -1) && cpu_isset(this_cpu, *mask)) + if ((this_cpu != -1) && cpumask_test_cpu(this_cpu, mask)) return this_cpu; - first = first_cpu(*mask); - if (first != NR_CPUS) + first = cpumask_first(mask); + if (first < nr_cpu_ids) return first; return -1; @@ -973,9 +979,10 @@ static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) static int find_lowest_rq(struct task_struct *task) { struct sched_domain *sd; - cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask); + struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask); int this_cpu = smp_processor_id(); int cpu = task_cpu(task); + cpumask_var_t domain_mask; if (task->rt.nr_cpus_allowed == 1) return -1; /* No other targets possible */ @@ -988,7 +995,7 @@ static int find_lowest_rq(struct task_struct *task) * I guess we might want to change cpupri_find() to ignore those * in the first place. */ - cpus_and(*lowest_mask, *lowest_mask, cpu_active_map); + cpumask_and(lowest_mask, lowest_mask, cpu_active_mask); /* * At this point we have built a mask of cpus representing the @@ -998,7 +1005,7 @@ static int find_lowest_rq(struct task_struct *task) * We prioritize the last cpu that the task executed on since * it is most likely cache-hot in that location. 
*/ - if (cpu_isset(cpu, *lowest_mask)) + if (cpumask_test_cpu(cpu, lowest_mask)) return cpu; /* @@ -1008,18 +1015,25 @@ static int find_lowest_rq(struct task_struct *task) if (this_cpu == cpu) this_cpu = -1; /* Skip this_cpu opt if the same */ - for_each_domain(cpu, sd) { - if (sd->flags & SD_WAKE_AFFINE) { - cpumask_t domain_mask; - int best_cpu; + if (alloc_cpumask_var(&domain_mask, GFP_ATOMIC)) { + for_each_domain(cpu, sd) { + if (sd->flags & SD_WAKE_AFFINE) { + int best_cpu; + + cpumask_and(domain_mask, + sched_domain_span(sd), + lowest_mask); - cpus_and(domain_mask, sd->span, *lowest_mask); + best_cpu = pick_optimal_cpu(this_cpu, + domain_mask); - best_cpu = pick_optimal_cpu(this_cpu, - &domain_mask); - if (best_cpu != -1) - return best_cpu; + if (best_cpu != -1) { + free_cpumask_var(domain_mask); + return best_cpu; + } + } } + free_cpumask_var(domain_mask); } /* @@ -1054,8 +1068,8 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) * Also make sure that it wasn't scheduled on its rq. */ if (unlikely(task_rq(task) != rq || - !cpu_isset(lowest_rq->cpu, - task->cpus_allowed) || + !cpumask_test_cpu(lowest_rq->cpu, + &task->cpus_allowed) || task_running(rq, task) || !task->se.on_rq)) { @@ -1176,7 +1190,7 @@ static int pull_rt_task(struct rq *this_rq) next = pick_next_task_rt(this_rq); - for_each_cpu_mask_nr(cpu, this_rq->rd->rto_mask) { + for_each_cpu(cpu, this_rq->rd->rto_mask) { if (this_cpu == cpu) continue; @@ -1305,9 +1319,9 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, } static void set_cpus_allowed_rt(struct task_struct *p, - const cpumask_t *new_mask) + const struct cpumask *new_mask) { - int weight = cpus_weight(*new_mask); + int weight = cpumask_weight(new_mask); BUG_ON(!rt_task(p)); @@ -1328,7 +1342,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, update_rt_migration(rq); } - p->cpus_allowed = *new_mask; + cpumask_copy(&p->cpus_allowed, new_mask); p->rt.nr_cpus_allowed = weight; } @@ -1371,6 +1385,15 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p, if (!rq->rt.rt_nr_running) pull_rt_task(rq); } + +static inline void init_sched_rt_class(void) +{ + unsigned int i; + + for_each_possible_cpu(i) + alloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), + GFP_KERNEL, cpu_to_node(i)); +} #endif /* CONFIG_SMP */ /* @@ -1541,3 +1564,4 @@ static void print_rt_stats(struct seq_file *m, int cpu) rcu_read_unlock(); } #endif /* CONFIG_SCHED_DEBUG */ + diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 3b01098..f2773b5 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -42,7 +42,8 @@ static int show_schedstat(struct seq_file *seq, void *v) for_each_domain(cpu, sd) { enum cpu_idle_type itype; - cpumask_scnprintf(mask_str, mask_len, sd->span); + cpumask_scnprintf(mask_str, mask_len, + sched_domain_span(sd)); seq_printf(seq, "domain%d %s", dcount++, mask_str); for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; itype++) { diff --git a/kernel/signal.c b/kernel/signal.c index 8e95855..3152ac3 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -858,7 +858,8 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, q->info.si_signo = sig; q->info.si_errno = 0; q->info.si_code = SI_USER; - q->info.si_pid = task_pid_vnr(current); + q->info.si_pid = task_tgid_nr_ns(current, + task_active_pid_ns(t)); q->info.si_uid = current_uid(); break; case (unsigned long) SEND_SIG_PRIV: diff --git a/kernel/smp.c b/kernel/smp.c index 75c8dde..5cfa0e5 100644 --- a/kernel/smp.c +++ 
b/kernel/smp.c @@ -24,8 +24,8 @@ struct call_function_data { struct call_single_data csd; spinlock_t lock; unsigned int refs; - cpumask_t cpumask; struct rcu_head rcu_head; + unsigned long cpumask_bits[]; }; struct call_single_queue { @@ -110,13 +110,13 @@ void generic_smp_call_function_interrupt(void) list_for_each_entry_rcu(data, &call_function_queue, csd.list) { int refs; - if (!cpu_isset(cpu, data->cpumask)) + if (!cpumask_test_cpu(cpu, to_cpumask(data->cpumask_bits))) continue; data->csd.func(data->csd.info); spin_lock(&data->lock); - cpu_clear(cpu, data->cpumask); + cpumask_clear_cpu(cpu, to_cpumask(data->cpumask_bits)); WARN_ON(data->refs == 0); data->refs--; refs = data->refs; @@ -223,7 +223,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, local_irq_save(flags); func(info); local_irq_restore(flags); - } else if ((unsigned)cpu < NR_CPUS && cpu_online(cpu)) { + } else if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) { struct call_single_data *data = NULL; if (!wait) { @@ -266,51 +266,19 @@ void __smp_call_function_single(int cpu, struct call_single_data *data) generic_exec_single(cpu, data); } -/* Dummy function */ -static void quiesce_dummy(void *unused) -{ -} - -/* - * Ensure stack based data used in call function mask is safe to free. - * - * This is needed by smp_call_function_mask when using on-stack data, because - * a single call function queue is shared by all CPUs, and any CPU may pick up - * the data item on the queue at any time before it is deleted. So we need to - * ensure that all CPUs have transitioned through a quiescent state after - * this call. - * - * This is a very slow function, implemented by sending synchronous IPIs to - * all possible CPUs. For this reason, we have to alloc data rather than use - * stack based data even in the case of synchronous calls. The stack based - * data is then just used for deadlock/oom fallback which will be very rare. - * - * If a faster scheme can be made, we could go back to preferring stack based - * data -- the data allocation/free is non-zero cost. - */ -static void smp_call_function_mask_quiesce_stack(cpumask_t mask) -{ - struct call_single_data data; - int cpu; - - data.func = quiesce_dummy; - data.info = NULL; - - for_each_cpu_mask(cpu, mask) { - data.flags = CSD_FLAG_WAIT; - generic_exec_single(cpu, &data); - } -} +/* FIXME: Shim for archs using old arch_send_call_function_ipi API. */ +#ifndef arch_send_call_function_ipi_mask +#define arch_send_call_function_ipi_mask(maskp) \ + arch_send_call_function_ipi(*(maskp)) +#endif /** - * smp_call_function_mask(): Run a function on a set of other CPUs. - * @mask: The set of cpus to run on. + * smp_call_function_many(): Run a function on a set of other CPUs. + * @mask: The set of cpus to run on (only runs on online subset). * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. * @wait: If true, wait (atomically) until function has completed on other CPUs. * - * Returns 0 on success, else a negative status code. - * * If @wait is true, then returns once @func has returned. Note that @wait * will be implicitly turned on in case of allocation failures, since * we fall back to on-stack allocation. @@ -319,53 +287,57 @@ static void smp_call_function_mask_quiesce_stack(cpumask_t mask) * hardware interrupt handler or from a bottom half handler. Preemption * must be disabled when calling this function. 
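 *
 * A minimal usage sketch (drain_local_queue() is a made-up callback, shown
 * only for illustration):
 *
 *	static void drain_local_queue(void *info)
 *	{
 *		...
 *	}
 *
 *	preempt_disable();
 *	smp_call_function_many(cpu_online_mask, drain_local_queue, NULL, true);
 *	preempt_enable();
 *
 * The calling CPU is never included in the run; use on_each_cpu() when the
 * local CPU should execute the function as well.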
*/ -int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, - int wait) +void smp_call_function_many(const struct cpumask *mask, + void (*func)(void *), void *info, + bool wait) { - struct call_function_data d; - struct call_function_data *data = NULL; - cpumask_t allbutself; + struct call_function_data *data; unsigned long flags; - int cpu, num_cpus; - int slowpath = 0; + int cpu, next_cpu; /* Can deadlock when called with interrupts disabled */ WARN_ON(irqs_disabled()); - cpu = smp_processor_id(); - allbutself = cpu_online_map; - cpu_clear(cpu, allbutself); - cpus_and(mask, mask, allbutself); - num_cpus = cpus_weight(mask); - - /* - * If zero CPUs, return. If just a single CPU, turn this request - * into a targetted single call instead since it's faster. - */ - if (!num_cpus) - return 0; - else if (num_cpus == 1) { - cpu = first_cpu(mask); - return smp_call_function_single(cpu, func, info, wait); + /* So, what's a CPU they want? Ignoring this one. */ + cpu = cpumask_first_and(mask, cpu_online_mask); + if (cpu == smp_processor_id()) + cpu = cpumask_next_and(cpu, mask, cpu_online_mask); + /* No online cpus? We're done. */ + if (cpu >= nr_cpu_ids) + return; + + /* Do we have another CPU which isn't us? */ + next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask); + if (next_cpu == smp_processor_id()) + next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask); + + /* Fastpath: do that cpu by itself. */ + if (next_cpu >= nr_cpu_ids) { + smp_call_function_single(cpu, func, info, wait); + return; } - data = kmalloc(sizeof(*data), GFP_ATOMIC); - if (data) { - data->csd.flags = CSD_FLAG_ALLOC; - if (wait) - data->csd.flags |= CSD_FLAG_WAIT; - } else { - data = &d; - data->csd.flags = CSD_FLAG_WAIT; - wait = 1; - slowpath = 1; + data = kmalloc(sizeof(*data) + cpumask_size(), GFP_ATOMIC); + if (unlikely(!data)) { + /* Slow path. */ + for_each_online_cpu(cpu) { + if (cpu == smp_processor_id()) + continue; + if (cpumask_test_cpu(cpu, mask)) + smp_call_function_single(cpu, func, info, wait); + } + return; } spin_lock_init(&data->lock); + data->csd.flags = CSD_FLAG_ALLOC; + if (wait) + data->csd.flags |= CSD_FLAG_WAIT; data->csd.func = func; data->csd.info = info; - data->refs = num_cpus; - data->cpumask = mask; + cpumask_and(to_cpumask(data->cpumask_bits), mask, cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), to_cpumask(data->cpumask_bits)); + data->refs = cpumask_weight(to_cpumask(data->cpumask_bits)); spin_lock_irqsave(&call_function_lock, flags); list_add_tail_rcu(&data->csd.list, &call_function_queue); @@ -377,18 +349,13 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, smp_mb(); /* Send a message to all CPUs in the map */ - arch_send_call_function_ipi(mask); + arch_send_call_function_ipi_mask(to_cpumask(data->cpumask_bits)); /* optionally wait for the CPUs to complete */ - if (wait) { + if (wait) csd_flag_wait(&data->csd); - if (unlikely(slowpath)) - smp_call_function_mask_quiesce_stack(mask); - } - - return 0; } -EXPORT_SYMBOL(smp_call_function_mask); +EXPORT_SYMBOL(smp_call_function_many); /** * smp_call_function(): Run a function on all other CPUs. @@ -396,7 +363,7 @@ EXPORT_SYMBOL(smp_call_function_mask); * @info: An arbitrary pointer to pass to the function. * @wait: If true, wait (atomically) until function has completed on other CPUs. * - * Returns 0 on success, else a negative status code. + * Returns 0. * * If @wait is true, then returns once @func has returned; otherwise * it returns just before the target cpu calls @func. 
In case of allocation @@ -407,12 +374,10 @@ EXPORT_SYMBOL(smp_call_function_mask); */ int smp_call_function(void (*func)(void *), void *info, int wait) { - int ret; - preempt_disable(); - ret = smp_call_function_mask(cpu_online_map, func, info, wait); + smp_call_function_many(cpu_online_mask, func, info, wait); preempt_enable(); - return ret; + return 0; } EXPORT_SYMBOL(smp_call_function); diff --git a/kernel/softirq.c b/kernel/softirq.c index 466e75c..0365b48 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -733,7 +733,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, break; /* Unbind so it can run. Fall thru. */ kthread_bind(per_cpu(ksoftirqd, hotcpu), - any_online_cpu(cpu_online_map)); + cpumask_any(cpu_online_mask)); case CPU_DEAD: case CPU_DEAD_FROZEN: { struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; @@ -784,3 +784,28 @@ int on_each_cpu(void (*func) (void *info), void *info, int wait) } EXPORT_SYMBOL(on_each_cpu); #endif + +/* + * [ These __weak aliases are kept in a separate compilation unit, so that + * GCC does not inline them incorrectly. ] + */ + +int __init __weak early_irq_init(void) +{ + return 0; +} + +int __init __weak arch_probe_nr_irqs(void) +{ + return 0; +} + +int __init __weak arch_early_irq_init(void) +{ + return 0; +} + +int __weak arch_init_chip_data(struct irq_desc *desc, int cpu) +{ + return 0; +} diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 1ab790c..d9188c6 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -303,17 +303,15 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) break; case CPU_ONLINE: case CPU_ONLINE_FROZEN: - check_cpu = any_online_cpu(cpu_online_map); + check_cpu = cpumask_any(cpu_online_mask); wake_up_process(per_cpu(watchdog_task, hotcpu)); break; #ifdef CONFIG_HOTPLUG_CPU case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: if (hotcpu == check_cpu) { - cpumask_t temp_cpu_online_map = cpu_online_map; - - cpu_clear(hotcpu, temp_cpu_online_map); - check_cpu = any_online_cpu(temp_cpu_online_map); + /* Pick any other online cpu. */ + check_cpu = cpumask_any_but(cpu_online_mask, hotcpu); } break; @@ -323,7 +321,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) break; /* Unbind so it can run. Fall thru. */ kthread_bind(per_cpu(watchdog_task, hotcpu), - any_online_cpu(cpu_online_map)); + cpumask_any(cpu_online_mask)); case CPU_DEAD: case CPU_DEAD_FROZEN: p = per_cpu(watchdog_task, hotcpu); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 24e8cea..0cd415e 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -38,7 +38,10 @@ struct stop_machine_data { static unsigned int num_threads; static atomic_t thread_ack; static DEFINE_MUTEX(lock); - +/* setup_lock protects refcount, stop_machine_wq and stop_machine_work. */ +static DEFINE_MUTEX(setup_lock); +/* Users of stop_machine. 
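 *
 * (i.e. callers that have done stop_machine_create() without a matching
 *  stop_machine_destroy() yet.  A plausible module-side pattern, shown only
 *  as an illustration with a made-up callback my_update_fn():
 *
 *	err = stop_machine_create();
 *	if (err)
 *		return err;
 *	...
 *	err = stop_machine(my_update_fn, NULL, NULL);
 *	...
 *	stop_machine_destroy();
 *
 *  Pre-creating the workqueue this way means the later stop_machine()
 *  call cannot fail with -ENOMEM.)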
*/ +static int refcount; static struct workqueue_struct *stop_machine_wq; static struct stop_machine_data active, idle; static const cpumask_t *active_cpus; @@ -69,10 +72,10 @@ static void stop_cpu(struct work_struct *unused) int err; if (!active_cpus) { - if (cpu == first_cpu(cpu_online_map)) + if (cpu == cpumask_first(cpu_online_mask)) smdata = &active; } else { - if (cpu_isset(cpu, *active_cpus)) + if (cpumask_test_cpu(cpu, active_cpus)) smdata = &active; } /* Simple state machine */ @@ -109,7 +112,44 @@ static int chill(void *unused) return 0; } -int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) +int stop_machine_create(void) +{ + mutex_lock(&setup_lock); + if (refcount) + goto done; + stop_machine_wq = create_rt_workqueue("kstop"); + if (!stop_machine_wq) + goto err_out; + stop_machine_work = alloc_percpu(struct work_struct); + if (!stop_machine_work) + goto err_out; +done: + refcount++; + mutex_unlock(&setup_lock); + return 0; + +err_out: + if (stop_machine_wq) + destroy_workqueue(stop_machine_wq); + mutex_unlock(&setup_lock); + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(stop_machine_create); + +void stop_machine_destroy(void) +{ + mutex_lock(&setup_lock); + refcount--; + if (refcount) + goto done; + destroy_workqueue(stop_machine_wq); + free_percpu(stop_machine_work); +done: + mutex_unlock(&setup_lock); +} +EXPORT_SYMBOL_GPL(stop_machine_destroy); + +int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) { struct work_struct *sm_work; int i, ret; @@ -142,23 +182,18 @@ int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) return ret; } -int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) +int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) { int ret; + ret = stop_machine_create(); + if (ret) + return ret; /* No CPUs can come up or down during this. 
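 * (that is what the get_online_cpus()/put_online_cpus() pair around
 *  __stop_machine() below guarantees)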
*/ get_online_cpus(); ret = __stop_machine(fn, data, cpus); put_online_cpus(); - + stop_machine_destroy(); return ret; } EXPORT_SYMBOL_GPL(stop_machine); - -static int __init stop_machine_init(void) -{ - stop_machine_wq = create_rt_workqueue("kstop"); - stop_machine_work = alloc_percpu(struct work_struct); - return 0; -} -core_initcall(stop_machine_init); diff --git a/kernel/sys.c b/kernel/sys.c index d356d79..763c3c1 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -33,6 +33,7 @@ #include <linux/task_io_accounting_ops.h> #include <linux/seccomp.h> #include <linux/cpu.h> +#include <linux/ptrace.h> #include <linux/compat.h> #include <linux/syscalls.h> @@ -927,6 +928,7 @@ asmlinkage long sys_times(struct tms __user * tbuf) if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) return -EFAULT; } + force_successful_syscall_return(); return (long) jiffies_64_to_clock_t(get_jiffies_64()); } @@ -1627,6 +1629,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) utime = stime = cputime_zero; if (who == RUSAGE_THREAD) { + utime = task_utime(current); + stime = task_stime(current); accumulate_thread_rusage(p, r); goto out; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ff6d45c..89d7443 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -82,15 +82,14 @@ extern int percpu_pagelist_fraction; extern int compat_log; extern int latencytop_enabled; extern int sysctl_nr_open_min, sysctl_nr_open_max; +#ifndef CONFIG_MMU +extern int sysctl_nr_trim_pages; +#endif #ifdef CONFIG_RCU_TORTURE_TEST extern int rcutorture_runnable; #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ /* Constants used for minimum and maximum */ -#if defined(CONFIG_HIGHMEM) || defined(CONFIG_DETECT_SOFTLOCKUP) -static int one = 1; -#endif - #ifdef CONFIG_DETECT_SOFTLOCKUP static int sixty = 60; static int neg_one = -1; @@ -101,6 +100,7 @@ static int two = 2; #endif static int zero; +static int one = 1; static int one_hundred = 100; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ @@ -952,12 +952,22 @@ static struct ctl_table vm_table[] = { .data = &dirty_background_ratio, .maxlen = sizeof(dirty_background_ratio), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, + .proc_handler = &dirty_background_ratio_handler, .strategy = &sysctl_intvec, .extra1 = &zero, .extra2 = &one_hundred, }, { + .ctl_name = CTL_UNNUMBERED, + .procname = "dirty_background_bytes", + .data = &dirty_background_bytes, + .maxlen = sizeof(dirty_background_bytes), + .mode = 0644, + .proc_handler = &dirty_background_bytes_handler, + .strategy = &sysctl_intvec, + .extra1 = &one, + }, + { .ctl_name = VM_DIRTY_RATIO, .procname = "dirty_ratio", .data = &vm_dirty_ratio, @@ -969,6 +979,16 @@ static struct ctl_table vm_table[] = { .extra2 = &one_hundred, }, { + .ctl_name = CTL_UNNUMBERED, + .procname = "dirty_bytes", + .data = &vm_dirty_bytes, + .maxlen = sizeof(vm_dirty_bytes), + .mode = 0644, + .proc_handler = &dirty_bytes_handler, + .strategy = &sysctl_intvec, + .extra1 = &one, + }, + { .procname = "dirty_writeback_centisecs", .data = &dirty_writeback_interval, .maxlen = sizeof(dirty_writeback_interval), @@ -1085,6 +1105,17 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = &proc_dointvec }, +#else + { + .ctl_name = CTL_UNNUMBERED, + .procname = "nr_trim_pages", + .data = &sysctl_nr_trim_pages, + .maxlen = sizeof(sysctl_nr_trim_pages), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + }, #endif { .ctl_name = VM_LAPTOP_MODE, diff --git 
a/kernel/taskstats.c b/kernel/taskstats.c index bd6be76..888adbc 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -290,18 +290,17 @@ ret: return; } -static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd) +static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) { struct listener_list *listeners; struct listener *s, *tmp; unsigned int cpu; - cpumask_t mask = *maskp; - if (!cpus_subset(mask, cpu_possible_map)) + if (!cpumask_subset(mask, cpu_possible_mask)) return -EINVAL; if (isadd == REGISTER) { - for_each_cpu_mask_nr(cpu, mask) { + for_each_cpu(cpu, mask) { s = kmalloc_node(sizeof(struct listener), GFP_KERNEL, cpu_to_node(cpu)); if (!s) @@ -320,7 +319,7 @@ static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd) /* Deregister or cleanup */ cleanup: - for_each_cpu_mask_nr(cpu, mask) { + for_each_cpu(cpu, mask) { listeners = &per_cpu(listener_array, cpu); down_write(&listeners->sem); list_for_each_entry_safe(s, tmp, &listeners->list, list) { @@ -335,7 +334,7 @@ cleanup: return 0; } -static int parse(struct nlattr *na, cpumask_t *mask) +static int parse(struct nlattr *na, struct cpumask *mask) { char *data; int len; @@ -352,7 +351,7 @@ static int parse(struct nlattr *na, cpumask_t *mask) if (!data) return -ENOMEM; nla_strlcpy(data, na, len); - ret = cpulist_parse(data, *mask); + ret = cpulist_parse(data, mask); kfree(data); return ret; } @@ -428,23 +427,33 @@ err: static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) { - int rc = 0; + int rc; struct sk_buff *rep_skb; struct taskstats *stats; size_t size; - cpumask_t mask; + cpumask_var_t mask; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; - rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask); + rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); if (rc < 0) - return rc; - if (rc == 0) - return add_del_listener(info->snd_pid, &mask, REGISTER); + goto free_return_rc; + if (rc == 0) { + rc = add_del_listener(info->snd_pid, mask, REGISTER); + goto free_return_rc; + } - rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], &mask); + rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); if (rc < 0) + goto free_return_rc; + if (rc == 0) { + rc = add_del_listener(info->snd_pid, mask, DEREGISTER); +free_return_rc: + free_cpumask_var(mask); return rc; - if (rc == 0) - return add_del_listener(info->snd_pid, &mask, DEREGISTER); + } + free_cpumask_var(mask); /* * Size includes space for nested attributes diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index 06b6395..4f10451 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c @@ -22,21 +22,11 @@ static u32 rand1, preh_val, posth_val, jph_val; static int errors, handler_errors, num_tests; +static u32 (*target)(u32 value); +static u32 (*target2)(u32 value); static noinline u32 kprobe_target(u32 value) { - /* - * gcc ignores noinline on some architectures unless we stuff - * sufficient lard into the function. The get_kprobe() here is - * just for that. - * - * NOTE: We aren't concerned about the correctness of get_kprobe() - * here; hence, this call is neither under !preempt nor with the - * kprobe_mutex held. 
This is fine(tm) - */ - if (get_kprobe((void *)0xdeadbeef)) - printk(KERN_INFO "Kprobe smoke test: probe on 0xdeadbeef!\n"); - return (value / div_factor); } @@ -74,7 +64,7 @@ static int test_kprobe(void) return ret; } - ret = kprobe_target(rand1); + ret = target(rand1); unregister_kprobe(&kp); if (preh_val == 0) { @@ -92,6 +82,84 @@ static int test_kprobe(void) return 0; } +static noinline u32 kprobe_target2(u32 value) +{ + return (value / div_factor) + 1; +} + +static int kp_pre_handler2(struct kprobe *p, struct pt_regs *regs) +{ + preh_val = (rand1 / div_factor) + 1; + return 0; +} + +static void kp_post_handler2(struct kprobe *p, struct pt_regs *regs, + unsigned long flags) +{ + if (preh_val != (rand1 / div_factor) + 1) { + handler_errors++; + printk(KERN_ERR "Kprobe smoke test failed: " + "incorrect value in post_handler2\n"); + } + posth_val = preh_val + div_factor; +} + +static struct kprobe kp2 = { + .symbol_name = "kprobe_target2", + .pre_handler = kp_pre_handler2, + .post_handler = kp_post_handler2 +}; + +static int test_kprobes(void) +{ + int ret; + struct kprobe *kps[2] = {&kp, &kp2}; + + kp.addr = 0; /* addr should be cleard for reusing kprobe. */ + ret = register_kprobes(kps, 2); + if (ret < 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "register_kprobes returned %d\n", ret); + return ret; + } + + preh_val = 0; + posth_val = 0; + ret = target(rand1); + + if (preh_val == 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "kprobe pre_handler not called\n"); + handler_errors++; + } + + if (posth_val == 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "kprobe post_handler not called\n"); + handler_errors++; + } + + preh_val = 0; + posth_val = 0; + ret = target2(rand1); + + if (preh_val == 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "kprobe pre_handler2 not called\n"); + handler_errors++; + } + + if (posth_val == 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "kprobe post_handler2 not called\n"); + handler_errors++; + } + + unregister_kprobes(kps, 2); + return 0; + +} + static u32 j_kprobe_target(u32 value) { if (value != rand1) { @@ -121,7 +189,7 @@ static int test_jprobe(void) return ret; } - ret = kprobe_target(rand1); + ret = target(rand1); unregister_jprobe(&jp); if (jph_val == 0) { printk(KERN_ERR "Kprobe smoke test failed: " @@ -132,6 +200,43 @@ static int test_jprobe(void) return 0; } +static struct jprobe jp2 = { + .entry = j_kprobe_target, + .kp.symbol_name = "kprobe_target2" +}; + +static int test_jprobes(void) +{ + int ret; + struct jprobe *jps[2] = {&jp, &jp2}; + + jp.kp.addr = 0; /* addr should be cleard for reusing kprobe. 
*/ + ret = register_jprobes(jps, 2); + if (ret < 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "register_jprobes returned %d\n", ret); + return ret; + } + + jph_val = 0; + ret = target(rand1); + if (jph_val == 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "jprobe handler not called\n"); + handler_errors++; + } + + jph_val = 0; + ret = target2(rand1); + if (jph_val == 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "jprobe handler2 not called\n"); + handler_errors++; + } + unregister_jprobes(jps, 2); + + return 0; +} #ifdef CONFIG_KRETPROBES static u32 krph_val; @@ -177,7 +282,7 @@ static int test_kretprobe(void) return ret; } - ret = kprobe_target(rand1); + ret = target(rand1); unregister_kretprobe(&rp); if (krph_val != rand1) { printk(KERN_ERR "Kprobe smoke test failed: " @@ -187,12 +292,72 @@ static int test_kretprobe(void) return 0; } + +static int return_handler2(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + unsigned long ret = regs_return_value(regs); + + if (ret != (rand1 / div_factor) + 1) { + handler_errors++; + printk(KERN_ERR "Kprobe smoke test failed: " + "incorrect value in kretprobe handler2\n"); + } + if (krph_val == 0) { + handler_errors++; + printk(KERN_ERR "Kprobe smoke test failed: " + "call to kretprobe entry handler failed\n"); + } + + krph_val = rand1; + return 0; +} + +static struct kretprobe rp2 = { + .handler = return_handler2, + .entry_handler = entry_handler, + .kp.symbol_name = "kprobe_target2" +}; + +static int test_kretprobes(void) +{ + int ret; + struct kretprobe *rps[2] = {&rp, &rp2}; + + rp.kp.addr = 0; /* addr should be cleard for reusing kprobe. */ + ret = register_kretprobes(rps, 2); + if (ret < 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "register_kretprobe returned %d\n", ret); + return ret; + } + + krph_val = 0; + ret = target(rand1); + if (krph_val != rand1) { + printk(KERN_ERR "Kprobe smoke test failed: " + "kretprobe handler not called\n"); + handler_errors++; + } + + krph_val = 0; + ret = target2(rand1); + if (krph_val != rand1) { + printk(KERN_ERR "Kprobe smoke test failed: " + "kretprobe handler2 not called\n"); + handler_errors++; + } + unregister_kretprobes(rps, 2); + return 0; +} #endif /* CONFIG_KRETPROBES */ int init_test_probes(void) { int ret; + target = kprobe_target; + target2 = kprobe_target2; + do { rand1 = random32(); } while (rand1 <= div_factor); @@ -204,15 +369,30 @@ int init_test_probes(void) errors++; num_tests++; + ret = test_kprobes(); + if (ret < 0) + errors++; + + num_tests++; ret = test_jprobe(); if (ret < 0) errors++; + num_tests++; + ret = test_jprobes(); + if (ret < 0) + errors++; + #ifdef CONFIG_KRETPROBES num_tests++; ret = test_kretprobe(); if (ret < 0) errors++; + + num_tests++; + ret = test_kretprobes(); + if (ret < 0) + errors++; #endif /* CONFIG_KRETPROBES */ if (errors) diff --git a/kernel/time.c b/kernel/time.c index d63a433..4886e3c 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -37,6 +37,7 @@ #include <linux/fs.h> #include <linux/slab.h> #include <linux/math64.h> +#include <linux/ptrace.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -65,8 +66,9 @@ asmlinkage long sys_time(time_t __user * tloc) if (tloc) { if (put_user(i,tloc)) - i = -EFAULT; + return -EFAULT; } + force_successful_syscall_return(); return i; } diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index f8d9680..ea2f48a 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -166,6 +166,8 @@ static void clockevents_notify_released(void) void 
clockevents_register_device(struct clock_event_device *dev) { BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); + BUG_ON(!dev->cpumask); + /* * A nsec2cyc multiplicator of 0 is invalid and we'd crash * on it, so fix it up and emit a warning: diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 9ed2eec..ca89e15 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -145,10 +145,11 @@ static void clocksource_watchdog(unsigned long data) * Cycle through CPUs to check if the CPUs stay * synchronized to each other. */ - int next_cpu = next_cpu_nr(raw_smp_processor_id(), cpu_online_map); + int next_cpu = cpumask_next(raw_smp_processor_id(), + cpu_online_mask); if (next_cpu >= nr_cpu_ids) - next_cpu = first_cpu(cpu_online_map); + next_cpu = cpumask_first(cpu_online_mask); watchdog_timer.expires += WATCHDOG_INTERVAL; add_timer_on(&watchdog_timer, next_cpu); } @@ -173,7 +174,7 @@ static void clocksource_check_watchdog(struct clocksource *cs) watchdog_last = watchdog->read(); watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; add_timer_on(&watchdog_timer, - first_cpu(cpu_online_map)); + cpumask_first(cpu_online_mask)); } } else { if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) @@ -195,7 +196,7 @@ static void clocksource_check_watchdog(struct clocksource *cs) watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; add_timer_on(&watchdog_timer, - first_cpu(cpu_online_map)); + cpumask_first(cpu_online_mask)); } } } diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 1ca9955..06f1975 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -45,7 +45,7 @@ * * The value 8 is somewhat carefully chosen, as anything * larger can result in overflows. NSEC_PER_JIFFY grows as - * HZ shrinks, so values greater then 8 overflow 32bits when + * HZ shrinks, so values greater than 8 overflow 32bits when * HZ=100. */ #define JIFFIES_SHIFT 8 diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index f98a1b7..118a3b3 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -28,7 +28,9 @@ */ struct tick_device tick_broadcast_device; -static cpumask_t tick_broadcast_mask; +/* FIXME: Use cpumask_var_t. 
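 *
 * Until then, DECLARE_BITMAP(name, NR_CPUS) reserves the storage at
 * compile time and to_cpumask(name) yields a struct cpumask * usable with
 * the regular helpers, e.g.
 *
 *	cpumask_set_cpu(cpu, to_cpumask(tick_broadcast_mask));
 *
 * tmpmask below is shared scratch space and appears to rely on
 * tick_broadcast_lock for serialization (an observation, not a statement
 * from this patch).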
*/ +static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS); +static DECLARE_BITMAP(tmpmask, NR_CPUS); static DEFINE_SPINLOCK(tick_broadcast_lock); static int tick_broadcast_force; @@ -46,9 +48,9 @@ struct tick_device *tick_get_broadcast_device(void) return &tick_broadcast_device; } -cpumask_t *tick_get_broadcast_mask(void) +struct cpumask *tick_get_broadcast_mask(void) { - return &tick_broadcast_mask; + return to_cpumask(tick_broadcast_mask); } /* @@ -72,7 +74,7 @@ int tick_check_broadcast_device(struct clock_event_device *dev) clockevents_exchange_device(NULL, dev); tick_broadcast_device.evtdev = dev; - if (!cpus_empty(tick_broadcast_mask)) + if (!cpumask_empty(tick_get_broadcast_mask())) tick_broadcast_start_periodic(dev); return 1; } @@ -104,7 +106,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) */ if (!tick_device_is_functional(dev)) { dev->event_handler = tick_handle_periodic; - cpu_set(cpu, tick_broadcast_mask); + cpumask_set_cpu(cpu, tick_get_broadcast_mask()); tick_broadcast_start_periodic(tick_broadcast_device.evtdev); ret = 1; } else { @@ -116,7 +118,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { int cpu = smp_processor_id(); - cpu_clear(cpu, tick_broadcast_mask); + cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); tick_broadcast_clear_oneshot(cpu); } } @@ -125,9 +127,9 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) } /* - * Broadcast the event to the cpus, which are set in the mask + * Broadcast the event to the cpus, which are set in the mask (mangled). */ -static void tick_do_broadcast(cpumask_t mask) +static void tick_do_broadcast(struct cpumask *mask) { int cpu = smp_processor_id(); struct tick_device *td; @@ -135,21 +137,20 @@ static void tick_do_broadcast(cpumask_t mask) /* * Check, if the current cpu is in the mask */ - if (cpu_isset(cpu, mask)) { - cpu_clear(cpu, mask); + if (cpumask_test_cpu(cpu, mask)) { + cpumask_clear_cpu(cpu, mask); td = &per_cpu(tick_cpu_device, cpu); td->evtdev->event_handler(td->evtdev); } - if (!cpus_empty(mask)) { + if (!cpumask_empty(mask)) { /* * It might be necessary to actually check whether the devices * have different broadcast functions. For now, just use the * one of the first device. 
This works as long as we have this * misfeature only on x86 (lapic) */ - cpu = first_cpu(mask); - td = &per_cpu(tick_cpu_device, cpu); + td = &per_cpu(tick_cpu_device, cpumask_first(mask)); td->evtdev->broadcast(mask); } } @@ -160,12 +161,11 @@ static void tick_do_broadcast(cpumask_t mask) */ static void tick_do_periodic_broadcast(void) { - cpumask_t mask; - spin_lock(&tick_broadcast_lock); - cpus_and(mask, cpu_online_map, tick_broadcast_mask); - tick_do_broadcast(mask); + cpumask_and(to_cpumask(tmpmask), + cpu_online_mask, tick_get_broadcast_mask()); + tick_do_broadcast(to_cpumask(tmpmask)); spin_unlock(&tick_broadcast_lock); } @@ -228,13 +228,13 @@ static void tick_do_broadcast_on_off(void *why) if (!tick_device_is_functional(dev)) goto out; - bc_stopped = cpus_empty(tick_broadcast_mask); + bc_stopped = cpumask_empty(tick_get_broadcast_mask()); switch (*reason) { case CLOCK_EVT_NOTIFY_BROADCAST_ON: case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: - if (!cpu_isset(cpu, tick_broadcast_mask)) { - cpu_set(cpu, tick_broadcast_mask); + if (!cpumask_test_cpu(cpu, tick_get_broadcast_mask())) { + cpumask_set_cpu(cpu, tick_get_broadcast_mask()); if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) clockevents_shutdown(dev); @@ -244,8 +244,8 @@ static void tick_do_broadcast_on_off(void *why) break; case CLOCK_EVT_NOTIFY_BROADCAST_OFF: if (!tick_broadcast_force && - cpu_isset(cpu, tick_broadcast_mask)) { - cpu_clear(cpu, tick_broadcast_mask); + cpumask_test_cpu(cpu, tick_get_broadcast_mask())) { + cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) tick_setup_periodic(dev, 0); @@ -253,7 +253,7 @@ static void tick_do_broadcast_on_off(void *why) break; } - if (cpus_empty(tick_broadcast_mask)) { + if (cpumask_empty(tick_get_broadcast_mask())) { if (!bc_stopped) clockevents_shutdown(bc); } else if (bc_stopped) { @@ -272,7 +272,7 @@ out: */ void tick_broadcast_on_off(unsigned long reason, int *oncpu) { - if (!cpu_isset(*oncpu, cpu_online_map)) + if (!cpumask_test_cpu(*oncpu, cpu_online_mask)) printk(KERN_ERR "tick-broadcast: ignoring broadcast for " "offline CPU #%d\n", *oncpu); else @@ -303,10 +303,10 @@ void tick_shutdown_broadcast(unsigned int *cpup) spin_lock_irqsave(&tick_broadcast_lock, flags); bc = tick_broadcast_device.evtdev; - cpu_clear(cpu, tick_broadcast_mask); + cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { - if (bc && cpus_empty(tick_broadcast_mask)) + if (bc && cpumask_empty(tick_get_broadcast_mask())) clockevents_shutdown(bc); } @@ -342,10 +342,10 @@ int tick_resume_broadcast(void) switch (tick_broadcast_device.mode) { case TICKDEV_MODE_PERIODIC: - if(!cpus_empty(tick_broadcast_mask)) + if (!cpumask_empty(tick_get_broadcast_mask())) tick_broadcast_start_periodic(bc); - broadcast = cpu_isset(smp_processor_id(), - tick_broadcast_mask); + broadcast = cpumask_test_cpu(smp_processor_id(), + tick_get_broadcast_mask()); break; case TICKDEV_MODE_ONESHOT: broadcast = tick_resume_broadcast_oneshot(bc); @@ -360,14 +360,15 @@ int tick_resume_broadcast(void) #ifdef CONFIG_TICK_ONESHOT -static cpumask_t tick_broadcast_oneshot_mask; +/* FIXME: use cpumask_var_t. 
*/ +static DECLARE_BITMAP(tick_broadcast_oneshot_mask, NR_CPUS); /* - * Debugging: see timer_list.c + * Exposed for debugging: see timer_list.c */ -cpumask_t *tick_get_broadcast_oneshot_mask(void) +struct cpumask *tick_get_broadcast_oneshot_mask(void) { - return &tick_broadcast_oneshot_mask; + return to_cpumask(tick_broadcast_oneshot_mask); } static int tick_broadcast_set_event(ktime_t expires, int force) @@ -389,7 +390,7 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc) */ void tick_check_oneshot_broadcast(int cpu) { - if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) { + if (cpumask_test_cpu(cpu, to_cpumask(tick_broadcast_oneshot_mask))) { struct tick_device *td = &per_cpu(tick_cpu_device, cpu); clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT); @@ -402,7 +403,6 @@ void tick_check_oneshot_broadcast(int cpu) static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) { struct tick_device *td; - cpumask_t mask; ktime_t now, next_event; int cpu; @@ -410,13 +410,13 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) again: dev->next_event.tv64 = KTIME_MAX; next_event.tv64 = KTIME_MAX; - mask = CPU_MASK_NONE; + cpumask_clear(to_cpumask(tmpmask)); now = ktime_get(); /* Find all expired events */ - for_each_cpu_mask_nr(cpu, tick_broadcast_oneshot_mask) { + for_each_cpu(cpu, tick_get_broadcast_oneshot_mask()) { td = &per_cpu(tick_cpu_device, cpu); if (td->evtdev->next_event.tv64 <= now.tv64) - cpu_set(cpu, mask); + cpumask_set_cpu(cpu, to_cpumask(tmpmask)); else if (td->evtdev->next_event.tv64 < next_event.tv64) next_event.tv64 = td->evtdev->next_event.tv64; } @@ -424,7 +424,7 @@ again: /* * Wakeup the cpus which have an expired event. */ - tick_do_broadcast(mask); + tick_do_broadcast(to_cpumask(tmpmask)); /* * Two reasons for reprogram: @@ -476,15 +476,16 @@ void tick_broadcast_oneshot_control(unsigned long reason) goto out; if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { - if (!cpu_isset(cpu, tick_broadcast_oneshot_mask)) { - cpu_set(cpu, tick_broadcast_oneshot_mask); + if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { + cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask()); clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); if (dev->next_event.tv64 < bc->next_event.tv64) tick_broadcast_set_event(dev->next_event, 1); } } else { - if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) { - cpu_clear(cpu, tick_broadcast_oneshot_mask); + if (cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { + cpumask_clear_cpu(cpu, + tick_get_broadcast_oneshot_mask()); clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); if (dev->next_event.tv64 != KTIME_MAX) tick_program_event(dev->next_event, 1); @@ -502,15 +503,16 @@ out: */ static void tick_broadcast_clear_oneshot(int cpu) { - cpu_clear(cpu, tick_broadcast_oneshot_mask); + cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); } -static void tick_broadcast_init_next_event(cpumask_t *mask, ktime_t expires) +static void tick_broadcast_init_next_event(struct cpumask *mask, + ktime_t expires) { struct tick_device *td; int cpu; - for_each_cpu_mask_nr(cpu, *mask) { + for_each_cpu(cpu, mask) { td = &per_cpu(tick_cpu_device, cpu); if (td->evtdev) td->evtdev->next_event = expires; @@ -526,7 +528,6 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) if (bc->event_handler != tick_handle_oneshot_broadcast) { int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; int cpu = smp_processor_id(); - cpumask_t mask; bc->event_handler = tick_handle_oneshot_broadcast; 
clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); @@ -540,13 +541,15 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) * oneshot_mask bits for those and program the * broadcast device to fire. */ - mask = tick_broadcast_mask; - cpu_clear(cpu, mask); - cpus_or(tick_broadcast_oneshot_mask, - tick_broadcast_oneshot_mask, mask); - - if (was_periodic && !cpus_empty(mask)) { - tick_broadcast_init_next_event(&mask, tick_next_period); + cpumask_copy(to_cpumask(tmpmask), tick_get_broadcast_mask()); + cpumask_clear_cpu(cpu, to_cpumask(tmpmask)); + cpumask_or(tick_get_broadcast_oneshot_mask(), + tick_get_broadcast_oneshot_mask(), + to_cpumask(tmpmask)); + + if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) { + tick_broadcast_init_next_event(to_cpumask(tmpmask), + tick_next_period); tick_broadcast_set_event(tick_next_period, 1); } else bc->next_event.tv64 = KTIME_MAX; @@ -585,7 +588,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) * Clear the broadcast mask flag for the dead cpu, but do not * stop the broadcast device! */ - cpu_clear(cpu, tick_broadcast_oneshot_mask); + cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); spin_unlock_irqrestore(&tick_broadcast_lock, flags); } diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index df12434..63e05d4 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -136,7 +136,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) */ static void tick_setup_device(struct tick_device *td, struct clock_event_device *newdev, int cpu, - const cpumask_t *cpumask) + const struct cpumask *cpumask) { ktime_t next_event; void (*handler)(struct clock_event_device *) = NULL; @@ -171,8 +171,8 @@ static void tick_setup_device(struct tick_device *td, * When the device is not per cpu, pin the interrupt to the * current cpu: */ - if (!cpus_equal(newdev->cpumask, *cpumask)) - irq_set_affinity(newdev->irq, *cpumask); + if (!cpumask_equal(newdev->cpumask, cpumask)) + irq_set_affinity(newdev->irq, cpumask); /* * When global broadcasting is active, check if the current @@ -202,14 +202,14 @@ static int tick_check_new_device(struct clock_event_device *newdev) spin_lock_irqsave(&tick_device_lock, flags); cpu = smp_processor_id(); - if (!cpu_isset(cpu, newdev->cpumask)) + if (!cpumask_test_cpu(cpu, newdev->cpumask)) goto out_bc; td = &per_cpu(tick_cpu_device, cpu); curdev = td->evtdev; /* cpu local device ? 
*/ - if (!cpus_equal(newdev->cpumask, cpumask_of_cpu(cpu))) { + if (!cpumask_equal(newdev->cpumask, cpumask_of(cpu))) { /* * If the cpu affinity of the device interrupt can not @@ -222,7 +222,7 @@ static int tick_check_new_device(struct clock_event_device *newdev) * If we have a cpu local device already, do not replace it * by a non cpu local device */ - if (curdev && cpus_equal(curdev->cpumask, cpumask_of_cpu(cpu))) + if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu))) goto out_bc; } @@ -254,7 +254,7 @@ static int tick_check_new_device(struct clock_event_device *newdev) curdev = NULL; } clockevents_exchange_device(curdev, newdev); - tick_setup_device(td, newdev, cpu, &cpumask_of_cpu(cpu)); + tick_setup_device(td, newdev, cpu, cpumask_of(cpu)); if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) tick_oneshot_notify(); @@ -299,9 +299,9 @@ static void tick_shutdown(unsigned int *cpup) } /* Transfer the do_timer job away from this cpu */ if (*cpup == tick_do_timer_cpu) { - int cpu = first_cpu(cpu_online_map); + int cpu = cpumask_first(cpu_online_mask); - tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : + tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu : TICK_DO_TIMER_NONE; } spin_unlock_irqrestore(&tick_device_lock, flags); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 8f3fc25..1b6c05b 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -144,7 +144,7 @@ void tick_nohz_update_jiffies(void) if (!ts->tick_stopped) return; - cpu_clear(cpu, nohz_cpu_mask); + cpumask_clear_cpu(cpu, nohz_cpu_mask); now = ktime_get(); ts->idle_waketime = now; @@ -301,7 +301,7 @@ void tick_nohz_stop_sched_tick(int inidle) tick_do_timer_cpu = TICK_DO_TIMER_NONE; if (delta_jiffies > 1) - cpu_set(cpu, nohz_cpu_mask); + cpumask_set_cpu(cpu, nohz_cpu_mask); /* Skip reprogram of event if its not changed */ if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) @@ -319,7 +319,7 @@ void tick_nohz_stop_sched_tick(int inidle) /* * sched tick not stopped! */ - cpu_clear(cpu, nohz_cpu_mask); + cpumask_clear_cpu(cpu, nohz_cpu_mask); goto out; } @@ -361,7 +361,7 @@ void tick_nohz_stop_sched_tick(int inidle) * softirq. */ tick_do_update_jiffies64(ktime_get()); - cpu_clear(cpu, nohz_cpu_mask); + cpumask_clear_cpu(cpu, nohz_cpu_mask); } raise_softirq_irqoff(TIMER_SOFTIRQ); out: @@ -419,7 +419,9 @@ void tick_nohz_restart_sched_tick(void) { int cpu = smp_processor_id(); struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); +#ifndef CONFIG_VIRT_CPU_ACCOUNTING unsigned long ticks; +#endif ktime_t now; local_irq_disable(); @@ -439,8 +441,9 @@ void tick_nohz_restart_sched_tick(void) select_nohz_load_balancer(0); now = ktime_get(); tick_do_update_jiffies64(now); - cpu_clear(cpu, nohz_cpu_mask); + cpumask_clear_cpu(cpu, nohz_cpu_mask); +#ifndef CONFIG_VIRT_CPU_ACCOUNTING /* * We stopped the tick in idle. Update process times would miss the * time we slept as update_process_times does only a 1 tick @@ -450,12 +453,9 @@ void tick_nohz_restart_sched_tick(void) /* * We might be one off. Do not randomly account a huge number of ticks! 
*/ - if (ticks && ticks < LONG_MAX) { - add_preempt_count(HARDIRQ_OFFSET); - account_system_time(current, HARDIRQ_OFFSET, - jiffies_to_cputime(ticks)); - sub_preempt_count(HARDIRQ_OFFSET); - } + if (ticks && ticks < LONG_MAX) + account_idle_ticks(ticks); +#endif touch_softlockup_watchdog(); /* diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index fa05e88..900f1b6 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -46,6 +46,9 @@ struct timespec xtime __attribute__ ((aligned (16))); struct timespec wall_to_monotonic __attribute__ ((aligned (16))); static unsigned long total_sleep_time; /* seconds */ +/* flag for if timekeeping is suspended */ +int __read_mostly timekeeping_suspended; + static struct timespec xtime_cache __attribute__ ((aligned (16))); void update_xtime_cache(u64 nsec) { @@ -92,6 +95,8 @@ void getnstimeofday(struct timespec *ts) unsigned long seq; s64 nsecs; + WARN_ON(timekeeping_suspended); + do { seq = read_seqbegin(&xtime_lock); @@ -299,8 +304,6 @@ void __init timekeeping_init(void) write_sequnlock_irqrestore(&xtime_lock, flags); } -/* flag for if timekeeping is suspended */ -static int timekeeping_suspended; /* time in seconds when suspend began */ static unsigned long timekeeping_suspend_time; diff --git a/kernel/timer.c b/kernel/timer.c index 566257d..dee3f64 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1018,21 +1018,6 @@ unsigned long get_next_timer_interrupt(unsigned long now) } #endif -#ifndef CONFIG_VIRT_CPU_ACCOUNTING -void account_process_tick(struct task_struct *p, int user_tick) -{ - cputime_t one_jiffy = jiffies_to_cputime(1); - - if (user_tick) { - account_user_time(p, one_jiffy); - account_user_time_scaled(p, cputime_to_scaled(one_jiffy)); - } else { - account_system_time(p, HARDIRQ_OFFSET, one_jiffy); - account_system_time_scaled(p, cputime_to_scaled(one_jiffy)); - } -} -#endif - /* * Called from the timer interrupt handler to charge one tick to the current * process. user_tick is 1 if the tick is user time, 0 for system. 
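The tick, timekeeping and timer hunks above all follow the same conversion: open-coded cpumask_t values and the old cpu_set()/cpu_isset()/cpus_empty()/for_each_cpu_mask_nr() helpers become the pointer-based struct cpumask accessors, with fixed masks kept as DECLARE_BITMAP(..., NR_CPUS) plus to_cpumask() until they can move to cpumask_var_t (see the FIXME on tick_broadcast_oneshot_mask). The sketch below is not part of the patch; demo_mask, demo_get_mask and the printouts are made-up names used only to show the new idiom in a self-contained module.

/* cpumask-accessor sketch -- illustrative only, not from this patch */
#include <linux/cpumask.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>

/* Old style: static cpumask_t demo_mask;
 * New style: keep the storage as a bitmap and hand out a
 * struct cpumask pointer, like tick_broadcast_oneshot_mask above. */
static DECLARE_BITMAP(demo_mask, NR_CPUS);

static struct cpumask *demo_get_mask(void)
{
	return to_cpumask(demo_mask);
}

static int __init demo_init(void)
{
	int cpu;

	/* cpu_set(cpu, m)        -> cpumask_set_cpu(cpu, mask)
	 * cpu_isset(cpu, m)      -> cpumask_test_cpu(cpu, mask)
	 * cpus_empty(m)          -> cpumask_empty(mask)
	 * for_each_cpu_mask_nr() -> for_each_cpu(cpu, mask) */
	cpumask_copy(demo_get_mask(), cpu_online_mask);
	cpumask_clear_cpu(cpumask_first(cpu_online_mask), demo_get_mask());

	for_each_cpu(cpu, demo_get_mask())
		pr_info("demo: cpu %d is in the mask\n", cpu);

	if (cpumask_empty(demo_get_mask()))
		pr_info("demo: mask is empty\n");

	return 0;
}

static void __exit demo_exit(void)
{
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

The payoff of passing struct cpumask pointers rather than copying whole cpumask_t values on the stack is what the tick_do_periodic_broadcast() and tick_handle_oneshot_broadcast() hunks exploit: the temporary mask lives in a shared tmpmask bitmap instead of a per-call NR_CPUS-sized stack object.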
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1d601a7..8b0daf0 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -168,7 +168,13 @@ rb_event_length(struct ring_buffer_event *event) */ unsigned ring_buffer_event_length(struct ring_buffer_event *event) { - return rb_event_length(event); + unsigned length = rb_event_length(event); + if (event->type != RINGBUF_TYPE_DATA) + return length; + length -= RB_EVNT_HDR_SIZE; + if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0])) + length -= sizeof(event->array[0]); + return length; } EXPORT_SYMBOL_GPL(ring_buffer_event_length); @@ -195,7 +201,7 @@ void *ring_buffer_event_data(struct ring_buffer_event *event) EXPORT_SYMBOL_GPL(ring_buffer_event_data); #define for_each_buffer_cpu(buffer, cpu) \ - for_each_cpu_mask(cpu, buffer->cpumask) + for_each_cpu(cpu, buffer->cpumask) #define TS_SHIFT 27 #define TS_MASK ((1ULL << TS_SHIFT) - 1) @@ -267,7 +273,7 @@ struct ring_buffer { unsigned pages; unsigned flags; int cpus; - cpumask_t cpumask; + cpumask_var_t cpumask; atomic_t record_disabled; struct mutex mutex; @@ -458,6 +464,9 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) if (!buffer) return NULL; + if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) + goto fail_free_buffer; + buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); buffer->flags = flags; @@ -465,14 +474,14 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) if (buffer->pages == 1) buffer->pages++; - buffer->cpumask = cpu_possible_map; + cpumask_copy(buffer->cpumask, cpu_possible_mask); buffer->cpus = nr_cpu_ids; bsize = sizeof(void *) * nr_cpu_ids; buffer->buffers = kzalloc(ALIGN(bsize, cache_line_size()), GFP_KERNEL); if (!buffer->buffers) - goto fail_free_buffer; + goto fail_free_cpumask; for_each_buffer_cpu(buffer, cpu) { buffer->buffers[cpu] = @@ -492,6 +501,9 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) } kfree(buffer->buffers); + fail_free_cpumask: + free_cpumask_var(buffer->cpumask); + fail_free_buffer: kfree(buffer); return NULL; @@ -510,6 +522,8 @@ ring_buffer_free(struct ring_buffer *buffer) for_each_buffer_cpu(buffer, cpu) rb_free_cpu_buffer(buffer->buffers[cpu]); + free_cpumask_var(buffer->cpumask); + kfree(buffer); } EXPORT_SYMBOL_GPL(ring_buffer_free); @@ -1283,7 +1297,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, cpu = raw_smp_processor_id(); - if (!cpu_isset(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask)) goto out; cpu_buffer = buffer->buffers[cpu]; @@ -1396,7 +1410,7 @@ int ring_buffer_write(struct ring_buffer *buffer, cpu = raw_smp_processor_id(); - if (!cpu_isset(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask)) goto out; cpu_buffer = buffer->buffers[cpu]; @@ -1478,7 +1492,7 @@ void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - if (!cpu_isset(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; cpu_buffer = buffer->buffers[cpu]; @@ -1498,7 +1512,7 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - if (!cpu_isset(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; cpu_buffer = buffer->buffers[cpu]; @@ -1515,7 +1529,7 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - if (!cpu_isset(cpu, buffer->cpumask)) + if 
(!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; cpu_buffer = buffer->buffers[cpu]; @@ -1532,7 +1546,7 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - if (!cpu_isset(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; cpu_buffer = buffer->buffers[cpu]; @@ -1850,7 +1864,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) struct buffer_page *reader; int nr_loops = 0; - if (!cpu_isset(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; cpu_buffer = buffer->buffers[cpu]; @@ -2025,7 +2039,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) struct ring_buffer_event *event; unsigned long flags; - if (!cpu_isset(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; spin_lock_irqsave(&cpu_buffer->reader_lock, flags); @@ -2062,7 +2076,7 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu) struct ring_buffer_iter *iter; unsigned long flags; - if (!cpu_isset(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; iter = kmalloc(sizeof(*iter), GFP_KERNEL); @@ -2172,7 +2186,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; unsigned long flags; - if (!cpu_isset(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; spin_lock_irqsave(&cpu_buffer->reader_lock, flags); @@ -2228,7 +2242,7 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - if (!cpu_isset(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 1; cpu_buffer = buffer->buffers[cpu]; @@ -2252,8 +2266,8 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, struct ring_buffer_per_cpu *cpu_buffer_a; struct ring_buffer_per_cpu *cpu_buffer_b; - if (!cpu_isset(cpu, buffer_a->cpumask) || - !cpu_isset(cpu, buffer_b->cpumask)) + if (!cpumask_test_cpu(cpu, buffer_a->cpumask) || + !cpumask_test_cpu(cpu, buffer_b->cpumask)) return -EINVAL; /* At least make sure the two buffers are somewhat the same */ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4185d52..c580233 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -89,10 +89,10 @@ static inline void ftrace_enable_cpu(void) preempt_enable(); } -static cpumask_t __read_mostly tracing_buffer_mask; +static cpumask_var_t __read_mostly tracing_buffer_mask; #define for_each_tracing_cpu(cpu) \ - for_each_cpu_mask(cpu, tracing_buffer_mask) + for_each_cpu(cpu, tracing_buffer_mask) /* * ftrace_dump_on_oops - variable to dump ftrace buffer on oops @@ -1811,10 +1811,10 @@ static void test_cpu_buff_start(struct trace_iterator *iter) if (!(iter->iter_flags & TRACE_FILE_ANNOTATE)) return; - if (cpu_isset(iter->cpu, iter->started)) + if (cpumask_test_cpu(iter->cpu, iter->started)) return; - cpu_set(iter->cpu, iter->started); + cpumask_set_cpu(iter->cpu, iter->started); trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu); } @@ -2646,13 +2646,7 @@ static struct file_operations show_traces_fops = { /* * Only trace on a CPU if the bitmask is set: */ -static cpumask_t tracing_cpumask = CPU_MASK_ALL; - -/* - * When tracing/tracing_cpu_mask is modified then this holds - * the new bitmask we are about to install: - */ -static cpumask_t tracing_cpumask_new; +static cpumask_var_t tracing_cpumask; /* * The tracer itself will not take this lock, but still we want @@ -2693,6 
+2687,10 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, size_t count, loff_t *ppos) { int err, cpu; + cpumask_var_t tracing_cpumask_new; + + if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL)) + return -ENOMEM; mutex_lock(&tracing_cpumask_update_lock); err = cpumask_parse_user(ubuf, count, tracing_cpumask_new); @@ -2706,26 +2704,28 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, * Increase/decrease the disabled counter if we are * about to flip a bit in the cpumask: */ - if (cpu_isset(cpu, tracing_cpumask) && - !cpu_isset(cpu, tracing_cpumask_new)) { + if (cpumask_test_cpu(cpu, tracing_cpumask) && + !cpumask_test_cpu(cpu, tracing_cpumask_new)) { atomic_inc(&global_trace.data[cpu]->disabled); } - if (!cpu_isset(cpu, tracing_cpumask) && - cpu_isset(cpu, tracing_cpumask_new)) { + if (!cpumask_test_cpu(cpu, tracing_cpumask) && + cpumask_test_cpu(cpu, tracing_cpumask_new)) { atomic_dec(&global_trace.data[cpu]->disabled); } } __raw_spin_unlock(&ftrace_max_lock); local_irq_enable(); - tracing_cpumask = tracing_cpumask_new; + cpumask_copy(tracing_cpumask, tracing_cpumask_new); mutex_unlock(&tracing_cpumask_update_lock); + free_cpumask_var(tracing_cpumask_new); return count; err_unlock: mutex_unlock(&tracing_cpumask_update_lock); + free_cpumask_var(tracing_cpumask); return err; } @@ -3114,10 +3114,15 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) if (!iter) return -ENOMEM; + if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { + kfree(iter); + return -ENOMEM; + } + mutex_lock(&trace_types_lock); /* trace pipe does not show start of buffer */ - cpus_setall(iter->started); + cpumask_setall(iter->started); iter->tr = &global_trace; iter->trace = current_trace; @@ -3134,6 +3139,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) { struct trace_iterator *iter = file->private_data; + free_cpumask_var(iter->started); kfree(iter); atomic_dec(&tracing_reader); @@ -3752,7 +3758,6 @@ void ftrace_dump(void) static DEFINE_SPINLOCK(ftrace_dump_lock); /* use static because iter can be a bit big for the stack */ static struct trace_iterator iter; - static cpumask_t mask; static int dump_ran; unsigned long flags; int cnt = 0, cpu; @@ -3786,8 +3791,6 @@ void ftrace_dump(void) * and then release the locks again. 
*/ - cpus_clear(mask); - while (!trace_empty(&iter)) { if (!cnt) @@ -3823,19 +3826,28 @@ __init static int tracer_alloc_buffers(void) { struct trace_array_cpu *data; int i; + int ret = -ENOMEM; - /* TODO: make the number of buffers hot pluggable with CPUS */ - tracing_buffer_mask = cpu_possible_map; + if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) + goto out; + + if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) + goto out_free_buffer_mask; + + cpumask_copy(tracing_buffer_mask, cpu_possible_mask); + cpumask_copy(tracing_cpumask, cpu_all_mask); + /* TODO: make the number of buffers hot pluggable with CPUS */ global_trace.buffer = ring_buffer_alloc(trace_buf_size, TRACE_BUFFER_FLAGS); if (!global_trace.buffer) { printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); WARN_ON(1); - return 0; + goto out_free_cpumask; } global_trace.entries = ring_buffer_size(global_trace.buffer); + #ifdef CONFIG_TRACER_MAX_TRACE max_tr.buffer = ring_buffer_alloc(trace_buf_size, TRACE_BUFFER_FLAGS); @@ -3843,7 +3855,7 @@ __init static int tracer_alloc_buffers(void) printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); WARN_ON(1); ring_buffer_free(global_trace.buffer); - return 0; + goto out_free_cpumask; } max_tr.entries = ring_buffer_size(max_tr.buffer); WARN_ON(max_tr.entries != global_trace.entries); @@ -3873,8 +3885,14 @@ __init static int tracer_alloc_buffers(void) &trace_panic_notifier); register_die_notifier(&trace_die_notifier); + ret = 0; - return 0; +out_free_cpumask: + free_cpumask_var(tracing_cpumask); +out_free_buffer_mask: + free_cpumask_var(tracing_buffer_mask); +out: + return ret; } early_initcall(tracer_alloc_buffers); fs_initcall(tracer_init_debugfs); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index cc7a4f8..4d3d381 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -368,7 +368,7 @@ struct trace_iterator { loff_t pos; long idx; - cpumask_t started; + cpumask_var_t started; }; int tracing_is_enabled(void); diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 3ccebde..366c8c3 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -42,7 +42,7 @@ static int boot_trace_init(struct trace_array *tr) int cpu; boot_trace = tr; - for_each_cpu_mask(cpu, cpu_possible_map) + for_each_cpu(cpu, cpu_possible_mask) tracing_reset(tr, cpu); tracing_sched_switch_assign_trace(tr); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 4bf39fc..930c08e 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -79,7 +79,7 @@ print_graph_cpu(struct trace_seq *s, int cpu) int i; int ret; int log10_this = log10_cpu(cpu); - int log10_all = log10_cpu(cpus_weight_nr(cpu_online_map)); + int log10_all = log10_cpu(cpumask_weight(cpu_online_mask)); /* diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index b6a3e20..649df22 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -46,7 +46,7 @@ static void bts_trace_start(struct trace_array *tr) tracing_reset_online_cpus(tr); - for_each_cpu_mask(cpu, cpu_possible_map) + for_each_cpu(cpu, cpu_possible_mask) smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1); } @@ -62,7 +62,7 @@ static void bts_trace_stop(struct trace_array *tr) { int cpu; - for_each_cpu_mask(cpu, cpu_possible_map) + for_each_cpu(cpu, cpu_possible_mask) smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1); } @@ -172,7 +172,7 @@ static void 
trace_bts_prepare(struct trace_iterator *iter) { int cpu; - for_each_cpu_mask(cpu, cpu_possible_map) + for_each_cpu(cpu, cpu_possible_mask) smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 1); } diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c index a7172a3..7bda248 100644 --- a/kernel/trace/trace_power.c +++ b/kernel/trace/trace_power.c @@ -39,7 +39,7 @@ static int power_trace_init(struct trace_array *tr) trace_power_enabled = 1; - for_each_cpu_mask(cpu, cpu_possible_map) + for_each_cpu(cpu, cpu_possible_mask) tracing_reset(tr, cpu); return 0; } diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index a5779bd..eaca5ad 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -196,9 +196,9 @@ static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer) return HRTIMER_RESTART; } -static void start_stack_timer(int cpu) +static void start_stack_timer(void *unused) { - struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu); + struct hrtimer *hrtimer = &__get_cpu_var(stack_trace_hrtimer); hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer->function = stack_trace_timer_fn; @@ -208,14 +208,7 @@ static void start_stack_timer(int cpu) static void start_stack_timers(void) { - cpumask_t saved_mask = current->cpus_allowed; - int cpu; - - for_each_online_cpu(cpu) { - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - start_stack_timer(cpu); - } - set_cpus_allowed_ptr(current, &saved_mask); + on_each_cpu(start_stack_timer, NULL, 1); } static void stop_stack_timer(int cpu) diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 2dc06ab..43f891b 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -92,8 +92,8 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) mm = get_task_mm(p); if (mm) { /* adjust to KB unit */ - stats->hiwater_rss = mm->hiwater_rss * PAGE_SIZE / KB; - stats->hiwater_vm = mm->hiwater_vm * PAGE_SIZE / KB; + stats->hiwater_rss = get_mm_hiwater_rss(mm) * PAGE_SIZE / KB; + stats->hiwater_vm = get_mm_hiwater_vm(mm) * PAGE_SIZE / KB; mmput(mm); } stats->read_char = p->ioac.rchar; diff --git a/kernel/up.c b/kernel/up.c new file mode 100644 index 0000000..1ff27a2 --- /dev/null +++ b/kernel/up.c @@ -0,0 +1,21 @@ +/* + * Uniprocessor-only support functions. The counterpart to kernel/smp.c + */ + +#include <linux/interrupt.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/smp.h> + +int smp_call_function_single(int cpu, void (*func) (void *info), void *info, + int wait) +{ + WARN_ON(cpu != 0); + + local_irq_disable(); + (func)(info); + local_irq_enable(); + + return 0; +} +EXPORT_SYMBOL(smp_call_function_single); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4952322..2f44583 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -73,7 +73,7 @@ static DEFINE_SPINLOCK(workqueue_lock); static LIST_HEAD(workqueues); static int singlethread_cpu __read_mostly; -static cpumask_t cpu_singlethread_map __read_mostly; +static const struct cpumask *cpu_singlethread_map __read_mostly; /* * _cpu_down() first removes CPU from cpu_online_map, then CPU_DEAD * flushes cwq->worklist. This means that flush_workqueue/wait_on_work @@ -81,7 +81,7 @@ static cpumask_t cpu_singlethread_map __read_mostly; * use cpu_possible_map, the cpumask below is more a documentation * than optimization. */ -static cpumask_t cpu_populated_map __read_mostly; +static cpumask_var_t cpu_populated_map __read_mostly; /* If it's single threaded, it isn't in the list of workqueues. 
*/ static inline int is_wq_single_threaded(struct workqueue_struct *wq) @@ -89,10 +89,10 @@ return wq->singlethread; } -static const cpumask_t *wq_cpu_map(struct workqueue_struct *wq) +static const struct cpumask *wq_cpu_map(struct workqueue_struct *wq) { return is_wq_single_threaded(wq) - ? &cpu_singlethread_map : &cpu_populated_map; + ? cpu_singlethread_map : cpu_populated_map; } static @@ -410,7 +410,7 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) */ void flush_workqueue(struct workqueue_struct *wq) { - const cpumask_t *cpu_map = wq_cpu_map(wq); + const struct cpumask *cpu_map = wq_cpu_map(wq); int cpu; might_sleep(); @@ -532,7 +532,7 @@ static void wait_on_work(struct work_struct *work) { struct cpu_workqueue_struct *cwq; struct workqueue_struct *wq; - const cpumask_t *cpu_map; + const struct cpumask *cpu_map; int cpu; might_sleep(); @@ -903,7 +903,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) */ void destroy_workqueue(struct workqueue_struct *wq) { - const cpumask_t *cpu_map = wq_cpu_map(wq); + const struct cpumask *cpu_map = wq_cpu_map(wq); int cpu; cpu_maps_update_begin(); @@ -933,7 +933,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, switch (action) { case CPU_UP_PREPARE: - cpu_set(cpu, cpu_populated_map); + cpumask_set_cpu(cpu, cpu_populated_map); } undo: list_for_each_entry(wq, &workqueues, list) { @@ -964,7 +964,7 @@ undo: switch (action) { case CPU_UP_CANCELED: case CPU_POST_DEAD: - cpu_clear(cpu, cpu_populated_map); + cpumask_clear_cpu(cpu, cpu_populated_map); } return ret; @@ -1017,9 +1017,11 @@ EXPORT_SYMBOL_GPL(work_on_cpu); void __init init_workqueues(void) { - cpu_populated_map = cpu_online_map; - singlethread_cpu = first_cpu(cpu_possible_map); - cpu_singlethread_map = cpumask_of_cpu(singlethread_cpu); + alloc_cpumask_var(&cpu_populated_map, GFP_KERNEL); + + cpumask_copy(cpu_populated_map, cpu_online_mask); + singlethread_cpu = cpumask_first(cpu_possible_mask); + cpu_singlethread_map = cpumask_of(singlethread_cpu); hotcpu_notifier(workqueue_cpu_callback, 0); keventd_wq = create_workqueue("events"); BUG_ON(!keventd_wq);
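The second pattern that repeats in the ring_buffer, trace and workqueue hunks is the move from a fixed cpumask_t member or global to cpumask_var_t: allocate it with alloc_cpumask_var() (which can fail once CONFIG_CPUMASK_OFFSTACK=y makes the mask a separate allocation), seed it from cpu_possible_mask or cpu_online_mask, and release it with free_cpumask_var() on both the error path and normal teardown. The condensed sketch below is illustrative only; the demo_ctx structure, demo_alloc/demo_free names and labels are invented, not taken from the patch.

/* cpumask_var_t lifecycle sketch -- illustrative, not from this patch */
#include <linux/cpumask.h>
#include <linux/gfp.h>
#include <linux/slab.h>

struct demo_ctx {
	cpumask_var_t cpumask;		/* was: cpumask_t cpumask; */
	void **buffers;
};

static struct demo_ctx *demo_alloc(void)
{
	struct demo_ctx *ctx;

	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return NULL;

	/* With CONFIG_CPUMASK_OFFSTACK the mask is allocated separately
	 * and the allocation can fail, so every caller needs an unwind
	 * path, just like fail_free_cpumask in ring_buffer_alloc(). */
	if (!alloc_cpumask_var(&ctx->cpumask, GFP_KERNEL))
		goto fail_free_ctx;

	cpumask_copy(ctx->cpumask, cpu_possible_mask);

	ctx->buffers = kcalloc(nr_cpu_ids, sizeof(void *), GFP_KERNEL);
	if (!ctx->buffers)
		goto fail_free_cpumask;

	return ctx;

 fail_free_cpumask:
	free_cpumask_var(ctx->cpumask);
 fail_free_ctx:
	kfree(ctx);
	return NULL;
}

static void demo_free(struct demo_ctx *ctx)
{
	kfree(ctx->buffers);
	free_cpumask_var(ctx->cpumask);	/* no-op when OFFSTACK=n */
	kfree(ctx);
}

This is why tracer_alloc_buffers() in the trace.c hunk stops returning 0 on ring-buffer allocation failure and instead funnels every failure through out_free_cpumask/out_free_buffer_mask: once the masks are dynamically allocated, each early exit has to unwind them.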