summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/.gitignore1
-rw-r--r--kernel/Makefile44
-rw-r--r--kernel/acct.c2
-rw-r--r--kernel/capability.c24
-rw-r--r--kernel/cgroup.c325
-rw-r--r--kernel/compat.c74
-rw-r--r--kernel/cpuset.c884
-rw-r--r--kernel/debug/debug_core.c1
-rw-r--r--kernel/debug/debug_core.h2
-rw-r--r--kernel/debug/gdbstub.c4
-rw-r--r--kernel/debug/kdb/kdb_bp.c20
-rw-r--r--kernel/debug/kdb/kdb_debugger.c25
-rw-r--r--kernel/debug/kdb/kdb_main.c135
-rw-r--r--kernel/debug/kdb/kdb_private.h4
-rw-r--r--kernel/events/core.c34
-rw-r--r--kernel/events/internal.h2
-rw-r--r--kernel/events/ring_buffer.c22
-rw-r--r--kernel/exit.c4
-rw-r--r--kernel/fork.c14
-rw-r--r--kernel/futex.c50
-rw-r--r--kernel/futex_compat.c21
-rw-r--r--kernel/gcov/Kconfig2
-rw-r--r--kernel/hrtimer.c3
-rw-r--r--kernel/irq/proc.c2
-rw-r--r--kernel/kexec.c184
-rw-r--r--kernel/kfifo.c609
-rw-r--r--kernel/kprobes.c54
-rw-r--r--kernel/kthread.c52
-rw-r--r--kernel/lockdep.c42
-rw-r--r--kernel/module.c142
-rw-r--r--kernel/mutex.c151
-rw-r--r--kernel/nsproxy.c5
-rw-r--r--kernel/panic.c34
-rw-r--r--kernel/pid.c3
-rw-r--r--kernel/pid_namespace.c3
-rw-r--r--kernel/posix-timers.c25
-rw-r--r--kernel/power/autosleep.c2
-rw-r--r--kernel/power/main.c29
-rw-r--r--kernel/power/process.c4
-rw-r--r--kernel/power/qos.c9
-rw-r--r--kernel/power/suspend.c69
-rw-r--r--kernel/power/suspend_test.c11
-rw-r--r--kernel/printk.c89
-rw-r--r--kernel/relay.c4
-rw-r--r--kernel/rtmutex-tester.c5
-rw-r--r--kernel/sched/auto_group.c3
-rw-r--r--kernel/sched/clock.c26
-rw-r--r--kernel/sched/core.c242
-rw-r--r--kernel/sched/cputime.c4
-rw-r--r--kernel/sched/debug.c97
-rw-r--r--kernel/sched/features.h7
-rw-r--r--kernel/sched/stats.c79
-rw-r--r--kernel/signal.c351
-rw-r--r--kernel/smp.c183
-rw-r--r--kernel/smpboot.c16
-rw-r--r--kernel/softirq.c38
-rw-r--r--kernel/stop_machine.c2
-rw-r--r--kernel/sys.c367
-rw-r--r--kernel/sysctl.c19
-rw-r--r--kernel/sysctl_binary.c43
-rw-r--r--kernel/time.c4
-rw-r--r--kernel/time/clockevents.c1
-rw-r--r--kernel/time/ntp.c26
-rw-r--r--kernel/time/tick-broadcast.c3
-rw-r--r--kernel/time/tick-sched.c2
-rw-r--r--kernel/time/timekeeping.c26
-rw-r--r--kernel/timeconst.bc108
-rw-r--r--kernel/timeconst.pl376
-rw-r--r--kernel/trace/Kconfig15
-rw-r--r--kernel/trace/blktrace.c2
-rw-r--r--kernel/trace/ftrace.c74
-rw-r--r--kernel/trace/power-traces.c3
-rw-r--r--kernel/trace/ring_buffer.c6
-rw-r--r--kernel/trace/trace.c5
-rw-r--r--kernel/trace/trace_output.c3
-rw-r--r--kernel/trace/trace_stack.c2
-rw-r--r--kernel/tracepoint.c6
-rw-r--r--kernel/user-return-notifier.c4
-rw-r--r--kernel/user.c9
-rw-r--r--kernel/user_namespace.c99
-rw-r--r--kernel/utsname.c2
-rw-r--r--kernel/utsname_sysctl.c3
-rw-r--r--kernel/watchdog.c10
-rw-r--r--kernel/workqueue.c64
84 files changed, 2740 insertions, 2815 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore
index ab4f109..b3097bd 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -4,3 +4,4 @@
config_data.h
config_data.gz
timeconst.h
+hz.bc
diff --git a/kernel/Makefile b/kernel/Makefile
index 6c072b6..bbde5f1 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -7,7 +7,7 @@ obj-y = fork.o exec_domain.o panic.o printk.o \
sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
signal.o sys.o kmod.o workqueue.o pid.o task_work.o \
rcupdate.o extable.o params.o posix-timers.o \
- kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
+ kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \
hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
notifier.o ksysfs.o cred.o \
async.o range.o groups.o lglock.o smpboot.o
@@ -25,9 +25,7 @@ endif
obj-y += sched/
obj-y += power/
-ifeq ($(CONFIG_CHECKPOINT_RESTORE),y)
-obj-$(CONFIG_X86) += kcmp.o
-endif
+obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
obj-$(CONFIG_FREEZER) += freezer.o
obj-$(CONFIG_PROFILING) += profile.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
@@ -127,11 +125,19 @@ $(obj)/config_data.h: $(obj)/config_data.gz FORCE
$(obj)/time.o: $(obj)/timeconst.h
-quiet_cmd_timeconst = TIMEC $@
- cmd_timeconst = $(PERL) $< $(CONFIG_HZ) > $@
+quiet_cmd_hzfile = HZFILE $@
+ cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@
+
+targets += hz.bc
+$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE
+ $(call if_changed,hzfile)
+
+quiet_cmd_bc = BC $@
+ cmd_bc = bc -q $(filter-out FORCE,$^) > $@
+
targets += timeconst.h
-$(obj)/timeconst.h: $(src)/timeconst.pl FORCE
- $(call if_changed,timeconst)
+$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE
+ $(call if_changed,bc)
ifeq ($(CONFIG_MODULE_SIG),y)
#
@@ -153,23 +159,7 @@ kernel/modsign_certificate.o: signing_key.x509 extra_certificates
# fail and that the kernel may be used afterwards.
#
###############################################################################
-sign_key_with_hash :=
-ifeq ($(CONFIG_MODULE_SIG_SHA1),y)
-sign_key_with_hash := -sha1
-endif
-ifeq ($(CONFIG_MODULE_SIG_SHA224),y)
-sign_key_with_hash := -sha224
-endif
-ifeq ($(CONFIG_MODULE_SIG_SHA256),y)
-sign_key_with_hash := -sha256
-endif
-ifeq ($(CONFIG_MODULE_SIG_SHA384),y)
-sign_key_with_hash := -sha384
-endif
-ifeq ($(CONFIG_MODULE_SIG_SHA512),y)
-sign_key_with_hash := -sha512
-endif
-ifeq ($(sign_key_with_hash),)
+ifndef CONFIG_MODULE_SIG_HASH
$(error Could not determine digest type to use from kernel config)
endif
@@ -182,8 +172,8 @@ signing_key.priv signing_key.x509: x509.genkey
@echo "### needs to be run as root, and uses a hardware random"
@echo "### number generator if one is available."
@echo "###"
- openssl req -new -nodes -utf8 $(sign_key_with_hash) -days 36500 -batch \
- -x509 -config x509.genkey \
+ openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \
+ -batch -x509 -config x509.genkey \
-outform DER -out signing_key.x509 \
-keyout signing_key.priv
@echo "###"
diff --git a/kernel/acct.c b/kernel/acct.c
index e8b1627..b9bd7f0 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -205,7 +205,7 @@ static int acct_on(struct filename *pathname)
if (IS_ERR(file))
return PTR_ERR(file);
- if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) {
+ if (!S_ISREG(file_inode(file)->i_mode)) {
filp_close(file, NULL);
return -EACCES;
}
diff --git a/kernel/capability.c b/kernel/capability.c
index 493d972..f6c2ce5 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -393,6 +393,30 @@ bool ns_capable(struct user_namespace *ns, int cap)
EXPORT_SYMBOL(ns_capable);
/**
+ * file_ns_capable - Determine if the file's opener had a capability in effect
+ * @file: The file we want to check
+ * @ns: The usernamespace we want the capability in
+ * @cap: The capability to be tested for
+ *
+ * Return true if task that opened the file had a capability in effect
+ * when the file was opened.
+ *
+ * This does not set PF_SUPERPRIV because the caller may not
+ * actually be privileged.
+ */
+bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap)
+{
+ if (WARN_ON_ONCE(!cap_valid(cap)))
+ return false;
+
+ if (security_capable(file->f_cred, ns, cap) == 0)
+ return true;
+
+ return false;
+}
+EXPORT_SYMBOL(file_ns_capable);
+
+/**
* capable - Determine if the current task has a superior capability in effect
* @cap: The capability to be tested for
*
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4855892..a32f943 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -52,7 +52,7 @@
#include <linux/module.h>
#include <linux/delayacct.h>
#include <linux/cgroupstats.h>
-#include <linux/hash.h>
+#include <linux/hashtable.h>
#include <linux/namei.h>
#include <linux/pid_namespace.h>
#include <linux/idr.h>
@@ -376,22 +376,18 @@ static int css_set_count;
* account cgroups in empty hierarchies.
*/
#define CSS_SET_HASH_BITS 7
-#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS)
-static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
+static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
-static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
+static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
{
int i;
- int index;
- unsigned long tmp = 0UL;
+ unsigned long key = 0UL;
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
- tmp += (unsigned long)css[i];
- tmp = (tmp >> 16) ^ tmp;
+ key += (unsigned long)css[i];
+ key = (key >> 16) ^ key;
- index = hash_long(tmp, CSS_SET_HASH_BITS);
-
- return &css_set_table[index];
+ return key;
}
/* We don't maintain the lists running through each css_set to its
@@ -418,7 +414,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
}
/* This css_set is dead. unlink it and release cgroup refcounts */
- hlist_del(&cg->hlist);
+ hash_del(&cg->hlist);
css_set_count--;
list_for_each_entry_safe(link, saved_link, &cg->cg_links,
@@ -426,12 +422,20 @@ static void __put_css_set(struct css_set *cg, int taskexit)
struct cgroup *cgrp = link->cgrp;
list_del(&link->cg_link_list);
list_del(&link->cgrp_link_list);
+
+ /*
+ * We may not be holding cgroup_mutex, and if cgrp->count is
+ * dropped to 0 the cgroup can be destroyed at any time, hence
+ * rcu_read_lock is used to keep it alive.
+ */
+ rcu_read_lock();
if (atomic_dec_and_test(&cgrp->count) &&
notify_on_release(cgrp)) {
if (taskexit)
set_bit(CGRP_RELEASABLE, &cgrp->flags);
check_for_release(cgrp);
}
+ rcu_read_unlock();
kfree(link);
}
@@ -550,9 +554,8 @@ static struct css_set *find_existing_css_set(
{
int i;
struct cgroupfs_root *root = cgrp->root;
- struct hlist_head *hhead;
- struct hlist_node *node;
struct css_set *cg;
+ unsigned long key;
/*
* Build the set of subsystem state objects that we want to see in the
@@ -572,8 +575,8 @@ static struct css_set *find_existing_css_set(
}
}
- hhead = css_set_hash(template);
- hlist_for_each_entry(cg, node, hhead, hlist) {
+ key = css_set_hash(template);
+ hash_for_each_possible(css_set_table, cg, hlist, key) {
if (!compare_css_sets(cg, oldcg, cgrp, template))
continue;
@@ -657,8 +660,8 @@ static struct css_set *find_css_set(
struct list_head tmp_cg_links;
- struct hlist_head *hhead;
struct cg_cgroup_link *link;
+ unsigned long key;
/* First see if we already have a cgroup group that matches
* the desired set */
@@ -704,8 +707,8 @@ static struct css_set *find_css_set(
css_set_count++;
/* Add this cgroup group to the hash table */
- hhead = css_set_hash(res->subsys);
- hlist_add_head(&res->hlist, hhead);
+ key = css_set_hash(res->subsys);
+ hash_add(css_set_table, &res->hlist, key);
write_unlock(&css_set_lock);
@@ -856,47 +859,54 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
return inode;
}
-static void cgroup_diput(struct dentry *dentry, struct inode *inode)
+static void cgroup_free_fn(struct work_struct *work)
{
- /* is dentry a directory ? if so, kfree() associated cgroup */
- if (S_ISDIR(inode->i_mode)) {
- struct cgroup *cgrp = dentry->d_fsdata;
- struct cgroup_subsys *ss;
- BUG_ON(!(cgroup_is_removed(cgrp)));
- /* It's possible for external users to be holding css
- * reference counts on a cgroup; css_put() needs to
- * be able to access the cgroup after decrementing
- * the reference count in order to know if it needs to
- * queue the cgroup to be handled by the release
- * agent */
- synchronize_rcu();
+ struct cgroup *cgrp = container_of(work, struct cgroup, free_work);
+ struct cgroup_subsys *ss;
- mutex_lock(&cgroup_mutex);
- /*
- * Release the subsystem state objects.
- */
- for_each_subsys(cgrp->root, ss)
- ss->css_free(cgrp);
+ mutex_lock(&cgroup_mutex);
+ /*
+ * Release the subsystem state objects.
+ */
+ for_each_subsys(cgrp->root, ss)
+ ss->css_free(cgrp);
- cgrp->root->number_of_cgroups--;
- mutex_unlock(&cgroup_mutex);
+ cgrp->root->number_of_cgroups--;
+ mutex_unlock(&cgroup_mutex);
- /*
- * Drop the active superblock reference that we took when we
- * created the cgroup
- */
- deactivate_super(cgrp->root->sb);
+ /*
+ * Drop the active superblock reference that we took when we
+ * created the cgroup
+ */
+ deactivate_super(cgrp->root->sb);
- /*
- * if we're getting rid of the cgroup, refcount should ensure
- * that there are no pidlists left.
- */
- BUG_ON(!list_empty(&cgrp->pidlists));
+ /*
+ * if we're getting rid of the cgroup, refcount should ensure
+ * that there are no pidlists left.
+ */
+ BUG_ON(!list_empty(&cgrp->pidlists));
+
+ simple_xattrs_free(&cgrp->xattrs);
+
+ ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
+ kfree(cgrp);
+}
- simple_xattrs_free(&cgrp->xattrs);
+static void cgroup_free_rcu(struct rcu_head *head)
+{
+ struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
+
+ schedule_work(&cgrp->free_work);
+}
+
+static void cgroup_diput(struct dentry *dentry, struct inode *inode)
+{
+ /* is dentry a directory ? if so, kfree() associated cgroup */
+ if (S_ISDIR(inode->i_mode)) {
+ struct cgroup *cgrp = dentry->d_fsdata;
- ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
- kfree_rcu(cgrp, rcu_head);
+ BUG_ON(!(cgroup_is_removed(cgrp)));
+ call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
} else {
struct cfent *cfe = __d_cfe(dentry);
struct cgroup *cgrp = dentry->d_parent->d_fsdata;
@@ -925,13 +935,17 @@ static void remove_dir(struct dentry *d)
dput(parent);
}
-static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
+static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
{
struct cfent *cfe;
lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
lockdep_assert_held(&cgroup_mutex);
+ /*
+ * If we're doing cleanup due to failure of cgroup_create(),
+ * the corresponding @cfe may not exist.
+ */
list_for_each_entry(cfe, &cgrp->files, node) {
struct dentry *d = cfe->dentry;
@@ -944,9 +958,8 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
list_del_init(&cfe->node);
dput(d);
- return 0;
+ break;
}
- return -ENOENT;
}
/**
@@ -1083,7 +1096,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,
}
}
root->subsys_mask = root->actual_subsys_mask = final_subsys_mask;
- synchronize_rcu();
return 0;
}
@@ -1393,6 +1405,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_LIST_HEAD(&cgrp->allcg_node);
INIT_LIST_HEAD(&cgrp->release_list);
INIT_LIST_HEAD(&cgrp->pidlists);
+ INIT_WORK(&cgrp->free_work, cgroup_free_fn);
mutex_init(&cgrp->pidlist_mutex);
INIT_LIST_HEAD(&cgrp->event_list);
spin_lock_init(&cgrp->event_list_lock);
@@ -1597,6 +1610,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
struct cgroupfs_root *existing_root;
const struct cred *cred;
int i;
+ struct css_set *cg;
BUG_ON(sb->s_root != NULL);
@@ -1650,14 +1664,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
/* Link the top cgroup in this hierarchy into all
* the css_set objects */
write_lock(&css_set_lock);
- for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
- struct hlist_head *hhead = &css_set_table[i];
- struct hlist_node *node;
- struct css_set *cg;
-
- hlist_for_each_entry(cg, node, hhead, hlist)
- link_css_set(&tmp_cg_links, cg, root_cgrp);
- }
+ hash_for_each(css_set_table, i, cg, hlist)
+ link_css_set(&tmp_cg_links, cg, root_cgrp);
write_unlock(&css_set_lock);
free_cg_links(&tmp_cg_links);
@@ -1773,7 +1781,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(),
"cgroup_path() called without proper locking");
- if (!dentry || cgrp == dummytop) {
+ if (cgrp == dummytop) {
/*
* Inactive subsystems have no dentry for their root
* cgroup
@@ -1982,7 +1990,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
ss->attach(cgrp, &tset);
}
- synchronize_rcu();
out:
if (retval) {
for_each_subsys(root, ss) {
@@ -2151,7 +2158,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
/*
* step 5: success! and cleanup
*/
- synchronize_rcu();
retval = 0;
out_put_css_set_refs:
if (retval) {
@@ -2637,7 +2643,7 @@ static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, un
*/
static inline struct cftype *__file_cft(struct file *file)
{
- if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations)
+ if (file_inode(file)->i_fop != &cgroup_file_operations)
return ERR_PTR(-EINVAL);
return __d_cft(file->f_dentry);
}
@@ -2769,14 +2775,14 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
continue;
- if (is_add)
+ if (is_add) {
err = cgroup_add_file(cgrp, subsys, cft);
- else
- err = cgroup_rm_file(cgrp, cft);
- if (err) {
- pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n",
- is_add ? "add" : "remove", cft->name, err);
+ if (err)
+ pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
+ cft->name, err);
ret = err;
+ } else {
+ cgroup_rm_file(cgrp, cft);
}
}
return ret;
@@ -3017,6 +3023,32 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
}
EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
+/**
+ * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup
+ * @pos: cgroup of interest
+ *
+ * Return the rightmost descendant of @pos. If there's no descendant,
+ * @pos is returned. This can be used during pre-order traversal to skip
+ * subtree of @pos.
+ */
+struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos)
+{
+ struct cgroup *last, *tmp;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ do {
+ last = pos;
+ /* ->prev isn't RCU safe, walk ->next till the end */
+ pos = NULL;
+ list_for_each_entry_rcu(tmp, &last->children, sibling)
+ pos = tmp;
+ } while (pos);
+
+ return last;
+}
+EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant);
+
static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
{
struct cgroup *last;
@@ -3752,8 +3784,13 @@ static void cgroup_event_remove(struct work_struct *work)
remove);
struct cgroup *cgrp = event->cgrp;
+ remove_wait_queue(event->wqh, &event->wait);
+
event->cft->unregister_event(cgrp, event->cft, event->eventfd);
+ /* Notify userspace the event is going away. */
+ eventfd_signal(event->eventfd, 1);
+
eventfd_ctx_put(event->eventfd);
kfree(event);
dput(cgrp->dentry);
@@ -3773,15 +3810,25 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
unsigned long flags = (unsigned long)key;
if (flags & POLLHUP) {
- __remove_wait_queue(event->wqh, &event->wait);
- spin_lock(&cgrp->event_list_lock);
- list_del_init(&event->list);
- spin_unlock(&cgrp->event_list_lock);
/*
- * We are in atomic context, but cgroup_event_remove() may
- * sleep, so we have to call it in workqueue.
+ * If the event has been detached at cgroup removal, we
+ * can simply return knowing the other side will cleanup
+ * for us.
+ *
+ * We can't race against event freeing since the other
+ * side will require wqh->lock via remove_wait_queue(),
+ * which we hold.
*/
- schedule_work(&event->remove);
+ spin_lock(&cgrp->event_list_lock);
+ if (!list_empty(&event->list)) {
+ list_del_init(&event->list);
+ /*
+ * We are in atomic context, but cgroup_event_remove()
+ * may sleep, so we have to call it in workqueue.
+ */
+ schedule_work(&event->remove);
+ }
+ spin_unlock(&cgrp->event_list_lock);
}
return 0;
@@ -3807,6 +3854,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
const char *buffer)
{
struct cgroup_event *event = NULL;
+ struct cgroup *cgrp_cfile;
unsigned int efd, cfd;
struct file *efile = NULL;
struct file *cfile = NULL;
@@ -3852,7 +3900,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
/* the process need read permission on control file */
/* AV: shouldn't we check that it's been opened for read instead? */
- ret = inode_permission(cfile->f_path.dentry->d_inode, MAY_READ);
+ ret = inode_permission(file_inode(cfile), MAY_READ);
if (ret < 0)
goto fail;
@@ -3862,6 +3910,16 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
goto fail;
}
+ /*
+ * The file to be monitored must be in the same cgroup as
+ * cgroup.event_control is.
+ */
+ cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent);
+ if (cgrp_cfile != cgrp) {
+ ret = -EINVAL;
+ goto fail;
+ }
+
if (!event->cft->register_event || !event->cft->unregister_event) {
ret = -EINVAL;
goto fail;
@@ -4135,6 +4193,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
init_cgroup_housekeeping(cgrp);
+ dentry->d_fsdata = cgrp;
+ cgrp->dentry = dentry;
+
cgrp->parent = parent;
cgrp->root = parent->root;
cgrp->top_cgroup = parent->top_cgroup;
@@ -4172,8 +4233,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
lockdep_assert_held(&dentry->d_inode->i_mutex);
/* allocation complete, commit to creation */
- dentry->d_fsdata = cgrp;
- cgrp->dentry = dentry;
list_add_tail(&cgrp->allcg_node, &root->allcg_list);
list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
root->number_of_cgroups++;
@@ -4340,20 +4399,14 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
/*
* Unregister events and notify userspace.
* Notify userspace about cgroup removing only after rmdir of cgroup
- * directory to avoid race between userspace and kernelspace. Use
- * a temporary list to avoid a deadlock with cgroup_event_wake(). Since
- * cgroup_event_wake() is called with the wait queue head locked,
- * remove_wait_queue() cannot be called while holding event_list_lock.
+ * directory to avoid race between userspace and kernelspace.
*/
spin_lock(&cgrp->event_list_lock);
- list_splice_init(&cgrp->event_list, &tmp_list);
- spin_unlock(&cgrp->event_list_lock);
- list_for_each_entry_safe(event, tmp, &tmp_list, list) {
+ list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
list_del_init(&event->list);
- remove_wait_queue(event->wqh, &event->wait);
- eventfd_signal(event->eventfd, 1);
schedule_work(&event->remove);
}
+ spin_unlock(&cgrp->event_list_lock);
return 0;
}
@@ -4438,6 +4491,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
{
struct cgroup_subsys_state *css;
int i, ret;
+ struct hlist_node *tmp;
+ struct css_set *cg;
+ unsigned long key;
/* check name and function validity */
if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
@@ -4503,23 +4559,17 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
* this is all done under the css_set_lock.
*/
write_lock(&css_set_lock);
- for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
- struct css_set *cg;
- struct hlist_node *node, *tmp;
- struct hlist_head *bucket = &css_set_table[i], *new_bucket;
-
- hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) {
- /* skip entries that we already rehashed */
- if (cg->subsys[ss->subsys_id])
- continue;
- /* remove existing entry */
- hlist_del(&cg->hlist);
- /* set new value */
- cg->subsys[ss->subsys_id] = css;
- /* recompute hash and restore entry */
- new_bucket = css_set_hash(cg->subsys);
- hlist_add_head(&cg->hlist, new_bucket);
- }
+ hash_for_each_safe(css_set_table, i, tmp, cg, hlist) {
+ /* skip entries that we already rehashed */
+ if (cg->subsys[ss->subsys_id])
+ continue;
+ /* remove existing entry */
+ hash_del(&cg->hlist);
+ /* set new value */
+ cg->subsys[ss->subsys_id] = css;
+ /* recompute hash and restore entry */
+ key = css_set_hash(cg->subsys);
+ hash_add(css_set_table, &cg->hlist, key);
}
write_unlock(&css_set_lock);
@@ -4551,7 +4601,6 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys);
void cgroup_unload_subsys(struct cgroup_subsys *ss)
{
struct cg_cgroup_link *link;
- struct hlist_head *hhead;
BUG_ON(ss->module == NULL);
@@ -4567,10 +4616,8 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
offline_css(ss, dummytop);
ss->active = 0;
- if (ss->use_id) {
- idr_remove_all(&ss->idr);
+ if (ss->use_id)
idr_destroy(&ss->idr);
- }
/* deassign the subsys_id */
subsys[ss->subsys_id] = NULL;
@@ -4585,11 +4632,12 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
write_lock(&css_set_lock);
list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
struct css_set *cg = link->cg;
+ unsigned long key;
- hlist_del(&cg->hlist);
+ hash_del(&cg->hlist);
cg->subsys[ss->subsys_id] = NULL;
- hhead = css_set_hash(cg->subsys);
- hlist_add_head(&cg->hlist, hhead);
+ key = css_set_hash(cg->subsys);
+ hash_add(css_set_table, &cg->hlist, key);
}
write_unlock(&css_set_lock);
@@ -4631,9 +4679,6 @@ int __init cgroup_init_early(void)
list_add(&init_css_set_link.cg_link_list,
&init_css_set.cg_links);
- for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
- INIT_HLIST_HEAD(&css_set_table[i]);
-
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
@@ -4667,7 +4712,7 @@ int __init cgroup_init(void)
{
int err;
int i;
- struct hlist_head *hhead;
+ unsigned long key;
err = bdi_init(&cgroup_backing_dev_info);
if (err)
@@ -4686,8 +4731,8 @@ int __init cgroup_init(void)
}
/* Add init_css_set to the hash table */
- hhead = css_set_hash(init_css_set.subsys);
- hlist_add_head(&init_css_set.hlist, hhead);
+ key = css_set_hash(init_css_set.subsys);
+ hash_add(css_set_table, &init_css_set.hlist, key);
BUG_ON(!init_root_id(&rootnode));
cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
@@ -4982,8 +5027,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
}
task_unlock(tsk);
- if (cg)
- put_css_set_taskexit(cg);
+ put_css_set_taskexit(cg);
}
/**
@@ -5274,7 +5318,7 @@ EXPORT_SYMBOL_GPL(free_css_id);
static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
{
struct css_id *newid;
- int myid, error, size;
+ int ret, size;
BUG_ON(!ss->use_id);
@@ -5282,35 +5326,24 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
newid = kzalloc(size, GFP_KERNEL);
if (!newid)
return ERR_PTR(-ENOMEM);
- /* get id */
- if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) {
- error = -ENOMEM;
- goto err_out;
- }
+
+ idr_preload(GFP_KERNEL);
spin_lock(&ss->id_lock);
/* Don't use 0. allocates an ID of 1-65535 */
- error = idr_get_new_above(&ss->idr, newid, 1, &myid);
+ ret = idr_alloc(&ss->idr, newid, 1, CSS_ID_MAX + 1, GFP_NOWAIT);
spin_unlock(&ss->id_lock);
+ idr_preload_end();
/* Returns error when there are no free spaces for new ID.*/
- if (error) {
- error = -ENOSPC;
+ if (ret < 0)
goto err_out;
- }
- if (myid > CSS_ID_MAX)
- goto remove_idr;
- newid->id = myid;
+ newid->id = ret;
newid->depth = depth;
return newid;
-remove_idr:
- error = -ENOSPC;
- spin_lock(&ss->id_lock);
- idr_remove(&ss->idr, myid);
- spin_unlock(&ss->id_lock);
err_out:
kfree(newid);
- return ERR_PTR(error);
+ return ERR_PTR(ret);
}
@@ -5441,7 +5474,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
struct inode *inode;
struct cgroup_subsys_state *css;
- inode = f->f_dentry->d_inode;
+ inode = file_inode(f);
/* check in cgroup filesystem dir */
if (inode->i_op != &cgroup_dir_inode_operations)
return ERR_PTR(-EBADF);
diff --git a/kernel/compat.c b/kernel/compat.c
index 36700e9..19971d8 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -290,8 +290,8 @@ static inline long put_compat_itimerval(struct compat_itimerval __user *o,
__put_user(i->it_value.tv_usec, &o->it_value.tv_usec)));
}
-asmlinkage long compat_sys_getitimer(int which,
- struct compat_itimerval __user *it)
+COMPAT_SYSCALL_DEFINE2(getitimer, int, which,
+ struct compat_itimerval __user *, it)
{
struct itimerval kit;
int error;
@@ -302,9 +302,9 @@ asmlinkage long compat_sys_getitimer(int which,
return error;
}
-asmlinkage long compat_sys_setitimer(int which,
- struct compat_itimerval __user *in,
- struct compat_itimerval __user *out)
+COMPAT_SYSCALL_DEFINE3(setitimer, int, which,
+ struct compat_itimerval __user *, in,
+ struct compat_itimerval __user *, out)
{
struct itimerval kin, kout;
int error;
@@ -381,9 +381,9 @@ static inline void compat_sig_setmask(sigset_t *blocked, compat_sigset_word set)
memcpy(blocked->sig, &set, sizeof(set));
}
-asmlinkage long compat_sys_sigprocmask(int how,
- compat_old_sigset_t __user *nset,
- compat_old_sigset_t __user *oset)
+COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how,
+ compat_old_sigset_t __user *, nset,
+ compat_old_sigset_t __user *, oset)
{
old_sigset_t old_set, new_set;
sigset_t new_blocked;
@@ -593,7 +593,7 @@ COMPAT_SYSCALL_DEFINE5(waitid,
else
ret = put_compat_rusage(&ru, uru);
if (ret)
- return ret;
+ return -EFAULT;
}
BUG_ON(info.si_code & __SI_MASK);
@@ -971,7 +971,7 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask,
}
void
-sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
+sigset_from_compat(sigset_t *set, const compat_sigset_t *compat)
{
switch (_NSIG_WORDS) {
case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 );
@@ -982,10 +982,20 @@ sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
}
EXPORT_SYMBOL_GPL(sigset_from_compat);
-asmlinkage long
-compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
- struct compat_siginfo __user *uinfo,
- struct compat_timespec __user *uts, compat_size_t sigsetsize)
+void
+sigset_to_compat(compat_sigset_t *compat, const sigset_t *set)
+{
+ switch (_NSIG_WORDS) {
+ case 4: compat->sig[7] = (set->sig[3] >> 32); compat->sig[6] = set->sig[3];
+ case 3: compat->sig[5] = (set->sig[2] >> 32); compat->sig[4] = set->sig[2];
+ case 2: compat->sig[3] = (set->sig[1] >> 32); compat->sig[2] = set->sig[1];
+ case 1: compat->sig[1] = (set->sig[0] >> 32); compat->sig[0] = set->sig[0];
+ }
+}
+
+COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
+ struct compat_siginfo __user *, uinfo,
+ struct compat_timespec __user *, uts, compat_size_t, sigsetsize)
{
compat_sigset_t s32;
sigset_t s;
@@ -1013,18 +1023,6 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
}
return ret;
-
-}
-
-asmlinkage long
-compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig,
- struct compat_siginfo __user *uinfo)
-{
- siginfo_t info;
-
- if (copy_siginfo_from_user32(&info, uinfo))
- return -EFAULT;
- return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
}
#ifdef __ARCH_WANT_COMPAT_SYS_TIME
@@ -1067,23 +1065,6 @@ asmlinkage long compat_sys_stime(compat_time_t __user *tptr)
#endif /* __ARCH_WANT_COMPAT_SYS_TIME */
-#ifdef __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND
-asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat_size_t sigsetsize)
-{
- sigset_t newset;
- compat_sigset_t newset32;
-
- /* XXX: Don't preclude handling different sized sigset_t's. */
- if (sigsetsize != sizeof(sigset_t))
- return -EINVAL;
-
- if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t)))
- return -EFAULT;
- sigset_from_compat(&newset, &newset32);
- return sigsuspend(&newset);
-}
-#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */
-
asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
{
struct timex txc;
@@ -1222,9 +1203,9 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info)
return 0;
}
-#ifdef __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL
-asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid,
- struct compat_timespec __user *interval)
+COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
+ compat_pid_t, pid,
+ struct compat_timespec __user *, interval)
{
struct timespec t;
int ret;
@@ -1237,7 +1218,6 @@ asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid,
return -EFAULT;
return ret;
}
-#endif /* __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL */
/*
* Allocate user-space memory for the duration of a single system call,
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 7bb63ee..4f9dfe4 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -61,14 +61,6 @@
#include <linux/cgroup.h>
/*
- * Workqueue for cpuset related tasks.
- *
- * Using kevent workqueue may cause deadlock when memory_migrate
- * is set. So we create a separate workqueue thread for cpuset.
- */
-static struct workqueue_struct *cpuset_wq;
-
-/*
* Tracks how many cpusets are currently defined in system.
* When there is only one cpuset (the root cpuset) we can
* short circuit some hooks.
@@ -95,18 +87,21 @@ struct cpuset {
cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
- struct cpuset *parent; /* my parent */
-
struct fmeter fmeter; /* memory_pressure filter */
+ /*
+ * Tasks are being attached to this cpuset. Used to prevent
+ * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
+ */
+ int attach_in_progress;
+
/* partition number for rebuild_sched_domains() */
int pn;
/* for custom sched domain */
int relax_domain_level;
- /* used for walking a cpuset hierarchy */
- struct list_head stack_list;
+ struct work_struct hotplug_work;
};
/* Retrieve the cpuset for a cgroup */
@@ -123,6 +118,15 @@ static inline struct cpuset *task_cs(struct task_struct *task)
struct cpuset, css);
}
+static inline struct cpuset *parent_cs(const struct cpuset *cs)
+{
+ struct cgroup *pcgrp = cs->css.cgroup->parent;
+
+ if (pcgrp)
+ return cgroup_cs(pcgrp);
+ return NULL;
+}
+
#ifdef CONFIG_NUMA
static inline bool task_has_mempolicy(struct task_struct *task)
{
@@ -138,6 +142,7 @@ static inline bool task_has_mempolicy(struct task_struct *task)
/* bits in struct cpuset flags field */
typedef enum {
+ CS_ONLINE,
CS_CPU_EXCLUSIVE,
CS_MEM_EXCLUSIVE,
CS_MEM_HARDWALL,
@@ -147,13 +152,12 @@ typedef enum {
CS_SPREAD_SLAB,
} cpuset_flagbits_t;
-/* the type of hotplug event */
-enum hotplug_event {
- CPUSET_CPU_OFFLINE,
- CPUSET_MEM_OFFLINE,
-};
-
/* convenient tests for these bits */
+static inline bool is_cpuset_online(const struct cpuset *cs)
+{
+ return test_bit(CS_ONLINE, &cs->flags);
+}
+
static inline int is_cpu_exclusive(const struct cpuset *cs)
{
return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
@@ -190,27 +194,52 @@ static inline int is_spread_slab(const struct cpuset *cs)
}
static struct cpuset top_cpuset = {
- .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
+ .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
+ (1 << CS_MEM_EXCLUSIVE)),
};
+/**
+ * cpuset_for_each_child - traverse online children of a cpuset
+ * @child_cs: loop cursor pointing to the current child
+ * @pos_cgrp: used for iteration
+ * @parent_cs: target cpuset to walk children of
+ *
+ * Walk @child_cs through the online children of @parent_cs. Must be used
+ * with RCU read locked.
+ */
+#define cpuset_for_each_child(child_cs, pos_cgrp, parent_cs) \
+ cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup) \
+ if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp)))))
+
+/**
+ * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
+ * @des_cs: loop cursor pointing to the current descendant
+ * @pos_cgrp: used for iteration
+ * @root_cs: target cpuset to walk ancestor of
+ *
+ * Walk @des_cs through the online descendants of @root_cs. Must be used
+ * with RCU read locked. The caller may modify @pos_cgrp by calling
+ * cgroup_rightmost_descendant() to skip subtree.
+ */
+#define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs) \
+ cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \
+ if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp)))))
+
/*
- * There are two global mutexes guarding cpuset structures. The first
- * is the main control groups cgroup_mutex, accessed via
- * cgroup_lock()/cgroup_unlock(). The second is the cpuset-specific
- * callback_mutex, below. They can nest. It is ok to first take
- * cgroup_mutex, then nest callback_mutex. We also require taking
- * task_lock() when dereferencing a task's cpuset pointer. See "The
- * task_lock() exception", at the end of this comment.
- *
- * A task must hold both mutexes to modify cpusets. If a task
- * holds cgroup_mutex, then it blocks others wanting that mutex,
- * ensuring that it is the only task able to also acquire callback_mutex
- * and be able to modify cpusets. It can perform various checks on
- * the cpuset structure first, knowing nothing will change. It can
- * also allocate memory while just holding cgroup_mutex. While it is
- * performing these checks, various callback routines can briefly
- * acquire callback_mutex to query cpusets. Once it is ready to make
- * the changes, it takes callback_mutex, blocking everyone else.
+ * There are two global mutexes guarding cpuset structures - cpuset_mutex
+ * and callback_mutex. The latter may nest inside the former. We also
+ * require taking task_lock() when dereferencing a task's cpuset pointer.
+ * See "The task_lock() exception", at the end of this comment.
+ *
+ * A task must hold both mutexes to modify cpusets. If a task holds
+ * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
+ * is the only task able to also acquire callback_mutex and be able to
+ * modify cpusets. It can perform various checks on the cpuset structure
+ * first, knowing nothing will change. It can also allocate memory while
+ * just holding cpuset_mutex. While it is performing these checks, various
+ * callback routines can briefly acquire callback_mutex to query cpusets.
+ * Once it is ready to make the changes, it takes callback_mutex, blocking
+ * everyone else.
*
* Calls to the kernel memory allocator can not be made while holding
* callback_mutex, as that would risk double tripping on callback_mutex
@@ -232,6 +261,7 @@ static struct cpuset top_cpuset = {
* guidelines for accessing subsystem state in kernel/cgroup.c
*/
+static DEFINE_MUTEX(cpuset_mutex);
static DEFINE_MUTEX(callback_mutex);
/*
@@ -246,6 +276,17 @@ static char cpuset_nodelist[CPUSET_NODELIST_LEN];
static DEFINE_SPINLOCK(cpuset_buffer_lock);
/*
+ * CPU / memory hotplug is handled asynchronously.
+ */
+static struct workqueue_struct *cpuset_propagate_hotplug_wq;
+
+static void cpuset_hotplug_workfn(struct work_struct *work);
+static void cpuset_propagate_hotplug_workfn(struct work_struct *work);
+static void schedule_cpuset_propagate_hotplug(struct cpuset *cs);
+
+static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
+
+/*
* This is ugly, but preserves the userspace API for existing cpuset
* users. If someone tries to mount the "cpuset" filesystem, we
* silently switch it to mount "cgroup" instead
@@ -289,7 +330,7 @@ static void guarantee_online_cpus(const struct cpuset *cs,
struct cpumask *pmask)
{
while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
- cs = cs->parent;
+ cs = parent_cs(cs);
if (cs)
cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
else
@@ -314,7 +355,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
{
while (cs && !nodes_intersects(cs->mems_allowed,
node_states[N_MEMORY]))
- cs = cs->parent;
+ cs = parent_cs(cs);
if (cs)
nodes_and(*pmask, cs->mems_allowed,
node_states[N_MEMORY]);
@@ -326,7 +367,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
/*
* update task's spread flag if cpuset's page/slab spread flag is set
*
- * Called with callback_mutex/cgroup_mutex held
+ * Called with callback_mutex/cpuset_mutex held
*/
static void cpuset_update_task_spread_flag(struct cpuset *cs,
struct task_struct *tsk)
@@ -346,7 +387,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs,
*
* One cpuset is a subset of another if all its allowed CPUs and
* Memory Nodes are a subset of the other, and its exclusive flags
- * are only set if the other's are set. Call holding cgroup_mutex.
+ * are only set if the other's are set. Call holding cpuset_mutex.
*/
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -395,7 +436,7 @@ static void free_trial_cpuset(struct cpuset *trial)
* If we replaced the flag and mask values of the current cpuset
* (cur) with those values in the trial cpuset (trial), would
* our various subset and exclusive rules still be valid? Presumes
- * cgroup_mutex held.
+ * cpuset_mutex held.
*
* 'cur' is the address of an actual, in-use cpuset. Operations
* such as list traversal that depend on the actual address of the
@@ -412,48 +453,58 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
{
struct cgroup *cont;
struct cpuset *c, *par;
+ int ret;
+
+ rcu_read_lock();
/* Each of our child cpusets must be a subset of us */
- list_for_each_entry(cont, &cur->css.cgroup->children, sibling) {
- if (!is_cpuset_subset(cgroup_cs(cont), trial))
- return -EBUSY;
- }
+ ret = -EBUSY;
+ cpuset_for_each_child(c, cont, cur)
+ if (!is_cpuset_subset(c, trial))
+ goto out;
/* Remaining checks don't apply to root cpuset */
+ ret = 0;
if (cur == &top_cpuset)
- return 0;
+ goto out;
- par = cur->parent;
+ par = parent_cs(cur);
/* We must be a subset of our parent cpuset */
+ ret = -EACCES;
if (!is_cpuset_subset(trial, par))
- return -EACCES;
+ goto out;
/*
* If either I or some sibling (!= me) is exclusive, we can't
* overlap
*/
- list_for_each_entry(cont, &par->css.cgroup->children, sibling) {
- c = cgroup_cs(cont);
+ ret = -EINVAL;
+ cpuset_for_each_child(c, cont, par) {
if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
c != cur &&
cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
- return -EINVAL;
+ goto out;
if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
c != cur &&
nodes_intersects(trial->mems_allowed, c->mems_allowed))
- return -EINVAL;
+ goto out;
}
- /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */
- if (cgroup_task_count(cur->css.cgroup)) {
- if (cpumask_empty(trial->cpus_allowed) ||
- nodes_empty(trial->mems_allowed)) {
- return -ENOSPC;
- }
- }
+ /*
+ * Cpusets with tasks - existing or newly being attached - can't
+ * have empty cpus_allowed or mems_allowed.
+ */
+ ret = -ENOSPC;
+ if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) &&
+ (cpumask_empty(trial->cpus_allowed) ||
+ nodes_empty(trial->mems_allowed)))
+ goto out;
- return 0;
+ ret = 0;
+out:
+ rcu_read_unlock();
+ return ret;
}
#ifdef CONFIG_SMP
@@ -474,31 +525,24 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
return;
}
-static void
-update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
+static void update_domain_attr_tree(struct sched_domain_attr *dattr,
+ struct cpuset *root_cs)
{
- LIST_HEAD(q);
-
- list_add(&c->stack_list, &q);
- while (!list_empty(&q)) {
- struct cpuset *cp;
- struct cgroup *cont;
- struct cpuset *child;
-
- cp = list_first_entry(&q, struct cpuset, stack_list);
- list_del(q.next);
+ struct cpuset *cp;
+ struct cgroup *pos_cgrp;
- if (cpumask_empty(cp->cpus_allowed))
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
+ /* skip the whole subtree if @cp doesn't have any CPU */
+ if (cpumask_empty(cp->cpus_allowed)) {
+ pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
continue;
+ }
if (is_sched_load_balance(cp))
update_domain_attr(dattr, cp);
-
- list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
- child = cgroup_cs(cont);
- list_add_tail(&child->stack_list, &q);
- }
}
+ rcu_read_unlock();
}
/*
@@ -520,7 +564,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
* domains when operating in the severe memory shortage situations
* that could cause allocation failures below.
*
- * Must be called with cgroup_lock held.
+ * Must be called with cpuset_mutex held.
*
* The three key local variables below are:
* q - a linked-list queue of cpuset pointers, used to implement a
@@ -558,7 +602,6 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
static int generate_sched_domains(cpumask_var_t **domains,
struct sched_domain_attr **attributes)
{
- LIST_HEAD(q); /* queue of cpusets to be scanned */
struct cpuset *cp; /* scans q */
struct cpuset **csa; /* array of all cpuset ptrs */
int csn; /* how many cpuset ptrs in csa so far */
@@ -567,6 +610,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
struct sched_domain_attr *dattr; /* attributes for custom domains */
int ndoms = 0; /* number of sched domains in result */
int nslot; /* next empty doms[] struct cpumask slot */
+ struct cgroup *pos_cgrp;
doms = NULL;
dattr = NULL;
@@ -594,33 +638,27 @@ static int generate_sched_domains(cpumask_var_t **domains,
goto done;
csn = 0;
- list_add(&top_cpuset.stack_list, &q);
- while (!list_empty(&q)) {
- struct cgroup *cont;
- struct cpuset *child; /* scans child cpusets of cp */
-
- cp = list_first_entry(&q, struct cpuset, stack_list);
- list_del(q.next);
-
- if (cpumask_empty(cp->cpus_allowed))
- continue;
-
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) {
/*
- * All child cpusets contain a subset of the parent's cpus, so
- * just skip them, and then we call update_domain_attr_tree()
- * to calc relax_domain_level of the corresponding sched
- * domain.
+ * Continue traversing beyond @cp iff @cp has some CPUs and
+ * isn't load balancing. The former is obvious. The
+ * latter: All child cpusets contain a subset of the
+ * parent's cpus, so just skip them, and then we call
+ * update_domain_attr_tree() to calc relax_domain_level of
+ * the corresponding sched domain.
*/
- if (is_sched_load_balance(cp)) {
- csa[csn++] = cp;
+ if (!cpumask_empty(cp->cpus_allowed) &&
+ !is_sched_load_balance(cp))
continue;
- }
- list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
- child = cgroup_cs(cont);
- list_add_tail(&child->stack_list, &q);
- }
- }
+ if (is_sched_load_balance(cp))
+ csa[csn++] = cp;
+
+ /* skip @cp's subtree */
+ pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
+ }
+ rcu_read_unlock();
for (i = 0; i < csn; i++)
csa[i]->pn = i;
@@ -725,25 +763,25 @@ done:
/*
* Rebuild scheduler domains.
*
- * Call with neither cgroup_mutex held nor within get_online_cpus().
- * Takes both cgroup_mutex and get_online_cpus().
+ * If the flag 'sched_load_balance' of any cpuset with non-empty
+ * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
+ * which has that flag enabled, or if any cpuset with a non-empty
+ * 'cpus' is removed, then call this routine to rebuild the
+ * scheduler's dynamic sched domains.
*
- * Cannot be directly called from cpuset code handling changes
- * to the cpuset pseudo-filesystem, because it cannot be called
- * from code that already holds cgroup_mutex.
+ * Call with cpuset_mutex held. Takes get_online_cpus().
*/
-static void do_rebuild_sched_domains(struct work_struct *unused)
+static void rebuild_sched_domains_locked(void)
{
struct sched_domain_attr *attr;
cpumask_var_t *doms;
int ndoms;
+ lockdep_assert_held(&cpuset_mutex);
get_online_cpus();
/* Generate domain masks and attrs */
- cgroup_lock();
ndoms = generate_sched_domains(&doms, &attr);
- cgroup_unlock();
/* Have scheduler rebuild the domains */
partition_sched_domains(ndoms, doms, attr);
@@ -751,7 +789,7 @@ static void do_rebuild_sched_domains(struct work_struct *unused)
put_online_cpus();
}
#else /* !CONFIG_SMP */
-static void do_rebuild_sched_domains(struct work_struct *unused)
+static void rebuild_sched_domains_locked(void)
{
}
@@ -763,44 +801,11 @@ static int generate_sched_domains(cpumask_var_t **domains,
}
#endif /* CONFIG_SMP */
-static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
-
-/*
- * Rebuild scheduler domains, asynchronously via workqueue.
- *
- * If the flag 'sched_load_balance' of any cpuset with non-empty
- * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
- * which has that flag enabled, or if any cpuset with a non-empty
- * 'cpus' is removed, then call this routine to rebuild the
- * scheduler's dynamic sched domains.
- *
- * The rebuild_sched_domains() and partition_sched_domains()
- * routines must nest cgroup_lock() inside get_online_cpus(),
- * but such cpuset changes as these must nest that locking the
- * other way, holding cgroup_lock() for much of the code.
- *
- * So in order to avoid an ABBA deadlock, the cpuset code handling
- * these user changes delegates the actual sched domain rebuilding
- * to a separate workqueue thread, which ends up processing the
- * above do_rebuild_sched_domains() function.
- */
-static void async_rebuild_sched_domains(void)
-{
- queue_work(cpuset_wq, &rebuild_sched_domains_work);
-}
-
-/*
- * Accomplishes the same scheduler domain rebuild as the above
- * async_rebuild_sched_domains(), however it directly calls the
- * rebuild routine synchronously rather than calling it via an
- * asynchronous work thread.
- *
- * This can only be called from code that is not holding
- * cgroup_mutex (not nested in a cgroup_lock() call.)
- */
void rebuild_sched_domains(void)
{
- do_rebuild_sched_domains(NULL);
+ mutex_lock(&cpuset_mutex);
+ rebuild_sched_domains_locked();
+ mutex_unlock(&cpuset_mutex);
}
/**
@@ -808,7 +813,7 @@ void rebuild_sched_domains(void)
* @tsk: task to test
* @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
*
- * Call with cgroup_mutex held. May take callback_mutex during call.
+ * Call with cpuset_mutex held. May take callback_mutex during call.
* Called for each task in a cgroup by cgroup_scan_tasks().
* Return nonzero if this tasks's cpus_allowed mask should be changed (in other
* words, if its mask is not equal to its cpuset's mask).
@@ -829,7 +834,7 @@ static int cpuset_test_cpumask(struct task_struct *tsk,
* cpus_allowed mask needs to be changed.
*
* We don't need to re-check for the cgroup/cpuset membership, since we're
- * holding cgroup_lock() at this point.
+ * holding cpuset_mutex at this point.
*/
static void cpuset_change_cpumask(struct task_struct *tsk,
struct cgroup_scanner *scan)
@@ -842,7 +847,7 @@ static void cpuset_change_cpumask(struct task_struct *tsk,
* @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
* @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
*
- * Called with cgroup_mutex held
+ * Called with cpuset_mutex held
*
* The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
* calling callback functions for each.
@@ -920,7 +925,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
heap_free(&heap);
if (is_load_balanced)
- async_rebuild_sched_domains();
+ rebuild_sched_domains_locked();
return 0;
}
@@ -932,7 +937,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
* Temporarilly set tasks mems_allowed to target nodes of migration,
* so that the migration code can allocate pages on these nodes.
*
- * Call holding cgroup_mutex, so current's cpuset won't change
+ * Call holding cpuset_mutex, so current's cpuset won't change
* during this call, as manage_mutex holds off any cpuset_attach()
* calls. Therefore we don't need to take task_lock around the
* call to guarantee_online_mems(), as we know no one is changing
@@ -1007,7 +1012,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
/*
* Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
* of it to cpuset's new mems_allowed, and migrate pages to new nodes if
- * memory_migrate flag is set. Called with cgroup_mutex held.
+ * memory_migrate flag is set. Called with cpuset_mutex held.
*/
static void cpuset_change_nodemask(struct task_struct *p,
struct cgroup_scanner *scan)
@@ -1016,7 +1021,7 @@ static void cpuset_change_nodemask(struct task_struct *p,
struct cpuset *cs;
int migrate;
const nodemask_t *oldmem = scan->data;
- static nodemask_t newmems; /* protected by cgroup_mutex */
+ static nodemask_t newmems; /* protected by cpuset_mutex */
cs = cgroup_cs(scan->cg);
guarantee_online_mems(cs, &newmems);
@@ -1043,7 +1048,7 @@ static void *cpuset_being_rebound;
* @oldmem: old mems_allowed of cpuset cs
* @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
*
- * Called with cgroup_mutex held
+ * Called with cpuset_mutex held
* No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
* if @heap != NULL.
*/
@@ -1065,7 +1070,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
* take while holding tasklist_lock. Forks can happen - the
* mpol_dup() cpuset_being_rebound check will catch such forks,
* and rebind their vma mempolicies too. Because we still hold
- * the global cgroup_mutex, we know that no other rebind effort
+ * the global cpuset_mutex, we know that no other rebind effort
* will be contending for the global variable cpuset_being_rebound.
* It's ok if we rebind the same mm twice; mpol_rebind_mm()
* is idempotent. Also migrate pages in each mm to new nodes.
@@ -1084,7 +1089,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
* mempolicies and if the cpuset is marked 'memory_migrate',
* migrate the tasks pages to the new memory.
*
- * Call with cgroup_mutex held. May take callback_mutex during call.
+ * Call with cpuset_mutex held. May take callback_mutex during call.
* Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
* lock each such tasks mm->mmap_sem, scan its vma's and rebind
* their mempolicies to the cpusets new mems_allowed.
@@ -1168,7 +1173,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
cs->relax_domain_level = val;
if (!cpumask_empty(cs->cpus_allowed) &&
is_sched_load_balance(cs))
- async_rebuild_sched_domains();
+ rebuild_sched_domains_locked();
}
return 0;
@@ -1182,7 +1187,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
* Called by cgroup_scan_tasks() for each task in a cgroup.
*
* We don't need to re-check for the cgroup/cpuset membership, since we're
- * holding cgroup_lock() at this point.
+ * holding cpuset_mutex at this point.
*/
static void cpuset_change_flag(struct task_struct *tsk,
struct cgroup_scanner *scan)
@@ -1195,7 +1200,7 @@ static void cpuset_change_flag(struct task_struct *tsk,
* @cs: the cpuset in which each task's spread flags needs to be changed
* @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
*
- * Called with cgroup_mutex held
+ * Called with cpuset_mutex held
*
* The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
* calling callback functions for each.
@@ -1220,7 +1225,7 @@ static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
* cs: the cpuset to update
* turning_on: whether the flag is being set or cleared
*
- * Call with cgroup_mutex held.
+ * Call with cpuset_mutex held.
*/
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
@@ -1260,7 +1265,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
mutex_unlock(&callback_mutex);
if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
- async_rebuild_sched_domains();
+ rebuild_sched_domains_locked();
if (spread_flag_changed)
update_tasks_flags(cs, &heap);
@@ -1368,24 +1373,18 @@ static int fmeter_getrate(struct fmeter *fmp)
return val;
}
-/*
- * Protected by cgroup_lock. The nodemasks must be stored globally because
- * dynamically allocating them is not allowed in can_attach, and they must
- * persist until attach.
- */
-static cpumask_var_t cpus_attach;
-static nodemask_t cpuset_attach_nodemask_from;
-static nodemask_t cpuset_attach_nodemask_to;
-
-/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
+/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
{
struct cpuset *cs = cgroup_cs(cgrp);
struct task_struct *task;
int ret;
+ mutex_lock(&cpuset_mutex);
+
+ ret = -ENOSPC;
if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
- return -ENOSPC;
+ goto out_unlock;
cgroup_taskset_for_each(task, cgrp, tset) {
/*
@@ -1397,25 +1396,45 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
* set_cpus_allowed_ptr() on all attached tasks before
* cpus_allowed may be changed.
*/
+ ret = -EINVAL;
if (task->flags & PF_THREAD_BOUND)
- return -EINVAL;
- if ((ret = security_task_setscheduler(task)))
- return ret;
+ goto out_unlock;
+ ret = security_task_setscheduler(task);
+ if (ret)
+ goto out_unlock;
}
- /* prepare for attach */
- if (cs == &top_cpuset)
- cpumask_copy(cpus_attach, cpu_possible_mask);
- else
- guarantee_online_cpus(cs, cpus_attach);
-
- guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
+ /*
+ * Mark attach is in progress. This makes validate_change() fail
+ * changes which zero cpus/mems_allowed.
+ */
+ cs->attach_in_progress++;
+ ret = 0;
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
+ return ret;
+}
- return 0;
+static void cpuset_cancel_attach(struct cgroup *cgrp,
+ struct cgroup_taskset *tset)
+{
+ mutex_lock(&cpuset_mutex);
+ cgroup_cs(cgrp)->attach_in_progress--;
+ mutex_unlock(&cpuset_mutex);
}
+/*
+ * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach()
+ * but we can't allocate it dynamically there. Define it global and
+ * allocate from cpuset_init().
+ */
+static cpumask_var_t cpus_attach;
+
static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
{
+ /* static bufs protected by cpuset_mutex */
+ static nodemask_t cpuset_attach_nodemask_from;
+ static nodemask_t cpuset_attach_nodemask_to;
struct mm_struct *mm;
struct task_struct *task;
struct task_struct *leader = cgroup_taskset_first(tset);
@@ -1423,6 +1442,16 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
struct cpuset *cs = cgroup_cs(cgrp);
struct cpuset *oldcs = cgroup_cs(oldcgrp);
+ mutex_lock(&cpuset_mutex);
+
+ /* prepare for attach */
+ if (cs == &top_cpuset)
+ cpumask_copy(cpus_attach, cpu_possible_mask);
+ else
+ guarantee_online_cpus(cs, cpus_attach);
+
+ guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
+
cgroup_taskset_for_each(task, cgrp, tset) {
/*
* can_attach beforehand should guarantee that this doesn't
@@ -1448,6 +1477,18 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
&cpuset_attach_nodemask_to);
mmput(mm);
}
+
+ cs->attach_in_progress--;
+
+ /*
+ * We may have raced with CPU/memory hotunplug. Trigger hotplug
+ * propagation if @cs doesn't have any CPU or memory. It will move
+ * the newly added tasks to the nearest parent which can execute.
+ */
+ if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
+ schedule_cpuset_propagate_hotplug(cs);
+
+ mutex_unlock(&cpuset_mutex);
}
/* The various types of files and directories in a cpuset file system */
@@ -1469,12 +1510,13 @@ typedef enum {
static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
{
- int retval = 0;
struct cpuset *cs = cgroup_cs(cgrp);
cpuset_filetype_t type = cft->private;
+ int retval = -ENODEV;
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
+ mutex_lock(&cpuset_mutex);
+ if (!is_cpuset_online(cs))
+ goto out_unlock;
switch (type) {
case FILE_CPU_EXCLUSIVE:
@@ -1508,18 +1550,20 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
retval = -EINVAL;
break;
}
- cgroup_unlock();
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
return retval;
}
static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
{
- int retval = 0;
struct cpuset *cs = cgroup_cs(cgrp);
cpuset_filetype_t type = cft->private;
+ int retval = -ENODEV;
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
+ mutex_lock(&cpuset_mutex);
+ if (!is_cpuset_online(cs))
+ goto out_unlock;
switch (type) {
case FILE_SCHED_RELAX_DOMAIN_LEVEL:
@@ -1529,7 +1573,8 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
retval = -EINVAL;
break;
}
- cgroup_unlock();
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
return retval;
}
@@ -1539,17 +1584,36 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
const char *buf)
{
- int retval = 0;
struct cpuset *cs = cgroup_cs(cgrp);
struct cpuset *trialcs;
+ int retval = -ENODEV;
+
+ /*
+ * CPU or memory hotunplug may leave @cs w/o any execution
+ * resources, in which case the hotplug code asynchronously updates
+ * configuration and transfers all tasks to the nearest ancestor
+ * which can execute.
+ *
+ * As writes to "cpus" or "mems" may restore @cs's execution
+ * resources, wait for the previously scheduled operations before
+ * proceeding, so that we don't end up keep removing tasks added
+ * after execution capability is restored.
+ *
+ * Flushing cpuset_hotplug_work is enough to synchronize against
+ * hotplug hanlding; however, cpuset_attach() may schedule
+ * propagation work directly. Flush the workqueue too.
+ */
+ flush_work(&cpuset_hotplug_work);
+ flush_workqueue(cpuset_propagate_hotplug_wq);
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
+ mutex_lock(&cpuset_mutex);
+ if (!is_cpuset_online(cs))
+ goto out_unlock;
trialcs = alloc_trial_cpuset(cs);
if (!trialcs) {
retval = -ENOMEM;
- goto out;
+ goto out_unlock;
}
switch (cft->private) {
@@ -1565,8 +1629,8 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
}
free_trial_cpuset(trialcs);
-out:
- cgroup_unlock();
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
return retval;
}
@@ -1790,15 +1854,12 @@ static struct cftype files[] = {
static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
{
- struct cgroup *parent_cg = cont->parent;
- struct cgroup *tmp_cg;
- struct cpuset *parent, *cs;
+ struct cpuset *cs;
- if (!parent_cg)
+ if (!cont->parent)
return &top_cpuset.css;
- parent = cgroup_cs(parent_cg);
- cs = kmalloc(sizeof(*cs), GFP_KERNEL);
+ cs = kzalloc(sizeof(*cs), GFP_KERNEL);
if (!cs)
return ERR_PTR(-ENOMEM);
if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
@@ -1806,22 +1867,38 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
return ERR_PTR(-ENOMEM);
}
- cs->flags = 0;
- if (is_spread_page(parent))
- set_bit(CS_SPREAD_PAGE, &cs->flags);
- if (is_spread_slab(parent))
- set_bit(CS_SPREAD_SLAB, &cs->flags);
set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
cpumask_clear(cs->cpus_allowed);
nodes_clear(cs->mems_allowed);
fmeter_init(&cs->fmeter);
+ INIT_WORK(&cs->hotplug_work, cpuset_propagate_hotplug_workfn);
cs->relax_domain_level = -1;
- cs->parent = parent;
+ return &cs->css;
+}
+
+static int cpuset_css_online(struct cgroup *cgrp)
+{
+ struct cpuset *cs = cgroup_cs(cgrp);
+ struct cpuset *parent = parent_cs(cs);
+ struct cpuset *tmp_cs;
+ struct cgroup *pos_cg;
+
+ if (!parent)
+ return 0;
+
+ mutex_lock(&cpuset_mutex);
+
+ set_bit(CS_ONLINE, &cs->flags);
+ if (is_spread_page(parent))
+ set_bit(CS_SPREAD_PAGE, &cs->flags);
+ if (is_spread_slab(parent))
+ set_bit(CS_SPREAD_SLAB, &cs->flags);
+
number_of_cpusets++;
- if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cont->flags))
- goto skip_clone;
+ if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags))
+ goto out_unlock;
/*
* Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
@@ -1836,35 +1913,49 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
* changed to grant parent->cpus_allowed-sibling_cpus_exclusive
* (and likewise for mems) to the new cgroup.
*/
- list_for_each_entry(tmp_cg, &parent_cg->children, sibling) {
- struct cpuset *tmp_cs = cgroup_cs(tmp_cg);
-
- if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs))
- goto skip_clone;
+ rcu_read_lock();
+ cpuset_for_each_child(tmp_cs, pos_cg, parent) {
+ if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
+ rcu_read_unlock();
+ goto out_unlock;
+ }
}
+ rcu_read_unlock();
mutex_lock(&callback_mutex);
cs->mems_allowed = parent->mems_allowed;
cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
mutex_unlock(&callback_mutex);
-skip_clone:
- return &cs->css;
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
+ return 0;
+}
+
+static void cpuset_css_offline(struct cgroup *cgrp)
+{
+ struct cpuset *cs = cgroup_cs(cgrp);
+
+ mutex_lock(&cpuset_mutex);
+
+ if (is_sched_load_balance(cs))
+ update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
+
+ number_of_cpusets--;
+ clear_bit(CS_ONLINE, &cs->flags);
+
+ mutex_unlock(&cpuset_mutex);
}
/*
* If the cpuset being removed has its flag 'sched_load_balance'
* enabled, then simulate turning sched_load_balance off, which
- * will call async_rebuild_sched_domains().
+ * will call rebuild_sched_domains_locked().
*/
static void cpuset_css_free(struct cgroup *cont)
{
struct cpuset *cs = cgroup_cs(cont);
- if (is_sched_load_balance(cs))
- update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
-
- number_of_cpusets--;
free_cpumask_var(cs->cpus_allowed);
kfree(cs);
}
@@ -1872,8 +1963,11 @@ static void cpuset_css_free(struct cgroup *cont)
struct cgroup_subsys cpuset_subsys = {
.name = "cpuset",
.css_alloc = cpuset_css_alloc,
+ .css_online = cpuset_css_online,
+ .css_offline = cpuset_css_offline,
.css_free = cpuset_css_free,
.can_attach = cpuset_can_attach,
+ .cancel_attach = cpuset_cancel_attach,
.attach = cpuset_attach,
.subsys_id = cpuset_subsys_id,
.base_cftypes = files,
@@ -1924,7 +2018,9 @@ static void cpuset_do_move_task(struct task_struct *tsk,
{
struct cgroup *new_cgroup = scan->data;
+ cgroup_lock();
cgroup_attach_task(new_cgroup, tsk);
+ cgroup_unlock();
}
/**
@@ -1932,7 +2028,7 @@ static void cpuset_do_move_task(struct task_struct *tsk,
* @from: cpuset in which the tasks currently reside
* @to: cpuset to which the tasks will be moved
*
- * Called with cgroup_mutex held
+ * Called with cpuset_mutex held
* callback_mutex must not be held, as cpuset_attach() will take it.
*
* The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
@@ -1959,169 +2055,200 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
* removing that CPU or node from all cpusets. If this removes the
* last CPU or node from a cpuset, then move the tasks in the empty
* cpuset to its next-highest non-empty parent.
- *
- * Called with cgroup_mutex held
- * callback_mutex must not be held, as cpuset_attach() will take it.
*/
static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
{
struct cpuset *parent;
/*
- * The cgroup's css_sets list is in use if there are tasks
- * in the cpuset; the list is empty if there are none;
- * the cs->css.refcnt seems always 0.
- */
- if (list_empty(&cs->css.cgroup->css_sets))
- return;
-
- /*
* Find its next-highest non-empty parent, (top cpuset
* has online cpus, so can't be empty).
*/
- parent = cs->parent;
+ parent = parent_cs(cs);
while (cpumask_empty(parent->cpus_allowed) ||
nodes_empty(parent->mems_allowed))
- parent = parent->parent;
+ parent = parent_cs(parent);
move_member_tasks_to_cpuset(cs, parent);
}
-/*
- * Helper function to traverse cpusets.
- * It can be used to walk the cpuset tree from top to bottom, completing
- * one layer before dropping down to the next (thus always processing a
- * node before any of its children).
+/**
+ * cpuset_propagate_hotplug_workfn - propagate CPU/memory hotplug to a cpuset
+ * @cs: cpuset in interest
+ *
+ * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
+ * offline, update @cs accordingly. If @cs ends up with no CPU or memory,
+ * all its tasks are moved to the nearest ancestor with both resources.
*/
-static struct cpuset *cpuset_next(struct list_head *queue)
+static void cpuset_propagate_hotplug_workfn(struct work_struct *work)
{
- struct cpuset *cp;
- struct cpuset *child; /* scans child cpusets of cp */
- struct cgroup *cont;
+ static cpumask_t off_cpus;
+ static nodemask_t off_mems, tmp_mems;
+ struct cpuset *cs = container_of(work, struct cpuset, hotplug_work);
+ bool is_empty;
- if (list_empty(queue))
- return NULL;
+ mutex_lock(&cpuset_mutex);
+
+ cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
+ nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
- cp = list_first_entry(queue, struct cpuset, stack_list);
- list_del(queue->next);
- list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
- child = cgroup_cs(cont);
- list_add_tail(&child->stack_list, queue);
+ /* remove offline cpus from @cs */
+ if (!cpumask_empty(&off_cpus)) {
+ mutex_lock(&callback_mutex);
+ cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
+ mutex_unlock(&callback_mutex);
+ update_tasks_cpumask(cs, NULL);
+ }
+
+ /* remove offline mems from @cs */
+ if (!nodes_empty(off_mems)) {
+ tmp_mems = cs->mems_allowed;
+ mutex_lock(&callback_mutex);
+ nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
+ mutex_unlock(&callback_mutex);
+ update_tasks_nodemask(cs, &tmp_mems, NULL);
}
- return cp;
+ is_empty = cpumask_empty(cs->cpus_allowed) ||
+ nodes_empty(cs->mems_allowed);
+
+ mutex_unlock(&cpuset_mutex);
+
+ /*
+ * If @cs became empty, move tasks to the nearest ancestor with
+ * execution resources. This is full cgroup operation which will
+ * also call back into cpuset. Should be done outside any lock.
+ */
+ if (is_empty)
+ remove_tasks_in_empty_cpuset(cs);
+
+ /* the following may free @cs, should be the last operation */
+ css_put(&cs->css);
}
+/**
+ * schedule_cpuset_propagate_hotplug - schedule hotplug propagation to a cpuset
+ * @cs: cpuset of interest
+ *
+ * Schedule cpuset_propagate_hotplug_workfn() which will update CPU and
+ * memory masks according to top_cpuset.
+ */
+static void schedule_cpuset_propagate_hotplug(struct cpuset *cs)
+{
+ /*
+ * Pin @cs. The refcnt will be released when the work item
+ * finishes executing.
+ */
+ if (!css_tryget(&cs->css))
+ return;
-/*
- * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory
- * online/offline) and update the cpusets accordingly.
- * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such
- * cpuset must be moved to a parent cpuset.
+ /*
+ * Queue @cs->hotplug_work. If already pending, lose the css ref.
+ * cpuset_propagate_hotplug_wq is ordered and propagation will
+ * happen in the order this function is called.
+ */
+ if (!queue_work(cpuset_propagate_hotplug_wq, &cs->hotplug_work))
+ css_put(&cs->css);
+}
+
+/**
+ * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
*
- * Called with cgroup_mutex held. We take callback_mutex to modify
- * cpus_allowed and mems_allowed.
+ * This function is called after either CPU or memory configuration has
+ * changed and updates cpuset accordingly. The top_cpuset is always
+ * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
+ * order to make cpusets transparent (of no affect) on systems that are
+ * actively using CPU hotplug but making no active use of cpusets.
*
- * This walk processes the tree from top to bottom, completing one layer
- * before dropping down to the next. It always processes a node before
- * any of its children.
+ * Non-root cpusets are only affected by offlining. If any CPUs or memory
+ * nodes have been taken down, cpuset_propagate_hotplug() is invoked on all
+ * descendants.
*
- * In the case of memory hot-unplug, it will remove nodes from N_MEMORY
- * if all present pages from a node are offlined.
+ * Note that CPU offlining during suspend is ignored. We don't modify
+ * cpusets across suspend/resume cycles at all.
*/
-static void
-scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event)
+static void cpuset_hotplug_workfn(struct work_struct *work)
{
- LIST_HEAD(queue);
- struct cpuset *cp; /* scans cpusets being updated */
- static nodemask_t oldmems; /* protected by cgroup_mutex */
+ static cpumask_t new_cpus, tmp_cpus;
+ static nodemask_t new_mems, tmp_mems;
+ bool cpus_updated, mems_updated;
+ bool cpus_offlined, mems_offlined;
- list_add_tail((struct list_head *)&root->stack_list, &queue);
+ mutex_lock(&cpuset_mutex);
- switch (event) {
- case CPUSET_CPU_OFFLINE:
- while ((cp = cpuset_next(&queue)) != NULL) {
+ /* fetch the available cpus/mems and find out which changed how */
+ cpumask_copy(&new_cpus, cpu_active_mask);
+ new_mems = node_states[N_MEMORY];
- /* Continue past cpusets with all cpus online */
- if (cpumask_subset(cp->cpus_allowed, cpu_active_mask))
- continue;
+ cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus);
+ cpus_offlined = cpumask_andnot(&tmp_cpus, top_cpuset.cpus_allowed,
+ &new_cpus);
- /* Remove offline cpus from this cpuset. */
- mutex_lock(&callback_mutex);
- cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
- cpu_active_mask);
- mutex_unlock(&callback_mutex);
+ mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems);
+ nodes_andnot(tmp_mems, top_cpuset.mems_allowed, new_mems);
+ mems_offlined = !nodes_empty(tmp_mems);
- /* Move tasks from the empty cpuset to a parent */
- if (cpumask_empty(cp->cpus_allowed))
- remove_tasks_in_empty_cpuset(cp);
- else
- update_tasks_cpumask(cp, NULL);
- }
- break;
+ /* synchronize cpus_allowed to cpu_active_mask */
+ if (cpus_updated) {
+ mutex_lock(&callback_mutex);
+ cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
+ mutex_unlock(&callback_mutex);
+ /* we don't mess with cpumasks of tasks in top_cpuset */
+ }
- case CPUSET_MEM_OFFLINE:
- while ((cp = cpuset_next(&queue)) != NULL) {
+ /* synchronize mems_allowed to N_MEMORY */
+ if (mems_updated) {
+ tmp_mems = top_cpuset.mems_allowed;
+ mutex_lock(&callback_mutex);
+ top_cpuset.mems_allowed = new_mems;
+ mutex_unlock(&callback_mutex);
+ update_tasks_nodemask(&top_cpuset, &tmp_mems, NULL);
+ }
- /* Continue past cpusets with all mems online */
- if (nodes_subset(cp->mems_allowed,
- node_states[N_MEMORY]))
- continue;
+ /* if cpus or mems went down, we need to propagate to descendants */
+ if (cpus_offlined || mems_offlined) {
+ struct cpuset *cs;
+ struct cgroup *pos_cgrp;
- oldmems = cp->mems_allowed;
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset)
+ schedule_cpuset_propagate_hotplug(cs);
+ rcu_read_unlock();
+ }
- /* Remove offline mems from this cpuset. */
- mutex_lock(&callback_mutex);
- nodes_and(cp->mems_allowed, cp->mems_allowed,
- node_states[N_MEMORY]);
- mutex_unlock(&callback_mutex);
+ mutex_unlock(&cpuset_mutex);
- /* Move tasks from the empty cpuset to a parent */
- if (nodes_empty(cp->mems_allowed))
- remove_tasks_in_empty_cpuset(cp);
- else
- update_tasks_nodemask(cp, &oldmems, NULL);
- }
+ /* wait for propagations to finish */
+ flush_workqueue(cpuset_propagate_hotplug_wq);
+
+ /* rebuild sched domains if cpus_allowed has changed */
+ if (cpus_updated) {
+ struct sched_domain_attr *attr;
+ cpumask_var_t *doms;
+ int ndoms;
+
+ mutex_lock(&cpuset_mutex);
+ ndoms = generate_sched_domains(&doms, &attr);
+ mutex_unlock(&cpuset_mutex);
+
+ partition_sched_domains(ndoms, doms, attr);
}
}
-/*
- * The top_cpuset tracks what CPUs and Memory Nodes are online,
- * period. This is necessary in order to make cpusets transparent
- * (of no affect) on systems that are actively using CPU hotplug
- * but making no active use of cpusets.
- *
- * The only exception to this is suspend/resume, where we don't
- * modify cpusets at all.
- *
- * This routine ensures that top_cpuset.cpus_allowed tracks
- * cpu_active_mask on each CPU hotplug (cpuhp) event.
- *
- * Called within get_online_cpus(). Needs to call cgroup_lock()
- * before calling generate_sched_domains().
- *
- * @cpu_online: Indicates whether this is a CPU online event (true) or
- * a CPU offline event (false).
- */
void cpuset_update_active_cpus(bool cpu_online)
{
- struct sched_domain_attr *attr;
- cpumask_var_t *doms;
- int ndoms;
-
- cgroup_lock();
- mutex_lock(&callback_mutex);
- cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
- mutex_unlock(&callback_mutex);
-
- if (!cpu_online)
- scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE);
-
- ndoms = generate_sched_domains(&doms, &attr);
- cgroup_unlock();
-
- /* Have scheduler rebuild the domains */
- partition_sched_domains(ndoms, doms, attr);
+ /*
+ * We're inside cpu hotplug critical region which usually nests
+ * inside cgroup synchronization. Bounce actual hotplug processing
+ * to a work item to avoid reverse locking order.
+ *
+ * We still need to do partition_sched_domains() synchronously;
+ * otherwise, the scheduler will get confused and put tasks to the
+ * dead CPU. Fall back to the default single domain.
+ * cpuset_hotplug_workfn() will rebuild it as necessary.
+ */
+ partition_sched_domains(1, NULL, NULL);
+ schedule_work(&cpuset_hotplug_work);
}
#ifdef CONFIG_MEMORY_HOTPLUG
@@ -2133,29 +2260,7 @@ void cpuset_update_active_cpus(bool cpu_online)
static int cpuset_track_online_nodes(struct notifier_block *self,
unsigned long action, void *arg)
{
- static nodemask_t oldmems; /* protected by cgroup_mutex */
-
- cgroup_lock();
- switch (action) {
- case MEM_ONLINE:
- oldmems = top_cpuset.mems_allowed;
- mutex_lock(&callback_mutex);
- top_cpuset.mems_allowed = node_states[N_MEMORY];
- mutex_unlock(&callback_mutex);
- update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
- break;
- case MEM_OFFLINE:
- /*
- * needn't update top_cpuset.mems_allowed explicitly because
- * scan_cpusets_upon_hotplug() will update it.
- */
- scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE);
- break;
- default:
- break;
- }
- cgroup_unlock();
-
+ schedule_work(&cpuset_hotplug_work);
return NOTIFY_OK;
}
#endif
@@ -2173,8 +2278,9 @@ void __init cpuset_init_smp(void)
hotplug_memory_notifier(cpuset_track_online_nodes, 10);
- cpuset_wq = create_singlethread_workqueue("cpuset");
- BUG_ON(!cpuset_wq);
+ cpuset_propagate_hotplug_wq =
+ alloc_ordered_workqueue("cpuset_hotplug", 0);
+ BUG_ON(!cpuset_propagate_hotplug_wq);
}
/**
@@ -2273,8 +2379,8 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
*/
static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
{
- while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent)
- cs = cs->parent;
+ while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
+ cs = parent_cs(cs);
return cs;
}
@@ -2412,17 +2518,6 @@ int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
}
/**
- * cpuset_unlock - release lock on cpuset changes
- *
- * Undo the lock taken in a previous cpuset_lock() call.
- */
-
-void cpuset_unlock(void)
-{
- mutex_unlock(&callback_mutex);
-}
-
-/**
* cpuset_mem_spread_node() - On which node to begin search for a file page
* cpuset_slab_spread_node() - On which node to begin search for a slab page
*
@@ -2511,8 +2606,16 @@ void cpuset_print_task_mems_allowed(struct task_struct *tsk)
dentry = task_cs(tsk)->css.cgroup->dentry;
spin_lock(&cpuset_buffer_lock);
- snprintf(cpuset_name, CPUSET_NAME_LEN,
- dentry ? (const char *)dentry->d_name.name : "/");
+
+ if (!dentry) {
+ strcpy(cpuset_name, "/");
+ } else {
+ spin_lock(&dentry->d_lock);
+ strlcpy(cpuset_name, (const char *)dentry->d_name.name,
+ CPUSET_NAME_LEN);
+ spin_unlock(&dentry->d_lock);
+ }
+
nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
tsk->mems_allowed);
printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
@@ -2560,7 +2663,7 @@ void __cpuset_memory_pressure_bump(void)
* - Used for /proc/<pid>/cpuset.
* - No need to task_lock(tsk) on this tsk->cpuset reference, as it
* doesn't really matter if tsk->cpuset changes after we read it,
- * and we take cgroup_mutex, keeping cpuset_attach() from changing it
+ * and we take cpuset_mutex, keeping cpuset_attach() from changing it
* anyway.
*/
static int proc_cpuset_show(struct seq_file *m, void *unused_v)
@@ -2582,16 +2685,15 @@ static int proc_cpuset_show(struct seq_file *m, void *unused_v)
if (!tsk)
goto out_free;
- retval = -EINVAL;
- cgroup_lock();
+ rcu_read_lock();
css = task_subsys_state(tsk, cpuset_subsys_id);
retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
+ rcu_read_unlock();
if (retval < 0)
- goto out_unlock;
+ goto out_put_task;
seq_puts(m, buf);
seq_putc(m, '\n');
-out_unlock:
- cgroup_unlock();
+out_put_task:
put_task_struct(tsk);
out_free:
kfree(buf);
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 9a61738..c26278f 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -29,6 +29,7 @@
*/
#include <linux/pid_namespace.h>
#include <linux/clocksource.h>
+#include <linux/serial_core.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/console.h>
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h
index 3494c28..2235967 100644
--- a/kernel/debug/debug_core.h
+++ b/kernel/debug/debug_core.h
@@ -72,6 +72,8 @@ extern int dbg_kdb_mode;
#ifdef CONFIG_KGDB_KDB
extern int kdb_stub(struct kgdb_state *ks);
extern int kdb_parse(const char *cmdstr);
+extern int kdb_common_init_state(struct kgdb_state *ks);
+extern int kdb_common_deinit_state(void);
#else /* ! CONFIG_KGDB_KDB */
static inline int kdb_stub(struct kgdb_state *ks)
{
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index ce615e0..19d9a57 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -31,6 +31,7 @@
#include <linux/kernel.h>
#include <linux/kgdb.h>
#include <linux/kdb.h>
+#include <linux/serial_core.h>
#include <linux/reboot.h>
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
@@ -782,7 +783,10 @@ static void gdb_cmd_query(struct kgdb_state *ks)
len = len / 2;
remcom_out_buffer[len++] = 0;
+ kdb_common_init_state(ks);
kdb_parse(remcom_out_buffer);
+ kdb_common_deinit_state();
+
strcpy(remcom_out_buffer, "OK");
}
break;
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index 8418c2f..70a5046 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -486,11 +486,9 @@ static int kdb_bc(int argc, const char **argv)
/*
* kdb_ss
*
- * Process the 'ss' (Single Step) and 'ssb' (Single Step to Branch)
- * commands.
+ * Process the 'ss' (Single Step) command.
*
* ss
- * ssb
*
* Parameters:
* argc Argument count
@@ -498,35 +496,23 @@ static int kdb_bc(int argc, const char **argv)
* Outputs:
* None.
* Returns:
- * KDB_CMD_SS[B] for success, a kdb error if failure.
+ * KDB_CMD_SS for success, a kdb error if failure.
* Locking:
* None.
* Remarks:
*
* Set the arch specific option to trigger a debug trap after the next
* instruction.
- *
- * For 'ssb', set the trace flag in the debug trap handler
- * after printing the current insn and return directly without
- * invoking the kdb command processor, until a branch instruction
- * is encountered.
*/
static int kdb_ss(int argc, const char **argv)
{
- int ssb = 0;
-
- ssb = (strcmp(argv[0], "ssb") == 0);
if (argc != 0)
return KDB_ARGCOUNT;
/*
* Set trace flag and go.
*/
KDB_STATE_SET(DOING_SS);
- if (ssb) {
- KDB_STATE_SET(DOING_SSB);
- return KDB_CMD_SSB;
- }
return KDB_CMD_SS;
}
@@ -561,8 +547,6 @@ void __init kdb_initbptab(void)
kdb_register_repeat("ss", kdb_ss, "",
"Single Step", 1, KDB_REPEAT_NO_ARGS);
- kdb_register_repeat("ssb", kdb_ss, "",
- "Single step to branch/call", 0, KDB_REPEAT_NO_ARGS);
/*
* Architecture dependent initialization.
*/
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index be7b33b..328d18ef 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -34,6 +34,22 @@ EXPORT_SYMBOL_GPL(kdb_poll_idx);
static struct kgdb_state *kdb_ks;
+int kdb_common_init_state(struct kgdb_state *ks)
+{
+ kdb_initial_cpu = atomic_read(&kgdb_active);
+ kdb_current_task = kgdb_info[ks->cpu].task;
+ kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo;
+ return 0;
+}
+
+int kdb_common_deinit_state(void)
+{
+ kdb_initial_cpu = -1;
+ kdb_current_task = NULL;
+ kdb_current_regs = NULL;
+ return 0;
+}
+
int kdb_stub(struct kgdb_state *ks)
{
int error = 0;
@@ -94,13 +110,10 @@ int kdb_stub(struct kgdb_state *ks)
}
/* Set initial kdb state variables */
KDB_STATE_CLEAR(KGDB_TRANS);
- kdb_initial_cpu = atomic_read(&kgdb_active);
- kdb_current_task = kgdb_info[ks->cpu].task;
- kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo;
+ kdb_common_init_state(ks);
/* Remove any breakpoints as needed by kdb and clear single step */
kdb_bp_remove();
KDB_STATE_CLEAR(DOING_SS);
- KDB_STATE_CLEAR(DOING_SSB);
KDB_STATE_SET(PAGER);
/* zero out any offline cpu data */
for_each_present_cpu(i) {
@@ -125,9 +138,7 @@ int kdb_stub(struct kgdb_state *ks)
* Upon exit from the kdb main loop setup break points and restart
* the system based on the requested continue state
*/
- kdb_initial_cpu = -1;
- kdb_current_task = NULL;
- kdb_current_regs = NULL;
+ kdb_common_deinit_state();
KDB_STATE_CLEAR(PAGER);
kdbnearsym_cleanup();
if (error == KDB_CMD_KGDB) {
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 8875254..00eb8f7 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -124,7 +124,7 @@ static kdbmsg_t kdbmsgs[] = {
};
#undef KDBMSG
-static const int __nkdb_err = sizeof(kdbmsgs) / sizeof(kdbmsg_t);
+static const int __nkdb_err = ARRAY_SIZE(kdbmsgs);
/*
@@ -175,7 +175,7 @@ static char *__env[] = {
(char *)0,
};
-static const int __nenv = (sizeof(__env) / sizeof(char *));
+static const int __nenv = ARRAY_SIZE(__env);
struct task_struct *kdb_curr_task(int cpu)
{
@@ -681,34 +681,50 @@ static int kdb_defcmd(int argc, const char **argv)
}
if (argc != 3)
return KDB_ARGCOUNT;
- defcmd_set = kmalloc((defcmd_set_count + 1) * sizeof(*defcmd_set),
- GFP_KDB);
- if (!defcmd_set) {
- kdb_printf("Could not allocate new defcmd_set entry for %s\n",
- argv[1]);
- defcmd_set = save_defcmd_set;
+ if (in_dbg_master()) {
+ kdb_printf("Command only available during kdb_init()\n");
return KDB_NOTIMP;
}
+ defcmd_set = kmalloc((defcmd_set_count + 1) * sizeof(*defcmd_set),
+ GFP_KDB);
+ if (!defcmd_set)
+ goto fail_defcmd;
memcpy(defcmd_set, save_defcmd_set,
defcmd_set_count * sizeof(*defcmd_set));
- kfree(save_defcmd_set);
s = defcmd_set + defcmd_set_count;
memset(s, 0, sizeof(*s));
s->usable = 1;
s->name = kdb_strdup(argv[1], GFP_KDB);
+ if (!s->name)
+ goto fail_name;
s->usage = kdb_strdup(argv[2], GFP_KDB);
+ if (!s->usage)
+ goto fail_usage;
s->help = kdb_strdup(argv[3], GFP_KDB);
+ if (!s->help)
+ goto fail_help;
if (s->usage[0] == '"') {
- strcpy(s->usage, s->usage+1);
+ strcpy(s->usage, argv[2]+1);
s->usage[strlen(s->usage)-1] = '\0';
}
if (s->help[0] == '"') {
- strcpy(s->help, s->help+1);
+ strcpy(s->help, argv[3]+1);
s->help[strlen(s->help)-1] = '\0';
}
++defcmd_set_count;
defcmd_in_progress = 1;
+ kfree(save_defcmd_set);
return 0;
+fail_help:
+ kfree(s->usage);
+fail_usage:
+ kfree(s->name);
+fail_name:
+ kfree(defcmd_set);
+fail_defcmd:
+ kdb_printf("Could not allocate new defcmd_set entry for %s\n", argv[1]);
+ defcmd_set = save_defcmd_set;
+ return KDB_NOTIMP;
}
/*
@@ -1112,7 +1128,6 @@ void kdb_set_current_task(struct task_struct *p)
* KDB_CMD_GO User typed 'go'.
* KDB_CMD_CPU User switched to another cpu.
* KDB_CMD_SS Single step.
- * KDB_CMD_SSB Single step until branch.
*/
static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
kdb_dbtrap_t db_result)
@@ -1151,14 +1166,6 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
kdb_printf("due to Debug @ " kdb_machreg_fmt "\n",
instruction_pointer(regs));
break;
- case KDB_DB_SSB:
- /*
- * In the midst of ssb command. Just return.
- */
- KDB_DEBUG_STATE("kdb_local 3", reason);
- return KDB_CMD_SSB; /* Continue with SSB command */
-
- break;
case KDB_DB_SS:
break;
case KDB_DB_SSBPT:
@@ -1281,7 +1288,6 @@ do_full_getstr:
if (diag == KDB_CMD_GO
|| diag == KDB_CMD_CPU
|| diag == KDB_CMD_SS
- || diag == KDB_CMD_SSB
|| diag == KDB_CMD_KGDB)
break;
@@ -1368,12 +1374,6 @@ int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error,
break;
}
- if (result == KDB_CMD_SSB) {
- KDB_STATE_SET(DOING_SS);
- KDB_STATE_SET(DOING_SSB);
- break;
- }
-
if (result == KDB_CMD_KGDB) {
if (!KDB_STATE(DOING_KGDB))
kdb_printf("Entering please attach debugger "
@@ -2350,69 +2350,6 @@ static int kdb_pid(int argc, const char **argv)
return 0;
}
-/*
- * kdb_ll - This function implements the 'll' command which follows a
- * linked list and executes an arbitrary command for each
- * element.
- */
-static int kdb_ll(int argc, const char **argv)
-{
- int diag = 0;
- unsigned long addr;
- long offset = 0;
- unsigned long va;
- unsigned long linkoffset;
- int nextarg;
- const char *command;
-
- if (argc != 3)
- return KDB_ARGCOUNT;
-
- nextarg = 1;
- diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
- if (diag)
- return diag;
-
- diag = kdbgetularg(argv[2], &linkoffset);
- if (diag)
- return diag;
-
- /*
- * Using the starting address as
- * the first element in the list, and assuming that
- * the list ends with a null pointer.
- */
-
- va = addr;
- command = kdb_strdup(argv[3], GFP_KDB);
- if (!command) {
- kdb_printf("%s: cannot duplicate command\n", __func__);
- return 0;
- }
- /* Recursive use of kdb_parse, do not use argv after this point */
- argv = NULL;
-
- while (va) {
- char buf[80];
-
- if (KDB_FLAG(CMD_INTERRUPT))
- goto out;
-
- sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va);
- diag = kdb_parse(buf);
- if (diag)
- goto out;
-
- addr = va + linkoffset;
- if (kdb_getword(&va, addr, sizeof(va)))
- goto out;
- }
-
-out:
- kfree(command);
- return diag;
-}
-
static int kdb_kgdb(int argc, const char **argv)
{
return KDB_CMD_KGDB;
@@ -2430,11 +2367,15 @@ static int kdb_help(int argc, const char **argv)
kdb_printf("-----------------------------"
"-----------------------------\n");
for_each_kdbcmd(kt, i) {
- if (kt->cmd_name)
- kdb_printf("%-15.15s %-20.20s %s\n", kt->cmd_name,
- kt->cmd_usage, kt->cmd_help);
+ char *space = "";
if (KDB_FLAG(CMD_INTERRUPT))
return 0;
+ if (!kt->cmd_name)
+ continue;
+ if (strlen(kt->cmd_usage) > 20)
+ space = "\n ";
+ kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name,
+ kt->cmd_usage, space, kt->cmd_help);
}
return 0;
}
@@ -2739,7 +2680,7 @@ int kdb_register_repeat(char *cmd,
(kdb_max_commands - KDB_BASE_CMD_MAX) * sizeof(*new));
kfree(kdb_commands);
}
- memset(new + kdb_max_commands, 0,
+ memset(new + kdb_max_commands - KDB_BASE_CMD_MAX, 0,
kdb_command_extend * sizeof(*new));
kdb_commands = new;
kp = kdb_commands + kdb_max_commands - KDB_BASE_CMD_MAX;
@@ -2843,15 +2784,13 @@ static void __init kdb_inittab(void)
"Stack traceback", 1, KDB_REPEAT_NONE);
kdb_register_repeat("btp", kdb_bt, "<pid>",
"Display stack for process <pid>", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("bta", kdb_bt, "[DRSTCZEUIMA]",
- "Display stack all processes", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]",
+ "Backtrace all processes matching state flag", 0, KDB_REPEAT_NONE);
kdb_register_repeat("btc", kdb_bt, "",
"Backtrace current process on each cpu", 0, KDB_REPEAT_NONE);
kdb_register_repeat("btt", kdb_bt, "<vaddr>",
"Backtrace process given its struct task address", 0,
KDB_REPEAT_NONE);
- kdb_register_repeat("ll", kdb_ll, "<first-element> <linkoffset> <cmd>",
- "Execute cmd for each element in linked list", 0, KDB_REPEAT_NONE);
kdb_register_repeat("env", kdb_env, "",
"Show environment variables", 0, KDB_REPEAT_NONE);
kdb_register_repeat("set", kdb_set, "",
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 392ec6a..7afd3c8 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -19,7 +19,6 @@
#define KDB_CMD_GO (-1001)
#define KDB_CMD_CPU (-1002)
#define KDB_CMD_SS (-1003)
-#define KDB_CMD_SSB (-1004)
#define KDB_CMD_KGDB (-1005)
/* Internal debug flags */
@@ -125,8 +124,6 @@ extern int kdb_state;
* kdb control */
#define KDB_STATE_HOLD_CPU 0x00000010 /* Hold this cpu inside kdb */
#define KDB_STATE_DOING_SS 0x00000020 /* Doing ss command */
-#define KDB_STATE_DOING_SSB 0x00000040 /* Doing ssb command,
- * DOING_SS is also set */
#define KDB_STATE_SSBPT 0x00000080 /* Install breakpoint
* after one ss, independent of
* DOING_SS */
@@ -191,7 +188,6 @@ extern void kdb_bp_remove(void);
typedef enum {
KDB_DB_BPT, /* Breakpoint */
KDB_DB_SS, /* Single-step trap */
- KDB_DB_SSB, /* Single step to branch */
KDB_DB_SSBPT, /* Single step over breakpoint */
KDB_DB_NOBPT /* Spurious breakpoint */
} kdb_dbtrap_t;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5c75791..9fcb094 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3691,7 +3691,7 @@ unlock:
static int perf_fasync(int fd, struct file *filp, int on)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct perf_event *event = filp->private_data;
int retval;
@@ -4434,12 +4434,15 @@ static void perf_event_task_event(struct perf_task_event *task_event)
if (ctxn < 0)
goto next;
ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+ if (ctx)
+ perf_event_task_ctx(ctx, task_event);
}
- if (ctx)
- perf_event_task_ctx(ctx, task_event);
next:
put_cpu_ptr(pmu->pmu_cpu_context);
}
+ if (task_event->task_ctx)
+ perf_event_task_ctx(task_event->task_ctx, task_event);
+
rcu_read_unlock();
}
@@ -4593,6 +4596,7 @@ void perf_event_comm(struct task_struct *task)
struct perf_event_context *ctx;
int ctxn;
+ rcu_read_lock();
for_each_task_context_nr(ctxn) {
ctx = task->perf_event_ctxp[ctxn];
if (!ctx)
@@ -4600,6 +4604,7 @@ void perf_event_comm(struct task_struct *task)
perf_event_enable_on_exec(ctx);
}
+ rcu_read_unlock();
if (!atomic_read(&nr_comm_events))
return;
@@ -4734,7 +4739,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
} else {
if (arch_vma_name(mmap_event->vma)) {
name = strncpy(tmp, arch_vma_name(mmap_event->vma),
- sizeof(tmp));
+ sizeof(tmp) - 1);
+ tmp[sizeof(tmp) - 1] = '\0';
goto got_name;
}
@@ -5126,7 +5132,6 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
{
struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
struct perf_event *event;
- struct hlist_node *node;
struct hlist_head *head;
rcu_read_lock();
@@ -5134,7 +5139,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
if (!head)
goto end;
- hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
+ hlist_for_each_entry_rcu(event, head, hlist_entry) {
if (perf_swevent_match(event, type, event_id, data, regs))
perf_swevent_event(event, nr, data, regs);
}
@@ -5328,7 +5333,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
static int perf_swevent_init(struct perf_event *event)
{
- int event_id = event->attr.config;
+ u64 event_id = event->attr.config;
if (event->attr.type != PERF_TYPE_SOFTWARE)
return -ENOENT;
@@ -5419,7 +5424,6 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
{
struct perf_sample_data data;
struct perf_event *event;
- struct hlist_node *node;
struct perf_raw_record raw = {
.size = entry_size,
@@ -5429,7 +5433,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
perf_sample_data_init(&data, addr, 0);
data.raw = &raw;
- hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
+ hlist_for_each_entry_rcu(event, head, hlist_entry) {
if (perf_tp_event_match(event, &data, regs))
perf_swevent_event(event, count, &data, regs);
}
@@ -5649,6 +5653,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event)
event->attr.sample_period = NSEC_PER_SEC / freq;
hwc->sample_period = event->attr.sample_period;
local64_set(&hwc->period_left, hwc->sample_period);
+ hwc->last_period = hwc->sample_period;
event->attr.freq = 0;
}
}
@@ -5965,13 +5970,9 @@ int perf_pmu_register(struct pmu *pmu, char *name, int type)
pmu->name = name;
if (type < 0) {
- int err = idr_pre_get(&pmu_idr, GFP_KERNEL);
- if (!err)
- goto free_pdc;
-
- err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type);
- if (err) {
- ret = err;
+ type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
+ if (type < 0) {
+ ret = type;
goto free_pdc;
}
}
@@ -5988,6 +5989,7 @@ skip_type:
if (pmu->pmu_cpu_context)
goto got_cpu_context;
+ ret = -ENOMEM;
pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
if (!pmu->pmu_cpu_context)
goto free_dev;
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index d56a64c..eb675c4 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -16,7 +16,7 @@ struct ring_buffer {
int page_order; /* allocation order */
#endif
int nr_pages; /* nr of data pages */
- int writable; /* are we writable */
+ int overwrite; /* can overwrite itself */
atomic_t poll; /* POLL_ for wakeups */
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 23cb34f..97fddb0 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -18,12 +18,24 @@
static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,
unsigned long offset, unsigned long head)
{
- unsigned long mask;
+ unsigned long sz = perf_data_size(rb);
+ unsigned long mask = sz - 1;
- if (!rb->writable)
+ /*
+ * check if user-writable
+ * overwrite : over-write its own tail
+ * !overwrite: buffer possibly drops events.
+ */
+ if (rb->overwrite)
return true;
- mask = perf_data_size(rb) - 1;
+ /*
+ * verify that payload is not bigger than buffer
+ * otherwise masking logic may fail to detect
+ * the "not enough space" condition
+ */
+ if ((head - offset) > sz)
+ return false;
offset = (offset - tail) & mask;
head = (head - tail) & mask;
@@ -212,7 +224,9 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
rb->watermark = max_size / 2;
if (flags & RING_BUFFER_WRITABLE)
- rb->writable = 1;
+ rb->overwrite = 0;
+ else
+ rb->overwrite = 1;
atomic_set(&rb->refcount, 1);
diff --git a/kernel/exit.c b/kernel/exit.c
index 7dd2040..60bc027 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -20,6 +20,7 @@
#include <linux/tsacct_kern.h>
#include <linux/file.h>
#include <linux/fdtable.h>
+#include <linux/freezer.h>
#include <linux/binfmts.h>
#include <linux/nsproxy.h>
#include <linux/pid_namespace.h>
@@ -31,7 +32,6 @@
#include <linux/mempolicy.h>
#include <linux/taskstats_kern.h>
#include <linux/delayacct.h>
-#include <linux/freezer.h>
#include <linux/cgroup.h>
#include <linux/syscalls.h>
#include <linux/signal.h>
@@ -485,7 +485,7 @@ static void exit_mm(struct task_struct * tsk)
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
if (!self.task) /* see coredump_finish() */
break;
- schedule();
+ freezable_schedule();
}
__set_task_state(tsk, TASK_RUNNING);
down_read(&mm->mmap_sem);
diff --git a/kernel/fork.c b/kernel/fork.c
index 4133876..1766d32 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -413,7 +413,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
tmp->vm_next = tmp->vm_prev = NULL;
file = tmp->vm_file;
if (file) {
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct address_space *mapping = file->f_mapping;
get_file(file);
@@ -1141,6 +1141,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
+ if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
+ return ERR_PTR(-EINVAL);
+
/*
* Thread groups must share signals as well, and detached threads
* can only be started up within the thread group.
@@ -1807,7 +1810,7 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
* If unsharing a user namespace must also unshare the thread.
*/
if (unshare_flags & CLONE_NEWUSER)
- unshare_flags |= CLONE_THREAD;
+ unshare_flags |= CLONE_THREAD | CLONE_FS;
/*
* If unsharing a pid namespace must also unshare the thread.
*/
@@ -1861,10 +1864,8 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
exit_sem(current);
}
- if (new_nsproxy) {
+ if (new_nsproxy)
switch_task_namespaces(current, new_nsproxy);
- new_nsproxy = NULL;
- }
task_lock(current);
@@ -1894,9 +1895,6 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
}
}
- if (new_nsproxy)
- put_nsproxy(new_nsproxy);
-
bad_unshare_cleanup_cred:
if (new_cred)
put_cred(new_cred);
diff --git a/kernel/futex.c b/kernel/futex.c
index 9618b6e..b26dcfc 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -223,10 +223,11 @@ static void drop_futex_key_refs(union futex_key *key)
* @rw: mapping needs to be read/write (values: VERIFY_READ,
* VERIFY_WRITE)
*
- * Returns a negative error code or 0
+ * Return: a negative error code or 0
+ *
* The key words are stored in *key on success.
*
- * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode,
+ * For shared mappings, it's (page->index, file_inode(vma->vm_file),
* offset_within_page). For private mappings, it's (uaddr, current->mm).
* We can usually work out the index without swapping in the page.
*
@@ -705,9 +706,9 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
* be "current" except in the case of requeue pi.
* @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
*
- * Returns:
- * 0 - ready to wait
- * 1 - acquired the lock
+ * Return:
+ * 0 - ready to wait;
+ * 1 - acquired the lock;
* <0 - error
*
* The hb->lock and futex_key refs shall be held by the caller.
@@ -1191,9 +1192,9 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
* then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
* hb1 and hb2 must be held by the caller.
*
- * Returns:
- * 0 - failed to acquire the lock atomicly
- * 1 - acquired the lock
+ * Return:
+ * 0 - failed to acquire the lock atomically;
+ * 1 - acquired the lock;
* <0 - error
*/
static int futex_proxy_trylock_atomic(u32 __user *pifutex,
@@ -1254,8 +1255,8 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
* Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
* uaddr2 atomically on behalf of the top waiter.
*
- * Returns:
- * >=0 - on success, the number of tasks requeued or woken
+ * Return:
+ * >=0 - on success, the number of tasks requeued or woken;
* <0 - on error
*/
static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
@@ -1536,8 +1537,8 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
* The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
* be paired with exactly one earlier call to queue_me().
*
- * Returns:
- * 1 - if the futex_q was still queued (and we removed unqueued it)
+ * Return:
+ * 1 - if the futex_q was still queued (and we removed unqueued it);
* 0 - if the futex_q was already removed by the waking thread
*/
static int unqueue_me(struct futex_q *q)
@@ -1707,9 +1708,9 @@ static long futex_wait_restart(struct restart_block *restart);
* the pi_state owner as well as handle race conditions that may allow us to
* acquire the lock. Must be called with the hb lock held.
*
- * Returns:
- * 1 - success, lock taken
- * 0 - success, lock not taken
+ * Return:
+ * 1 - success, lock taken;
+ * 0 - success, lock not taken;
* <0 - on error (-EFAULT)
*/
static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
@@ -1824,8 +1825,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
* Return with the hb lock held and a q.key reference on success, and unlocked
* with no q.key reference on failure.
*
- * Returns:
- * 0 - uaddr contains val and hb has been locked
+ * Return:
+ * 0 - uaddr contains val and hb has been locked;
* <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
*/
static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
@@ -2203,9 +2204,9 @@ pi_faulted:
* the wakeup and return the appropriate error code to the caller. Must be
* called with the hb lock held.
*
- * Returns
- * 0 - no early wakeup detected
- * <0 - -ETIMEDOUT or -ERESTARTNOINTR
+ * Return:
+ * 0 = no early wakeup detected;
+ * <0 = -ETIMEDOUT or -ERESTARTNOINTR
*/
static inline
int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
@@ -2247,7 +2248,6 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
* @val: the expected value of uaddr
* @abs_time: absolute timeout
* @bitset: 32 bit wakeup bitset set by userspace, defaults to all
- * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0)
* @uaddr2: the pi futex we will take prior to returning to user-space
*
* The caller will wait on uaddr and will be requeued by futex_requeue() to
@@ -2258,7 +2258,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
* there was a need to.
*
* We call schedule in futex_wait_queue_me() when we enqueue and return there
- * via the following:
+ * via the following--
* 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
* 2) wakeup on uaddr2 after a requeue
* 3) signal
@@ -2276,8 +2276,8 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
*
* If 4 or 7, we cleanup and return with -ETIMEDOUT.
*
- * Returns:
- * 0 - On success
+ * Return:
+ * 0 - On success;
* <0 - On error
*/
static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
@@ -2472,8 +2472,6 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
if (!futex_cmpxchg_enabled)
return -ENOSYS;
- WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n");
-
rcu_read_lock();
ret = -ESRCH;
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 83e368b..f9f44fd 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -11,6 +11,7 @@
#include <linux/nsproxy.h>
#include <linux/futex.h>
#include <linux/ptrace.h>
+#include <linux/syscalls.h>
#include <asm/uaccess.h>
@@ -116,9 +117,9 @@ void compat_exit_robust_list(struct task_struct *curr)
}
}
-asmlinkage long
-compat_sys_set_robust_list(struct compat_robust_list_head __user *head,
- compat_size_t len)
+COMPAT_SYSCALL_DEFINE2(set_robust_list,
+ struct compat_robust_list_head __user *, head,
+ compat_size_t, len)
{
if (!futex_cmpxchg_enabled)
return -ENOSYS;
@@ -131,9 +132,9 @@ compat_sys_set_robust_list(struct compat_robust_list_head __user *head,
return 0;
}
-asmlinkage long
-compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
- compat_size_t __user *len_ptr)
+COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
+ compat_uptr_t __user *, head_ptr,
+ compat_size_t __user *, len_ptr)
{
struct compat_robust_list_head __user *head;
unsigned long ret;
@@ -142,8 +143,6 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
if (!futex_cmpxchg_enabled)
return -ENOSYS;
- WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n");
-
rcu_read_lock();
ret = -ESRCH;
@@ -172,9 +171,9 @@ err_unlock:
return ret;
}
-asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
- struct compat_timespec __user *utime, u32 __user *uaddr2,
- u32 val3)
+COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
+ struct compat_timespec __user *, utime, u32 __user *, uaddr2,
+ u32, val3)
{
struct timespec ts;
ktime_t t, *tp = NULL;
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index a920281..d4da55d 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -35,7 +35,7 @@ config GCOV_KERNEL
config GCOV_PROFILE_ALL
bool "Profile entire Kernel"
depends on GCOV_KERNEL
- depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE
+ depends on SUPERH || S390 || X86 || PPC || MICROBLAZE
default n
---help---
This options activates profiling for the entire kernel.
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index cc47812..14be27f 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -63,6 +63,7 @@
DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
{
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
.clock_base =
{
{
@@ -1642,8 +1643,6 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
int i;
- raw_spin_lock_init(&cpu_base->lock);
-
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
cpu_base->clock_base[i].cpu_base = cpu_base;
timerqueue_init_head(&cpu_base->clock_base[i].active);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 4bd4faa..397db02 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -76,7 +76,7 @@ static int irq_affinity_list_proc_show(struct seq_file *m, void *v)
static ssize_t write_irq_affinity(int type, struct file *file,
const char __user *buffer, size_t count, loff_t *pos)
{
- unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data;
+ unsigned int irq = (int)(long)PDE(file_inode(file))->data;
cpumask_var_t new_value;
int err;
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 5e4bd78..ffd4e11 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -54,6 +54,12 @@ struct resource crashk_res = {
.end = 0,
.flags = IORESOURCE_BUSY | IORESOURCE_MEM
};
+struct resource crashk_low_res = {
+ .name = "Crash kernel",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
int kexec_should_crash(struct task_struct *p)
{
@@ -223,6 +229,8 @@ out:
}
+static void kimage_free_page_list(struct list_head *list);
+
static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
unsigned long nr_segments,
struct kexec_segment __user *segments)
@@ -236,8 +244,6 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
if (result)
goto out;
- *rimage = image;
-
/*
* Find a location for the control code buffer, and add it
* the vector of segments so that it's pages will also be
@@ -248,22 +254,22 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
get_order(KEXEC_CONTROL_PAGE_SIZE));
if (!image->control_code_page) {
printk(KERN_ERR "Could not allocate control_code_buffer\n");
- goto out;
+ goto out_free;
}
image->swap_page = kimage_alloc_control_pages(image, 0);
if (!image->swap_page) {
printk(KERN_ERR "Could not allocate swap buffer\n");
- goto out;
+ goto out_free;
}
- result = 0;
- out:
- if (result == 0)
- *rimage = image;
- else
- kfree(image);
+ *rimage = image;
+ return 0;
+out_free:
+ kimage_free_page_list(&image->control_pages);
+ kfree(image);
+out:
return result;
}
@@ -310,7 +316,7 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
mend = mstart + image->segment[i].memsz - 1;
/* Ensure we are within the crash kernel limits */
if ((mstart < crashk_res.start) || (mend > crashk_res.end))
- goto out;
+ goto out_free;
}
/*
@@ -323,16 +329,15 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
get_order(KEXEC_CONTROL_PAGE_SIZE));
if (!image->control_code_page) {
printk(KERN_ERR "Could not allocate control_code_buffer\n");
- goto out;
+ goto out_free;
}
- result = 0;
-out:
- if (result == 0)
- *rimage = image;
- else
- kfree(image);
+ *rimage = image;
+ return 0;
+out_free:
+ kfree(image);
+out:
return result;
}
@@ -497,8 +502,6 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
break;
- if (hole_end > crashk_res.end)
- break;
/* See if I overlap any of the segments */
for (i = 0; i < image->nr_segments; i++) {
unsigned long mstart, mend;
@@ -1365,34 +1368,114 @@ static int __init parse_crashkernel_simple(char *cmdline,
return 0;
}
+#define SUFFIX_HIGH 0
+#define SUFFIX_LOW 1
+#define SUFFIX_NULL 2
+static __initdata char *suffix_tbl[] = {
+ [SUFFIX_HIGH] = ",high",
+ [SUFFIX_LOW] = ",low",
+ [SUFFIX_NULL] = NULL,
+};
+
/*
- * That function is the entry point for command line parsing and should be
- * called from the arch-specific code.
+ * That function parses "suffix" crashkernel command lines like
+ *
+ * crashkernel=size,[high|low]
+ *
+ * It returns 0 on success and -EINVAL on failure.
*/
-int __init parse_crashkernel(char *cmdline,
+static int __init parse_crashkernel_suffix(char *cmdline,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base,
+ const char *suffix)
+{
+ char *cur = cmdline;
+
+ *crash_size = memparse(cmdline, &cur);
+ if (cmdline == cur) {
+ pr_warn("crashkernel: memory value expected\n");
+ return -EINVAL;
+ }
+
+ /* check with suffix */
+ if (strncmp(cur, suffix, strlen(suffix))) {
+ pr_warn("crashkernel: unrecognized char\n");
+ return -EINVAL;
+ }
+ cur += strlen(suffix);
+ if (*cur != ' ' && *cur != '\0') {
+ pr_warn("crashkernel: unrecognized char\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static __init char *get_last_crashkernel(char *cmdline,
+ const char *name,
+ const char *suffix)
+{
+ char *p = cmdline, *ck_cmdline = NULL;
+
+ /* find crashkernel and use the last one if there are more */
+ p = strstr(p, name);
+ while (p) {
+ char *end_p = strchr(p, ' ');
+ char *q;
+
+ if (!end_p)
+ end_p = p + strlen(p);
+
+ if (!suffix) {
+ int i;
+
+ /* skip the one with any known suffix */
+ for (i = 0; suffix_tbl[i]; i++) {
+ q = end_p - strlen(suffix_tbl[i]);
+ if (!strncmp(q, suffix_tbl[i],
+ strlen(suffix_tbl[i])))
+ goto next;
+ }
+ ck_cmdline = p;
+ } else {
+ q = end_p - strlen(suffix);
+ if (!strncmp(q, suffix, strlen(suffix)))
+ ck_cmdline = p;
+ }
+next:
+ p = strstr(p+1, name);
+ }
+
+ if (!ck_cmdline)
+ return NULL;
+
+ return ck_cmdline;
+}
+
+static int __init __parse_crashkernel(char *cmdline,
unsigned long long system_ram,
unsigned long long *crash_size,
- unsigned long long *crash_base)
+ unsigned long long *crash_base,
+ const char *name,
+ const char *suffix)
{
- char *p = cmdline, *ck_cmdline = NULL;
char *first_colon, *first_space;
+ char *ck_cmdline;
BUG_ON(!crash_size || !crash_base);
*crash_size = 0;
*crash_base = 0;
- /* find crashkernel and use the last one if there are more */
- p = strstr(p, "crashkernel=");
- while (p) {
- ck_cmdline = p;
- p = strstr(p+1, "crashkernel=");
- }
+ ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
if (!ck_cmdline)
return -EINVAL;
- ck_cmdline += 12; /* strlen("crashkernel=") */
+ ck_cmdline += strlen(name);
+ if (suffix)
+ return parse_crashkernel_suffix(ck_cmdline, crash_size,
+ crash_base, suffix);
/*
* if the commandline contains a ':', then that's the extended
* syntax -- if not, it must be the classic syntax
@@ -1409,6 +1492,36 @@ int __init parse_crashkernel(char *cmdline,
return 0;
}
+/*
+ * That function is the entry point for command line parsing and should be
+ * called from the arch-specific code.
+ */
+int __init parse_crashkernel(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+ "crashkernel=", NULL);
+}
+
+int __init parse_crashkernel_high(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+ "crashkernel=", suffix_tbl[SUFFIX_HIGH]);
+}
+
+int __init parse_crashkernel_low(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+ "crashkernel=", suffix_tbl[SUFFIX_LOW]);
+}
static void update_vmcoreinfo_note(void)
{
@@ -1490,6 +1603,8 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_OFFSET(page, _count);
VMCOREINFO_OFFSET(page, mapping);
VMCOREINFO_OFFSET(page, lru);
+ VMCOREINFO_OFFSET(page, _mapcount);
+ VMCOREINFO_OFFSET(page, private);
VMCOREINFO_OFFSET(pglist_data, node_zones);
VMCOREINFO_OFFSET(pglist_data, nr_zones);
#ifdef CONFIG_FLAT_NODE_MEM_MAP
@@ -1512,6 +1627,11 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_NUMBER(PG_lru);
VMCOREINFO_NUMBER(PG_private);
VMCOREINFO_NUMBER(PG_swapcache);
+ VMCOREINFO_NUMBER(PG_slab);
+#ifdef CONFIG_MEMORY_FAILURE
+ VMCOREINFO_NUMBER(PG_hwpoison);
+#endif
+ VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
arch_crash_save_vmcoreinfo();
update_vmcoreinfo_note();
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
deleted file mode 100644
index 59dcf5b..0000000
--- a/kernel/kfifo.c
+++ /dev/null
@@ -1,609 +0,0 @@
-/*
- * A generic kernel FIFO implementation
- *
- * Copyright (C) 2009/2010 Stefani Seibold <stefani@seibold.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- */
-
-#include <linux/kernel.h>
-#include <linux/export.h>
-#include <linux/slab.h>
-#include <linux/err.h>
-#include <linux/log2.h>
-#include <linux/uaccess.h>
-#include <linux/kfifo.h>
-
-/*
- * internal helper to calculate the unused elements in a fifo
- */
-static inline unsigned int kfifo_unused(struct __kfifo *fifo)
-{
- return (fifo->mask + 1) - (fifo->in - fifo->out);
-}
-
-int __kfifo_alloc(struct __kfifo *fifo, unsigned int size,
- size_t esize, gfp_t gfp_mask)
-{
- /*
- * round down to the next power of 2, since our 'let the indices
- * wrap' technique works only in this case.
- */
- if (!is_power_of_2(size))
- size = rounddown_pow_of_two(size);
-
- fifo->in = 0;
- fifo->out = 0;
- fifo->esize = esize;
-
- if (size < 2) {
- fifo->data = NULL;
- fifo->mask = 0;
- return -EINVAL;
- }
-
- fifo->data = kmalloc(size * esize, gfp_mask);
-
- if (!fifo->data) {
- fifo->mask = 0;
- return -ENOMEM;
- }
- fifo->mask = size - 1;
-
- return 0;
-}
-EXPORT_SYMBOL(__kfifo_alloc);
-
-void __kfifo_free(struct __kfifo *fifo)
-{
- kfree(fifo->data);
- fifo->in = 0;
- fifo->out = 0;
- fifo->esize = 0;
- fifo->data = NULL;
- fifo->mask = 0;
-}
-EXPORT_SYMBOL(__kfifo_free);
-
-int __kfifo_init(struct __kfifo *fifo, void *buffer,
- unsigned int size, size_t esize)
-{
- size /= esize;
-
- if (!is_power_of_2(size))
- size = rounddown_pow_of_two(size);
-
- fifo->in = 0;
- fifo->out = 0;
- fifo->esize = esize;
- fifo->data = buffer;
-
- if (size < 2) {
- fifo->mask = 0;
- return -EINVAL;
- }
- fifo->mask = size - 1;
-
- return 0;
-}
-EXPORT_SYMBOL(__kfifo_init);
-
-static void kfifo_copy_in(struct __kfifo *fifo, const void *src,
- unsigned int len, unsigned int off)
-{
- unsigned int size = fifo->mask + 1;
- unsigned int esize = fifo->esize;
- unsigned int l;
-
- off &= fifo->mask;
- if (esize != 1) {
- off *= esize;
- size *= esize;
- len *= esize;
- }
- l = min(len, size - off);
-
- memcpy(fifo->data + off, src, l);
- memcpy(fifo->data, src + l, len - l);
- /*
- * make sure that the data in the fifo is up to date before
- * incrementing the fifo->in index counter
- */
- smp_wmb();
-}
-
-unsigned int __kfifo_in(struct __kfifo *fifo,
- const void *buf, unsigned int len)
-{
- unsigned int l;
-
- l = kfifo_unused(fifo);
- if (len > l)
- len = l;
-
- kfifo_copy_in(fifo, buf, len, fifo->in);
- fifo->in += len;
- return len;
-}
-EXPORT_SYMBOL(__kfifo_in);
-
-static void kfifo_copy_out(struct __kfifo *fifo, void *dst,
- unsigned int len, unsigned int off)
-{
- unsigned int size = fifo->mask + 1;
- unsigned int esize = fifo->esize;
- unsigned int l;
-
- off &= fifo->mask;
- if (esize != 1) {
- off *= esize;
- size *= esize;
- len *= esize;
- }
- l = min(len, size - off);
-
- memcpy(dst, fifo->data + off, l);
- memcpy(dst + l, fifo->data, len - l);
- /*
- * make sure that the data is copied before
- * incrementing the fifo->out index counter
- */
- smp_wmb();
-}
-
-unsigned int __kfifo_out_peek(struct __kfifo *fifo,
- void *buf, unsigned int len)
-{
- unsigned int l;
-
- l = fifo->in - fifo->out;
- if (len > l)
- len = l;
-
- kfifo_copy_out(fifo, buf, len, fifo->out);
- return len;
-}
-EXPORT_SYMBOL(__kfifo_out_peek);
-
-unsigned int __kfifo_out(struct __kfifo *fifo,
- void *buf, unsigned int len)
-{
- len = __kfifo_out_peek(fifo, buf, len);
- fifo->out += len;
- return len;
-}
-EXPORT_SYMBOL(__kfifo_out);
-
-static unsigned long kfifo_copy_from_user(struct __kfifo *fifo,
- const void __user *from, unsigned int len, unsigned int off,
- unsigned int *copied)
-{
- unsigned int size = fifo->mask + 1;
- unsigned int esize = fifo->esize;
- unsigned int l;
- unsigned long ret;
-
- off &= fifo->mask;
- if (esize != 1) {
- off *= esize;
- size *= esize;
- len *= esize;
- }
- l = min(len, size - off);
-
- ret = copy_from_user(fifo->data + off, from, l);
- if (unlikely(ret))
- ret = DIV_ROUND_UP(ret + len - l, esize);
- else {
- ret = copy_from_user(fifo->data, from + l, len - l);
- if (unlikely(ret))
- ret = DIV_ROUND_UP(ret, esize);
- }
- /*
- * make sure that the data in the fifo is up to date before
- * incrementing the fifo->in index counter
- */
- smp_wmb();
- *copied = len - ret;
- /* return the number of elements which are not copied */
- return ret;
-}
-
-int __kfifo_from_user(struct __kfifo *fifo, const void __user *from,
- unsigned long len, unsigned int *copied)
-{
- unsigned int l;
- unsigned long ret;
- unsigned int esize = fifo->esize;
- int err;
-
- if (esize != 1)
- len /= esize;
-
- l = kfifo_unused(fifo);
- if (len > l)
- len = l;
-
- ret = kfifo_copy_from_user(fifo, from, len, fifo->in, copied);
- if (unlikely(ret)) {
- len -= ret;
- err = -EFAULT;
- } else
- err = 0;
- fifo->in += len;
- return err;
-}
-EXPORT_SYMBOL(__kfifo_from_user);
-
-static unsigned long kfifo_copy_to_user(struct __kfifo *fifo, void __user *to,
- unsigned int len, unsigned int off, unsigned int *copied)
-{
- unsigned int l;
- unsigned long ret;
- unsigned int size = fifo->mask + 1;
- unsigned int esize = fifo->esize;
-
- off &= fifo->mask;
- if (esize != 1) {
- off *= esize;
- size *= esize;
- len *= esize;
- }
- l = min(len, size - off);
-
- ret = copy_to_user(to, fifo->data + off, l);
- if (unlikely(ret))
- ret = DIV_ROUND_UP(ret + len - l, esize);
- else {
- ret = copy_to_user(to + l, fifo->data, len - l);
- if (unlikely(ret))
- ret = DIV_ROUND_UP(ret, esize);
- }
- /*
- * make sure that the data is copied before
- * incrementing the fifo->out index counter
- */
- smp_wmb();
- *copied = len - ret;
- /* return the number of elements which are not copied */
- return ret;
-}
-
-int __kfifo_to_user(struct __kfifo *fifo, void __user *to,
- unsigned long len, unsigned int *copied)
-{
- unsigned int l;
- unsigned long ret;
- unsigned int esize = fifo->esize;
- int err;
-
- if (esize != 1)
- len /= esize;
-
- l = fifo->in - fifo->out;
- if (len > l)
- len = l;
- ret = kfifo_copy_to_user(fifo, to, len, fifo->out, copied);
- if (unlikely(ret)) {
- len -= ret;
- err = -EFAULT;
- } else
- err = 0;
- fifo->out += len;
- return err;
-}
-EXPORT_SYMBOL(__kfifo_to_user);
-
-static int setup_sgl_buf(struct scatterlist *sgl, void *buf,
- int nents, unsigned int len)
-{
- int n;
- unsigned int l;
- unsigned int off;
- struct page *page;
-
- if (!nents)
- return 0;
-
- if (!len)
- return 0;
-
- n = 0;
- page = virt_to_page(buf);
- off = offset_in_page(buf);
- l = 0;
-
- while (len >= l + PAGE_SIZE - off) {
- struct page *npage;
-
- l += PAGE_SIZE;
- buf += PAGE_SIZE;
- npage = virt_to_page(buf);
- if (page_to_phys(page) != page_to_phys(npage) - l) {
- sg_set_page(sgl, page, l - off, off);
- sgl = sg_next(sgl);
- if (++n == nents || sgl == NULL)
- return n;
- page = npage;
- len -= l - off;
- l = off = 0;
- }
- }
- sg_set_page(sgl, page, len, off);
- return n + 1;
-}
-
-static unsigned int setup_sgl(struct __kfifo *fifo, struct scatterlist *sgl,
- int nents, unsigned int len, unsigned int off)
-{
- unsigned int size = fifo->mask + 1;
- unsigned int esize = fifo->esize;
- unsigned int l;
- unsigned int n;
-
- off &= fifo->mask;
- if (esize != 1) {
- off *= esize;
- size *= esize;
- len *= esize;
- }
- l = min(len, size - off);
-
- n = setup_sgl_buf(sgl, fifo->data + off, nents, l);
- n += setup_sgl_buf(sgl + n, fifo->data, nents - n, len - l);
-
- return n;
-}
-
-unsigned int __kfifo_dma_in_prepare(struct __kfifo *fifo,
- struct scatterlist *sgl, int nents, unsigned int len)
-{
- unsigned int l;
-
- l = kfifo_unused(fifo);
- if (len > l)
- len = l;
-
- return setup_sgl(fifo, sgl, nents, len, fifo->in);
-}
-EXPORT_SYMBOL(__kfifo_dma_in_prepare);
-
-unsigned int __kfifo_dma_out_prepare(struct __kfifo *fifo,
- struct scatterlist *sgl, int nents, unsigned int len)
-{
- unsigned int l;
-
- l = fifo->in - fifo->out;
- if (len > l)
- len = l;
-
- return setup_sgl(fifo, sgl, nents, len, fifo->out);
-}
-EXPORT_SYMBOL(__kfifo_dma_out_prepare);
-
-unsigned int __kfifo_max_r(unsigned int len, size_t recsize)
-{
- unsigned int max = (1 << (recsize << 3)) - 1;
-
- if (len > max)
- return max;
- return len;
-}
-EXPORT_SYMBOL(__kfifo_max_r);
-
-#define __KFIFO_PEEK(data, out, mask) \
- ((data)[(out) & (mask)])
-/*
- * __kfifo_peek_n internal helper function for determinate the length of
- * the next record in the fifo
- */
-static unsigned int __kfifo_peek_n(struct __kfifo *fifo, size_t recsize)
-{
- unsigned int l;
- unsigned int mask = fifo->mask;
- unsigned char *data = fifo->data;
-
- l = __KFIFO_PEEK(data, fifo->out, mask);
-
- if (--recsize)
- l |= __KFIFO_PEEK(data, fifo->out + 1, mask) << 8;
-
- return l;
-}
-
-#define __KFIFO_POKE(data, in, mask, val) \
- ( \
- (data)[(in) & (mask)] = (unsigned char)(val) \
- )
-
-/*
- * __kfifo_poke_n internal helper function for storeing the length of
- * the record into the fifo
- */
-static void __kfifo_poke_n(struct __kfifo *fifo, unsigned int n, size_t recsize)
-{
- unsigned int mask = fifo->mask;
- unsigned char *data = fifo->data;
-
- __KFIFO_POKE(data, fifo->in, mask, n);
-
- if (recsize > 1)
- __KFIFO_POKE(data, fifo->in + 1, mask, n >> 8);
-}
-
-unsigned int __kfifo_len_r(struct __kfifo *fifo, size_t recsize)
-{
- return __kfifo_peek_n(fifo, recsize);
-}
-EXPORT_SYMBOL(__kfifo_len_r);
-
-unsigned int __kfifo_in_r(struct __kfifo *fifo, const void *buf,
- unsigned int len, size_t recsize)
-{
- if (len + recsize > kfifo_unused(fifo))
- return 0;
-
- __kfifo_poke_n(fifo, len, recsize);
-
- kfifo_copy_in(fifo, buf, len, fifo->in + recsize);
- fifo->in += len + recsize;
- return len;
-}
-EXPORT_SYMBOL(__kfifo_in_r);
-
-static unsigned int kfifo_out_copy_r(struct __kfifo *fifo,
- void *buf, unsigned int len, size_t recsize, unsigned int *n)
-{
- *n = __kfifo_peek_n(fifo, recsize);
-
- if (len > *n)
- len = *n;
-
- kfifo_copy_out(fifo, buf, len, fifo->out + recsize);
- return len;
-}
-
-unsigned int __kfifo_out_peek_r(struct __kfifo *fifo, void *buf,
- unsigned int len, size_t recsize)
-{
- unsigned int n;
-
- if (fifo->in == fifo->out)
- return 0;
-
- return kfifo_out_copy_r(fifo, buf, len, recsize, &n);
-}
-EXPORT_SYMBOL(__kfifo_out_peek_r);
-
-unsigned int __kfifo_out_r(struct __kfifo *fifo, void *buf,
- unsigned int len, size_t recsize)
-{
- unsigned int n;
-
- if (fifo->in == fifo->out)
- return 0;
-
- len = kfifo_out_copy_r(fifo, buf, len, recsize, &n);
- fifo->out += n + recsize;
- return len;
-}
-EXPORT_SYMBOL(__kfifo_out_r);
-
-void __kfifo_skip_r(struct __kfifo *fifo, size_t recsize)
-{
- unsigned int n;
-
- n = __kfifo_peek_n(fifo, recsize);
- fifo->out += n + recsize;
-}
-EXPORT_SYMBOL(__kfifo_skip_r);
-
-int __kfifo_from_user_r(struct __kfifo *fifo, const void __user *from,
- unsigned long len, unsigned int *copied, size_t recsize)
-{
- unsigned long ret;
-
- len = __kfifo_max_r(len, recsize);
-
- if (len + recsize > kfifo_unused(fifo)) {
- *copied = 0;
- return 0;
- }
-
- __kfifo_poke_n(fifo, len, recsize);
-
- ret = kfifo_copy_from_user(fifo, from, len, fifo->in + recsize, copied);
- if (unlikely(ret)) {
- *copied = 0;
- return -EFAULT;
- }
- fifo->in += len + recsize;
- return 0;
-}
-EXPORT_SYMBOL(__kfifo_from_user_r);
-
-int __kfifo_to_user_r(struct __kfifo *fifo, void __user *to,
- unsigned long len, unsigned int *copied, size_t recsize)
-{
- unsigned long ret;
- unsigned int n;
-
- if (fifo->in == fifo->out) {
- *copied = 0;
- return 0;
- }
-
- n = __kfifo_peek_n(fifo, recsize);
- if (len > n)
- len = n;
-
- ret = kfifo_copy_to_user(fifo, to, len, fifo->out + recsize, copied);
- if (unlikely(ret)) {
- *copied = 0;
- return -EFAULT;
- }
- fifo->out += n + recsize;
- return 0;
-}
-EXPORT_SYMBOL(__kfifo_to_user_r);
-
-unsigned int __kfifo_dma_in_prepare_r(struct __kfifo *fifo,
- struct scatterlist *sgl, int nents, unsigned int len, size_t recsize)
-{
- if (!nents)
- BUG();
-
- len = __kfifo_max_r(len, recsize);
-
- if (len + recsize > kfifo_unused(fifo))
- return 0;
-
- return setup_sgl(fifo, sgl, nents, len, fifo->in + recsize);
-}
-EXPORT_SYMBOL(__kfifo_dma_in_prepare_r);
-
-void __kfifo_dma_in_finish_r(struct __kfifo *fifo,
- unsigned int len, size_t recsize)
-{
- len = __kfifo_max_r(len, recsize);
- __kfifo_poke_n(fifo, len, recsize);
- fifo->in += len + recsize;
-}
-EXPORT_SYMBOL(__kfifo_dma_in_finish_r);
-
-unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo,
- struct scatterlist *sgl, int nents, unsigned int len, size_t recsize)
-{
- if (!nents)
- BUG();
-
- len = __kfifo_max_r(len, recsize);
-
- if (len + recsize > fifo->in - fifo->out)
- return 0;
-
- return setup_sgl(fifo, sgl, nents, len, fifo->out + recsize);
-}
-EXPORT_SYMBOL(__kfifo_dma_out_prepare_r);
-
-void __kfifo_dma_out_finish_r(struct __kfifo *fifo, size_t recsize)
-{
- unsigned int len;
-
- len = __kfifo_peek_n(fifo, recsize);
- fifo->out += len + recsize;
-}
-EXPORT_SYMBOL(__kfifo_dma_out_finish_r);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 550294d..3fed7f0 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -334,11 +334,10 @@ static inline void reset_kprobe_instance(void)
struct kprobe __kprobes *get_kprobe(void *addr)
{
struct hlist_head *head;
- struct hlist_node *node;
struct kprobe *p;
head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)];
- hlist_for_each_entry_rcu(p, node, head, hlist) {
+ hlist_for_each_entry_rcu(p, head, hlist) {
if (p->addr == addr)
return p;
}
@@ -795,53 +794,58 @@ out:
}
#ifdef CONFIG_SYSCTL
-/* This should be called with kprobe_mutex locked */
static void __kprobes optimize_all_kprobes(void)
{
struct hlist_head *head;
- struct hlist_node *node;
struct kprobe *p;
unsigned int i;
+ mutex_lock(&kprobe_mutex);
/* If optimization is already allowed, just return */
if (kprobes_allow_optimization)
- return;
+ goto out;
kprobes_allow_optimization = true;
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
- hlist_for_each_entry_rcu(p, node, head, hlist)
+ hlist_for_each_entry_rcu(p, head, hlist)
if (!kprobe_disabled(p))
optimize_kprobe(p);
}
printk(KERN_INFO "Kprobes globally optimized\n");
+out:
+ mutex_unlock(&kprobe_mutex);
}
-/* This should be called with kprobe_mutex locked */
static void __kprobes unoptimize_all_kprobes(void)
{
struct hlist_head *head;
- struct hlist_node *node;
struct kprobe *p;
unsigned int i;
+ mutex_lock(&kprobe_mutex);
/* If optimization is already prohibited, just return */
- if (!kprobes_allow_optimization)
+ if (!kprobes_allow_optimization) {
+ mutex_unlock(&kprobe_mutex);
return;
+ }
kprobes_allow_optimization = false;
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
- hlist_for_each_entry_rcu(p, node, head, hlist) {
+ hlist_for_each_entry_rcu(p, head, hlist) {
if (!kprobe_disabled(p))
unoptimize_kprobe(p, false);
}
}
+ mutex_unlock(&kprobe_mutex);
+
/* Wait for unoptimizing completion */
wait_for_kprobe_optimizer();
printk(KERN_INFO "Kprobes globally unoptimized\n");
}
+static DEFINE_MUTEX(kprobe_sysctl_mutex);
int sysctl_kprobes_optimization;
int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length,
@@ -849,7 +853,7 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
{
int ret;
- mutex_lock(&kprobe_mutex);
+ mutex_lock(&kprobe_sysctl_mutex);
sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0;
ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
@@ -857,7 +861,7 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
optimize_all_kprobes();
else
unoptimize_all_kprobes();
- mutex_unlock(&kprobe_mutex);
+ mutex_unlock(&kprobe_sysctl_mutex);
return ret;
}
@@ -1148,7 +1152,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
{
struct kretprobe_instance *ri;
struct hlist_head *head, empty_rp;
- struct hlist_node *node, *tmp;
+ struct hlist_node *tmp;
unsigned long hash, flags = 0;
if (unlikely(!kprobes_initialized))
@@ -1159,12 +1163,12 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
hash = hash_ptr(tk, KPROBE_HASH_BITS);
head = &kretprobe_inst_table[hash];
kretprobe_table_lock(hash, &flags);
- hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
+ hlist_for_each_entry_safe(ri, tmp, head, hlist) {
if (ri->task == tk)
recycle_rp_inst(ri, &empty_rp);
}
kretprobe_table_unlock(hash, &flags);
- hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
+ hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
hlist_del(&ri->hlist);
kfree(ri);
}
@@ -1173,9 +1177,9 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
static inline void free_rp_inst(struct kretprobe *rp)
{
struct kretprobe_instance *ri;
- struct hlist_node *pos, *next;
+ struct hlist_node *next;
- hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, hlist) {
+ hlist_for_each_entry_safe(ri, next, &rp->free_instances, hlist) {
hlist_del(&ri->hlist);
kfree(ri);
}
@@ -1185,14 +1189,14 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
{
unsigned long flags, hash;
struct kretprobe_instance *ri;
- struct hlist_node *pos, *next;
+ struct hlist_node *next;
struct hlist_head *head;
/* No race here */
for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) {
kretprobe_table_lock(hash, &flags);
head = &kretprobe_inst_table[hash];
- hlist_for_each_entry_safe(ri, pos, next, head, hlist) {
+ hlist_for_each_entry_safe(ri, next, head, hlist) {
if (ri->rp == rp)
ri->rp = NULL;
}
@@ -2028,7 +2032,6 @@ static int __kprobes kprobes_module_callback(struct notifier_block *nb,
{
struct module *mod = data;
struct hlist_head *head;
- struct hlist_node *node;
struct kprobe *p;
unsigned int i;
int checkcore = (val == MODULE_STATE_GOING);
@@ -2045,7 +2048,7 @@ static int __kprobes kprobes_module_callback(struct notifier_block *nb,
mutex_lock(&kprobe_mutex);
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
- hlist_for_each_entry_rcu(p, node, head, hlist)
+ hlist_for_each_entry_rcu(p, head, hlist)
if (within_module_init((unsigned long)p->addr, mod) ||
(checkcore &&
within_module_core((unsigned long)p->addr, mod))) {
@@ -2192,7 +2195,6 @@ static void __kprobes kprobe_seq_stop(struct seq_file *f, void *v)
static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
{
struct hlist_head *head;
- struct hlist_node *node;
struct kprobe *p, *kp;
const char *sym = NULL;
unsigned int i = *(loff_t *) v;
@@ -2201,7 +2203,7 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
head = &kprobe_table[i];
preempt_disable();
- hlist_for_each_entry_rcu(p, node, head, hlist) {
+ hlist_for_each_entry_rcu(p, head, hlist) {
sym = kallsyms_lookup((unsigned long)p->addr, NULL,
&offset, &modname, namebuf);
if (kprobe_aggrprobe(p)) {
@@ -2236,7 +2238,6 @@ static const struct file_operations debugfs_kprobes_operations = {
static void __kprobes arm_all_kprobes(void)
{
struct hlist_head *head;
- struct hlist_node *node;
struct kprobe *p;
unsigned int i;
@@ -2249,7 +2250,7 @@ static void __kprobes arm_all_kprobes(void)
/* Arming kprobes doesn't optimize kprobe itself */
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
- hlist_for_each_entry_rcu(p, node, head, hlist)
+ hlist_for_each_entry_rcu(p, head, hlist)
if (!kprobe_disabled(p))
arm_kprobe(p);
}
@@ -2265,7 +2266,6 @@ already_enabled:
static void __kprobes disarm_all_kprobes(void)
{
struct hlist_head *head;
- struct hlist_node *node;
struct kprobe *p;
unsigned int i;
@@ -2282,7 +2282,7 @@ static void __kprobes disarm_all_kprobes(void)
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
- hlist_for_each_entry_rcu(p, node, head, hlist) {
+ hlist_for_each_entry_rcu(p, head, hlist) {
if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
disarm_kprobe(p, false);
}
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 691dc2e..9eb7fed 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -124,12 +124,12 @@ void *kthread_data(struct task_struct *task)
static void __kthread_parkme(struct kthread *self)
{
- __set_current_state(TASK_INTERRUPTIBLE);
+ __set_current_state(TASK_PARKED);
while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) {
if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags))
complete(&self->parked);
schedule();
- __set_current_state(TASK_INTERRUPTIBLE);
+ __set_current_state(TASK_PARKED);
}
clear_bit(KTHREAD_IS_PARKED, &self->flags);
__set_current_state(TASK_RUNNING);
@@ -256,8 +256,13 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
}
EXPORT_SYMBOL(kthread_create_on_node);
-static void __kthread_bind(struct task_struct *p, unsigned int cpu)
+static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
{
+ /* Must have done schedule() in kthread() before we set_task_cpu */
+ if (!wait_task_inactive(p, state)) {
+ WARN_ON(1);
+ return;
+ }
/* It's safe because the task is inactive. */
do_set_cpus_allowed(p, cpumask_of(cpu));
p->flags |= PF_THREAD_BOUND;
@@ -274,12 +279,7 @@ static void __kthread_bind(struct task_struct *p, unsigned int cpu)
*/
void kthread_bind(struct task_struct *p, unsigned int cpu)
{
- /* Must have done schedule() in kthread() before we set_task_cpu */
- if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
- WARN_ON(1);
- return;
- }
- __kthread_bind(p, cpu);
+ __kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(kthread_bind);
@@ -324,6 +324,22 @@ static struct kthread *task_get_live_kthread(struct task_struct *k)
return NULL;
}
+static void __kthread_unpark(struct task_struct *k, struct kthread *kthread)
+{
+ clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+ /*
+ * We clear the IS_PARKED bit here as we don't wait
+ * until the task has left the park code. So if we'd
+ * park before that happens we'd see the IS_PARKED bit
+ * which might be about to be cleared.
+ */
+ if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
+ if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
+ __kthread_bind(k, kthread->cpu, TASK_PARKED);
+ wake_up_state(k, TASK_PARKED);
+ }
+}
+
/**
* kthread_unpark - unpark a thread created by kthread_create().
* @k: thread created by kthread_create().
@@ -336,20 +352,8 @@ void kthread_unpark(struct task_struct *k)
{
struct kthread *kthread = task_get_live_kthread(k);
- if (kthread) {
- clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
- /*
- * We clear the IS_PARKED bit here as we don't wait
- * until the task has left the park code. So if we'd
- * park before that happens we'd see the IS_PARKED bit
- * which might be about to be cleared.
- */
- if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
- if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
- __kthread_bind(k, kthread->cpu);
- wake_up_process(k);
- }
- }
+ if (kthread)
+ __kthread_unpark(k, kthread);
put_task_struct(k);
}
@@ -407,7 +411,7 @@ int kthread_stop(struct task_struct *k)
trace_sched_kthread_stop(k);
if (kthread) {
set_bit(KTHREAD_SHOULD_STOP, &kthread->flags);
- clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+ __kthread_unpark(k, kthread);
wake_up_process(k);
wait_for_completion(&kthread->exited);
}
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 7981e5b..6a3bccb 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -380,6 +380,13 @@ static int verbose(struct lock_class *class)
unsigned long nr_stack_trace_entries;
static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES];
+static void print_lockdep_off(const char *bug_msg)
+{
+ printk(KERN_DEBUG "%s\n", bug_msg);
+ printk(KERN_DEBUG "turning off the locking correctness validator.\n");
+ printk(KERN_DEBUG "Please attach the output of /proc/lock_stat to the bug report\n");
+}
+
static int save_trace(struct stack_trace *trace)
{
trace->nr_entries = 0;
@@ -409,8 +416,7 @@ static int save_trace(struct stack_trace *trace)
if (!debug_locks_off_graph_unlock())
return 0;
- printk("BUG: MAX_STACK_TRACE_ENTRIES too low!\n");
- printk("turning off the locking correctness validator.\n");
+ print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!");
dump_stack();
return 0;
@@ -763,8 +769,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
}
raw_local_irq_restore(flags);
- printk("BUG: MAX_LOCKDEP_KEYS too low!\n");
- printk("turning off the locking correctness validator.\n");
+ print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!");
dump_stack();
return NULL;
}
@@ -834,8 +839,7 @@ static struct lock_list *alloc_list_entry(void)
if (!debug_locks_off_graph_unlock())
return NULL;
- printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n");
- printk("turning off the locking correctness validator.\n");
+ print_lockdep_off("BUG: MAX_LOCKDEP_ENTRIES too low!");
dump_stack();
return NULL;
}
@@ -2000,7 +2004,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
struct lock_class *class = hlock_class(hlock);
struct list_head *hash_head = chainhashentry(chain_key);
struct lock_chain *chain;
- struct held_lock *hlock_curr, *hlock_next;
+ struct held_lock *hlock_curr;
int i, j;
/*
@@ -2048,8 +2052,7 @@ cache_hit:
if (!debug_locks_off_graph_unlock())
return 0;
- printk("BUG: MAX_LOCKDEP_CHAINS too low!\n");
- printk("turning off the locking correctness validator.\n");
+ print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!");
dump_stack();
return 0;
}
@@ -2057,12 +2060,10 @@ cache_hit:
chain->chain_key = chain_key;
chain->irq_context = hlock->irq_context;
/* Find the first held_lock of current chain */
- hlock_next = hlock;
for (i = curr->lockdep_depth - 1; i >= 0; i--) {
hlock_curr = curr->held_locks + i;
- if (hlock_curr->irq_context != hlock_next->irq_context)
+ if (hlock_curr->irq_context != hlock->irq_context)
break;
- hlock_next = hlock;
}
i++;
chain->depth = curr->lockdep_depth + 1 - i;
@@ -3190,9 +3191,14 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
#endif
if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) {
debug_locks_off();
- printk("BUG: MAX_LOCK_DEPTH too low!\n");
- printk("turning off the locking correctness validator.\n");
+ print_lockdep_off("BUG: MAX_LOCK_DEPTH too low!");
+ printk(KERN_DEBUG "depth: %i max: %lu!\n",
+ curr->lockdep_depth, MAX_LOCK_DEPTH);
+
+ lockdep_print_held_locks(current);
+ debug_show_all_locks();
dump_stack();
+
return 0;
}
@@ -3203,7 +3209,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
}
static int
-print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
+print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
unsigned long ip)
{
if (!debug_locks_off())
@@ -3246,7 +3252,7 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
return 0;
if (curr->lockdep_depth <= 0)
- return print_unlock_inbalance_bug(curr, lock, ip);
+ return print_unlock_imbalance_bug(curr, lock, ip);
return 1;
}
@@ -3317,7 +3323,7 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
goto found_it;
prev_hlock = hlock;
}
- return print_unlock_inbalance_bug(curr, lock, ip);
+ return print_unlock_imbalance_bug(curr, lock, ip);
found_it:
lockdep_init_map(lock, name, key, 0);
@@ -3384,7 +3390,7 @@ lock_release_non_nested(struct task_struct *curr,
goto found_it;
prev_hlock = hlock;
}
- return print_unlock_inbalance_bug(curr, lock, ip);
+ return print_unlock_imbalance_bug(curr, lock, ip);
found_it:
if (hlock->instance == lock)
diff --git a/kernel/module.c b/kernel/module.c
index eab0827..0925c9a 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -197,9 +197,10 @@ static inline int strong_try_module_get(struct module *mod)
return -ENOENT;
}
-static inline void add_taint_module(struct module *mod, unsigned flag)
+static inline void add_taint_module(struct module *mod, unsigned flag,
+ enum lockdep_ok lockdep_ok)
{
- add_taint(flag);
+ add_taint(flag, lockdep_ok);
mod->taints |= (1U << flag);
}
@@ -727,7 +728,7 @@ static inline int try_force_unload(unsigned int flags)
{
int ret = (flags & O_TRUNC);
if (ret)
- add_taint(TAINT_FORCED_RMMOD);
+ add_taint(TAINT_FORCED_RMMOD, LOCKDEP_NOW_UNRELIABLE);
return ret;
}
#else
@@ -1138,7 +1139,7 @@ static int try_to_force_load(struct module *mod, const char *reason)
if (!test_taint(TAINT_FORCED_MODULE))
printk(KERN_WARNING "%s: %s: kernel tainted.\n",
mod->name, reason);
- add_taint_module(mod, TAINT_FORCED_MODULE);
+ add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE);
return 0;
#else
return -ENOEXEC;
@@ -2147,7 +2148,8 @@ static void set_license(struct module *mod, const char *license)
if (!test_taint(TAINT_PROPRIETARY_MODULE))
printk(KERN_WARNING "%s: module license '%s' taints "
"kernel.\n", mod->name, license);
- add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
+ add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
+ LOCKDEP_NOW_UNRELIABLE);
}
}
@@ -2539,7 +2541,7 @@ static int copy_module_from_fd(int fd, struct load_info *info)
if (err)
goto out;
- err = vfs_getattr(file->f_vfsmnt, file->f_dentry, &stat);
+ err = vfs_getattr(&file->f_path, &stat);
if (err)
goto out;
@@ -2700,10 +2702,10 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
}
if (!get_modinfo(info, "intree"))
- add_taint_module(mod, TAINT_OOT_MODULE);
+ add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
if (get_modinfo(info, "staging")) {
- add_taint_module(mod, TAINT_CRAP);
+ add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
printk(KERN_WARNING "%s: module is from the staging directory,"
" the quality is unknown, you have been warned.\n",
mod->name);
@@ -2869,15 +2871,17 @@ static int check_module_license_and_versions(struct module *mod)
* using GPL-only symbols it needs.
*/
if (strcmp(mod->name, "ndiswrapper") == 0)
- add_taint(TAINT_PROPRIETARY_MODULE);
+ add_taint(TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE);
/* driverloader was caught wrongly pretending to be under GPL */
if (strcmp(mod->name, "driverloader") == 0)
- add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
+ add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
+ LOCKDEP_NOW_UNRELIABLE);
/* lve claims to be GPL but upstream won't provide source */
if (strcmp(mod->name, "lve") == 0)
- add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
+ add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
+ LOCKDEP_NOW_UNRELIABLE);
#ifdef CONFIG_MODVERSIONS
if ((mod->num_syms && !mod->crcs)
@@ -3141,12 +3145,72 @@ static int may_init_module(void)
return 0;
}
+/*
+ * We try to place it in the list now to make sure it's unique before
+ * we dedicate too many resources. In particular, temporary percpu
+ * memory exhaustion.
+ */
+static int add_unformed_module(struct module *mod)
+{
+ int err;
+ struct module *old;
+
+ mod->state = MODULE_STATE_UNFORMED;
+
+again:
+ mutex_lock(&module_mutex);
+ if ((old = find_module_all(mod->name, true)) != NULL) {
+ if (old->state == MODULE_STATE_COMING
+ || old->state == MODULE_STATE_UNFORMED) {
+ /* Wait in case it fails to load. */
+ mutex_unlock(&module_mutex);
+ err = wait_event_interruptible(module_wq,
+ finished_loading(mod->name));
+ if (err)
+ goto out_unlocked;
+ goto again;
+ }
+ err = -EEXIST;
+ goto out;
+ }
+ list_add_rcu(&mod->list, &modules);
+ err = 0;
+
+out:
+ mutex_unlock(&module_mutex);
+out_unlocked:
+ return err;
+}
+
+static int complete_formation(struct module *mod, struct load_info *info)
+{
+ int err;
+
+ mutex_lock(&module_mutex);
+
+ /* Find duplicate symbols (must be called under lock). */
+ err = verify_export_symbols(mod);
+ if (err < 0)
+ goto out;
+
+ /* This relies on module_mutex for list integrity. */
+ module_bug_finalize(info->hdr, info->sechdrs, mod);
+
+ /* Mark state as coming so strong_try_module_get() ignores us,
+ * but kallsyms etc. can see us. */
+ mod->state = MODULE_STATE_COMING;
+
+out:
+ mutex_unlock(&module_mutex);
+ return err;
+}
+
/* Allocate and load the module: note that size of section 0 is always
zero, and we rely on this for optional sections. */
static int load_module(struct load_info *info, const char __user *uargs,
int flags)
{
- struct module *mod, *old;
+ struct module *mod;
long err;
err = module_sig_check(info);
@@ -3164,36 +3228,20 @@ static int load_module(struct load_info *info, const char __user *uargs,
goto free_copy;
}
- /*
- * We try to place it in the list now to make sure it's unique
- * before we dedicate too many resources. In particular,
- * temporary percpu memory exhaustion.
- */
- mod->state = MODULE_STATE_UNFORMED;
-again:
- mutex_lock(&module_mutex);
- if ((old = find_module_all(mod->name, true)) != NULL) {
- if (old->state == MODULE_STATE_COMING
- || old->state == MODULE_STATE_UNFORMED) {
- /* Wait in case it fails to load. */
- mutex_unlock(&module_mutex);
- err = wait_event_interruptible(module_wq,
- finished_loading(mod->name));
- if (err)
- goto free_module;
- goto again;
- }
- err = -EEXIST;
- mutex_unlock(&module_mutex);
+ /* Reserve our place in the list. */
+ err = add_unformed_module(mod);
+ if (err)
goto free_module;
- }
- list_add_rcu(&mod->list, &modules);
- mutex_unlock(&module_mutex);
#ifdef CONFIG_MODULE_SIG
mod->sig_ok = info->sig_ok;
- if (!mod->sig_ok)
- add_taint_module(mod, TAINT_FORCED_MODULE);
+ if (!mod->sig_ok) {
+ printk_once(KERN_NOTICE
+ "%s: module verification failed: signature and/or"
+ " required key missing - tainting kernel\n",
+ mod->name);
+ add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_STILL_OK);
+ }
#endif
/* Now module is in final location, initialize linked lists, etc. */
@@ -3236,21 +3284,11 @@ again:
dynamic_debug_setup(info->debug, info->num_debug);
- mutex_lock(&module_mutex);
- /* Find duplicate symbols (must be called under lock). */
- err = verify_export_symbols(mod);
- if (err < 0)
+ /* Finally it's fully formed, ready to start executing. */
+ err = complete_formation(mod, info);
+ if (err)
goto ddebug_cleanup;
- /* This relies on module_mutex for list integrity. */
- module_bug_finalize(info->hdr, info->sechdrs, mod);
-
- /* Mark state as coming so strong_try_module_get() ignores us,
- * but kallsyms etc. can see us. */
- mod->state = MODULE_STATE_COMING;
-
- mutex_unlock(&module_mutex);
-
/* Module is ready to execute: parsing args may do that. */
err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
-32768, 32767, &ddebug_dyndbg_module_param_cb);
@@ -3274,8 +3312,8 @@ again:
/* module_bug_cleanup needs module_mutex protection */
mutex_lock(&module_mutex);
module_bug_cleanup(mod);
- ddebug_cleanup:
mutex_unlock(&module_mutex);
+ ddebug_cleanup:
dynamic_debug_remove(info->debug);
synchronize_sched();
kfree(mod->args);
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 52f2301..ad53a66 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -37,6 +37,12 @@
# include <asm/mutex.h>
#endif
+/*
+ * A negative mutex count indicates that waiters are sleeping waiting for the
+ * mutex.
+ */
+#define MUTEX_SHOW_NO_WAITER(mutex) (atomic_read(&(mutex)->count) >= 0)
+
void
__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
{
@@ -44,6 +50,9 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
spin_lock_init(&lock->wait_lock);
INIT_LIST_HEAD(&lock->wait_list);
mutex_clear_owner(lock);
+#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
+ lock->spin_mlock = NULL;
+#endif
debug_mutex_init(lock, name, key);
}
@@ -95,6 +104,124 @@ void __sched mutex_lock(struct mutex *lock)
EXPORT_SYMBOL(mutex_lock);
#endif
+#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
+/*
+ * In order to avoid a stampede of mutex spinners from acquiring the mutex
+ * more or less simultaneously, the spinners need to acquire a MCS lock
+ * first before spinning on the owner field.
+ *
+ * We don't inline mspin_lock() so that perf can correctly account for the
+ * time spent in this lock function.
+ */
+struct mspin_node {
+ struct mspin_node *next ;
+ int locked; /* 1 if lock acquired */
+};
+#define MLOCK(mutex) ((struct mspin_node **)&((mutex)->spin_mlock))
+
+static noinline
+void mspin_lock(struct mspin_node **lock, struct mspin_node *node)
+{
+ struct mspin_node *prev;
+
+ /* Init node */
+ node->locked = 0;
+ node->next = NULL;
+
+ prev = xchg(lock, node);
+ if (likely(prev == NULL)) {
+ /* Lock acquired */
+ node->locked = 1;
+ return;
+ }
+ ACCESS_ONCE(prev->next) = node;
+ smp_wmb();
+ /* Wait until the lock holder passes the lock down */
+ while (!ACCESS_ONCE(node->locked))
+ arch_mutex_cpu_relax();
+}
+
+static void mspin_unlock(struct mspin_node **lock, struct mspin_node *node)
+{
+ struct mspin_node *next = ACCESS_ONCE(node->next);
+
+ if (likely(!next)) {
+ /*
+ * Release the lock by setting it to NULL
+ */
+ if (cmpxchg(lock, node, NULL) == node)
+ return;
+ /* Wait until the next pointer is set */
+ while (!(next = ACCESS_ONCE(node->next)))
+ arch_mutex_cpu_relax();
+ }
+ ACCESS_ONCE(next->locked) = 1;
+ smp_wmb();
+}
+
+/*
+ * Mutex spinning code migrated from kernel/sched/core.c
+ */
+
+static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
+{
+ if (lock->owner != owner)
+ return false;
+
+ /*
+ * Ensure we emit the owner->on_cpu, dereference _after_ checking
+ * lock->owner still matches owner, if that fails, owner might
+ * point to free()d memory, if it still matches, the rcu_read_lock()
+ * ensures the memory stays valid.
+ */
+ barrier();
+
+ return owner->on_cpu;
+}
+
+/*
+ * Look out! "owner" is an entirely speculative pointer
+ * access and not reliable.
+ */
+static noinline
+int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
+{
+ rcu_read_lock();
+ while (owner_running(lock, owner)) {
+ if (need_resched())
+ break;
+
+ arch_mutex_cpu_relax();
+ }
+ rcu_read_unlock();
+
+ /*
+ * We break out the loop above on need_resched() and when the
+ * owner changed, which is a sign for heavy contention. Return
+ * success only when lock->owner is NULL.
+ */
+ return lock->owner == NULL;
+}
+
+/*
+ * Initial check for entering the mutex spinning loop
+ */
+static inline int mutex_can_spin_on_owner(struct mutex *lock)
+{
+ int retval = 1;
+
+ rcu_read_lock();
+ if (lock->owner)
+ retval = lock->owner->on_cpu;
+ rcu_read_unlock();
+ /*
+ * if lock->owner is not set, the mutex owner may have just acquired
+ * it and not set the owner yet or the mutex has been released.
+ */
+ return retval;
+}
+#endif
+
static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
/**
@@ -158,25 +285,39 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
*
* We can't do this for DEBUG_MUTEXES because that relies on wait_lock
* to serialize everything.
+ *
+ * The mutex spinners are queued up using MCS lock so that only one
+ * spinner can compete for the mutex. However, if mutex spinning isn't
+ * going to happen, there is no point in going through the lock/unlock
+ * overhead.
*/
+ if (!mutex_can_spin_on_owner(lock))
+ goto slowpath;
for (;;) {
struct task_struct *owner;
+ struct mspin_node node;
/*
* If there's an owner, wait for it to either
* release the lock or go to sleep.
*/
+ mspin_lock(MLOCK(lock), &node);
owner = ACCESS_ONCE(lock->owner);
- if (owner && !mutex_spin_on_owner(lock, owner))
+ if (owner && !mutex_spin_on_owner(lock, owner)) {
+ mspin_unlock(MLOCK(lock), &node);
break;
+ }
- if (atomic_cmpxchg(&lock->count, 1, 0) == 1) {
+ if ((atomic_read(&lock->count) == 1) &&
+ (atomic_cmpxchg(&lock->count, 1, 0) == 1)) {
lock_acquired(&lock->dep_map, ip);
mutex_set_owner(lock);
+ mspin_unlock(MLOCK(lock), &node);
preempt_enable();
return 0;
}
+ mspin_unlock(MLOCK(lock), &node);
/*
* When there's no owner, we might have preempted between the
@@ -195,6 +336,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
*/
arch_mutex_cpu_relax();
}
+slowpath:
#endif
spin_lock_mutex(&lock->wait_lock, flags);
@@ -205,7 +347,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
list_add_tail(&waiter.list, &lock->wait_list);
waiter.task = task;
- if (atomic_xchg(&lock->count, -1) == 1)
+ if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, -1) == 1))
goto done;
lock_contended(&lock->dep_map, ip);
@@ -220,7 +362,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
* that when we release the lock, we properly wake up the
* other waiters:
*/
- if (atomic_xchg(&lock->count, -1) == 1)
+ if (MUTEX_SHOW_NO_WAITER(lock) &&
+ (atomic_xchg(&lock->count, -1) == 1))
break;
/*
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 78e2ecb..afc0456 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -153,8 +153,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
goto out;
}
- new_ns = create_new_namespaces(flags, tsk,
- task_cred_xxx(tsk, user_ns), tsk->fs);
+ new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs);
if (IS_ERR(new_ns)) {
err = PTR_ERR(new_ns);
goto out;
@@ -251,7 +250,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
return PTR_ERR(file);
err = -EINVAL;
- ei = PROC_I(file->f_dentry->d_inode);
+ ei = PROC_I(file_inode(file));
ops = ei->ns_ops;
if (nstype && (ops->type != nstype))
goto out;
diff --git a/kernel/panic.c b/kernel/panic.c
index e1b2822..7c57cc9 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -259,26 +259,19 @@ unsigned long get_taint(void)
return tainted_mask;
}
-void add_taint(unsigned flag)
+/**
+ * add_taint: add a taint flag if not already set.
+ * @flag: one of the TAINT_* constants.
+ * @lockdep_ok: whether lock debugging is still OK.
+ *
+ * If something bad has gone wrong, you'll want @lockdebug_ok = false, but for
+ * some notewortht-but-not-corrupting cases, it can be set to true.
+ */
+void add_taint(unsigned flag, enum lockdep_ok lockdep_ok)
{
- /*
- * Can't trust the integrity of the kernel anymore.
- * We don't call directly debug_locks_off() because the issue
- * is not necessarily serious enough to set oops_in_progress to 1
- * Also we want to keep up lockdep for staging/out-of-tree
- * development and post-warning case.
- */
- switch (flag) {
- case TAINT_CRAP:
- case TAINT_OOT_MODULE:
- case TAINT_WARN:
- case TAINT_FIRMWARE_WORKAROUND:
- break;
-
- default:
- if (__debug_locks_off())
- printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n");
- }
+ if (lockdep_ok == LOCKDEP_NOW_UNRELIABLE && __debug_locks_off())
+ printk(KERN_WARNING
+ "Disabling lock debugging due to kernel taint\n");
set_bit(flag, &tainted_mask);
}
@@ -421,7 +414,8 @@ static void warn_slowpath_common(const char *file, int line, void *caller,
print_modules();
dump_stack();
print_oops_end_marker();
- add_taint(taint);
+ /* Just a warning, don't kill lockdep. */
+ add_taint(taint, LOCKDEP_STILL_OK);
}
void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...)
diff --git a/kernel/pid.c b/kernel/pid.c
index f2c6a68..047dc62 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -350,10 +350,9 @@ void disable_pid_allocation(struct pid_namespace *ns)
struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
{
- struct hlist_node *elem;
struct upid *pnr;
- hlist_for_each_entry_rcu(pnr, elem,
+ hlist_for_each_entry_rcu(pnr,
&pid_hash[pid_hashfn(nr, ns)], pid_chain)
if (pnr->nr == nr && pnr->ns == ns)
return container_of(pnr, struct pid,
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index c1c3dc1..bea15bd 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -181,6 +181,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
int nr;
int rc;
struct task_struct *task, *me = current;
+ int init_pids = thread_group_leader(me) ? 1 : 2;
/* Don't allow any more processes into the pid namespace */
disable_pid_allocation(pid_ns);
@@ -230,7 +231,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
*/
for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE);
- if (pid_ns->nr_hashed == 1)
+ if (pid_ns->nr_hashed == init_pids)
break;
schedule();
}
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 10349d5..6edbb2c 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -552,24 +552,22 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
return -EAGAIN;
spin_lock_init(&new_timer->it_lock);
- retry:
- if (unlikely(!idr_pre_get(&posix_timers_id, GFP_KERNEL))) {
- error = -EAGAIN;
- goto out;
- }
+
+ idr_preload(GFP_KERNEL);
spin_lock_irq(&idr_lock);
- error = idr_get_new(&posix_timers_id, new_timer, &new_timer_id);
+ error = idr_alloc(&posix_timers_id, new_timer, 0, 0, GFP_NOWAIT);
spin_unlock_irq(&idr_lock);
- if (error) {
- if (error == -EAGAIN)
- goto retry;
+ idr_preload_end();
+ if (error < 0) {
/*
* Weird looking, but we return EAGAIN if the IDR is
* full (proper POSIX return value for this)
*/
- error = -EAGAIN;
+ if (error == -ENOSPC)
+ error = -EAGAIN;
goto out;
}
+ new_timer_id = error;
it_id_set = IT_ID_SET;
new_timer->it_id = (timer_t) new_timer_id;
@@ -639,6 +637,13 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
{
struct k_itimer *timr;
+ /*
+ * timer_t could be any type >= int and we want to make sure any
+ * @timer_id outside positive int range fails lookup.
+ */
+ if ((unsigned long long)timer_id > INT_MAX)
+ return NULL;
+
rcu_read_lock();
timr = idr_find(&posix_timers_id, (int)timer_id);
if (timr) {
diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c
index ca304046..c6422ff 100644
--- a/kernel/power/autosleep.c
+++ b/kernel/power/autosleep.c
@@ -66,7 +66,7 @@ static DECLARE_WORK(suspend_work, try_to_suspend);
void queue_up_suspend_work(void)
{
- if (!work_pending(&suspend_work) && autosleep_state > PM_SUSPEND_ON)
+ if (autosleep_state > PM_SUSPEND_ON)
queue_work(autosleep_wq, &suspend_work);
}
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 1c16f91..d77663b 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -313,7 +313,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
static suspend_state_t decode_state(const char *buf, size_t n)
{
#ifdef CONFIG_SUSPEND
- suspend_state_t state = PM_SUSPEND_STANDBY;
+ suspend_state_t state = PM_SUSPEND_MIN;
const char * const *s;
#endif
char *p;
@@ -553,6 +553,30 @@ power_attr(pm_trace_dev_match);
#endif /* CONFIG_PM_TRACE */
+#ifdef CONFIG_FREEZER
+static ssize_t pm_freeze_timeout_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%u\n", freeze_timeout_msecs);
+}
+
+static ssize_t pm_freeze_timeout_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ unsigned long val;
+
+ if (kstrtoul(buf, 10, &val))
+ return -EINVAL;
+
+ freeze_timeout_msecs = val;
+ return n;
+}
+
+power_attr(pm_freeze_timeout);
+
+#endif /* CONFIG_FREEZER*/
+
static struct attribute * g[] = {
&state_attr.attr,
#ifdef CONFIG_PM_TRACE
@@ -576,6 +600,9 @@ static struct attribute * g[] = {
&pm_print_times_attr.attr,
#endif
#endif
+#ifdef CONFIG_FREEZER
+ &pm_freeze_timeout_attr.attr,
+#endif
NULL,
};
diff --git a/kernel/power/process.c b/kernel/power/process.c
index d5a258b..98088e0 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -21,7 +21,7 @@
/*
* Timeout for stopping processes
*/
-#define TIMEOUT (20 * HZ)
+unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC;
static int try_to_freeze_tasks(bool user_only)
{
@@ -36,7 +36,7 @@ static int try_to_freeze_tasks(bool user_only)
do_gettimeofday(&start);
- end_time = jiffies + TIMEOUT;
+ end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs);
if (!user_only)
freeze_workqueues_begin();
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 9322ff7..587ddde 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -359,8 +359,7 @@ void pm_qos_update_request(struct pm_qos_request *req,
return;
}
- if (delayed_work_pending(&req->work))
- cancel_delayed_work_sync(&req->work);
+ cancel_delayed_work_sync(&req->work);
if (new_value != req->node.prio)
pm_qos_update_target(
@@ -386,8 +385,7 @@ void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value,
"%s called for unknown object.", __func__))
return;
- if (delayed_work_pending(&req->work))
- cancel_delayed_work_sync(&req->work);
+ cancel_delayed_work_sync(&req->work);
if (new_value != req->node.prio)
pm_qos_update_target(
@@ -416,8 +414,7 @@ void pm_qos_remove_request(struct pm_qos_request *req)
return;
}
- if (delayed_work_pending(&req->work))
- cancel_delayed_work_sync(&req->work);
+ cancel_delayed_work_sync(&req->work);
pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints,
&req->node, PM_QOS_REMOVE_REQ,
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index c8b7446..d4feda0 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -30,12 +30,38 @@
#include "power.h"
const char *const pm_states[PM_SUSPEND_MAX] = {
+ [PM_SUSPEND_FREEZE] = "freeze",
[PM_SUSPEND_STANDBY] = "standby",
[PM_SUSPEND_MEM] = "mem",
};
static const struct platform_suspend_ops *suspend_ops;
+static bool need_suspend_ops(suspend_state_t state)
+{
+ return !!(state > PM_SUSPEND_FREEZE);
+}
+
+static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
+static bool suspend_freeze_wake;
+
+static void freeze_begin(void)
+{
+ suspend_freeze_wake = false;
+}
+
+static void freeze_enter(void)
+{
+ wait_event(suspend_freeze_wait_head, suspend_freeze_wake);
+}
+
+void freeze_wake(void)
+{
+ suspend_freeze_wake = true;
+ wake_up(&suspend_freeze_wait_head);
+}
+EXPORT_SYMBOL_GPL(freeze_wake);
+
/**
* suspend_set_ops - Set the global suspend method table.
* @ops: Suspend operations to use.
@@ -50,8 +76,11 @@ EXPORT_SYMBOL_GPL(suspend_set_ops);
bool valid_state(suspend_state_t state)
{
+ if (state == PM_SUSPEND_FREEZE)
+ return true;
/*
- * All states need lowlevel support and need to be valid to the lowlevel
+ * PM_SUSPEND_STANDBY and PM_SUSPEND_MEMORY states need lowlevel
+ * support and need to be valid to the lowlevel
* implementation, no valid callback implies that none are valid.
*/
return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
@@ -89,11 +118,11 @@ static int suspend_test(int level)
* hibernation). Run suspend notifiers, allocate the "suspend" console and
* freeze processes.
*/
-static int suspend_prepare(void)
+static int suspend_prepare(suspend_state_t state)
{
int error;
- if (!suspend_ops || !suspend_ops->enter)
+ if (need_suspend_ops(state) && (!suspend_ops || !suspend_ops->enter))
return -EPERM;
pm_prepare_console();
@@ -137,7 +166,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
{
int error;
- if (suspend_ops->prepare) {
+ if (need_suspend_ops(state) && suspend_ops->prepare) {
error = suspend_ops->prepare();
if (error)
goto Platform_finish;
@@ -149,12 +178,23 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
goto Platform_finish;
}
- if (suspend_ops->prepare_late) {
+ if (need_suspend_ops(state) && suspend_ops->prepare_late) {
error = suspend_ops->prepare_late();
if (error)
goto Platform_wake;
}
+ /*
+ * PM_SUSPEND_FREEZE equals
+ * frozen processes + suspended devices + idle processors.
+ * Thus we should invoke freeze_enter() soon after
+ * all the devices are suspended.
+ */
+ if (state == PM_SUSPEND_FREEZE) {
+ freeze_enter();
+ goto Platform_wake;
+ }
+
if (suspend_test(TEST_PLATFORM))
goto Platform_wake;
@@ -182,13 +222,13 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
enable_nonboot_cpus();
Platform_wake:
- if (suspend_ops->wake)
+ if (need_suspend_ops(state) && suspend_ops->wake)
suspend_ops->wake();
dpm_resume_start(PMSG_RESUME);
Platform_finish:
- if (suspend_ops->finish)
+ if (need_suspend_ops(state) && suspend_ops->finish)
suspend_ops->finish();
return error;
@@ -203,11 +243,11 @@ int suspend_devices_and_enter(suspend_state_t state)
int error;
bool wakeup = false;
- if (!suspend_ops)
+ if (need_suspend_ops(state) && !suspend_ops)
return -ENOSYS;
trace_machine_suspend(state);
- if (suspend_ops->begin) {
+ if (need_suspend_ops(state) && suspend_ops->begin) {
error = suspend_ops->begin(state);
if (error)
goto Close;
@@ -226,7 +266,7 @@ int suspend_devices_and_enter(suspend_state_t state)
do {
error = suspend_enter(state, &wakeup);
- } while (!error && !wakeup
+ } while (!error && !wakeup && need_suspend_ops(state)
&& suspend_ops->suspend_again && suspend_ops->suspend_again());
Resume_devices:
@@ -236,13 +276,13 @@ int suspend_devices_and_enter(suspend_state_t state)
ftrace_start();
resume_console();
Close:
- if (suspend_ops->end)
+ if (need_suspend_ops(state) && suspend_ops->end)
suspend_ops->end();
trace_machine_suspend(PWR_EVENT_EXIT);
return error;
Recover_platform:
- if (suspend_ops->recover)
+ if (need_suspend_ops(state) && suspend_ops->recover)
suspend_ops->recover();
goto Resume_devices;
}
@@ -278,12 +318,15 @@ static int enter_state(suspend_state_t state)
if (!mutex_trylock(&pm_mutex))
return -EBUSY;
+ if (state == PM_SUSPEND_FREEZE)
+ freeze_begin();
+
printk(KERN_INFO "PM: Syncing filesystems ... ");
sys_sync();
printk("done.\n");
pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
- error = suspend_prepare();
+ error = suspend_prepare(state);
if (error)
goto Unlock;
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 25596e4..9b2a1d5 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -112,7 +112,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
rtc_set_alarm(rtc, &alm);
}
-static int __init has_wakealarm(struct device *dev, void *name_ptr)
+static int __init has_wakealarm(struct device *dev, const void *data)
{
struct rtc_device *candidate = to_rtc_device(dev);
@@ -121,7 +121,6 @@ static int __init has_wakealarm(struct device *dev, void *name_ptr)
if (!device_may_wakeup(candidate->dev.parent))
return 0;
- *(const char **)name_ptr = dev_name(dev);
return 1;
}
@@ -159,8 +158,8 @@ static int __init test_suspend(void)
static char warn_no_rtc[] __initdata =
KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n";
- char *pony = NULL;
struct rtc_device *rtc = NULL;
+ struct device *dev;
/* PM is initialized by now; is that state testable? */
if (test_state == PM_SUSPEND_ON)
@@ -171,9 +170,9 @@ static int __init test_suspend(void)
}
/* RTCs have initialized by now too ... can we use one? */
- class_find_device(rtc_class, NULL, &pony, has_wakealarm);
- if (pony)
- rtc = rtc_class_open(pony);
+ dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm);
+ if (dev)
+ rtc = rtc_class_open(dev_name(dev));
if (!rtc) {
printk(warn_no_rtc);
goto done;
diff --git a/kernel/printk.c b/kernel/printk.c
index f24633a..abbdd9e 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -63,8 +63,6 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */
#define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */
-DECLARE_WAIT_QUEUE_HEAD(log_wait);
-
int console_printk[4] = {
DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */
DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */
@@ -88,6 +86,12 @@ static DEFINE_SEMAPHORE(console_sem);
struct console *console_drivers;
EXPORT_SYMBOL_GPL(console_drivers);
+#ifdef CONFIG_LOCKDEP
+static struct lockdep_map console_lock_dep_map = {
+ .name = "console_lock"
+};
+#endif
+
/*
* This is used for debugging the mess that is the VT code by
* keeping track if we have the console semaphore held. It's
@@ -218,6 +222,7 @@ struct log {
static DEFINE_RAW_SPINLOCK(logbuf_lock);
#ifdef CONFIG_PRINTK
+DECLARE_WAIT_QUEUE_HEAD(log_wait);
/* the next printk record to read by syslog(READ) or /proc/kmsg */
static u64 syslog_seq;
static u32 syslog_idx;
@@ -1919,6 +1924,7 @@ void console_lock(void)
return;
console_locked = 1;
console_may_schedule = 1;
+ mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);
}
EXPORT_SYMBOL(console_lock);
@@ -1940,6 +1946,7 @@ int console_trylock(void)
}
console_locked = 1;
console_may_schedule = 0;
+ mutex_acquire(&console_lock_dep_map, 0, 1, _RET_IP_);
return 1;
}
EXPORT_SYMBOL(console_trylock);
@@ -1949,45 +1956,6 @@ int is_console_locked(void)
return console_locked;
}
-/*
- * Delayed printk version, for scheduler-internal messages:
- */
-#define PRINTK_BUF_SIZE 512
-
-#define PRINTK_PENDING_WAKEUP 0x01
-#define PRINTK_PENDING_SCHED 0x02
-
-static DEFINE_PER_CPU(int, printk_pending);
-static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf);
-
-static void wake_up_klogd_work_func(struct irq_work *irq_work)
-{
- int pending = __this_cpu_xchg(printk_pending, 0);
-
- if (pending & PRINTK_PENDING_SCHED) {
- char *buf = __get_cpu_var(printk_sched_buf);
- printk(KERN_WARNING "[sched_delayed] %s", buf);
- }
-
- if (pending & PRINTK_PENDING_WAKEUP)
- wake_up_interruptible(&log_wait);
-}
-
-static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
- .func = wake_up_klogd_work_func,
- .flags = IRQ_WORK_LAZY,
-};
-
-void wake_up_klogd(void)
-{
- preempt_disable();
- if (waitqueue_active(&log_wait)) {
- this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
- irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
- }
- preempt_enable();
-}
-
static void console_cont_flush(char *text, size_t size)
{
unsigned long flags;
@@ -2102,6 +2070,7 @@ skip:
local_irq_restore(flags);
}
console_locked = 0;
+ mutex_release(&console_lock_dep_map, 1, _RET_IP_);
/* Release the exclusive_console once it is used */
if (unlikely(exclusive_console))
@@ -2449,6 +2418,44 @@ static int __init printk_late_init(void)
late_initcall(printk_late_init);
#if defined CONFIG_PRINTK
+/*
+ * Delayed printk version, for scheduler-internal messages:
+ */
+#define PRINTK_BUF_SIZE 512
+
+#define PRINTK_PENDING_WAKEUP 0x01
+#define PRINTK_PENDING_SCHED 0x02
+
+static DEFINE_PER_CPU(int, printk_pending);
+static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf);
+
+static void wake_up_klogd_work_func(struct irq_work *irq_work)
+{
+ int pending = __this_cpu_xchg(printk_pending, 0);
+
+ if (pending & PRINTK_PENDING_SCHED) {
+ char *buf = __get_cpu_var(printk_sched_buf);
+ printk(KERN_WARNING "[sched_delayed] %s", buf);
+ }
+
+ if (pending & PRINTK_PENDING_WAKEUP)
+ wake_up_interruptible(&log_wait);
+}
+
+static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
+ .func = wake_up_klogd_work_func,
+ .flags = IRQ_WORK_LAZY,
+};
+
+void wake_up_klogd(void)
+{
+ preempt_disable();
+ if (waitqueue_active(&log_wait)) {
+ this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
+ irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
+ }
+ preempt_enable();
+}
int printk_sched(const char *fmt, ...)
{
diff --git a/kernel/relay.c b/kernel/relay.c
index e8cd202..01ab081 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1139,7 +1139,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
if (!desc->count)
return 0;
- mutex_lock(&filp->f_path.dentry->d_inode->i_mutex);
+ mutex_lock(&file_inode(filp)->i_mutex);
do {
if (!relay_file_read_avail(buf, *ppos))
break;
@@ -1159,7 +1159,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
*ppos = relay_file_read_end_pos(buf, read_start, ret);
}
} while (desc->count && ret);
- mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex);
+ mutex_unlock(&file_inode(filp)->i_mutex);
return desc->written;
}
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 7890b10..1d96dd0 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -14,6 +14,7 @@
#include <linux/spinlock.h>
#include <linux/timer.h>
#include <linux/freezer.h>
+#include <linux/stat.h>
#include "rtmutex.h"
@@ -366,8 +367,8 @@ static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *at
return curr - buf;
}
-static DEVICE_ATTR(status, 0600, sysfs_test_status, NULL);
-static DEVICE_ATTR(command, 0600, NULL, sysfs_test_command);
+static DEVICE_ATTR(status, S_IRUSR, sysfs_test_status, NULL);
+static DEVICE_ATTR(command, S_IWUSR, NULL, sysfs_test_command);
static struct bus_type rttest_subsys = {
.name = "rttest",
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 0984a21..64de5f8 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -35,6 +35,7 @@ static inline void autogroup_destroy(struct kref *kref)
ag->tg->rt_se = NULL;
ag->tg->rt_rq = NULL;
#endif
+ sched_offline_group(ag->tg);
sched_destroy_group(ag->tg);
}
@@ -76,6 +77,8 @@ static inline struct autogroup *autogroup_create(void)
if (IS_ERR(tg))
goto out_free;
+ sched_online_group(tg, &root_task_group);
+
kref_init(&ag->kref);
init_rwsem(&ag->lock);
ag->id = atomic_inc_return(&autogroup_seq_nr);
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index c685e31..c3ae144 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -176,10 +176,36 @@ static u64 sched_clock_remote(struct sched_clock_data *scd)
u64 this_clock, remote_clock;
u64 *ptr, old_val, val;
+#if BITS_PER_LONG != 64
+again:
+ /*
+ * Careful here: The local and the remote clock values need to
+ * be read out atomic as we need to compare the values and
+ * then update either the local or the remote side. So the
+ * cmpxchg64 below only protects one readout.
+ *
+ * We must reread via sched_clock_local() in the retry case on
+ * 32bit as an NMI could use sched_clock_local() via the
+ * tracer and hit between the readout of
+ * the low32bit and the high 32bit portion.
+ */
+ this_clock = sched_clock_local(my_scd);
+ /*
+ * We must enforce atomic readout on 32bit, otherwise the
+ * update on the remote cpu can hit inbetween the readout of
+ * the low32bit and the high 32bit portion.
+ */
+ remote_clock = cmpxchg64(&scd->clock, 0, 0);
+#else
+ /*
+ * On 64bit the read of [my]scd->clock is atomic versus the
+ * update, so we can avoid the above 32bit dance.
+ */
sched_clock_local(my_scd);
again:
this_clock = my_scd->clock;
remote_clock = scd->clock;
+#endif
/*
* Use the opportunity that we have both locks
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 03d7784..4205354 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1132,18 +1132,28 @@ EXPORT_SYMBOL_GPL(kick_process);
*/
static int select_fallback_rq(int cpu, struct task_struct *p)
{
- const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
+ int nid = cpu_to_node(cpu);
+ const struct cpumask *nodemask = NULL;
enum { cpuset, possible, fail } state = cpuset;
int dest_cpu;
- /* Look for allowed, online CPU in same node. */
- for_each_cpu(dest_cpu, nodemask) {
- if (!cpu_online(dest_cpu))
- continue;
- if (!cpu_active(dest_cpu))
- continue;
- if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
- return dest_cpu;
+ /*
+ * If the node that the cpu is on has been offlined, cpu_to_node()
+ * will return -1. There is no cpu on the node, and we should
+ * select the cpu on the other node.
+ */
+ if (nid != -1) {
+ nodemask = cpumask_of_node(nid);
+
+ /* Look for allowed, online CPU in same node. */
+ for_each_cpu(dest_cpu, nodemask) {
+ if (!cpu_online(dest_cpu))
+ continue;
+ if (!cpu_active(dest_cpu))
+ continue;
+ if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
+ return dest_cpu;
+ }
}
for (;;) {
@@ -1488,8 +1498,10 @@ static void try_to_wake_up_local(struct task_struct *p)
{
struct rq *rq = task_rq(p);
- BUG_ON(rq != this_rq());
- BUG_ON(p == current);
+ if (WARN_ON_ONCE(rq != this_rq()) ||
+ WARN_ON_ONCE(p == current))
+ return;
+
lockdep_assert_held(&rq->lock);
if (!raw_spin_trylock(&p->pi_lock)) {
@@ -1742,9 +1754,8 @@ EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
struct preempt_notifier *notifier;
- struct hlist_node *node;
- hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
notifier->ops->sched_in(notifier, raw_smp_processor_id());
}
@@ -1753,9 +1764,8 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
struct task_struct *next)
{
struct preempt_notifier *notifier;
- struct hlist_node *node;
- hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
notifier->ops->sched_out(notifier, next);
}
@@ -1969,11 +1979,10 @@ context_switch(struct rq *rq, struct task_struct *prev,
}
/*
- * nr_running, nr_uninterruptible and nr_context_switches:
+ * nr_running and nr_context_switches:
*
* externally visible scheduler statistics: current number of runnable
- * threads, current number of uninterruptible-sleeping threads, total
- * number of context switches performed since bootup.
+ * threads, total number of context switches performed since bootup.
*/
unsigned long nr_running(void)
{
@@ -1985,23 +1994,6 @@ unsigned long nr_running(void)
return sum;
}
-unsigned long nr_uninterruptible(void)
-{
- unsigned long i, sum = 0;
-
- for_each_possible_cpu(i)
- sum += cpu_rq(i)->nr_uninterruptible;
-
- /*
- * Since we read the counters lockless, it might be slightly
- * inaccurate. Do not allow it to go below zero though:
- */
- if (unlikely((long)sum < 0))
- sum = 0;
-
- return sum;
-}
-
unsigned long long nr_context_switches(void)
{
int i;
@@ -2786,7 +2778,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
if (irqs_disabled())
print_irqtrace_events(prev);
dump_stack();
- add_taint(TAINT_WARN);
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
/*
@@ -3007,51 +2999,6 @@ void __sched schedule_preempt_disabled(void)
preempt_disable();
}
-#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-
-static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
-{
- if (lock->owner != owner)
- return false;
-
- /*
- * Ensure we emit the owner->on_cpu, dereference _after_ checking
- * lock->owner still matches owner, if that fails, owner might
- * point to free()d memory, if it still matches, the rcu_read_lock()
- * ensures the memory stays valid.
- */
- barrier();
-
- return owner->on_cpu;
-}
-
-/*
- * Look out! "owner" is an entirely speculative pointer
- * access and not reliable.
- */
-int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
-{
- if (!sched_feat(OWNER_SPIN))
- return 0;
-
- rcu_read_lock();
- while (owner_running(lock, owner)) {
- if (need_resched())
- break;
-
- arch_mutex_cpu_relax();
- }
- rcu_read_unlock();
-
- /*
- * We break out the loop above on need_resched() and when the
- * owner changed, which is a sign for heavy contention. Return
- * success only when lock->owner is NULL.
- */
- return lock->owner == NULL;
-}
-#endif
-
#ifdef CONFIG_PREEMPT
/*
* this is the entry point to schedule() from in-kernel preemption
@@ -3268,7 +3215,8 @@ void complete_all(struct completion *x)
EXPORT_SYMBOL(complete_all);
static inline long __sched
-do_wait_for_common(struct completion *x, long timeout, int state)
+do_wait_for_common(struct completion *x,
+ long (*action)(long), long timeout, int state)
{
if (!x->done) {
DECLARE_WAITQUEUE(wait, current);
@@ -3281,7 +3229,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
}
__set_current_state(state);
spin_unlock_irq(&x->wait.lock);
- timeout = schedule_timeout(timeout);
+ timeout = action(timeout);
spin_lock_irq(&x->wait.lock);
} while (!x->done && timeout);
__remove_wait_queue(&x->wait, &wait);
@@ -3292,17 +3240,30 @@ do_wait_for_common(struct completion *x, long timeout, int state)
return timeout ?: 1;
}
-static long __sched
-wait_for_common(struct completion *x, long timeout, int state)
+static inline long __sched
+__wait_for_common(struct completion *x,
+ long (*action)(long), long timeout, int state)
{
might_sleep();
spin_lock_irq(&x->wait.lock);
- timeout = do_wait_for_common(x, timeout, state);
+ timeout = do_wait_for_common(x, action, timeout, state);
spin_unlock_irq(&x->wait.lock);
return timeout;
}
+static long __sched
+wait_for_common(struct completion *x, long timeout, int state)
+{
+ return __wait_for_common(x, schedule_timeout, timeout, state);
+}
+
+static long __sched
+wait_for_common_io(struct completion *x, long timeout, int state)
+{
+ return __wait_for_common(x, io_schedule_timeout, timeout, state);
+}
+
/**
* wait_for_completion: - waits for completion of a task
* @x: holds the state of this particular completion
@@ -3339,6 +3300,39 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout)
EXPORT_SYMBOL(wait_for_completion_timeout);
/**
+ * wait_for_completion_io: - waits for completion of a task
+ * @x: holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It is NOT
+ * interruptible and there is no timeout. The caller is accounted as waiting
+ * for IO.
+ */
+void __sched wait_for_completion_io(struct completion *x)
+{
+ wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_io);
+
+/**
+ * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout)
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. The timeout is in jiffies. It is not
+ * interruptible. The caller is accounted as waiting for IO.
+ *
+ * The return value is 0 if timed out, and positive (at least 1, or number of
+ * jiffies left till timeout) if completed.
+ */
+unsigned long __sched
+wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
+{
+ return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_io_timeout);
+
+/**
* wait_for_completion_interruptible: - waits for completion of a task (w/intr)
* @x: holds the state of this particular completion
*
@@ -4364,7 +4358,10 @@ EXPORT_SYMBOL(yield);
* It's the caller's job to ensure that the target task struct
* can't go away on us before we can do any checks.
*
- * Returns true if we indeed boosted the target task.
+ * Returns:
+ * true (>0) if we indeed boosted the target task.
+ * false (0) if we failed to boost the target.
+ * -ESRCH if there's no task to yield to.
*/
bool __sched yield_to(struct task_struct *p, bool preempt)
{
@@ -4378,6 +4375,15 @@ bool __sched yield_to(struct task_struct *p, bool preempt)
again:
p_rq = task_rq(p);
+ /*
+ * If we're the only runnable task on the rq and target rq also
+ * has only one task, there's absolutely no point in yielding.
+ */
+ if (rq->nr_running == 1 && p_rq->nr_running == 1) {
+ yielded = -ESRCH;
+ goto out_irq;
+ }
+
double_rq_lock(rq, p_rq);
while (task_rq(p) != p_rq) {
double_rq_unlock(rq, p_rq);
@@ -4385,13 +4391,13 @@ again:
}
if (!curr->sched_class->yield_to_task)
- goto out;
+ goto out_unlock;
if (curr->sched_class != p->sched_class)
- goto out;
+ goto out_unlock;
if (task_running(p_rq, p) || p->state)
- goto out;
+ goto out_unlock;
yielded = curr->sched_class->yield_to_task(rq, p, preempt);
if (yielded) {
@@ -4404,11 +4410,12 @@ again:
resched_task(p_rq->curr);
}
-out:
+out_unlock:
double_rq_unlock(rq, p_rq);
+out_irq:
local_irq_restore(flags);
- if (yielded)
+ if (yielded > 0)
schedule();
return yielded;
@@ -4949,7 +4956,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
}
static int min_load_idx = 0;
-static int max_load_idx = CPU_LOAD_IDX_MAX;
+static int max_load_idx = CPU_LOAD_IDX_MAX-1;
static void
set_table_entry(struct ctl_table *entry,
@@ -7161,7 +7168,6 @@ static void free_sched_group(struct task_group *tg)
struct task_group *sched_create_group(struct task_group *parent)
{
struct task_group *tg;
- unsigned long flags;
tg = kzalloc(sizeof(*tg), GFP_KERNEL);
if (!tg)
@@ -7173,6 +7179,17 @@ struct task_group *sched_create_group(struct task_group *parent)
if (!alloc_rt_sched_group(tg, parent))
goto err;
+ return tg;
+
+err:
+ free_sched_group(tg);
+ return ERR_PTR(-ENOMEM);
+}
+
+void sched_online_group(struct task_group *tg, struct task_group *parent)
+{
+ unsigned long flags;
+
spin_lock_irqsave(&task_group_lock, flags);
list_add_rcu(&tg->list, &task_groups);
@@ -7182,12 +7199,6 @@ struct task_group *sched_create_group(struct task_group *parent)
INIT_LIST_HEAD(&tg->children);
list_add_rcu(&tg->siblings, &parent->children);
spin_unlock_irqrestore(&task_group_lock, flags);
-
- return tg;
-
-err:
- free_sched_group(tg);
- return ERR_PTR(-ENOMEM);
}
/* rcu callback to free various structures associated with a task group */
@@ -7200,6 +7211,12 @@ static void free_sched_group_rcu(struct rcu_head *rhp)
/* Destroy runqueue etc associated with a task group */
void sched_destroy_group(struct task_group *tg)
{
+ /* wait for possible concurrent references to cfs_rqs complete */
+ call_rcu(&tg->rcu, free_sched_group_rcu);
+}
+
+void sched_offline_group(struct task_group *tg)
+{
unsigned long flags;
int i;
@@ -7211,9 +7228,6 @@ void sched_destroy_group(struct task_group *tg)
list_del_rcu(&tg->list);
list_del_rcu(&tg->siblings);
spin_unlock_irqrestore(&task_group_lock, flags);
-
- /* wait for possible concurrent references to cfs_rqs complete */
- call_rcu(&tg->rcu, free_sched_group_rcu);
}
/* change task's runqueue when it moves between groups.
@@ -7584,6 +7598,19 @@ static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
return &tg->css;
}
+static int cpu_cgroup_css_online(struct cgroup *cgrp)
+{
+ struct task_group *tg = cgroup_tg(cgrp);
+ struct task_group *parent;
+
+ if (!cgrp->parent)
+ return 0;
+
+ parent = cgroup_tg(cgrp->parent);
+ sched_online_group(tg, parent);
+ return 0;
+}
+
static void cpu_cgroup_css_free(struct cgroup *cgrp)
{
struct task_group *tg = cgroup_tg(cgrp);
@@ -7591,6 +7618,13 @@ static void cpu_cgroup_css_free(struct cgroup *cgrp)
sched_destroy_group(tg);
}
+static void cpu_cgroup_css_offline(struct cgroup *cgrp)
+{
+ struct task_group *tg = cgroup_tg(cgrp);
+
+ sched_offline_group(tg);
+}
+
static int cpu_cgroup_can_attach(struct cgroup *cgrp,
struct cgroup_taskset *tset)
{
@@ -7946,6 +7980,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
.name = "cpu",
.css_alloc = cpu_cgroup_css_alloc,
.css_free = cpu_cgroup_css_free,
+ .css_online = cpu_cgroup_css_online,
+ .css_offline = cpu_cgroup_css_offline,
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
.exit = cpu_cgroup_exit,
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 9857329..e93cca9 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -310,7 +310,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
t = tsk;
do {
- task_cputime(tsk, &utime, &stime);
+ task_cputime(t, &utime, &stime);
times->utime += utime;
times->stime += stime;
times->sum_exec_runtime += task_sched_runtime(t);
@@ -604,7 +604,7 @@ static unsigned long long vtime_delta(struct task_struct *tsk)
{
unsigned long long clock;
- clock = sched_clock();
+ clock = local_clock();
if (clock < tsk->vtime_snap)
return 0;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 7ae4c4c..75024a6 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -110,13 +110,6 @@ static char *task_group_path(struct task_group *tg)
if (autogroup_path(tg, group_path, PATH_MAX))
return group_path;
- /*
- * May be NULL if the underlying cgroup isn't fully-created yet
- */
- if (!tg->css.cgroup) {
- group_path[0] = '\0';
- return group_path;
- }
cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
return group_path;
}
@@ -269,11 +262,11 @@ static void print_cpu(struct seq_file *m, int cpu)
{
unsigned int freq = cpu_khz ? : 1;
- SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n",
+ SEQ_printf(m, "cpu#%d, %u.%03u MHz\n",
cpu, freq / 1000, (freq % 1000));
}
#else
- SEQ_printf(m, "\ncpu#%d\n", cpu);
+ SEQ_printf(m, "cpu#%d\n", cpu);
#endif
#define P(x) \
@@ -330,6 +323,7 @@ do { \
print_rq(m, rq, cpu);
rcu_read_unlock();
spin_unlock_irqrestore(&sched_debug_lock, flags);
+ SEQ_printf(m, "\n");
}
static const char *sched_tunable_scaling_names[] = {
@@ -338,11 +332,10 @@ static const char *sched_tunable_scaling_names[] = {
"linear"
};
-static int sched_debug_show(struct seq_file *m, void *v)
+static void sched_debug_header(struct seq_file *m)
{
u64 ktime, sched_clk, cpu_clk;
unsigned long flags;
- int cpu;
local_irq_save(flags);
ktime = ktime_to_ns(ktime_get());
@@ -384,33 +377,101 @@ static int sched_debug_show(struct seq_file *m, void *v)
#undef PN
#undef P
- SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling",
+ SEQ_printf(m, " .%-40s: %d (%s)\n",
+ "sysctl_sched_tunable_scaling",
sysctl_sched_tunable_scaling,
sched_tunable_scaling_names[sysctl_sched_tunable_scaling]);
+ SEQ_printf(m, "\n");
+}
- for_each_online_cpu(cpu)
- print_cpu(m, cpu);
+static int sched_debug_show(struct seq_file *m, void *v)
+{
+ int cpu = (unsigned long)(v - 2);
- SEQ_printf(m, "\n");
+ if (cpu != -1)
+ print_cpu(m, cpu);
+ else
+ sched_debug_header(m);
return 0;
}
void sysrq_sched_debug_show(void)
{
- sched_debug_show(NULL, NULL);
+ int cpu;
+
+ sched_debug_header(NULL);
+ for_each_online_cpu(cpu)
+ print_cpu(NULL, cpu);
+
+}
+
+/*
+ * This itererator needs some explanation.
+ * It returns 1 for the header position.
+ * This means 2 is cpu 0.
+ * In a hotplugged system some cpus, including cpu 0, may be missing so we have
+ * to use cpumask_* to iterate over the cpus.
+ */
+static void *sched_debug_start(struct seq_file *file, loff_t *offset)
+{
+ unsigned long n = *offset;
+
+ if (n == 0)
+ return (void *) 1;
+
+ n--;
+
+ if (n > 0)
+ n = cpumask_next(n - 1, cpu_online_mask);
+ else
+ n = cpumask_first(cpu_online_mask);
+
+ *offset = n + 1;
+
+ if (n < nr_cpu_ids)
+ return (void *)(unsigned long)(n + 2);
+ return NULL;
+}
+
+static void *sched_debug_next(struct seq_file *file, void *data, loff_t *offset)
+{
+ (*offset)++;
+ return sched_debug_start(file, offset);
+}
+
+static void sched_debug_stop(struct seq_file *file, void *data)
+{
+}
+
+static const struct seq_operations sched_debug_sops = {
+ .start = sched_debug_start,
+ .next = sched_debug_next,
+ .stop = sched_debug_stop,
+ .show = sched_debug_show,
+};
+
+static int sched_debug_release(struct inode *inode, struct file *file)
+{
+ seq_release(inode, file);
+
+ return 0;
}
static int sched_debug_open(struct inode *inode, struct file *filp)
{
- return single_open(filp, sched_debug_show, NULL);
+ int ret = 0;
+
+ ret = seq_open(filp, &sched_debug_sops);
+
+ return ret;
}
static const struct file_operations sched_debug_fops = {
.open = sched_debug_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .release = sched_debug_release,
};
static int __init init_sched_debug_procfs(void)
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 1ad1d2b..99399f8 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -46,13 +46,6 @@ SCHED_FEAT(DOUBLE_TICK, false)
SCHED_FEAT(LB_BIAS, true)
/*
- * Spin-wait on mutex acquisition when the mutex owner is running on
- * another cpu -- assumes that when the owner is running, it will soon
- * release the lock. Decreases scheduling overhead.
- */
-SCHED_FEAT(OWNER_SPIN, true)
-
-/*
* Decrement CPU power based on time not spent running tasks
*/
SCHED_FEAT(NONTASK_POWER, true)
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 903ffa9e..e036eda 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -21,14 +21,17 @@ static int show_schedstat(struct seq_file *seq, void *v)
if (mask_str == NULL)
return -ENOMEM;
- seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
- seq_printf(seq, "timestamp %lu\n", jiffies);
- for_each_online_cpu(cpu) {
- struct rq *rq = cpu_rq(cpu);
+ if (v == (void *)1) {
+ seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
+ seq_printf(seq, "timestamp %lu\n", jiffies);
+ } else {
+ struct rq *rq;
#ifdef CONFIG_SMP
struct sched_domain *sd;
int dcount = 0;
#endif
+ cpu = (unsigned long)(v - 2);
+ rq = cpu_rq(cpu);
/* runqueue-specific stats */
seq_printf(seq,
@@ -77,30 +80,66 @@ static int show_schedstat(struct seq_file *seq, void *v)
return 0;
}
-static int schedstat_open(struct inode *inode, struct file *file)
+/*
+ * This itererator needs some explanation.
+ * It returns 1 for the header position.
+ * This means 2 is cpu 0.
+ * In a hotplugged system some cpus, including cpu 0, may be missing so we have
+ * to use cpumask_* to iterate over the cpus.
+ */
+static void *schedstat_start(struct seq_file *file, loff_t *offset)
{
- unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
- char *buf = kmalloc(size, GFP_KERNEL);
- struct seq_file *m;
- int res;
+ unsigned long n = *offset;
- if (!buf)
- return -ENOMEM;
- res = single_open(file, show_schedstat, NULL);
- if (!res) {
- m = file->private_data;
- m->buf = buf;
- m->size = size;
- } else
- kfree(buf);
- return res;
+ if (n == 0)
+ return (void *) 1;
+
+ n--;
+
+ if (n > 0)
+ n = cpumask_next(n - 1, cpu_online_mask);
+ else
+ n = cpumask_first(cpu_online_mask);
+
+ *offset = n + 1;
+
+ if (n < nr_cpu_ids)
+ return (void *)(unsigned long)(n + 2);
+ return NULL;
+}
+
+static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset)
+{
+ (*offset)++;
+ return schedstat_start(file, offset);
+}
+
+static void schedstat_stop(struct seq_file *file, void *data)
+{
+}
+
+static const struct seq_operations schedstat_sops = {
+ .start = schedstat_start,
+ .next = schedstat_next,
+ .stop = schedstat_stop,
+ .show = show_schedstat,
+};
+
+static int schedstat_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &schedstat_sops);
}
+static int schedstat_release(struct inode *inode, struct file *file)
+{
+ return 0;
+};
+
static const struct file_operations proc_schedstat_operations = {
.open = schedstat_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .release = schedstat_release,
};
static int __init proc_schedstat_init(void)
diff --git a/kernel/signal.c b/kernel/signal.c
index 7f82adb..598dc06 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -485,6 +485,9 @@ flush_signal_handlers(struct task_struct *t, int force_default)
if (force_default || ka->sa.sa_handler != SIG_IGN)
ka->sa.sa_handler = SIG_DFL;
ka->sa.sa_flags = 0;
+#ifdef __ARCH_HAS_SA_RESTORER
+ ka->sa.sa_restorer = NULL;
+#endif
sigemptyset(&ka->sa.sa_mask);
ka++;
}
@@ -1157,11 +1160,11 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
static void print_fatal_signal(int signr)
{
struct pt_regs *regs = signal_pt_regs();
- printk("%s/%d: potentially unexpected fatal signal %d.\n",
+ printk(KERN_INFO "%s/%d: potentially unexpected fatal signal %d.\n",
current->comm, task_pid_nr(current), signr);
#if defined(__i386__) && !defined(__arch_um__)
- printk("code at %08lx: ", regs->ip);
+ printk(KERN_INFO "code at %08lx: ", regs->ip);
{
int i;
for (i = 0; i < 16; i++) {
@@ -1169,11 +1172,11 @@ static void print_fatal_signal(int signr)
if (get_user(insn, (unsigned char *)(regs->ip + i)))
break;
- printk("%02x ", insn);
+ printk(KERN_CONT "%02x ", insn);
}
}
+ printk(KERN_CONT "\n");
#endif
- printk("\n");
preempt_disable();
show_regs(regs);
preempt_enable();
@@ -2399,6 +2402,15 @@ void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka,
tracehook_signal_handler(sig, info, ka, regs, stepping);
}
+void signal_setup_done(int failed, struct ksignal *ksig, int stepping)
+{
+ if (failed)
+ force_sigsegv(ksig->sig, current);
+ else
+ signal_delivered(ksig->sig, &ksig->info, &ksig->ka,
+ signal_pt_regs(), stepping);
+}
+
/*
* It could be that complete_signal() picked us to notify about the
* group-wide signal. Other threads should be notified now to take
@@ -2616,40 +2628,95 @@ SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset,
return 0;
}
-long do_sigpending(void __user *set, unsigned long sigsetsize)
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset,
+ compat_sigset_t __user *, oset, compat_size_t, sigsetsize)
{
- long error = -EINVAL;
- sigset_t pending;
+#ifdef __BIG_ENDIAN
+ sigset_t old_set = current->blocked;
+
+ /* XXX: Don't preclude handling different sized sigset_t's. */
+ if (sigsetsize != sizeof(sigset_t))
+ return -EINVAL;
+
+ if (nset) {
+ compat_sigset_t new32;
+ sigset_t new_set;
+ int error;
+ if (copy_from_user(&new32, nset, sizeof(compat_sigset_t)))
+ return -EFAULT;
+
+ sigset_from_compat(&new_set, &new32);
+ sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
+
+ error = sigprocmask(how, &new_set, NULL);
+ if (error)
+ return error;
+ }
+ if (oset) {
+ compat_sigset_t old32;
+ sigset_to_compat(&old32, &old_set);
+ if (copy_to_user(oset, &old32, sizeof(compat_sigset_t)))
+ return -EFAULT;
+ }
+ return 0;
+#else
+ return sys_rt_sigprocmask(how, (sigset_t __user *)nset,
+ (sigset_t __user *)oset, sigsetsize);
+#endif
+}
+#endif
+static int do_sigpending(void *set, unsigned long sigsetsize)
+{
if (sigsetsize > sizeof(sigset_t))
- goto out;
+ return -EINVAL;
spin_lock_irq(&current->sighand->siglock);
- sigorsets(&pending, &current->pending.signal,
+ sigorsets(set, &current->pending.signal,
&current->signal->shared_pending.signal);
spin_unlock_irq(&current->sighand->siglock);
/* Outside the lock because only this thread touches it. */
- sigandsets(&pending, &current->blocked, &pending);
-
- error = -EFAULT;
- if (!copy_to_user(set, &pending, sigsetsize))
- error = 0;
-
-out:
- return error;
+ sigandsets(set, &current->blocked, set);
+ return 0;
}
/**
* sys_rt_sigpending - examine a pending signal that has been raised
* while blocked
- * @set: stores pending signals
+ * @uset: stores pending signals
* @sigsetsize: size of sigset_t type or larger
*/
-SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize)
+SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize)
{
- return do_sigpending(set, sigsetsize);
+ sigset_t set;
+ int err = do_sigpending(&set, sigsetsize);
+ if (!err && copy_to_user(uset, &set, sigsetsize))
+ err = -EFAULT;
+ return err;
+}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset,
+ compat_size_t, sigsetsize)
+{
+#ifdef __BIG_ENDIAN
+ sigset_t set;
+ int err = do_sigpending(&set, sigsetsize);
+ if (!err) {
+ compat_sigset_t set32;
+ sigset_to_compat(&set32, &set);
+ /* we can get here only if sigsetsize <= sizeof(set) */
+ if (copy_to_user(uset, &set32, sigsetsize))
+ err = -EFAULT;
+ }
+ return err;
+#else
+ return sys_rt_sigpending((sigset_t __user *)uset, sigsetsize);
+#endif
}
+#endif
#ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER
@@ -2881,7 +2948,7 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
static int do_tkill(pid_t tgid, pid_t pid, int sig)
{
- struct siginfo info;
+ struct siginfo info = {};
info.si_signo = sig;
info.si_errno = 0;
@@ -2927,6 +2994,23 @@ SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig)
return do_tkill(0, pid, sig);
}
+static int do_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t *info)
+{
+ /* Not even root can pretend to send signals from the kernel.
+ * Nor can they impersonate a kill()/tgkill(), which adds source info.
+ */
+ if ((info->si_code >= 0 || info->si_code == SI_TKILL) &&
+ (task_pid_vnr(current) != pid)) {
+ /* We used to allow any < 0 si_code */
+ WARN_ON_ONCE(info->si_code < 0);
+ return -EPERM;
+ }
+ info->si_signo = sig;
+
+ /* POSIX.1b doesn't mention process groups. */
+ return kill_proc_info(sig, info, pid);
+}
+
/**
* sys_rt_sigqueueinfo - send signal information to a signal
* @pid: the PID of the thread
@@ -2937,25 +3021,26 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
siginfo_t __user *, uinfo)
{
siginfo_t info;
-
if (copy_from_user(&info, uinfo, sizeof(siginfo_t)))
return -EFAULT;
+ return do_rt_sigqueueinfo(pid, sig, &info);
+}
- /* Not even root can pretend to send signals from the kernel.
- * Nor can they impersonate a kill()/tgkill(), which adds source info.
- */
- if (info.si_code >= 0 || info.si_code == SI_TKILL) {
- /* We used to allow any < 0 si_code */
- WARN_ON_ONCE(info.si_code < 0);
- return -EPERM;
- }
- info.si_signo = sig;
-
- /* POSIX.1b doesn't mention process groups. */
- return kill_proc_info(sig, &info, pid);
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo,
+ compat_pid_t, pid,
+ int, sig,
+ struct compat_siginfo __user *, uinfo)
+{
+ siginfo_t info;
+ int ret = copy_siginfo_from_user32(&info, uinfo);
+ if (unlikely(ret))
+ return ret;
+ return do_rt_sigqueueinfo(pid, sig, &info);
}
+#endif
-long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
+static int do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
{
/* This is only valid for single tasks */
if (pid <= 0 || tgid <= 0)
@@ -2964,7 +3049,8 @@ long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
/* Not even root can pretend to send signals from the kernel.
* Nor can they impersonate a kill()/tgkill(), which adds source info.
*/
- if (info->si_code >= 0 || info->si_code == SI_TKILL) {
+ if (((info->si_code >= 0 || info->si_code == SI_TKILL)) &&
+ (task_pid_vnr(current) != pid)) {
/* We used to allow any < 0 si_code */
WARN_ON_ONCE(info->si_code < 0);
return -EPERM;
@@ -2985,6 +3071,21 @@ SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig,
return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
}
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo,
+ compat_pid_t, tgid,
+ compat_pid_t, pid,
+ int, sig,
+ struct compat_siginfo __user *, uinfo)
+{
+ siginfo_t info;
+
+ if (copy_siginfo_from_user32(&info, uinfo))
+ return -EFAULT;
+ return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
+}
+#endif
+
int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
{
struct task_struct *t = current;
@@ -3030,7 +3131,7 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
return 0;
}
-int
+static int
do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp)
{
stack_t oss;
@@ -3095,12 +3196,10 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
out:
return error;
}
-#ifdef CONFIG_GENERIC_SIGALTSTACK
SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss)
{
return do_sigaltstack(uss, uoss, current_user_stack_pointer());
}
-#endif
int restore_altstack(const stack_t __user *uss)
{
@@ -3118,7 +3217,6 @@ int __save_altstack(stack_t __user *uss, unsigned long sp)
}
#ifdef CONFIG_COMPAT
-#ifdef CONFIG_GENERIC_SIGALTSTACK
COMPAT_SYSCALL_DEFINE2(sigaltstack,
const compat_stack_t __user *, uss_ptr,
compat_stack_t __user *, uoss_ptr)
@@ -3168,7 +3266,6 @@ int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp)
__put_user(t->sas_ss_size, &uss->ss_size);
}
#endif
-#endif
#ifdef __ARCH_WANT_SYS_SIGPENDING
@@ -3178,7 +3275,7 @@ int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp)
*/
SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
{
- return do_sigpending(set, sizeof(*set));
+ return sys_rt_sigpending((sigset_t __user *)set, sizeof(old_sigset_t));
}
#endif
@@ -3234,7 +3331,7 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset,
}
#endif /* __ARCH_WANT_SYS_SIGPROCMASK */
-#ifdef __ARCH_WANT_SYS_RT_SIGACTION
+#ifndef CONFIG_ODD_RT_SIGACTION
/**
* sys_rt_sigaction - alter an action taken by a process
* @sig: signal to be sent
@@ -3268,7 +3365,132 @@ SYSCALL_DEFINE4(rt_sigaction, int, sig,
out:
return ret;
}
-#endif /* __ARCH_WANT_SYS_RT_SIGACTION */
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig,
+ const struct compat_sigaction __user *, act,
+ struct compat_sigaction __user *, oact,
+ compat_size_t, sigsetsize)
+{
+ struct k_sigaction new_ka, old_ka;
+ compat_sigset_t mask;
+#ifdef __ARCH_HAS_SA_RESTORER
+ compat_uptr_t restorer;
+#endif
+ int ret;
+
+ /* XXX: Don't preclude handling different sized sigset_t's. */
+ if (sigsetsize != sizeof(compat_sigset_t))
+ return -EINVAL;
+
+ if (act) {
+ compat_uptr_t handler;
+ ret = get_user(handler, &act->sa_handler);
+ new_ka.sa.sa_handler = compat_ptr(handler);
+#ifdef __ARCH_HAS_SA_RESTORER
+ ret |= get_user(restorer, &act->sa_restorer);
+ new_ka.sa.sa_restorer = compat_ptr(restorer);
+#endif
+ ret |= copy_from_user(&mask, &act->sa_mask, sizeof(mask));
+ ret |= __get_user(new_ka.sa.sa_flags, &act->sa_flags);
+ if (ret)
+ return -EFAULT;
+ sigset_from_compat(&new_ka.sa.sa_mask, &mask);
+ }
+
+ ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
+ if (!ret && oact) {
+ sigset_to_compat(&mask, &old_ka.sa.sa_mask);
+ ret = put_user(ptr_to_compat(old_ka.sa.sa_handler),
+ &oact->sa_handler);
+ ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask));
+ ret |= __put_user(old_ka.sa.sa_flags, &oact->sa_flags);
+#ifdef __ARCH_HAS_SA_RESTORER
+ ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer),
+ &oact->sa_restorer);
+#endif
+ }
+ return ret;
+}
+#endif
+#endif /* !CONFIG_ODD_RT_SIGACTION */
+
+#ifdef CONFIG_OLD_SIGACTION
+SYSCALL_DEFINE3(sigaction, int, sig,
+ const struct old_sigaction __user *, act,
+ struct old_sigaction __user *, oact)
+{
+ struct k_sigaction new_ka, old_ka;
+ int ret;
+
+ if (act) {
+ old_sigset_t mask;
+ if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
+ __get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
+ __get_user(new_ka.sa.sa_restorer, &act->sa_restorer) ||
+ __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
+ __get_user(mask, &act->sa_mask))
+ return -EFAULT;
+#ifdef __ARCH_HAS_KA_RESTORER
+ new_ka.ka_restorer = NULL;
+#endif
+ siginitset(&new_ka.sa.sa_mask, mask);
+ }
+
+ ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
+
+ if (!ret && oact) {
+ if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
+ __put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
+ __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer) ||
+ __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
+ __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
+ return -EFAULT;
+ }
+
+ return ret;
+}
+#endif
+#ifdef CONFIG_COMPAT_OLD_SIGACTION
+COMPAT_SYSCALL_DEFINE3(sigaction, int, sig,
+ const struct compat_old_sigaction __user *, act,
+ struct compat_old_sigaction __user *, oact)
+{
+ struct k_sigaction new_ka, old_ka;
+ int ret;
+ compat_old_sigset_t mask;
+ compat_uptr_t handler, restorer;
+
+ if (act) {
+ if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
+ __get_user(handler, &act->sa_handler) ||
+ __get_user(restorer, &act->sa_restorer) ||
+ __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
+ __get_user(mask, &act->sa_mask))
+ return -EFAULT;
+
+#ifdef __ARCH_HAS_KA_RESTORER
+ new_ka.ka_restorer = NULL;
+#endif
+ new_ka.sa.sa_handler = compat_ptr(handler);
+ new_ka.sa.sa_restorer = compat_ptr(restorer);
+ siginitset(&new_ka.sa.sa_mask, mask);
+ }
+
+ ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
+
+ if (!ret && oact) {
+ if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
+ __put_user(ptr_to_compat(old_ka.sa.sa_handler),
+ &oact->sa_handler) ||
+ __put_user(ptr_to_compat(old_ka.sa.sa_restorer),
+ &oact->sa_restorer) ||
+ __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
+ __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
+ return -EFAULT;
+ }
+ return ret;
+}
+#endif
#ifdef __ARCH_WANT_SYS_SGETMASK
@@ -3336,7 +3558,6 @@ int sigsuspend(sigset_t *set)
return -ERESTARTNOHAND;
}
-#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND
/**
* sys_rt_sigsuspend - replace the signal mask for a value with the
* @unewset value until a signal is received
@@ -3355,7 +3576,45 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
return -EFAULT;
return sigsuspend(&newset);
}
-#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE2(rt_sigsuspend, compat_sigset_t __user *, unewset, compat_size_t, sigsetsize)
+{
+#ifdef __BIG_ENDIAN
+ sigset_t newset;
+ compat_sigset_t newset32;
+
+ /* XXX: Don't preclude handling different sized sigset_t's. */
+ if (sigsetsize != sizeof(sigset_t))
+ return -EINVAL;
+
+ if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t)))
+ return -EFAULT;
+ sigset_from_compat(&newset, &newset32);
+ return sigsuspend(&newset);
+#else
+ /* on little-endian bitmaps don't care about granularity */
+ return sys_rt_sigsuspend((sigset_t __user *)unewset, sigsetsize);
+#endif
+}
+#endif
+
+#ifdef CONFIG_OLD_SIGSUSPEND
+SYSCALL_DEFINE1(sigsuspend, old_sigset_t, mask)
+{
+ sigset_t blocked;
+ siginitset(&blocked, mask);
+ return sigsuspend(&blocked);
+}
+#endif
+#ifdef CONFIG_OLD_SIGSUSPEND3
+SYSCALL_DEFINE3(sigsuspend, int, unused1, int, unused2, old_sigset_t, mask)
+{
+ sigset_t blocked;
+ siginitset(&blocked, mask);
+ return sigsuspend(&blocked);
+}
+#endif
__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma)
{
diff --git a/kernel/smp.c b/kernel/smp.c
index 69f38bd..8e451f3 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -16,22 +16,12 @@
#include "smpboot.h"
#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
-static struct {
- struct list_head queue;
- raw_spinlock_t lock;
-} call_function __cacheline_aligned_in_smp =
- {
- .queue = LIST_HEAD_INIT(call_function.queue),
- .lock = __RAW_SPIN_LOCK_UNLOCKED(call_function.lock),
- };
-
enum {
CSD_FLAG_LOCK = 0x01,
};
struct call_function_data {
- struct call_single_data csd;
- atomic_t refs;
+ struct call_single_data __percpu *csd;
cpumask_var_t cpumask;
cpumask_var_t cpumask_ipi;
};
@@ -60,6 +50,11 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL,
cpu_to_node(cpu)))
return notifier_from_errno(-ENOMEM);
+ cfd->csd = alloc_percpu(struct call_single_data);
+ if (!cfd->csd) {
+ free_cpumask_var(cfd->cpumask);
+ return notifier_from_errno(-ENOMEM);
+ }
break;
#ifdef CONFIG_HOTPLUG_CPU
@@ -70,6 +65,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
case CPU_DEAD_FROZEN:
free_cpumask_var(cfd->cpumask);
free_cpumask_var(cfd->cpumask_ipi);
+ free_percpu(cfd->csd);
break;
#endif
};
@@ -171,85 +167,6 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
}
/*
- * Invoked by arch to handle an IPI for call function. Must be called with
- * interrupts disabled.
- */
-void generic_smp_call_function_interrupt(void)
-{
- struct call_function_data *data;
- int cpu = smp_processor_id();
-
- /*
- * Shouldn't receive this interrupt on a cpu that is not yet online.
- */
- WARN_ON_ONCE(!cpu_online(cpu));
-
- /*
- * Ensure entry is visible on call_function_queue after we have
- * entered the IPI. See comment in smp_call_function_many.
- * If we don't have this, then we may miss an entry on the list
- * and never get another IPI to process it.
- */
- smp_mb();
-
- /*
- * It's ok to use list_for_each_rcu() here even though we may
- * delete 'pos', since list_del_rcu() doesn't clear ->next
- */
- list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
- int refs;
- smp_call_func_t func;
-
- /*
- * Since we walk the list without any locks, we might
- * see an entry that was completed, removed from the
- * list and is in the process of being reused.
- *
- * We must check that the cpu is in the cpumask before
- * checking the refs, and both must be set before
- * executing the callback on this cpu.
- */
-
- if (!cpumask_test_cpu(cpu, data->cpumask))
- continue;
-
- smp_rmb();
-
- if (atomic_read(&data->refs) == 0)
- continue;
-
- func = data->csd.func; /* save for later warn */
- func(data->csd.info);
-
- /*
- * If the cpu mask is not still set then func enabled
- * interrupts (BUG), and this cpu took another smp call
- * function interrupt and executed func(info) twice
- * on this cpu. That nested execution decremented refs.
- */
- if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) {
- WARN(1, "%pf enabled interrupts and double executed\n", func);
- continue;
- }
-
- refs = atomic_dec_return(&data->refs);
- WARN_ON(refs < 0);
-
- if (refs)
- continue;
-
- WARN_ON(!cpumask_empty(data->cpumask));
-
- raw_spin_lock(&call_function.lock);
- list_del_rcu(&data->csd.list);
- raw_spin_unlock(&call_function.lock);
-
- csd_unlock(&data->csd);
- }
-
-}
-
-/*
* Invoked by arch to handle an IPI for call function single. Must be
* called from the arch with interrupts disabled.
*/
@@ -453,8 +370,7 @@ void smp_call_function_many(const struct cpumask *mask,
smp_call_func_t func, void *info, bool wait)
{
struct call_function_data *data;
- unsigned long flags;
- int refs, cpu, next_cpu, this_cpu = smp_processor_id();
+ int cpu, next_cpu, this_cpu = smp_processor_id();
/*
* Can deadlock when called with interrupts disabled.
@@ -486,50 +402,13 @@ void smp_call_function_many(const struct cpumask *mask,
}
data = &__get_cpu_var(cfd_data);
- csd_lock(&data->csd);
-
- /* This BUG_ON verifies our reuse assertions and can be removed */
- BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask));
-
- /*
- * The global call function queue list add and delete are protected
- * by a lock, but the list is traversed without any lock, relying
- * on the rcu list add and delete to allow safe concurrent traversal.
- * We reuse the call function data without waiting for any grace
- * period after some other cpu removes it from the global queue.
- * This means a cpu might find our data block as it is being
- * filled out.
- *
- * We hold off the interrupt handler on the other cpu by
- * ordering our writes to the cpu mask vs our setting of the
- * refs counter. We assert only the cpu owning the data block
- * will set a bit in cpumask, and each bit will only be cleared
- * by the subject cpu. Each cpu must first find its bit is
- * set and then check that refs is set indicating the element is
- * ready to be processed, otherwise it must skip the entry.
- *
- * On the previous iteration refs was set to 0 by another cpu.
- * To avoid the use of transitivity, set the counter to 0 here
- * so the wmb will pair with the rmb in the interrupt handler.
- */
- atomic_set(&data->refs, 0); /* convert 3rd to 1st party write */
-
- data->csd.func = func;
- data->csd.info = info;
- /* Ensure 0 refs is visible before mask. Also orders func and info */
- smp_wmb();
-
- /* We rely on the "and" being processed before the store */
cpumask_and(data->cpumask, mask, cpu_online_mask);
cpumask_clear_cpu(this_cpu, data->cpumask);
- refs = cpumask_weight(data->cpumask);
/* Some callers race with other cpus changing the passed mask */
- if (unlikely(!refs)) {
- csd_unlock(&data->csd);
+ if (unlikely(!cpumask_weight(data->cpumask)))
return;
- }
/*
* After we put an entry into the list, data->cpumask
@@ -537,34 +416,32 @@ void smp_call_function_many(const struct cpumask *mask,
* a SMP function call, so data->cpumask will be zero.
*/
cpumask_copy(data->cpumask_ipi, data->cpumask);
- raw_spin_lock_irqsave(&call_function.lock, flags);
- /*
- * Place entry at the _HEAD_ of the list, so that any cpu still
- * observing the entry in generic_smp_call_function_interrupt()
- * will not miss any other list entries:
- */
- list_add_rcu(&data->csd.list, &call_function.queue);
- /*
- * We rely on the wmb() in list_add_rcu to complete our writes
- * to the cpumask before this write to refs, which indicates
- * data is on the list and is ready to be processed.
- */
- atomic_set(&data->refs, refs);
- raw_spin_unlock_irqrestore(&call_function.lock, flags);
- /*
- * Make the list addition visible before sending the ipi.
- * (IPIs must obey or appear to obey normal Linux cache
- * coherency rules -- see comment in generic_exec_single).
- */
- smp_mb();
+ for_each_cpu(cpu, data->cpumask) {
+ struct call_single_data *csd = per_cpu_ptr(data->csd, cpu);
+ struct call_single_queue *dst =
+ &per_cpu(call_single_queue, cpu);
+ unsigned long flags;
+
+ csd_lock(csd);
+ csd->func = func;
+ csd->info = info;
+
+ raw_spin_lock_irqsave(&dst->lock, flags);
+ list_add_tail(&csd->list, &dst->list);
+ raw_spin_unlock_irqrestore(&dst->lock, flags);
+ }
/* Send a message to all CPUs in the map */
arch_send_call_function_ipi_mask(data->cpumask_ipi);
- /* Optionally wait for the CPUs to complete */
- if (wait)
- csd_lock_wait(&data->csd);
+ if (wait) {
+ for_each_cpu(cpu, data->cpumask) {
+ struct call_single_data *csd =
+ per_cpu_ptr(data->csd, cpu);
+ csd_lock_wait(csd);
+ }
+ }
}
EXPORT_SYMBOL(smp_call_function_many);
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index d4abac2..02fc5c9 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -185,8 +185,18 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
}
get_task_struct(tsk);
*per_cpu_ptr(ht->store, cpu) = tsk;
- if (ht->create)
- ht->create(cpu);
+ if (ht->create) {
+ /*
+ * Make sure that the task has actually scheduled out
+ * into park position, before calling the create
+ * callback. At least the migration thread callback
+ * requires that the task is off the runqueue.
+ */
+ if (!wait_task_inactive(tsk, TASK_PARKED))
+ WARN_ON(1);
+ else
+ ht->create(cpu);
+ }
return 0;
}
@@ -209,6 +219,8 @@ static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cp
{
struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
+ if (ht->pre_unpark)
+ ht->pre_unpark(cpu);
kthread_unpark(tsk);
}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index f5cc25f..14d7758 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -195,21 +195,21 @@ void local_bh_enable_ip(unsigned long ip)
EXPORT_SYMBOL(local_bh_enable_ip);
/*
- * We restart softirq processing MAX_SOFTIRQ_RESTART times,
- * and we fall back to softirqd after that.
+ * We restart softirq processing for at most 2 ms,
+ * and if need_resched() is not set.
*
- * This number has been established via experimentation.
+ * These limits have been established via experimentation.
* The two things to balance is latency against fairness -
* we want to handle softirqs as soon as possible, but they
* should not be able to lock up the box.
*/
-#define MAX_SOFTIRQ_RESTART 10
+#define MAX_SOFTIRQ_TIME msecs_to_jiffies(2)
asmlinkage void __do_softirq(void)
{
struct softirq_action *h;
__u32 pending;
- int max_restart = MAX_SOFTIRQ_RESTART;
+ unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
int cpu;
unsigned long old_flags = current->flags;
@@ -264,11 +264,12 @@ restart:
local_irq_disable();
pending = local_softirq_pending();
- if (pending && --max_restart)
- goto restart;
+ if (pending) {
+ if (time_before(jiffies, end) && !need_resched())
+ goto restart;
- if (pending)
wakeup_softirqd();
+ }
lockdep_softirq_exit();
@@ -322,18 +323,10 @@ void irq_enter(void)
static inline void invoke_softirq(void)
{
- if (!force_irqthreads) {
-#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
+ if (!force_irqthreads)
__do_softirq();
-#else
- do_softirq();
-#endif
- } else {
- __local_bh_disable((unsigned long)__builtin_return_address(0),
- SOFTIRQ_OFFSET);
+ else
wakeup_softirqd();
- __local_bh_enable(SOFTIRQ_OFFSET);
- }
}
/*
@@ -341,9 +334,15 @@ static inline void invoke_softirq(void)
*/
void irq_exit(void)
{
+#ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED
+ local_irq_disable();
+#else
+ WARN_ON_ONCE(!irqs_disabled());
+#endif
+
account_irq_exit_time(current);
trace_hardirq_exit();
- sub_preempt_count(IRQ_EXIT_OFFSET);
+ sub_preempt_count(HARDIRQ_OFFSET);
if (!in_interrupt() && local_softirq_pending())
invoke_softirq();
@@ -353,7 +352,6 @@ void irq_exit(void)
tick_nohz_irq_exit();
#endif
rcu_irq_exit();
- sched_preempt_enable_no_resched();
}
/*
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 95d178c..c09f295 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -336,7 +336,7 @@ static struct smp_hotplug_thread cpu_stop_threads = {
.create = cpu_stop_create,
.setup = cpu_stop_unpark,
.park = cpu_stop_park,
- .unpark = cpu_stop_unpark,
+ .pre_unpark = cpu_stop_unpark,
.selfparking = true,
};
diff --git a/kernel/sys.c b/kernel/sys.c
index 265b376..0da73cf 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -47,6 +47,7 @@
#include <linux/syscalls.h>
#include <linux/kprobes.h>
#include <linux/user_namespace.h>
+#include <linux/binfmts.h>
#include <linux/kmsg_dump.h>
/* Move somewhere else to avoid recompiling? */
@@ -323,7 +324,6 @@ void kernel_restart_prepare(char *cmd)
system_state = SYSTEM_RESTART;
usermodehelper_disable();
device_shutdown();
- syscore_shutdown();
}
/**
@@ -369,6 +369,7 @@ void kernel_restart(char *cmd)
{
kernel_restart_prepare(cmd);
disable_nonboot_cpus();
+ syscore_shutdown();
if (!cmd)
printk(KERN_EMERG "Restarting system.\n");
else
@@ -394,6 +395,7 @@ static void kernel_shutdown_prepare(enum system_states state)
void kernel_halt(void)
{
kernel_shutdown_prepare(SYSTEM_HALT);
+ disable_nonboot_cpus();
syscore_shutdown();
printk(KERN_EMERG "System halted.\n");
kmsg_dump(KMSG_DUMP_HALT);
@@ -433,11 +435,12 @@ static DEFINE_MUTEX(reboot_mutex);
SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
void __user *, arg)
{
+ struct pid_namespace *pid_ns = task_active_pid_ns(current);
char buffer[256];
int ret = 0;
/* We only trust the superuser with rebooting the system. */
- if (!capable(CAP_SYS_BOOT))
+ if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT))
return -EPERM;
/* For safety, we require "magic" arguments. */
@@ -453,7 +456,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
* pid_namespace, the command is handled by reboot_pid_ns() which will
* call do_exit().
*/
- ret = reboot_pid_ns(task_active_pid_ns(current), cmd);
+ ret = reboot_pid_ns(pid_ns, cmd);
if (ret)
return ret;
@@ -1792,14 +1795,14 @@ SYSCALL_DEFINE1(umask, int, mask)
static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
{
struct fd exe;
- struct dentry *dentry;
+ struct inode *inode;
int err;
exe = fdget(fd);
if (!exe.file)
return -EBADF;
- dentry = exe.file->f_path.dentry;
+ inode = file_inode(exe.file);
/*
* Because the original mm->exe_file points to executable file, make
@@ -1807,11 +1810,11 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
* overall picture.
*/
err = -EACCES;
- if (!S_ISREG(dentry->d_inode->i_mode) ||
+ if (!S_ISREG(inode->i_mode) ||
exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC)
goto exit;
- err = inode_permission(dentry->d_inode, MAY_EXEC);
+ err = inode_permission(inode, MAY_EXEC);
if (err)
goto exit;
@@ -2012,160 +2015,159 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = 0;
switch (option) {
- case PR_SET_PDEATHSIG:
- if (!valid_signal(arg2)) {
- error = -EINVAL;
- break;
- }
- me->pdeath_signal = arg2;
- break;
- case PR_GET_PDEATHSIG:
- error = put_user(me->pdeath_signal, (int __user *)arg2);
- break;
- case PR_GET_DUMPABLE:
- error = get_dumpable(me->mm);
+ case PR_SET_PDEATHSIG:
+ if (!valid_signal(arg2)) {
+ error = -EINVAL;
break;
- case PR_SET_DUMPABLE:
- if (arg2 < 0 || arg2 > 1) {
- error = -EINVAL;
- break;
- }
- set_dumpable(me->mm, arg2);
+ }
+ me->pdeath_signal = arg2;
+ break;
+ case PR_GET_PDEATHSIG:
+ error = put_user(me->pdeath_signal, (int __user *)arg2);
+ break;
+ case PR_GET_DUMPABLE:
+ error = get_dumpable(me->mm);
+ break;
+ case PR_SET_DUMPABLE:
+ if (arg2 != SUID_DUMP_DISABLE && arg2 != SUID_DUMP_USER) {
+ error = -EINVAL;
break;
+ }
+ set_dumpable(me->mm, arg2);
+ break;
- case PR_SET_UNALIGN:
- error = SET_UNALIGN_CTL(me, arg2);
- break;
- case PR_GET_UNALIGN:
- error = GET_UNALIGN_CTL(me, arg2);
- break;
- case PR_SET_FPEMU:
- error = SET_FPEMU_CTL(me, arg2);
- break;
- case PR_GET_FPEMU:
- error = GET_FPEMU_CTL(me, arg2);
- break;
- case PR_SET_FPEXC:
- error = SET_FPEXC_CTL(me, arg2);
- break;
- case PR_GET_FPEXC:
- error = GET_FPEXC_CTL(me, arg2);
- break;
- case PR_GET_TIMING:
- error = PR_TIMING_STATISTICAL;
- break;
- case PR_SET_TIMING:
- if (arg2 != PR_TIMING_STATISTICAL)
- error = -EINVAL;
- break;
- case PR_SET_NAME:
- comm[sizeof(me->comm)-1] = 0;
- if (strncpy_from_user(comm, (char __user *)arg2,
- sizeof(me->comm) - 1) < 0)
- return -EFAULT;
- set_task_comm(me, comm);
- proc_comm_connector(me);
- break;
- case PR_GET_NAME:
- get_task_comm(comm, me);
- if (copy_to_user((char __user *)arg2, comm,
- sizeof(comm)))
- return -EFAULT;
- break;
- case PR_GET_ENDIAN:
- error = GET_ENDIAN(me, arg2);
- break;
- case PR_SET_ENDIAN:
- error = SET_ENDIAN(me, arg2);
- break;
- case PR_GET_SECCOMP:
- error = prctl_get_seccomp();
- break;
- case PR_SET_SECCOMP:
- error = prctl_set_seccomp(arg2, (char __user *)arg3);
- break;
- case PR_GET_TSC:
- error = GET_TSC_CTL(arg2);
- break;
- case PR_SET_TSC:
- error = SET_TSC_CTL(arg2);
- break;
- case PR_TASK_PERF_EVENTS_DISABLE:
- error = perf_event_task_disable();
- break;
- case PR_TASK_PERF_EVENTS_ENABLE:
- error = perf_event_task_enable();
- break;
- case PR_GET_TIMERSLACK:
- error = current->timer_slack_ns;
- break;
- case PR_SET_TIMERSLACK:
- if (arg2 <= 0)
- current->timer_slack_ns =
+ case PR_SET_UNALIGN:
+ error = SET_UNALIGN_CTL(me, arg2);
+ break;
+ case PR_GET_UNALIGN:
+ error = GET_UNALIGN_CTL(me, arg2);
+ break;
+ case PR_SET_FPEMU:
+ error = SET_FPEMU_CTL(me, arg2);
+ break;
+ case PR_GET_FPEMU:
+ error = GET_FPEMU_CTL(me, arg2);
+ break;
+ case PR_SET_FPEXC:
+ error = SET_FPEXC_CTL(me, arg2);
+ break;
+ case PR_GET_FPEXC:
+ error = GET_FPEXC_CTL(me, arg2);
+ break;
+ case PR_GET_TIMING:
+ error = PR_TIMING_STATISTICAL;
+ break;
+ case PR_SET_TIMING:
+ if (arg2 != PR_TIMING_STATISTICAL)
+ error = -EINVAL;
+ break;
+ case PR_SET_NAME:
+ comm[sizeof(me->comm) - 1] = 0;
+ if (strncpy_from_user(comm, (char __user *)arg2,
+ sizeof(me->comm) - 1) < 0)
+ return -EFAULT;
+ set_task_comm(me, comm);
+ proc_comm_connector(me);
+ break;
+ case PR_GET_NAME:
+ get_task_comm(comm, me);
+ if (copy_to_user((char __user *)arg2, comm, sizeof(comm)))
+ return -EFAULT;
+ break;
+ case PR_GET_ENDIAN:
+ error = GET_ENDIAN(me, arg2);
+ break;
+ case PR_SET_ENDIAN:
+ error = SET_ENDIAN(me, arg2);
+ break;
+ case PR_GET_SECCOMP:
+ error = prctl_get_seccomp();
+ break;
+ case PR_SET_SECCOMP:
+ error = prctl_set_seccomp(arg2, (char __user *)arg3);
+ break;
+ case PR_GET_TSC:
+ error = GET_TSC_CTL(arg2);
+ break;
+ case PR_SET_TSC:
+ error = SET_TSC_CTL(arg2);
+ break;
+ case PR_TASK_PERF_EVENTS_DISABLE:
+ error = perf_event_task_disable();
+ break;
+ case PR_TASK_PERF_EVENTS_ENABLE:
+ error = perf_event_task_enable();
+ break;
+ case PR_GET_TIMERSLACK:
+ error = current->timer_slack_ns;
+ break;
+ case PR_SET_TIMERSLACK:
+ if (arg2 <= 0)
+ current->timer_slack_ns =
current->default_timer_slack_ns;
- else
- current->timer_slack_ns = arg2;
- break;
- case PR_MCE_KILL:
- if (arg4 | arg5)
- return -EINVAL;
- switch (arg2) {
- case PR_MCE_KILL_CLEAR:
- if (arg3 != 0)
- return -EINVAL;
- current->flags &= ~PF_MCE_PROCESS;
- break;
- case PR_MCE_KILL_SET:
- current->flags |= PF_MCE_PROCESS;
- if (arg3 == PR_MCE_KILL_EARLY)
- current->flags |= PF_MCE_EARLY;
- else if (arg3 == PR_MCE_KILL_LATE)
- current->flags &= ~PF_MCE_EARLY;
- else if (arg3 == PR_MCE_KILL_DEFAULT)
- current->flags &=
- ~(PF_MCE_EARLY|PF_MCE_PROCESS);
- else
- return -EINVAL;
- break;
- default:
+ else
+ current->timer_slack_ns = arg2;
+ break;
+ case PR_MCE_KILL:
+ if (arg4 | arg5)
+ return -EINVAL;
+ switch (arg2) {
+ case PR_MCE_KILL_CLEAR:
+ if (arg3 != 0)
return -EINVAL;
- }
+ current->flags &= ~PF_MCE_PROCESS;
break;
- case PR_MCE_KILL_GET:
- if (arg2 | arg3 | arg4 | arg5)
- return -EINVAL;
- if (current->flags & PF_MCE_PROCESS)
- error = (current->flags & PF_MCE_EARLY) ?
- PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE;
+ case PR_MCE_KILL_SET:
+ current->flags |= PF_MCE_PROCESS;
+ if (arg3 == PR_MCE_KILL_EARLY)
+ current->flags |= PF_MCE_EARLY;
+ else if (arg3 == PR_MCE_KILL_LATE)
+ current->flags &= ~PF_MCE_EARLY;
+ else if (arg3 == PR_MCE_KILL_DEFAULT)
+ current->flags &=
+ ~(PF_MCE_EARLY|PF_MCE_PROCESS);
else
- error = PR_MCE_KILL_DEFAULT;
- break;
- case PR_SET_MM:
- error = prctl_set_mm(arg2, arg3, arg4, arg5);
- break;
- case PR_GET_TID_ADDRESS:
- error = prctl_get_tid_address(me, (int __user **)arg2);
- break;
- case PR_SET_CHILD_SUBREAPER:
- me->signal->is_child_subreaper = !!arg2;
- break;
- case PR_GET_CHILD_SUBREAPER:
- error = put_user(me->signal->is_child_subreaper,
- (int __user *) arg2);
- break;
- case PR_SET_NO_NEW_PRIVS:
- if (arg2 != 1 || arg3 || arg4 || arg5)
return -EINVAL;
-
- current->no_new_privs = 1;
break;
- case PR_GET_NO_NEW_PRIVS:
- if (arg2 || arg3 || arg4 || arg5)
- return -EINVAL;
- return current->no_new_privs ? 1 : 0;
default:
- error = -EINVAL;
- break;
+ return -EINVAL;
+ }
+ break;
+ case PR_MCE_KILL_GET:
+ if (arg2 | arg3 | arg4 | arg5)
+ return -EINVAL;
+ if (current->flags & PF_MCE_PROCESS)
+ error = (current->flags & PF_MCE_EARLY) ?
+ PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE;
+ else
+ error = PR_MCE_KILL_DEFAULT;
+ break;
+ case PR_SET_MM:
+ error = prctl_set_mm(arg2, arg3, arg4, arg5);
+ break;
+ case PR_GET_TID_ADDRESS:
+ error = prctl_get_tid_address(me, (int __user **)arg2);
+ break;
+ case PR_SET_CHILD_SUBREAPER:
+ me->signal->is_child_subreaper = !!arg2;
+ break;
+ case PR_GET_CHILD_SUBREAPER:
+ error = put_user(me->signal->is_child_subreaper,
+ (int __user *)arg2);
+ break;
+ case PR_SET_NO_NEW_PRIVS:
+ if (arg2 != 1 || arg3 || arg4 || arg5)
+ return -EINVAL;
+
+ current->no_new_privs = 1;
+ break;
+ case PR_GET_NO_NEW_PRIVS:
+ if (arg2 || arg3 || arg4 || arg5)
+ return -EINVAL;
+ return current->no_new_privs ? 1 : 0;
+ default:
+ error = -EINVAL;
+ break;
}
return error;
}
@@ -2184,14 +2186,8 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
-static void argv_cleanup(struct subprocess_info *info)
+static int __orderly_poweroff(bool force)
{
- argv_free(info->argv);
-}
-
-static int __orderly_poweroff(void)
-{
- int argc;
char **argv;
static char *envp[] = {
"HOME=/",
@@ -2200,21 +2196,40 @@ static int __orderly_poweroff(void)
};
int ret;
- argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc);
- if (argv == NULL) {
+ argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL);
+ if (argv) {
+ ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+ argv_free(argv);
+ } else {
printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n",
- __func__, poweroff_cmd);
- return -ENOMEM;
+ __func__, poweroff_cmd);
+ ret = -ENOMEM;
}
- ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC,
- NULL, argv_cleanup, NULL);
- if (ret == -ENOMEM)
- argv_free(argv);
+ if (ret && force) {
+ printk(KERN_WARNING "Failed to start orderly shutdown: "
+ "forcing the issue\n");
+ /*
+ * I guess this should try to kick off some daemon to sync and
+ * poweroff asap. Or not even bother syncing if we're doing an
+ * emergency shutdown?
+ */
+ emergency_sync();
+ kernel_power_off();
+ }
return ret;
}
+static bool poweroff_force;
+
+static void poweroff_work_func(struct work_struct *work)
+{
+ __orderly_poweroff(poweroff_force);
+}
+
+static DECLARE_WORK(poweroff_work, poweroff_work_func);
+
/**
* orderly_poweroff - Trigger an orderly system poweroff
* @force: force poweroff if command execution fails
@@ -2224,21 +2239,9 @@ static int __orderly_poweroff(void)
*/
int orderly_poweroff(bool force)
{
- int ret = __orderly_poweroff();
-
- if (ret && force) {
- printk(KERN_WARNING "Failed to start orderly shutdown: "
- "forcing the issue\n");
-
- /*
- * I guess this should try to kick off some daemon to sync and
- * poweroff asap. Or not even bother syncing if we're doing an
- * emergency shutdown?
- */
- emergency_sync();
- kernel_power_off();
- }
-
- return ret;
+ if (force) /* do not override the pending "true" */
+ poweroff_force = true;
+ schedule_work(&poweroff_work);
+ return 0;
}
EXPORT_SYMBOL_GPL(orderly_poweroff);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4fc9be9..afc1dc6 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -105,7 +105,6 @@ extern char core_pattern[];
extern unsigned int core_pipe_limit;
#endif
extern int pid_max;
-extern int min_free_kbytes;
extern int pid_max_min, pid_max_max;
extern int sysctl_drop_caches;
extern int percpu_pagelist_fraction;
@@ -158,14 +157,20 @@ extern int sysctl_tsb_ratio;
#ifdef __hppa__
extern int pwrsw_enabled;
+#endif
+
+#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW
extern int unaligned_enabled;
#endif
#ifdef CONFIG_IA64
-extern int no_unaligned_warning;
extern int unaligned_dump_stack;
#endif
+#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN
+extern int no_unaligned_warning;
+#endif
+
#ifdef CONFIG_PROC_SYSCTL
static int proc_do_cad_pid(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -553,6 +558,8 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+#endif
+#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW
{
.procname = "unaligned-trap",
.data = &unaligned_enabled,
@@ -919,7 +926,7 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_doulongvec_minmax,
},
#endif
-#ifdef CONFIG_IA64
+#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN
{
.procname = "ignore-unaligned-usertrap",
.data = &no_unaligned_warning,
@@ -927,6 +934,8 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+#endif
+#ifdef CONFIG_IA64
{
.procname = "unaligned-dump-stack",
.data = &unaligned_dump_stack,
@@ -2014,7 +2023,7 @@ static int proc_taint(struct ctl_table *table, int write,
int i;
for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) {
if ((tmptaint >> i) & 1)
- add_taint(i);
+ add_taint(i, LOCKDEP_STILL_OK);
}
}
@@ -2091,7 +2100,7 @@ int proc_dointvec_minmax(struct ctl_table *table, int write,
static void validate_coredump_safety(void)
{
#ifdef CONFIG_COREDUMP
- if (suid_dumpable == SUID_DUMPABLE_SAFE &&
+ if (suid_dumpable == SUID_DUMP_ROOT &&
core_pattern[0] != '/' && core_pattern[0] != '|') {
printk(KERN_WARNING "Unsafe core_pattern used with "\
"suid_dumpable=2. Pipe handler or fully qualified "\
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 5a63844..ebf7235 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -387,7 +387,6 @@ static const struct bin_table bin_net_ipv4_table[] = {
{ CTL_INT, NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" },
{ CTL_INT, NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" },
{ CTL_STR, NET_TCP_CONG_CONTROL, "tcp_congestion_control" },
- { CTL_INT, NET_TCP_ABC, "tcp_abc" },
{ CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" },
{ CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" },
{ CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" },
@@ -971,7 +970,6 @@ out:
static ssize_t bin_intvec(struct file *file,
void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
{
- mm_segment_t old_fs = get_fs();
ssize_t copied = 0;
char *buffer;
ssize_t result;
@@ -984,13 +982,10 @@ static ssize_t bin_intvec(struct file *file,
if (oldval && oldlen) {
unsigned __user *vec = oldval;
size_t length = oldlen / sizeof(*vec);
- loff_t pos = 0;
char *str, *end;
int i;
- set_fs(KERNEL_DS);
- result = vfs_read(file, buffer, BUFSZ - 1, &pos);
- set_fs(old_fs);
+ result = kernel_read(file, 0, buffer, BUFSZ - 1);
if (result < 0)
goto out_kfree;
@@ -1017,7 +1012,6 @@ static ssize_t bin_intvec(struct file *file,
if (newval && newlen) {
unsigned __user *vec = newval;
size_t length = newlen / sizeof(*vec);
- loff_t pos = 0;
char *str, *end;
int i;
@@ -1033,9 +1027,7 @@ static ssize_t bin_intvec(struct file *file,
str += snprintf(str, end - str, "%lu\t", value);
}
- set_fs(KERNEL_DS);
- result = vfs_write(file, buffer, str - buffer, &pos);
- set_fs(old_fs);
+ result = kernel_write(file, buffer, str - buffer, 0);
if (result < 0)
goto out_kfree;
}
@@ -1049,7 +1041,6 @@ out:
static ssize_t bin_ulongvec(struct file *file,
void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
{
- mm_segment_t old_fs = get_fs();
ssize_t copied = 0;
char *buffer;
ssize_t result;
@@ -1062,13 +1053,10 @@ static ssize_t bin_ulongvec(struct file *file,
if (oldval && oldlen) {
unsigned long __user *vec = oldval;
size_t length = oldlen / sizeof(*vec);
- loff_t pos = 0;
char *str, *end;
int i;
- set_fs(KERNEL_DS);
- result = vfs_read(file, buffer, BUFSZ - 1, &pos);
- set_fs(old_fs);
+ result = kernel_read(file, 0, buffer, BUFSZ - 1);
if (result < 0)
goto out_kfree;
@@ -1095,7 +1083,6 @@ static ssize_t bin_ulongvec(struct file *file,
if (newval && newlen) {
unsigned long __user *vec = newval;
size_t length = newlen / sizeof(*vec);
- loff_t pos = 0;
char *str, *end;
int i;
@@ -1111,9 +1098,7 @@ static ssize_t bin_ulongvec(struct file *file,
str += snprintf(str, end - str, "%lu\t", value);
}
- set_fs(KERNEL_DS);
- result = vfs_write(file, buffer, str - buffer, &pos);
- set_fs(old_fs);
+ result = kernel_write(file, buffer, str - buffer, 0);
if (result < 0)
goto out_kfree;
}
@@ -1127,19 +1112,15 @@ out:
static ssize_t bin_uuid(struct file *file,
void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
{
- mm_segment_t old_fs = get_fs();
ssize_t result, copied = 0;
/* Only supports reads */
if (oldval && oldlen) {
- loff_t pos = 0;
char buf[40], *str = buf;
unsigned char uuid[16];
int i;
- set_fs(KERNEL_DS);
- result = vfs_read(file, buf, sizeof(buf) - 1, &pos);
- set_fs(old_fs);
+ result = kernel_read(file, 0, buf, sizeof(buf) - 1);
if (result < 0)
goto out;
@@ -1175,18 +1156,14 @@ out:
static ssize_t bin_dn_node_address(struct file *file,
void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
{
- mm_segment_t old_fs = get_fs();
ssize_t result, copied = 0;
if (oldval && oldlen) {
- loff_t pos = 0;
char buf[15], *nodep;
unsigned long area, node;
__le16 dnaddr;
- set_fs(KERNEL_DS);
- result = vfs_read(file, buf, sizeof(buf) - 1, &pos);
- set_fs(old_fs);
+ result = kernel_read(file, 0, buf, sizeof(buf) - 1);
if (result < 0)
goto out;
@@ -1194,9 +1171,10 @@ static ssize_t bin_dn_node_address(struct file *file,
/* Convert the decnet address to binary */
result = -EIO;
- nodep = strchr(buf, '.') + 1;
+ nodep = strchr(buf, '.');
if (!nodep)
goto out;
+ ++nodep;
area = simple_strtoul(buf, NULL, 10);
node = simple_strtoul(nodep, NULL, 10);
@@ -1215,7 +1193,6 @@ static ssize_t bin_dn_node_address(struct file *file,
}
if (newval && newlen) {
- loff_t pos = 0;
__le16 dnaddr;
char buf[15];
int len;
@@ -1232,9 +1209,7 @@ static ssize_t bin_dn_node_address(struct file *file,
le16_to_cpu(dnaddr) >> 10,
le16_to_cpu(dnaddr) & 0x3ff);
- set_fs(KERNEL_DS);
- result = vfs_write(file, buf, len, &pos);
- set_fs(old_fs);
+ result = kernel_write(file, buf, len, 0);
if (result < 0)
goto out;
}
diff --git a/kernel/time.c b/kernel/time.c
index c2a27dd..f8342a4 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -240,7 +240,7 @@ EXPORT_SYMBOL(current_fs_time);
* Avoid unnecessary multiplications/divisions in the
* two most common HZ cases:
*/
-inline unsigned int jiffies_to_msecs(const unsigned long j)
+unsigned int jiffies_to_msecs(const unsigned long j)
{
#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
return (MSEC_PER_SEC / HZ) * j;
@@ -256,7 +256,7 @@ inline unsigned int jiffies_to_msecs(const unsigned long j)
}
EXPORT_SYMBOL(jiffies_to_msecs);
-inline unsigned int jiffies_to_usecs(const unsigned long j)
+unsigned int jiffies_to_usecs(const unsigned long j)
{
#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
return (USEC_PER_SEC / HZ) * j;
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 30b6de0..c6d6400 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -339,6 +339,7 @@ void clockevents_config_and_register(struct clock_event_device *dev,
clockevents_config(dev, freq);
clockevents_register_device(dev);
}
+EXPORT_SYMBOL_GPL(clockevents_config_and_register);
/**
* clockevents_update_freq - Update frequency and reprogram a clock event device.
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index b10a42b..072bb06 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -23,7 +23,7 @@
* NTP timekeeping variables:
*/
-DEFINE_SPINLOCK(ntp_lock);
+DEFINE_RAW_SPINLOCK(ntp_lock);
/* USER_HZ period (usecs): */
@@ -348,7 +348,7 @@ void ntp_clear(void)
{
unsigned long flags;
- spin_lock_irqsave(&ntp_lock, flags);
+ raw_spin_lock_irqsave(&ntp_lock, flags);
time_adjust = 0; /* stop active adjtime() */
time_status |= STA_UNSYNC;
@@ -362,7 +362,7 @@ void ntp_clear(void)
/* Clear PPS state variables */
pps_clear();
- spin_unlock_irqrestore(&ntp_lock, flags);
+ raw_spin_unlock_irqrestore(&ntp_lock, flags);
}
@@ -372,9 +372,9 @@ u64 ntp_tick_length(void)
unsigned long flags;
s64 ret;
- spin_lock_irqsave(&ntp_lock, flags);
+ raw_spin_lock_irqsave(&ntp_lock, flags);
ret = tick_length;
- spin_unlock_irqrestore(&ntp_lock, flags);
+ raw_spin_unlock_irqrestore(&ntp_lock, flags);
return ret;
}
@@ -395,7 +395,7 @@ int second_overflow(unsigned long secs)
int leap = 0;
unsigned long flags;
- spin_lock_irqsave(&ntp_lock, flags);
+ raw_spin_lock_irqsave(&ntp_lock, flags);
/*
* Leap second processing. If in leap-insert state at the end of the
@@ -479,7 +479,7 @@ int second_overflow(unsigned long secs)
time_adjust = 0;
out:
- spin_unlock_irqrestore(&ntp_lock, flags);
+ raw_spin_unlock_irqrestore(&ntp_lock, flags);
return leap;
}
@@ -672,7 +672,7 @@ int do_adjtimex(struct timex *txc)
getnstimeofday(&ts);
- spin_lock_irq(&ntp_lock);
+ raw_spin_lock_irq(&ntp_lock);
if (txc->modes & ADJ_ADJTIME) {
long save_adjust = time_adjust;
@@ -714,7 +714,7 @@ int do_adjtimex(struct timex *txc)
/* fill PPS status fields */
pps_fill_timex(txc);
- spin_unlock_irq(&ntp_lock);
+ raw_spin_unlock_irq(&ntp_lock);
txc->time.tv_sec = ts.tv_sec;
txc->time.tv_usec = ts.tv_nsec;
@@ -912,7 +912,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
pts_norm = pps_normalize_ts(*phase_ts);
- spin_lock_irqsave(&ntp_lock, flags);
+ raw_spin_lock_irqsave(&ntp_lock, flags);
/* clear the error bits, they will be set again if needed */
time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
@@ -925,7 +925,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
* just start the frequency interval */
if (unlikely(pps_fbase.tv_sec == 0)) {
pps_fbase = *raw_ts;
- spin_unlock_irqrestore(&ntp_lock, flags);
+ raw_spin_unlock_irqrestore(&ntp_lock, flags);
return;
}
@@ -940,7 +940,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
time_status |= STA_PPSJITTER;
/* restart the frequency calibration interval */
pps_fbase = *raw_ts;
- spin_unlock_irqrestore(&ntp_lock, flags);
+ raw_spin_unlock_irqrestore(&ntp_lock, flags);
pr_err("hardpps: PPSJITTER: bad pulse\n");
return;
}
@@ -957,7 +957,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
hardpps_update_phase(pts_norm.nsec);
- spin_unlock_irqrestore(&ntp_lock, flags);
+ raw_spin_unlock_irqrestore(&ntp_lock, flags);
}
EXPORT_SYMBOL(hardpps);
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 2fb8cb8..7f32fe0 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -67,7 +67,8 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc)
*/
int tick_check_broadcast_device(struct clock_event_device *dev)
{
- if ((tick_broadcast_device.evtdev &&
+ if ((dev->features & CLOCK_EVT_FEAT_DUMMY) ||
+ (tick_broadcast_device.evtdev &&
tick_broadcast_device.evtdev->rating >= dev->rating) ||
(dev->features & CLOCK_EVT_FEAT_C3STOP))
return 0;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 314b9ee..a19a399 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -554,6 +554,7 @@ void tick_nohz_idle_enter(void)
local_irq_enable();
}
+EXPORT_SYMBOL_GPL(tick_nohz_idle_enter);
/**
* tick_nohz_irq_exit - update next tick event from interrupt exit
@@ -685,6 +686,7 @@ void tick_nohz_idle_exit(void)
local_irq_enable();
}
+EXPORT_SYMBOL_GPL(tick_nohz_idle_exit);
static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
{
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 1e35515..9a0bc98 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -138,6 +138,20 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
}
/* Timekeeper helper functions. */
+
+#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
+u32 (*arch_gettimeoffset)(void);
+
+u32 get_arch_timeoffset(void)
+{
+ if (likely(arch_gettimeoffset))
+ return arch_gettimeoffset();
+ return 0;
+}
+#else
+static inline u32 get_arch_timeoffset(void) { return 0; }
+#endif
+
static inline s64 timekeeping_get_ns(struct timekeeper *tk)
{
cycle_t cycle_now, cycle_delta;
@@ -154,8 +168,8 @@ static inline s64 timekeeping_get_ns(struct timekeeper *tk)
nsec = cycle_delta * tk->mult + tk->xtime_nsec;
nsec >>= tk->shift;
- /* If arch requires, add in gettimeoffset() */
- return nsec + arch_gettimeoffset();
+ /* If arch requires, add in get_arch_timeoffset() */
+ return nsec + get_arch_timeoffset();
}
static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
@@ -174,8 +188,8 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
/* convert delta to nanoseconds. */
nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
- /* If arch requires, add in gettimeoffset() */
- return nsec + arch_gettimeoffset();
+ /* If arch requires, add in get_arch_timeoffset() */
+ return nsec + get_arch_timeoffset();
}
static RAW_NOTIFIER_HEAD(pvclock_gtod_chain);
@@ -257,8 +271,8 @@ static void timekeeping_forward_now(struct timekeeper *tk)
tk->xtime_nsec += cycle_delta * tk->mult;
- /* If arch requires, add in gettimeoffset() */
- tk->xtime_nsec += (u64)arch_gettimeoffset() << tk->shift;
+ /* If arch requires, add in get_arch_timeoffset() */
+ tk->xtime_nsec += (u64)get_arch_timeoffset() << tk->shift;
tk_normalize_xtime(tk);
diff --git a/kernel/timeconst.bc b/kernel/timeconst.bc
new file mode 100644
index 0000000..511bdf2
--- /dev/null
+++ b/kernel/timeconst.bc
@@ -0,0 +1,108 @@
+scale=0
+
+define gcd(a,b) {
+ auto t;
+ while (b) {
+ t = b;
+ b = a % b;
+ a = t;
+ }
+ return a;
+}
+
+/* Division by reciprocal multiplication. */
+define fmul(b,n,d) {
+ return (2^b*n+d-1)/d;
+}
+
+/* Adjustment factor when a ceiling value is used. Use as:
+ (imul * n) + (fmulxx * n + fadjxx) >> xx) */
+define fadj(b,n,d) {
+ auto v;
+ d = d/gcd(n,d);
+ v = 2^b*(d-1)/d;
+ return v;
+}
+
+/* Compute the appropriate mul/adj values as well as a shift count,
+ which brings the mul value into the range 2^b-1 <= x < 2^b. Such
+ a shift value will be correct in the signed integer range and off
+ by at most one in the upper half of the unsigned range. */
+define fmuls(b,n,d) {
+ auto s, m;
+ for (s = 0; 1; s++) {
+ m = fmul(s,n,d);
+ if (m >= 2^(b-1))
+ return s;
+ }
+ return 0;
+}
+
+define timeconst(hz) {
+ print "/* Automatically generated by kernel/timeconst.bc */\n"
+ print "/* Time conversion constants for HZ == ", hz, " */\n"
+ print "\n"
+
+ print "#ifndef KERNEL_TIMECONST_H\n"
+ print "#define KERNEL_TIMECONST_H\n\n"
+
+ print "#include <linux/param.h>\n"
+ print "#include <linux/types.h>\n\n"
+
+ print "#if HZ != ", hz, "\n"
+ print "#error \qkernel/timeconst.h has the wrong HZ value!\q\n"
+ print "#endif\n\n"
+
+ if (hz < 2) {
+ print "#error Totally bogus HZ value!\n"
+ } else {
+ s=fmuls(32,1000,hz)
+ obase=16
+ print "#define HZ_TO_MSEC_MUL32\tU64_C(0x", fmul(s,1000,hz), ")\n"
+ print "#define HZ_TO_MSEC_ADJ32\tU64_C(0x", fadj(s,1000,hz), ")\n"
+ obase=10
+ print "#define HZ_TO_MSEC_SHR32\t", s, "\n"
+
+ s=fmuls(32,hz,1000)
+ obase=16
+ print "#define MSEC_TO_HZ_MUL32\tU64_C(0x", fmul(s,hz,1000), ")\n"
+ print "#define MSEC_TO_HZ_ADJ32\tU64_C(0x", fadj(s,hz,1000), ")\n"
+ obase=10
+ print "#define MSEC_TO_HZ_SHR32\t", s, "\n"
+
+ obase=10
+ cd=gcd(hz,1000)
+ print "#define HZ_TO_MSEC_NUM\t\t", 1000/cd, "\n"
+ print "#define HZ_TO_MSEC_DEN\t\t", hz/cd, "\n"
+ print "#define MSEC_TO_HZ_NUM\t\t", hz/cd, "\n"
+ print "#define MSEC_TO_HZ_DEN\t\t", 1000/cd, "\n"
+ print "\n"
+
+ s=fmuls(32,1000000,hz)
+ obase=16
+ print "#define HZ_TO_USEC_MUL32\tU64_C(0x", fmul(s,1000000,hz), ")\n"
+ print "#define HZ_TO_USEC_ADJ32\tU64_C(0x", fadj(s,1000000,hz), ")\n"
+ obase=10
+ print "#define HZ_TO_USEC_SHR32\t", s, "\n"
+
+ s=fmuls(32,hz,1000000)
+ obase=16
+ print "#define USEC_TO_HZ_MUL32\tU64_C(0x", fmul(s,hz,1000000), ")\n"
+ print "#define USEC_TO_HZ_ADJ32\tU64_C(0x", fadj(s,hz,1000000), ")\n"
+ obase=10
+ print "#define USEC_TO_HZ_SHR32\t", s, "\n"
+
+ obase=10
+ cd=gcd(hz,1000000)
+ print "#define HZ_TO_USEC_NUM\t\t", 1000000/cd, "\n"
+ print "#define HZ_TO_USEC_DEN\t\t", hz/cd, "\n"
+ print "#define USEC_TO_HZ_NUM\t\t", hz/cd, "\n"
+ print "#define USEC_TO_HZ_DEN\t\t", 1000000/cd, "\n"
+ print "\n"
+
+ print "#endif /* KERNEL_TIMECONST_H */\n"
+ }
+ halt
+}
+
+timeconst(hz)
diff --git a/kernel/timeconst.pl b/kernel/timeconst.pl
deleted file mode 100644
index 3f42652..0000000
--- a/kernel/timeconst.pl
+++ /dev/null
@@ -1,376 +0,0 @@
-#!/usr/bin/perl
-# -----------------------------------------------------------------------
-#
-# Copyright 2007-2008 rPath, Inc. - All Rights Reserved
-#
-# This file is part of the Linux kernel, and is made available under
-# the terms of the GNU General Public License version 2 or (at your
-# option) any later version; incorporated herein by reference.
-#
-# -----------------------------------------------------------------------
-#
-
-#
-# Usage: timeconst.pl HZ > timeconst.h
-#
-
-# Precomputed values for systems without Math::BigInt
-# Generated by:
-# timeconst.pl --can 24 32 48 64 100 122 128 200 250 256 300 512 1000 1024 1200
-%canned_values = (
- 24 => [
- '0xa6aaaaab','0x2aaaaaa',26,
- 125,3,
- '0xc49ba5e4','0x1fbe76c8b4',37,
- 3,125,
- '0xa2c2aaab','0xaaaa',16,
- 125000,3,
- '0xc9539b89','0x7fffbce4217d',47,
- 3,125000,
- ], 32 => [
- '0xfa000000','0x6000000',27,
- 125,4,
- '0x83126e98','0xfdf3b645a',36,
- 4,125,
- '0xf4240000','0x0',17,
- 31250,1,
- '0x8637bd06','0x3fff79c842fa',46,
- 1,31250,
- ], 48 => [
- '0xa6aaaaab','0x6aaaaaa',27,
- 125,6,
- '0xc49ba5e4','0xfdf3b645a',36,
- 6,125,
- '0xa2c2aaab','0x15555',17,
- 62500,3,
- '0xc9539b89','0x3fffbce4217d',46,
- 3,62500,
- ], 64 => [
- '0xfa000000','0xe000000',28,
- 125,8,
- '0x83126e98','0x7ef9db22d',35,
- 8,125,
- '0xf4240000','0x0',18,
- 15625,1,
- '0x8637bd06','0x1fff79c842fa',45,
- 1,15625,
- ], 100 => [
- '0xa0000000','0x0',28,
- 10,1,
- '0xcccccccd','0x733333333',35,
- 1,10,
- '0x9c400000','0x0',18,
- 10000,1,
- '0xd1b71759','0x1fff2e48e8a7',45,
- 1,10000,
- ], 122 => [
- '0x8325c53f','0xfbcda3a',28,
- 500,61,
- '0xf9db22d1','0x7fbe76c8b',35,
- 61,500,
- '0x8012e2a0','0x3ef36',18,
- 500000,61,
- '0xffda4053','0x1ffffbce4217',45,
- 61,500000,
- ], 128 => [
- '0xfa000000','0x1e000000',29,
- 125,16,
- '0x83126e98','0x3f7ced916',34,
- 16,125,
- '0xf4240000','0x40000',19,
- 15625,2,
- '0x8637bd06','0xfffbce4217d',44,
- 2,15625,
- ], 200 => [
- '0xa0000000','0x0',29,
- 5,1,
- '0xcccccccd','0x333333333',34,
- 1,5,
- '0x9c400000','0x0',19,
- 5000,1,
- '0xd1b71759','0xfff2e48e8a7',44,
- 1,5000,
- ], 250 => [
- '0x80000000','0x0',29,
- 4,1,
- '0x80000000','0x180000000',33,
- 1,4,
- '0xfa000000','0x0',20,
- 4000,1,
- '0x83126e98','0x7ff7ced9168',43,
- 1,4000,
- ], 256 => [
- '0xfa000000','0x3e000000',30,
- 125,32,
- '0x83126e98','0x1fbe76c8b',33,
- 32,125,
- '0xf4240000','0xc0000',20,
- 15625,4,
- '0x8637bd06','0x7ffde7210be',43,
- 4,15625,
- ], 300 => [
- '0xd5555556','0x2aaaaaaa',30,
- 10,3,
- '0x9999999a','0x1cccccccc',33,
- 3,10,
- '0xd0555556','0xaaaaa',20,
- 10000,3,
- '0x9d495183','0x7ffcb923a29',43,
- 3,10000,
- ], 512 => [
- '0xfa000000','0x7e000000',31,
- 125,64,
- '0x83126e98','0xfdf3b645',32,
- 64,125,
- '0xf4240000','0x1c0000',21,
- 15625,8,
- '0x8637bd06','0x3ffef39085f',42,
- 8,15625,
- ], 1000 => [
- '0x80000000','0x0',31,
- 1,1,
- '0x80000000','0x0',31,
- 1,1,
- '0xfa000000','0x0',22,
- 1000,1,
- '0x83126e98','0x1ff7ced9168',41,
- 1,1000,
- ], 1024 => [
- '0xfa000000','0xfe000000',32,
- 125,128,
- '0x83126e98','0x7ef9db22',31,
- 128,125,
- '0xf4240000','0x3c0000',22,
- 15625,16,
- '0x8637bd06','0x1fff79c842f',41,
- 16,15625,
- ], 1200 => [
- '0xd5555556','0xd5555555',32,
- 5,6,
- '0x9999999a','0x66666666',31,
- 6,5,
- '0xd0555556','0x2aaaaa',22,
- 2500,3,
- '0x9d495183','0x1ffcb923a29',41,
- 3,2500,
- ]
-);
-
-$has_bigint = eval 'use Math::BigInt qw(bgcd); 1;';
-
-sub bint($)
-{
- my($x) = @_;
- return Math::BigInt->new($x);
-}
-
-#
-# Constants for division by reciprocal multiplication.
-# (bits, numerator, denominator)
-#
-sub fmul($$$)
-{
- my ($b,$n,$d) = @_;
-
- $n = bint($n);
- $d = bint($d);
-
- return scalar (($n << $b)+$d-bint(1))/$d;
-}
-
-sub fadj($$$)
-{
- my($b,$n,$d) = @_;
-
- $n = bint($n);
- $d = bint($d);
-
- $d = $d/bgcd($n, $d);
- return scalar (($d-bint(1)) << $b)/$d;
-}
-
-sub fmuls($$$) {
- my($b,$n,$d) = @_;
- my($s,$m);
- my($thres) = bint(1) << ($b-1);
-
- $n = bint($n);
- $d = bint($d);
-
- for ($s = 0; 1; $s++) {
- $m = fmul($s,$n,$d);
- return $s if ($m >= $thres);
- }
- return 0;
-}
-
-# Generate a hex value if the result fits in 64 bits;
-# otherwise skip.
-sub bignum_hex($) {
- my($x) = @_;
- my $s = $x->as_hex();
-
- return (length($s) > 18) ? undef : $s;
-}
-
-# Provides mul, adj, and shr factors for a specific
-# (bit, time, hz) combination
-sub muladj($$$) {
- my($b, $t, $hz) = @_;
- my $s = fmuls($b, $t, $hz);
- my $m = fmul($s, $t, $hz);
- my $a = fadj($s, $t, $hz);
- return (bignum_hex($m), bignum_hex($a), $s);
-}
-
-# Provides numerator, denominator values
-sub numden($$) {
- my($n, $d) = @_;
- my $g = bgcd($n, $d);
- return ($n/$g, $d/$g);
-}
-
-# All values for a specific (time, hz) combo
-sub conversions($$) {
- my ($t, $hz) = @_;
- my @val = ();
-
- # HZ_TO_xx
- push(@val, muladj(32, $t, $hz));
- push(@val, numden($t, $hz));
-
- # xx_TO_HZ
- push(@val, muladj(32, $hz, $t));
- push(@val, numden($hz, $t));
-
- return @val;
-}
-
-sub compute_values($) {
- my($hz) = @_;
- my @val = ();
- my $s, $m, $a, $g;
-
- if (!$has_bigint) {
- die "$0: HZ == $hz not canned and ".
- "Math::BigInt not available\n";
- }
-
- # MSEC conversions
- push(@val, conversions(1000, $hz));
-
- # USEC conversions
- push(@val, conversions(1000000, $hz));
-
- return @val;
-}
-
-sub outputval($$)
-{
- my($name, $val) = @_;
- my $csuf;
-
- if (defined($val)) {
- if ($name !~ /SHR/) {
- $val = "U64_C($val)";
- }
- printf "#define %-23s %s\n", $name.$csuf, $val.$csuf;
- }
-}
-
-sub output($@)
-{
- my($hz, @val) = @_;
- my $pfx, $bit, $suf, $s, $m, $a;
-
- print "/* Automatically generated by kernel/timeconst.pl */\n";
- print "/* Conversion constants for HZ == $hz */\n";
- print "\n";
- print "#ifndef KERNEL_TIMECONST_H\n";
- print "#define KERNEL_TIMECONST_H\n";
- print "\n";
-
- print "#include <linux/param.h>\n";
- print "#include <linux/types.h>\n";
-
- print "\n";
- print "#if HZ != $hz\n";
- print "#error \"kernel/timeconst.h has the wrong HZ value!\"\n";
- print "#endif\n";
- print "\n";
-
- foreach $pfx ('HZ_TO_MSEC','MSEC_TO_HZ',
- 'HZ_TO_USEC','USEC_TO_HZ') {
- foreach $bit (32) {
- foreach $suf ('MUL', 'ADJ', 'SHR') {
- outputval("${pfx}_$suf$bit", shift(@val));
- }
- }
- foreach $suf ('NUM', 'DEN') {
- outputval("${pfx}_$suf", shift(@val));
- }
- }
-
- print "\n";
- print "#endif /* KERNEL_TIMECONST_H */\n";
-}
-
-# Pretty-print Perl values
-sub perlvals(@) {
- my $v;
- my @l = ();
-
- foreach $v (@_) {
- if (!defined($v)) {
- push(@l, 'undef');
- } elsif ($v =~ /^0x/) {
- push(@l, "\'".$v."\'");
- } else {
- push(@l, $v.'');
- }
- }
- return join(',', @l);
-}
-
-($hz) = @ARGV;
-
-# Use this to generate the %canned_values structure
-if ($hz eq '--can') {
- shift(@ARGV);
- @hzlist = sort {$a <=> $b} (@ARGV);
-
- print "# Precomputed values for systems without Math::BigInt\n";
- print "# Generated by:\n";
- print "# timeconst.pl --can ", join(' ', @hzlist), "\n";
- print "\%canned_values = (\n";
- my $pf = "\t";
- foreach $hz (@hzlist) {
- my @values = compute_values($hz);
- print "$pf$hz => [\n";
- while (scalar(@values)) {
- my $bit;
- foreach $bit (32) {
- my $m = shift(@values);
- my $a = shift(@values);
- my $s = shift(@values);
- print "\t\t", perlvals($m,$a,$s), ",\n";
- }
- my $n = shift(@values);
- my $d = shift(@values);
- print "\t\t", perlvals($n,$d), ",\n";
- }
- print "\t]";
- $pf = ', ';
- }
- print "\n);\n";
-} else {
- $hz += 0; # Force to number
- if ($hz < 1) {
- die "Usage: $0 HZ\n";
- }
-
- $cv = $canned_values{$hz};
- @val = defined($cv) ? @$cv : compute_values($hz);
- output($hz, @val);
-}
-exit 0;
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 0b5ecf5..5e9efd4 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -81,21 +81,6 @@ config EVENT_TRACING
select CONTEXT_SWITCH_TRACER
bool
-config EVENT_POWER_TRACING_DEPRECATED
- depends on EVENT_TRACING
- bool "Deprecated power event trace API, to be removed"
- default y
- help
- Provides old power event types:
- C-state/idle accounting events:
- power:power_start
- power:power_end
- and old cpufreq accounting event:
- power:power_frequency
- This is for userspace compatibility
- and will vanish after 5 kernel iterations,
- namely 3.1.
-
config CONTEXT_SWITCH_TRACER
bool
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 90a5505..ed58a32 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -783,6 +783,7 @@ static void blk_add_trace_bio_complete(void *ignore,
static void blk_add_trace_bio_backmerge(void *ignore,
struct request_queue *q,
+ struct request *rq,
struct bio *bio)
{
blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);
@@ -790,6 +791,7 @@ static void blk_add_trace_bio_backmerge(void *ignore,
static void blk_add_trace_bio_frontmerge(void *ignore,
struct request_queue *q,
+ struct request *rq,
struct bio *bio)
{
blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 9b44abb..8a5c017 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -66,7 +66,7 @@
static struct ftrace_ops ftrace_list_end __read_mostly = {
.func = ftrace_stub,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB,
};
/* ftrace_enabled is a method to turn ftrace on or off */
@@ -694,7 +694,6 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
free_page(tmp);
}
- free_page((unsigned long)stat->pages);
stat->pages = NULL;
stat->start = NULL;
@@ -755,7 +754,6 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)
{
struct ftrace_profile *rec;
struct hlist_head *hhd;
- struct hlist_node *n;
unsigned long key;
key = hash_long(ip, FTRACE_PROFILE_HASH_BITS);
@@ -764,7 +762,7 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)
if (hlist_empty(hhd))
return NULL;
- hlist_for_each_entry_rcu(rec, n, hhd, node) {
+ hlist_for_each_entry_rcu(rec, hhd, node) {
if (rec->ip == ip)
return rec;
}
@@ -1047,6 +1045,19 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
static struct pid * const ftrace_swapper_pid = &init_struct_pid;
+loff_t
+ftrace_filter_lseek(struct file *file, loff_t offset, int whence)
+{
+ loff_t ret;
+
+ if (file->f_mode & FMODE_READ)
+ ret = seq_lseek(file, offset, whence);
+ else
+ file->f_pos = ret = 1;
+
+ return ret;
+}
+
#ifdef CONFIG_DYNAMIC_FTRACE
#ifndef CONFIG_FTRACE_MCOUNT_RECORD
@@ -1126,7 +1137,6 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
unsigned long key;
struct ftrace_func_entry *entry;
struct hlist_head *hhd;
- struct hlist_node *n;
if (ftrace_hash_empty(hash))
return NULL;
@@ -1138,7 +1148,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
hhd = &hash->buckets[key];
- hlist_for_each_entry_rcu(entry, n, hhd, hlist) {
+ hlist_for_each_entry_rcu(entry, hhd, hlist) {
if (entry->ip == ip)
return entry;
}
@@ -1195,7 +1205,7 @@ remove_hash_entry(struct ftrace_hash *hash,
static void ftrace_hash_clear(struct ftrace_hash *hash)
{
struct hlist_head *hhd;
- struct hlist_node *tp, *tn;
+ struct hlist_node *tn;
struct ftrace_func_entry *entry;
int size = 1 << hash->size_bits;
int i;
@@ -1205,7 +1215,7 @@ static void ftrace_hash_clear(struct ftrace_hash *hash)
for (i = 0; i < size; i++) {
hhd = &hash->buckets[i];
- hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist)
+ hlist_for_each_entry_safe(entry, tn, hhd, hlist)
free_hash_entry(hash, entry);
}
FTRACE_WARN_ON(hash->count);
@@ -1268,7 +1278,6 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
{
struct ftrace_func_entry *entry;
struct ftrace_hash *new_hash;
- struct hlist_node *tp;
int size;
int ret;
int i;
@@ -1283,7 +1292,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
size = 1 << hash->size_bits;
for (i = 0; i < size; i++) {
- hlist_for_each_entry(entry, tp, &hash->buckets[i], hlist) {
+ hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
ret = add_hash_entry(new_hash, entry->ip);
if (ret < 0)
goto free_hash;
@@ -1309,7 +1318,7 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
struct ftrace_hash **dst, struct ftrace_hash *src)
{
struct ftrace_func_entry *entry;
- struct hlist_node *tp, *tn;
+ struct hlist_node *tn;
struct hlist_head *hhd;
struct ftrace_hash *old_hash;
struct ftrace_hash *new_hash;
@@ -1354,7 +1363,7 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
size = 1 << src->size_bits;
for (i = 0; i < size; i++) {
hhd = &src->buckets[i];
- hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) {
+ hlist_for_each_entry_safe(entry, tn, hhd, hlist) {
remove_hash_entry(src, entry);
__add_hash_entry(new_hash, entry);
}
@@ -2604,7 +2613,7 @@ static void ftrace_filter_reset(struct ftrace_hash *hash)
* routine, you can use ftrace_filter_write() for the write
* routine if @flag has FTRACE_ITER_FILTER set, or
* ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set.
- * ftrace_regex_lseek() should be used as the lseek routine, and
+ * ftrace_filter_lseek() should be used as the lseek routine, and
* release must call ftrace_regex_release().
*/
int
@@ -2688,19 +2697,6 @@ ftrace_notrace_open(struct inode *inode, struct file *file)
inode, file);
}
-loff_t
-ftrace_regex_lseek(struct file *file, loff_t offset, int whence)
-{
- loff_t ret;
-
- if (file->f_mode & FMODE_READ)
- ret = seq_lseek(file, offset, whence);
- else
- file->f_pos = ret = 1;
-
- return ret;
-}
-
static int ftrace_match(char *str, char *regex, int len, int type)
{
int matched = 0;
@@ -2889,7 +2885,6 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
{
struct ftrace_func_probe *entry;
struct hlist_head *hhd;
- struct hlist_node *n;
unsigned long key;
key = hash_long(ip, FTRACE_HASH_BITS);
@@ -2905,7 +2900,7 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
* on the hash. rcu_read_lock is too dangerous here.
*/
preempt_disable_notrace();
- hlist_for_each_entry_rcu(entry, n, hhd, node) {
+ hlist_for_each_entry_rcu(entry, hhd, node) {
if (entry->ip == ip)
entry->ops->func(ip, parent_ip, &entry->data);
}
@@ -3081,7 +3076,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash;
struct list_head free_list;
struct ftrace_hash *hash;
- struct hlist_node *n, *tmp;
+ struct hlist_node *tmp;
char str[KSYM_SYMBOL_LEN];
int type = MATCH_FULL;
int i, len = 0;
@@ -3112,7 +3107,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
struct hlist_head *hhd = &ftrace_func_hash[i];
- hlist_for_each_entry_safe(entry, n, tmp, hhd, node) {
+ hlist_for_each_entry_safe(entry, tmp, hhd, node) {
/* break up if statements for readability */
if ((flags & PROBE_TEST_FUNC) && entry->ops != ops)
@@ -3614,7 +3609,7 @@ static const struct file_operations ftrace_filter_fops = {
.open = ftrace_filter_open,
.read = seq_read,
.write = ftrace_filter_write,
- .llseek = ftrace_regex_lseek,
+ .llseek = ftrace_filter_lseek,
.release = ftrace_regex_release,
};
@@ -3622,7 +3617,7 @@ static const struct file_operations ftrace_notrace_fops = {
.open = ftrace_notrace_open,
.read = seq_read,
.write = ftrace_notrace_write,
- .llseek = ftrace_regex_lseek,
+ .llseek = ftrace_filter_lseek,
.release = ftrace_regex_release,
};
@@ -3828,8 +3823,8 @@ static const struct file_operations ftrace_graph_fops = {
.open = ftrace_graph_open,
.read = seq_read,
.write = ftrace_graph_write,
+ .llseek = ftrace_filter_lseek,
.release = ftrace_graph_release,
- .llseek = seq_lseek,
};
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
@@ -4175,7 +4170,8 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
preempt_disable_notrace();
trace_recursion_set(TRACE_CONTROL_BIT);
do_for_each_ftrace_op(op, ftrace_control_list) {
- if (!ftrace_function_local_disabled(op) &&
+ if (!(op->flags & FTRACE_OPS_FL_STUB) &&
+ !ftrace_function_local_disabled(op) &&
ftrace_ops_test(op, ip))
op->func(ip, parent_ip, op, regs);
} while_for_each_ftrace_op(op);
@@ -4483,7 +4479,7 @@ static const struct file_operations ftrace_pid_fops = {
.open = ftrace_pid_open,
.write = ftrace_pid_write,
.read = seq_read,
- .llseek = seq_lseek,
+ .llseek = ftrace_filter_lseek,
.release = ftrace_pid_release,
};
@@ -4599,12 +4595,8 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
ftrace_startup_sysctl();
/* we are starting ftrace again */
- if (ftrace_ops_list != &ftrace_list_end) {
- if (ftrace_ops_list->next == &ftrace_list_end)
- ftrace_trace_function = ftrace_ops_list->func;
- else
- ftrace_trace_function = ftrace_ops_list_func;
- }
+ if (ftrace_ops_list != &ftrace_list_end)
+ update_ftrace_function();
} else {
/* stopping ftrace calls (just send to ftrace_stub) */
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index f55fcf6..1c71382 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -13,8 +13,5 @@
#define CREATE_TRACE_POINTS
#include <trace/events/power.h>
-#ifdef EVENT_POWER_TRACING_DEPRECATED
-EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
-#endif
EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index e5472f7..b59aea2 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -181,7 +181,7 @@ void tracing_off_permanent(void)
#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */
-#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
+#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS
# define RB_FORCE_8BYTE_ALIGNMENT 0
# define RB_ARCH_ALIGNMENT RB_ALIGNMENT
#else
@@ -189,6 +189,8 @@ void tracing_off_permanent(void)
# define RB_ARCH_ALIGNMENT 8U
#endif
+#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT)
+
/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
@@ -337,7 +339,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
struct buffer_data_page {
u64 time_stamp; /* page time stamp */
local_t commit; /* write committed index */
- unsigned char data[]; /* data of buffer page */
+ unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */
};
/*
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 7297079..581630a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -904,8 +904,11 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
return;
WARN_ON_ONCE(!irqs_disabled());
- if (WARN_ON_ONCE(!tr->allocated_snapshot))
+ if (tr->allocated_snapshot) {
+ /* Only the nop tracer should hit this when disabling */
+ WARN_ON_ONCE(tr->current_trace != &nop_trace);
return;
+ }
arch_spin_lock(&ftrace_max_lock);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index f475b2a..bb922d9 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -781,12 +781,11 @@ static int task_state_char(unsigned long state)
struct trace_event *ftrace_find_event(int type)
{
struct trace_event *event;
- struct hlist_node *n;
unsigned key;
key = type & (EVENT_HASHSIZE - 1);
- hlist_for_each_entry(event, n, &event_hash[key], node) {
+ hlist_for_each_entry(event, &event_hash[key], node) {
if (event->type == type)
return event;
}
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 8c3f37e..b20428c 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -382,7 +382,7 @@ static const struct file_operations stack_trace_filter_fops = {
.open = stack_trace_filter_open,
.read = seq_read,
.write = ftrace_filter_write,
- .llseek = ftrace_regex_lseek,
+ .llseek = ftrace_filter_lseek,
.release = ftrace_regex_release,
};
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 99e7e31..29f2654 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -197,12 +197,11 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry,
static struct tracepoint_entry *get_tracepoint(const char *name)
{
struct hlist_head *head;
- struct hlist_node *node;
struct tracepoint_entry *e;
u32 hash = jhash(name, strlen(name), 0);
head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)];
- hlist_for_each_entry(e, node, head, hlist) {
+ hlist_for_each_entry(e, head, hlist) {
if (!strcmp(name, e->name))
return e;
}
@@ -216,13 +215,12 @@ static struct tracepoint_entry *get_tracepoint(const char *name)
static struct tracepoint_entry *add_tracepoint(const char *name)
{
struct hlist_head *head;
- struct hlist_node *node;
struct tracepoint_entry *e;
size_t name_len = strlen(name) + 1;
u32 hash = jhash(name, name_len-1, 0);
head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)];
- hlist_for_each_entry(e, node, head, hlist) {
+ hlist_for_each_entry(e, head, hlist) {
if (!strcmp(name, e->name)) {
printk(KERN_NOTICE
"tracepoint %s busy\n", name);
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
index 1744bb8..394f70b 100644
--- a/kernel/user-return-notifier.c
+++ b/kernel/user-return-notifier.c
@@ -34,11 +34,11 @@ EXPORT_SYMBOL_GPL(user_return_notifier_unregister);
void fire_user_return_notifiers(void)
{
struct user_return_notifier *urn;
- struct hlist_node *tmp1, *tmp2;
+ struct hlist_node *tmp2;
struct hlist_head *head;
head = &get_cpu_var(return_notifier_list);
- hlist_for_each_entry_safe(urn, tmp1, tmp2, head, link)
+ hlist_for_each_entry_safe(urn, tmp2, head, link)
urn->on_user_return(urn);
put_cpu_var(return_notifier_list);
}
diff --git a/kernel/user.c b/kernel/user.c
index 33acb5e..8e635a1 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -47,12 +47,12 @@ struct user_namespace init_user_ns = {
.count = 4294967295U,
},
},
- .kref = {
- .refcount = ATOMIC_INIT(3),
- },
+ .count = ATOMIC_INIT(3),
.owner = GLOBAL_ROOT_UID,
.group = GLOBAL_ROOT_GID,
.proc_inum = PROC_USER_INIT_INO,
+ .may_mount_sysfs = true,
+ .may_mount_proc = true,
};
EXPORT_SYMBOL_GPL(init_user_ns);
@@ -107,9 +107,8 @@ static void uid_hash_remove(struct user_struct *up)
static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent)
{
struct user_struct *user;
- struct hlist_node *h;
- hlist_for_each_entry(user, h, hashent, uidhash_node) {
+ hlist_for_each_entry(user, hashent, uidhash_node) {
if (uid_eq(user->uid, uid)) {
atomic_inc(&user->__count);
return user;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 2b042c4..e134d8f 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -21,10 +21,12 @@
#include <linux/uaccess.h>
#include <linux/ctype.h>
#include <linux/projid.h>
+#include <linux/fs_struct.h>
static struct kmem_cache *user_ns_cachep __read_mostly;
-static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
+static bool new_idmap_permitted(const struct file *file,
+ struct user_namespace *ns, int cap_setid,
struct uid_gid_map *map);
static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
@@ -60,6 +62,15 @@ int create_user_ns(struct cred *new)
kgid_t group = new->egid;
int ret;
+ /*
+ * Verify that we can not violate the policy of which files
+ * may be accessed that is specified by the root directory,
+ * by verifing that the root directory is at the root of the
+ * mount namespace which allows all files to be accessed.
+ */
+ if (current_chrooted())
+ return -EPERM;
+
/* The creator needs a mapping in the parent user namespace
* or else we won't be able to reasonably tell userspace who
* created a user_namespace.
@@ -78,7 +89,7 @@ int create_user_ns(struct cred *new)
return ret;
}
- kref_init(&ns->kref);
+ atomic_set(&ns->count, 1);
/* Leave the new->user_ns reference with the new user namespace. */
ns->parent = parent_ns;
ns->owner = owner;
@@ -86,6 +97,8 @@ int create_user_ns(struct cred *new)
set_cred_user_ns(new, ns);
+ update_mnt_policy(ns);
+
return 0;
}
@@ -104,15 +117,16 @@ int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
return create_user_ns(cred);
}
-void free_user_ns(struct kref *kref)
+void free_user_ns(struct user_namespace *ns)
{
- struct user_namespace *parent, *ns =
- container_of(kref, struct user_namespace, kref);
+ struct user_namespace *parent;
- parent = ns->parent;
- proc_free_inum(ns->proc_inum);
- kmem_cache_free(user_ns_cachep, ns);
- put_user_ns(parent);
+ do {
+ parent = ns->parent;
+ proc_free_inum(ns->proc_inum);
+ kmem_cache_free(user_ns_cachep, ns);
+ ns = parent;
+ } while (atomic_dec_and_test(&parent->count));
}
EXPORT_SYMBOL(free_user_ns);
@@ -519,6 +533,42 @@ struct seq_operations proc_projid_seq_operations = {
.show = projid_m_show,
};
+static bool mappings_overlap(struct uid_gid_map *new_map, struct uid_gid_extent *extent)
+{
+ u32 upper_first, lower_first, upper_last, lower_last;
+ unsigned idx;
+
+ upper_first = extent->first;
+ lower_first = extent->lower_first;
+ upper_last = upper_first + extent->count - 1;
+ lower_last = lower_first + extent->count - 1;
+
+ for (idx = 0; idx < new_map->nr_extents; idx++) {
+ u32 prev_upper_first, prev_lower_first;
+ u32 prev_upper_last, prev_lower_last;
+ struct uid_gid_extent *prev;
+
+ prev = &new_map->extent[idx];
+
+ prev_upper_first = prev->first;
+ prev_lower_first = prev->lower_first;
+ prev_upper_last = prev_upper_first + prev->count - 1;
+ prev_lower_last = prev_lower_first + prev->count - 1;
+
+ /* Does the upper range intersect a previous extent? */
+ if ((prev_upper_first <= upper_last) &&
+ (prev_upper_last >= upper_first))
+ return true;
+
+ /* Does the lower range intersect a previous extent? */
+ if ((prev_lower_first <= lower_last) &&
+ (prev_lower_last >= lower_first))
+ return true;
+ }
+ return false;
+}
+
+
static DEFINE_MUTEX(id_map_mutex);
static ssize_t map_write(struct file *file, const char __user *buf,
@@ -531,7 +581,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
struct user_namespace *ns = seq->private;
struct uid_gid_map new_map;
unsigned idx;
- struct uid_gid_extent *extent, *last = NULL;
+ struct uid_gid_extent *extent = NULL;
unsigned long page = 0;
char *kbuf, *pos, *next_line;
ssize_t ret = -EINVAL;
@@ -563,10 +613,10 @@ static ssize_t map_write(struct file *file, const char __user *buf,
if (map->nr_extents != 0)
goto out;
- /* Require the appropriate privilege CAP_SETUID or CAP_SETGID
- * over the user namespace in order to set the id mapping.
+ /*
+ * Adjusting namespace settings requires capabilities on the target.
*/
- if (cap_valid(cap_setid) && !ns_capable(ns, cap_setid))
+ if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN))
goto out;
/* Get a buffer */
@@ -634,14 +684,11 @@ static ssize_t map_write(struct file *file, const char __user *buf,
if ((extent->lower_first + extent->count) <= extent->lower_first)
goto out;
- /* For now only accept extents that are strictly in order */
- if (last &&
- (((last->first + last->count) > extent->first) ||
- ((last->lower_first + last->count) > extent->lower_first)))
+ /* Do the ranges in extent overlap any previous extents? */
+ if (mappings_overlap(&new_map, extent))
goto out;
new_map.nr_extents++;
- last = extent;
/* Fail if the file contains too many extents */
if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) &&
@@ -654,7 +701,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
ret = -EPERM;
/* Validate the user is allowed to use user id's mapped to. */
- if (!new_idmap_permitted(ns, cap_setid, &new_map))
+ if (!new_idmap_permitted(file, ns, cap_setid, &new_map))
goto out;
/* Map the lower ids from the parent user namespace to the
@@ -741,7 +788,8 @@ ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t
&ns->projid_map, &ns->parent->projid_map);
}
-static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
+static bool new_idmap_permitted(const struct file *file,
+ struct user_namespace *ns, int cap_setid,
struct uid_gid_map *new_map)
{
/* Allow mapping to your own filesystem ids */
@@ -749,12 +797,12 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
u32 id = new_map->extent[0].lower_first;
if (cap_setid == CAP_SETUID) {
kuid_t uid = make_kuid(ns->parent, id);
- if (uid_eq(uid, current_fsuid()))
+ if (uid_eq(uid, file->f_cred->fsuid))
return true;
}
else if (cap_setid == CAP_SETGID) {
kgid_t gid = make_kgid(ns->parent, id);
- if (gid_eq(gid, current_fsgid()))
+ if (gid_eq(gid, file->f_cred->fsgid))
return true;
}
}
@@ -765,8 +813,10 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
/* Allow the specified ids if we have the appropriate capability
* (CAP_SETUID or CAP_SETGID) over the parent user namespace.
+ * And the opener of the id file also had the approprpiate capability.
*/
- if (ns_capable(ns->parent, cap_setid))
+ if (ns_capable(ns->parent, cap_setid) &&
+ file_ns_capable(file, ns->parent, cap_setid))
return true;
return false;
@@ -803,6 +853,9 @@ static int userns_install(struct nsproxy *nsproxy, void *ns)
if (atomic_read(&current->mm->mm_users) > 1)
return -EINVAL;
+ if (current->fs->users != 1)
+ return -EINVAL;
+
if (!ns_capable(user_ns, CAP_SYS_ADMIN))
return -EPERM;
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 08b197e..a47fc5d 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -30,7 +30,7 @@ static struct uts_namespace *create_uts_ns(void)
/*
* Clone a new ns copying an original utsname, setting refcount to 1
* @old_ns: namespace to clone
- * Return NULL on error (failure to kmalloc), new ns otherwise
+ * Return ERR_PTR(-ENOMEM) on error (failure to kmalloc), new ns otherwise
*/
static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
struct uts_namespace *old_ns)
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 63da38c..4f69f9a 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -15,6 +15,8 @@
#include <linux/sysctl.h>
#include <linux/wait.h>
+#ifdef CONFIG_PROC_SYSCTL
+
static void *get_uts(ctl_table *table, int write)
{
char *which = table->data;
@@ -38,7 +40,6 @@ static void put_uts(ctl_table *table, int write, void *which)
up_write(&uts_sem);
}
-#ifdef CONFIG_PROC_SYSCTL
/*
* Special case of dostring for the UTS structure. This has locks
* to observe. Should this be in kernel/sys.c ????
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 2768942..4a94467 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -113,9 +113,9 @@ static int get_softlockup_thresh(void)
* resolution, and we don't need to waste time with a big divide when
* 2^30ns == 1.074s.
*/
-static unsigned long get_timestamp(int this_cpu)
+static unsigned long get_timestamp(void)
{
- return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */
+ return local_clock() >> 30LL; /* 2^30 ~= 10^9 */
}
static void set_sample_period(void)
@@ -133,9 +133,7 @@ static void set_sample_period(void)
/* Commands for resetting the watchdog */
static void __touch_watchdog(void)
{
- int this_cpu = smp_processor_id();
-
- __this_cpu_write(watchdog_touch_ts, get_timestamp(this_cpu));
+ __this_cpu_write(watchdog_touch_ts, get_timestamp());
}
void touch_softlockup_watchdog(void)
@@ -196,7 +194,7 @@ static int is_hardlockup(void)
static int is_softlockup(unsigned long touch_ts)
{
- unsigned long now = get_timestamp(smp_processor_id());
+ unsigned long now = get_timestamp();
/* Warn about unreasonable delays: */
if (time_after(now, touch_ts + get_softlockup_thresh()))
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f4feaca..b48cd59 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -251,8 +251,8 @@ EXPORT_SYMBOL_GPL(system_freezable_wq);
for ((pool) = &std_worker_pools(cpu)[0]; \
(pool) < &std_worker_pools(cpu)[NR_STD_WORKER_POOLS]; (pool)++)
-#define for_each_busy_worker(worker, i, pos, pool) \
- hash_for_each(pool->busy_hash, i, pos, worker, hentry)
+#define for_each_busy_worker(worker, i, pool) \
+ hash_for_each(pool->busy_hash, i, worker, hentry)
static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
unsigned int sw)
@@ -457,11 +457,12 @@ static int worker_pool_assign_id(struct worker_pool *pool)
int ret;
mutex_lock(&worker_pool_idr_mutex);
- idr_pre_get(&worker_pool_idr, GFP_KERNEL);
- ret = idr_get_new(&worker_pool_idr, pool, &pool->id);
+ ret = idr_alloc(&worker_pool_idr, pool, 0, 0, GFP_KERNEL);
+ if (ret >= 0)
+ pool->id = ret;
mutex_unlock(&worker_pool_idr_mutex);
- return ret;
+ return ret < 0 ? ret : 0;
}
/*
@@ -909,9 +910,8 @@ static struct worker *find_worker_executing_work(struct worker_pool *pool,
struct work_struct *work)
{
struct worker *worker;
- struct hlist_node *tmp;
- hash_for_each_possible(pool->busy_hash, worker, tmp, hentry,
+ hash_for_each_possible(pool->busy_hash, worker, hentry,
(unsigned long)work)
if (worker->current_work == work &&
worker->current_func == work->func)
@@ -1626,7 +1626,6 @@ static void busy_worker_rebind_fn(struct work_struct *work)
static void rebind_workers(struct worker_pool *pool)
{
struct worker *worker, *n;
- struct hlist_node *pos;
int i;
lockdep_assert_held(&pool->assoc_mutex);
@@ -1648,7 +1647,7 @@ static void rebind_workers(struct worker_pool *pool)
}
/* rebind busy workers */
- for_each_busy_worker(worker, i, pos, pool) {
+ for_each_busy_worker(worker, i, pool) {
struct work_struct *rebind_work = &worker->rebind_work;
struct workqueue_struct *wq;
@@ -3423,7 +3422,6 @@ static void wq_unbind_fn(struct work_struct *work)
int cpu = smp_processor_id();
struct worker_pool *pool;
struct worker *worker;
- struct hlist_node *pos;
int i;
for_each_std_worker_pool(pool, cpu) {
@@ -3442,35 +3440,41 @@ static void wq_unbind_fn(struct work_struct *work)
list_for_each_entry(worker, &pool->idle_list, entry)
worker->flags |= WORKER_UNBOUND;
- for_each_busy_worker(worker, i, pos, pool)
+ for_each_busy_worker(worker, i, pool)
worker->flags |= WORKER_UNBOUND;
pool->flags |= POOL_DISASSOCIATED;
spin_unlock_irq(&pool->lock);
mutex_unlock(&pool->assoc_mutex);
- }
- /*
- * Call schedule() so that we cross rq->lock and thus can guarantee
- * sched callbacks see the %WORKER_UNBOUND flag. This is necessary
- * as scheduler callbacks may be invoked from other cpus.
- */
- schedule();
+ /*
+ * Call schedule() so that we cross rq->lock and thus can
+ * guarantee sched callbacks see the %WORKER_UNBOUND flag.
+ * This is necessary as scheduler callbacks may be invoked
+ * from other cpus.
+ */
+ schedule();
- /*
- * Sched callbacks are disabled now. Zap nr_running. After this,
- * nr_running stays zero and need_more_worker() and keep_working()
- * are always true as long as the worklist is not empty. Pools on
- * @cpu now behave as unbound (in terms of concurrency management)
- * pools which are served by workers tied to the CPU.
- *
- * On return from this function, the current worker would trigger
- * unbound chain execution of pending work items if other workers
- * didn't already.
- */
- for_each_std_worker_pool(pool, cpu)
+ /*
+ * Sched callbacks are disabled now. Zap nr_running.
+ * After this, nr_running stays zero and need_more_worker()
+ * and keep_working() are always true as long as the
+ * worklist is not empty. This pool now behaves as an
+ * unbound (in terms of concurrency management) pool which
+ * are served by workers tied to the pool.
+ */
atomic_set(&pool->nr_running, 0);
+
+ /*
+ * With concurrency management just turned off, a busy
+ * worker blocking could lead to lengthy stalls. Kick off
+ * unbound chain execution of currently pending work items.
+ */
+ spin_lock_irq(&pool->lock);
+ wake_up_worker(pool);
+ spin_unlock_irq(&pool->lock);
+ }
}
/*
OpenPOWER on IntegriCloud