From 9bf39ab2adafd7cf8740859cb49e7b7952813a5d Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 19 Jun 2015 10:29:13 +0200 Subject: vfs: add file_path() helper Turn d_path(&file->f_path, ...); into file_path(file, ...); Signed-off-by: Miklos Szeredi Signed-off-by: Al Viro --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 81aa3a4..5c964e8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5791,7 +5791,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) * need to add enough zero bytes after the string to handle * the 64bit alignment we do later. */ - name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64)); + name = file_path(file, buf, PATH_MAX - sizeof(u64)); if (IS_ERR(name)) { name = "//toolong"; goto cpy_name; -- cgit v1.1 From f9bd6733d3f11e24f3949becf277507d422ee1eb Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 9 May 2015 22:09:14 -0500 Subject: sysctl: Allow creating permanently empty directories that serve as mountpoints. Add a magic sysctl table sysctl_mount_point that when used to create a directory forces that directory to be permanently empty. Update the code to use make_empty_dir_inode when accessing permanently empty directories. Update the code to not allow adding to permanently empty directories. Update /proc/sys/fs/binfmt_misc to be a permanently empty directory. Cc: stable@vger.kernel.org Signed-off-by: "Eric W. Biederman" --- kernel/sysctl.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2082b1a..c3eee4c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1531,12 +1531,6 @@ static struct ctl_table vm_table[] = { { } }; -#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) -static struct ctl_table binfmt_misc_table[] = { - { } -}; -#endif - static struct ctl_table fs_table[] = { { .procname = "inode-nr", @@ -1690,7 +1684,7 @@ static struct ctl_table fs_table[] = { { .procname = "binfmt_misc", .mode = 0555, - .child = binfmt_misc_table, + .child = sysctl_mount_point, }, #endif { -- cgit v1.1 From f9bb48825a6b5d02f4cabcc78967c75db903dcdc Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 13 May 2015 17:35:41 -0500 Subject: sysfs: Create mountpoints with sysfs_create_mount_point This allows for better documentation in the code and it allows for a simpler and fully correct version of fs_fully_visible to be written. The mount points converted and their filesystems are: /sys/hypervisor/s390/ s390_hypfs /sys/kernel/config/ configfs /sys/kernel/debug/ debugfs /sys/firmware/efi/efivars/ efivarfs /sys/fs/fuse/connections/ fusectl /sys/fs/pstore/ pstore /sys/kernel/tracing/ tracefs /sys/fs/cgroup/ cgroup /sys/kernel/security/ securityfs /sys/fs/selinux/ selinuxfs /sys/fs/smackfs/ smackfs Cc: stable@vger.kernel.org Acked-by: Greg Kroah-Hartman Signed-off-by: "Eric W. 
Biederman" --- kernel/cgroup.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 469dd54..e8a5491 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1924,8 +1924,6 @@ static struct file_system_type cgroup_fs_type = { .kill_sb = cgroup_kill_sb, }; -static struct kobject *cgroup_kobj; - /** * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy * @task: target task @@ -5044,13 +5042,13 @@ int __init cgroup_init(void) ss->bind(init_css_set.subsys[ssid]); } - cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); - if (!cgroup_kobj) - return -ENOMEM; + err = sysfs_create_mount_point(fs_kobj, "cgroup"); + if (err) + return err; err = register_filesystem(&cgroup_fs_type); if (err < 0) { - kobject_put(cgroup_kobj); + sysfs_remove_mount_point(fs_kobj, "cgroup"); return err; } -- cgit v1.1 From 7df9ab845ce5f473c84184873bf5be08bbe8fda5 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 2 Jul 2015 16:42:13 -0700 Subject: make certificate list change message more useful It's a bug in our Makefile rules, make it show what the changing certificate list was, and make it a warning so that people actually see it. Signed-off-by: Linus Torvalds --- kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 60c302c..43c4c92 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -137,7 +137,7 @@ endif ifneq ($(wildcard $(obj)/.x509.list),) ifneq ($(shell cat $(obj)/.x509.list),$(X509_CERTIFICATES)) -$(info X.509 certificate list changed) +$(warning X.509 certificate list changed to "$(X509_CERTIFICATES)" from "$(shell cat $(obj)/.x509.list)") $(shell rm $(obj)/.x509.list) endif endif -- cgit v1.1 From 2ecd9d29abb171d6e97a4f3eb29d7456a11401b7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 3 Jul 2015 18:53:58 +0200 Subject: sched, preempt_notifier: separate notifier registration from static_key inc/dec Commit 1cde2930e154 ("sched/preempt: Add static_key() to preempt_notifiers") had two problems. First, the preempt-notifier API needs to sleep with the addition of the static_key, we do however need to hold off preemption while modifying the preempt notifier list, otherwise a preemption could observe an inconsistent list state. KVM correctly registers and unregisters preempt notifiers with preemption disabled, so the sleep caused dmesg splats. Second, KVM registers and unregisters preemption notifiers very often (in vcpu_load/vcpu_put). With a single uniprocessor guest the static key would move between 0 and 1 continuously, hitting the slow path on every userspace exit. To fix this, wrap the static_key inc/dec in a new API, and call it from KVM. 
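To show the intended split between the one-off static_key operations and the hot-path register/unregister calls, here is a minimal, hypothetical user of the new API (the my_* names and the vcpu structure are invented for illustration and are not part of this patch; KVM's actual conversion lives outside kernel/, so it does not appear in this diff):

	#include <linux/module.h>
	#include <linux/preempt.h>

	struct my_vcpu {
		/* assumed to be set up elsewhere with preempt_notifier_init() */
		struct preempt_notifier preempt_notifier;
	};

	static int __init my_init(void)
	{
		/* Take the static_key reference once for the module lifetime. */
		preempt_notifier_inc();
		return 0;
	}

	static void my_vcpu_load(struct my_vcpu *vcpu)
	{
		/* Hot path: no static_key update, safe with preemption disabled. */
		preempt_disable();
		preempt_notifier_register(&vcpu->preempt_notifier);
		preempt_enable();
	}

	static void my_vcpu_put(struct my_vcpu *vcpu)
	{
		preempt_disable();
		preempt_notifier_unregister(&vcpu->preempt_notifier);
		preempt_enable();
	}

	static void __exit my_exit(void)
	{
		/* Drop the reference when the last user goes away. */
		preempt_notifier_dec();
	}

	module_init(my_init);
	module_exit(my_exit);
	MODULE_LICENSE("GPL");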
Fixes: 1cde2930e154 ("sched/preempt: Add static_key() to preempt_notifiers") Reported-by: Pontus Fuchs Reported-by: Takashi Iwai Tested-by: Takashi Iwai Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Paolo Bonzini --- kernel/sched/core.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b803e1b..552710a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2320,13 +2320,27 @@ void wake_up_new_task(struct task_struct *p) static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE; +void preempt_notifier_inc(void) +{ + static_key_slow_inc(&preempt_notifier_key); +} +EXPORT_SYMBOL_GPL(preempt_notifier_inc); + +void preempt_notifier_dec(void) +{ + static_key_slow_dec(&preempt_notifier_key); +} +EXPORT_SYMBOL_GPL(preempt_notifier_dec); + /** * preempt_notifier_register - tell me when current is being preempted & rescheduled * @notifier: notifier struct to register */ void preempt_notifier_register(struct preempt_notifier *notifier) { - static_key_slow_inc(&preempt_notifier_key); + if (!static_key_false(&preempt_notifier_key)) + WARN(1, "registering preempt_notifier while notifiers disabled\n"); + hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); } EXPORT_SYMBOL_GPL(preempt_notifier_register); @@ -2340,7 +2354,6 @@ EXPORT_SYMBOL_GPL(preempt_notifier_register); void preempt_notifier_unregister(struct preempt_notifier *notifier) { hlist_del(¬ifier->link); - static_key_slow_dec(&preempt_notifier_key); } EXPORT_SYMBOL_GPL(preempt_notifier_unregister); -- cgit v1.1 From f6db8347993256b58bd4746b0c4c5b935c32210d Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 25 Jun 2015 23:53:37 +0530 Subject: sched/stat: Simplify the sched_info accounting dependency Both CONFIG_SCHEDSTATS=y and CONFIG_TASK_DELAY_ACCT=y track task sched_info, which results in ugly #if clauses. Simplify the code by introducing a synthethic CONFIG_SCHED_INFO switch, selected by both. Signed-off-by: Naveen N. 
Rao Cc: Balbir Singh Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Thomas Gleixner Cc: a.p.zijlstra@chello.nl Cc: ricklind@us.ibm.com Link: http://lkml.kernel.org/r/8d19eef800811a94b0f91bcbeb27430a884d7433.1435255405.git.naveen.n.rao@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- kernel/sched/stats.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c86935a..abb8785 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1975,7 +1975,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) set_task_cpu(p, cpu); raw_spin_unlock_irqrestore(&p->pi_lock, flags); -#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) +#ifdef CONFIG_SCHED_INFO if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 077ebbd..b0fbc76 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -47,7 +47,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta) # define schedstat_set(var, val) do { } while (0) #endif -#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) +#ifdef CONFIG_SCHED_INFO static inline void sched_info_reset_dequeued(struct task_struct *t) { t->sched_info.last_queued = 0; @@ -156,7 +156,7 @@ sched_info_switch(struct rq *rq, #define sched_info_depart(rq, t) do { } while (0) #define sched_info_arrive(rq, next) do { } while (0) #define sched_info_switch(rq, t, next) do { } while (0) -#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ +#endif /* CONFIG_SCHED_INFO */ /* * The following are functions that support scheduler-internal time accounting. -- cgit v1.1 From 6b55c9654fccf69ae7ace23ca101dc37b903181b Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Thu, 25 Jun 2015 22:51:41 +0530 Subject: sched/debug: Move print_cfs_rq() declaration to kernel/sched/sched.h Currently print_cfs_rq() is declared in include/linux/sched.h. However it's not used outside kernel/sched. Hence move the declaration to kernel/sched/sched.h Also some functions are only available for CONFIG_SCHED_DEBUG=y. Hence move the declarations to within the #ifdef. 
Signed-off-by: Srikar Dronamraju Acked-by: Rik van Riel Cc: Iulia Manda Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1435252903-1081-2-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/sched.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index aea7c1f..7d58952 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1668,9 +1668,14 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); + +#ifdef CONFIG_SCHED_DEBUG extern void print_cfs_stats(struct seq_file *m, int cpu); extern void print_rt_stats(struct seq_file *m, int cpu); extern void print_dl_stats(struct seq_file *m, int cpu); +extern void +print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq); +#endif extern void init_cfs_rq(struct cfs_rq *cfs_rq); extern void init_rt_rq(struct rt_rq *rt_rq); -- cgit v1.1 From e3d24d0a6048a826de5562d75dedb664d3a2a1b2 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Thu, 25 Jun 2015 22:51:42 +0530 Subject: sched/numa: Show numa_group ID in /proc/sched_debug task listings Having the numa group ID in /proc/sched_debug helps to see how the numa groups have spread across the system. Signed-off-by: Srikar Dronamraju Acked-by: Rik van Riel Cc: Iulia Manda Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1435252903-1081-3-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 315c68e..f1dcd1d 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -142,7 +142,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) 0LL, 0L); #endif #ifdef CONFIG_NUMA_BALANCING - SEQ_printf(m, " %d", task_node(p)); + SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); #endif #ifdef CONFIG_CGROUP_SCHED SEQ_printf(m, " %s", task_group_path(task_group(p))); -- cgit v1.1 From 397f2378f136128623fc237746157aa2564d1082 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Thu, 25 Jun 2015 22:51:43 +0530 Subject: sched/numa: Fix numa balancing stats in /proc/pid/sched Commit 44dba3d5d6a1 ("sched: Refactor task_struct to use numa_faults instead of numa_* pointers") modified the way tsk->numa_faults stats are accounted. However that commit never touched show_numa_stats() that is displayed in /proc/pid/sched and thus the numbers displayed in /proc/pid/sched don't match the actual numbers. Fix it by making sure that /proc/pid/sched reflects the task fault numbers. Also add group fault stats too. Also couple of more modifications are added here: 1. Format changes: - Previously we would list two entries per node, one for private and one for shared. Also the home node info was listed in each entry. - Now preferred node, total_faults and current node are displayed separately. - Now there is one entry per node, that lists private,shared task and group faults. 2. Unit changes: - p->numa_pages_migrated was getting reset after every read of /proc/pid/sched. It's more useful to have absolute numbers since differential migrations between two accesses can be more easily calculated. 
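As an illustration of the reworked layout (the numbers below are invented; the line formats follow the SEQ_printf calls added by this patch), the NUMA section of /proc/<pid>/sched would now end with something like:

	current_node=0, numa_group_id=2
	numa_faults node=0 task_private=120 task_shared=46 group_private=210 group_shared=80
	numa_faults node=1 task_private=64 task_shared=12 group_private=96 group_shared=30

with numa_pages_migrated, numa_preferred_nid and total_numa_faults printed as absolute counters above it, replacing the old per-node private/shared pairs and the numa_migrations counter that was reset on every read.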
Signed-off-by: Srikar Dronamraju Acked-by: Rik van Riel Cc: Iulia Manda Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1435252903-1081-4-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 38 +++++++++++++++++--------------------- kernel/sched/fair.c | 22 +++++++++++++++++++++- kernel/sched/sched.h | 10 +++++++++- 3 files changed, 47 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index f1dcd1d..4222ec5 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -517,11 +517,21 @@ __initcall(init_sched_debug_procfs); SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) +#ifdef CONFIG_NUMA_BALANCING +void print_numa_stats(struct seq_file *m, int node, unsigned long tsf, + unsigned long tpf, unsigned long gsf, unsigned long gpf) +{ + SEQ_printf(m, "numa_faults node=%d ", node); + SEQ_printf(m, "task_private=%lu task_shared=%lu ", tsf, tpf); + SEQ_printf(m, "group_private=%lu group_shared=%lu\n", gsf, gpf); +} +#endif + + static void sched_show_numa(struct task_struct *p, struct seq_file *m) { #ifdef CONFIG_NUMA_BALANCING struct mempolicy *pol; - int node, i; if (p->mm) P(mm->numa_scan_seq); @@ -533,26 +543,12 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m) mpol_get(pol); task_unlock(p); - SEQ_printf(m, "numa_migrations, %ld\n", xchg(&p->numa_pages_migrated, 0)); - - for_each_online_node(node) { - for (i = 0; i < 2; i++) { - unsigned long nr_faults = -1; - int cpu_current, home_node; - - if (p->numa_faults) - nr_faults = p->numa_faults[2*node + i]; - - cpu_current = !i ? (task_node(p) == node) : - (pol && node_isset(node, pol->v.nodes)); - - home_node = (p->numa_preferred_nid == node); - - SEQ_printf(m, "numa_faults_memory, %d, %d, %d, %d, %ld\n", - i, node, cpu_current, home_node, nr_faults); - } - } - + P(numa_pages_migrated); + P(numa_preferred_nid); + P(total_numa_faults); + SEQ_printf(m, "current_node=%d, numa_group_id=%d\n", + task_node(p), task_numa_group_id(p)); + show_numa_stats(p, m); mpol_put(pol); #endif } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 40a7fcb..7245039 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8468,7 +8468,27 @@ void print_cfs_stats(struct seq_file *m, int cpu) print_cfs_rq(m, cpu, cfs_rq); rcu_read_unlock(); } -#endif + +#ifdef CONFIG_NUMA_BALANCING +void show_numa_stats(struct task_struct *p, struct seq_file *m) +{ + int node; + unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0; + + for_each_online_node(node) { + if (p->numa_faults) { + tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)]; + tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)]; + } + if (p->numa_group) { + gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)], + gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)]; + } + print_numa_stats(m, node, tsf, tpf, gsf, gpf); + } +} +#endif /* CONFIG_NUMA_BALANCING */ +#endif /* CONFIG_SCHED_DEBUG */ __init void init_sched_fair_class(void) { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7d58952..7ef5968 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1675,7 +1675,15 @@ extern void print_rt_stats(struct seq_file *m, int cpu); extern void print_dl_stats(struct seq_file *m, int cpu); extern void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq); -#endif + +#ifdef CONFIG_NUMA_BALANCING +extern void +show_numa_stats(struct task_struct *p, struct 
seq_file *m); +extern void +print_numa_stats(struct seq_file *m, int node, unsigned long tsf, + unsigned long tpf, unsigned long gsf, unsigned long gpf); +#endif /* CONFIG_NUMA_BALANCING */ +#endif /* CONFIG_SCHED_DEBUG */ extern void init_cfs_rq(struct cfs_rq *cfs_rq); extern void init_rt_rq(struct rt_rq *rt_rq); -- cgit v1.1 From 57ffc5ca679f499f4704fd9b6a372916f59930ee Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 18 Jun 2015 12:32:49 +0200 Subject: perf: Fix AUX buffer refcounting Its currently possible to drop the last refcount to the aux buffer from NMI context, which results in the expected fireworks. The refcounting needs a bigger overhaul, but to cure the immediate problem, delay the freeing by using an irq_work. Reviewed-and-tested-by: Alexander Shishkin Reported-by: Vince Weaver Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150618103249.GK19282@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/events/core.c | 8 -------- kernel/events/internal.h | 10 ++++++++++ kernel/events/ring_buffer.c | 27 +++++++++++++++++++++++++-- 3 files changed, 35 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index e965cfa..d3dae34 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4358,14 +4358,6 @@ static void ring_buffer_wakeup(struct perf_event *event) rcu_read_unlock(); } -static void rb_free_rcu(struct rcu_head *rcu_head) -{ - struct ring_buffer *rb; - - rb = container_of(rcu_head, struct ring_buffer, rcu_head); - rb_free(rb); -} - struct ring_buffer *ring_buffer_get(struct perf_event *event) { struct ring_buffer *rb; diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 2deb24c..2bbad9c 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -11,6 +11,7 @@ struct ring_buffer { atomic_t refcount; struct rcu_head rcu_head; + struct irq_work irq_work; #ifdef CONFIG_PERF_USE_VMALLOC struct work_struct work; int page_order; /* allocation order */ @@ -55,6 +56,15 @@ struct ring_buffer { }; extern void rb_free(struct ring_buffer *rb); + +static inline void rb_free_rcu(struct rcu_head *rcu_head) +{ + struct ring_buffer *rb; + + rb = container_of(rcu_head, struct ring_buffer, rcu_head); + rb_free(rb); +} + extern struct ring_buffer * rb_alloc(int nr_pages, long watermark, int cpu, int flags); extern void perf_event_wakeup(struct perf_event *event); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 9647282..b2be01b 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -221,6 +221,8 @@ void perf_output_end(struct perf_output_handle *handle) rcu_read_unlock(); } +static void rb_irq_work(struct irq_work *work); + static void ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) { @@ -241,6 +243,16 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) INIT_LIST_HEAD(&rb->event_list); spin_lock_init(&rb->event_lock); + init_irq_work(&rb->irq_work, rb_irq_work); +} + +static void ring_buffer_put_async(struct ring_buffer *rb) +{ + if (!atomic_dec_and_test(&rb->refcount)) + return; + + rb->rcu_head.next = (void *)rb; + irq_work_queue(&rb->irq_work); } /* @@ -319,7 +331,7 @@ err_put: rb_free_aux(rb); err: - ring_buffer_put(rb); + ring_buffer_put_async(rb); handle->event = NULL; return NULL; @@ -370,7 +382,7 @@ void perf_aux_output_end(struct perf_output_handle 
*handle, unsigned long size, local_set(&rb->aux_nest, 0); rb_free_aux(rb); - ring_buffer_put(rb); + ring_buffer_put_async(rb); } /* @@ -557,7 +569,18 @@ static void __rb_free_aux(struct ring_buffer *rb) void rb_free_aux(struct ring_buffer *rb) { if (atomic_dec_and_test(&rb->aux_refcount)) + irq_work_queue(&rb->irq_work); +} + +static void rb_irq_work(struct irq_work *work) +{ + struct ring_buffer *rb = container_of(work, struct ring_buffer, irq_work); + + if (!atomic_read(&rb->aux_refcount)) __rb_free_aux(rb); + + if (rb->rcu_head.next == (void *)rb) + call_rcu(&rb->rcu_head, rb_free_rcu); } #ifndef CONFIG_PERF_USE_VMALLOC -- cgit v1.1 From 7c4a976cd55972b68c75a978f171b6db5df4ce66 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 7 Jul 2015 10:14:35 +0200 Subject: clockevents: Allow set-state callbacks to be optional Its mandatory for the drivers to provide set_state_{oneshot|periodic}() (only if related modes are supported) and set_state_shutdown() callbacks today, if they are implementing the new set-state interface. But this leads to unnecessary noop callbacks for drivers which don't want to implement them. Over that, it will lead to a full function call for nothing really useful. Lets make all set-state callbacks optional. Suggested-by: Daniel Lezcano Signed-off-by: Viresh Kumar Signed-off-by: Daniel Lezcano Link: http://lkml.kernel.org/r/1436256875-15562-1-git-send-email-daniel.lezcano@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 08ccc3d..50eb107 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -120,19 +120,25 @@ static int __clockevents_switch_state(struct clock_event_device *dev, /* The clockevent device is getting replaced. Shut it down. */ case CLOCK_EVT_STATE_SHUTDOWN: - return dev->set_state_shutdown(dev); + if (dev->set_state_shutdown) + return dev->set_state_shutdown(dev); + return 0; case CLOCK_EVT_STATE_PERIODIC: /* Core internal bug */ if (!(dev->features & CLOCK_EVT_FEAT_PERIODIC)) return -ENOSYS; - return dev->set_state_periodic(dev); + if (dev->set_state_periodic) + return dev->set_state_periodic(dev); + return 0; case CLOCK_EVT_STATE_ONESHOT: /* Core internal bug */ if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) return -ENOSYS; - return dev->set_state_oneshot(dev); + if (dev->set_state_oneshot) + return dev->set_state_oneshot(dev); + return 0; case CLOCK_EVT_STATE_ONESHOT_STOPPED: /* Core internal bug */ @@ -471,18 +477,6 @@ static int clockevents_sanity_check(struct clock_event_device *dev) if (dev->features & CLOCK_EVT_FEAT_DUMMY) return 0; - /* New state-specific callbacks */ - if (!dev->set_state_shutdown) - return -EINVAL; - - if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && - !dev->set_state_periodic) - return -EINVAL; - - if ((dev->features & CLOCK_EVT_FEAT_ONESHOT) && - !dev->set_state_oneshot) - return -EINVAL; - return 0; } -- cgit v1.1 From 8eb231261fdd20768db23863d00ef277de4b0543 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 7 Jul 2015 14:11:00 +0200 Subject: tick/broadcast: Prevent hrtimer recursion The hrtimer based broadcast vehicle can cause a hrtimer recursion which went unnoticed until we changed the hrtimer expiry code to keep track of the currently running timer. local_timer_interrupt() local_handler() hrtimer_interrupt() expire_hrtimers() broadcast_hrtimer() send_ipis() local_handler() hrtimer_interrupt() .... 
Solution is simple: Prevent the local handler call from the broadcast code when the broadcast 'device' is hrtimer based. [ Split out from a larger combo patch ] Tested-by: Sudeep Holla Signed-off-by: Thomas Gleixner Cc: Suzuki Poulose Cc: Lorenzo Pieralisi Cc: Catalin Marinas Cc: Rafael J. Wysocki Cc: Peter Zijlstra Cc: Preeti U Murthy Cc: Ingo Molnar Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1507070929360.3916@nanos --- kernel/time/tick-broadcast.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index d39f32c..a762040 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -265,8 +265,22 @@ static bool tick_do_broadcast(struct cpumask *mask) * Check, if the current cpu is in the mask */ if (cpumask_test_cpu(cpu, mask)) { + struct clock_event_device *bc = tick_broadcast_device.evtdev; + cpumask_clear_cpu(cpu, mask); - local = true; + /* + * We only run the local handler, if the broadcast + * device is not hrtimer based. Otherwise we run into + * a hrtimer recursion. + * + * local timer_interrupt() + * local_handler() + * expire_hrtimers() + * bc_handler() + * local_handler() + * expire_hrtimers() + */ + local = !(bc->features & CLOCK_EVT_FEAT_HRTIMER); } if (!cpumask_empty(mask)) { -- cgit v1.1 From e0454311903d3fd0f12a86c9e65d7b271c2bb05d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 7 Jul 2015 14:07:27 +0200 Subject: tick/broadcast: Sanity check the shutdown of the local clock_event The broadcast code shuts down the local clock event unconditionally even if no broadcast device is installed or if the broadcast device is hrtimer based. Add proper sanity checks. [ Split out from a larger combo patch ] Reported-and-tested-by: Sudeep Holla Signed-off-by: Thomas Gleixner Cc: Suzuki Poulose Cc: Lorenzo Pieralisi Cc: Catalin Marinas Cc: Rafael J. Wysocki Cc: Peter Zijlstra Cc: Preeti U Murthy Cc: Ingo Molnar Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1507070929360.3916@nanos --- kernel/time/tick-broadcast.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index a762040..9877d0b 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -159,7 +159,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { struct clock_event_device *bc = tick_broadcast_device.evtdev; unsigned long flags; - int ret; + int ret = 0; raw_spin_lock_irqsave(&tick_broadcast_lock, flags); @@ -221,13 +221,14 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) * If we kept the cpu in the broadcast mask, * tell the caller to leave the per cpu device * in shutdown state. The periodic interrupt - * is delivered by the broadcast device. + * is delivered by the broadcast device, if + * the broadcast device exists and is not + * hrtimer based. 
*/ - ret = cpumask_test_cpu(cpu, tick_broadcast_mask); + if (bc && !(bc->features & CLOCK_EVT_FEAT_HRTIMER)) + ret = cpumask_test_cpu(cpu, tick_broadcast_mask); break; default: - /* Nothing to do */ - ret = 0; break; } } @@ -373,8 +374,16 @@ void tick_broadcast_control(enum tick_broadcast_mode mode) case TICK_BROADCAST_ON: cpumask_set_cpu(cpu, tick_broadcast_on); if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { - if (tick_broadcast_device.mode == - TICKDEV_MODE_PERIODIC) + /* + * Only shutdown the cpu local device, if: + * + * - the broadcast device exists + * - the broadcast device is not a hrtimer based one + * - the broadcast device is in periodic mode to + * avoid a hickup during switch to oneshot mode + */ + if (bc && !(bc->features & CLOCK_EVT_FEAT_HRTIMER) && + tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) clockevents_shutdown(dev); } break; -- cgit v1.1 From f32dd117051185da6e923b35491a44d7debeeea5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 7 Jul 2015 16:29:38 +0200 Subject: tick/broadcast: Make idle check independent from mode and config Currently the broadcast busy check, which prevents the idle code from going into deep idle, works only in one shot mode. If NOHZ and HIGHRES are off (config or command line) there is no sanity check at all, so under certain conditions cpus are allowed to go into deep idle, where the local timer stops, and are not woken up again because there is no broadcast timer installed or a hrtimer based broadcast device is not evaluated. Move tick_broadcast_oneshot_control() into the common code and provide proper subfunctions for the various config combinations. The common check in tick_broadcast_oneshot_control() is for the C3STOP misfeature flag of the local clock event device. If its not set, idle can proceed. If set, further checks are necessary. Provide checks for the trivial cases: - If broadcast is disabled in the config, then return busy - If oneshot mode (NOHZ/HIGHES) is disabled in the config, return busy if the broadcast device is hrtimer based. - If oneshot mode is enabled in the config call the original tick_broadcast_oneshot_control() function. That function needs extra checks which will be implemented in seperate patches. [ Split out from a larger combo patch ] Reported-and-tested-by: Sudeep Holla Signed-off-by: Thomas Gleixner Cc: Suzuki Poulose Cc: Lorenzo Pieralisi Cc: Catalin Marinas Cc: Rafael J. Wysocki Cc: Peter Zijlstra Cc: Preeti U Murthy Cc: Ingo Molnar Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1507070929360.3916@nanos --- kernel/time/tick-broadcast.c | 26 +++++++++++--------------- kernel/time/tick-common.c | 21 +++++++++++++++++++++ kernel/time/tick-sched.h | 10 ++++++++++ 3 files changed, 42 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 9877d0b..ef77b16 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -685,18 +685,7 @@ static void broadcast_shutdown_local(struct clock_event_device *bc, clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN); } -/** - * tick_broadcast_oneshot_control - Enter/exit broadcast oneshot mode - * @state: The target state (enter/exit) - * - * The system enters/leaves a state, where affected devices might stop - * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups. - * - * Called with interrupts disabled, so clockevents_lock is not - * required here because the local clock event device cannot go away - * under us. 
- */ -int tick_broadcast_oneshot_control(enum tick_broadcast_state state) +int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) { struct clock_event_device *bc, *dev; struct tick_device *td; @@ -717,9 +706,6 @@ int tick_broadcast_oneshot_control(enum tick_broadcast_state state) td = this_cpu_ptr(&tick_cpu_device); dev = td->evtdev; - if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) - return 0; - raw_spin_lock(&tick_broadcast_lock); bc = tick_broadcast_device.evtdev; cpu = smp_processor_id(); @@ -961,6 +947,16 @@ bool tick_broadcast_oneshot_available(void) return bc ? bc->features & CLOCK_EVT_FEAT_ONESHOT : false; } +#else +int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) +{ + struct clock_event_device *bc = tick_broadcast_device.evtdev; + + if (!bc || (bc->features & CLOCK_EVT_FEAT_HRTIMER)) + return -EBUSY; + + return 0; +} #endif void __init tick_broadcast_init(void) diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 76446cb..55e13ef 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -343,6 +343,27 @@ out_bc: tick_install_broadcast_device(newdev); } +/** + * tick_broadcast_oneshot_control - Enter/exit broadcast oneshot mode + * @state: The target state (enter/exit) + * + * The system enters/leaves a state, where affected devices might stop + * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups. + * + * Called with interrupts disabled, so clockevents_lock is not + * required here because the local clock event device cannot go away + * under us. + */ +int tick_broadcast_oneshot_control(enum tick_broadcast_state state) +{ + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); + + if (!(td->evtdev->features & CLOCK_EVT_FEAT_C3STOP)) + return 0; + + return __tick_broadcast_oneshot_control(state); +} + #ifdef CONFIG_HOTPLUG_CPU /* * Transfer the do_timer job away from a dying cpu. diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index 42fdf49..a4a8d4e 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -71,4 +71,14 @@ extern void tick_cancel_sched_timer(int cpu); static inline void tick_cancel_sched_timer(int cpu) { } #endif +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +extern int __tick_broadcast_oneshot_control(enum tick_broadcast_state state); +#else +static inline int +__tick_broadcast_oneshot_control(enum tick_broadcast_state state) +{ + return -EBUSY; +} +#endif + #endif -- cgit v1.1 From b78f3f3c898c824bf56ab55cfa59fc72be49c349 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 7 Jul 2015 16:34:32 +0200 Subject: tick/broadcast: Prevent deep idle if no broadcast device available Add a check for a installed broadcast device to the oneshot control function and return busy if not. [ Split out from a larger combo patch ] Reported-and-tested-by: Sudeep Holla Signed-off-by: Thomas Gleixner Cc: Suzuki Poulose Cc: Lorenzo Pieralisi Cc: Catalin Marinas Cc: Rafael J. Wysocki Cc: Peter Zijlstra Cc: Preeti U Murthy Cc: Ingo Molnar Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1507070929360.3916@nanos --- kernel/time/tick-broadcast.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index ef77b16..fad3f78 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -693,6 +693,13 @@ int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) ktime_t now; /* + * If there is no broadcast device, tell the caller not to go + * into deep idle. 
+ */ + if (!tick_broadcast_device.evtdev) + return -EBUSY; + + /* * Periodic mode does not care about the enter/exit of power * states */ -- cgit v1.1 From e3ac79e087ffe8a1f953ed44a74acf7616cb0b25 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 7 Jul 2015 16:38:11 +0200 Subject: tick/broadcast: Move the check for periodic mode inside state handling We need to check more than the periodic mode for proper operation in all runtime combinations. To avoid code duplication move the check into the enter state handling. No functional change. [ Split out from a larger combo patch ] Reported-and-tested-by: Sudeep Holla Signed-off-by: Thomas Gleixner Cc: Suzuki Poulose Cc: Lorenzo Pieralisi Cc: Catalin Marinas Cc: Rafael J. Wysocki Cc: Peter Zijlstra Cc: Preeti U Murthy Cc: Ingo Molnar Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1507070929360.3916@nanos --- kernel/time/tick-broadcast.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index fad3f78..83aa92e 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -688,7 +688,6 @@ static void broadcast_shutdown_local(struct clock_event_device *bc, int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) { struct clock_event_device *bc, *dev; - struct tick_device *td; int cpu, ret = 0; ktime_t now; @@ -699,25 +698,20 @@ int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) if (!tick_broadcast_device.evtdev) return -EBUSY; - /* - * Periodic mode does not care about the enter/exit of power - * states - */ - if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) - return 0; - - /* - * We are called with preemtion disabled from the depth of the - * idle code, so we can't be moved away. - */ - td = this_cpu_ptr(&tick_cpu_device); - dev = td->evtdev; + dev = this_cpu_ptr(&tick_cpu_device)->evtdev; raw_spin_lock(&tick_broadcast_lock); bc = tick_broadcast_device.evtdev; cpu = smp_processor_id(); if (state == TICK_BROADCAST_ENTER) { + /* + * If the broadcast device is in periodic mode, we + * return. + */ + if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) + goto out; + if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); broadcast_shutdown_local(bc, dev); -- cgit v1.1 From d33257264b0267a8fd20f6717abbb484c9e21130 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 7 Jul 2015 17:45:22 +0200 Subject: tick/broadcast: Return busy if periodic mode and hrtimer broadcast If the system is in periodic mode and the broadcast device is hrtimer based, return busy as we have no proper handling for this. [ Split out from a larger combo patch ] Tested-by: Sudeep Holla Signed-off-by: Thomas Gleixner Cc: Suzuki Poulose Cc: Lorenzo Pieralisi Cc: Catalin Marinas Cc: Rafael J. Wysocki Cc: Peter Zijlstra Cc: Preeti U Murthy Cc: Ingo Molnar Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1507070929360.3916@nanos --- kernel/time/tick-broadcast.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 83aa92e..da7b40f 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -709,8 +709,12 @@ int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) * If the broadcast device is in periodic mode, we * return. 
*/ - if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) + if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { + /* If it is a hrtimer based broadcast, return busy */ + if (bc->features & CLOCK_EVT_FEAT_HRTIMER) + ret = -EBUSY; goto out; + } if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); -- cgit v1.1 From 0cc5281aa592d0020868f6ccaed359b4ad7b2684 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 7 Jul 2015 16:45:15 +0200 Subject: tick/broadcast: Return busy when IPI is pending Tell the idle code not to go deep if the broadcast IPI is about to arrive. [ Split out from a larger combo patch ] Tested-by: Sudeep Holla Signed-off-by: Thomas Gleixner Cc: Suzuki Poulose Cc: Lorenzo Pieralisi Cc: Catalin Marinas Cc: Rafael J. Wysocki Cc: Peter Zijlstra Cc: Preeti U Murthy Cc: Ingo Molnar Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1507070929360.3916@nanos --- kernel/time/tick-broadcast.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index da7b40f..70b47bc 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -725,11 +725,15 @@ int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) * if the cpu local event is earlier than the * broadcast event. If the current CPU is in * the force mask, then we are going to be - * woken by the IPI right away. + * woken by the IPI right away; we return + * busy, so the CPU does not try to go deep + * idle. */ - if (!cpumask_test_cpu(cpu, tick_broadcast_force_mask) && - dev->next_event.tv64 < bc->next_event.tv64) + if (cpumask_test_cpu(cpu, tick_broadcast_force_mask)) { + ret = -EBUSY; + } else if (dev->next_event.tv64 < bc->next_event.tv64) { tick_broadcast_set_event(bc, cpu, dev->next_event); + } } /* * If the current CPU owns the hrtimer broadcast -- cgit v1.1 From d5113e13a550bc9c2b53cc9944b8a06453c4a0a1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 7 Jul 2015 16:43:04 +0200 Subject: tick/broadcast: Check for hrtimer broadcast active early If the current cpu is the one which has the hrtimer based broadcast queued then we better return busy immediately instead of going through loops and hoops to figure that out. [ Split out from a larger combo patch ] Tested-by: Sudeep Holla Signed-off-by: Thomas Gleixner Cc: Suzuki Poulose Cc: Lorenzo Pieralisi Cc: Catalin Marinas Cc: Rafael J. Wysocki Cc: Peter Zijlstra Cc: Preeti U Murthy Cc: Ingo Molnar Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1507070929360.3916@nanos --- kernel/time/tick-broadcast.c | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 70b47bc..c8d731a 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -706,6 +706,17 @@ int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) if (state == TICK_BROADCAST_ENTER) { /* + * If the current CPU owns the hrtimer broadcast + * mechanism, it cannot go deep idle and we do not add + * the CPU to the broadcast mask. We don't have to go + * through the EXIT path as the local timer is not + * shutdown. + */ + ret = broadcast_needs_cpu(bc, cpu); + if (ret) + goto out; + + /* * If the broadcast device is in periodic mode, we * return. 
*/ @@ -718,7 +729,10 @@ int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); + + /* Conditionally shut down the local timer. */ broadcast_shutdown_local(bc, dev); + /* * We only reprogram the broadcast timer if we * did not mark ourself in the force mask and @@ -733,18 +747,20 @@ int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) ret = -EBUSY; } else if (dev->next_event.tv64 < bc->next_event.tv64) { tick_broadcast_set_event(bc, cpu, dev->next_event); + /* + * In case of hrtimer broadcasts the + * programming might have moved the + * timer to this cpu. If yes, remove + * us from the broadcast mask and + * return busy. + */ + ret = broadcast_needs_cpu(bc, cpu); + if (ret) { + cpumask_clear_cpu(cpu, + tick_broadcast_oneshot_mask); + } } } - /* - * If the current CPU owns the hrtimer broadcast - * mechanism, it cannot go deep idle and we remove the - * CPU from the broadcast mask. We don't have to go - * through the EXIT path as the local timer is not - * shutdown. - */ - ret = broadcast_needs_cpu(bc, cpu); - if (ret) - cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); } else { if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); -- cgit v1.1 From c4288334818c81c946acb23d2319881f58c3d497 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 5 Jul 2015 20:53:17 +0000 Subject: tick/broadcast: Handle spurious interrupts gracefully Andriy reported that on a virtual machine the warning about negative expiry time in the clock events programming code triggered: hpet: hpet0 irq 40 for MSI hpet: hpet1 irq 41 for MSI Switching to clocksource hpet WARNING: at kernel/time/clockevents.c:239 [] clockevents_program_event+0xdb/0xf0 [] tick_handle_periodic_broadcast+0x41/0x50 [] timer_interrupt+0x15/0x20 When the second hpet is installed as a per cpu timer the broadcast event is not longer required and stopped, which sets the next_evt of the broadcast device to KTIME_MAX. If after that a spurious interrupt happens on the broadcast device, then the current code blindly handles it and tries to reprogram the broadcast device afterwards, which adds the period to next_evt. KTIME_MAX + period results in a negative expiry value causing the WARN_ON in the clockevents code to trigger. Add a proper check for the state of the broadcast device into the interrupt handler and return if the interrupt is spurious. 
[ Folded in pointer fix from Sudeep ] Reported-by: Andriy Gapon Signed-off-by: Thomas Gleixner Cc: Sudeep Holla Cc: Peter Zijlstra Cc: Preeti U Murthy Link: http://lkml.kernel.org/r/20150705205221.802094647@linutronix.de --- kernel/time/tick-broadcast.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index c8d731a..ee3cf94 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -316,6 +316,13 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) bool bc_local; raw_spin_lock(&tick_broadcast_lock); + + /* Handle spurious interrupts gracefully */ + if (clockevent_state_shutdown(tick_broadcast_device.evtdev)) { + raw_spin_unlock(&tick_broadcast_lock); + return; + } + bc_local = tick_do_periodic_broadcast(); if (clockevent_state_oneshot(dev)) { -- cgit v1.1 From a899418167264c7bac574b1a0f1b2c26c5b0995a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 5 Jul 2015 17:12:30 +0000 Subject: hotplug: Prevent alloc/free of irq descriptors during cpu up/down When a cpu goes up some architectures (e.g. x86) have to walk the irq space to set up the vector space for the cpu. While this needs extra protection at the architecture level we can avoid a few race conditions by preventing the concurrent allocation/free of irq descriptors and the associated data. When a cpu goes down it moves the interrupts which are targeted to this cpu away by reassigning the affinities. While this happens interrupts can be allocated and freed, which opens a can of race conditions in the code which reassignes the affinities because interrupt descriptors might be freed underneath. Example: CPU1 CPU2 cpu_up/down irq_desc = irq_to_desc(irq); remove_from_radix_tree(desc); raw_spin_lock(&desc->lock); free(desc); We could protect the irq descriptors with RCU, but that would require a full tree change of all accesses to interrupt descriptors. But fortunately these kind of race conditions are rather limited to a few things like cpu hotplug. The normal setup/teardown is very well serialized. So the simpler and obvious solution is: Prevent allocation and freeing of interrupt descriptors accross cpu hotplug. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: xiao jin Cc: Joerg Roedel Cc: Borislav Petkov Cc: Yanmin Zhang Link: http://lkml.kernel.org/r/20150705171102.063519515@linutronix.de --- kernel/cpu.c | 22 +++++++++++++++++++++- kernel/irq/internals.h | 4 ---- 2 files changed, 21 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 9c9c9fa..6a37454 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include "smpboot.h" @@ -392,13 +393,19 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) smpboot_park_threads(cpu); /* - * So now all preempt/rcu users must observe !cpu_active(). + * Prevent irq alloc/free while the dying cpu reorganizes the + * interrupt affinities. */ + irq_lock_sparse(); + /* + * So now all preempt/rcu users must observe !cpu_active(). + */ err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); if (err) { /* CPU didn't die: tell everyone. Can't complain. */ cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); + irq_unlock_sparse(); goto out_release; } BUG_ON(cpu_online(cpu)); @@ -415,6 +422,9 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) smp_mb(); /* Read from cpu_dead_idle before __cpu_die(). 
*/ per_cpu(cpu_dead_idle, cpu) = false; + /* Interrupts are moved away from the dying cpu, reenable alloc/free */ + irq_unlock_sparse(); + hotplug_cpu__broadcast_tick_pull(cpu); /* This actually kills the CPU. */ __cpu_die(cpu); @@ -517,8 +527,18 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen) goto out_notify; } + /* + * Some architectures have to walk the irq descriptors to + * setup the vector space for the cpu which comes online. + * Prevent irq alloc/free across the bringup. + */ + irq_lock_sparse(); + /* Arch-specific enabling code. */ ret = __cpu_up(cpu, idle); + + irq_unlock_sparse(); + if (ret != 0) goto out_notify; BUG_ON(!cpu_online(cpu)); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 4834ee8..61008b8 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -76,12 +76,8 @@ extern void unmask_threaded_irq(struct irq_desc *desc); #ifdef CONFIG_SPARSE_IRQ static inline void irq_mark_irq(unsigned int irq) { } -extern void irq_lock_sparse(void); -extern void irq_unlock_sparse(void); #else extern void irq_mark_irq(unsigned int irq); -static inline void irq_lock_sparse(void) { } -static inline void irq_unlock_sparse(void) { } #endif extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); -- cgit v1.1 From 45820c294fe1b1a9df495d57f40585ef2d069a39 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 8 Jul 2015 09:33:38 -0700 Subject: Fix broken audit tests for exec arg len The "fix" in commit 0b08c5e5944 ("audit: Fix check of return value of strnlen_user()") didn't fix anything, it broke things. As reported by Steven Rostedt: "Yes, strnlen_user() returns 0 on fault, but if you look at what len is set to, than you would notice that on fault len would be -1" because we just subtracted one from the return value. So testing against 0 doesn't test for a fault condition, it tests against a perfectly valid empty string. Also fix up the usual braindamage wrt using WARN_ON() inside a conditional - make it part of the conditional and remove the explicit unlikely() (which is already part of the WARN_ON*() logic, exactly so that you don't have to write unreadable code. Reported-and-tested-by: Steven Rostedt Cc: Jan Kara Cc: Paul Moore Signed-off-by: Linus Torvalds --- kernel/auditsc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 09c6564..e85bdfd 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1021,8 +1021,7 @@ static int audit_log_single_execve_arg(struct audit_context *context, * for strings that are too long, we should not have created * any. */ - if (unlikely((len == 0) || len > MAX_ARG_STRLEN - 1)) { - WARN_ON(1); + if (WARN_ON_ONCE(len < 0 || len > MAX_ARG_STRLEN - 1)) { send_sig(SIGKILL, current, 0); return -1; } -- cgit v1.1 From 758556bdc1c8a8dffea0ea9f9df891878cc2468c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 9 Jul 2015 06:48:06 +0930 Subject: module: Fix load_module() error path The load_module() error path frees a module but forgot to take it out of the mod_tree, leaving a dangling entry in the tree, causing havoc. 
Cc: Mathieu Desnoyers Reported-by: Arthur Marsh Tested-by: Arthur Marsh Fixes: 93c2e105f6bc ("module: Optimize __module_address() using a latched RB-tree") Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Rusty Russell --- kernel/module.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 3e0e197..4d2b82e 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3557,6 +3557,7 @@ static int load_module(struct load_info *info, const char __user *uargs, mutex_lock(&module_mutex); /* Unlink carefully: kallsyms could be walking list. */ list_del_rcu(&mod->list); + mod_tree_remove(mod); wake_up_all(&module_wq); /* Wait for RCU-sched synchronizing before releasing mod->list. */ synchronize_sched(); -- cgit v1.1 From c4d029f2d43b39de7b9299e8b58102a442ba86f8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 11 Jul 2015 14:26:34 +0200 Subject: tick/broadcast: Prevent NULL pointer dereference Dan reported that the recent changes to the broadcast code introduced a potential NULL dereference. Add the proper check. Fixes: e0454311903d "tick/broadcast: Sanity check the shutdown of the local clock_event" Reported-by: Dan Carpenter Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index ee3cf94..52b9e19 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -409,14 +409,16 @@ void tick_broadcast_control(enum tick_broadcast_mode mode) break; } - if (cpumask_empty(tick_broadcast_mask)) { - if (!bc_stopped) - clockevents_shutdown(bc); - } else if (bc_stopped) { - if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) - tick_broadcast_start_periodic(bc); - else - tick_broadcast_setup_oneshot(bc); + if (bc) { + if (cpumask_empty(tick_broadcast_mask)) { + if (!bc_stopped) + clockevents_shutdown(bc); + } else if (bc_stopped) { + if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) + tick_broadcast_start_periodic(bc); + else + tick_broadcast_setup_oneshot(bc); + } } raw_spin_unlock(&tick_broadcast_lock); } -- cgit v1.1