From 275d7d44d802ef271a42dc87ac091a495ba72fc5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 20 Aug 2015 10:34:59 +0930 Subject: module: Fix locking in symbol_put_addr() Poma (on the way to another bug) reported an assertion triggering: [] module_assert_mutex_or_preempt+0x49/0x90 [] __module_address+0x32/0x150 [] __module_text_address+0x16/0x70 [] symbol_put_addr+0x29/0x40 [] dvb_frontend_detach+0x7d/0x90 [dvb_core] Laura Abbott produced a patch which lead us to inspect symbol_put_addr(). This function has a comment claiming it doesn't need to disable preemption around the module lookup because it holds a reference to the module it wants to find, which therefore cannot go away. This is wrong (and a false optimization too, preempt_disable() is really rather cheap, and I doubt any of this is on uber critical paths, otherwise it would've retained a pointer to the actual module anyway and avoided the second lookup). While its true that the module cannot go away while we hold a reference on it, the data structure we do the lookup in very much _CAN_ change while we do the lookup. Therefore fix the comment and add the required preempt_disable(). Reported-by: poma Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Rusty Russell Fixes: a6e6abd575fc ("module: remove module_text_address()") Cc: stable@kernel.org --- kernel/module.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index b86b7bf..8f051a1 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1063,11 +1063,15 @@ void symbol_put_addr(void *addr) if (core_kernel_text(a)) return; - /* module_text_address is safe here: we're supposed to have reference - * to module from symbol_get, so it can't go away. */ + /* + * Even though we hold a reference on the module; we still need to + * disable preemption in order to safely traverse the data structure. + */ + preempt_disable(); modaddr = __module_text_address(a); BUG_ON(!modaddr); module_put(modaddr); + preempt_enable(); } EXPORT_SYMBOL_GPL(symbol_put_addr); -- cgit v1.1 From 74b22c465cd2b6ff4b8cec3997512ec807e6e495 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 26 Aug 2015 09:42:34 +0930 Subject: params: don't ignore the rest of cmdline if parse_one() fails parse_args() just aborts after it hits an error, so other args at the same initcall level are simply ignored. This can lead to other hard-to-understand problems, for example my testing machine panics during the boot if I pass "locktorture.verbose=true". Change parse_args() to save the err code for return and continue. Signed-off-by: Oleg Nesterov Signed-off-by: Rusty Russell --- kernel/params.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index b6554aa..ed1e0a1 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -223,7 +223,7 @@ char *parse_args(const char *doing, int (*unknown)(char *param, char *val, const char *doing, void *arg)) { - char *param, *val; + char *param, *val, *err = NULL; /* Chew leading spaces */ args = skip_spaces(args); @@ -238,7 +238,7 @@ char *parse_args(const char *doing, args = next_arg(args, ¶m, &val); /* Stop at -- */ if (!val && strcmp(param, "--") == 0) - return args; + return err ?: args; irq_was_disabled = irqs_disabled(); ret = parse_one(param, val, doing, params, num, min_level, max_level, arg, unknown); @@ -247,24 +247,25 @@ char *parse_args(const char *doing, doing, param); switch (ret) { + case 0: + continue; case -ENOENT: pr_err("%s: Unknown parameter `%s'\n", doing, param); - return ERR_PTR(ret); + break; case -ENOSPC: pr_err("%s: `%s' too large for parameter `%s'\n", doing, val ?: "", param); - return ERR_PTR(ret); - case 0: break; default: pr_err("%s: `%s' invalid for parameter `%s'\n", doing, val ?: "", param); - return ERR_PTR(ret); + break; } + + err = ERR_PTR(ret); } - /* All parsed OK. */ - return NULL; + return err; } /* Lazy bastard, eh? */ -- cgit v1.1 From a75a6068dac25d4022ebcd82192ed6345407843c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 10 Sep 2015 15:07:50 +0200 Subject: cpu/hotplug: Read_lock(tasklist_lock) doesn't need to disable irqs check_for_tasks() doesn't need to disable irqs, recursive read_lock() from interrupt is fine. While at it, s/do_each_thread/for_each_process_thread/. Signed-off-by: Oleg Nesterov Reviewed-by: Kirill Tkhai Reviewed-by: Srikar Dronamraju Cc: Kirill Tkhai Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150910130750.GA20055@redhat.com Signed-off-by: Ingo Molnar --- kernel/cpu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 82cf9df..050c634 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -304,8 +304,8 @@ static inline void check_for_tasks(int dead_cpu) { struct task_struct *g, *p; - read_lock_irq(&tasklist_lock); - do_each_thread(g, p) { + read_lock(&tasklist_lock); + for_each_process_thread(g, p) { if (!p->on_rq) continue; /* @@ -320,8 +320,8 @@ static inline void check_for_tasks(int dead_cpu) pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n", p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags); - } while_each_thread(g, p); - read_unlock_irq(&tasklist_lock); + } + read_unlock(&tasklist_lock); } struct take_cpu_down_param { -- cgit v1.1 From a05e8c51ff097ff73ec2947631d9102283545f7c Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Thu, 20 Aug 2015 20:21:56 +0900 Subject: sched/fair: Factor out the {at,de}taching of the per entity load {to,from} the runqueue Currently we open-code the addition/subtraction of the per entity load to/from the runqueue, factor this out into helper functions. Signed-off-by: Byungchul Park [ Rewrote the changelog. ] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1440069720-27038-2-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 77 ++++++++++++++++++++++++++--------------------------- 1 file changed, 38 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6e2e348..a72a71b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2664,8 +2664,8 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) { - int decayed; struct sched_avg *sa = &cfs_rq->avg; + int decayed; if (atomic_long_read(&cfs_rq->removed_load_avg)) { long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); @@ -2695,33 +2695,52 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) static inline void update_load_avg(struct sched_entity *se, int update_tg) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - int cpu = cpu_of(rq_of(cfs_rq)); u64 now = cfs_rq_clock_task(cfs_rq); + int cpu = cpu_of(rq_of(cfs_rq)); /* * Track task load average for carrying it to new CPU after migrated, and * track group sched_entity load average for task_h_load calc in migration */ __update_load_avg(now, cpu, &se->avg, - se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); + se->on_rq * scale_load_down(se->load.weight), + cfs_rq->curr == se, NULL); if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) update_tg_load_avg(cfs_rq, 0); } +static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + se->avg.last_update_time = cfs_rq->avg.last_update_time; + cfs_rq->avg.load_avg += se->avg.load_avg; + cfs_rq->avg.load_sum += se->avg.load_sum; + cfs_rq->avg.util_avg += se->avg.util_avg; + cfs_rq->avg.util_sum += se->avg.util_sum; +} + +static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), + &se->avg, se->on_rq * scale_load_down(se->load.weight), + cfs_rq->curr == se, NULL); + + cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0); + cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0); + cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0); + cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0); +} + /* Add the load generated by se into cfs_rq's load average */ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct sched_avg *sa = &se->avg; u64 now = cfs_rq_clock_task(cfs_rq); - int migrated = 0, decayed; + int migrated, decayed; - if (sa->last_update_time == 0) { - sa->last_update_time = now; - migrated = 1; - } - else { + migrated = !sa->last_update_time; + if (!migrated) { __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); @@ -2732,12 +2751,8 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) cfs_rq->runnable_load_avg += sa->load_avg; cfs_rq->runnable_load_sum += sa->load_sum; - if (migrated) { - cfs_rq->avg.load_avg += sa->load_avg; - cfs_rq->avg.load_sum += sa->load_sum; - cfs_rq->avg.util_avg += sa->util_avg; - cfs_rq->avg.util_sum += sa->util_sum; - } + if (migrated) + attach_entity_load_avg(cfs_rq, se); if (decayed || migrated) update_tg_load_avg(cfs_rq, 0); @@ -2752,7 +2767,7 @@ dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) cfs_rq->runnable_load_avg = max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0); cfs_rq->runnable_load_sum = - max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0); + max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0); } /* @@ -2820,6 +2835,11 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void remove_entity_load_avg(struct sched_entity *se) {} +static inline void +attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} +static inline void +detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} + static inline int idle_balance(struct rq *rq) { return 0; @@ -7909,25 +7929,10 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) se->vruntime -= cfs_rq->min_vruntime; } -#ifdef CONFIG_SMP /* Catch up with the cfs_rq and remove our load when we leave */ - __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq), &se->avg, - se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); - - cfs_rq->avg.load_avg = - max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0); - cfs_rq->avg.load_sum = - max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0); - cfs_rq->avg.util_avg = - max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0); - cfs_rq->avg.util_sum = - max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0); -#endif + detach_entity_load_avg(cfs_rq, se); } -/* - * We switched to the sched_fair class. - */ static void switched_to_fair(struct rq *rq, struct task_struct *p) { struct sched_entity *se = &p->se; @@ -8040,14 +8045,8 @@ static void task_move_group_fair(struct task_struct *p, int queued) cfs_rq = cfs_rq_of(se); se->vruntime += cfs_rq->min_vruntime; -#ifdef CONFIG_SMP /* Virtually synchronize task with its new cfs_rq */ - p->se.avg.last_update_time = cfs_rq->avg.last_update_time; - cfs_rq->avg.load_avg += p->se.avg.load_avg; - cfs_rq->avg.load_sum += p->se.avg.load_sum; - cfs_rq->avg.util_avg += p->se.avg.util_avg; - cfs_rq->avg.util_sum += p->se.avg.util_sum; -#endif + attach_entity_load_avg(cfs_rq, se); } } -- cgit v1.1 From 50a2a3b246149d041065a67ccb3e98145f780a2f Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Thu, 20 Aug 2015 20:21:57 +0900 Subject: sched/fair: Have task_move_group_fair() unconditionally add the entity load to the runqueue Currently we conditionally add the entity load to the rq when moving the task between cgroups. This doesn't make sense as we always 'migrate' the task between cgroups, so we should always migrate the load too. [ The history here is that we used to only migrate the blocked load which was only meaningfull when !queued. ] Signed-off-by: Byungchul Park [ Rewrote the changelog. ] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1440069720-27038-3-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a72a71b..959b2ea 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8041,13 +8041,12 @@ static void task_move_group_fair(struct task_struct *p, int queued) se->vruntime -= cfs_rq_of(se)->min_vruntime; set_task_rq(p, task_cpu(p)); se->depth = se->parent ? se->parent->depth + 1 : 0; - if (!queued) { - cfs_rq = cfs_rq_of(se); + cfs_rq = cfs_rq_of(se); + if (!queued) se->vruntime += cfs_rq->min_vruntime; - /* Virtually synchronize task with its new cfs_rq */ - attach_entity_load_avg(cfs_rq, se); - } + /* Virtually synchronize task with its new cfs_rq */ + attach_entity_load_avg(cfs_rq, se); } void free_fair_sched_group(struct task_group *tg) -- cgit v1.1 From 1746babbb15594ba2d8d8196589bbbc2b5ff51c9 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Thu, 20 Aug 2015 20:21:58 +0900 Subject: sched/fair: Have task_move_group_fair() also detach entity load from the old runqueue Since we attach the entity load to the new runqueue, we should also detatch the entity load from the old runqueue, otherwise load can accumulate. Signed-off-by: Byungchul Park [ Rewrote the changelog. ] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1440069720-27038-4-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 959b2ea..1e1fe7f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8037,8 +8037,12 @@ static void task_move_group_fair(struct task_struct *p, int queued) if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING)) queued = 1; + cfs_rq = cfs_rq_of(se); if (!queued) - se->vruntime -= cfs_rq_of(se)->min_vruntime; + se->vruntime -= cfs_rq->min_vruntime; + + /* Synchronize task with its prev cfs_rq */ + detach_entity_load_avg(cfs_rq, se); set_task_rq(p, task_cpu(p)); se->depth = se->parent ? se->parent->depth + 1 : 0; cfs_rq = cfs_rq_of(se); -- cgit v1.1 From 6efdb105d392da3ad5cb4ef951aed373cd049813 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Thu, 20 Aug 2015 20:21:59 +0900 Subject: sched/fair: Fix switched_to_fair()'s per entity load tracking Where switched_from_fair() will remove the entity's load from the runqueue, switched_to_fair() does not currently add it back. This means that when a task leaves the fair class for a short duration; say because of PI; we loose its load contribution. This can ripple forward and disturb the load tracking because other operations (enqueue, dequeue) assume its factored in. Only once the runqueue empties will the load tracking recover. When we add it back in, age the per entity average to match up with the runqueue age. This has the obvious problem that if the task leaves the fair class for a significant time, the load will age to 0. Employ the normal migration rule for inter-runqueue moves in task_move_group_fair(). Again, there is the obvious problem of the task migrating while not in the fair class. The alternative solution would be to to omit the chunk in attach_entity_load_avg(), which would effectively reset the timestamp and use whatever avg there was. Signed-off-by: Byungchul Park [ Rewrote the changelog and comments. ] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1440069720-27038-5-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1e1fe7f..5143ea0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2712,6 +2712,20 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { + /* + * If we got migrated (either between CPUs or between cgroups) we'll + * have aged the average right before clearing @last_update_time. + */ + if (se->avg.last_update_time) { + __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), + &se->avg, 0, 0, NULL); + + /* + * XXX: we could have just aged the entire load away if we've been + * absent from the fair class for too long. + */ + } + se->avg.last_update_time = cfs_rq->avg.last_update_time; cfs_rq->avg.load_avg += se->avg.load_avg; cfs_rq->avg.load_sum += se->avg.load_sum; @@ -7945,6 +7959,9 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) se->depth = se->parent ? se->parent->depth + 1 : 0; #endif + /* Synchronize task with its cfs_rq */ + attach_entity_load_avg(cfs_rq_of(&p->se), &p->se); + if (!task_on_rq_queued(p)) { /* @@ -8044,6 +8061,12 @@ static void task_move_group_fair(struct task_struct *p, int queued) /* Synchronize task with its prev cfs_rq */ detach_entity_load_avg(cfs_rq, se); set_task_rq(p, task_cpu(p)); + +#ifdef CONFIG_SMP + /* Tell se's cfs_rq has been changed -- migrated */ + p->se.avg.last_update_time = 0; +#endif + se->depth = se->parent ? se->parent->depth + 1 : 0; cfs_rq = cfs_rq_of(se); if (!queued) -- cgit v1.1 From a9280514bf1e54775b8d7cd93d87c05c2b5273e6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 11 Sep 2015 16:10:59 +0200 Subject: sched/fair: Make the entity load aging on attaching tunable In case there are problems with the aging on attach, provide a debug knob to turn it off. Signed-off-by: Peter Zijlstra (Intel) Cc: Byungchul Park Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Cc: yuyang.du@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 ++++ kernel/sched/features.h | 2 ++ 2 files changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5143ea0..5cd7054 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2712,6 +2712,9 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { + if (!sched_feat(ATTACH_AGE_LOAD)) + goto skip_aging; + /* * If we got migrated (either between CPUs or between cgroups) we'll * have aged the average right before clearing @last_update_time. @@ -2726,6 +2729,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s */ } +skip_aging: se->avg.last_update_time = cfs_rq->avg.last_update_time; cfs_rq->avg.load_avg += se->avg.load_avg; cfs_rq->avg.load_sum += se->avg.load_sum; diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 83a50e7..e6fd23b 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -73,6 +73,8 @@ SCHED_FEAT(FORCE_SD_OVERLAP, false) SCHED_FEAT(RT_RUNTIME_SHARE, true) SCHED_FEAT(LB_MIN, false) +SCHED_FEAT(ATTACH_AGE_LOAD, true) + /* * Apply the automatic NUMA scheduling policy. Enabled automatically * at runtime if running on a NUMA machine. Can be controlled via -- cgit v1.1 From daa59407b558e6e621e9081a308d5db3ef991fb6 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Thu, 20 Aug 2015 20:22:00 +0900 Subject: sched/fair: Unify switched_{from,to}_fair() and task_move_group_fair() By observing that switched_from_fair() detaches from a runqueue, and switched_to_fair() attaches to a runqueue, we can see that task_move_group_fair() is one followed by the other with flipping the runqueue in between. Therefore extract all the common bits and implement all three functions in terms of them. This should fix a few corner cases wrt. vruntime normalization; where, when we take a task off of a runqueue we convert to an approximation of lag by subtracting min_vruntime, and when placing a task on the a runqueue to the reverse. Suggested-by: Peter Zijlstra Signed-off-by: Byungchul Park [peterz: Changelog] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Thomas Gleixner Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1440069720-27038-6-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 129 +++++++++++++++++++++------------------------------- 1 file changed, 52 insertions(+), 77 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5cd7054..b96d8dd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7924,21 +7924,39 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) check_preempt_curr(rq, p, 0); } -static void switched_from_fair(struct rq *rq, struct task_struct *p) +static inline bool vruntime_normalized(struct task_struct *p) { struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); /* - * Ensure the task's vruntime is normalized, so that when it's - * switched back to the fair class the enqueue_entity(.flags=0) will - * do the right thing. + * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases, + * the dequeue_entity(.flags=0) will already have normalized the + * vruntime. + */ + if (p->on_rq) + return true; + + /* + * When !on_rq, vruntime of the task has usually NOT been normalized. + * But there are some cases where it has already been normalized: * - * If it's queued, then the dequeue_entity(.flags=0) will already - * have normalized the vruntime, if it's !queued, then only when - * the task is sleeping will it still have non-normalized vruntime. + * - A forked child which is waiting for being woken up by + * wake_up_new_task(). + * - A task which has been woken up by try_to_wake_up() and + * waiting for actually being woken up by sched_ttwu_pending(). */ - if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) { + if (!se->sum_exec_runtime || p->state == TASK_WAKING) + return true; + + return false; +} + +static void detach_task_cfs_rq(struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + if (!vruntime_normalized(p)) { /* * Fix up our vruntime so that the current sleep doesn't * cause 'unlimited' sleep bonus. @@ -7951,9 +7969,10 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) detach_entity_load_avg(cfs_rq, se); } -static void switched_to_fair(struct rq *rq, struct task_struct *p) +static void attach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); #ifdef CONFIG_FAIR_GROUP_SCHED /* @@ -7964,33 +7983,32 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) #endif /* Synchronize task with its cfs_rq */ - attach_entity_load_avg(cfs_rq_of(&p->se), &p->se); + attach_entity_load_avg(cfs_rq, se); + + if (!vruntime_normalized(p)) + se->vruntime += cfs_rq->min_vruntime; +} - if (!task_on_rq_queued(p)) { +static void switched_from_fair(struct rq *rq, struct task_struct *p) +{ + detach_task_cfs_rq(p); +} + +static void switched_to_fair(struct rq *rq, struct task_struct *p) +{ + attach_task_cfs_rq(p); + if (task_on_rq_queued(p)) { /* - * Ensure the task has a non-normalized vruntime when it is switched - * back to the fair class with !queued, so that enqueue_entity() at - * wake-up time will do the right thing. - * - * If it's queued, then the enqueue_entity(.flags=0) makes the task - * has non-normalized vruntime, if it's !queued, then it still has - * normalized vruntime. + * We were most likely switched from sched_rt, so + * kick off the schedule if running, otherwise just see + * if we can still preempt the current task. */ - if (p->state != TASK_RUNNING) - se->vruntime += cfs_rq_of(se)->min_vruntime; - return; + if (rq->curr == p) + resched_curr(rq); + else + check_preempt_curr(rq, p, 0); } - - /* - * We were most likely switched from sched_rt, so - * kick off the schedule if running, otherwise just see - * if we can still preempt the current task. - */ - if (rq->curr == p) - resched_curr(rq); - else - check_preempt_curr(rq, p, 0); } /* Account for a task changing its policy or group. @@ -8027,57 +8045,14 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) #ifdef CONFIG_FAIR_GROUP_SCHED static void task_move_group_fair(struct task_struct *p, int queued) { - struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq; - - /* - * If the task was not on the rq at the time of this cgroup movement - * it must have been asleep, sleeping tasks keep their ->vruntime - * absolute on their old rq until wakeup (needed for the fair sleeper - * bonus in place_entity()). - * - * If it was on the rq, we've just 'preempted' it, which does convert - * ->vruntime to a relative base. - * - * Make sure both cases convert their relative position when migrating - * to another cgroup's rq. This does somewhat interfere with the - * fair sleeper stuff for the first placement, but who cares. - */ - /* - * When !queued, vruntime of the task has usually NOT been normalized. - * But there are some cases where it has already been normalized: - * - * - Moving a forked child which is waiting for being woken up by - * wake_up_new_task(). - * - Moving a task which has been woken up by try_to_wake_up() and - * waiting for actually being woken up by sched_ttwu_pending(). - * - * To prevent boost or penalty in the new cfs_rq caused by delta - * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. - */ - if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING)) - queued = 1; - - cfs_rq = cfs_rq_of(se); - if (!queued) - se->vruntime -= cfs_rq->min_vruntime; - - /* Synchronize task with its prev cfs_rq */ - detach_entity_load_avg(cfs_rq, se); + detach_task_cfs_rq(p); set_task_rq(p, task_cpu(p)); #ifdef CONFIG_SMP /* Tell se's cfs_rq has been changed -- migrated */ p->se.avg.last_update_time = 0; #endif - - se->depth = se->parent ? se->parent->depth + 1 : 0; - cfs_rq = cfs_rq_of(se); - if (!queued) - se->vruntime += cfs_rq->min_vruntime; - - /* Virtually synchronize task with its new cfs_rq */ - attach_entity_load_avg(cfs_rq, se); + attach_task_cfs_rq(p); } void free_fair_sched_group(struct task_group *tg) -- cgit v1.1 From bc54da2176cd38cedea767eff637229a191a2383 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 31 Aug 2015 17:13:55 +0200 Subject: sched/core: Remove unused argument from sched_class::task_move_group The previous patches made the second argument go unused, remove it. Signed-off-by: Peter Zijlstra (Intel) Cc: Byungchul Park Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- kernel/sched/fair.c | 2 +- kernel/sched/sched.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 97d276f..7c099e6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7721,7 +7721,7 @@ void sched_move_task(struct task_struct *tsk) #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_move_group) - tsk->sched_class->task_move_group(tsk, queued); + tsk->sched_class->task_move_group(tsk); else #endif set_task_rq(tsk, task_cpu(tsk)); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b96d8dd..4e305d1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8043,7 +8043,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) } #ifdef CONFIG_FAIR_GROUP_SCHED -static void task_move_group_fair(struct task_struct *p, int queued) +static void task_move_group_fair(struct task_struct *p) { detach_task_cfs_rq(p); set_task_rq(p, task_cpu(p)); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 68cda11..637d5ae 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1226,7 +1226,7 @@ struct sched_class { void (*update_curr) (struct rq *rq); #ifdef CONFIG_FAIR_GROUP_SCHED - void (*task_move_group) (struct task_struct *p, int on_rq); + void (*task_move_group) (struct task_struct *p); #endif }; -- cgit v1.1 From 446685e9bfa11174332fbb0b3218b37015fbf4ff Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 31 Aug 2015 15:12:56 +0300 Subject: sched/core: Delete PF_EXITING checks from cpu_cgroup_exit() callback cgroup_exit() is not called from copy_process() after commit: e8604cb43690 ("cgroup: fix spurious lockdep warning in cgroup_exit()") from do_exit(). So this check is useless and the comment is obsolete. Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/55E444C8.3020402@odin.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7c099e6..37ab6f9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8193,14 +8193,6 @@ static void cpu_cgroup_exit(struct cgroup_subsys_state *css, struct cgroup_subsys_state *old_css, struct task_struct *task) { - /* - * cgroup_exit() is called in the copy_process() failure path. - * Ignore this case since the task hasn't ran yet, this avoids - * trying to poke a half freed task state from generic code. - */ - if (!(task->flags & PF_EXITING)) - return; - sched_move_task(task); } -- cgit v1.1 From c5afb6a87f2386bcf09fa051e6ca390d43e2222e Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 3 Aug 2015 11:55:50 +0200 Subject: sched/fair: Fix nohz.next_balance update Since commit: d4573c3e1c99 ("sched: Improve load balancing in the presence of idle CPUs") the ILB CPU starts with the idle load balancing of other idle CPUs and finishes with itself in order to speed up the spread of tasks in all idle CPUs. The this_rq->next_balance is still used in nohz_idle_balance() as an intermediate step to gather the shortest next balance before updating nohz.next_balance. But the former has not been updated yet and is likely to be set with the current jiffies. As a result, the nohz.next_balance will be set with current jiffies instead of the real next balance date. This generates spurious kicks of nohz ilde balance. nohz_idle_balance() must set the nohz.next_balance without taking into account this_rq->next_balance which is not updated yet. Then, this_rq will update nohz.next_update with its next_balance once updated and if necessary. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Jason Low Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: preeti@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/1438595750-20455-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4e305d1..36774e5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7647,8 +7647,22 @@ out: * When the cpu is attached to null domain for ex, it will not be * updated. */ - if (likely(update_next_balance)) + if (likely(update_next_balance)) { rq->next_balance = next_balance; + +#ifdef CONFIG_NO_HZ_COMMON + /* + * If this CPU has been elected to perform the nohz idle + * balance. Other idle CPUs have already rebalanced with + * nohz_idle_balance() and nohz.next_balance has been + * updated accordingly. This CPU is now running the idle load + * balance for itself and we need to update the + * nohz.next_balance accordingly. + */ + if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance)) + nohz.next_balance = rq->next_balance; +#endif + } } #ifdef CONFIG_NO_HZ_COMMON @@ -7661,6 +7675,9 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) int this_cpu = this_rq->cpu; struct rq *rq; int balance_cpu; + /* Earliest time when we have to do rebalance again */ + unsigned long next_balance = jiffies + 60*HZ; + int update_next_balance = 0; if (idle != CPU_IDLE || !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu))) @@ -7692,10 +7709,19 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) rebalance_domains(rq, CPU_IDLE); } - if (time_after(this_rq->next_balance, rq->next_balance)) - this_rq->next_balance = rq->next_balance; + if (time_after(next_balance, rq->next_balance)) { + next_balance = rq->next_balance; + update_next_balance = 1; + } } - nohz.next_balance = this_rq->next_balance; + + /* + * next_balance will be updated only when there is a need. + * When the CPU is attached to null domain for ex, it will not be + * updated. + */ + if (likely(update_next_balance)) + nohz.next_balance = next_balance; end: clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)); } -- cgit v1.1 From 78a9c54649ea220065aad9902460a1d137c7eafd Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Tue, 11 Aug 2015 16:30:11 +0530 Subject: sched/numa: Rename numabalancing_enabled to sched_numa_balancing Simple rename of the 'numabalancing_enabled' variable to 'sched_numa_balancing'. No functional changes. Suggested-by: Ingo Molnar Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1439290813-6683-2-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 +++--- kernel/sched/fair.c | 4 ++-- kernel/sched/sched.h | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 37ab6f9..2656af0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2124,11 +2124,11 @@ void set_numabalancing_state(bool enabled) sched_feat_set("NO_NUMA"); } #else -__read_mostly bool numabalancing_enabled; +__read_mostly bool sched_numa_balancing; void set_numabalancing_state(bool enabled) { - numabalancing_enabled = enabled; + sched_numa_balancing = enabled; } #endif /* CONFIG_SCHED_DEBUG */ @@ -2138,7 +2138,7 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, { struct ctl_table t; int err; - int state = numabalancing_enabled; + int state = sched_numa_balancing; if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 36774e5..3a6ac55 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2069,7 +2069,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) int local = !!(flags & TNF_FAULT_LOCAL); int priv; - if (!numabalancing_enabled) + if (!sched_numa_balancing) return; /* for example, ksmd faulting in a user's mm */ @@ -7874,7 +7874,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) entity_tick(cfs_rq, se, queued); } - if (numabalancing_enabled) + if (sched_numa_balancing) task_tick_numa(rq, curr); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 637d5ae..d0b303d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1006,13 +1006,13 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #ifdef CONFIG_NUMA_BALANCING #define sched_feat_numa(x) sched_feat(x) #ifdef CONFIG_SCHED_DEBUG -#define numabalancing_enabled sched_feat_numa(NUMA) +#define sched_numa_balancing sched_feat_numa(NUMA) #else -extern bool numabalancing_enabled; +extern bool sched_numa_balancing; #endif /* CONFIG_SCHED_DEBUG */ #else #define sched_feat_numa(x) (0) -#define numabalancing_enabled (0) +#define sched_numa_balancing (0) #endif /* CONFIG_NUMA_BALANCING */ static inline u64 global_rt_period(void) -- cgit v1.1 From c3b9bc5bbfc3750570d788afffd431263ef695c6 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Tue, 11 Aug 2015 16:30:12 +0530 Subject: sched/numa: Disable sched_numa_balancing on UMA systems Commit 2a1ed24 ("sched/numa: Prefer NUMA hotness over cache hotness") sets sched feature NUMA to true. However this can enable NUMA hinting faults on a UMA system. This commit ensures that NUMA hinting faults occur only on a NUMA system by setting/resetting sched_numa_balancing. This commit: - Makes sched_numa_balancing common to CONFIG_SCHED_DEBUG and !CONFIG_SCHED_DEBUG. Earlier it was only in !CONFIG_SCHED_DEBUG. - Checks for sched_numa_balancing instead of sched_feat(NUMA). Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1439290813-6683-3-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 14 +++++--------- kernel/sched/fair.c | 4 ++-- kernel/sched/sched.h | 6 ------ 3 files changed, 7 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2656af0..ca665f8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2115,22 +2115,18 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) } #ifdef CONFIG_NUMA_BALANCING -#ifdef CONFIG_SCHED_DEBUG +__read_mostly bool sched_numa_balancing; + void set_numabalancing_state(bool enabled) { + sched_numa_balancing = enabled; +#ifdef CONFIG_SCHED_DEBUG if (enabled) sched_feat_set("NUMA"); else sched_feat_set("NO_NUMA"); -} -#else -__read_mostly bool sched_numa_balancing; - -void set_numabalancing_state(bool enabled) -{ - sched_numa_balancing = enabled; -} #endif /* CONFIG_SCHED_DEBUG */ +} #ifdef CONFIG_PROC_SYSCTL int sysctl_numa_balancing(struct ctl_table *table, int write, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3a6ac55..e8f0828 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5562,10 +5562,10 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) unsigned long src_faults, dst_faults; int src_nid, dst_nid; - if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) + if (!sched_numa_balancing) return -1; - if (!sched_feat(NUMA)) + if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) return -1; src_nid = cpu_to_node(env->src_cpu); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d0b303d..0d8f885 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1004,14 +1004,8 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ #ifdef CONFIG_NUMA_BALANCING -#define sched_feat_numa(x) sched_feat(x) -#ifdef CONFIG_SCHED_DEBUG -#define sched_numa_balancing sched_feat_numa(NUMA) -#else extern bool sched_numa_balancing; -#endif /* CONFIG_SCHED_DEBUG */ #else -#define sched_feat_numa(x) (0) #define sched_numa_balancing (0) #endif /* CONFIG_NUMA_BALANCING */ -- cgit v1.1 From 2b49d84b259fc18e131026e5d38e7855352f71b9 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Tue, 11 Aug 2015 16:30:13 +0530 Subject: sched/numa: Remove the NUMA sched_feature Variable sched_numa_balancing is available for both CONFIG_SCHED_DEBUG and !CONFIG_SCHED_DEBUG. All code paths now check for sched_numa_balancing. Hence remove sched_feat(NUMA). Suggested-by: Ingo Molnar Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1439290813-6683-4-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 ------ kernel/sched/features.h | 16 ---------------- 2 files changed, 22 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ca665f8..e0bd88b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2120,12 +2120,6 @@ __read_mostly bool sched_numa_balancing; void set_numabalancing_state(bool enabled) { sched_numa_balancing = enabled; -#ifdef CONFIG_SCHED_DEBUG - if (enabled) - sched_feat_set("NUMA"); - else - sched_feat_set("NO_NUMA"); -#endif /* CONFIG_SCHED_DEBUG */ } #ifdef CONFIG_PROC_SYSCTL diff --git a/kernel/sched/features.h b/kernel/sched/features.h index e6fd23b..edf5902 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -72,21 +72,5 @@ SCHED_FEAT(RT_PUSH_IPI, true) SCHED_FEAT(FORCE_SD_OVERLAP, false) SCHED_FEAT(RT_RUNTIME_SHARE, true) SCHED_FEAT(LB_MIN, false) - SCHED_FEAT(ATTACH_AGE_LOAD, true) -/* - * Apply the automatic NUMA scheduling policy. Enabled automatically - * at runtime if running on a NUMA machine. Can be controlled via - * numa_balancing= - */ -#ifdef CONFIG_NUMA_BALANCING - -/* - * NUMA will favor moving tasks towards nodes where a higher number of - * hinting faults are recorded during active load balancing. It will - * resist moving tasks towards nodes where a lower number of hinting - * faults have been recorded. - */ -SCHED_FEAT(NUMA, true) -#endif -- cgit v1.1 From 2a595721a1fa6b684c1c818f379bef834ac3d65e Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Tue, 11 Aug 2015 21:54:21 +0530 Subject: sched/numa: Convert sched_numa_balancing to a static_branch Variable sched_numa_balancing toggles numa_balancing feature. Hence moving from a simple read mostly variable to a more apt static_branch. Suggested-by: Peter Zijlstra Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1439310261-16124-1-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 10 +++++++--- kernel/sched/fair.c | 6 +++--- kernel/sched/sched.h | 6 +----- 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e0bd88b..b621271 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2114,12 +2114,16 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #endif /* CONFIG_NUMA_BALANCING */ } +DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); + #ifdef CONFIG_NUMA_BALANCING -__read_mostly bool sched_numa_balancing; void set_numabalancing_state(bool enabled) { - sched_numa_balancing = enabled; + if (enabled) + static_branch_enable(&sched_numa_balancing); + else + static_branch_disable(&sched_numa_balancing); } #ifdef CONFIG_PROC_SYSCTL @@ -2128,7 +2132,7 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, { struct ctl_table t; int err; - int state = sched_numa_balancing; + int state = static_branch_likely(&sched_numa_balancing); if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e8f0828..47ece22 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2069,7 +2069,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) int local = !!(flags & TNF_FAULT_LOCAL); int priv; - if (!sched_numa_balancing) + if (!static_branch_likely(&sched_numa_balancing)) return; /* for example, ksmd faulting in a user's mm */ @@ -5562,7 +5562,7 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) unsigned long src_faults, dst_faults; int src_nid, dst_nid; - if (!sched_numa_balancing) + if (!static_branch_likely(&sched_numa_balancing)) return -1; if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) @@ -7874,7 +7874,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) entity_tick(cfs_rq, se, queued); } - if (sched_numa_balancing) + if (!static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0d8f885..2e8530d0 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1003,11 +1003,7 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ -#ifdef CONFIG_NUMA_BALANCING -extern bool sched_numa_balancing; -#else -#define sched_numa_balancing (0) -#endif /* CONFIG_NUMA_BALANCING */ +extern struct static_key_false sched_numa_balancing; static inline u64 global_rt_period(void) { -- cgit v1.1 From e0f5f3afd2cffa96291cd852056d83ff4e2e99c7 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 14 Aug 2015 17:23:09 +0100 Subject: sched/fair: Make load tracking frequency scale-invariant Apply frequency scaling correction factor to per-entity load tracking to make it frequency invariant. Currently, load appears bigger when the CPU is running slower which affects load-balancing decisions. Each segment of the sched_avg.load_sum geometric series is now scaled by the current frequency so that the sched_avg.load_avg of each sched entity will be invariant from frequency scaling. Moreover, cfs_rq.runnable_load_sum is scaled by the current frequency as well. Signed-off-by: Dietmar Eggemann Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Acked-by: Vincent Guittot Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: daniel.lezcano@linaro.org Cc: mturquette@baylibre.com Cc: pang.xunlei@zte.com.cn Cc: rjw@rjwysocki.net Cc: sgurrappadi@nvidia.com Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1439569394-11974-2-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 47ece22..86cb27c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2515,6 +2515,8 @@ static u32 __compute_runnable_contrib(u64 n) return contrib + runnable_avg_yN_sum[n]; } +#define scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) + /* * We can represent the historical contribution to runnable average as the * coefficients of a geometric series. To do this we sub-divide our runnable @@ -2547,9 +2549,9 @@ static __always_inline int __update_load_avg(u64 now, int cpu, struct sched_avg *sa, unsigned long weight, int running, struct cfs_rq *cfs_rq) { - u64 delta, periods; + u64 delta, scaled_delta, periods; u32 contrib; - int delta_w, decayed = 0; + int delta_w, scaled_delta_w, decayed = 0; unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu); delta = now - sa->last_update_time; @@ -2585,13 +2587,16 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, * period and accrue it. */ delta_w = 1024 - delta_w; + scaled_delta_w = scale(delta_w, scale_freq); if (weight) { - sa->load_sum += weight * delta_w; - if (cfs_rq) - cfs_rq->runnable_load_sum += weight * delta_w; + sa->load_sum += weight * scaled_delta_w; + if (cfs_rq) { + cfs_rq->runnable_load_sum += + weight * scaled_delta_w; + } } if (running) - sa->util_sum += delta_w * scale_freq >> SCHED_CAPACITY_SHIFT; + sa->util_sum += scaled_delta_w; delta -= delta_w; @@ -2608,23 +2613,25 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, /* Efficiently calculate \sum (1..n_period) 1024*y^i */ contrib = __compute_runnable_contrib(periods); + contrib = scale(contrib, scale_freq); if (weight) { sa->load_sum += weight * contrib; if (cfs_rq) cfs_rq->runnable_load_sum += weight * contrib; } if (running) - sa->util_sum += contrib * scale_freq >> SCHED_CAPACITY_SHIFT; + sa->util_sum += contrib; } /* Remainder of delta accrued against u_0` */ + scaled_delta = scale(delta, scale_freq); if (weight) { - sa->load_sum += weight * delta; + sa->load_sum += weight * scaled_delta; if (cfs_rq) - cfs_rq->runnable_load_sum += weight * delta; + cfs_rq->runnable_load_sum += weight * scaled_delta; } if (running) - sa->util_sum += delta * scale_freq >> SCHED_CAPACITY_SHIFT; + sa->util_sum += scaled_delta; sa->period_contrib += delta; -- cgit v1.1 From 8cd5601c50603caa195ce86cc465cb04079ed488 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 14 Aug 2015 17:23:10 +0100 Subject: sched/fair: Convert arch_scale_cpu_capacity() from weak function to #define Bring arch_scale_cpu_capacity() in line with the recent change of its arch_scale_freq_capacity() sibling in commit dfbca41f3479 ("sched: Optimize freq invariant accounting") from weak function to #define to allow inlining of the function. While at it, remove the ARCH_CAPACITY sched_feature as well. With the change to #define there isn't a straightforward way to allow runtime switch between an arch implementation and the default implementation of arch_scale_cpu_capacity() using sched_feature. The default was to use the arch-specific implementation, but only the arm architecture provides one and that is essentially equivalent to the default implementation. Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: daniel.lezcano@linaro.org Cc: mturquette@baylibre.com Cc: pang.xunlei@zte.com.cn Cc: rjw@rjwysocki.net Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1439569394-11974-3-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 22 +--------------------- kernel/sched/features.h | 5 ----- kernel/sched/sched.h | 11 +++++++++++ 3 files changed, 12 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 86cb27c..102cdf1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6054,19 +6054,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd, return load_idx; } -static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu) -{ - if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) - return sd->smt_gain / sd->span_weight; - - return SCHED_CAPACITY_SCALE; -} - -unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) -{ - return default_scale_cpu_capacity(sd, cpu); -} - static unsigned long scale_rt_capacity(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -6096,16 +6083,9 @@ static unsigned long scale_rt_capacity(int cpu) static void update_cpu_capacity(struct sched_domain *sd, int cpu) { - unsigned long capacity = SCHED_CAPACITY_SCALE; + unsigned long capacity = arch_scale_cpu_capacity(sd, cpu); struct sched_group *sdg = sd->groups; - if (sched_feat(ARCH_CAPACITY)) - capacity *= arch_scale_cpu_capacity(sd, cpu); - else - capacity *= default_scale_cpu_capacity(sd, cpu); - - capacity >>= SCHED_CAPACITY_SHIFT; - cpu_rq(cpu)->cpu_capacity_orig = capacity; capacity *= scale_rt_capacity(cpu); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index edf5902..69631fa 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -36,11 +36,6 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true) */ SCHED_FEAT(WAKEUP_PREEMPTION, true) -/* - * Use arch dependent cpu capacity functions - */ -SCHED_FEAT(ARCH_CAPACITY, true) - SCHED_FEAT(HRTICK, false) SCHED_FEAT(DOUBLE_TICK, false) SCHED_FEAT(LB_BIAS, true) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2e8530d0..c0726d5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1394,6 +1394,17 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) } #endif +#ifndef arch_scale_cpu_capacity +static __always_inline +unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) +{ + if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) + return sd->smt_gain / sd->span_weight; + + return SCHED_CAPACITY_SCALE; +} +#endif + static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); -- cgit v1.1 From e3279a2e6d697e00e74f905851ee7cf532f72b2d Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Sat, 15 Aug 2015 00:04:41 +0100 Subject: sched/fair: Make utilization tracking CPU scale-invariant Besides the existing frequency scale-invariance correction factor, apply CPU scale-invariance correction factor to utilization tracking to compensate for any differences in compute capacity. This could be due to micro-architectural differences (i.e. instructions per seconds) between cpus in HMP systems (e.g. big.LITTLE), and/or differences in the current maximum frequency supported by individual cpus in SMP systems. In the existing implementation utilization isn't comparable between cpus as it is relative to the capacity of each individual CPU. Each segment of the sched_avg.util_sum geometric series is now scaled by the CPU performance factor too so the sched_avg.util_avg of each sched entity will be invariant from the particular CPU of the HMP/SMP system on which the sched entity is scheduled. With this patch, the utilization of a CPU stays relative to the max CPU performance of the fastest CPU in the system. In contrast to utilization (sched_avg.util_sum), load (sched_avg.load_sum) should not be scaled by compute capacity. The utilization metric is based on running time which only makes sense when cpus are _not_ fully utilized (utilization cannot go beyond 100% even if more tasks are added), where load is runnable time which isn't limited by the capacity of the CPU and therefore is a better metric for overloaded scenarios. If we run two nice-0 busy loops on two cpus with different compute capacity their load should be similar since their compute demands are the same. We have to assume that the compute demand of any task running on a fully utilized CPU (no spare cycles = 100% utilization) is high and the same no matter of the compute capacity of its current CPU, hence we shouldn't scale load by CPU capacity. Signed-off-by: Dietmar Eggemann Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/55CE7409.1000700@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 7 ++++--- kernel/sched/sched.h | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 102cdf1..573dc98 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2553,6 +2553,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, u32 contrib; int delta_w, scaled_delta_w, decayed = 0; unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu); + unsigned long scale_cpu = arch_scale_cpu_capacity(NULL, cpu); delta = now - sa->last_update_time; /* @@ -2596,7 +2597,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, } } if (running) - sa->util_sum += scaled_delta_w; + sa->util_sum += scale(scaled_delta_w, scale_cpu); delta -= delta_w; @@ -2620,7 +2621,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, cfs_rq->runnable_load_sum += weight * contrib; } if (running) - sa->util_sum += contrib; + sa->util_sum += scale(contrib, scale_cpu); } /* Remainder of delta accrued against u_0` */ @@ -2631,7 +2632,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, cfs_rq->runnable_load_sum += weight * scaled_delta; } if (running) - sa->util_sum += scaled_delta; + sa->util_sum += scale(scaled_delta, scale_cpu); sa->period_contrib += delta; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c0726d5..167ab48 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1398,7 +1398,7 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) static __always_inline unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) { - if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) + if (sd && (sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) return sd->smt_gain / sd->span_weight; return SCHED_CAPACITY_SCALE; -- cgit v1.1 From 9e91d61d9b0ca8d865dbd59af8d0d5c5b68003e9 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 14 Aug 2015 17:23:12 +0100 Subject: sched/fair: Name utilization related data and functions consistently Use the advent of the per-entity load tracking rewrite to streamline the naming of utilization related data and functions by using {prefix_}util{_suffix} consistently. Moreover call both signals ({se,cfs}.avg.util_avg) utilization. Signed-off-by: Dietmar Eggemann Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: daniel.lezcano@linaro.org Cc: mturquette@baylibre.com Cc: pang.xunlei@zte.com.cn Cc: rjw@rjwysocki.net Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1439569394-11974-5-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 573dc98..1b56d63 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4863,31 +4863,32 @@ done: return target; } /* - * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS + * cpu_util returns the amount of capacity of a CPU that is used by CFS * tasks. The unit of the return value must be the one of capacity so we can - * compare the usage with the capacity of the CPU that is available for CFS - * task (ie cpu_capacity). + * compare the utilization with the capacity of the CPU that is available for + * CFS task (ie cpu_capacity). * cfs.avg.util_avg is the sum of running time of runnable tasks on a * CPU. It represents the amount of utilization of a CPU in the range - * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full - * capacity of the CPU because it's about the running time on this CPU. + * [0..SCHED_LOAD_SCALE]. The utilization of a CPU can't be higher than the + * full capacity of the CPU because it's about the running time on this CPU. * Nevertheless, cfs.avg.util_avg can be higher than SCHED_LOAD_SCALE * because of unfortunate rounding in util_avg or just * after migrating tasks until the average stabilizes with the new running - * time. So we need to check that the usage stays into the range + * time. So we need to check that the utilization stays into the range * [0..cpu_capacity_orig] and cap if necessary. - * Without capping the usage, a group could be seen as overloaded (CPU0 usage - * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity + * Without capping the utilization, a group could be seen as overloaded (CPU0 + * utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of + * available capacity. */ -static int get_cpu_usage(int cpu) +static int cpu_util(int cpu) { - unsigned long usage = cpu_rq(cpu)->cfs.avg.util_avg; + unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; unsigned long capacity = capacity_orig_of(cpu); - if (usage >= SCHED_LOAD_SCALE) + if (util >= SCHED_LOAD_SCALE) return capacity; - return (usage * capacity) >> SCHED_LOAD_SHIFT; + return (util * capacity) >> SCHED_LOAD_SHIFT; } /* @@ -5979,7 +5980,7 @@ struct sg_lb_stats { unsigned long sum_weighted_load; /* Weighted load of group's tasks */ unsigned long load_per_task; unsigned long group_capacity; - unsigned long group_usage; /* Total usage of the group */ + unsigned long group_util; /* Total utilization of the group */ unsigned int sum_nr_running; /* Nr tasks running in the group */ unsigned int idle_cpus; unsigned int group_weight; @@ -6212,8 +6213,8 @@ static inline int sg_imbalanced(struct sched_group *group) * group_has_capacity returns true if the group has spare capacity that could * be used by some tasks. * We consider that a group has spare capacity if the * number of task is - * smaller than the number of CPUs or if the usage is lower than the available - * capacity for CFS tasks. + * smaller than the number of CPUs or if the utilization is lower than the + * available capacity for CFS tasks. * For the latter, we use a threshold to stabilize the state, to take into * account the variance of the tasks' load and to return true if the available * capacity in meaningful for the load balancer. @@ -6227,7 +6228,7 @@ group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) return true; if ((sgs->group_capacity * 100) > - (sgs->group_usage * env->sd->imbalance_pct)) + (sgs->group_util * env->sd->imbalance_pct)) return true; return false; @@ -6248,7 +6249,7 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) return false; if ((sgs->group_capacity * 100) < - (sgs->group_usage * env->sd->imbalance_pct)) + (sgs->group_util * env->sd->imbalance_pct)) return true; return false; @@ -6296,7 +6297,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, load = source_load(i, load_idx); sgs->group_load += load; - sgs->group_usage += get_cpu_usage(i); + sgs->group_util += cpu_util(i); sgs->sum_nr_running += rq->cfs.h_nr_running; if (rq->nr_running > 1) -- cgit v1.1 From 231678b768da07d19ab5683a39eeb0c250631d02 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 14 Aug 2015 17:23:13 +0100 Subject: sched/fair: Get rid of scaling utilization by capacity_orig Utilization is currently scaled by capacity_orig, but since we now have frequency and cpu invariant cfs_rq.avg.util_avg, frequency and cpu scaling now happens as part of the utilization tracking itself. So cfs_rq.avg.util_avg should no longer be scaled in cpu_util(). Signed-off-by: Dietmar Eggemann Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steve Muckle Cc: Thomas Gleixner Cc: daniel.lezcano@linaro.org Cc: mturquette@baylibre.com Cc: pang.xunlei@zte.com.cn Cc: rjw@rjwysocki.net Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/55EDAF43.30500@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1b56d63..047fd1c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4862,33 +4862,39 @@ next: done: return target; } + /* * cpu_util returns the amount of capacity of a CPU that is used by CFS * tasks. The unit of the return value must be the one of capacity so we can * compare the utilization with the capacity of the CPU that is available for * CFS task (ie cpu_capacity). - * cfs.avg.util_avg is the sum of running time of runnable tasks on a - * CPU. It represents the amount of utilization of a CPU in the range - * [0..SCHED_LOAD_SCALE]. The utilization of a CPU can't be higher than the - * full capacity of the CPU because it's about the running time on this CPU. - * Nevertheless, cfs.avg.util_avg can be higher than SCHED_LOAD_SCALE - * because of unfortunate rounding in util_avg or just - * after migrating tasks until the average stabilizes with the new running - * time. So we need to check that the utilization stays into the range - * [0..cpu_capacity_orig] and cap if necessary. - * Without capping the utilization, a group could be seen as overloaded (CPU0 - * utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of - * available capacity. + * + * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the + * recent utilization of currently non-runnable tasks on a CPU. It represents + * the amount of utilization of a CPU in the range [0..capacity_orig] where + * capacity_orig is the cpu_capacity available at the highest frequency + * (arch_scale_freq_capacity()). + * The utilization of a CPU converges towards a sum equal to or less than the + * current capacity (capacity_curr <= capacity_orig) of the CPU because it is + * the running time on this CPU scaled by capacity_curr. + * + * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even + * higher than capacity_orig because of unfortunate rounding in + * cfs.avg.util_avg or just after migrating tasks and new task wakeups until + * the average stabilizes with the new running time. We need to check that the + * utilization stays within the range of [0..capacity_orig] and cap it if + * necessary. Without utilization capping, a group could be seen as overloaded + * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of + * available capacity. We allow utilization to overshoot capacity_curr (but not + * capacity_orig) as it useful for predicting the capacity required after task + * migrations (scheduler-driven DVFS). */ static int cpu_util(int cpu) { unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; unsigned long capacity = capacity_orig_of(cpu); - if (util >= SCHED_LOAD_SCALE) - return capacity; - - return (util * capacity) >> SCHED_LOAD_SHIFT; + return (util >= capacity) ? capacity : util; } /* -- cgit v1.1 From 98d8fd8126676f7ba6e133e65b2ca4b17989d32c Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 14 Aug 2015 17:23:14 +0100 Subject: sched/fair: Initialize task load and utilization before placing task on rq Task load or utilization is not currently considered in select_task_rq_fair(), but if we want that in the future we should make sure it is not zero for new tasks. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: daniel.lezcano@linaro.org Cc: mturquette@baylibre.com Cc: pang.xunlei@zte.com.cn Cc: rjw@rjwysocki.net Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1439569394-11974-7-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b621271..6ab415a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2343,6 +2343,8 @@ void wake_up_new_task(struct task_struct *p) struct rq *rq; raw_spin_lock_irqsave(&p->pi_lock, flags); + /* Initialize new task's runnable average */ + init_entity_runnable_average(&p->se); #ifdef CONFIG_SMP /* * Fork balancing, do it here and not earlier because: @@ -2352,8 +2354,6 @@ void wake_up_new_task(struct task_struct *p) set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif - /* Initialize new task's runnable average */ - init_entity_runnable_average(&p->se); rq = __task_rq_lock(p); activate_task(rq, p, 0); p->on_rq = TASK_ON_RQ_QUEUED; -- cgit v1.1 From 54a21385facbdcd89a78e8c3e5025f04c5f2b59c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 7 Sep 2015 15:05:42 +0200 Subject: sched/fair: Rename scale() to cap_scale() Rename scale() to cap_scale() to better reflect its purpose, it is after all not a general purpose scale function, it has SCHED_CAPACITY_SHIFT hardcoded in it. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 047fd1c..7109047 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2515,7 +2515,7 @@ static u32 __compute_runnable_contrib(u64 n) return contrib + runnable_avg_yN_sum[n]; } -#define scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) +#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) /* * We can represent the historical contribution to runnable average as the @@ -2588,7 +2588,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, * period and accrue it. */ delta_w = 1024 - delta_w; - scaled_delta_w = scale(delta_w, scale_freq); + scaled_delta_w = cap_scale(delta_w, scale_freq); if (weight) { sa->load_sum += weight * scaled_delta_w; if (cfs_rq) { @@ -2597,7 +2597,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, } } if (running) - sa->util_sum += scale(scaled_delta_w, scale_cpu); + sa->util_sum += cap_scale(scaled_delta_w, scale_cpu); delta -= delta_w; @@ -2614,25 +2614,25 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, /* Efficiently calculate \sum (1..n_period) 1024*y^i */ contrib = __compute_runnable_contrib(periods); - contrib = scale(contrib, scale_freq); + contrib = cap_scale(contrib, scale_freq); if (weight) { sa->load_sum += weight * contrib; if (cfs_rq) cfs_rq->runnable_load_sum += weight * contrib; } if (running) - sa->util_sum += scale(contrib, scale_cpu); + sa->util_sum += cap_scale(contrib, scale_cpu); } /* Remainder of delta accrued against u_0` */ - scaled_delta = scale(delta, scale_freq); + scaled_delta = cap_scale(delta, scale_freq); if (weight) { sa->load_sum += weight * scaled_delta; if (cfs_rq) cfs_rq->runnable_load_sum += weight * scaled_delta; } if (running) - sa->util_sum += scale(scaled_delta, scale_cpu); + sa->util_sum += cap_scale(scaled_delta, scale_cpu); sa->period_contrib += delta; -- cgit v1.1 From 6115c793ca1a6e39c7c15159cbb47baa04009cb8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 7 Sep 2015 15:09:15 +0200 Subject: sched/fair: Optimize __update_load_avg() Prior to this patch; the line: scaled_delta_w = (delta_w * 1024) >> 10; which is the result of the default arch_scale_freq_capacity() function, turns into: 1b03: 49 89 d1 mov %rdx,%r9 1b06: 49 c1 e1 0a shl $0xa,%r9 1b0a: 49 c1 e9 0a shr $0xa,%r9 Which is silly; when made unsigned int, GCC recognises this as pointless ops and fails to emit them (confirmed on 4.9.3 and 5.1.1). Furthermore, afaict unsigned is actually the correct type for these fields anyway, as we've explicitly ruled out negative delta's earlier in this function. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7109047..c3c5585 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2551,7 +2551,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, { u64 delta, scaled_delta, periods; u32 contrib; - int delta_w, scaled_delta_w, decayed = 0; + unsigned int delta_w, scaled_delta_w, decayed = 0; unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu); unsigned long scale_cpu = arch_scale_cpu_capacity(NULL, cpu); -- cgit v1.1 From 6f2b04524f0b38bfbb8413f98d2d6af234508309 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Mon, 7 Sep 2015 14:57:22 +0100 Subject: sched/fair: Defer calling scaling functions Do not call the scaling functions in case time goes backwards or the last update of the sched_avg structure has happened less than 1024ns ago. Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: daniel.lezcano@linaro.org Cc: mturquette@baylibre.com Cc: pang.xunlei@zte.com.cn Cc: rjw@rjwysocki.net Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/55EDA2E9.8040900@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c3c5585..fc835fa 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2552,8 +2552,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, u64 delta, scaled_delta, periods; u32 contrib; unsigned int delta_w, scaled_delta_w, decayed = 0; - unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu); - unsigned long scale_cpu = arch_scale_cpu_capacity(NULL, cpu); + unsigned long scale_freq, scale_cpu; delta = now - sa->last_update_time; /* @@ -2574,6 +2573,9 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, return 0; sa->last_update_time = now; + scale_freq = arch_scale_freq_capacity(NULL, cpu); + scale_cpu = arch_scale_cpu_capacity(NULL, cpu); + /* delta_w is the amount already accumulated against our next period */ delta_w = sa->period_contrib; if (delta + delta_w >= 1024) { -- cgit v1.1 From 006cdf025a33cb008c3d466bed311c2c347b458f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 9 Sep 2015 09:06:17 +0200 Subject: sched/fair: Optimize per entity utilization tracking Currently the load_{sum,avg} and util_{sum,avg} tracking is asymmetric in that load tracking gets a 2^10 unit from the weight, but util gets no such factor. This results in more lost bits for util scaling and asymmetric scaling rules. Fix this by removing shifts, such that we gain the 2^10 factor from scaling. There is no risk of overflowing the u32 as the max value is now LOAD_AVG_MAX << 10, which is still well below UINT_MAX. This further entangles the assumption that both LOAD and CAPACITY shifts are the same (and 10) so put in an assertion for that. This fixes the math for the LOAD_RESOLUTION != 0 case. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fc835fa..9176f7c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -682,7 +682,7 @@ void init_entity_runnable_average(struct sched_entity *se) sa->load_avg = scale_load_down(se->load.weight); sa->load_sum = sa->load_avg * LOAD_AVG_MAX; sa->util_avg = scale_load_down(SCHED_LOAD_SCALE); - sa->util_sum = LOAD_AVG_MAX; + sa->util_sum = sa->util_avg * LOAD_AVG_MAX; /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } @@ -2515,6 +2515,10 @@ static u32 __compute_runnable_contrib(u64 n) return contrib + runnable_avg_yN_sum[n]; } +#if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10 +#error "load tracking assumes 2^10 as unit" +#endif + #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) /* @@ -2599,7 +2603,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, } } if (running) - sa->util_sum += cap_scale(scaled_delta_w, scale_cpu); + sa->util_sum += scaled_delta_w * scale_cpu; delta -= delta_w; @@ -2623,7 +2627,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, cfs_rq->runnable_load_sum += weight * contrib; } if (running) - sa->util_sum += cap_scale(contrib, scale_cpu); + sa->util_sum += contrib * scale_cpu; } /* Remainder of delta accrued against u_0` */ @@ -2634,7 +2638,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, cfs_rq->runnable_load_sum += weight * scaled_delta; } if (running) - sa->util_sum += cap_scale(scaled_delta, scale_cpu); + sa->util_sum += scaled_delta * scale_cpu; sa->period_contrib += delta; @@ -2644,7 +2648,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, cfs_rq->runnable_load_avg = div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); } - sa->util_avg = (sa->util_sum << SCHED_LOAD_SHIFT) / LOAD_AVG_MAX; + sa->util_avg = sa->util_sum / LOAD_AVG_MAX; } return decayed; @@ -2686,8 +2690,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) if (atomic_long_read(&cfs_rq->removed_util_avg)) { long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); sa->util_avg = max_t(long, sa->util_avg - r, 0); - sa->util_sum = max_t(s32, sa->util_sum - - ((r * LOAD_AVG_MAX) >> SCHED_LOAD_SHIFT), 0); + sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0); } decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, -- cgit v1.1 From 516792e67c39d31701641ab355acdb9cbfec0643 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 31 Aug 2015 15:12:56 +0300 Subject: perf/core: Delete PF_EXITING checks from perf_cgroup_exit() callback cgroup_exit() is not called from copy_process() after commit: e8604cb43690 ("cgroup: fix spurious lockdep warning in cgroup_exit()") from do_exit(). So this check is useless and the comment is obsolete. Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/55E444C8.3020402@odin.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index f548f69..76e64be 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9297,14 +9297,6 @@ static void perf_cgroup_exit(struct cgroup_subsys_state *css, struct cgroup_subsys_state *old_css, struct task_struct *task) { - /* - * cgroup_exit() is called in the copy_process() failure path. - * Ignore this case since the task hasn't ran yet, this avoids - * trying to poke a half freed task state from generic code. - */ - if (!(task->flags & PF_EXITING)) - return; - task_function_call(task, __perf_cgroup_move, task); } -- cgit v1.1 From fbbe07011581990ef74dfac06dc8511b1a14badb Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Thu, 3 Sep 2015 20:07:45 -0700 Subject: perf/core: Add a 'flags' parameter to the PMU transactional interfaces Currently, the PMU interface allows reading only one counter at a time. But some PMUs like the 24x7 counters in Power, support reading several counters at once. To leveage this functionality, extend the transaction interface to support a "transaction type". The first type, PERF_PMU_TXN_ADD, refers to the existing transactions, i.e. used to _schedule_ all the events on the PMU as a group. A second transaction type, PERF_PMU_TXN_READ, will be used in a follow-on patch, by the 24x7 counters to read several counters at once. Extend the transaction interfaces to the PMU to accept a 'txn_flags' parameter and use this parameter to ignore any transactions that are not of type PERF_PMU_TXN_ADD. Thanks to Peter Zijlstra for his input. Signed-off-by: Sukadev Bhattiprolu [peterz: s390 compile fix] Signed-off-by: Peter Zijlstra (Intel) Acked-by: Michael Ellerman Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1441336073-22750-3-git-send-email-sukadev@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 76e64be..c80cee8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1914,7 +1914,7 @@ group_sched_in(struct perf_event *group_event, if (group_event->state == PERF_EVENT_STATE_OFF) return 0; - pmu->start_txn(pmu); + pmu->start_txn(pmu, PERF_PMU_TXN_ADD); if (event_sched_in(group_event, cpuctx, ctx)) { pmu->cancel_txn(pmu); @@ -7267,24 +7267,49 @@ static void perf_pmu_nop_void(struct pmu *pmu) { } +static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags) +{ +} + static int perf_pmu_nop_int(struct pmu *pmu) { return 0; } -static void perf_pmu_start_txn(struct pmu *pmu) +DEFINE_PER_CPU(unsigned int, nop_txn_flags); + +static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags) { + __this_cpu_write(nop_txn_flags, flags); + + if (flags & ~PERF_PMU_TXN_ADD) + return; + perf_pmu_disable(pmu); } static int perf_pmu_commit_txn(struct pmu *pmu) { + unsigned int flags = __this_cpu_read(nop_txn_flags); + + __this_cpu_write(nop_txn_flags, 0); + + if (flags & ~PERF_PMU_TXN_ADD) + return 0; + perf_pmu_enable(pmu); return 0; } static void perf_pmu_cancel_txn(struct pmu *pmu) { + unsigned int flags = __this_cpu_read(nop_txn_flags); + + __this_cpu_write(nop_txn_flags, 0); + + if (flags & ~PERF_PMU_TXN_ADD) + return; + perf_pmu_enable(pmu); } @@ -7523,7 +7548,7 @@ got_cpu_context: pmu->commit_txn = perf_pmu_commit_txn; pmu->cancel_txn = perf_pmu_cancel_txn; } else { - pmu->start_txn = perf_pmu_nop_void; + pmu->start_txn = perf_pmu_nop_txn; pmu->commit_txn = perf_pmu_nop_int; pmu->cancel_txn = perf_pmu_nop_void; } -- cgit v1.1 From 01add3eaf1b25e497b14ca210f3bfe5f5dd2b112 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Thu, 3 Sep 2015 20:07:46 -0700 Subject: perf/core: Split perf_event_read() and perf_event_count() perf_event_read() does two things: - call the PMU to read/update the counter value, and - compute the total count of the event and its children Not all callers need both. perf_event_reset() for instance needs the first piece but doesn't need the second. Similarly, when we implement the ability to read a group of events using the transaction interface, we would need the two pieces done independently. Break up perf_event_read() and have it just read/update the counter and have the callers compute the total count if necessary. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Michael Ellerman Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1441336073-22750-4-git-send-email-sukadev@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index c80cee8..260bf8c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3275,7 +3275,7 @@ u64 perf_event_read_local(struct perf_event *event) return val; } -static u64 perf_event_read(struct perf_event *event) +static void perf_event_read(struct perf_event *event) { /* * If event is enabled and currently active on a CPU, update the @@ -3301,8 +3301,6 @@ static u64 perf_event_read(struct perf_event *event) update_event_times(event); raw_spin_unlock_irqrestore(&ctx->lock, flags); } - - return perf_event_count(event); } /* @@ -3818,14 +3816,18 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) *running = 0; mutex_lock(&event->child_mutex); - total += perf_event_read(event); + + perf_event_read(event); + total += perf_event_count(event); + *enabled += event->total_time_enabled + atomic64_read(&event->child_total_time_enabled); *running += event->total_time_running + atomic64_read(&event->child_total_time_running); list_for_each_entry(child, &event->child_list, child_list) { - total += perf_event_read(child); + perf_event_read(child); + total += perf_event_count(child); *enabled += child->total_time_enabled; *running += child->total_time_running; } @@ -3985,7 +3987,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) static void _perf_event_reset(struct perf_event *event) { - (void)perf_event_read(event); + perf_event_read(event); local64_set(&event->count, 0); perf_event_update_userpage(event); } -- cgit v1.1 From b15f495b4e9295cf21065d8569835a2f18cfe41b Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Thu, 3 Sep 2015 20:07:47 -0700 Subject: perf/core: Rename perf_event_read_{one,group}, perf_read_hw In order to free up the perf_event_read_group() name: s/perf_event_read_\(one\|group\)/perf_read_\1/g s/perf_read_hw/__perf_read/g Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Michael Ellerman Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1441336073-22750-5-git-send-email-sukadev@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 260bf8c..67b7dba 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3742,7 +3742,7 @@ static void put_event(struct perf_event *event) * see the comment there. * * 2) there is a lock-inversion with mmap_sem through - * perf_event_read_group(), which takes faults while + * perf_read_group(), which takes faults while * holding ctx->mutex, however this is called after * the last filedesc died, so there is no possibility * to trigger the AB-BA case. @@ -3837,7 +3837,7 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) } EXPORT_SYMBOL_GPL(perf_event_read_value); -static int perf_event_read_group(struct perf_event *event, +static int perf_read_group(struct perf_event *event, u64 read_format, char __user *buf) { struct perf_event *leader = event->group_leader, *sub; @@ -3885,7 +3885,7 @@ static int perf_event_read_group(struct perf_event *event, return ret; } -static int perf_event_read_one(struct perf_event *event, +static int perf_read_one(struct perf_event *event, u64 read_format, char __user *buf) { u64 enabled, running; @@ -3923,7 +3923,7 @@ static bool is_event_hup(struct perf_event *event) * Read the performance event - simple non blocking version for now */ static ssize_t -perf_read_hw(struct perf_event *event, char __user *buf, size_t count) +__perf_read(struct perf_event *event, char __user *buf, size_t count) { u64 read_format = event->attr.read_format; int ret; @@ -3941,9 +3941,9 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count) WARN_ON_ONCE(event->ctx->parent_ctx); if (read_format & PERF_FORMAT_GROUP) - ret = perf_event_read_group(event, read_format, buf); + ret = perf_read_group(event, read_format, buf); else - ret = perf_event_read_one(event, read_format, buf); + ret = perf_read_one(event, read_format, buf); return ret; } @@ -3956,7 +3956,7 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) int ret; ctx = perf_event_ctx_lock(event); - ret = perf_read_hw(event, buf, count); + ret = __perf_read(event, buf, count); perf_event_ctx_unlock(event, ctx); return ret; -- cgit v1.1 From 0492d4c5b8c4dc3a7591bf6fa0e35d117812cc85 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 3 Sep 2015 20:07:48 -0700 Subject: perf/core: Add group reads to perf_event_read() Enable perf_event_read() to update entire groups at once, this will be useful for read transactions. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Michael Ellerman Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Sukadev Bhattiprolu Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/20150723080435.GE25159@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/events/core.c | 39 ++++++++++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 67b7dba..4d89866 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3184,12 +3184,18 @@ void perf_event_exec(void) rcu_read_unlock(); } +struct perf_read_data { + struct perf_event *event; + bool group; +}; + /* * Cross CPU call to read the hardware event */ static void __perf_event_read(void *info) { - struct perf_event *event = info; + struct perf_read_data *data = info; + struct perf_event *sub, *event = data->event; struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); @@ -3208,9 +3214,21 @@ static void __perf_event_read(void *info) update_context_time(ctx); update_cgrp_time_from_event(event); } + update_event_times(event); if (event->state == PERF_EVENT_STATE_ACTIVE) event->pmu->read(event); + + if (!data->group) + goto unlock; + + list_for_each_entry(sub, &event->sibling_list, group_entry) { + update_event_times(sub); + if (sub->state == PERF_EVENT_STATE_ACTIVE) + sub->pmu->read(sub); + } + +unlock: raw_spin_unlock(&ctx->lock); } @@ -3275,15 +3293,19 @@ u64 perf_event_read_local(struct perf_event *event) return val; } -static void perf_event_read(struct perf_event *event) +static void perf_event_read(struct perf_event *event, bool group) { /* * If event is enabled and currently active on a CPU, update the * value in the event structure: */ if (event->state == PERF_EVENT_STATE_ACTIVE) { + struct perf_read_data data = { + .event = event, + .group = group, + }; smp_call_function_single(event->oncpu, - __perf_event_read, event, 1); + __perf_event_read, &data, 1); } else if (event->state == PERF_EVENT_STATE_INACTIVE) { struct perf_event_context *ctx = event->ctx; unsigned long flags; @@ -3298,7 +3320,10 @@ static void perf_event_read(struct perf_event *event) update_context_time(ctx); update_cgrp_time_from_event(event); } - update_event_times(event); + if (group) + update_group_times(event); + else + update_event_times(event); raw_spin_unlock_irqrestore(&ctx->lock, flags); } } @@ -3817,7 +3842,7 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) mutex_lock(&event->child_mutex); - perf_event_read(event); + perf_event_read(event, false); total += perf_event_count(event); *enabled += event->total_time_enabled + @@ -3826,7 +3851,7 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) atomic64_read(&event->child_total_time_running); list_for_each_entry(child, &event->child_list, child_list) { - perf_event_read(child); + perf_event_read(child, false); total += perf_event_count(child); *enabled += child->total_time_enabled; *running += child->total_time_running; @@ -3987,7 +4012,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) static void _perf_event_reset(struct perf_event *event) { - perf_event_read(event); + perf_event_read(event, false); local64_set(&event->count, 0); perf_event_update_userpage(event); } -- cgit v1.1 From fa8c269353d560b7c28119ad7617029f92e40b15 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 3 Sep 2015 20:07:49 -0700 Subject: perf/core: Invert perf_read_group() loops In order to enable the use of perf_event_read(.group = true), we need to invert the sibling-child loop nesting of perf_read_group(). Currently we iterate the child list for each sibling, this precludes using group reads. Flip things around so we iterate each group for each child. Signed-off-by: Peter Zijlstra (Intel) [ Made the patch compile and things. ] Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Michael Ellerman Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1441336073-22750-7-git-send-email-sukadev@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 85 +++++++++++++++++++++++++++++++++------------------- 1 file changed, 55 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 4d89866..dd2a0b8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3862,50 +3862,75 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) } EXPORT_SYMBOL_GPL(perf_event_read_value); -static int perf_read_group(struct perf_event *event, - u64 read_format, char __user *buf) +static void __perf_read_group_add(struct perf_event *leader, + u64 read_format, u64 *values) { - struct perf_event *leader = event->group_leader, *sub; - struct perf_event_context *ctx = leader->ctx; - int n = 0, size = 0, ret; - u64 count, enabled, running; - u64 values[5]; + struct perf_event *sub; + int n = 1; /* skip @nr */ - lockdep_assert_held(&ctx->mutex); + perf_event_read(leader, true); + + /* + * Since we co-schedule groups, {enabled,running} times of siblings + * will be identical to those of the leader, so we only publish one + * set. + */ + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] += leader->total_time_enabled + + atomic64_read(&leader->child_total_time_enabled); + } - count = perf_event_read_value(leader, &enabled, &running); + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] += leader->total_time_running + + atomic64_read(&leader->child_total_time_running); + } - values[n++] = 1 + leader->nr_siblings; - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - values[n++] = enabled; - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - values[n++] = running; - values[n++] = count; + /* + * Write {count,id} tuples for every sibling. + */ + values[n++] += perf_event_count(leader); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); - size = n * sizeof(u64); + list_for_each_entry(sub, &leader->sibling_list, group_entry) { + values[n++] += perf_event_count(sub); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(sub); + } +} - if (copy_to_user(buf, values, size)) - return -EFAULT; +static int perf_read_group(struct perf_event *event, + u64 read_format, char __user *buf) +{ + struct perf_event *leader = event->group_leader, *child; + struct perf_event_context *ctx = leader->ctx; + int ret = event->read_size; + u64 *values; - ret = size; + lockdep_assert_held(&ctx->mutex); - list_for_each_entry(sub, &leader->sibling_list, group_entry) { - n = 0; + values = kzalloc(event->read_size, GFP_KERNEL); + if (!values) + return -ENOMEM; - values[n++] = perf_event_read_value(sub, &enabled, &running); - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_event_id(sub); + values[0] = 1 + leader->nr_siblings; + + /* + * By locking the child_mutex of the leader we effectively + * lock the child list of all siblings.. XXX explain how. + */ + mutex_lock(&leader->child_mutex); - size = n * sizeof(u64); + __perf_read_group_add(leader, read_format, values); + list_for_each_entry(child, &leader->child_list, child_list) + __perf_read_group_add(child, read_format, values); - if (copy_to_user(buf + ret, values, size)) { - return -EFAULT; - } + mutex_unlock(&leader->child_mutex); - ret += size; - } + if (copy_to_user(buf, values, event->read_size)) + ret = -EFAULT; + + kfree(values); return ret; } -- cgit v1.1 From 7d88962e230c8342080e7e2fe9dd5be43dc13b79 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Thu, 3 Sep 2015 20:07:50 -0700 Subject: perf/core: Add return value for perf_event_read() When we implement the ability to read several counters at once (using the PERF_PMU_TXN_READ transaction interface), perf_event_read() can fail when the 'group' parameter is true (eg: trying to read too many events at once). For now, have perf_event_read() return an integer. Ignore the return value when the 'group' parameter is false. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Michael Ellerman Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1441336073-22750-8-git-send-email-sukadev@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 45 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 34 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index dd2a0b8..ade04df 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3187,6 +3187,7 @@ void perf_event_exec(void) struct perf_read_data { struct perf_event *event; bool group; + int ret; }; /* @@ -3227,6 +3228,7 @@ static void __perf_event_read(void *info) if (sub->state == PERF_EVENT_STATE_ACTIVE) sub->pmu->read(sub); } + data->ret = 0; unlock: raw_spin_unlock(&ctx->lock); @@ -3293,8 +3295,10 @@ u64 perf_event_read_local(struct perf_event *event) return val; } -static void perf_event_read(struct perf_event *event, bool group) +static int perf_event_read(struct perf_event *event, bool group) { + int ret = 0; + /* * If event is enabled and currently active on a CPU, update the * value in the event structure: @@ -3303,9 +3307,11 @@ static void perf_event_read(struct perf_event *event, bool group) struct perf_read_data data = { .event = event, .group = group, + .ret = 0, }; smp_call_function_single(event->oncpu, __perf_event_read, &data, 1); + ret = data.ret; } else if (event->state == PERF_EVENT_STATE_INACTIVE) { struct perf_event_context *ctx = event->ctx; unsigned long flags; @@ -3326,6 +3332,8 @@ static void perf_event_read(struct perf_event *event, bool group) update_event_times(event); raw_spin_unlock_irqrestore(&ctx->lock, flags); } + + return ret; } /* @@ -3842,7 +3850,7 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) mutex_lock(&event->child_mutex); - perf_event_read(event, false); + (void)perf_event_read(event, false); total += perf_event_count(event); *enabled += event->total_time_enabled + @@ -3851,7 +3859,7 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) atomic64_read(&event->child_total_time_running); list_for_each_entry(child, &event->child_list, child_list) { - perf_event_read(child, false); + (void)perf_event_read(child, false); total += perf_event_count(child); *enabled += child->total_time_enabled; *running += child->total_time_running; @@ -3862,13 +3870,16 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) } EXPORT_SYMBOL_GPL(perf_event_read_value); -static void __perf_read_group_add(struct perf_event *leader, +static int __perf_read_group_add(struct perf_event *leader, u64 read_format, u64 *values) { struct perf_event *sub; int n = 1; /* skip @nr */ + int ret; - perf_event_read(leader, true); + ret = perf_event_read(leader, true); + if (ret) + return ret; /* * Since we co-schedule groups, {enabled,running} times of siblings @@ -3897,6 +3908,8 @@ static void __perf_read_group_add(struct perf_event *leader, if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); } + + return 0; } static int perf_read_group(struct perf_event *event, @@ -3904,7 +3917,7 @@ static int perf_read_group(struct perf_event *event, { struct perf_event *leader = event->group_leader, *child; struct perf_event_context *ctx = leader->ctx; - int ret = event->read_size; + int ret; u64 *values; lockdep_assert_held(&ctx->mutex); @@ -3921,17 +3934,27 @@ static int perf_read_group(struct perf_event *event, */ mutex_lock(&leader->child_mutex); - __perf_read_group_add(leader, read_format, values); - list_for_each_entry(child, &leader->child_list, child_list) - __perf_read_group_add(child, read_format, values); + ret = __perf_read_group_add(leader, read_format, values); + if (ret) + goto unlock; + + list_for_each_entry(child, &leader->child_list, child_list) { + ret = __perf_read_group_add(child, read_format, values); + if (ret) + goto unlock; + } mutex_unlock(&leader->child_mutex); + ret = event->read_size; if (copy_to_user(buf, values, event->read_size)) ret = -EFAULT; + goto out; +unlock: + mutex_unlock(&leader->child_mutex); +out: kfree(values); - return ret; } @@ -4037,7 +4060,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) static void _perf_event_reset(struct perf_event *event) { - perf_event_read(event, false); + (void)perf_event_read(event, false); local64_set(&event->count, 0); perf_event_update_userpage(event); } -- cgit v1.1 From 4a00c16e552ea5e71756cd29cd2df7557ec9cac4 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Thu, 3 Sep 2015 20:07:51 -0700 Subject: perf/core: Define PERF_PMU_TXN_READ interface Define a new PERF_PMU_TXN_READ interface to read a group of counters at once. pmu->start_txn() // Initialize before first event for each event in group pmu->read(event); // Queue each event to be read rc = pmu->commit_txn() // Read/update all queued counters Note that we use this interface with all PMUs. PMUs that implement this interface use the ->read() operation to _queue_ the counters to be read and use ->commit_txn() to actually read all the queued counters at once. PMUs that don't implement PERF_PMU_TXN_READ ignore ->start_txn() and ->commit_txn() and continue to read counters one at a time. Thanks to input from Peter Zijlstra. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Michael Ellerman Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1441336073-22750-9-git-send-email-sukadev@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index ade04df..55b0f7c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3199,6 +3199,7 @@ static void __perf_event_read(void *info) struct perf_event *sub, *event = data->event; struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct pmu *pmu = event->pmu; /* * If this is a task context, we need to check whether it is @@ -3217,18 +3218,31 @@ static void __perf_event_read(void *info) } update_event_times(event); - if (event->state == PERF_EVENT_STATE_ACTIVE) - event->pmu->read(event); + if (event->state != PERF_EVENT_STATE_ACTIVE) + goto unlock; - if (!data->group) + if (!data->group) { + pmu->read(event); + data->ret = 0; goto unlock; + } + + pmu->start_txn(pmu, PERF_PMU_TXN_READ); + + pmu->read(event); list_for_each_entry(sub, &event->sibling_list, group_entry) { update_event_times(sub); - if (sub->state == PERF_EVENT_STATE_ACTIVE) + if (sub->state == PERF_EVENT_STATE_ACTIVE) { + /* + * Use sibling's PMU rather than @event's since + * sibling could be on different (eg: software) PMU. + */ sub->pmu->read(sub); + } } - data->ret = 0; + + data->ret = pmu->commit_txn(pmu); unlock: raw_spin_unlock(&ctx->lock); -- cgit v1.1 From a6f5f0dd4e21191ce35030dd4d6421e1cca10ee4 Mon Sep 17 00:00:00 2001 From: Alexandra Yates Date: Tue, 15 Sep 2015 10:32:46 -0700 Subject: PM / sleep: Report interrupt that caused system wakeup Add a sysfs attribute, /sys/power/pm_wakeup_irq, reporting the IRQ number of the first wakeup interrupt (that is, the first interrupt from an IRQ line armed for system wakeup) seen by the kernel during the most recent system suspend/resume cycle. This feature will be useful for system wakeup diagnostics of spurious wakeup interrupts. Signed-off-by: Alexandra Yates [ rjw: Fixed up pm_wakeup_irq definition ] Signed-off-by: Rafael J. Wysocki --- kernel/irq/pm.c | 2 +- kernel/power/main.c | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 21c6261..e80c440 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -21,7 +21,7 @@ bool irq_pm_check_wakeup(struct irq_desc *desc) desc->istate |= IRQS_SUSPENDED | IRQS_PENDING; desc->depth++; irq_disable(desc); - pm_system_wakeup(); + pm_system_irq_wakeup(irq_desc_get_irq(desc)); return true; } return false; diff --git a/kernel/power/main.c b/kernel/power/main.c index 63d395b..b2dd4d9 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -272,6 +272,22 @@ static inline void pm_print_times_init(void) { pm_print_times_enabled = !!initcall_debug; } + +static ssize_t pm_wakeup_irq_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return pm_wakeup_irq ? sprintf(buf, "%u\n", pm_wakeup_irq) : -ENODATA; +} + +static ssize_t pm_wakeup_irq_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + return -EINVAL; +} +power_attr(pm_wakeup_irq); + #else /* !CONFIG_PM_SLEEP_DEBUG */ static inline void pm_print_times_init(void) {} #endif /* CONFIG_PM_SLEEP_DEBUG */ @@ -604,6 +620,7 @@ static struct attribute * g[] = { #endif #ifdef CONFIG_PM_SLEEP_DEBUG &pm_print_times_attr.attr, + &pm_wakeup_irq_attr.attr, #endif #endif #ifdef CONFIG_FREEZER -- cgit v1.1 From 1ed1328792ff46e4bb86a3d7f7be2971f4549f6c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Sep 2015 12:53:17 -0400 Subject: sched, cgroup: replace signal_struct->group_rwsem with a global percpu_rwsem Note: This commit was originally committed as d59cfc09c32a but got reverted by 0c986253b939 due to the performance regression from the percpu_rwsem write down/up operations added to cgroup task migration path. percpu_rwsem changes which alleviate the performance issue are pending for v4.4-rc1 merge window. Re-apply. The cgroup side of threadgroup locking uses signal_struct->group_rwsem to synchronize against threadgroup changes. This per-process rwsem adds small overhead to thread creation, exit and exec paths, forces cgroup code paths to do lock-verify-unlock-retry dance in a couple places and makes it impossible to atomically perform operations across multiple processes. This patch replaces signal_struct->group_rwsem with a global percpu_rwsem cgroup_threadgroup_rwsem which is cheaper on the reader side and contained in cgroups proper. This patch converts one-to-one. This does make writer side heavier and lower the granularity; however, cgroup process migration is a fairly cold path, we do want to optimize thread operations over it and cgroup migration operations don't take enough time for the lower granularity to matter. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/g/55F8097A.7000206@de.ibm.com Cc: Ingo Molnar Cc: Peter Zijlstra --- kernel/cgroup.c | 77 +++++++++++++++------------------------------------------ kernel/fork.c | 4 --- 2 files changed, 20 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2c9eae6..115091e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include @@ -103,6 +104,8 @@ static DEFINE_SPINLOCK(cgroup_idr_lock); */ static DEFINE_SPINLOCK(release_agent_path_lock); +struct percpu_rw_semaphore cgroup_threadgroup_rwsem; + #define cgroup_assert_mutex_or_rcu_locked() \ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ !lockdep_is_held(&cgroup_mutex), \ @@ -871,48 +874,6 @@ static struct css_set *find_css_set(struct css_set *old_cset, return cset; } -void cgroup_threadgroup_change_begin(struct task_struct *tsk) -{ - down_read(&tsk->signal->group_rwsem); -} - -void cgroup_threadgroup_change_end(struct task_struct *tsk) -{ - up_read(&tsk->signal->group_rwsem); -} - -/** - * threadgroup_lock - lock threadgroup - * @tsk: member task of the threadgroup to lock - * - * Lock the threadgroup @tsk belongs to. No new task is allowed to enter - * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or - * change ->group_leader/pid. This is useful for cases where the threadgroup - * needs to stay stable across blockable operations. - * - * fork and exit explicitly call threadgroup_change_{begin|end}() for - * synchronization. While held, no new task will be added to threadgroup - * and no existing live task will have its PF_EXITING set. - * - * de_thread() does threadgroup_change_{begin|end}() when a non-leader - * sub-thread becomes a new leader. - */ -static void threadgroup_lock(struct task_struct *tsk) -{ - down_write(&tsk->signal->group_rwsem); -} - -/** - * threadgroup_unlock - unlock threadgroup - * @tsk: member task of the threadgroup to unlock - * - * Reverse threadgroup_lock(). - */ -static inline void threadgroup_unlock(struct task_struct *tsk) -{ - up_write(&tsk->signal->group_rwsem); -} - static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) { struct cgroup *root_cgrp = kf_root->kn->priv; @@ -2113,9 +2074,9 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, lockdep_assert_held(&css_set_rwsem); /* - * We are synchronized through threadgroup_lock() against PF_EXITING - * setting such that we can't race against cgroup_exit() changing the - * css_set to init_css_set and dropping the old one. + * We are synchronized through cgroup_threadgroup_rwsem against + * PF_EXITING setting such that we can't race against cgroup_exit() + * changing the css_set to init_css_set and dropping the old one. */ WARN_ON_ONCE(tsk->flags & PF_EXITING); old_cset = task_css_set(tsk); @@ -2172,10 +2133,11 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets) * @src_cset and add it to @preloaded_csets, which should later be cleaned * up by cgroup_migrate_finish(). * - * This function may be called without holding threadgroup_lock even if the - * target is a process. Threads may be created and destroyed but as long - * as cgroup_mutex is not dropped, no new css_set can be put into play and - * the preloaded css_sets are guaranteed to cover all migrations. + * This function may be called without holding cgroup_threadgroup_rwsem + * even if the target is a process. Threads may be created and destroyed + * but as long as cgroup_mutex is not dropped, no new css_set can be put + * into play and the preloaded css_sets are guaranteed to cover all + * migrations. */ static void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp, @@ -2278,7 +2240,7 @@ err: * @threadgroup: whether @leader points to the whole process or a single task * * Migrate a process or task denoted by @leader to @cgrp. If migrating a - * process, the caller must be holding threadgroup_lock of @leader. The + * process, the caller must be holding cgroup_threadgroup_rwsem. The * caller is also responsible for invoking cgroup_migrate_add_src() and * cgroup_migrate_prepare_dst() on the targets before invoking this * function and following up with cgroup_migrate_finish(). @@ -2406,7 +2368,7 @@ out_release_tset: * @leader: the task or the leader of the threadgroup to be attached * @threadgroup: attach the whole threadgroup? * - * Call holding cgroup_mutex and threadgroup_lock of @leader. + * Call holding cgroup_mutex and cgroup_threadgroup_rwsem. */ static int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup) @@ -2528,7 +2490,7 @@ retry_find_task: get_task_struct(tsk); rcu_read_unlock(); - threadgroup_lock(tsk); + percpu_down_write(&cgroup_threadgroup_rwsem); if (threadgroup) { if (!thread_group_leader(tsk)) { /* @@ -2538,7 +2500,7 @@ retry_find_task: * try again; this is * "double-double-toil-and-trouble-check locking". */ - threadgroup_unlock(tsk); + percpu_up_write(&cgroup_threadgroup_rwsem); put_task_struct(tsk); goto retry_find_task; } @@ -2548,7 +2510,7 @@ retry_find_task: if (!ret) ret = cgroup_attach_task(cgrp, tsk, threadgroup); - threadgroup_unlock(tsk); + percpu_up_write(&cgroup_threadgroup_rwsem); put_task_struct(tsk); out_unlock_cgroup: @@ -2751,17 +2713,17 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) goto out_finish; last_task = task; - threadgroup_lock(task); + percpu_down_write(&cgroup_threadgroup_rwsem); /* raced against de_thread() from another thread? */ if (!thread_group_leader(task)) { - threadgroup_unlock(task); + percpu_up_write(&cgroup_threadgroup_rwsem); put_task_struct(task); continue; } ret = cgroup_migrate(src_cset->dfl_cgrp, task, true); - threadgroup_unlock(task); + percpu_up_write(&cgroup_threadgroup_rwsem); put_task_struct(task); if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret)) @@ -5083,6 +5045,7 @@ int __init cgroup_init(void) unsigned long key; int ssid, err; + BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); diff --git a/kernel/fork.c b/kernel/fork.c index 2845623..7d5f0f1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1149,10 +1149,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) tty_audit_fork(sig); sched_autogroup_fork(sig); -#ifdef CONFIG_CGROUPS - init_rwsem(&sig->group_rwsem); -#endif - sig->oom_score_adj = current->signal->oom_score_adj; sig->oom_score_adj_min = current->signal->oom_score_adj_min; -- cgit v1.1 From 3014dde762f618fbcfe899f9ce9e929a2e5aa6dd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Sep 2015 13:03:02 -0400 Subject: cgroup: simplify threadgroup locking Note: This commit was originally committed as b5ba75b5fc0e but got reverted by f9f9e7b77614 due to the performance regression from the percpu_rwsem write down/up operations added to cgroup task migration path. percpu_rwsem changes which alleviate the performance issue are pending for v4.4-rc1 merge window. Re-apply. Now that threadgroup locking is made global, code paths around it can be simplified. * lock-verify-unlock-retry dancing removed from __cgroup_procs_write(). * Race protection against de_thread() removed from cgroup_update_dfl_csses(). Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/g/55F8097A.7000206@de.ibm.com --- kernel/cgroup.c | 45 ++++++++++++--------------------------------- 1 file changed, 12 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 115091e..2cf0f79 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2460,14 +2460,13 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, if (!cgrp) return -ENODEV; -retry_find_task: + percpu_down_write(&cgroup_threadgroup_rwsem); rcu_read_lock(); if (pid) { tsk = find_task_by_vpid(pid); if (!tsk) { - rcu_read_unlock(); ret = -ESRCH; - goto out_unlock_cgroup; + goto out_unlock_rcu; } } else { tsk = current; @@ -2483,37 +2482,23 @@ retry_find_task: */ if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) { ret = -EINVAL; - rcu_read_unlock(); - goto out_unlock_cgroup; + goto out_unlock_rcu; } get_task_struct(tsk); rcu_read_unlock(); - percpu_down_write(&cgroup_threadgroup_rwsem); - if (threadgroup) { - if (!thread_group_leader(tsk)) { - /* - * a race with de_thread from another thread's exec() - * may strip us of our leadership, if this happens, - * there is no choice but to throw this task away and - * try again; this is - * "double-double-toil-and-trouble-check locking". - */ - percpu_up_write(&cgroup_threadgroup_rwsem); - put_task_struct(tsk); - goto retry_find_task; - } - } - ret = cgroup_procs_write_permission(tsk, cgrp, of); if (!ret) ret = cgroup_attach_task(cgrp, tsk, threadgroup); - percpu_up_write(&cgroup_threadgroup_rwsem); - put_task_struct(tsk); -out_unlock_cgroup: + goto out_unlock_threadgroup; + +out_unlock_rcu: + rcu_read_unlock(); +out_unlock_threadgroup: + percpu_up_write(&cgroup_threadgroup_rwsem); cgroup_kn_unlock(of->kn); return ret ?: nbytes; } @@ -2658,6 +2643,8 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) lockdep_assert_held(&cgroup_mutex); + percpu_down_write(&cgroup_threadgroup_rwsem); + /* look up all csses currently attached to @cgrp's subtree */ down_read(&css_set_rwsem); css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) { @@ -2713,17 +2700,8 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) goto out_finish; last_task = task; - percpu_down_write(&cgroup_threadgroup_rwsem); - /* raced against de_thread() from another thread? */ - if (!thread_group_leader(task)) { - percpu_up_write(&cgroup_threadgroup_rwsem); - put_task_struct(task); - continue; - } - ret = cgroup_migrate(src_cset->dfl_cgrp, task, true); - percpu_up_write(&cgroup_threadgroup_rwsem); put_task_struct(task); if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret)) @@ -2733,6 +2711,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) out_finish: cgroup_migrate_finish(&preloaded_csets); + percpu_up_write(&cgroup_threadgroup_rwsem); return ret; } -- cgit v1.1 From de9b8f5dcbd94bfb1d249907a635f1fb1968e19c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 13 Aug 2015 23:09:29 +0200 Subject: sched: Fix crash trying to dequeue/enqueue the idle thread Sasha reports that his virtual machine tries to schedule the idle thread since commit 6c37067e2786 ("sched: Change the sched_class::set_cpus_allowed() calling context"). Hit trace shows this happening from idle_thread_get()->init_idle(), which is the _second_ init_idle() invocation on that task_struct, the first being done through idle_init()->fork_idle(). (this code is insane...) Because we call init_idle() twice in a row, its ->sched_class == &idle_sched_class and ->on_rq = TASK_ON_RQ_QUEUED. This means do_set_cpus_allowed() think we're queued and will call dequeue_task(), which is implemented with BUG() for the idle class, seeing how dequeueing the idle task is a daft thing. Aside of the whole insanity of calling init_idle() _twice_, change the code to call set_cpus_allowed_common() instead as this is 'obviously' before the idle task gets ran etc.. Reported-by: Sasha Levin Tested-by: Sasha Levin Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 6c37067e2786 ("sched: Change the sched_class::set_cpus_allowed() calling context") Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 97d276f..f0d043e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4927,7 +4927,15 @@ void init_idle(struct task_struct *idle, int cpu) idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); - do_set_cpus_allowed(idle, cpumask_of(cpu)); +#ifdef CONFIG_SMP + /* + * Its possible that init_idle() gets called multiple times on a task, + * in that case do_set_cpus_allowed() will not do the right thing. + * + * And since this is boot we can forgo the serialization. + */ + set_cpus_allowed_common(idle, cpumask_of(cpu)); +#endif /* * We're having a chicken and egg problem, even though we are * holding rq->lock, the cpu isn't yet set to this cpu so the @@ -4944,7 +4952,7 @@ void init_idle(struct task_struct *idle, int cpu) rq->curr = rq->idle = idle; idle->on_rq = TASK_ON_RQ_QUEUED; -#if defined(CONFIG_SMP) +#ifdef CONFIG_SMP idle->on_cpu = 1; #endif raw_spin_unlock(&rq->lock); @@ -4959,7 +4967,7 @@ void init_idle(struct task_struct *idle, int cpu) idle->sched_class = &idle_sched_class; ftrace_graph_init_idle_task(idle, cpu); vtime_init_idle(idle, cpu); -#if defined(CONFIG_SMP) +#ifdef CONFIG_SMP sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); #endif } -- cgit v1.1 From f55fc2a57cc9ca3b1bb4fb8eb25b6e1989e5b993 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 9 Sep 2015 19:06:33 +0200 Subject: perf: Restructure perf syscall point of no return The exclusive_event_installable() stuff only works because its exclusive with the grouping bits. Rework the code such that there is a sane place to error out before we go do things we cannot undo. Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 49 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index f548f69..39679f7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8297,13 +8297,30 @@ SYSCALL_DEFINE5(perf_event_open, if (move_group) { gctx = group_leader->ctx; + mutex_lock_double(&gctx->mutex, &ctx->mutex); + } else { + mutex_lock(&ctx->mutex); + } + + /* + * Must be under the same ctx::mutex as perf_install_in_context(), + * because we need to serialize with concurrent event creation. + */ + if (!exclusive_event_installable(event, ctx)) { + /* exclusive and group stuff are assumed mutually exclusive */ + WARN_ON_ONCE(move_group); + + err = -EBUSY; + goto err_locked; + } + WARN_ON_ONCE(ctx->parent_ctx); + + if (move_group) { /* * See perf_event_ctx_lock() for comments on the details * of swizzling perf_event::ctx. */ - mutex_lock_double(&gctx->mutex, &ctx->mutex); - perf_remove_from_context(group_leader, false); list_for_each_entry(sibling, &group_leader->sibling_list, @@ -8311,13 +8328,7 @@ SYSCALL_DEFINE5(perf_event_open, perf_remove_from_context(sibling, false); put_ctx(gctx); } - } else { - mutex_lock(&ctx->mutex); - } - - WARN_ON_ONCE(ctx->parent_ctx); - if (move_group) { /* * Wait for everybody to stop referencing the events through * the old lists, before installing it on new lists. @@ -8349,22 +8360,20 @@ SYSCALL_DEFINE5(perf_event_open, perf_event__state_init(group_leader); perf_install_in_context(ctx, group_leader, group_leader->cpu); get_ctx(ctx); - } - if (!exclusive_event_installable(event, ctx)) { - err = -EBUSY; - mutex_unlock(&ctx->mutex); - fput(event_file); - goto err_context; + /* + * Now that all events are installed in @ctx, nothing + * references @gctx anymore, so drop the last reference we have + * on it. + */ + put_ctx(gctx); } perf_install_in_context(ctx, event, event->cpu); perf_unpin_context(ctx); - if (move_group) { + if (move_group) mutex_unlock(&gctx->mutex); - put_ctx(gctx); - } mutex_unlock(&ctx->mutex); put_online_cpus(); @@ -8391,6 +8400,12 @@ SYSCALL_DEFINE5(perf_event_open, fd_install(event_fd, event_file); return event_fd; +err_locked: + if (move_group) + mutex_unlock(&gctx->mutex); + mutex_unlock(&ctx->mutex); +/* err_file: */ + fput(event_file); err_context: perf_unpin_context(ctx); put_ctx(ctx); -- cgit v1.1 From a723968c0ed36db676478c3d26078f13484fe01c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 9 Sep 2015 19:06:33 +0200 Subject: perf: Fix u16 overflows Vince reported that its possible to overflow the various size fields and get weird stuff if you stick too many events in a group. Put a lid on this by requiring the fixed record size not exceed 16k. This is still a fair amount of events (silly amount really) and leaves plenty room for callchains and stack dwarves while also avoiding overflowing the u16 variables. Reported-by: Vince Weaver Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 50 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 40 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 39679f7..dbb5329 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1243,11 +1243,7 @@ static inline void perf_event__state_init(struct perf_event *event) PERF_EVENT_STATE_INACTIVE; } -/* - * Called at perf_event creation and when events are attached/detached from a - * group. - */ -static void perf_event__read_size(struct perf_event *event) +static void __perf_event_read_size(struct perf_event *event, int nr_siblings) { int entry = sizeof(u64); /* value */ int size = 0; @@ -1263,7 +1259,7 @@ static void perf_event__read_size(struct perf_event *event) entry += sizeof(u64); if (event->attr.read_format & PERF_FORMAT_GROUP) { - nr += event->group_leader->nr_siblings; + nr += nr_siblings; size += sizeof(u64); } @@ -1271,14 +1267,11 @@ static void perf_event__read_size(struct perf_event *event) event->read_size = size; } -static void perf_event__header_size(struct perf_event *event) +static void __perf_event_header_size(struct perf_event *event, u64 sample_type) { struct perf_sample_data *data; - u64 sample_type = event->attr.sample_type; u16 size = 0; - perf_event__read_size(event); - if (sample_type & PERF_SAMPLE_IP) size += sizeof(data->ip); @@ -1303,6 +1296,17 @@ static void perf_event__header_size(struct perf_event *event) event->header_size = size; } +/* + * Called at perf_event creation and when events are attached/detached from a + * group. + */ +static void perf_event__header_size(struct perf_event *event) +{ + __perf_event_read_size(event, + event->group_leader->nr_siblings); + __perf_event_header_size(event, event->attr.sample_type); +} + static void perf_event__id_header_size(struct perf_event *event) { struct perf_sample_data *data; @@ -1330,6 +1334,27 @@ static void perf_event__id_header_size(struct perf_event *event) event->id_header_size = size; } +static bool perf_event_validate_size(struct perf_event *event) +{ + /* + * The values computed here will be over-written when we actually + * attach the event. + */ + __perf_event_read_size(event, event->group_leader->nr_siblings + 1); + __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ); + perf_event__id_header_size(event); + + /* + * Sum the lot; should not exceed the 64k limit we have on records. + * Conservative limit to allow for callchains and other variable fields. + */ + if (event->read_size + event->header_size + + event->id_header_size + sizeof(struct perf_event_header) >= 16*1024) + return false; + + return true; +} + static void perf_group_attach(struct perf_event *event) { struct perf_event *group_leader = event->group_leader, *pos; @@ -8302,6 +8327,11 @@ SYSCALL_DEFINE5(perf_event_open, mutex_lock(&ctx->mutex); } + if (!perf_event_validate_size(event)) { + err = -E2BIG; + goto err_locked; + } + /* * Must be under the same ctx::mutex as perf_install_in_context(), * because we need to serialize with concurrent event creation. -- cgit v1.1 From f73e22ab450140830005581c2c7ec389791a1b8d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 9 Sep 2015 20:48:22 +0200 Subject: perf: Fix races in computing the header sizes There are two races with the current code: - Another event can join the group and compute a larger header_size concurrently, if the smaller store wins we'll have an incorrect header_size set. - We compute the header_size after the event becomes active, therefore its possible to use the size before its computed. Remedy the first by moving the computation inside the ctx::mutex lock, and the second by placing it _before_ perf_install_in_context(). Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index dbb5329..b11756f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8399,6 +8399,15 @@ SYSCALL_DEFINE5(perf_event_open, put_ctx(gctx); } + /* + * Precalculate sample_data sizes; do while holding ctx::mutex such + * that we're serialized against further additions and before + * perf_install_in_context() which is the point the event is active and + * can use these values. + */ + perf_event__header_size(event); + perf_event__id_header_size(event); + perf_install_in_context(ctx, event, event->cpu); perf_unpin_context(ctx); @@ -8415,12 +8424,6 @@ SYSCALL_DEFINE5(perf_event_open, mutex_unlock(¤t->perf_event_mutex); /* - * Precalculate sample_data sizes - */ - perf_event__header_size(event); - perf_event__id_header_size(event); - - /* * Drop the reference on the group_event after placing the * new event on the sibling_list. This ensures destruction * of the group leader will find the pointer to itself in -- cgit v1.1 From 20f9cd2acb1d74a8bf4b4087267f586e6ecdbc03 Mon Sep 17 00:00:00 2001 From: Henrik Austad Date: Wed, 9 Sep 2015 17:00:41 +0200 Subject: sched/core: Make policy-testing consistent Most of the policy-tests are done via the _policy() helpers with the notable exception of idle. A new wrapper for valid_policy() has also been added to improve readability in set_load_weight(). This commit does not change the logical behavior of the scheduler core. Signed-off-by: Henrik Austad Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1441810841-4756-1-git-send-email-henrik@austad.us Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 9 +++------ kernel/sched/sched.h | 9 +++++++++ 2 files changed, 12 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6ab415a..1b30b5b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -817,7 +817,7 @@ static void set_load_weight(struct task_struct *p) /* * SCHED_IDLE tasks get minimal weight: */ - if (p->policy == SCHED_IDLE) { + if (idle_policy(p->policy)) { load->weight = scale_load(WEIGHT_IDLEPRIO); load->inv_weight = WMULT_IDLEPRIO; return; @@ -3733,10 +3733,7 @@ recheck: } else { reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK); - if (policy != SCHED_DEADLINE && - policy != SCHED_FIFO && policy != SCHED_RR && - policy != SCHED_NORMAL && policy != SCHED_BATCH && - policy != SCHED_IDLE) + if (!valid_policy(policy)) return -EINVAL; } @@ -3792,7 +3789,7 @@ recheck: * Treat SCHED_IDLE as nice 20. Only allow a switch to * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. */ - if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { + if (idle_policy(p->policy) && !idle_policy(policy)) { if (!can_nice(p, task_nice(p))) return -EPERM; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 167ab48..3845a71 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -84,6 +84,10 @@ static inline void update_cpu_load_active(struct rq *this_rq) { } */ #define RUNTIME_INF ((u64)~0ULL) +static inline int idle_policy(int policy) +{ + return policy == SCHED_IDLE; +} static inline int fair_policy(int policy) { return policy == SCHED_NORMAL || policy == SCHED_BATCH; @@ -98,6 +102,11 @@ static inline int dl_policy(int policy) { return policy == SCHED_DEADLINE; } +static inline bool valid_policy(int policy) +{ + return idle_policy(policy) || fair_policy(policy) || + rt_policy(policy) || dl_policy(policy); +} static inline int task_has_rt_policy(struct task_struct *p) { -- cgit v1.1 From 4620f8c1fda2af4ccbd11e194e2dd785f7d7f279 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 11 Sep 2015 09:00:27 -0400 Subject: sched/numa: Limit the amount of virtual memory scanned in task_numa_work() Currently task_numa_work() scans up to numa_balancing_scan_size_mb worth of memory per invocation, but only counts memory areas that have at least one PTE that is still present and not marked for numa hint faulting. It will skip over arbitarily large amounts of memory that are either unused, full of swap ptes, or full of PTEs that were already marked for NUMA hint faults but have not been faulted on yet. This can cause excessive amounts of CPU use, due to there being essentially no upper limit on the scan rate of very large processes that are not yet in a phase where they are actively accessing old memory pages (eg. they are still initializing their data). Avoid that problem by placing an upper limit on the amount of virtual memory that task_numa_work() scans in each invocation. This can be a higher limit than "pages", to ensure the task still skips over unused areas fairly quickly. While we are here, also fix the "nr_pte_updates" logic, so it only counts page ranges with ptes in them. Reported-by: Andrea Arcangeli Reported-by: Jan Stancek Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150911090027.4a7987bd@annuminas.surriel.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9176f7c..1bfad9f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2157,7 +2157,7 @@ void task_numa_work(struct callback_head *work) struct vm_area_struct *vma; unsigned long start, end; unsigned long nr_pte_updates = 0; - long pages; + long pages, virtpages; WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); @@ -2203,9 +2203,11 @@ void task_numa_work(struct callback_head *work) start = mm->numa_scan_offset; pages = sysctl_numa_balancing_scan_size; pages <<= 20 - PAGE_SHIFT; /* MB in pages */ + virtpages = pages * 8; /* Scan up to this much virtual space */ if (!pages) return; + down_read(&mm->mmap_sem); vma = find_vma(mm, start); if (!vma) { @@ -2240,18 +2242,22 @@ void task_numa_work(struct callback_head *work) start = max(start, vma->vm_start); end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); end = min(end, vma->vm_end); - nr_pte_updates += change_prot_numa(vma, start, end); + nr_pte_updates = change_prot_numa(vma, start, end); /* - * Scan sysctl_numa_balancing_scan_size but ensure that - * at least one PTE is updated so that unused virtual - * address space is quickly skipped. + * Try to scan sysctl_numa_balancing_size worth of + * hpages that have at least one present PTE that + * is not already pte-numa. If the VMA contains + * areas that are unused or already full of prot_numa + * PTEs, scan up to virtpages, to skip through those + * areas faster. */ if (nr_pte_updates) pages -= (end - start) >> PAGE_SHIFT; + virtpages -= (end - start) >> PAGE_SHIFT; start = end; - if (pages <= 0) + if (pages <= 0 || virtpages <= 0) goto out; cond_resched(); -- cgit v1.1 From 84fb5a182d39221b89f205365386df243135d622 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Tue, 15 Sep 2015 18:57:37 +0800 Subject: sched/fair: Polish comments for LOAD_AVG_MAX Macro LOAD_AVG_MAX is defined far away from the precompuated tables for decay calculation in code; So explicitly comments for this. Also fix one typo: s/LOAD_MAX_AVG/LOAD_AVG_MAX. Signed-off-by: Leo Yan Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1442314657-14949-1-git-send-email-leo.yan@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1bfad9f..80c62bf 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -661,11 +661,12 @@ static unsigned long task_h_load(struct task_struct *p); /* * We choose a half-life close to 1 scheduling period. - * Note: The tables below are dependent on this value. + * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are + * dependent on this value. */ #define LOAD_AVG_PERIOD 32 #define LOAD_AVG_MAX 47742 /* maximum possible load avg */ -#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */ +#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */ /* Give new sched_entity start runnable values to heavy its load in infant time */ void init_entity_runnable_average(struct sched_entity *se) -- cgit v1.1 From 79a89f92cbe31ba6bc50caf211a7ac4d97d0f35f Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Tue, 15 Sep 2015 18:56:45 +0800 Subject: sched/fair: Remove unnecessary parameter for group_classify() The group_classify() function does not use the "env" parameter, so remove it. Also unify code to always use group_classify() to calculate group's load type. Signed-off-by: Leo Yan Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1442314605-14838-1-git-send-email-leo.yan@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 80c62bf..4df37a4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6273,9 +6273,9 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) return false; } -static enum group_type group_classify(struct lb_env *env, - struct sched_group *group, - struct sg_lb_stats *sgs) +static inline enum +group_type group_classify(struct sched_group *group, + struct sg_lb_stats *sgs) { if (sgs->group_no_capacity) return group_overloaded; @@ -6340,7 +6340,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_weight = group->group_weight; sgs->group_no_capacity = group_is_overloaded(env, sgs); - sgs->group_type = group_classify(env, group, sgs); + sgs->group_type = group_classify(group, sgs); } /** @@ -6474,7 +6474,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd group_has_capacity(env, &sds->local_stat) && (sgs->sum_nr_running > 1)) { sgs->group_no_capacity = 1; - sgs->group_type = group_overloaded; + sgs->group_type = group_classify(sg, sgs); } if (update_sd_pick_busiest(env, sds, sg, sgs)) { -- cgit v1.1 From 6e1e5196975fb7ecc501b3fe1075b77aea2b7839 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 14 Sep 2015 00:37:22 -0700 Subject: locking/qrwlock: Rename ->lock to ->wait_lock ... trivial, but reads a little nicer when we name our actual primitive 'lock'. Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman Long Link: http://lkml.kernel.org/r/1442216244-4409-1-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/locking/qrwlock.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index f17a3e3..fec0823 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -86,7 +86,7 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts) /* * Put the reader into the wait queue */ - arch_spin_lock(&lock->lock); + arch_spin_lock(&lock->wait_lock); /* * The ACQUIRE semantics of the following spinning code ensure @@ -99,7 +99,7 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts) /* * Signal the next one in queue to become queue head */ - arch_spin_unlock(&lock->lock); + arch_spin_unlock(&lock->wait_lock); } EXPORT_SYMBOL(queued_read_lock_slowpath); @@ -112,7 +112,7 @@ void queued_write_lock_slowpath(struct qrwlock *lock) u32 cnts; /* Put the writer into the wait queue */ - arch_spin_lock(&lock->lock); + arch_spin_lock(&lock->wait_lock); /* Try to acquire the lock directly if no reader is present */ if (!atomic_read(&lock->cnts) && @@ -144,6 +144,6 @@ void queued_write_lock_slowpath(struct qrwlock *lock) cpu_relax_lowlatency(); } unlock: - arch_spin_unlock(&lock->lock); + arch_spin_unlock(&lock->wait_lock); } EXPORT_SYMBOL(queued_write_lock_slowpath); -- cgit v1.1 From c55a6ffa6285e29f874ed403979472631ec70bff Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 14 Sep 2015 00:37:24 -0700 Subject: locking/osq: Relax atomic semantics ... by using acquire/release for ops around the lock->tail. As such, weakly ordered archs can benefit from more relaxed use of barriers when issuing atomics. Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman Long Link: http://lkml.kernel.org/r/1442216244-4409-3-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/locking/osq_lock.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index dc85ee2..d092a0c 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c @@ -50,7 +50,7 @@ osq_wait_next(struct optimistic_spin_queue *lock, for (;;) { if (atomic_read(&lock->tail) == curr && - atomic_cmpxchg(&lock->tail, curr, old) == curr) { + atomic_cmpxchg_acquire(&lock->tail, curr, old) == curr) { /* * We were the last queued, we moved @lock back. @prev * will now observe @lock and will complete its @@ -92,7 +92,11 @@ bool osq_lock(struct optimistic_spin_queue *lock) node->next = NULL; node->cpu = curr; - old = atomic_xchg(&lock->tail, curr); + /* + * ACQUIRE semantics, pairs with corresponding RELEASE + * in unlock() uncontended, or fastpath. + */ + old = atomic_xchg_acquire(&lock->tail, curr); if (old == OSQ_UNLOCKED_VAL) return true; @@ -184,7 +188,8 @@ void osq_unlock(struct optimistic_spin_queue *lock) /* * Fast path for the uncontended case. */ - if (likely(atomic_cmpxchg(&lock->tail, curr, OSQ_UNLOCKED_VAL) == curr)) + if (likely(atomic_cmpxchg_release(&lock->tail, curr, + OSQ_UNLOCKED_VAL) == curr)) return; /* -- cgit v1.1 From 93edc8bd7750ff3cae088bfca453ea73dc9004a4 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 11 Sep 2015 14:37:34 -0400 Subject: locking/pvqspinlock: Kick the PV CPU unconditionally when _Q_SLOW_VAL If _Q_SLOW_VAL has been set, the vCPU state must have been vcpu_hashed. The extra check at the end of __pv_queued_spin_unlock() is unnecessary and can be removed. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Davidlohr Bueso Cc: Andrew Morton Cc: Douglas Hatch Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1441996658-62854-3-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock_paravirt.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index c8e6e9a..f0450ff 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -267,7 +267,6 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) } if (!lp) { /* ONCE */ - WRITE_ONCE(pn->state, vcpu_hashed); lp = pv_hash(lock, pn); /* @@ -275,11 +274,9 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) * when we observe _Q_SLOW_VAL in __pv_queued_spin_unlock() * we'll be sure to be able to observe our hash entry. * - * [S] pn->state * [S] [Rmw] l->locked == _Q_SLOW_VAL * MB RMB * [RmW] l->locked = _Q_SLOW_VAL [L] - * [L] pn->state * * Matches the smp_rmb() in __pv_queued_spin_unlock(). */ @@ -364,8 +361,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock) * vCPU is harmless other than the additional latency in completing * the unlock. */ - if (READ_ONCE(node->state) == vcpu_hashed) - pv_kick(node->cpu); + pv_kick(node->cpu); } /* * Include the architecture specific callee-save thunk of the -- cgit v1.1 From 49d1dc4b81797f88270832b11e9f73809e7e7209 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 18 Sep 2015 11:56:28 -0400 Subject: cgroup: implement static_key based cgroup_subsys_enabled() and cgroup_subsys_on_dfl() Whether a subsys is enabled and attached to the default hierarchy seldom changes and may be tested in the hot paths. This patch implements static_key based cgroup_subsys_enabled() and cgroup_subsys_on_dfl() tests. The following patches will update the users and remove duplicate mechanisms. Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2cf0f79..3619389 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -139,6 +139,27 @@ static const char *cgroup_subsys_name[] = { }; #undef SUBSYS +/* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */ +#define SUBSYS(_x) \ + DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key); \ + DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key); \ + EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key); \ + EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key); +#include +#undef SUBSYS + +#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key, +static struct static_key_true *cgroup_subsys_enabled_key[] = { +#include +}; +#undef SUBSYS + +#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key, +static struct static_key_true *cgroup_subsys_on_dfl_key[] = { +#include +}; +#undef SUBSYS + /* * The default hierarchy, reserved for the subsystems that are otherwise * unattached - it never has more than a single cgroup, and all tasks are @@ -1319,9 +1340,12 @@ static int rebind_subsystems(struct cgroup_root *dst_root, /* default hierarchy doesn't enable controllers by default */ dst_root->subsys_mask |= 1 << ssid; - if (dst_root != &cgrp_dfl_root) { + if (dst_root == &cgrp_dfl_root) { + static_branch_enable(cgroup_subsys_on_dfl_key[ssid]); + } else { dst_root->cgrp.subtree_control |= 1 << ssid; cgroup_refresh_child_subsys_mask(&dst_root->cgrp); + static_branch_disable(cgroup_subsys_on_dfl_key[ssid]); } if (ss->bind) @@ -5483,6 +5507,7 @@ static int __init cgroup_disable(char *str) strcmp(token, ss->legacy_name)) continue; + static_branch_disable(cgroup_subsys_enabled_key[i]); ss->disabled = 1; printk(KERN_INFO "Disabling %s control group subsystem\n", ss->name); -- cgit v1.1 From fc5ed1e95410ad73b2ab8f33cd90eb3bcf6c98a1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 18 Sep 2015 11:56:28 -0400 Subject: cgroup: replace cgroup_subsys->disabled tests with cgroup_subsys_enabled() Replace cgroup_subsys->disabled tests in controllers with cgroup_subsys_enabled(). cgroup_subsys_enabled() requires literal subsys name as its parameter and thus can't be used for cgroup core which iterates through controllers. For cgroup core, introduce and use cgroup_ssid_enabled() which uses slower static_key_enabled() test and can be indexed by subsys ID. This leaves cgroup_subsys->disabled unused. Removed. Signed-off-by: Tejun Heo Acked-by: Zefan Li Cc: Johannes Weiner Cc: Michal Hocko --- kernel/cgroup.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3619389..5703ba7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -224,6 +224,19 @@ static void kill_css(struct cgroup_subsys_state *css); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); +/** + * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID + * @ssid: subsys ID of interest + * + * cgroup_subsys_enabled() can only be used with literal subsys names which + * is fine for individual subsystems but unsuitable for cgroup core. This + * is slower static_key_enabled() based test indexed by @ssid. + */ +static bool cgroup_ssid_enabled(int ssid) +{ + return static_key_enabled(cgroup_subsys_enabled_key[ssid]); +} + /* IDR wrappers which synchronize using cgroup_idr_lock */ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, gfp_t gfp_mask) @@ -1482,7 +1495,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) for_each_subsys(ss, i) { if (strcmp(token, ss->legacy_name)) continue; - if (ss->disabled) + if (!cgroup_ssid_enabled(i)) continue; /* Mutually exclusive option 'all' + subsystem name */ @@ -1513,7 +1526,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) */ if (all_ss || (!one_ss && !opts->none && !opts->name)) for_each_subsys(ss, i) - if (!ss->disabled) + if (cgroup_ssid_enabled(i)) opts->subsys_mask |= (1 << i); /* @@ -2762,7 +2775,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, if (tok[0] == '\0') continue; for_each_subsys_which(ss, ssid, &tmp_ss_mask) { - if (ss->disabled || strcmp(tok + 1, ss->name)) + if (!cgroup_ssid_enabled(ssid) || + strcmp(tok + 1, ss->name)) continue; if (*tok == '+') { @@ -3320,7 +3334,7 @@ static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { int ret; - if (ss->disabled) + if (!cgroup_ssid_enabled(ss->id)) return 0; if (!cfts || cfts[0].name[0] == '\0') @@ -5082,7 +5096,7 @@ int __init cgroup_init(void) * disabled flag and cftype registration needs kmalloc, * both of which aren't available during early_init. */ - if (ss->disabled) + if (!cgroup_ssid_enabled(ssid)) continue; cgrp_dfl_root.subsys_mask |= 1 << ss->id; @@ -5217,7 +5231,8 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) for_each_subsys(ss, i) seq_printf(m, "%s\t%d\t%d\t%d\n", ss->legacy_name, ss->root->hierarchy_id, - atomic_read(&ss->root->nr_cgrps), !ss->disabled); + atomic_read(&ss->root->nr_cgrps), + cgroup_ssid_enabled(i)); mutex_unlock(&cgroup_mutex); return 0; @@ -5508,7 +5523,6 @@ static int __init cgroup_disable(char *str) continue; static_branch_disable(cgroup_subsys_enabled_key[i]); - ss->disabled = 1; printk(KERN_INFO "Disabling %s control group subsystem\n", ss->name); break; -- cgit v1.1 From 9e10a130d9b62af976d17d120c95f3650769312c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 18 Sep 2015 11:56:28 -0400 Subject: cgroup: replace cgroup_on_dfl() tests in controllers with cgroup_subsys_on_dfl() cgroup_on_dfl() tests whether the cgroup's root is the default hierarchy; however, an individual controller is only interested in whether the controller is attached to the default hierarchy and never tests a cgroup which doesn't belong to the hierarchy that the controller is attached to. This patch replaces cgroup_on_dfl() tests in controllers with faster static_key based cgroup_subsys_on_dfl(). This leaves cgroup core as the only user of cgroup_on_dfl() and the function is moved from the header file to cgroup.c. Signed-off-by: Tejun Heo Acked-by: Zefan Li Cc: Vivek Goyal Cc: Jens Axboe Cc: Johannes Weiner Cc: Michal Hocko --- kernel/cgroup.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/cpuset.c | 23 +++++++++++++---------- 2 files changed, 71 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5703ba7..c24f929 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -237,6 +237,64 @@ static bool cgroup_ssid_enabled(int ssid) return static_key_enabled(cgroup_subsys_enabled_key[ssid]); } +/** + * cgroup_on_dfl - test whether a cgroup is on the default hierarchy + * @cgrp: the cgroup of interest + * + * The default hierarchy is the v2 interface of cgroup and this function + * can be used to test whether a cgroup is on the default hierarchy for + * cases where a subsystem should behave differnetly depending on the + * interface version. + * + * The set of behaviors which change on the default hierarchy are still + * being determined and the mount option is prefixed with __DEVEL__. + * + * List of changed behaviors: + * + * - Mount options "noprefix", "xattr", "clone_children", "release_agent" + * and "name" are disallowed. + * + * - When mounting an existing superblock, mount options should match. + * + * - Remount is disallowed. + * + * - rename(2) is disallowed. + * + * - "tasks" is removed. Everything should be at process granularity. Use + * "cgroup.procs" instead. + * + * - "cgroup.procs" is not sorted. pids will be unique unless they got + * recycled inbetween reads. + * + * - "release_agent" and "notify_on_release" are removed. Replacement + * notification mechanism will be implemented. + * + * - "cgroup.clone_children" is removed. + * + * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup + * and its descendants contain no task; otherwise, 1. The file also + * generates kernfs notification which can be monitored through poll and + * [di]notify when the value of the file changes. + * + * - cpuset: tasks will be kept in empty cpusets when hotplug happens and + * take masks of ancestors with non-empty cpus/mems, instead of being + * moved to an ancestor. + * + * - cpuset: a task can be moved into an empty cpuset, and again it takes + * masks of ancestors. + * + * - memcg: use_hierarchy is on by default and the cgroup file for the flag + * is not created. + * + * - blkcg: blk-throttle becomes properly hierarchical. + * + * - debug: disallowed on the default hierarchy. + */ +static bool cgroup_on_dfl(const struct cgroup *cgrp) +{ + return cgrp->root == &cgrp_dfl_root; +} + /* IDR wrappers which synchronize using cgroup_idr_lock */ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, gfp_t gfp_mask) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index f0acff0..20eedd8 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -473,7 +473,8 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) /* On legacy hiearchy, we must be a subset of our parent cpuset. */ ret = -EACCES; - if (!cgroup_on_dfl(cur->css.cgroup) && !is_cpuset_subset(trial, par)) + if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + !is_cpuset_subset(trial, par)) goto out; /* @@ -879,7 +880,8 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) * If it becomes empty, inherit the effective mask of the * parent, which is guaranteed to have some CPUs. */ - if (cgroup_on_dfl(cp->css.cgroup) && cpumask_empty(new_cpus)) + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + cpumask_empty(new_cpus)) cpumask_copy(new_cpus, parent->effective_cpus); /* Skip the whole subtree if the cpumask remains the same. */ @@ -896,7 +898,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) cpumask_copy(cp->effective_cpus, new_cpus); spin_unlock_irq(&callback_lock); - WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && + WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); update_tasks_cpumask(cp); @@ -1135,7 +1137,8 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) * If it becomes empty, inherit the effective mask of the * parent, which is guaranteed to have some MEMs. */ - if (cgroup_on_dfl(cp->css.cgroup) && nodes_empty(*new_mems)) + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + nodes_empty(*new_mems)) *new_mems = parent->effective_mems; /* Skip the whole subtree if the nodemask remains the same. */ @@ -1152,7 +1155,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) cp->effective_mems = *new_mems; spin_unlock_irq(&callback_lock); - WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && + WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && !nodes_equal(cp->mems_allowed, cp->effective_mems)); update_tasks_nodemask(cp); @@ -1440,7 +1443,7 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css, /* allow moving tasks into an empty cpuset if on default hierarchy */ ret = -ENOSPC; - if (!cgroup_on_dfl(css->cgroup) && + if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) goto out_unlock; @@ -1952,7 +1955,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cpuset_inc(); spin_lock_irq(&callback_lock); - if (cgroup_on_dfl(cs->css.cgroup)) { + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { cpumask_copy(cs->effective_cpus, parent->effective_cpus); cs->effective_mems = parent->effective_mems; } @@ -2029,7 +2032,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) mutex_lock(&cpuset_mutex); spin_lock_irq(&callback_lock); - if (cgroup_on_dfl(root_css->cgroup)) { + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); top_cpuset.mems_allowed = node_possible_map; } else { @@ -2210,7 +2213,7 @@ retry: cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); mems_updated = !nodes_equal(new_mems, cs->effective_mems); - if (cgroup_on_dfl(cs->css.cgroup)) + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) hotplug_update_tasks(cs, &new_cpus, &new_mems, cpus_updated, mems_updated); else @@ -2241,7 +2244,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) static cpumask_t new_cpus; static nodemask_t new_mems; bool cpus_updated, mems_updated; - bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup); + bool on_dfl = cgroup_subsys_on_dfl(cpuset_cgrp_subsys); mutex_lock(&cpuset_mutex); -- cgit v1.1 From 4a07c222d3afb00e1113834fee38d23a8e5d71dc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 18 Sep 2015 17:54:22 -0400 Subject: cgroup: replace "cgroup.populated" with "cgroup.events" memcg already uses "memory.events" for event reporting and other controllers may need event reporting too. Let's standardize on "$SUBSYS.events" interface file for reporting events which don't happen too frequently and thus can share event notification. "cgroup.populated" is replaced with "populated" field in "cgroup.events" and documentation is updated accordingly. Signed-off-by: Tejun Heo Cc: Li Zefan Cc: Johannes Weiner --- kernel/cgroup.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c24f929..75eba25 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -611,8 +611,8 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) if (!trigger) break; - if (cgrp->populated_kn) - kernfs_notify(cgrp->populated_kn); + if (cgrp->events_kn) + kernfs_notify(cgrp->events_kn); cgrp = cgroup_parent(cgrp); } while (cgrp); } @@ -3045,9 +3045,10 @@ err_undo_css: goto out_unlock; } -static int cgroup_populated_show(struct seq_file *seq, void *v) +static int cgroup_events_show(struct seq_file *seq, void *v) { - seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt); + seq_printf(seq, "populated %d\n", + (bool)seq_css(seq)->cgroup->populated_cnt); return 0; } @@ -3214,8 +3215,8 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) if (cft->write == cgroup_procs_write) cgrp->procs_kn = kn; - else if (cft->seq_show == cgroup_populated_show) - cgrp->populated_kn = kn; + else if (cft->seq_show == cgroup_events_show) + cgrp->events_kn = kn; return 0; } @@ -4388,9 +4389,9 @@ static struct cftype cgroup_dfl_base_files[] = { .write = cgroup_subtree_control_write, }, { - .name = "cgroup.populated", + .name = "cgroup.events", .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = cgroup_populated_show, + .seq_show = cgroup_events_show, }, { } /* terminate */ }; -- cgit v1.1 From 7dbdb199d3bf88f043ea17e97113eb28d5b100bc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 18 Sep 2015 17:54:23 -0400 Subject: cgroup: replace cftype->mode with CFTYPE_WORLD_WRITABLE cftype->mode allows controllers to give arbitrary permissions to interface knobs. Except for "cgroup.event_control", the existing uses are spurious. * Some explicitly specify S_IRUGO | S_IWUSR even though that's the default. * "cpuset.memory_pressure" specifies S_IRUGO while also setting a write callback which returns -EACCES. All it needs to do is simply not setting a write callback. "cgroup.event_control" uses cftype->mode to make the file world-writable. It's a misdesigned interface and we don't want controllers to be tweaking interface file permissions in general. This patch removes cftype->mode and all its spurious uses and implements CFTYPE_WORLD_WRITABLE for "cgroup.event_control" which is marked as compatibility-only. Signed-off-by: Tejun Heo Cc: Li Zefan Cc: Johannes Weiner --- kernel/cgroup.c | 19 +++++++------------ kernel/cpuset.c | 6 ------ 2 files changed, 7 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 75eba25..5031edc 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1139,23 +1139,21 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, * cgroup_file_mode - deduce file mode of a control file * @cft: the control file in question * - * returns cft->mode if ->mode is not 0 - * returns S_IRUGO|S_IWUSR if it has both a read and a write handler - * returns S_IRUGO if it has only a read handler - * returns S_IWUSR if it has only a write hander + * S_IRUGO for read, S_IWUSR for write. */ static umode_t cgroup_file_mode(const struct cftype *cft) { umode_t mode = 0; - if (cft->mode) - return cft->mode; - if (cft->read_u64 || cft->read_s64 || cft->seq_show) mode |= S_IRUGO; - if (cft->write_u64 || cft->write_s64 || cft->write) - mode |= S_IWUSR; + if (cft->write_u64 || cft->write_s64 || cft->write) { + if (cft->flags & CFTYPE_WORLD_WRITABLE) + mode |= S_IWUGO; + else + mode |= S_IWUSR; + } return mode; } @@ -4371,7 +4369,6 @@ static struct cftype cgroup_dfl_base_files[] = { .seq_show = cgroup_pidlist_show, .private = CGROUP_FILE_PROCS, .write = cgroup_procs_write, - .mode = S_IRUGO | S_IWUSR, }, { .name = "cgroup.controllers", @@ -4406,7 +4403,6 @@ static struct cftype cgroup_legacy_base_files[] = { .seq_show = cgroup_pidlist_show, .private = CGROUP_FILE_PROCS, .write = cgroup_procs_write, - .mode = S_IRUGO | S_IWUSR, }, { .name = "cgroup.clone_children", @@ -4426,7 +4422,6 @@ static struct cftype cgroup_legacy_base_files[] = { .seq_show = cgroup_pidlist_show, .private = CGROUP_FILE_TASKS, .write = cgroup_tasks_write, - .mode = S_IRUGO | S_IWUSR, }, { .name = "notify_on_release", diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 20eedd8..312961e 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1597,9 +1597,6 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, case FILE_MEMORY_PRESSURE_ENABLED: cpuset_memory_pressure_enabled = !!val; break; - case FILE_MEMORY_PRESSURE: - retval = -EACCES; - break; case FILE_SPREAD_PAGE: retval = update_flag(CS_SPREAD_PAGE, cs, val); break; @@ -1866,9 +1863,6 @@ static struct cftype files[] = { { .name = "memory_pressure", .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEMORY_PRESSURE, - .mode = S_IRUGO, }, { -- cgit v1.1 From ccdca2187b03decd186df13e44c57fbf1974d0a0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 18 Sep 2015 17:54:23 -0400 Subject: cgroup: relocate cgroup_populate_dir() Move it upwards so that it's right below cgroup_clear_dir() and the forward declaration is unnecessary. Signed-off-by: Tejun Heo Cc: Li Zefan Cc: Johannes Weiner --- kernel/cgroup.c | 63 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 31 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5031edc..c38c3a2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1116,7 +1116,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, * update of a tasks cgroup pointer by cgroup_attach_task() */ -static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); static struct kernfs_syscall_ops cgroup_kf_syscall_ops; static const struct file_operations proc_cgroupstats_operations; @@ -1333,6 +1332,37 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) } } +/** + * cgroup_populate_dir - create subsys files in a cgroup directory + * @cgrp: target cgroup + * @subsys_mask: mask of the subsystem ids whose files should be added + * + * On failure, no file is added. + */ +static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) +{ + struct cgroup_subsys *ss; + int i, ret = 0; + + /* process cftsets of each subsystem */ + for_each_subsys(ss, i) { + struct cftype *cfts; + + if (!(subsys_mask & (1 << i))) + continue; + + list_for_each_entry(cfts, &ss->cfts, node) { + ret = cgroup_addrm_files(cgrp, cfts, true); + if (ret < 0) + goto err; + } + } + return 0; +err: + cgroup_clear_dir(cgrp, subsys_mask); + return ret; +} + static int rebind_subsystems(struct cgroup_root *dst_root, unsigned long ss_mask) { @@ -4438,37 +4468,6 @@ static struct cftype cgroup_legacy_base_files[] = { { } /* terminate */ }; -/** - * cgroup_populate_dir - create subsys files in a cgroup directory - * @cgrp: target cgroup - * @subsys_mask: mask of the subsystem ids whose files should be added - * - * On failure, no file is added. - */ -static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) -{ - struct cgroup_subsys *ss; - int i, ret = 0; - - /* process cftsets of each subsystem */ - for_each_subsys(ss, i) { - struct cftype *cfts; - - if (!(subsys_mask & (1 << i))) - continue; - - list_for_each_entry(cfts, &ss->cfts, node) { - ret = cgroup_addrm_files(cgrp, cfts, true); - if (ret < 0) - goto err; - } - } - return 0; -err: - cgroup_clear_dir(cgrp, subsys_mask); - return ret; -} - /* * css destruction is four-stage process. * -- cgit v1.1 From 6732ed853af942ba20ddbd091336acc7df569119 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 18 Sep 2015 17:54:23 -0400 Subject: cgroup: make cgroup_addrm_files() clean up after itself on failures After a file creation failure, cgroup_addrm_files() it didn't remove the files which had already been created. When cgroup_populate_dir() is the caller, this is fine as the caller performs cleanup; however, for other callers, this may leave unactivated dangling files behind. As kernfs directory removals are recursive, this doesn't lead to permanent memory leak but it can, for example, fail future attempts to create those files again. There's no point in keeping around this sort of subtlety and it gets in the way of planned updates to file handling. This patch makes cgroup_addrm_files() clean up after itself on failures. Signed-off-by: Tejun Heo Cc: Li Zefan Cc: Johannes Weiner --- kernel/cgroup.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c38c3a2..f09bab1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3255,19 +3255,18 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) * @is_add: whether to add or remove * * Depending on @is_add, add or remove files defined by @cfts on @cgrp. - * For removals, this function never fails. If addition fails, this - * function doesn't remove files already added. The caller is responsible - * for cleaning up. + * For removals, this function never fails. */ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add) { - struct cftype *cft; + struct cftype *cft, *cft_end = NULL; int ret; lockdep_assert_held(&cgroup_mutex); - for (cft = cfts; cft->name[0] != '\0'; cft++) { +restart: + for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) { /* does cft->flags tell us to skip this file on @cgrp? */ if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) continue; @@ -3283,7 +3282,9 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], if (ret) { pr_warn("%s: failed to add %s, err=%d\n", __func__, cft->name, ret); - return ret; + cft_end = cft; + is_add = false; + goto restart; } } else { cgroup_rm_file(cgrp, cft); -- cgit v1.1 From 1ada48387a31b437074aa8973ce81de6e8c60bde Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 18 Sep 2015 17:54:23 -0400 Subject: cgroup: cosmetic updates to rebind_subsystems() * Use local variables @scgrp and @dcgrp for @src_root->cgrp and @dst_root->cgrp respectively. * Use initializers to set @src_root and @css in the inner bind loop. Signed-off-by: Tejun Heo Cc: Li Zefan Cc: Johannes Weiner --- kernel/cgroup.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f09bab1..a4ff496 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1366,6 +1366,7 @@ err: static int rebind_subsystems(struct cgroup_root *dst_root, unsigned long ss_mask) { + struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; unsigned long tmp_ss_mask; int ssid, i, ret; @@ -1387,7 +1388,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, if (dst_root == &cgrp_dfl_root) tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask; - ret = cgroup_populate_dir(&dst_root->cgrp, tmp_ss_mask); + ret = cgroup_populate_dir(dcgrp, tmp_ss_mask); if (ret) { if (dst_root != &cgrp_dfl_root) return ret; @@ -1413,37 +1414,35 @@ static int rebind_subsystems(struct cgroup_root *dst_root, cgroup_clear_dir(&ss->root->cgrp, 1 << ssid); for_each_subsys_which(ss, ssid, &ss_mask) { - struct cgroup_root *src_root; - struct cgroup_subsys_state *css; + struct cgroup_root *src_root = ss->root; + struct cgroup *scgrp = &src_root->cgrp; + struct cgroup_subsys_state *css = cgroup_css(scgrp, ss); struct css_set *cset; - src_root = ss->root; - css = cgroup_css(&src_root->cgrp, ss); - - WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss)); + WARN_ON(!css || cgroup_css(dcgrp, ss)); - RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL); - rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css); + RCU_INIT_POINTER(scgrp->subsys[ssid], NULL); + rcu_assign_pointer(dcgrp->subsys[ssid], css); ss->root = dst_root; - css->cgroup = &dst_root->cgrp; + css->cgroup = dcgrp; down_write(&css_set_rwsem); hash_for_each(css_set_table, i, cset, hlist) list_move_tail(&cset->e_cset_node[ss->id], - &dst_root->cgrp.e_csets[ss->id]); + &dcgrp->e_csets[ss->id]); up_write(&css_set_rwsem); src_root->subsys_mask &= ~(1 << ssid); - src_root->cgrp.subtree_control &= ~(1 << ssid); - cgroup_refresh_child_subsys_mask(&src_root->cgrp); + scgrp->subtree_control &= ~(1 << ssid); + cgroup_refresh_child_subsys_mask(scgrp); /* default hierarchy doesn't enable controllers by default */ dst_root->subsys_mask |= 1 << ssid; if (dst_root == &cgrp_dfl_root) { static_branch_enable(cgroup_subsys_on_dfl_key[ssid]); } else { - dst_root->cgrp.subtree_control |= 1 << ssid; - cgroup_refresh_child_subsys_mask(&dst_root->cgrp); + dcgrp->subtree_control |= 1 << ssid; + cgroup_refresh_child_subsys_mask(dcgrp); static_branch_disable(cgroup_subsys_on_dfl_key[ssid]); } @@ -1451,7 +1450,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, ss->bind(css); } - kernfs_activate(dst_root->cgrp.kn); + kernfs_activate(dcgrp->kn); return 0; } -- cgit v1.1 From 4df8dc903161c03cd9bff9077d8427fefe808e6d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 18 Sep 2015 17:54:23 -0400 Subject: cgroup: restructure file creation / removal handling The file creation / removal path has always been a bit icky and the planned notification update requires css during file creation. Restructure as follows. * cgroup_addrm_files() now takes both @css and @cgrp and is only called directly by other file handling functions. * cgroup_populate/clear_dir() are replaced with css_populate/clear_dir() taking @css and @cgrp_override. @cgrp_override is used only when files needs to be created on / removed from a cgroup which isn't attached to @css which happens during subsystem rebinds. Subsystem loops are moved to the callers. * cgroup_add_file() now takes both @css and @cgrp. @css isn't used yet but will be used by the planned notification update. This patch doens't cause any behavior changes. Signed-off-by: Tejun Heo Cc: Li Zefan Cc: Johannes Weiner --- kernel/cgroup.c | 143 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 76 insertions(+), 67 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a4ff496..06c9d1a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -221,7 +221,8 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, bool visible); static void css_release(struct percpu_ref *ref); static void kill_css(struct cgroup_subsys_state *css); -static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], +static int cgroup_addrm_files(struct cgroup_subsys_state *css, + struct cgroup *cgrp, struct cftype cfts[], bool is_add); /** @@ -1313,53 +1314,57 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) } /** - * cgroup_clear_dir - remove subsys files in a cgroup directory - * @cgrp: target cgroup - * @subsys_mask: mask of the subsystem ids whose files should be removed + * css_clear_dir - remove subsys files in a cgroup directory + * @css: taget css + * @cgrp_override: specify if target cgroup is different from css->cgroup */ -static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) +static void css_clear_dir(struct cgroup_subsys_state *css, + struct cgroup *cgrp_override) { - struct cgroup_subsys *ss; - int i; + struct cgroup *cgrp = cgrp_override ?: css->cgroup; + struct cftype *cfts; - for_each_subsys(ss, i) { - struct cftype *cfts; - - if (!(subsys_mask & (1 << i))) - continue; - list_for_each_entry(cfts, &ss->cfts, node) - cgroup_addrm_files(cgrp, cfts, false); - } + list_for_each_entry(cfts, &css->ss->cfts, node) + cgroup_addrm_files(css, cgrp, cfts, false); } /** - * cgroup_populate_dir - create subsys files in a cgroup directory - * @cgrp: target cgroup - * @subsys_mask: mask of the subsystem ids whose files should be added + * css_populate_dir - create subsys files in a cgroup directory + * @css: target css + * @cgrp_overried: specify if target cgroup is different from css->cgroup * * On failure, no file is added. */ -static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) +static int css_populate_dir(struct cgroup_subsys_state *css, + struct cgroup *cgrp_override) { - struct cgroup_subsys *ss; - int i, ret = 0; + struct cgroup *cgrp = cgrp_override ?: css->cgroup; + struct cftype *cfts, *failed_cfts; + int ret; - /* process cftsets of each subsystem */ - for_each_subsys(ss, i) { - struct cftype *cfts; + if (!css->ss) { + if (cgroup_on_dfl(cgrp)) + cfts = cgroup_dfl_base_files; + else + cfts = cgroup_legacy_base_files; - if (!(subsys_mask & (1 << i))) - continue; + return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true); + } - list_for_each_entry(cfts, &ss->cfts, node) { - ret = cgroup_addrm_files(cgrp, cfts, true); - if (ret < 0) - goto err; + list_for_each_entry(cfts, &css->ss->cfts, node) { + ret = cgroup_addrm_files(css, cgrp, cfts, true); + if (ret < 0) { + failed_cfts = cfts; + goto err; } } return 0; err: - cgroup_clear_dir(cgrp, subsys_mask); + list_for_each_entry(cfts, &css->ss->cfts, node) { + if (cfts == failed_cfts) + break; + cgroup_addrm_files(css, cgrp, cfts, false); + } return ret; } @@ -1388,10 +1393,13 @@ static int rebind_subsystems(struct cgroup_root *dst_root, if (dst_root == &cgrp_dfl_root) tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask; - ret = cgroup_populate_dir(dcgrp, tmp_ss_mask); - if (ret) { - if (dst_root != &cgrp_dfl_root) - return ret; + for_each_subsys_which(ss, ssid, &tmp_ss_mask) { + struct cgroup *scgrp = &ss->root->cgrp; + int tssid; + + ret = css_populate_dir(cgroup_css(scgrp, ss), dcgrp); + if (!ret) + continue; /* * Rebinding back to the default root is not allowed to @@ -1399,20 +1407,27 @@ static int rebind_subsystems(struct cgroup_root *dst_root, * be rare. Moving subsystems back and forth even more so. * Just warn about it and continue. */ - if (cgrp_dfl_root_visible) { - pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n", - ret, ss_mask); - pr_warn("you may retry by moving them to a different hierarchy and unbinding\n"); + if (dst_root == &cgrp_dfl_root) { + if (cgrp_dfl_root_visible) { + pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n", + ret, ss_mask); + pr_warn("you may retry by moving them to a different hierarchy and unbinding\n"); + } + continue; } + + for_each_subsys_which(ss, tssid, &tmp_ss_mask) { + if (tssid == ssid) + break; + css_clear_dir(cgroup_css(scgrp, ss), dcgrp); + } + return ret; } /* * Nothing can fail from this point on. Remove files for the * removed subsystems and rebind each subsystem. */ - for_each_subsys_which(ss, ssid, &ss_mask) - cgroup_clear_dir(&ss->root->cgrp, 1 << ssid); - for_each_subsys_which(ss, ssid, &ss_mask) { struct cgroup_root *src_root = ss->root; struct cgroup *scgrp = &src_root->cgrp; @@ -1421,6 +1436,8 @@ static int rebind_subsystems(struct cgroup_root *dst_root, WARN_ON(!css || cgroup_css(dcgrp, ss)); + css_clear_dir(css, NULL); + RCU_INIT_POINTER(scgrp->subsys[ssid], NULL); rcu_assign_pointer(dcgrp->subsys[ssid], css); ss->root = dst_root; @@ -1791,7 +1808,6 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; - struct cftype *base_files; struct css_set *cset; int i, ret; @@ -1830,12 +1846,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) } root_cgrp->kn = root->kf_root->kn; - if (root == &cgrp_dfl_root) - base_files = cgroup_dfl_base_files; - else - base_files = cgroup_legacy_base_files; - - ret = cgroup_addrm_files(root_cgrp, base_files, true); + ret = css_populate_dir(&root_cgrp->self, NULL); if (ret) goto destroy_root; @@ -2985,7 +2996,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, ret = create_css(child, ss, cgrp->subtree_control & (1 << ssid)); else - ret = cgroup_populate_dir(child, 1 << ssid); + ret = css_populate_dir(cgroup_css(child, ss), + NULL); if (ret) goto err_undo_css; } @@ -3018,7 +3030,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, if (css_disable & (1 << ssid)) { kill_css(css); } else { - cgroup_clear_dir(child, 1 << ssid); + css_clear_dir(css, NULL); if (ss->css_reset) ss->css_reset(css); } @@ -3066,7 +3078,7 @@ err_undo_css: if (css_enable & (1 << ssid)) kill_css(css); else - cgroup_clear_dir(child, 1 << ssid); + css_clear_dir(css, NULL); } } goto out_unlock; @@ -3218,7 +3230,8 @@ static int cgroup_kn_set_ugid(struct kernfs_node *kn) return kernfs_setattr(kn, &iattr); } -static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) +static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, + struct cftype *cft) { char name[CGROUP_FILE_NAME_MAX]; struct kernfs_node *kn; @@ -3249,14 +3262,16 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) /** * cgroup_addrm_files - add or remove files to a cgroup directory - * @cgrp: the target cgroup + * @css: the target css + * @cgrp: the target cgroup (usually css->cgroup) * @cfts: array of cftypes to be added * @is_add: whether to add or remove * * Depending on @is_add, add or remove files defined by @cfts on @cgrp. * For removals, this function never fails. */ -static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], +static int cgroup_addrm_files(struct cgroup_subsys_state *css, + struct cgroup *cgrp, struct cftype cfts[], bool is_add) { struct cftype *cft, *cft_end = NULL; @@ -3277,7 +3292,7 @@ restart: continue; if (is_add) { - ret = cgroup_add_file(cgrp, cft); + ret = cgroup_add_file(css, cgrp, cft); if (ret) { pr_warn("%s: failed to add %s, err=%d\n", __func__, cft->name, ret); @@ -3309,7 +3324,7 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) if (cgroup_is_dead(cgrp)) continue; - ret = cgroup_addrm_files(cgrp, cfts, is_add); + ret = cgroup_addrm_files(css, cgrp, cfts, is_add); if (ret) break; } @@ -4685,7 +4700,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, css->id = err; if (visible) { - err = cgroup_populate_dir(cgrp, 1 << ss->id); + err = css_populate_dir(css, NULL); if (err) goto err_free_id; } @@ -4711,7 +4726,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, err_list_del: list_del_rcu(&css->sibling); - cgroup_clear_dir(css->cgroup, 1 << css->ss->id); + css_clear_dir(css, NULL); err_free_id: cgroup_idr_remove(&ss->css_idr, css->id); err_free_percpu_ref: @@ -4728,7 +4743,6 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, struct cgroup_root *root; struct cgroup_subsys *ss; struct kernfs_node *kn; - struct cftype *base_files; int ssid, ret; /* Do not accept '\n' to prevent making /proc//cgroup unparsable. @@ -4804,12 +4818,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (ret) goto out_destroy; - if (cgroup_on_dfl(cgrp)) - base_files = cgroup_dfl_base_files; - else - base_files = cgroup_legacy_base_files; - - ret = cgroup_addrm_files(cgrp, base_files, true); + ret = css_populate_dir(&cgrp->self, NULL); if (ret) goto out_destroy; @@ -4896,7 +4905,7 @@ static void kill_css(struct cgroup_subsys_state *css) * This must happen before css is disassociated with its cgroup. * See seq_css() for details. */ - cgroup_clear_dir(css->cgroup, 1 << css->ss->id); + css_clear_dir(css, NULL); /* * Killing would put the base ref, but we need to keep it alive -- cgit v1.1 From 6f60eade2433cb3a38687d5f8a4f44b92c6c51bf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 18 Sep 2015 17:54:23 -0400 Subject: cgroup: generalize obtaining the handles of and notifying cgroup files cgroup core handles creations and removals of cgroup interface files as described by cftypes. There are cases where the handle for a given file instance is necessary, for example, to generate a file modified event. Currently, this is handled by explicitly matching the callback method pointer and storing the file handle manually in cgroup_add_file(). While this simple approach works for cgroup core files, it can't for controller interface files. This patch generalizes cgroup interface file handle handling. struct cgroup_file is defined and each cftype can optionally tell cgroup core to store the file handle by setting ->file_offset. A file handle remains accessible as long as the containing css is accessible. Both "cgroup.procs" and "cgroup.events" are converted to use the new generic mechanism instead of hooking directly into cgroup_add_file(). Also, cgroup_file_notify() which takes a struct cgroup_file and generates a file modified event on it is added and replaces explicit kernfs_notify() invocations. This generalizes cgroup file handle handling and allows controllers to generate file modified notifications. Signed-off-by: Tejun Heo Cc: Li Zefan Cc: Johannes Weiner --- kernel/cgroup.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 06c9d1a..0be276f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -612,8 +612,8 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) if (!trigger) break; - if (cgrp->events_kn) - kernfs_notify(cgrp->events_kn); + cgroup_file_notify(&cgrp->events_file); + cgrp = cgroup_parent(cgrp); } while (cgrp); } @@ -1771,6 +1771,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->self.sibling); INIT_LIST_HEAD(&cgrp->self.children); + INIT_LIST_HEAD(&cgrp->self.files); INIT_LIST_HEAD(&cgrp->cset_links); INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); @@ -2562,7 +2563,7 @@ static int cgroup_procs_write_permission(struct task_struct *task, cgrp = cgroup_parent(cgrp); ret = -ENOMEM; - inode = kernfs_get_inode(sb, cgrp->procs_kn); + inode = kernfs_get_inode(sb, cgrp->procs_file.kn); if (inode) { ret = inode_permission(inode, MAY_WRITE); iput(inode); @@ -3253,10 +3254,14 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, return ret; } - if (cft->write == cgroup_procs_write) - cgrp->procs_kn = kn; - else if (cft->seq_show == cgroup_events_show) - cgrp->events_kn = kn; + if (cft->file_offset) { + struct cgroup_file *cfile = (void *)css + cft->file_offset; + + kernfs_get(kn); + cfile->kn = kn; + list_add(&cfile->node, &css->files); + } + return 0; } @@ -4408,6 +4413,7 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css, static struct cftype cgroup_dfl_base_files[] = { { .name = "cgroup.procs", + .file_offset = offsetof(struct cgroup, procs_file), .seq_start = cgroup_pidlist_start, .seq_next = cgroup_pidlist_next, .seq_stop = cgroup_pidlist_stop, @@ -4433,6 +4439,7 @@ static struct cftype cgroup_dfl_base_files[] = { { .name = "cgroup.events", .flags = CFTYPE_NOT_ON_ROOT, + .file_offset = offsetof(struct cgroup, events_file), .seq_show = cgroup_events_show, }, { } /* terminate */ @@ -4511,9 +4518,13 @@ static void css_free_work_fn(struct work_struct *work) container_of(work, struct cgroup_subsys_state, destroy_work); struct cgroup_subsys *ss = css->ss; struct cgroup *cgrp = css->cgroup; + struct cgroup_file *cfile; percpu_ref_exit(&css->refcnt); + list_for_each_entry(cfile, &css->files, node) + kernfs_put(cfile->kn); + if (ss) { /* css free path */ int id = css->id; @@ -4618,6 +4629,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css, css->ss = ss; INIT_LIST_HEAD(&css->sibling); INIT_LIST_HEAD(&css->children); + INIT_LIST_HEAD(&css->files); css->serial_nr = css_serial_nr_next++; if (cgroup_parent(cgrp)) { -- cgit v1.1 From 19a5ecde086a6a5287978b12ae948fa691b197b7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 20 Sep 2015 21:01:22 -0700 Subject: rcu: Suppress lockdep false positive for rcp->exp_funnel_mutex In kernels built with CONFIG_PREEMPT=y, synchronize_rcu_expedited() invokes synchronize_sched_expedited() while holding RCU-preempt's root rcu_node structure's ->exp_funnel_mutex, which is acquired after the rcu_data structure's ->exp_funnel_mutex. The first thing that synchronize_sched_expedited() will do is acquire RCU-sched's rcu_data structure's ->exp_funnel_mutex. There is no danger of an actual deadlock because the locking order is always from RCU-preempt's expedited mutexes to those of RCU-sched. Unfortunately, lockdep considers both rcu_data structures' ->exp_funnel_mutex to be in the same lock class and therefore reports a deadlock cycle. This commit silences this false positive by placing RCU-sched's rcu_data structures' ->exp_funnel_mutex locks into their own lock class. Reported-by: Sasha Levin Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9f75f25..775d36c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3868,6 +3868,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) static void __init rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) { + static struct lock_class_key rcu_exp_sched_rdp_class; unsigned long flags; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rcu_get_root(rsp); @@ -3883,6 +3884,10 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) mutex_init(&rdp->exp_funnel_mutex); rcu_boot_init_nocb_percpu_data(rdp); raw_spin_unlock_irqrestore(&rnp->lock, flags); + if (rsp == &rcu_sched_state) + lockdep_set_class_and_name(&rdp->exp_funnel_mutex, + &rcu_exp_sched_rdp_class, + "rcu_data_exp_sched"); } /* -- cgit v1.1 From f4ecea309d3e17ba5e90082308125ad23bd5701b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 29 Jul 2015 17:28:11 -0700 Subject: rcu: Use rsp->expedited_wq instead of sync_rcu_preempt_exp_wq Now that there is an ->expedited_wq waitqueue in each rcu_state structure, there is no need for the sync_rcu_preempt_exp_wq global variable. This commit therefore substitutes ->expedited_wq for sync_rcu_preempt_exp_wq. It also initializes ->expedited_wq only once at boot instead of at the start of each expedited grace period. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- kernel/rcu/tree_plugin.h | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 775d36c..53d66eb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3556,7 +3556,6 @@ void synchronize_sched_expedited(void) rcu_exp_gp_seq_start(rsp); /* Stop each CPU that is online, non-idle, and not us. */ - init_waitqueue_head(&rsp->expedited_wq); atomic_set(&rsp->expedited_need_qs, 1); /* Extra count avoids race. */ for_each_online_cpu(cpu) { struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); @@ -4179,6 +4178,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, } init_waitqueue_head(&rsp->gp_wq); + init_waitqueue_head(&rsp->expedited_wq); rnp = rsp->level[rcu_num_lvls - 1]; for_each_possible_cpu(i) { while (i > rnp->grphi) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index b2bf396..72df006 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -535,8 +535,6 @@ void synchronize_rcu(void) } EXPORT_SYMBOL_GPL(synchronize_rcu); -static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); - /* * Return non-zero if there are any tasks in RCU read-side critical * sections blocking the current preemptible-RCU expedited grace period. @@ -590,7 +588,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, raw_spin_unlock_irqrestore(&rnp->lock, flags); if (wake) { smp_mb(); /* EGP done before wake_up(). */ - wake_up(&sync_rcu_preempt_exp_wq); + wake_up(&rsp->expedited_wq); } break; } @@ -729,7 +727,7 @@ void synchronize_rcu_expedited(void) /* Wait for snapshotted ->blkd_tasks lists to drain. */ rnp = rcu_get_root(rsp); - wait_event(sync_rcu_preempt_exp_wq, + wait_event(rsp->expedited_wq, sync_rcu_preempt_exp_done(rnp)); /* Clean up and exit. */ -- cgit v1.1 From 7922cd0e562cb2b8da2c8a0afda2e1c9bb4a94e2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 31 Jul 2015 13:34:32 -0700 Subject: rcu: Move rcu_report_exp_rnp() to allow consolidation This is a nearly pure code-movement commit, moving rcu_report_exp_rnp(), sync_rcu_preempt_exp_done(), and rcu_preempted_readers_exp() so that later commits can make synchronize_sched_expedited() use them. The non-code-movement portion of this commit tags rcu_report_exp_rnp() as __maybe_unused to avoid build errors when CONFIG_PREEMPT=n. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/rcu/tree_plugin.h | 66 ------------------------------------------------ 2 files changed, 66 insertions(+), 66 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 53d66eb..59af27d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3379,6 +3379,72 @@ static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) return rcu_seq_done(&rsp->expedited_sequence, s); } +/* + * Return non-zero if there are any tasks in RCU read-side critical + * sections blocking the current preemptible-RCU expedited grace period. + * If there is no preemptible-RCU expedited grace period currently in + * progress, returns zero unconditionally. + */ +static int rcu_preempted_readers_exp(struct rcu_node *rnp) +{ + return rnp->exp_tasks != NULL; +} + +/* + * return non-zero if there is no RCU expedited grace period in progress + * for the specified rcu_node structure, in other words, if all CPUs and + * tasks covered by the specified rcu_node structure have done their bit + * for the current expedited grace period. Works only for preemptible + * RCU -- other RCU implementation use other means. + * + * Caller must hold the root rcu_node's exp_funnel_mutex. + */ +static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) +{ + return !rcu_preempted_readers_exp(rnp) && + READ_ONCE(rnp->expmask) == 0; +} + +/* + * Report the exit from RCU read-side critical section for the last task + * that queued itself during or before the current expedited preemptible-RCU + * grace period. This event is reported either to the rcu_node structure on + * which the task was queued or to one of that rcu_node structure's ancestors, + * recursively up the tree. (Calm down, calm down, we do the recursion + * iteratively!) + * + * Caller must hold the root rcu_node's exp_funnel_mutex. + */ +static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, + struct rcu_node *rnp, bool wake) +{ + unsigned long flags; + unsigned long mask; + + raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); + for (;;) { + if (!sync_rcu_preempt_exp_done(rnp)) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + break; + } + if (rnp->parent == NULL) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + if (wake) { + smp_mb(); /* EGP done before wake_up(). */ + wake_up(&rsp->expedited_wq); + } + break; + } + mask = rnp->grpmask; + raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ + rnp = rnp->parent; + raw_spin_lock(&rnp->lock); /* irqs already disabled */ + smp_mb__after_unlock_lock(); + rnp->expmask &= ~mask; + } +} + /* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp, diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 72df006..e73be85 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -536,72 +536,6 @@ void synchronize_rcu(void) EXPORT_SYMBOL_GPL(synchronize_rcu); /* - * Return non-zero if there are any tasks in RCU read-side critical - * sections blocking the current preemptible-RCU expedited grace period. - * If there is no preemptible-RCU expedited grace period currently in - * progress, returns zero unconditionally. - */ -static int rcu_preempted_readers_exp(struct rcu_node *rnp) -{ - return rnp->exp_tasks != NULL; -} - -/* - * return non-zero if there is no RCU expedited grace period in progress - * for the specified rcu_node structure, in other words, if all CPUs and - * tasks covered by the specified rcu_node structure have done their bit - * for the current expedited grace period. Works only for preemptible - * RCU -- other RCU implementation use other means. - * - * Caller must hold the root rcu_node's exp_funnel_mutex. - */ -static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) -{ - return !rcu_preempted_readers_exp(rnp) && - READ_ONCE(rnp->expmask) == 0; -} - -/* - * Report the exit from RCU read-side critical section for the last task - * that queued itself during or before the current expedited preemptible-RCU - * grace period. This event is reported either to the rcu_node structure on - * which the task was queued or to one of that rcu_node structure's ancestors, - * recursively up the tree. (Calm down, calm down, we do the recursion - * iteratively!) - * - * Caller must hold the root rcu_node's exp_funnel_mutex. - */ -static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, - bool wake) -{ - unsigned long flags; - unsigned long mask; - - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); - for (;;) { - if (!sync_rcu_preempt_exp_done(rnp)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); - break; - } - if (rnp->parent == NULL) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); - if (wake) { - smp_mb(); /* EGP done before wake_up(). */ - wake_up(&rsp->expedited_wq); - } - break; - } - mask = rnp->grpmask; - raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ - rnp = rnp->parent; - raw_spin_lock(&rnp->lock); /* irqs already disabled */ - smp_mb__after_unlock_lock(); - rnp->expmask &= ~mask; - } -} - -/* * Snapshot the tasks blocking the newly started preemptible-RCU expedited * grace period for the specified rcu_node structure, phase 1. If there * are such tasks, set the ->expmask bits up the rcu_node tree and also -- cgit v1.1 From b9585e940a0d78770cda8f9aebf81b17b4d19e6d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 31 Jul 2015 16:04:45 -0700 Subject: rcu: Consolidate tree setup for synchronize_rcu_expedited() This commit replaces sync_rcu_preempt_exp_init1(() and sync_rcu_preempt_exp_init2() with sync_exp_reset_tree_hotplug() and sync_exp_reset_tree(), which will also be used by synchronize_sched_expedited(), and sync_rcu_exp_select_nodes(), which contains code specific to synchronize_rcu_expedited(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 86 ++++++++++++++++++++++++++++++++++++++- kernel/rcu/tree.h | 17 +++++--- kernel/rcu/tree_plugin.h | 102 +++++++++-------------------------------------- 3 files changed, 115 insertions(+), 90 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 59af27d..8526896 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3380,6 +3380,87 @@ static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) } /* + * Reset the ->expmaskinit values in the rcu_node tree to reflect any + * recent CPU-online activity. Note that these masks are not cleared + * when CPUs go offline, so they reflect the union of all CPUs that have + * ever been online. This means that this function normally takes its + * no-work-to-do fastpath. + */ +static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) +{ + bool done; + unsigned long flags; + unsigned long mask; + unsigned long oldmask; + int ncpus = READ_ONCE(rsp->ncpus); + struct rcu_node *rnp; + struct rcu_node *rnp_up; + + /* If no new CPUs onlined since last time, nothing to do. */ + if (likely(ncpus == rsp->ncpus_snap)) + return; + rsp->ncpus_snap = ncpus; + + /* + * Each pass through the following loop propagates newly onlined + * CPUs for the current rcu_node structure up the rcu_node tree. + */ + rcu_for_each_leaf_node(rsp, rnp) { + raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); + if (rnp->expmaskinit == rnp->expmaskinitnext) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + continue; /* No new CPUs, nothing to do. */ + } + + /* Update this node's mask, track old value for propagation. */ + oldmask = rnp->expmaskinit; + rnp->expmaskinit = rnp->expmaskinitnext; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + + /* If was already nonzero, nothing to propagate. */ + if (oldmask) + continue; + + /* Propagate the new CPU up the tree. */ + mask = rnp->grpmask; + rnp_up = rnp->parent; + done = false; + while (rnp_up) { + raw_spin_lock_irqsave(&rnp_up->lock, flags); + smp_mb__after_unlock_lock(); + if (rnp_up->expmaskinit) + done = true; + rnp_up->expmaskinit |= mask; + raw_spin_unlock_irqrestore(&rnp_up->lock, flags); + if (done) + break; + mask = rnp_up->grpmask; + rnp_up = rnp_up->parent; + } + } +} + +/* + * Reset the ->expmask values in the rcu_node tree in preparation for + * a new expedited grace period. + */ +static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) +{ + unsigned long flags; + struct rcu_node *rnp; + + sync_exp_reset_tree_hotplug(rsp); + rcu_for_each_node_breadth_first(rsp, rnp) { + raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); + WARN_ON_ONCE(rnp->expmask); + rnp->expmask = rnp->expmaskinit; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + } +} + +/* * Return non-zero if there are any tasks in RCU read-side critical * sections blocking the current preemptible-RCU expedited grace period. * If there is no preemptible-RCU expedited grace period currently in @@ -3971,7 +4052,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) /* Set up local state, ensuring consistent view of global state. */ raw_spin_lock_irqsave(&rnp->lock, flags); - rdp->beenonline = 1; /* We have now been online. */ rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; @@ -3993,6 +4073,10 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) raw_spin_lock(&rnp->lock); /* irqs already disabled. */ smp_mb__after_unlock_lock(); rnp->qsmaskinitnext |= mask; + rnp->expmaskinitnext |= mask; + if (!rdp->beenonline) + WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1); + rdp->beenonline = true; /* We have now been online. */ rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ rdp->completed = rnp->completed; rdp->passed_quiesce = false; diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 2e991f8..a57f25e 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -171,16 +171,21 @@ struct rcu_node { /* an rcu_data structure, otherwise, each */ /* bit corresponds to a child rcu_node */ /* structure. */ - unsigned long expmask; /* Groups that have ->blkd_tasks */ - /* elements that need to drain to allow the */ - /* current expedited grace period to */ - /* complete (only for PREEMPT_RCU). */ unsigned long qsmaskinit; - /* Per-GP initial value for qsmask & expmask. */ + /* Per-GP initial value for qsmask. */ /* Initialized from ->qsmaskinitnext at the */ /* beginning of each grace period. */ unsigned long qsmaskinitnext; /* Online CPUs for next grace period. */ + unsigned long expmask; /* CPUs or groups that need to check in */ + /* to allow the current expedited GP */ + /* to complete. */ + unsigned long expmaskinit; + /* Per-GP initial values for expmask. */ + /* Initialized from ->expmaskinitnext at the */ + /* beginning of each expedited GP. */ + unsigned long expmaskinitnext; + /* Online CPUs for next expedited GP. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ /* Only one bit will be set in this mask. */ int grplo; /* lowest-numbered CPU or group here. */ @@ -466,6 +471,7 @@ struct rcu_state { struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ void (*func)(struct rcu_head *head)); + int ncpus; /* # CPUs seen so far. */ /* The following fields are guarded by the root rcu_node's lock. */ @@ -508,6 +514,7 @@ struct rcu_state { atomic_long_t expedited_normal; /* # fallbacks to normal. */ atomic_t expedited_need_qs; /* # CPUs left to check in. */ wait_queue_head_t expedited_wq; /* Wait for check-ins. */ + int ncpus_snap; /* # CPUs seen last time. */ unsigned long jiffies_force_qs; /* Time at which to invoke */ /* force_quiescent_state(). */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index e73be85..62d0541 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -536,86 +536,28 @@ void synchronize_rcu(void) EXPORT_SYMBOL_GPL(synchronize_rcu); /* - * Snapshot the tasks blocking the newly started preemptible-RCU expedited - * grace period for the specified rcu_node structure, phase 1. If there - * are such tasks, set the ->expmask bits up the rcu_node tree and also - * set the ->expmask bits on the leaf rcu_node structures to tell phase 2 - * that work is needed here. - * - * Caller must hold the root rcu_node's exp_funnel_mutex. + * Select the nodes that the upcoming expedited grace period needs + * to wait for. */ -static void -sync_rcu_preempt_exp_init1(struct rcu_state *rsp, struct rcu_node *rnp) +static void sync_rcu_exp_select_nodes(struct rcu_state *rsp) { unsigned long flags; - unsigned long mask; - struct rcu_node *rnp_up; + struct rcu_node *rnp; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); - WARN_ON_ONCE(rnp->expmask); - WARN_ON_ONCE(rnp->exp_tasks); - if (!rcu_preempt_has_tasks(rnp)) { - /* No blocked tasks, nothing to do. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; - } - /* Call for Phase 2 and propagate ->expmask bits up the tree. */ - rnp->expmask = 1; - rnp_up = rnp; - while (rnp_up->parent) { - mask = rnp_up->grpmask; - rnp_up = rnp_up->parent; - if (rnp_up->expmask & mask) - break; - raw_spin_lock(&rnp_up->lock); /* irqs already off */ + sync_exp_reset_tree(rsp); + rcu_for_each_leaf_node(rsp, rnp) { + raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); - rnp_up->expmask |= mask; - raw_spin_unlock(&rnp_up->lock); /* irqs still off */ - } - raw_spin_unlock_irqrestore(&rnp->lock, flags); -} - -/* - * Snapshot the tasks blocking the newly started preemptible-RCU expedited - * grace period for the specified rcu_node structure, phase 2. If the - * leaf rcu_node structure has its ->expmask field set, check for tasks. - * If there are some, clear ->expmask and set ->exp_tasks accordingly, - * then initiate RCU priority boosting. Otherwise, clear ->expmask and - * invoke rcu_report_exp_rnp() to clear out the upper-level ->expmask bits, - * enabling rcu_read_unlock_special() to do the bit-clearing. - * - * Caller must hold the root rcu_node's exp_funnel_mutex. - */ -static void -sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); - if (!rnp->expmask) { - /* Phase 1 didn't do anything, so Phase 2 doesn't either. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; - } - - /* Phase 1 is over. */ - rnp->expmask = 0; - - /* - * If there are still blocked tasks, set up ->exp_tasks so that - * rcu_read_unlock_special() will wake us and then boost them. - */ - if (rcu_preempt_has_tasks(rnp)) { - rnp->exp_tasks = rnp->blkd_tasks.next; - rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ - return; + rnp->expmask = 0; /* No per-CPU component yet. */ + if (!rcu_preempt_has_tasks(rnp)) { + /* FIXME: Want __rcu_report_exp_rnp() here. */ + raw_spin_unlock_irqrestore(&rnp->lock, flags); + } else { + rnp->exp_tasks = rnp->blkd_tasks.next; + rcu_initiate_boost(rnp, flags); + } + rcu_report_exp_rnp(rsp, rnp, false); } - - /* No longer any blocked tasks, so undo bit setting. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); - rcu_report_exp_rnp(rsp, rnp, false); } /** @@ -648,16 +590,8 @@ void synchronize_rcu_expedited(void) /* force all RCU readers onto ->blkd_tasks lists. */ synchronize_sched_expedited(); - /* - * Snapshot current state of ->blkd_tasks lists into ->expmask. - * Phase 1 sets bits and phase 2 permits rcu_read_unlock_special() - * to start clearing them. Doing this in one phase leads to - * strange races between setting and clearing bits, so just say "no"! - */ - rcu_for_each_leaf_node(rsp, rnp) - sync_rcu_preempt_exp_init1(rsp, rnp); - rcu_for_each_leaf_node(rsp, rnp) - sync_rcu_preempt_exp_init2(rsp, rnp); + /* Initialize the rcu_node tree in preparation for the wait. */ + sync_rcu_exp_select_nodes(rsp); /* Wait for snapshotted ->blkd_tasks lists to drain. */ rnp = rcu_get_root(rsp); -- cgit v1.1 From 8203d6d0ee784cfb2ebf89053f7fe399abc867d7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 2 Aug 2015 13:53:17 -0700 Subject: rcu: Use single-stage IPI algorithm for RCU expedited grace period The current preemptible-RCU expedited grace-period algorithm invokes synchronize_sched_expedited() to enqueue all tasks currently running in a preemptible-RCU read-side critical section, then waits for all the ->blkd_tasks lists to drain. This works, but results in both an IPI and a double context switch even on CPUs that do not happen to be running in a preemptible RCU read-side critical section. This commit implements a new algorithm that causes less OS jitter. This new algorithm IPIs all online CPUs that are not idle (from an RCU perspective), but refrains from self-IPIs. If a CPU receiving this IPI is not in a preemptible RCU read-side critical section (or is just now exiting one), it pushes quiescence up the rcu_node tree, otherwise, it sets a flag that will be handled by the upcoming outermost rcu_read_unlock(), which will then push quiescence up the tree. The expedited grace period must of course wait on any pre-existing blocked readers, and newly blocked readers must be queued carefully based on the state of both the normal and the expedited grace periods. This new queueing approach also avoids the need to update boost state, courtesy of the fact that blocked tasks are no longer ever migrated to the root rcu_node structure. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 75 ++++++++---- kernel/rcu/tree_plugin.h | 296 +++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 305 insertions(+), 66 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8526896..6e92bf4 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3461,18 +3461,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) } /* - * Return non-zero if there are any tasks in RCU read-side critical - * sections blocking the current preemptible-RCU expedited grace period. - * If there is no preemptible-RCU expedited grace period currently in - * progress, returns zero unconditionally. - */ -static int rcu_preempted_readers_exp(struct rcu_node *rnp) -{ - return rnp->exp_tasks != NULL; -} - -/* - * return non-zero if there is no RCU expedited grace period in progress + * Return non-zero if there is no RCU expedited grace period in progress * for the specified rcu_node structure, in other words, if all CPUs and * tasks covered by the specified rcu_node structure have done their bit * for the current expedited grace period. Works only for preemptible @@ -3482,7 +3471,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp) */ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) { - return !rcu_preempted_readers_exp(rnp) && + return rnp->exp_tasks == NULL && READ_ONCE(rnp->expmask) == 0; } @@ -3494,19 +3483,21 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) * recursively up the tree. (Calm down, calm down, we do the recursion * iteratively!) * - * Caller must hold the root rcu_node's exp_funnel_mutex. + * Caller must hold the root rcu_node's exp_funnel_mutex and the + * specified rcu_node structure's ->lock. */ -static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, - struct rcu_node *rnp, bool wake) +static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, + bool wake, unsigned long flags) + __releases(rnp->lock) { - unsigned long flags; unsigned long mask; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); for (;;) { if (!sync_rcu_preempt_exp_done(rnp)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + if (!rnp->expmask) + rcu_initiate_boost(rnp, flags); + else + raw_spin_unlock_irqrestore(&rnp->lock, flags); break; } if (rnp->parent == NULL) { @@ -3522,10 +3513,54 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, rnp = rnp->parent; raw_spin_lock(&rnp->lock); /* irqs already disabled */ smp_mb__after_unlock_lock(); + WARN_ON_ONCE(!(rnp->expmask & mask)); rnp->expmask &= ~mask; } } +/* + * Report expedited quiescent state for specified node. This is a + * lock-acquisition wrapper function for __rcu_report_exp_rnp(). + * + * Caller must hold the root rcu_node's exp_funnel_mutex. + */ +static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, + struct rcu_node *rnp, bool wake) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); + __rcu_report_exp_rnp(rsp, rnp, wake, flags); +} + +/* + * Report expedited quiescent state for multiple CPUs, all covered by the + * specified leaf rcu_node structure. Caller must hold the root + * rcu_node's exp_funnel_mutex. + */ +static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, + unsigned long mask, bool wake) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); + WARN_ON_ONCE((rnp->expmask & mask) != mask); + rnp->expmask &= ~mask; + __rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */ +} + +/* + * Report expedited quiescent state for specified rcu_data (CPU). + * Caller must hold the root rcu_node's exp_funnel_mutex. + */ +static void __maybe_unused rcu_report_exp_rdp(struct rcu_state *rsp, + struct rcu_data *rdp, bool wake) +{ + rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake); +} + /* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp, diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 62d0541..6f7500f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -101,7 +101,6 @@ RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); static struct rcu_state *const rcu_state_p = &rcu_preempt_state; static struct rcu_data __percpu *const rcu_data_p = &rcu_preempt_data; -static int rcu_preempted_readers_exp(struct rcu_node *rnp); static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake); @@ -114,6 +113,147 @@ static void __init rcu_bootup_announce(void) rcu_bootup_announce_oddness(); } +/* Flags for rcu_preempt_ctxt_queue() decision table. */ +#define RCU_GP_TASKS 0x8 +#define RCU_EXP_TASKS 0x4 +#define RCU_GP_BLKD 0x2 +#define RCU_EXP_BLKD 0x1 + +/* + * Queues a task preempted within an RCU-preempt read-side critical + * section into the appropriate location within the ->blkd_tasks list, + * depending on the states of any ongoing normal and expedited grace + * periods. The ->gp_tasks pointer indicates which element the normal + * grace period is waiting on (NULL if none), and the ->exp_tasks pointer + * indicates which element the expedited grace period is waiting on (again, + * NULL if none). If a grace period is waiting on a given element in the + * ->blkd_tasks list, it also waits on all subsequent elements. Thus, + * adding a task to the tail of the list blocks any grace period that is + * already waiting on one of the elements. In contrast, adding a task + * to the head of the list won't block any grace period that is already + * waiting on one of the elements. + * + * This queuing is imprecise, and can sometimes make an ongoing grace + * period wait for a task that is not strictly speaking blocking it. + * Given the choice, we needlessly block a normal grace period rather than + * blocking an expedited grace period. + * + * Note that an endless sequence of expedited grace periods still cannot + * indefinitely postpone a normal grace period. Eventually, all of the + * fixed number of preempted tasks blocking the normal grace period that are + * not also blocking the expedited grace period will resume and complete + * their RCU read-side critical sections. At that point, the ->gp_tasks + * pointer will equal the ->exp_tasks pointer, at which point the end of + * the corresponding expedited grace period will also be the end of the + * normal grace period. + */ +static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp, + unsigned long flags) __releases(rnp->lock) +{ + int blkd_state = (rnp->gp_tasks ? RCU_GP_TASKS : 0) + + (rnp->exp_tasks ? RCU_EXP_TASKS : 0) + + (rnp->qsmask & rdp->grpmask ? RCU_GP_BLKD : 0) + + (rnp->expmask & rdp->grpmask ? RCU_EXP_BLKD : 0); + struct task_struct *t = current; + + /* + * Decide where to queue the newly blocked task. In theory, + * this could be an if-statement. In practice, when I tried + * that, it was quite messy. + */ + switch (blkd_state) { + case 0: + case RCU_EXP_TASKS: + case RCU_EXP_TASKS + RCU_GP_BLKD: + case RCU_GP_TASKS: + case RCU_GP_TASKS + RCU_EXP_TASKS: + + /* + * Blocking neither GP, or first task blocking the normal + * GP but not blocking the already-waiting expedited GP. + * Queue at the head of the list to avoid unnecessarily + * blocking the already-waiting GPs. + */ + list_add(&t->rcu_node_entry, &rnp->blkd_tasks); + break; + + case RCU_EXP_BLKD: + case RCU_GP_BLKD: + case RCU_GP_BLKD + RCU_EXP_BLKD: + case RCU_GP_TASKS + RCU_EXP_BLKD: + case RCU_GP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD: + case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD: + + /* + * First task arriving that blocks either GP, or first task + * arriving that blocks the expedited GP (with the normal + * GP already waiting), or a task arriving that blocks + * both GPs with both GPs already waiting. Queue at the + * tail of the list to avoid any GP waiting on any of the + * already queued tasks that are not blocking it. + */ + list_add_tail(&t->rcu_node_entry, &rnp->blkd_tasks); + break; + + case RCU_EXP_TASKS + RCU_EXP_BLKD: + case RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD: + case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_EXP_BLKD: + + /* + * Second or subsequent task blocking the expedited GP. + * The task either does not block the normal GP, or is the + * first task blocking the normal GP. Queue just after + * the first task blocking the expedited GP. + */ + list_add(&t->rcu_node_entry, rnp->exp_tasks); + break; + + case RCU_GP_TASKS + RCU_GP_BLKD: + case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD: + + /* + * Second or subsequent task blocking the normal GP. + * The task does not block the expedited GP. Queue just + * after the first task blocking the normal GP. + */ + list_add(&t->rcu_node_entry, rnp->gp_tasks); + break; + + default: + + /* Yet another exercise in excessive paranoia. */ + WARN_ON_ONCE(1); + break; + } + + /* + * We have now queued the task. If it was the first one to + * block either grace period, update the ->gp_tasks and/or + * ->exp_tasks pointers, respectively, to reference the newly + * blocked tasks. + */ + if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD)) + rnp->gp_tasks = &t->rcu_node_entry; + if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) + rnp->exp_tasks = &t->rcu_node_entry; + raw_spin_unlock(&rnp->lock); + + /* + * Report the quiescent state for the expedited GP. This expedited + * GP should not be able to end until we report, so there should be + * no need to check for a subsequent expedited GP. (Though we are + * still in a quiescent state in any case.) + */ + if (blkd_state & RCU_EXP_BLKD && + t->rcu_read_unlock_special.b.exp_need_qs) { + t->rcu_read_unlock_special.b.exp_need_qs = false; + rcu_report_exp_rdp(rdp->rsp, rdp, true); + } else { + WARN_ON_ONCE(t->rcu_read_unlock_special.b.exp_need_qs); + } + local_irq_restore(flags); +} + /* * Record a preemptible-RCU quiescent state for the specified CPU. Note * that this just means that the task currently running on the CPU is @@ -167,42 +307,18 @@ static void rcu_preempt_note_context_switch(void) t->rcu_blocked_node = rnp; /* - * If this CPU has already checked in, then this task - * will hold up the next grace period rather than the - * current grace period. Queue the task accordingly. - * If the task is queued for the current grace period - * (i.e., this CPU has not yet passed through a quiescent - * state for the current grace period), then as long - * as that task remains queued, the current grace period - * cannot end. Note that there is some uncertainty as - * to exactly when the current grace period started. - * We take a conservative approach, which can result - * in unnecessarily waiting on tasks that started very - * slightly after the current grace period began. C'est - * la vie!!! - * - * But first, note that the current CPU must still be - * on line! + * Verify the CPU's sanity, trace the preemption, and + * then queue the task as required based on the states + * of any ongoing and expedited grace periods. */ WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0); WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); - if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) { - list_add(&t->rcu_node_entry, rnp->gp_tasks->prev); - rnp->gp_tasks = &t->rcu_node_entry; - if (IS_ENABLED(CONFIG_RCU_BOOST) && - rnp->boost_tasks != NULL) - rnp->boost_tasks = rnp->gp_tasks; - } else { - list_add(&t->rcu_node_entry, &rnp->blkd_tasks); - if (rnp->qsmask & rdp->grpmask) - rnp->gp_tasks = &t->rcu_node_entry; - } trace_rcu_preempt_task(rdp->rsp->name, t->pid, (rnp->qsmask & rdp->grpmask) ? rnp->gpnum : rnp->gpnum + 1); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + rcu_preempt_ctxt_queue(rnp, rdp, flags); } else if (t->rcu_read_lock_nesting < 0 && t->rcu_read_unlock_special.s) { @@ -272,6 +388,7 @@ void rcu_read_unlock_special(struct task_struct *t) unsigned long flags; struct list_head *np; bool drop_boost_mutex = false; + struct rcu_data *rdp; struct rcu_node *rnp; union rcu_special special; @@ -282,8 +399,8 @@ void rcu_read_unlock_special(struct task_struct *t) local_irq_save(flags); /* - * If RCU core is waiting for this CPU to exit critical section, - * let it know that we have done so. Because irqs are disabled, + * If RCU core is waiting for this CPU to exit its critical section, + * report the fact that it has exited. Because irqs are disabled, * t->rcu_read_unlock_special cannot change. */ special = t->rcu_read_unlock_special; @@ -296,13 +413,32 @@ void rcu_read_unlock_special(struct task_struct *t) } } + /* + * Respond to a request for an expedited grace period, but only if + * we were not preempted, meaning that we were running on the same + * CPU throughout. If we were preempted, the exp_need_qs flag + * would have been cleared at the time of the first preemption, + * and the quiescent state would be reported when we were dequeued. + */ + if (special.b.exp_need_qs) { + WARN_ON_ONCE(special.b.blocked); + t->rcu_read_unlock_special.b.exp_need_qs = false; + rdp = this_cpu_ptr(rcu_state_p->rda); + rcu_report_exp_rdp(rcu_state_p, rdp, true); + if (!t->rcu_read_unlock_special.s) { + local_irq_restore(flags); + return; + } + } + /* Hardware IRQ handlers cannot block, complain if they get here. */ if (in_irq() || in_serving_softirq()) { lockdep_rcu_suspicious(__FILE__, __LINE__, "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n"); - pr_alert("->rcu_read_unlock_special: %#x (b: %d, nq: %d)\n", + pr_alert("->rcu_read_unlock_special: %#x (b: %d, enq: %d nq: %d)\n", t->rcu_read_unlock_special.s, t->rcu_read_unlock_special.b.blocked, + t->rcu_read_unlock_special.b.exp_need_qs, t->rcu_read_unlock_special.b.need_qs); local_irq_restore(flags); return; @@ -329,7 +465,7 @@ void rcu_read_unlock_special(struct task_struct *t) raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); - empty_exp = !rcu_preempted_readers_exp(rnp); + empty_exp = sync_rcu_preempt_exp_done(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ np = rcu_next_node_entry(t, rnp); list_del_init(&t->rcu_node_entry); @@ -353,7 +489,7 @@ void rcu_read_unlock_special(struct task_struct *t) * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, * so we must take a snapshot of the expedited state. */ - empty_exp_now = !rcu_preempted_readers_exp(rnp); + empty_exp_now = sync_rcu_preempt_exp_done(rnp); if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) { trace_rcu_quiescent_state_report(TPS("preempt_rcu"), rnp->gpnum, @@ -536,27 +672,98 @@ void synchronize_rcu(void) EXPORT_SYMBOL_GPL(synchronize_rcu); /* + * Remote handler for smp_call_function_single(). If there is an + * RCU read-side critical section in effect, request that the + * next rcu_read_unlock() record the quiescent state up the + * ->expmask fields in the rcu_node tree. Otherwise, immediately + * report the quiescent state. + */ +static void sync_rcu_exp_handler(void *info) +{ + struct rcu_data *rdp; + struct rcu_state *rsp = info; + struct task_struct *t = current; + + /* + * Within an RCU read-side critical section, request that the next + * rcu_read_unlock() report. Unless this RCU read-side critical + * section has already blocked, in which case it is already set + * up for the expedited grace period to wait on it. + */ + if (t->rcu_read_lock_nesting > 0 && + !t->rcu_read_unlock_special.b.blocked) { + t->rcu_read_unlock_special.b.exp_need_qs = true; + return; + } + + /* + * We are either exiting an RCU read-side critical section (negative + * values of t->rcu_read_lock_nesting) or are not in one at all + * (zero value of t->rcu_read_lock_nesting). Or we are in an RCU + * read-side critical section that blocked before this expedited + * grace period started. Either way, we can immediately report + * the quiescent state. + */ + rdp = this_cpu_ptr(rsp->rda); + rcu_report_exp_rdp(rsp, rdp, true); +} + +/* * Select the nodes that the upcoming expedited grace period needs * to wait for. */ -static void sync_rcu_exp_select_nodes(struct rcu_state *rsp) +static void sync_rcu_exp_select_cpus(struct rcu_state *rsp) { + int cpu; unsigned long flags; + unsigned long mask; + unsigned long mask_ofl_test; + unsigned long mask_ofl_ipi; + int ret; struct rcu_node *rnp; sync_exp_reset_tree(rsp); rcu_for_each_leaf_node(rsp, rnp) { raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); - rnp->expmask = 0; /* No per-CPU component yet. */ - if (!rcu_preempt_has_tasks(rnp)) { - /* FIXME: Want __rcu_report_exp_rnp() here. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); - } else { + + /* Each pass checks a CPU for identity, offline, and idle. */ + mask_ofl_test = 0; + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) { + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + if (raw_smp_processor_id() == cpu || + cpu_is_offline(cpu) || + !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) + mask_ofl_test |= rdp->grpmask; + } + mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; + + /* + * Need to wait for any blocked tasks as well. Note that + * additional blocking tasks will also block the expedited + * GP until such time as the ->expmask bits are cleared. + */ + if (rcu_preempt_has_tasks(rnp)) rnp->exp_tasks = rnp->blkd_tasks.next; - rcu_initiate_boost(rnp, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); + + /* IPI the remaining CPUs for expedited quiescent state. */ + mask = 1; + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { + if (!(mask_ofl_ipi & mask)) + continue; + ret = smp_call_function_single(cpu, + sync_rcu_exp_handler, + rsp, 0); + if (!ret) + mask_ofl_ipi &= ~mask; } - rcu_report_exp_rnp(rsp, rnp, false); + /* Report quiescent states for those that went offline. */ + mask_ofl_test |= mask_ofl_ipi; + if (mask_ofl_test) + rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false); } } @@ -587,11 +794,8 @@ void synchronize_rcu_expedited(void) rcu_exp_gp_seq_start(rsp); - /* force all RCU readers onto ->blkd_tasks lists. */ - synchronize_sched_expedited(); - /* Initialize the rcu_node tree in preparation for the wait. */ - sync_rcu_exp_select_nodes(rsp); + sync_rcu_exp_select_cpus(rsp); /* Wait for snapshotted ->blkd_tasks lists to drain. */ rnp = rcu_get_root(rsp); -- cgit v1.1 From bce5fa12aad148e15efd9bc0015dc4898b6e723b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 5 Aug 2015 16:03:54 -0700 Subject: rcu: Move synchronize_sched_expedited() to combining tree Currently, synchronize_sched_expedited() uses a single global counter to track the number of remaining context switches that the current expedited grace period must wait on. This is problematic on large systems, where the resulting memory contention can be pathological. This commit therefore makes synchronize_sched_expedited() instead use the combining tree in the same manner as synchronize_rcu_expedited(), keeping memory contention down to a dull roar. This commit creates a temporary function sync_sched_exp_select_cpus() that is very similar to sync_rcu_exp_select_cpus(). A later commit will consolidate these two functions, which becomes possible when synchronize_sched_expedited() switches from stop_one_cpu_nowait() to smp_call_function_single(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 123 ++++++++++++++++++++++++++++++++++++------------------ kernel/rcu/tree.h | 1 - 2 files changed, 82 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6e92bf4..d2cdcad 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3642,19 +3642,77 @@ static int synchronize_sched_expedited_cpu_stop(void *data) struct rcu_data *rdp = data; struct rcu_state *rsp = rdp->rsp; - /* We are here: If we are last, do the wakeup. */ - rdp->exp_done = true; - if (atomic_dec_and_test(&rsp->expedited_need_qs)) - wake_up(&rsp->expedited_wq); + /* Report the quiescent state. */ + rcu_report_exp_rdp(rsp, rdp, true); return 0; } +/* + * Select the nodes that the upcoming expedited grace period needs + * to wait for. + */ +static void sync_sched_exp_select_cpus(struct rcu_state *rsp) +{ + int cpu; + unsigned long flags; + unsigned long mask; + unsigned long mask_ofl_test; + unsigned long mask_ofl_ipi; + struct rcu_data *rdp; + struct rcu_node *rnp; + + sync_exp_reset_tree(rsp); + rcu_for_each_leaf_node(rsp, rnp) { + raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); + + /* Each pass checks a CPU for identity, offline, and idle. */ + mask_ofl_test = 0; + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) { + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + if (raw_smp_processor_id() == cpu || + cpu_is_offline(cpu) || + !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) + mask_ofl_test |= rdp->grpmask; + } + mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; + + /* + * Need to wait for any blocked tasks as well. Note that + * additional blocking tasks will also block the expedited + * GP until such time as the ->expmask bits are cleared. + */ + if (rcu_preempt_has_tasks(rnp)) + rnp->exp_tasks = rnp->blkd_tasks.next; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + + /* IPI the remaining CPUs for expedited quiescent state. */ + mask = 1; + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { + if (!(mask_ofl_ipi & mask)) + continue; + rdp = per_cpu_ptr(rsp->rda, cpu); + stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop, + rdp, &rdp->exp_stop_work); + mask_ofl_ipi &= ~mask; + } + /* Report quiescent states for those that went offline. */ + mask_ofl_test |= mask_ofl_ipi; + if (mask_ofl_test) + rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false); + } +} + static void synchronize_sched_expedited_wait(struct rcu_state *rsp) { int cpu; unsigned long jiffies_stall; unsigned long jiffies_start; - struct rcu_data *rdp; + unsigned long mask; + struct rcu_node *rnp; + struct rcu_node *rnp_root = rcu_get_root(rsp); int ret; jiffies_stall = rcu_jiffies_till_stall_check(); @@ -3663,33 +3721,36 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) for (;;) { ret = wait_event_interruptible_timeout( rsp->expedited_wq, - !atomic_read(&rsp->expedited_need_qs), + sync_rcu_preempt_exp_done(rnp_root), jiffies_stall); if (ret > 0) return; if (ret < 0) { /* Hit a signal, disable CPU stall warnings. */ wait_event(rsp->expedited_wq, - !atomic_read(&rsp->expedited_need_qs)); + sync_rcu_preempt_exp_done(rnp_root)); return; } pr_err("INFO: %s detected expedited stalls on CPUs: {", rsp->name); - for_each_online_cpu(cpu) { - rdp = per_cpu_ptr(rsp->rda, cpu); - - if (rdp->exp_done) - continue; - pr_cont(" %d", cpu); + rcu_for_each_leaf_node(rsp, rnp) { + mask = 1; + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { + if (!(rnp->expmask & mask)) + continue; + pr_cont(" %d", cpu); + } + mask <<= 1; } pr_cont(" } %lu jiffies s: %lu\n", jiffies - jiffies_start, rsp->expedited_sequence); - for_each_online_cpu(cpu) { - rdp = per_cpu_ptr(rsp->rda, cpu); - - if (rdp->exp_done) - continue; - dump_cpu_task(cpu); + rcu_for_each_leaf_node(rsp, rnp) { + mask = 1; + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { + if (!(rnp->expmask & mask)) + continue; + dump_cpu_task(cpu); + } } jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3; } @@ -3713,7 +3774,6 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) */ void synchronize_sched_expedited(void) { - int cpu; unsigned long s; struct rcu_node *rnp; struct rcu_state *rsp = &rcu_sched_state; @@ -3736,27 +3796,8 @@ void synchronize_sched_expedited(void) } rcu_exp_gp_seq_start(rsp); - - /* Stop each CPU that is online, non-idle, and not us. */ - atomic_set(&rsp->expedited_need_qs, 1); /* Extra count avoids race. */ - for_each_online_cpu(cpu) { - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - - rdp->exp_done = false; - - /* Skip our CPU and any idle CPUs. */ - if (raw_smp_processor_id() == cpu || - !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) - continue; - atomic_inc(&rsp->expedited_need_qs); - stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop, - rdp, &rdp->exp_stop_work); - } - - /* Remove extra count and, if necessary, wait for CPUs to stop. */ - if (!atomic_dec_and_test(&rsp->expedited_need_qs)) - synchronize_sched_expedited_wait(rsp); + sync_sched_exp_select_cpus(rsp); + synchronize_sched_expedited_wait(rsp); rcu_exp_gp_seq_end(rsp); mutex_unlock(&rnp->exp_funnel_mutex); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index a57f25e..efe361c 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -383,7 +383,6 @@ struct rcu_data { struct rcu_head oom_head; #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ struct mutex exp_funnel_mutex; - bool exp_done; /* Expedited QS for this CPU? */ /* 7) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU -- cgit v1.1 From 97c668b8e983b722e2ed765b98b05f644aff1b13 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 6 Aug 2015 11:31:51 -0700 Subject: rcu: Rename qs_pending to core_needs_qs An upcoming commit needs to invert the sense of the ->passed_quiesce rcu_data structure field, so this commit is taking this opportunity to clarify things a bit by renaming ->qs_pending to ->core_needs_qs. So if !rdp->core_needs_qs, then this CPU need not concern itself with quiescent states, in particular, it need not acquire its leaf rcu_node structure's ->lock to check. Otherwise, it needs to report the next quiescent state. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 14 +++++++------- kernel/rcu/tree.h | 4 ++-- kernel/rcu/tree_plugin.h | 2 +- kernel/rcu/tree_trace.c | 4 ++-- 4 files changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d2cdcad..7c158ff 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1746,7 +1746,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); rdp->passed_quiesce = 0; rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); - rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); + rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask); zero_cpu_stall_ticks(rdp); WRITE_ONCE(rdp->gpwrap, false); } @@ -2357,7 +2357,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) if ((rnp->qsmask & mask) == 0) { raw_spin_unlock_irqrestore(&rnp->lock, flags); } else { - rdp->qs_pending = 0; + rdp->core_needs_qs = 0; /* * This GP can't end until cpu checks in, so all of our @@ -2388,7 +2388,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) * Does this CPU still need to do its part for current grace period? * If no, return and let the other CPUs do their part as well. */ - if (!rdp->qs_pending) + if (!rdp->core_needs_qs) return; /* @@ -3828,10 +3828,10 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) /* Is the RCU core waiting for a quiescent state from this CPU? */ if (rcu_scheduler_fully_active && - rdp->qs_pending && !rdp->passed_quiesce && + rdp->core_needs_qs && !rdp->passed_quiesce && rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) { - rdp->n_rp_qs_pending++; - } else if (rdp->qs_pending && + rdp->n_rp_core_needs_qs++; + } else if (rdp->core_needs_qs && (rdp->passed_quiesce || rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) { rdp->n_rp_report_qs++; @@ -4157,7 +4157,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->completed = rnp->completed; rdp->passed_quiesce = false; rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu); - rdp->qs_pending = false; + rdp->core_needs_qs = false; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); raw_spin_unlock_irqrestore(&rnp->lock, flags); } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index efe361c..4a0f306 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -303,7 +303,7 @@ struct rcu_data { unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */ /* for rcu_all_qs() invocations. */ bool passed_quiesce; /* User-mode/idle loop etc. */ - bool qs_pending; /* Core waits for quiesc state. */ + bool core_needs_qs; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ bool gpwrap; /* Possible gpnum/completed wrap. */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ @@ -368,7 +368,7 @@ struct rcu_data { /* 5) __rcu_pending() statistics. */ unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ - unsigned long n_rp_qs_pending; + unsigned long n_rp_core_needs_qs; unsigned long n_rp_report_qs; unsigned long n_rp_cb_ready; unsigned long n_rp_cpu_needs_gp; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 6f7500f..e33b4f3 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -619,7 +619,7 @@ static void rcu_preempt_check_callbacks(void) return; } if (t->rcu_read_lock_nesting > 0 && - __this_cpu_read(rcu_data_p->qs_pending) && + __this_cpu_read(rcu_data_p->core_needs_qs) && !__this_cpu_read(rcu_data_p->passed_quiesce)) t->rcu_read_unlock_special.b.need_qs = true; } diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 6fc4c5f..4ac25f8 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -123,7 +123,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) ulong2long(rdp->completed), ulong2long(rdp->gpnum), rdp->passed_quiesce, rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu), - rdp->qs_pending); + rdp->core_needs_qs); seq_printf(m, " dt=%d/%llx/%d df=%lu", atomic_read(&rdp->dynticks->dynticks), rdp->dynticks->dynticks_nesting, @@ -361,7 +361,7 @@ static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) cpu_is_offline(rdp->cpu) ? '!' : ' ', rdp->n_rcu_pending); seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ", - rdp->n_rp_qs_pending, + rdp->n_rp_core_needs_qs, rdp->n_rp_report_qs, rdp->n_rp_cb_ready, rdp->n_rp_cpu_needs_gp); -- cgit v1.1 From 0d43eb34f9aabcddf41c99b7af2d0ced33e9a3cc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 6 Aug 2015 14:17:29 -0700 Subject: rcu: Invert passed_quiesce and rename to cpu_no_qs This commit inverts the sense of the rcu_data structure's ->passed_quiesce field and renames it to ->cpu_no_qs. This will allow a later commit to use an "aggregate OR" operation to test expedited as well as normal grace periods without added overhead. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 22 +++++++++++----------- kernel/rcu/tree.h | 2 +- kernel/rcu/tree_plugin.h | 6 +++--- kernel/rcu/tree_trace.c | 4 ++-- 4 files changed, 17 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7c158ff..31e7021 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -245,21 +245,21 @@ static int rcu_gp_in_progress(struct rcu_state *rsp) */ void rcu_sched_qs(void) { - if (!__this_cpu_read(rcu_sched_data.passed_quiesce)) { + if (__this_cpu_read(rcu_sched_data.cpu_no_qs)) { trace_rcu_grace_period(TPS("rcu_sched"), __this_cpu_read(rcu_sched_data.gpnum), TPS("cpuqs")); - __this_cpu_write(rcu_sched_data.passed_quiesce, 1); + __this_cpu_write(rcu_sched_data.cpu_no_qs, false); } } void rcu_bh_qs(void) { - if (!__this_cpu_read(rcu_bh_data.passed_quiesce)) { + if (__this_cpu_read(rcu_bh_data.cpu_no_qs)) { trace_rcu_grace_period(TPS("rcu_bh"), __this_cpu_read(rcu_bh_data.gpnum), TPS("cpuqs")); - __this_cpu_write(rcu_bh_data.passed_quiesce, 1); + __this_cpu_write(rcu_bh_data.cpu_no_qs, false); } } @@ -1744,7 +1744,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, */ rdp->gpnum = rnp->gpnum; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); - rdp->passed_quiesce = 0; + rdp->cpu_no_qs = true; rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask); zero_cpu_stall_ticks(rdp); @@ -2337,7 +2337,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); - if ((rdp->passed_quiesce == 0 && + if ((rdp->cpu_no_qs && rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) || rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum || rdp->gpwrap) { @@ -2348,7 +2348,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) * We will instead need a new quiescent state that lies * within the current grace period. */ - rdp->passed_quiesce = 0; /* need qs for new gp. */ + rdp->cpu_no_qs = true; /* need qs for new gp. */ rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); raw_spin_unlock_irqrestore(&rnp->lock, flags); return; @@ -2395,7 +2395,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) * Was there a quiescent state since the beginning of the grace * period? If no, then exit and wait for the next call. */ - if (!rdp->passed_quiesce && + if (rdp->cpu_no_qs && rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) return; @@ -3828,11 +3828,11 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) /* Is the RCU core waiting for a quiescent state from this CPU? */ if (rcu_scheduler_fully_active && - rdp->core_needs_qs && !rdp->passed_quiesce && + rdp->core_needs_qs && rdp->cpu_no_qs && rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) { rdp->n_rp_core_needs_qs++; } else if (rdp->core_needs_qs && - (rdp->passed_quiesce || + (!rdp->cpu_no_qs || rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) { rdp->n_rp_report_qs++; return 1; @@ -4155,7 +4155,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->beenonline = true; /* We have now been online. */ rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ rdp->completed = rnp->completed; - rdp->passed_quiesce = false; + rdp->cpu_no_qs = true; rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu); rdp->core_needs_qs = false; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 4a0f306..ded4ceeb 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -302,7 +302,7 @@ struct rcu_data { /* is aware of having started. */ unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */ /* for rcu_all_qs() invocations. */ - bool passed_quiesce; /* User-mode/idle loop etc. */ + bool cpu_no_qs; /* No QS yet for this CPU. */ bool core_needs_qs; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ bool gpwrap; /* Possible gpnum/completed wrap. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index e33b4f3..6977ff0 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -265,11 +265,11 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp, */ static void rcu_preempt_qs(void) { - if (!__this_cpu_read(rcu_data_p->passed_quiesce)) { + if (__this_cpu_read(rcu_data_p->cpu_no_qs)) { trace_rcu_grace_period(TPS("rcu_preempt"), __this_cpu_read(rcu_data_p->gpnum), TPS("cpuqs")); - __this_cpu_write(rcu_data_p->passed_quiesce, 1); + __this_cpu_write(rcu_data_p->cpu_no_qs, false); barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */ current->rcu_read_unlock_special.b.need_qs = false; } @@ -620,7 +620,7 @@ static void rcu_preempt_check_callbacks(void) } if (t->rcu_read_lock_nesting > 0 && __this_cpu_read(rcu_data_p->core_needs_qs) && - !__this_cpu_read(rcu_data_p->passed_quiesce)) + __this_cpu_read(rcu_data_p->cpu_no_qs)) t->rcu_read_unlock_special.b.need_qs = true; } diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 4ac25f8..d373e57 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -117,11 +117,11 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) if (!rdp->beenonline) return; - seq_printf(m, "%3d%cc=%ld g=%ld pq=%d/%d qp=%d", + seq_printf(m, "%3d%cc=%ld g=%ld cnq=%d/%d:%d", rdp->cpu, cpu_is_offline(rdp->cpu) ? '!' : ' ', ulong2long(rdp->completed), ulong2long(rdp->gpnum), - rdp->passed_quiesce, + rdp->cpu_no_qs, rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu), rdp->core_needs_qs); seq_printf(m, " dt=%d/%llx/%d df=%lu", -- cgit v1.1 From 5b74c458906fc4a62f932ee8bb801d29baf15fec Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 6 Aug 2015 15:16:57 -0700 Subject: rcu: Make ->cpu_no_qs be a union for aggregate OR This commit converts the rcu_data structure's ->cpu_no_qs field to a union. The bytewise side of this union allows individual access to indications as to whether this CPU needs to find a quiescent state for a normal (.norm) and/or expedited (.exp) grace period. The setwise side of the union allows testing whether or not a quiescent state is needed at all, for either type of grace period. For now, only .norm is used. A later commit will introduce the expedited usage. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 22 +++++++++++----------- kernel/rcu/tree.h | 14 +++++++++++++- kernel/rcu/tree_plugin.h | 6 +++--- kernel/rcu/tree_trace.c | 2 +- 4 files changed, 28 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 31e7021..3e2875b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -245,21 +245,21 @@ static int rcu_gp_in_progress(struct rcu_state *rsp) */ void rcu_sched_qs(void) { - if (__this_cpu_read(rcu_sched_data.cpu_no_qs)) { + if (__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) { trace_rcu_grace_period(TPS("rcu_sched"), __this_cpu_read(rcu_sched_data.gpnum), TPS("cpuqs")); - __this_cpu_write(rcu_sched_data.cpu_no_qs, false); + __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false); } } void rcu_bh_qs(void) { - if (__this_cpu_read(rcu_bh_data.cpu_no_qs)) { + if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) { trace_rcu_grace_period(TPS("rcu_bh"), __this_cpu_read(rcu_bh_data.gpnum), TPS("cpuqs")); - __this_cpu_write(rcu_bh_data.cpu_no_qs, false); + __this_cpu_write(rcu_bh_data.cpu_no_qs.b.norm, false); } } @@ -1744,7 +1744,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, */ rdp->gpnum = rnp->gpnum; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); - rdp->cpu_no_qs = true; + rdp->cpu_no_qs.b.norm = true; rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask); zero_cpu_stall_ticks(rdp); @@ -2337,7 +2337,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); - if ((rdp->cpu_no_qs && + if ((rdp->cpu_no_qs.b.norm && rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) || rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum || rdp->gpwrap) { @@ -2348,7 +2348,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) * We will instead need a new quiescent state that lies * within the current grace period. */ - rdp->cpu_no_qs = true; /* need qs for new gp. */ + rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */ rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); raw_spin_unlock_irqrestore(&rnp->lock, flags); return; @@ -2395,7 +2395,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) * Was there a quiescent state since the beginning of the grace * period? If no, then exit and wait for the next call. */ - if (rdp->cpu_no_qs && + if (rdp->cpu_no_qs.b.norm && rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) return; @@ -3828,11 +3828,11 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) /* Is the RCU core waiting for a quiescent state from this CPU? */ if (rcu_scheduler_fully_active && - rdp->core_needs_qs && rdp->cpu_no_qs && + rdp->core_needs_qs && rdp->cpu_no_qs.b.norm && rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) { rdp->n_rp_core_needs_qs++; } else if (rdp->core_needs_qs && - (!rdp->cpu_no_qs || + (!rdp->cpu_no_qs.b.norm || rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) { rdp->n_rp_report_qs++; return 1; @@ -4155,7 +4155,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->beenonline = true; /* We have now been online. */ rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ rdp->completed = rnp->completed; - rdp->cpu_no_qs = true; + rdp->cpu_no_qs.b.norm = true; rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu); rdp->core_needs_qs = false; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index ded4ceeb..3eee48b 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -286,6 +286,18 @@ struct rcu_node { for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \ (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) +/* + * Union to allow "aggregate OR" operation on the need for a quiescent + * state by the normal and expedited grace periods. + */ +union rcu_noqs { + struct { + u8 norm; + u8 exp; + } b; /* Bits. */ + u16 s; /* Set of bits, aggregate OR here. */ +}; + /* Index values for nxttail array in struct rcu_data. */ #define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ #define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ @@ -302,7 +314,7 @@ struct rcu_data { /* is aware of having started. */ unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */ /* for rcu_all_qs() invocations. */ - bool cpu_no_qs; /* No QS yet for this CPU. */ + union rcu_noqs cpu_no_qs; /* No QSes yet for this CPU. */ bool core_needs_qs; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ bool gpwrap; /* Possible gpnum/completed wrap. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 6977ff0..7880202 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -265,11 +265,11 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp, */ static void rcu_preempt_qs(void) { - if (__this_cpu_read(rcu_data_p->cpu_no_qs)) { + if (__this_cpu_read(rcu_data_p->cpu_no_qs.s)) { trace_rcu_grace_period(TPS("rcu_preempt"), __this_cpu_read(rcu_data_p->gpnum), TPS("cpuqs")); - __this_cpu_write(rcu_data_p->cpu_no_qs, false); + __this_cpu_write(rcu_data_p->cpu_no_qs.b.norm, false); barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */ current->rcu_read_unlock_special.b.need_qs = false; } @@ -620,7 +620,7 @@ static void rcu_preempt_check_callbacks(void) } if (t->rcu_read_lock_nesting > 0 && __this_cpu_read(rcu_data_p->core_needs_qs) && - __this_cpu_read(rcu_data_p->cpu_no_qs)) + __this_cpu_read(rcu_data_p->cpu_no_qs.b.norm)) t->rcu_read_unlock_special.b.need_qs = true; } diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index d373e57..999c367 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -121,7 +121,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->cpu, cpu_is_offline(rdp->cpu) ? '!' : ' ', ulong2long(rdp->completed), ulong2long(rdp->gpnum), - rdp->cpu_no_qs, + rdp->cpu_no_qs.b.norm, rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu), rdp->core_needs_qs); seq_printf(m, " dt=%d/%llx/%d df=%lu", -- cgit v1.1 From 2a1d3ab8986d1b2f598ffc42351d94166fa0f022 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 21 Sep 2015 11:01:10 +0200 Subject: genirq: Handle force threading of irqs with primary and thread handler Force threading of interrupts does not really deal with interrupts which are requested with a primary and a threaded handler. The current policy is to leave them alone and let the primary handler run in interrupt context, but we set the ONESHOT flag for those interrupts as well. Kohji Okuno debugged a problem with the SDHCI driver where the interrupt thread waits for a hardware interrupt to trigger, which can't work well because the hardware interrupt is masked due to the ONESHOT flag being set. He proposed to set the ONESHOT flag only if the interrupt does not provide a thread handler. Though that does not work either because these interrupts can be shared. So the other interrupt would rightfully get the ONESHOT flag set and therefor the same situation would happen again. To deal with this proper, we need to force thread the primary handler of such interrupts as well. That means that the primary interrupt handler is treated as any other primary interrupt handler which is not marked IRQF_NO_THREAD. The threaded handler becomes a separate thread so the SDHCI flow logic can be handled gracefully. The same issue was reported against 4.1-rt. Reported-and-tested-by: Kohji Okuno Reported-By: Michal Smucr Reported-and-tested-by: Nathan Sullivan Signed-off-by: Thomas Gleixner Cc: Sebastian Andrzej Siewior Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1509211058080.5606@nanos Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 158 ++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 117 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index f9a59f6..0a63c2b 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -730,6 +730,12 @@ static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id) return IRQ_NONE; } +static irqreturn_t irq_forced_secondary_handler(int irq, void *dev_id) +{ + WARN(1, "Secondary action handler called for irq %d\n", irq); + return IRQ_NONE; +} + static int irq_wait_for_interrupt(struct irqaction *action) { set_current_state(TASK_INTERRUPTIBLE); @@ -756,7 +762,8 @@ static int irq_wait_for_interrupt(struct irqaction *action) static void irq_finalize_oneshot(struct irq_desc *desc, struct irqaction *action) { - if (!(desc->istate & IRQS_ONESHOT)) + if (!(desc->istate & IRQS_ONESHOT) || + action->handler == irq_forced_secondary_handler) return; again: chip_bus_lock(desc); @@ -910,6 +917,18 @@ static void irq_thread_dtor(struct callback_head *unused) irq_finalize_oneshot(desc, action); } +static void irq_wake_secondary(struct irq_desc *desc, struct irqaction *action) +{ + struct irqaction *secondary = action->secondary; + + if (WARN_ON_ONCE(!secondary)) + return; + + raw_spin_lock_irq(&desc->lock); + __irq_wake_thread(desc, secondary); + raw_spin_unlock_irq(&desc->lock); +} + /* * Interrupt handler thread */ @@ -940,6 +959,8 @@ static int irq_thread(void *data) action_ret = handler_fn(desc, action); if (action_ret == IRQ_HANDLED) atomic_inc(&desc->threads_handled); + if (action_ret == IRQ_WAKE_THREAD) + irq_wake_secondary(desc, action); wake_threads_waitq(desc); } @@ -984,20 +1005,36 @@ void irq_wake_thread(unsigned int irq, void *dev_id) } EXPORT_SYMBOL_GPL(irq_wake_thread); -static void irq_setup_forced_threading(struct irqaction *new) +static int irq_setup_forced_threading(struct irqaction *new) { if (!force_irqthreads) - return; + return 0; if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT)) - return; + return 0; new->flags |= IRQF_ONESHOT; - if (!new->thread_fn) { - set_bit(IRQTF_FORCED_THREAD, &new->thread_flags); - new->thread_fn = new->handler; - new->handler = irq_default_primary_handler; + /* + * Handle the case where we have a real primary handler and a + * thread handler. We force thread them as well by creating a + * secondary action. + */ + if (new->handler != irq_default_primary_handler && new->thread_fn) { + /* Allocate the secondary action */ + new->secondary = kzalloc(sizeof(struct irqaction), GFP_KERNEL); + if (!new->secondary) + return -ENOMEM; + new->secondary->handler = irq_forced_secondary_handler; + new->secondary->thread_fn = new->thread_fn; + new->secondary->dev_id = new->dev_id; + new->secondary->irq = new->irq; + new->secondary->name = new->name; } + /* Deal with the primary handler */ + set_bit(IRQTF_FORCED_THREAD, &new->thread_flags); + new->thread_fn = new->handler; + new->handler = irq_default_primary_handler; + return 0; } static int irq_request_resources(struct irq_desc *desc) @@ -1017,6 +1054,48 @@ static void irq_release_resources(struct irq_desc *desc) c->irq_release_resources(d); } +static int +setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) +{ + struct task_struct *t; + struct sched_param param = { + .sched_priority = MAX_USER_RT_PRIO/2, + }; + + if (!secondary) { + t = kthread_create(irq_thread, new, "irq/%d-%s", irq, + new->name); + } else { + t = kthread_create(irq_thread, new, "irq/%d-s-%s", irq, + new->name); + param.sched_priority -= 1; + } + + if (IS_ERR(t)) + return PTR_ERR(t); + + sched_setscheduler_nocheck(t, SCHED_FIFO, ¶m); + + /* + * We keep the reference to the task struct even if + * the thread dies to avoid that the interrupt code + * references an already freed task_struct. + */ + get_task_struct(t); + new->thread = t; + /* + * Tell the thread to set its affinity. This is + * important for shared interrupt handlers as we do + * not invoke setup_affinity() for the secondary + * handlers as everything is already set up. Even for + * interrupts marked with IRQF_NO_BALANCE this is + * correct as we want the thread to move to the cpu(s) + * on which the requesting code placed the interrupt. + */ + set_bit(IRQTF_AFFINITY, &new->thread_flags); + return 0; +} + /* * Internal function to register an irqaction - typically used to * allocate special interrupts that are part of the architecture. @@ -1037,6 +1116,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (!try_module_get(desc->owner)) return -ENODEV; + new->irq = irq; + /* * Check whether the interrupt nests into another interrupt * thread. @@ -1054,8 +1135,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) */ new->handler = irq_nested_primary_handler; } else { - if (irq_settings_can_thread(desc)) - irq_setup_forced_threading(new); + if (irq_settings_can_thread(desc)) { + ret = irq_setup_forced_threading(new); + if (ret) + goto out_mput; + } } /* @@ -1064,37 +1148,14 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * thread. */ if (new->thread_fn && !nested) { - struct task_struct *t; - static const struct sched_param param = { - .sched_priority = MAX_USER_RT_PRIO/2, - }; - - t = kthread_create(irq_thread, new, "irq/%d-%s", irq, - new->name); - if (IS_ERR(t)) { - ret = PTR_ERR(t); + ret = setup_irq_thread(new, irq, false); + if (ret) goto out_mput; + if (new->secondary) { + ret = setup_irq_thread(new->secondary, irq, true); + if (ret) + goto out_thread; } - - sched_setscheduler_nocheck(t, SCHED_FIFO, ¶m); - - /* - * We keep the reference to the task struct even if - * the thread dies to avoid that the interrupt code - * references an already freed task_struct. - */ - get_task_struct(t); - new->thread = t; - /* - * Tell the thread to set its affinity. This is - * important for shared interrupt handlers as we do - * not invoke setup_affinity() for the secondary - * handlers as everything is already set up. Even for - * interrupts marked with IRQF_NO_BALANCE this is - * correct as we want the thread to move to the cpu(s) - * on which the requesting code placed the interrupt. - */ - set_bit(IRQTF_AFFINITY, &new->thread_flags); } if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { @@ -1267,7 +1328,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) irq, nmsk, omsk); } - new->irq = irq; *old_ptr = new; irq_pm_install_action(desc, new); @@ -1293,6 +1353,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) */ if (new->thread) wake_up_process(new->thread); + if (new->secondary) + wake_up_process(new->secondary->thread); register_irq_proc(irq, desc); new->dir = NULL; @@ -1323,6 +1385,13 @@ out_thread: kthread_stop(t); put_task_struct(t); } + if (new->secondary && new->secondary->thread) { + struct task_struct *t = new->secondary->thread; + + new->secondary->thread = NULL; + kthread_stop(t); + put_task_struct(t); + } out_mput: module_put(desc->owner); return ret; @@ -1430,9 +1499,14 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) if (action->thread) { kthread_stop(action->thread); put_task_struct(action->thread); + if (action->secondary && action->secondary->thread) { + kthread_stop(action->secondary->thread); + put_task_struct(action->secondary->thread); + } } module_put(desc->owner); + kfree(action->secondary); return action; } @@ -1576,8 +1650,10 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, retval = __setup_irq(irq, desc, action); chip_bus_sync_unlock(desc); - if (retval) + if (retval) { + kfree(action->secondary); kfree(action); + } #ifdef CONFIG_DEBUG_SHIRQ_FIXME if (!retval && (irqflags & IRQF_SHARED)) { -- cgit v1.1 From 571af55a31d3652ac1f758f116835a76d0335661 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Tue, 25 Aug 2015 14:42:53 +0800 Subject: time: Fix spelling in comments Signed-off-by: Zhen Lei Cc: Hanjun Guo Cc: John Stultz Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Thomas Gleixner Cc: Tianhong Ding Cc: Viresh Kumar Cc: Xinwei Hu Cc: Xunlei Pang Cc: Zefan Li Link: http://lkml.kernel.org/r/1440484973-13892-1-git-send-email-thunder.leizhen@huawei.com [ Fixed yet another typo in one of the sentences fixed. ] Signed-off-by: Ingo Molnar --- kernel/time/clocksource.c | 2 +- kernel/time/hrtimer.c | 2 +- kernel/time/timekeeping.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 841b72f..174c594 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -479,7 +479,7 @@ static u32 clocksource_max_adjustment(struct clocksource *cs) * return half the number of nanoseconds the hardware counter can technically * cover. This is done so that we can potentially detect problems caused by * delayed timers or bad hardware, which might result in time intervals that - * are larger then what the math used can handle without overflows. + * are larger than what the math used can handle without overflows. */ u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cyc) { diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 457a373..435b885 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -59,7 +59,7 @@ /* * The timer bases: * - * There are more clockids then hrtimer bases. Thus, we index + * There are more clockids than hrtimer bases. Thus, we index * into the timer bases by the hrtimer_base_type enum. When trying * to reach a base using a clockid, hrtimer_clockid_to_base() * is used to convert from clockid to the proper hrtimer_base_type. diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3739ac6..125f16a 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1674,7 +1674,7 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) /** * accumulate_nsecs_to_secs - Accumulates nsecs into secs * - * Helper function that accumulates a the nsecs greater then a second + * Helper function that accumulates the nsecs greater than a second * from the xtime_nsec field to the xtime_secs field. * It also calls into the NTP code to handle leapsecond processing. * @@ -1726,7 +1726,7 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, cycle_t interval = tk->cycle_interval << shift; u64 raw_nsecs; - /* If the offset is smaller then a shifted interval, do nothing */ + /* If the offset is smaller than a shifted interval, do nothing */ if (offset < interval) return offset; -- cgit v1.1 From 3ed769bdb2a2484fd7f9f7f3047413053aacbe21 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Fri, 18 Sep 2015 15:54:23 +0200 Subject: timers: Fix data race in timer_stats_account_timer() timer_stats_account_timer() reads timer->start_site, then checks it for NULL and then re-reads it again, while timer_stats_timer_clear_start_info() can concurrently reset timer->start_site to NULL. This should not lead to crashes, but can double number of entries in timer stats as start_site is used during comparison, the doubled entries will have unuseful NULL start_site. Read timer->start_site only once in timer_stats_account_timer(). The data race was found with KernelThreadSanitizer (KTSAN). Signed-off-by: Dmitry Vyukov Cc: andreyknvl@google.com Cc: glider@google.com Cc: kcc@google.com Cc: ktsan@googlegroups.com Cc: john.stultz@linaro.org Link: http://lkml.kernel.org/r/1442584463-69553-1-git-send-email-dvyukov@google.com Signed-off-by: Thomas Gleixner --- kernel/time/timer.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 84190f0..d3f5e92 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -461,10 +461,17 @@ void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr) static void timer_stats_account_timer(struct timer_list *timer) { - if (likely(!timer->start_site)) + void *site; + + /* + * start_site can be concurrently reset by + * timer_stats_timer_clear_start_info() + */ + site = READ_ONCE(timer->start_site); + if (likely(!site)) return; - timer_stats_update_stats(timer, timer->start_pid, timer->start_site, + timer_stats_update_stats(timer, timer->start_pid, site, timer->function, timer->start_comm, timer->flags); } -- cgit v1.1 From 71f64340fc0eadd06036d0db9a511b6d726add1d Mon Sep 17 00:00:00 2001 From: Huang Shijie Date: Wed, 2 Sep 2015 10:24:55 +0800 Subject: genirq: Remove the second parameter from handle_irq_event_percpu() Actually, we always use the first irq action of the @desc->action chain, so remove the second parameter from handle_irq_event_percpu() which makes the code more tidy. Signed-off-by: Huang Shijie Reviewed-by: Jiang Liu Cc: peterz@infradead.org Cc: marc.zyngier@arm.com Link: http://lkml.kernel.org/r/1441160695-19809-1-git-send-email-shijie.huang@arm.com Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 2 +- kernel/irq/handle.c | 7 +++---- kernel/irq/internals.h | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index e28169d..46f1fb5 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -669,7 +669,7 @@ void handle_percpu_irq(struct irq_desc *desc) if (chip->irq_ack) chip->irq_ack(&desc->irq_data); - handle_irq_event_percpu(desc, desc->action); + handle_irq_event_percpu(desc); if (chip->irq_eoi) chip->irq_eoi(&desc->irq_data); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index de41a68..ea7b5fd 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -132,11 +132,11 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action) wake_up_process(action->thread); } -irqreturn_t -handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) +irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) { irqreturn_t retval = IRQ_NONE; unsigned int flags = 0, irq = desc->irq_data.irq; + struct irqaction *action = desc->action; do { irqreturn_t res; @@ -184,14 +184,13 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) irqreturn_t handle_irq_event(struct irq_desc *desc) { - struct irqaction *action = desc->action; irqreturn_t ret; desc->istate &= ~IRQS_PENDING; irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); raw_spin_unlock(&desc->lock); - ret = handle_irq_event_percpu(desc, action); + ret = handle_irq_event_percpu(desc); raw_spin_lock(&desc->lock); irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 5ef0c2d..cd60bb4 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -81,7 +81,7 @@ extern void irq_mark_irq(unsigned int irq); extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); -irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action); +irqreturn_t handle_irq_event_percpu(struct irq_desc *desc); irqreturn_t handle_irq_event(struct irq_desc *desc); /* Resending of interrupts :*/ -- cgit v1.1 From ac742d37180bee83bc433be087b66a17af2883b9 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 9 Sep 2015 23:36:40 +0200 Subject: futex: Force hot variables into a single cache line futex_hash() references two global variables: the base pointer futex_queues and the size of the array futex_hashsize. The latter is marked __read_mostly, while the former is not, so they are likely to end up very far from each other. This means that futex_hash() is likely to encounter two cache misses. We could mark futex_queues as __read_mostly as well, but that doesn't guarantee they'll end up next to each other (and even if they do, they may still end up in different cache lines). So put the two variables in a small singleton struct with sufficient alignment and mark that as __read_mostly. Signed-off-by: Rasmus Villemoes Cc: Peter Zijlstra Cc: Davidlohr Bueso Cc: kbuild test robot Cc: Sebastian Andrzej Siewior Link: http://lkml.kernel.org/r/1441834601-13633-1-git-send-email-linux@rasmusvillemoes.dk Signed-off-by: Thomas Gleixner --- kernel/futex.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 6e443ef..dfc86e9 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -255,9 +255,18 @@ struct futex_hash_bucket { struct plist_head chain; } ____cacheline_aligned_in_smp; -static unsigned long __read_mostly futex_hashsize; +/* + * The base of the bucket array and its size are always used together + * (after initialization only in hash_futex()), so ensure that they + * reside in the same cacheline. + */ +static struct { + struct futex_hash_bucket *queues; + unsigned long hashsize; +} __futex_data __read_mostly __aligned(2*sizeof(long)); +#define futex_queues (__futex_data.queues) +#define futex_hashsize (__futex_data.hashsize) -static struct futex_hash_bucket *futex_queues; /* * Fault injections for futexes. -- cgit v1.1 From 3df9ca0a2b8b50db5a079ae9d97c5b55435e9a6c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 11 Sep 2015 15:00:18 -0400 Subject: cpuset: migrate memory only for threadgroup leaders If memory_migrate flag is set, cpuset migrates memory according to the destnation css's nodemask. The current implementation migrates memory whenever any thread of a process is migrated making the behavior somewhat arbitrary. Let's tie memory operations to the threadgroup leader so that memory is migrated only when the leader is migrated. While this is a behavior change, given the inherent fuziness, this change is not too likely to be noticed and allows us to clearly define who owns the memory (always the leader) and helps the planned atomic multi-process migration. Note that we're currently migrating memory in migration path proper while holding all the locks. In the long term, this should be moved out to an async work item. Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cpuset.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 312961e..0b361a0 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1487,7 +1487,6 @@ static void cpuset_attach(struct cgroup_subsys_state *css, { /* static buf protected by cpuset_mutex */ static nodemask_t cpuset_attach_nodemask_to; - struct mm_struct *mm; struct task_struct *task; struct task_struct *leader = cgroup_taskset_first(tset); struct cpuset *cs = css_cs(css); @@ -1515,26 +1514,31 @@ static void cpuset_attach(struct cgroup_subsys_state *css, } /* - * Change mm, possibly for multiple threads in a threadgroup. This is - * expensive and may sleep. + * Change mm, possibly for multiple threads in a threadgroup. This + * is expensive and may sleep and should be moved outside migration + * path proper. */ cpuset_attach_nodemask_to = cs->effective_mems; - mm = get_task_mm(leader); - if (mm) { - mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); - - /* - * old_mems_allowed is the same with mems_allowed here, except - * if this task is being moved automatically due to hotplug. - * In that case @mems_allowed has been updated and is empty, - * so @old_mems_allowed is the right nodesets that we migrate - * mm from. - */ - if (is_memory_migrate(cs)) { - cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, - &cpuset_attach_nodemask_to); + if (thread_group_leader(leader)) { + struct mm_struct *mm = get_task_mm(leader); + + if (mm) { + mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); + + /* + * old_mems_allowed is the same with mems_allowed + * here, except if this task is being moved + * automatically due to hotplug. In that case + * @mems_allowed has been updated and is empty, so + * @old_mems_allowed is the right nodesets that we + * migrate mm from. + */ + if (is_memory_migrate(cs)) { + cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, + &cpuset_attach_nodemask_to); + } + mmput(mm); } - mmput(mm); } cs->old_mems_allowed = cpuset_attach_nodemask_to; -- cgit v1.1 From 4530eddb59494b89650d6bcd980fc7f7717ad80c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 11 Sep 2015 15:00:19 -0400 Subject: cgroup, memcg, cpuset: implement cgroup_taskset_for_each_leader() It wasn't explicitly documented but, when a process is being migrated, cpuset and memcg depend on cgroup_taskset_first() returning the threadgroup leader; however, this approach is somewhat ghetto and would no longer work for the planned multi-process migration. This patch introduces explicit cgroup_taskset_for_each_leader() which iterates over only the threadgroup leaders and replaces cgroup_taskset_first() usages for accessing the leader with it. This prepares both memcg and cpuset for multi-process migration. This patch also updates the documentation for cgroup_taskset_for_each() to clarify the iteration rules and removes comments mentioning task ordering in tasksets. v2: A previous patch which added threadgroup leader test was dropped. Patch updated accordingly. Signed-off-by: Tejun Heo Acked-by: Zefan Li Acked-by: Michal Hocko Cc: Johannes Weiner --- kernel/cgroup.c | 11 ----------- kernel/cpuset.c | 9 ++++----- 2 files changed, 4 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0be276f..7f4b85a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2217,13 +2217,6 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, get_css_set(new_cset); rcu_assign_pointer(tsk->cgroups, new_cset); - - /* - * Use move_tail so that cgroup_taskset_first() still returns the - * leader after migration. This works because cgroup_migrate() - * ensures that the dst_cset of the leader is the first on the - * tset's dst_csets list. - */ list_move_tail(&tsk->cg_list, &new_cset->mg_tasks); /* @@ -2419,10 +2412,6 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, if (!cset->mg_src_cgrp) goto next; - /* - * cgroup_taskset_first() must always return the leader. - * Take care to avoid disturbing the ordering. - */ list_move_tail(&task->cg_list, &cset->mg_tasks); if (list_empty(&cset->mg_node)) list_add_tail(&cset->mg_node, &tset.src_csets); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 0b361a0..e4d9999 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1488,7 +1488,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css, /* static buf protected by cpuset_mutex */ static nodemask_t cpuset_attach_nodemask_to; struct task_struct *task; - struct task_struct *leader = cgroup_taskset_first(tset); + struct task_struct *leader; struct cpuset *cs = css_cs(css); struct cpuset *oldcs = cpuset_attach_old_cs; @@ -1514,12 +1514,11 @@ static void cpuset_attach(struct cgroup_subsys_state *css, } /* - * Change mm, possibly for multiple threads in a threadgroup. This - * is expensive and may sleep and should be moved outside migration - * path proper. + * Change mm for all threadgroup leaders. This is expensive and may + * sleep and should be moved outside migration path proper. */ cpuset_attach_nodemask_to = cs->effective_mems; - if (thread_group_leader(leader)) { + cgroup_taskset_for_each_leader(leader, tset) { struct mm_struct *mm = get_task_mm(leader); if (mm) { -- cgit v1.1 From 9af2ec45c2d323d5ce35b431bb150c0eb567f19a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 11 Sep 2015 15:00:20 -0400 Subject: cgroup: reorder cgroup_migrate()'s parameters cgroup_migrate() has the destination cgroup as the first parameter while cgroup_task_migrate() has the destination cset as the last. Another migration function is scheduled to be added which can make the discrepancy further stand out. Let's reorder cgroup_migrate()'s parameters so that the destination cgroup is the last. This doesn't cause any functional difference. Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7f4b85a..dc07956 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2362,9 +2362,9 @@ err: /** * cgroup_migrate - migrate a process or task to a cgroup - * @cgrp: the destination cgroup * @leader: the leader of the process or the task to migrate * @threadgroup: whether @leader points to the whole process or a single task + * @cgrp: the destination cgroup * * Migrate a process or task denoted by @leader to @cgrp. If migrating a * process, the caller must be holding cgroup_threadgroup_rwsem. The @@ -2378,8 +2378,8 @@ err: * decided for all targets by invoking group_migrate_prepare_dst() before * actually starting migrating. */ -static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, - bool threadgroup) +static int cgroup_migrate(struct task_struct *leader, bool threadgroup, + struct cgroup *cgrp) { struct cgroup_taskset tset = { .src_csets = LIST_HEAD_INIT(tset.src_csets), @@ -2516,7 +2516,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, /* prepare dst csets and commit */ ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets); if (!ret) - ret = cgroup_migrate(dst_cgrp, leader, threadgroup); + ret = cgroup_migrate(leader, threadgroup, dst_cgrp); cgroup_migrate_finish(&preloaded_csets); return ret; @@ -2823,7 +2823,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) goto out_finish; last_task = task; - ret = cgroup_migrate(src_cset->dfl_cgrp, task, true); + ret = cgroup_migrate(task, true, src_cset->dfl_cgrp); put_task_struct(task); @@ -3905,7 +3905,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) css_task_iter_end(&it); if (task) { - ret = cgroup_migrate(to, task, false); + ret = cgroup_migrate(task, false, to); put_task_struct(task); } } while (task && !ret); -- cgit v1.1 From adaae5dcf8920375a2fdc6268f762a0b7b331c55 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 11 Sep 2015 15:00:21 -0400 Subject: cgroup: separate out taskset operations from cgroup_migrate() Currently, cgroup_migreate() implements large part of the migration logic inline including building the target taskset and actually migrating them. This patch separates out the following taskset operations. CGROUP_TASKSET_INIT() : taskset initializer cgroup_taskset_add() : add a task to a taskset cgroup_taskset_migrate() : migrate a taskset to the destination cgroup This will be used to implement atomic multi-process migration in cgroup_update_dfl_csses(). This is pure reorganization which doesn't introduce any functional changes. Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup.c | 211 +++++++++++++++++++++++++++++++++----------------------- 1 file changed, 125 insertions(+), 86 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index dc07956..f24d3ce 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2144,6 +2144,49 @@ struct cgroup_taskset { struct task_struct *cur_task; }; +#define CGROUP_TASKSET_INIT(tset) (struct cgroup_taskset){ \ + .src_csets = LIST_HEAD_INIT(tset.src_csets), \ + .dst_csets = LIST_HEAD_INIT(tset.dst_csets), \ + .csets = &tset.src_csets, \ +} + +/** + * cgroup_taskset_add - try to add a migration target task to a taskset + * @task: target task + * @tset: target taskset + * + * Add @task, which is a migration target, to @tset. This function becomes + * noop if @task doesn't need to be migrated. @task's css_set should have + * been added as a migration source and @task->cg_list will be moved from + * the css_set's tasks list to mg_tasks one. + */ +static void cgroup_taskset_add(struct task_struct *task, + struct cgroup_taskset *tset) +{ + struct css_set *cset; + + lockdep_assert_held(&css_set_rwsem); + + /* @task either already exited or can't exit until the end */ + if (task->flags & PF_EXITING) + return; + + /* leave @task alone if post_fork() hasn't linked it yet */ + if (list_empty(&task->cg_list)) + return; + + cset = task_css_set(task); + if (!cset->mg_src_cgrp) + return; + + list_move_tail(&task->cg_list, &cset->mg_tasks); + if (list_empty(&cset->mg_node)) + list_add_tail(&cset->mg_node, &tset->src_csets); + if (list_empty(&cset->mg_dst_cset->mg_node)) + list_move_tail(&cset->mg_dst_cset->mg_node, + &tset->dst_csets); +} + /** * cgroup_taskset_first - reset taskset and return the first task * @tset: taskset of interest @@ -2228,6 +2271,84 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, } /** + * cgroup_taskset_migrate - migrate a taskset to a cgroup + * @tset: taget taskset + * @dst_cgrp: destination cgroup + * + * Migrate tasks in @tset to @dst_cgrp. This function fails iff one of the + * ->can_attach callbacks fails and guarantees that either all or none of + * the tasks in @tset are migrated. @tset is consumed regardless of + * success. + */ +static int cgroup_taskset_migrate(struct cgroup_taskset *tset, + struct cgroup *dst_cgrp) +{ + struct cgroup_subsys_state *css, *failed_css = NULL; + struct task_struct *task, *tmp_task; + struct css_set *cset, *tmp_cset; + int i, ret; + + /* methods shouldn't be called if no task is actually migrating */ + if (list_empty(&tset->src_csets)) + return 0; + + /* check that we can legitimately attach to the cgroup */ + for_each_e_css(css, i, dst_cgrp) { + if (css->ss->can_attach) { + ret = css->ss->can_attach(css, tset); + if (ret) { + failed_css = css; + goto out_cancel_attach; + } + } + } + + /* + * Now that we're guaranteed success, proceed to move all tasks to + * the new cgroup. There are no failure cases after here, so this + * is the commit point. + */ + down_write(&css_set_rwsem); + list_for_each_entry(cset, &tset->src_csets, mg_node) { + list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) + cgroup_task_migrate(cset->mg_src_cgrp, task, + cset->mg_dst_cset); + } + up_write(&css_set_rwsem); + + /* + * Migration is committed, all target tasks are now on dst_csets. + * Nothing is sensitive to fork() after this point. Notify + * controllers that migration is complete. + */ + tset->csets = &tset->dst_csets; + + for_each_e_css(css, i, dst_cgrp) + if (css->ss->attach) + css->ss->attach(css, tset); + + ret = 0; + goto out_release_tset; + +out_cancel_attach: + for_each_e_css(css, i, dst_cgrp) { + if (css == failed_css) + break; + if (css->ss->cancel_attach) + css->ss->cancel_attach(css, tset); + } +out_release_tset: + down_write(&css_set_rwsem); + list_splice_init(&tset->dst_csets, &tset->src_csets); + list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) { + list_splice_tail_init(&cset->mg_tasks, &cset->tasks); + list_del_init(&cset->mg_node); + } + up_write(&css_set_rwsem); + return ret; +} + +/** * cgroup_migrate_finish - cleanup after attach * @preloaded_csets: list of preloaded css_sets * @@ -2381,15 +2502,8 @@ err: static int cgroup_migrate(struct task_struct *leader, bool threadgroup, struct cgroup *cgrp) { - struct cgroup_taskset tset = { - .src_csets = LIST_HEAD_INIT(tset.src_csets), - .dst_csets = LIST_HEAD_INIT(tset.dst_csets), - .csets = &tset.src_csets, - }; - struct cgroup_subsys_state *css, *failed_css = NULL; - struct css_set *cset, *tmp_cset; - struct task_struct *task, *tmp_task; - int i, ret; + struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); + struct task_struct *task; /* * Prevent freeing of tasks while we take a snapshot. Tasks that are @@ -2400,89 +2514,14 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup, rcu_read_lock(); task = leader; do { - /* @task either already exited or can't exit until the end */ - if (task->flags & PF_EXITING) - goto next; - - /* leave @task alone if post_fork() hasn't linked it yet */ - if (list_empty(&task->cg_list)) - goto next; - - cset = task_css_set(task); - if (!cset->mg_src_cgrp) - goto next; - - list_move_tail(&task->cg_list, &cset->mg_tasks); - if (list_empty(&cset->mg_node)) - list_add_tail(&cset->mg_node, &tset.src_csets); - if (list_empty(&cset->mg_dst_cset->mg_node)) - list_move_tail(&cset->mg_dst_cset->mg_node, - &tset.dst_csets); - next: + cgroup_taskset_add(task, &tset); if (!threadgroup) break; } while_each_thread(leader, task); rcu_read_unlock(); up_write(&css_set_rwsem); - /* methods shouldn't be called if no task is actually migrating */ - if (list_empty(&tset.src_csets)) - return 0; - - /* check that we can legitimately attach to the cgroup */ - for_each_e_css(css, i, cgrp) { - if (css->ss->can_attach) { - ret = css->ss->can_attach(css, &tset); - if (ret) { - failed_css = css; - goto out_cancel_attach; - } - } - } - - /* - * Now that we're guaranteed success, proceed to move all tasks to - * the new cgroup. There are no failure cases after here, so this - * is the commit point. - */ - down_write(&css_set_rwsem); - list_for_each_entry(cset, &tset.src_csets, mg_node) { - list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) - cgroup_task_migrate(cset->mg_src_cgrp, task, - cset->mg_dst_cset); - } - up_write(&css_set_rwsem); - - /* - * Migration is committed, all target tasks are now on dst_csets. - * Nothing is sensitive to fork() after this point. Notify - * controllers that migration is complete. - */ - tset.csets = &tset.dst_csets; - - for_each_e_css(css, i, cgrp) - if (css->ss->attach) - css->ss->attach(css, &tset); - - ret = 0; - goto out_release_tset; - -out_cancel_attach: - for_each_e_css(css, i, cgrp) { - if (css == failed_css) - break; - if (css->ss->cancel_attach) - css->ss->cancel_attach(css, &tset); - } -out_release_tset: - down_write(&css_set_rwsem); - list_splice_init(&tset.dst_csets, &tset.src_csets); - list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) { - list_splice_tail_init(&cset->mg_tasks, &cset->tasks); - list_del_init(&cset->mg_node); - } - up_write(&css_set_rwsem); - return ret; + return cgroup_taskset_migrate(&tset, cgrp); } /** -- cgit v1.1 From 10265075aa3a8629b0ccdcff4d10b17bd740defe Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 11 Sep 2015 15:00:22 -0400 Subject: cgroup: make cgroup_update_dfl_csses() migrate all target processes atomically cgroup_update_dfl_csses() is responsible for migrating processes when controllers are enabled or disabled on the default hierarchy. As the css association changes for all the processes in the affected cgroups, this involves migrating multiple processes. Up until now, it was implemented by migrating process-by-process until the source css_sets are empty; however, this means that if a process fails to migrate after some succeed before it, the recovery is very tricky. This was considered okay as subsystems weren't allowed to reject process migration on the default hierarchy; unfortunately, enforcing this policy turned out to be problematic for certain types of resources - realtime slices for now. As such, the default hierarchy is gonna allow restricted failures during migration and to support that this patch makes cgroup_update_dfl_csses() migrate all target processes atomically rather than one-by-one. The preceding patches made subsystems ready for multi-process migration and factored out taskset operations making this almost trivial. All tasks of the target processes are put in the same taskset and the migration operations are performed once which either fails or succeeds for all. Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup.c | 44 ++++++++------------------------------------ 1 file changed, 8 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f24d3ce..f924158 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2799,6 +2799,7 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v) static int cgroup_update_dfl_csses(struct cgroup *cgrp) { LIST_HEAD(preloaded_csets); + struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); struct cgroup_subsys_state *css; struct css_set *src_cset; int ret; @@ -2827,50 +2828,21 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) if (ret) goto out_finish; + down_write(&css_set_rwsem); list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) { - struct task_struct *last_task = NULL, *task; + struct task_struct *task, *ntask; /* src_csets precede dst_csets, break on the first dst_cset */ if (!src_cset->mg_src_cgrp) break; - /* - * All tasks in src_cset need to be migrated to the - * matching dst_cset. Empty it process by process. We - * walk tasks but migrate processes. The leader might even - * belong to a different cset but such src_cset would also - * be among the target src_csets because the default - * hierarchy enforces per-process membership. - */ - while (true) { - down_read(&css_set_rwsem); - task = list_first_entry_or_null(&src_cset->tasks, - struct task_struct, cg_list); - if (task) { - task = task->group_leader; - WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp); - get_task_struct(task); - } - up_read(&css_set_rwsem); - - if (!task) - break; - - /* guard against possible infinite loop */ - if (WARN(last_task == task, - "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n")) - goto out_finish; - last_task = task; - - ret = cgroup_migrate(task, true, src_cset->dfl_cgrp); - - put_task_struct(task); - - if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret)) - goto out_finish; - } + /* all tasks in src_csets need to be migrated */ + list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list) + cgroup_taskset_add(task, &tset); } + up_write(&css_set_rwsem); + ret = cgroup_taskset_migrate(&tset, cgrp); out_finish: cgroup_migrate_finish(&preloaded_csets); percpu_up_write(&cgroup_threadgroup_rwsem); -- cgit v1.1 From f0132c4e0d06046a324051c09bd094719b9216ac Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Tue, 22 Sep 2015 21:43:13 +0800 Subject: kernel/trace_probe: is_good_name can be boolean This patch makes is_good_name return bool to improve readability due to this particular function only using either one or zero as its return value. No functional change. Link: http://lkml.kernel.org/r/1442929393-4753-2-git-send-email-bywxiaobai@163.com Signed-off-by: Yaowei Bai Signed-off-by: Steven Rostedt --- kernel/trace/trace_probe.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index b98dee9..f6398db 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -302,15 +302,15 @@ static nokprobe_inline void call_fetch(struct fetch_param *fprm, } /* Check the name is good for event/group/fields */ -static inline int is_good_name(const char *name) +static inline bool is_good_name(const char *name) { if (!isalpha(*name) && *name != '_') - return 0; + return false; while (*++name != '\0') { if (!isalpha(*name) && !isdigit(*name) && *name != '_') - return 0; + return false; } - return 1; + return true; } static inline struct event_file_link * -- cgit v1.1 From 21199f27b430576552b26210b3194a363d7f05cd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Sep 2015 16:10:40 +0200 Subject: locking/lockdep: Fix hlock->pin_count reset on lock stack rebuilds Various people reported hitting the "unpinning an unpinned lock" warning. As it turns out there are 2 places where we take a lock out of the middle of a stack, and in those cases it would fail to preserve the pin_count when rebuilding the lock stack. Reported-by: Sasha Levin Reported-by: Tim Spriggs Tested-by: Sasha Levin Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: davej@codemonkey.org.uk Link: http://lkml.kernel.org/r/20150916141040.GA11639@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 8acfbf7..4e49cc4 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3068,7 +3068,7 @@ static int __lock_is_held(struct lockdep_map *lock); static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, int trylock, int read, int check, int hardirqs_off, struct lockdep_map *nest_lock, unsigned long ip, - int references) + int references, int pin_count) { struct task_struct *curr = current; struct lock_class *class = NULL; @@ -3157,7 +3157,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, hlock->waittime_stamp = 0; hlock->holdtime_stamp = lockstat_clock(); #endif - hlock->pin_count = 0; + hlock->pin_count = pin_count; if (check && !mark_irqflags(curr, hlock)) return 0; @@ -3343,7 +3343,7 @@ found_it: hlock_class(hlock)->subclass, hlock->trylock, hlock->read, hlock->check, hlock->hardirqs_off, hlock->nest_lock, hlock->acquire_ip, - hlock->references)) + hlock->references, hlock->pin_count)) return 0; } @@ -3433,7 +3433,7 @@ found_it: hlock_class(hlock)->subclass, hlock->trylock, hlock->read, hlock->check, hlock->hardirqs_off, hlock->nest_lock, hlock->acquire_ip, - hlock->references)) + hlock->references, hlock->pin_count)) return 0; } @@ -3583,7 +3583,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, current->lockdep_recursion = 1; trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip); __lock_acquire(lock, subclass, trylock, read, check, - irqs_disabled_flags(flags), nest_lock, ip, 0); + irqs_disabled_flags(flags), nest_lock, ip, 0, 0); current->lockdep_recursion = 0; raw_local_irq_restore(flags); } -- cgit v1.1 From 2726d6ce389788c7fe724961a6e1bfe569560088 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Wed, 2 Sep 2015 11:01:34 +0100 Subject: sched/deadline: Unify dl_time_before() usage Move dl_time_before() static definition in include/linux/sched/deadline.h so that it can be used by different parties without being re-defined. Reported-by: Luca Abeni Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1441188096-23021-3-git-send-email-juri.lelli@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/cpudeadline.c | 5 ----- kernel/sched/cpudeadline.h | 1 + kernel/sched/sched.h | 5 ----- 3 files changed, 1 insertion(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index c6acb07..5a75b08 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -31,11 +31,6 @@ static inline int right_child(int i) return (i << 1) + 2; } -static inline int dl_time_before(u64 a, u64 b) -{ - return (s64)(a - b) < 0; -} - static void cpudl_exchange(struct cpudl *cp, int a, int b) { int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index 1a0a6ef..fcbdf83 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h @@ -2,6 +2,7 @@ #define _LINUX_CPUDL_H #include +#include #define IDX_INVALID -1 diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3845a71..af6f252 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -118,11 +118,6 @@ static inline int task_has_dl_policy(struct task_struct *p) return dl_policy(p->policy); } -static inline bool dl_time_before(u64 a, u64 b) -{ - return (s64)(a - b) < 0; -} - /* * Tells if entity @a should preempt entity @b. */ -- cgit v1.1 From f52405757e4e9bddd868d6b8ca501d58f292969f Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Wed, 2 Sep 2015 11:01:35 +0100 Subject: sched/deadline, locking/rtmutex: Fix open coded check in rt_mutex_waiter_less() rt_mutex_waiter_less() check of task deadlines is open coded. Since this is subject to wraparound bugs, make it use the correct helper. Reported-by: Luca Abeni Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1441188096-23021-4-git-send-email-juri.lelli@arm.com Signed-off-by: Ingo Molnar --- kernel/locking/rtmutex.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 7781d80..35e9bfc 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -158,7 +158,8 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left, * then right waiter has a dl_prio() too. */ if (dl_prio(left->prio)) - return (left->task->dl.deadline < right->task->dl.deadline); + return dl_time_before(left->task->dl.deadline, + right->task->dl.deadline); return 0; } -- cgit v1.1 From 269b26a5ef2b10bf15f66524fa47d81c1b4dc1a1 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Wed, 2 Sep 2015 11:01:36 +0100 Subject: sched/rt: Make (do_)balance_runtime() return void The return value of (do_)balance_runtime() is not consumed by anybody. Make them return void. Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1441188096-23021-5-git-send-email-juri.lelli@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index d2ea593..e3cc163 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -635,11 +635,11 @@ bool sched_rt_bandwidth_account(struct rt_rq *rt_rq) /* * We ran out of runtime, see if we can borrow some from our neighbours. */ -static int do_balance_runtime(struct rt_rq *rt_rq) +static void do_balance_runtime(struct rt_rq *rt_rq) { struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd; - int i, weight, more = 0; + int i, weight; u64 rt_period; weight = cpumask_weight(rd->span); @@ -673,7 +673,6 @@ static int do_balance_runtime(struct rt_rq *rt_rq) diff = rt_period - rt_rq->rt_runtime; iter->rt_runtime -= diff; rt_rq->rt_runtime += diff; - more = 1; if (rt_rq->rt_runtime == rt_period) { raw_spin_unlock(&iter->rt_runtime_lock); break; @@ -683,8 +682,6 @@ next: raw_spin_unlock(&iter->rt_runtime_lock); } raw_spin_unlock(&rt_b->rt_runtime_lock); - - return more; } /* @@ -796,26 +793,19 @@ static void __enable_runtime(struct rq *rq) } } -static int balance_runtime(struct rt_rq *rt_rq) +static void balance_runtime(struct rt_rq *rt_rq) { - int more = 0; - if (!sched_feat(RT_RUNTIME_SHARE)) - return more; + return; if (rt_rq->rt_time > rt_rq->rt_runtime) { raw_spin_unlock(&rt_rq->rt_runtime_lock); - more = do_balance_runtime(rt_rq); + do_balance_runtime(rt_rq); raw_spin_lock(&rt_rq->rt_runtime_lock); } - - return more; } #else /* !CONFIG_SMP */ -static inline int balance_runtime(struct rt_rq *rt_rq) -{ - return 0; -} +static inline void balance_runtime(struct rt_rq *rt_rq) {} #endif /* CONFIG_SMP */ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) -- cgit v1.1 From c6e1e7b5b7f031910850ddaf7bfa65ba3b4843ea Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 22 Sep 2015 12:48:59 +0200 Subject: sched/core: Make 'sched_domain_topology' declaration static The 'sched_domain_topology' variable is only used within kernel/sched/core.c. Make it static. Signed-off-by: Juergen Gross Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1442918939-9907-1-git-send-email-jgross@suse.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1b30b5b..a91df61 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6445,7 +6445,8 @@ static struct sched_domain_topology_level default_topology[] = { { NULL, }, }; -struct sched_domain_topology_level *sched_domain_topology = default_topology; +static struct sched_domain_topology_level *sched_domain_topology = + default_topology; #define for_each_sd_topology(tl) \ for (tl = sched_domain_topology; tl->mask; tl++) -- cgit v1.1 From d78a461427d752bc1cd5b87515167453a18de7e3 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 25 Sep 2015 13:30:47 -0400 Subject: tracing: Remove ftrace_trace_stack_regs() ftrace_trace_stack_regs() is used in only one place, and because that is such a simple function, just move its code into the location that it was used in (trace_buffer_unlock_commit_regs()). Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 24 ++++++++++++++---------- kernel/trace/trace.h | 9 --------- 2 files changed, 14 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6e79408..5082088 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -468,6 +468,18 @@ static inline void trace_access_lock_init(void) #endif +#ifdef CONFIG_STACKTRACE +static void __ftrace_trace_stack(struct ring_buffer *buffer, + unsigned long flags, + int skip, int pc, struct pt_regs *regs); +#else +static inline void __ftrace_trace_stack(struct ring_buffer *buffer, + unsigned long flags, + int skip, int pc, struct pt_regs *regs) +{ +} +#endif + /* trace_flags holds trace_options default values */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | @@ -1744,7 +1756,8 @@ void trace_buffer_unlock_commit_regs(struct ring_buffer *buffer, { __buffer_unlock_commit(buffer, event); - ftrace_trace_stack_regs(buffer, flags, 0, pc, regs); + if (trace_flags & TRACE_ITER_STACKTRACE) + __ftrace_trace_stack(buffer, flags, 0, pc, regs); ftrace_trace_userstack(buffer, flags, pc); } EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs); @@ -1873,15 +1886,6 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, } -void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags, - int skip, int pc, struct pt_regs *regs) -{ - if (!(trace_flags & TRACE_ITER_STACKTRACE)) - return; - - __ftrace_trace_stack(buffer, flags, skip, pc, regs); -} - void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, int skip, int pc) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 74bde81..3b2a950 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -614,9 +614,6 @@ void update_max_tr_single(struct trace_array *tr, void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, int skip, int pc); -void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags, - int skip, int pc, struct pt_regs *regs); - void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc); @@ -628,12 +625,6 @@ static inline void ftrace_trace_stack(struct ring_buffer *buffer, { } -static inline void ftrace_trace_stack_regs(struct ring_buffer *buffer, - unsigned long flags, int skip, - int pc, struct pt_regs *regs) -{ -} - static inline void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) { -- cgit v1.1 From 41907416bc24412f839559ea10a2b9e4eeb21aa7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 25 Sep 2015 15:06:00 -0400 Subject: tracing: Remove unused function trace_current_buffer_lock_reserve() trace_current_buffer_lock_reserve() is not used by anything. Might as well get rid of it. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5082088..a499ec9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1741,14 +1741,6 @@ trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, } EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve); -void trace_current_buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags, int pc) -{ - __trace_buffer_unlock_commit(buffer, event, flags, pc); -} -EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit); - void trace_buffer_unlock_commit_regs(struct ring_buffer *buffer, struct ring_buffer_event *event, unsigned long flags, int pc, -- cgit v1.1 From a3e72739b7a7ea225dd11c4096f97123f6427d87 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 25 Sep 2015 16:24:27 -0400 Subject: cgroup: fix too early usage of static_branch_disable() 49d1dc4b8179 ("cgroup: implement static_key based cgroup_subsys_enabled() and cgroup_subsys_on_dfl()") converted cgroup enabled test to use static_key; however, cgroup_disable() is called before static_key subsystem itself is initialized and thus leads to the following warning when "cgroup_disable=" parameter is specified. WARNING: CPU: 0 PID: 0 at kernel/jump_label.c:99 static_key_slow_dec+0x44/0x60() static_key_slow_dec used before call to jump_label_init ... Call Trace: [] dump_stack+0x44/0x62 [] warn_slowpath_common+0x82/0xc0 [] warn_slowpath_fmt+0x5c/0x80 [] static_key_slow_dec+0x44/0x60 [] cgroup_disable+0xaf/0xd6 [] unknown_bootoption+0x8c/0x194 [] parse_args+0x273/0x4a0 [] start_kernel+0x205/0x4b8 ... Fix it by making cgroup_disable() to record the subsystems to disable in cgroup_disable_mask and moving the actual application to cgroup_init() which is late enough and where the enabled state is first used. Signed-off-by: Tejun Heo Reported-by: Andrey Wagin Link: http://lkml.kernel.org/g/CANaxB-yFuS4SA2znSvcKrO9L_CbHciHYW+o9bN8sZJ8eR9FxYA@mail.gmail.com Fixes: 49d1dc4b81797f88270832b11e9f73809e7e7209 --- kernel/cgroup.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f924158..ae23814 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5124,6 +5124,8 @@ int __init cgroup_init_early(void) return 0; } +static unsigned long cgroup_disable_mask __initdata; + /** * cgroup_init - cgroup initialization * @@ -5170,8 +5172,12 @@ int __init cgroup_init(void) * disabled flag and cftype registration needs kmalloc, * both of which aren't available during early_init. */ - if (!cgroup_ssid_enabled(ssid)) + if (cgroup_disable_mask & (1 << ssid)) { + static_branch_disable(cgroup_subsys_enabled_key[ssid]); + printk(KERN_INFO "Disabling %s control group subsystem\n", + ss->name); continue; + } cgrp_dfl_root.subsys_mask |= 1 << ss->id; @@ -5595,11 +5601,7 @@ static int __init cgroup_disable(char *str) if (strcmp(token, ss->name) && strcmp(token, ss->legacy_name)) continue; - - static_branch_disable(cgroup_subsys_enabled_key[i]); - printk(KERN_INFO "Disabling %s control group subsystem\n", - ss->name); - break; + cgroup_disable_mask |= 1 << i; } } return 1; -- cgit v1.1 From b7f0c959edfb4448f94bd33c39fda08e10ce6ede Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 25 Sep 2015 17:38:44 -0400 Subject: tracing: Pass trace_array into trace_buffer_unlock_commit() In preparation for having trace options be per instance, the trace_array needs to be passed to the trace_buffer_unlock_commit(). The trace_event_buffer_lock_reserve() already passes in the trace_event_file where the trace_array can be derived from. Also added a "__init" to the boot up test event plus function tracing function function_test_events_call(). Signed-off-by: Steven Rostedt --- kernel/trace/blktrace.c | 4 ++-- kernel/trace/trace.c | 18 ++++++------------ kernel/trace/trace_events.c | 9 +++++++-- kernel/trace/trace_mmiotrace.c | 4 ++-- kernel/trace/trace_sched_wakeup.c | 4 ++-- 5 files changed, 19 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 90e72a0..973d41d 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -103,7 +103,7 @@ record_it: memcpy((void *) t + sizeof(*t), data, len); if (blk_tracer) - trace_buffer_unlock_commit(buffer, event, 0, pc); + trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc); } } @@ -278,7 +278,7 @@ record_it: memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); if (blk_tracer) { - trace_buffer_unlock_commit(buffer, event, 0, pc); + trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc); return; } } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a499ec9..3329c8e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1683,23 +1683,16 @@ __buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *eve ring_buffer_unlock_commit(buffer, event); } -static inline void -__trace_buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags, int pc) +void trace_buffer_unlock_commit(struct trace_array *tr, + struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc) { __buffer_unlock_commit(buffer, event); ftrace_trace_stack(buffer, flags, 6, pc); ftrace_trace_userstack(buffer, flags, pc); } - -void trace_buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags, int pc) -{ - __trace_buffer_unlock_commit(buffer, event, flags, pc); -} EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit); static struct ring_buffer *temp_buffer; @@ -1741,7 +1734,8 @@ trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, } EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve); -void trace_buffer_unlock_commit_regs(struct ring_buffer *buffer, +void trace_buffer_unlock_commit_regs(struct trace_array *tr, + struct ring_buffer *buffer, struct ring_buffer_event *event, unsigned long flags, int pc, struct pt_regs *regs) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 7ca09cd..b2e3d8d 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2891,7 +2891,9 @@ static __init void event_trace_self_tests(void) static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable); -static void +static struct trace_array *event_tr; + +static void __init function_test_events_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs) { @@ -2922,7 +2924,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip, entry->ip = ip; entry->parent_ip = parent_ip; - trace_buffer_unlock_commit(buffer, event, flags, pc); + trace_buffer_unlock_commit(event_tr, buffer, event, flags, pc); out: atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); @@ -2944,6 +2946,9 @@ static __init void event_trace_self_test_with_function(void) return; } pr_info("Running tests again, along with the function tracer\n"); + event_tr = top_trace_array(); + if (WARN_ON(!event_tr)) + return; event_trace_self_tests(); unregister_ftrace_function(&trace_ops); } diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 638e110..2be8c4f 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -314,7 +314,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, entry->rw = *rw; if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, 0, pc); + trace_buffer_unlock_commit(tr, buffer, event, 0, pc); } void mmio_trace_rw(struct mmiotrace_rw *rw) @@ -344,7 +344,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr, entry->map = *map; if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, 0, pc); + trace_buffer_unlock_commit(tr, buffer, event, 0, pc); } void mmio_trace_mapping(struct mmiotrace_map *map) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 12cbe77..c29d49e 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -388,7 +388,7 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->next_cpu = task_cpu(next); if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, flags, pc); + trace_buffer_unlock_commit(tr, buffer, event, flags, pc); } static void @@ -416,7 +416,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->next_cpu = task_cpu(wakee); if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, flags, pc); + trace_buffer_unlock_commit(tr, buffer, event, flags, pc); } static void notrace -- cgit v1.1 From 18ab2cd3ee9d52dc64c5ae984146a261a328c4e8 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Sun, 27 Sep 2015 23:25:50 +0800 Subject: perf/core, perf/x86: Change needlessly global functions and a variable to static Fixes various sparse warnings. Signed-off-by: Geliang Tang Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/70c14234da1bed6e3e67b9c419e2d5e376ab4f32.1443367286.git.geliangtang@163.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index f87b434..ea02109 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -196,7 +196,7 @@ static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; static int perf_sample_allowed_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100; -void update_perf_cpu_limits(void) +static void update_perf_cpu_limits(void) { u64 tmp = perf_sample_period_ns; @@ -472,7 +472,7 @@ perf_cgroup_set_timestamp(struct task_struct *task, * mode SWOUT : schedule out everything * mode SWIN : schedule in based on cgroup for next */ -void perf_cgroup_switch(struct task_struct *task, int mode) +static void perf_cgroup_switch(struct task_struct *task, int mode) { struct perf_cpu_context *cpuctx; struct pmu *pmu; @@ -7390,7 +7390,7 @@ static int perf_pmu_nop_int(struct pmu *pmu) return 0; } -DEFINE_PER_CPU(unsigned int, nop_txn_flags); +static DEFINE_PER_CPU(unsigned int, nop_txn_flags); static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags) { @@ -7750,7 +7750,7 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) return ret; } -struct pmu *perf_init_event(struct perf_event *event) +static struct pmu *perf_init_event(struct perf_event *event) { struct pmu *pmu = NULL; int idx; -- cgit v1.1 From ca475e831fd59e131bccd60de43c4104d82d02f5 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 28 Sep 2015 09:41:11 -0400 Subject: tracing: Make ftrace_trace_stack() static ftrace_trace_stack() is not called outside of trace.c. Make it a static function. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 10 +++++++++- kernel/trace/trace.h | 8 -------- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3329c8e..5d3ce29 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -472,12 +472,20 @@ static inline void trace_access_lock_init(void) static void __ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, int skip, int pc, struct pt_regs *regs); +static void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, + int skip, int pc); + #else static inline void __ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, int skip, int pc, struct pt_regs *regs) { } +static inline void ftrace_trace_stack(struct ring_buffer *buffer, + unsigned long flags, int skip, int pc) +{ +} + #endif /* trace_flags holds trace_options default values */ @@ -1872,7 +1880,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, } -void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, +static void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, int skip, int pc) { if (!(trace_flags & TRACE_ITER_STACKTRACE)) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3b2a950..107cf08 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -611,20 +611,12 @@ void update_max_tr_single(struct trace_array *tr, #endif /* CONFIG_TRACER_MAX_TRACE */ #ifdef CONFIG_STACKTRACE -void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, - int skip, int pc); - void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc); void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, int pc); #else -static inline void ftrace_trace_stack(struct ring_buffer *buffer, - unsigned long flags, int skip, int pc) -{ -} - static inline void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) { -- cgit v1.1 From 6b1032d53cdbda39ad56c8692bac17a66475b57d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 28 Sep 2015 10:11:44 -0400 Subject: tracing: Inject seq_print_userip_objs() into its only user seq_print_userip_objs() is used only in one location, in one file. Instead of having it as an external function, go one further than making it static, but inject is code into its only user. It doesn't make the calling function much more complex. Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 81 ++++++++++++++++++++------------------------- kernel/trace/trace_output.h | 2 -- 2 files changed, 36 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 8e481a8..881cbda 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -355,50 +355,6 @@ int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, } int -seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, - unsigned long sym_flags) -{ - struct mm_struct *mm = NULL; - unsigned int i; - - if (trace_flags & TRACE_ITER_SYM_USEROBJ) { - struct task_struct *task; - /* - * we do the lookup on the thread group leader, - * since individual threads might have already quit! - */ - rcu_read_lock(); - task = find_task_by_vpid(entry->tgid); - if (task) - mm = get_task_mm(task); - rcu_read_unlock(); - } - - for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { - unsigned long ip = entry->caller[i]; - - if (ip == ULONG_MAX || trace_seq_has_overflowed(s)) - break; - - trace_seq_puts(s, " => "); - - if (!ip) { - trace_seq_puts(s, "??"); - trace_seq_putc(s, '\n'); - continue; - } - - seq_print_user_ip(s, mm, ip, sym_flags); - trace_seq_putc(s, '\n'); - } - - if (mm) - mmput(mm); - - return !trace_seq_has_overflowed(s); -} - -int seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) { if (!ip) { @@ -1081,11 +1037,46 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, { struct userstack_entry *field; struct trace_seq *s = &iter->seq; + struct mm_struct *mm = NULL; + unsigned int i; trace_assign_type(field, iter->ent); trace_seq_puts(s, "\n"); - seq_print_userip_objs(field, s, flags); + + if (trace_flags & TRACE_ITER_SYM_USEROBJ) { + struct task_struct *task; + /* + * we do the lookup on the thread group leader, + * since individual threads might have already quit! + */ + rcu_read_lock(); + task = find_task_by_vpid(field->tgid); + if (task) + mm = get_task_mm(task); + rcu_read_unlock(); + } + + for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { + unsigned long ip = field->caller[i]; + + if (ip == ULONG_MAX || trace_seq_has_overflowed(s)) + break; + + trace_seq_puts(s, " => "); + + if (!ip) { + trace_seq_puts(s, "??"); + trace_seq_putc(s, '\n'); + continue; + } + + seq_print_user_ip(s, mm, ip, flags); + trace_seq_putc(s, '\n'); + } + + if (mm) + mmput(mm); return trace_handle_return(s); } diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 4cbfe85..b774c06 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -14,8 +14,6 @@ trace_print_printk_msg_only(struct trace_iterator *iter); extern int seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags); -extern int seq_print_userip_objs(const struct userstack_entry *entry, - struct trace_seq *s, unsigned long sym_flags); extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, unsigned long ip, unsigned long sym_flags); -- cgit v1.1 From ef92480a58c3b4dac5eccbc787131a51a3b0a45c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 28 Sep 2015 10:16:12 -0400 Subject: tracing: Turn seq_print_user_ip() into a static function seq_print_user_ip() is used in only one location in one file. Turn it into a static function. We could inject its code into the caller, but that would make the code a bit too complex. Keep the code separate. Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 4 ++-- kernel/trace/trace_output.h | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 881cbda..3b5dcdf 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -322,8 +322,8 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt, # define IP_FMT "%016lx" #endif -int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, - unsigned long ip, unsigned long sym_flags) +static int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, + unsigned long ip, unsigned long sym_flags) { struct file *file = NULL; unsigned long vmstart = 0; diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index b774c06..fabc49b 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -14,8 +14,6 @@ trace_print_printk_msg_only(struct trace_iterator *iter); extern int seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags); -extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, - unsigned long ip, unsigned long sym_flags); extern int trace_print_context(struct trace_iterator *iter); extern int trace_print_lat_context(struct trace_iterator *iter); -- cgit v1.1 From 03905582fd093940cf609956adf6feb494e45346 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 28 Sep 2015 15:37:49 -0400 Subject: tracing: Move "display-graph" option to main options In order to facilitate making all tracer options visible even when the tracer is not active, we need to get rid of duplicate options. Any option that is shared between multiple tracers really should be a main option. As the wakeup and irqsoff tracers both use the "display-graph" option, and use it exactly the same way, move that option from the tracer options to the main options and consolidate them. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 1 + kernel/trace/trace.h | 1 + kernel/trace/trace_irqsoff.c | 46 ++++++++++++--------------------------- kernel/trace/trace_sched_wakeup.c | 46 ++++++++++++--------------------------- 4 files changed, 30 insertions(+), 64 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5d3ce29..9a4ef5a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -882,6 +882,7 @@ static const char *trace_options[] = { "irq-info", "markers", "function-trace", + "display-graph", NULL }; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 107cf08..dfa3cd2 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -913,6 +913,7 @@ enum trace_iterator_flags { TRACE_ITER_IRQ_INFO = 0x800000, TRACE_ITER_MARKERS = 0x1000000, TRACE_ITER_FUNCTION = 0x2000000, + TRACE_ITER_DISPLAY_GRAPH = 0x4000000, }; /* diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 8523ea3..446480a 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -57,22 +57,16 @@ irq_trace(void) # define irq_trace() (0) #endif -#define TRACE_DISPLAY_GRAPH 1 +#define is_graph() (trace_flags & TRACE_ITER_DISPLAY_GRAPH) -static struct tracer_opt trace_opts[] = { #ifdef CONFIG_FUNCTION_GRAPH_TRACER - /* display latency trace as call graph */ - { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) }, +static int irqsoff_display_graph(struct trace_array *tr, int set); +#else +static inline int irqsoff_display_graph(struct trace_array *tr, int set) +{ + return -EINVAL; +} #endif - { } /* Empty entry */ -}; - -static struct tracer_flags tracer_flags = { - .val = 0, - .opts = trace_opts, -}; - -#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH) /* * Sequence count - we record it when starting a measurement and @@ -152,14 +146,10 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip, #endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER -static int -irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) +static int irqsoff_display_graph(struct trace_array *tr, int set) { int cpu; - if (!(bit & TRACE_DISPLAY_GRAPH)) - return -EINVAL; - if (!(is_graph() ^ set)) return 0; @@ -259,12 +249,6 @@ __trace_function(struct trace_array *tr, #else #define __trace_function trace_function -static int -irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) -{ - return -EINVAL; -} - static int irqsoff_graph_entry(struct ftrace_graph_ent *trace) { return -1; @@ -556,12 +540,13 @@ static void unregister_irqsoff_function(struct trace_array *tr, int graph) function_enabled = false; } -static void irqsoff_function_set(struct trace_array *tr, int set) +static int irqsoff_function_set(struct trace_array *tr, int set) { if (set) register_irqsoff_function(tr, is_graph(), 1); else unregister_irqsoff_function(tr, is_graph()); + return 0; } static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set) @@ -569,7 +554,10 @@ static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set) struct tracer *tracer = tr->current_trace; if (mask & TRACE_ITER_FUNCTION) - irqsoff_function_set(tr, set); + return irqsoff_function_set(tr, set); + + if (mask & TRACE_ITER_DISPLAY_GRAPH) + return irqsoff_display_graph(tr, set); return trace_keep_overwrite(tracer, mask, set); } @@ -666,8 +654,6 @@ static struct tracer irqsoff_tracer __read_mostly = .print_max = true, .print_header = irqsoff_print_header, .print_line = irqsoff_print_line, - .flags = &tracer_flags, - .set_flag = irqsoff_set_flag, .flag_changed = irqsoff_flag_changed, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_irqsoff, @@ -700,8 +686,6 @@ static struct tracer preemptoff_tracer __read_mostly = .print_max = true, .print_header = irqsoff_print_header, .print_line = irqsoff_print_line, - .flags = &tracer_flags, - .set_flag = irqsoff_set_flag, .flag_changed = irqsoff_flag_changed, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_preemptoff, @@ -736,8 +720,6 @@ static struct tracer preemptirqsoff_tracer __read_mostly = .print_max = true, .print_header = irqsoff_print_header, .print_line = irqsoff_print_line, - .flags = &tracer_flags, - .set_flag = irqsoff_set_flag, .flag_changed = irqsoff_flag_changed, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_preemptirqsoff, diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index c29d49e..f5d2e65 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -40,22 +40,17 @@ static void wakeup_graph_return(struct ftrace_graph_ret *trace); static int save_flags; static bool function_enabled; -#define TRACE_DISPLAY_GRAPH 1 +#define is_graph() (trace_flags & TRACE_ITER_DISPLAY_GRAPH) -static struct tracer_opt trace_opts[] = { #ifdef CONFIG_FUNCTION_GRAPH_TRACER - /* display latency trace as call graph */ - { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) }, +static int wakeup_display_graph(struct trace_array *tr, int set); +#else +static inline int wakeup_display_graph(struct trace_array *tr, int set) +{ + return -EINVAL; +} #endif - { } /* Empty entry */ -}; -static struct tracer_flags tracer_flags = { - .val = 0, - .opts = trace_opts, -}; - -#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH) #ifdef CONFIG_FUNCTION_TRACER @@ -163,12 +158,13 @@ static void unregister_wakeup_function(struct trace_array *tr, int graph) function_enabled = false; } -static void wakeup_function_set(struct trace_array *tr, int set) +static int wakeup_function_set(struct trace_array *tr, int set) { if (set) register_wakeup_function(tr, is_graph(), 1); else unregister_wakeup_function(tr, is_graph()); + return 0; } static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) @@ -176,7 +172,10 @@ static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) struct tracer *tracer = tr->current_trace; if (mask & TRACE_ITER_FUNCTION) - wakeup_function_set(tr, set); + return wakeup_function_set(tr, set); + + if (mask & TRACE_ITER_DISPLAY_GRAPH) + return wakeup_display_graph(tr, set); return trace_keep_overwrite(tracer, mask, set); } @@ -203,13 +202,8 @@ static void stop_func_tracer(struct trace_array *tr, int graph) } #ifdef CONFIG_FUNCTION_GRAPH_TRACER -static int -wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) +static int wakeup_display_graph(struct trace_array *tr, int set) { - - if (!(bit & TRACE_DISPLAY_GRAPH)) - return -EINVAL; - if (!(is_graph() ^ set)) return 0; @@ -306,12 +300,6 @@ __trace_function(struct trace_array *tr, #else #define __trace_function trace_function -static int -wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) -{ - return -EINVAL; -} - static int wakeup_graph_entry(struct ftrace_graph_ent *trace) { return -1; @@ -740,8 +728,6 @@ static struct tracer wakeup_tracer __read_mostly = .print_max = true, .print_header = wakeup_print_header, .print_line = wakeup_print_line, - .flags = &tracer_flags, - .set_flag = wakeup_set_flag, .flag_changed = wakeup_flag_changed, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, @@ -762,8 +748,6 @@ static struct tracer wakeup_rt_tracer __read_mostly = .print_max = true, .print_header = wakeup_print_header, .print_line = wakeup_print_line, - .flags = &tracer_flags, - .set_flag = wakeup_set_flag, .flag_changed = wakeup_flag_changed, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, @@ -784,8 +768,6 @@ static struct tracer wakeup_dl_tracer __read_mostly = .print_max = true, .print_header = wakeup_print_header, .print_line = wakeup_print_line, - .flags = &tracer_flags, - .set_flag = wakeup_set_flag, .flag_changed = wakeup_flag_changed, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, -- cgit v1.1 From 938db5f569247d13910d4542666709623c4253b0 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 28 Sep 2015 16:21:55 -0400 Subject: tracing: Remove unused tracing option "ftrace_preempt" There was a time where the function tracing would disable interrupts unless specifically told not to, where it would only disable preemption. With the new lockless code, the function tracing never disalbes interrupts and just uses disabling of preemption. Remove the option "ftrace_preempt" as it does nothing anyway. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 1 - kernel/trace/trace.h | 33 ++++++++++++++++----------------- 2 files changed, 16 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9a4ef5a..f2fbf61 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -866,7 +866,6 @@ static const char *trace_options[] = { "block", "stacktrace", "trace_printk", - "ftrace_preempt", "branch", "annotate", "userstacktrace", diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index dfa3cd2..19d5c41 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -897,23 +897,22 @@ enum trace_iterator_flags { TRACE_ITER_BLOCK = 0x80, TRACE_ITER_STACKTRACE = 0x100, TRACE_ITER_PRINTK = 0x200, - TRACE_ITER_PREEMPTONLY = 0x400, - TRACE_ITER_BRANCH = 0x800, - TRACE_ITER_ANNOTATE = 0x1000, - TRACE_ITER_USERSTACKTRACE = 0x2000, - TRACE_ITER_SYM_USEROBJ = 0x4000, - TRACE_ITER_PRINTK_MSGONLY = 0x8000, - TRACE_ITER_CONTEXT_INFO = 0x10000, /* Print pid/cpu/time */ - TRACE_ITER_LATENCY_FMT = 0x20000, - TRACE_ITER_SLEEP_TIME = 0x40000, - TRACE_ITER_GRAPH_TIME = 0x80000, - TRACE_ITER_RECORD_CMD = 0x100000, - TRACE_ITER_OVERWRITE = 0x200000, - TRACE_ITER_STOP_ON_FREE = 0x400000, - TRACE_ITER_IRQ_INFO = 0x800000, - TRACE_ITER_MARKERS = 0x1000000, - TRACE_ITER_FUNCTION = 0x2000000, - TRACE_ITER_DISPLAY_GRAPH = 0x4000000, + TRACE_ITER_BRANCH = 0x400, + TRACE_ITER_ANNOTATE = 0x800, + TRACE_ITER_USERSTACKTRACE = 0x1000, + TRACE_ITER_SYM_USEROBJ = 0x2000, + TRACE_ITER_PRINTK_MSGONLY = 0x4000, + TRACE_ITER_CONTEXT_INFO = 0x8000, /* Print pid/cpu/time */ + TRACE_ITER_LATENCY_FMT = 0x10000, + TRACE_ITER_SLEEP_TIME = 0x20000, + TRACE_ITER_GRAPH_TIME = 0x40000, + TRACE_ITER_RECORD_CMD = 0x80000, + TRACE_ITER_OVERWRITE = 0x100000, + TRACE_ITER_STOP_ON_FREE = 0x200000, + TRACE_ITER_IRQ_INFO = 0x400000, + TRACE_ITER_MARKERS = 0x800000, + TRACE_ITER_FUNCTION = 0x1000000, + TRACE_ITER_DISPLAY_GRAPH = 0x2000000, }; /* -- cgit v1.1 From ce3fed628ecc86d81fdb2be5a5c336c636960bfe Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 29 Sep 2015 09:22:05 -0400 Subject: tracing: Use enums instead of hard coded bitmasks for TRACE_ITER flags Using enums with FLAG_BIT and then defining a FLAG = (1 << FLAG_BIT), is a bit more robust as we require that there are no bits out of order or skipped to match the file names that represent the bits. Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 81 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 55 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 19d5c41..31d8395 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -886,33 +886,62 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, * NOTE: These bits must match the trace_options array in * trace.c. */ +enum trace_iterator_bits { + TRACE_ITER_PRINT_PARENT_BIT = 0, + TRACE_ITER_SYM_OFFSET_BIT, + TRACE_ITER_SYM_ADDR_BIT, + TRACE_ITER_VERBOSE_BIT, + TRACE_ITER_RAW_BIT, + TRACE_ITER_HEX_BIT, + TRACE_ITER_BIN_BIT, + TRACE_ITER_BLOCK_BIT, + TRACE_ITER_STACKTRACE_BIT, + TRACE_ITER_PRINTK_BIT, + TRACE_ITER_BRANCH_BIT, + TRACE_ITER_ANNOTATE_BIT, + TRACE_ITER_USERSTACKTRACE_BIT, + TRACE_ITER_SYM_USEROBJ_BIT, + TRACE_ITER_PRINTK_MSGONLY_BIT, + TRACE_ITER_CONTEXT_INFO_BIT, /* Print pid/cpu/time */ + TRACE_ITER_LATENCY_FMT_BIT, + TRACE_ITER_SLEEP_TIME_BIT, + TRACE_ITER_GRAPH_TIME_BIT, + TRACE_ITER_RECORD_CMD_BIT, + TRACE_ITER_OVERWRITE_BIT, + TRACE_ITER_STOP_ON_FREE_BIT, + TRACE_ITER_IRQ_INFO_BIT, + TRACE_ITER_MARKERS_BIT, + TRACE_ITER_FUNCTION_BIT, + TRACE_ITER_DISPLAY_GRAPH_BIT, +}; + enum trace_iterator_flags { - TRACE_ITER_PRINT_PARENT = 0x01, - TRACE_ITER_SYM_OFFSET = 0x02, - TRACE_ITER_SYM_ADDR = 0x04, - TRACE_ITER_VERBOSE = 0x08, - TRACE_ITER_RAW = 0x10, - TRACE_ITER_HEX = 0x20, - TRACE_ITER_BIN = 0x40, - TRACE_ITER_BLOCK = 0x80, - TRACE_ITER_STACKTRACE = 0x100, - TRACE_ITER_PRINTK = 0x200, - TRACE_ITER_BRANCH = 0x400, - TRACE_ITER_ANNOTATE = 0x800, - TRACE_ITER_USERSTACKTRACE = 0x1000, - TRACE_ITER_SYM_USEROBJ = 0x2000, - TRACE_ITER_PRINTK_MSGONLY = 0x4000, - TRACE_ITER_CONTEXT_INFO = 0x8000, /* Print pid/cpu/time */ - TRACE_ITER_LATENCY_FMT = 0x10000, - TRACE_ITER_SLEEP_TIME = 0x20000, - TRACE_ITER_GRAPH_TIME = 0x40000, - TRACE_ITER_RECORD_CMD = 0x80000, - TRACE_ITER_OVERWRITE = 0x100000, - TRACE_ITER_STOP_ON_FREE = 0x200000, - TRACE_ITER_IRQ_INFO = 0x400000, - TRACE_ITER_MARKERS = 0x800000, - TRACE_ITER_FUNCTION = 0x1000000, - TRACE_ITER_DISPLAY_GRAPH = 0x2000000, + TRACE_ITER_PRINT_PARENT = (1 << TRACE_ITER_PRINT_PARENT_BIT), + TRACE_ITER_SYM_OFFSET = (1 << TRACE_ITER_SYM_OFFSET_BIT), + TRACE_ITER_SYM_ADDR = (1 << TRACE_ITER_SYM_ADDR_BIT), + TRACE_ITER_VERBOSE = (1 << TRACE_ITER_VERBOSE_BIT), + TRACE_ITER_RAW = (1 << TRACE_ITER_RAW_BIT), + TRACE_ITER_HEX = (1 << TRACE_ITER_HEX_BIT), + TRACE_ITER_BIN = (1 << TRACE_ITER_BIN_BIT), + TRACE_ITER_BLOCK = (1 << TRACE_ITER_BLOCK_BIT), + TRACE_ITER_STACKTRACE = (1 << TRACE_ITER_STACKTRACE_BIT), + TRACE_ITER_PRINTK = (1 << TRACE_ITER_PRINTK_BIT), + TRACE_ITER_BRANCH = (1 << TRACE_ITER_BRANCH_BIT), + TRACE_ITER_ANNOTATE = (1 << TRACE_ITER_ANNOTATE_BIT), + TRACE_ITER_USERSTACKTRACE = (1 << TRACE_ITER_USERSTACKTRACE_BIT), + TRACE_ITER_SYM_USEROBJ = (1 << TRACE_ITER_SYM_USEROBJ_BIT), + TRACE_ITER_PRINTK_MSGONLY = (1 << TRACE_ITER_PRINTK_MSGONLY_BIT), + TRACE_ITER_CONTEXT_INFO = (1 << TRACE_ITER_CONTEXT_INFO_BIT), + TRACE_ITER_LATENCY_FMT = (1 << TRACE_ITER_LATENCY_FMT_BIT), + TRACE_ITER_SLEEP_TIME = (1 << TRACE_ITER_SLEEP_TIME_BIT), + TRACE_ITER_GRAPH_TIME = (1 << TRACE_ITER_GRAPH_TIME_BIT), + TRACE_ITER_RECORD_CMD = (1 << TRACE_ITER_RECORD_CMD_BIT), + TRACE_ITER_OVERWRITE = (1 << TRACE_ITER_OVERWRITE_BIT), + TRACE_ITER_STOP_ON_FREE = (1 << TRACE_ITER_STOP_ON_FREE_BIT), + TRACE_ITER_IRQ_INFO = (1 << TRACE_ITER_IRQ_INFO_BIT), + TRACE_ITER_MARKERS = (1 << TRACE_ITER_MARKERS_BIT), + TRACE_ITER_FUNCTION = (1 << TRACE_ITER_FUNCTION_BIT), + TRACE_ITER_DISPLAY_GRAPH = (1 << TRACE_ITER_DISPLAY_GRAPH_BIT), }; /* -- cgit v1.1 From a3418a364ec3c8f0c29bf3f4cfc71dc6f240150e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 29 Sep 2015 09:43:30 -0400 Subject: tracing: Use TRACE_FLAGS macro to keep enums and strings matched Use a cute little macro trick to keep the names of the trace flags file guaranteed to match the corresponding masks. The macro TRACE_FLAGS is defined as a serious of enum names followed by the string name of the file that matches it. For example: #define TRACE_FLAGS \ C(PRINT_PARENT, "print-parent"), \ C(SYM_OFFSET, "sym-offset"), \ C(SYM_ADDR, "sym-addr"), \ C(VERBOSE, "verbose"), Now we can define the following: #undef C #define C(a, b) TRACE_ITER_##a##_BIT enum trace_iterator_bits { TRACE_FLAGS }; The above creates: enum trace_iterator_bits { TRACE_ITER_PRINT_PARENT_BIT, TRACE_ITER_SYM_OFFSET_BIT, TRACE_ITER_SYM_ADDR_BIT, TRACE_ITER_VERBOSE_BIT, }; Then we can redefine C as: #undef C #define C(a, b) TRACE_ITER_##a = (1 << TRACE_ITER_##a##_BIT) enum trace_iterator_flags { TRACE_FLAGS }; Which creates: enum trace_iterator_flags { TRACE_ITER_PRINT_PARENT = (1 << TRACE_ITER_PRINT_PARENT_BIT), TRACE_ITER_SYM_OFFSET = (1 << TRACE_ITER_SYM_OFFSET_BIT), TRACE_ITER_SYM_ADDR = (1 << TRACE_ITER_SYM_ADDR_BIT), TRACE_ITER_VERBOSE = (1 << TRACE_ITER_VERBOSE_BIT), }; Then finally we can create the list of file names: #undef C #define C(a, b) b static const char *trace_options[] = { TRACE_FLAGS NULL }; Which creates: static const char *trace_options[] = { "print-parent", "sym-offset", "sym-addr", "verbose", NULL }; The importance of this is that the strings match the bit index. trace_options[TRACE_ITER_SYM_ADDR_BIT] == "sym-addr" Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 36 +++++------------- kernel/trace/trace.h | 102 +++++++++++++++++++++++---------------------------- 2 files changed, 55 insertions(+), 83 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f2fbf61..e80e380 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -854,34 +854,18 @@ unsigned long nsecs_to_usecs(unsigned long nsecs) return nsecs / 1000; } +/* + * TRACE_FLAGS is defined as a tuple matching bit masks with strings. + * It uses C(a, b) where 'a' is the enum name and 'b' is the string that + * matches it. By defining "C(a, b) b", TRACE_FLAGS becomes a list + * of strings in the order that the enums were defined. + */ +#undef C +#define C(a, b) b + /* These must match the bit postions in trace_iterator_flags */ static const char *trace_options[] = { - "print-parent", - "sym-offset", - "sym-addr", - "verbose", - "raw", - "hex", - "bin", - "block", - "stacktrace", - "trace_printk", - "branch", - "annotate", - "userstacktrace", - "sym-userobj", - "printk-msg-only", - "context-info", - "latency-format", - "sleep-time", - "graph-time", - "record-cmd", - "overwrite", - "disable_on_free", - "irq-info", - "markers", - "function-trace", - "display-graph", + TRACE_FLAGS NULL }; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 31d8395..d164845 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -884,65 +884,53 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, * positions into trace_flags that controls the output. * * NOTE: These bits must match the trace_options array in - * trace.c. + * trace.c (this macro guarantees it). */ -enum trace_iterator_bits { - TRACE_ITER_PRINT_PARENT_BIT = 0, - TRACE_ITER_SYM_OFFSET_BIT, - TRACE_ITER_SYM_ADDR_BIT, - TRACE_ITER_VERBOSE_BIT, - TRACE_ITER_RAW_BIT, - TRACE_ITER_HEX_BIT, - TRACE_ITER_BIN_BIT, - TRACE_ITER_BLOCK_BIT, - TRACE_ITER_STACKTRACE_BIT, - TRACE_ITER_PRINTK_BIT, - TRACE_ITER_BRANCH_BIT, - TRACE_ITER_ANNOTATE_BIT, - TRACE_ITER_USERSTACKTRACE_BIT, - TRACE_ITER_SYM_USEROBJ_BIT, - TRACE_ITER_PRINTK_MSGONLY_BIT, - TRACE_ITER_CONTEXT_INFO_BIT, /* Print pid/cpu/time */ - TRACE_ITER_LATENCY_FMT_BIT, - TRACE_ITER_SLEEP_TIME_BIT, - TRACE_ITER_GRAPH_TIME_BIT, - TRACE_ITER_RECORD_CMD_BIT, - TRACE_ITER_OVERWRITE_BIT, - TRACE_ITER_STOP_ON_FREE_BIT, - TRACE_ITER_IRQ_INFO_BIT, - TRACE_ITER_MARKERS_BIT, - TRACE_ITER_FUNCTION_BIT, - TRACE_ITER_DISPLAY_GRAPH_BIT, -}; +#define TRACE_FLAGS \ + C(PRINT_PARENT, "print-parent"), \ + C(SYM_OFFSET, "sym-offset"), \ + C(SYM_ADDR, "sym-addr"), \ + C(VERBOSE, "verbose"), \ + C(RAW, "raw"), \ + C(HEX, "hex"), \ + C(BIN, "bin"), \ + C(BLOCK, "block"), \ + C(STACKTRACE, "stacktrace"), \ + C(PRINTK, "trace_printk"), \ + C(BRANCH, "branch"), \ + C(ANNOTATE, "annotate"), \ + C(USERSTACKTRACE, "userstacktrace"), \ + C(SYM_USEROBJ, "sym-userobj"), \ + C(PRINTK_MSGONLY, "printk-msg-only"), \ + C(CONTEXT_INFO, "context-info"), /* Print pid/cpu/time */ \ + C(LATENCY_FMT, "latency-format"), \ + C(SLEEP_TIME, "sleep-time"), \ + C(GRAPH_TIME, "graph-time"), \ + C(RECORD_CMD, "record-cmd"), \ + C(OVERWRITE, "overwrite"), \ + C(STOP_ON_FREE, "disable_on_free"), \ + C(IRQ_INFO, "irq-info"), \ + C(MARKERS, "markers"), \ + C(FUNCTION, "function-trace"), \ + C(DISPLAY_GRAPH, "display-graph"), -enum trace_iterator_flags { - TRACE_ITER_PRINT_PARENT = (1 << TRACE_ITER_PRINT_PARENT_BIT), - TRACE_ITER_SYM_OFFSET = (1 << TRACE_ITER_SYM_OFFSET_BIT), - TRACE_ITER_SYM_ADDR = (1 << TRACE_ITER_SYM_ADDR_BIT), - TRACE_ITER_VERBOSE = (1 << TRACE_ITER_VERBOSE_BIT), - TRACE_ITER_RAW = (1 << TRACE_ITER_RAW_BIT), - TRACE_ITER_HEX = (1 << TRACE_ITER_HEX_BIT), - TRACE_ITER_BIN = (1 << TRACE_ITER_BIN_BIT), - TRACE_ITER_BLOCK = (1 << TRACE_ITER_BLOCK_BIT), - TRACE_ITER_STACKTRACE = (1 << TRACE_ITER_STACKTRACE_BIT), - TRACE_ITER_PRINTK = (1 << TRACE_ITER_PRINTK_BIT), - TRACE_ITER_BRANCH = (1 << TRACE_ITER_BRANCH_BIT), - TRACE_ITER_ANNOTATE = (1 << TRACE_ITER_ANNOTATE_BIT), - TRACE_ITER_USERSTACKTRACE = (1 << TRACE_ITER_USERSTACKTRACE_BIT), - TRACE_ITER_SYM_USEROBJ = (1 << TRACE_ITER_SYM_USEROBJ_BIT), - TRACE_ITER_PRINTK_MSGONLY = (1 << TRACE_ITER_PRINTK_MSGONLY_BIT), - TRACE_ITER_CONTEXT_INFO = (1 << TRACE_ITER_CONTEXT_INFO_BIT), - TRACE_ITER_LATENCY_FMT = (1 << TRACE_ITER_LATENCY_FMT_BIT), - TRACE_ITER_SLEEP_TIME = (1 << TRACE_ITER_SLEEP_TIME_BIT), - TRACE_ITER_GRAPH_TIME = (1 << TRACE_ITER_GRAPH_TIME_BIT), - TRACE_ITER_RECORD_CMD = (1 << TRACE_ITER_RECORD_CMD_BIT), - TRACE_ITER_OVERWRITE = (1 << TRACE_ITER_OVERWRITE_BIT), - TRACE_ITER_STOP_ON_FREE = (1 << TRACE_ITER_STOP_ON_FREE_BIT), - TRACE_ITER_IRQ_INFO = (1 << TRACE_ITER_IRQ_INFO_BIT), - TRACE_ITER_MARKERS = (1 << TRACE_ITER_MARKERS_BIT), - TRACE_ITER_FUNCTION = (1 << TRACE_ITER_FUNCTION_BIT), - TRACE_ITER_DISPLAY_GRAPH = (1 << TRACE_ITER_DISPLAY_GRAPH_BIT), -}; +/* + * By defining C, we can make TRACE_FLAGS a list of bit names + * that will define the bits for the flag masks. + */ +#undef C +#define C(a, b) TRACE_ITER_##a##_BIT + +enum trace_iterator_bits { TRACE_FLAGS }; + +/* + * By redefining C, we can make TRACE_FLAGS a list of masks that + * use the bits as defined above. + */ +#undef C +#define C(a, b) TRACE_ITER_##a = (1 << TRACE_ITER_##a##_BIT) + +enum trace_iterator_flags { TRACE_FLAGS }; /* * TRACE_ITER_SYM_MASK masks the options in trace_flags that -- cgit v1.1 From 729358da95a1b3850ef892e9384f58932da1dc69 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 29 Sep 2015 10:15:10 -0400 Subject: tracing: Only create function graph options when it is compiled in Do not create fuction graph tracer options when function graph tracer is not even compiled in. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 11 +++++++---- kernel/trace/trace.h | 20 +++++++++++++++++--- kernel/trace/trace_irqsoff.c | 6 ++++-- kernel/trace/trace_sched_wakeup.c | 6 ++++-- 4 files changed, 32 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e80e380..68fcb40f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -489,10 +489,13 @@ static inline void ftrace_trace_stack(struct ring_buffer *buffer, #endif /* trace_flags holds trace_options default values */ -unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | - TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | - TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | - TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | TRACE_ITER_FUNCTION; +unsigned long trace_flags = + FUNCTION_GRAPH_DEFAULT_FLAGS | + TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | + TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | + TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | + TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | TRACE_ITER_FUNCTION + ; static void tracer_tracing_on(struct trace_array *tr) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d164845..33cd097 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -880,6 +880,22 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, size_t cnt, loff_t *ppos); /* + * Only create function graph options if function graph is configured. + */ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +# define FGRAPH_FLAGS \ + C(SLEEP_TIME, "sleep-time"), \ + C(GRAPH_TIME, "graph-time"), \ + C(DISPLAY_GRAPH, "display-graph"), +/* Initially set for trace_flags */ +# define FUNCTION_GRAPH_DEFAULT_FLAGS \ + (TRACE_ITER_SLEEP_TIME | TRACE_ITER_GRAPH_TIME) +#else +# define FGRAPH_FLAGS +# define FUNCTION_GRAPH_DEFAULT_FLAGS 0UL +#endif + +/* * trace_iterator_flags is an enumeration that defines bit * positions into trace_flags that controls the output. * @@ -904,15 +920,13 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, C(PRINTK_MSGONLY, "printk-msg-only"), \ C(CONTEXT_INFO, "context-info"), /* Print pid/cpu/time */ \ C(LATENCY_FMT, "latency-format"), \ - C(SLEEP_TIME, "sleep-time"), \ - C(GRAPH_TIME, "graph-time"), \ C(RECORD_CMD, "record-cmd"), \ C(OVERWRITE, "overwrite"), \ C(STOP_ON_FREE, "disable_on_free"), \ C(IRQ_INFO, "irq-info"), \ C(MARKERS, "markers"), \ C(FUNCTION, "function-trace"), \ - C(DISPLAY_GRAPH, "display-graph"), + FGRAPH_FLAGS /* * By defining C, we can make TRACE_FLAGS a list of bit names diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 446480a..bd9cd0e 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -57,15 +57,15 @@ irq_trace(void) # define irq_trace() (0) #endif -#define is_graph() (trace_flags & TRACE_ITER_DISPLAY_GRAPH) - #ifdef CONFIG_FUNCTION_GRAPH_TRACER static int irqsoff_display_graph(struct trace_array *tr, int set); +# define is_graph() (trace_flags & TRACE_ITER_DISPLAY_GRAPH) #else static inline int irqsoff_display_graph(struct trace_array *tr, int set) { return -EINVAL; } +# define is_graph() false #endif /* @@ -556,8 +556,10 @@ static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set) if (mask & TRACE_ITER_FUNCTION) return irqsoff_function_set(tr, set); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER if (mask & TRACE_ITER_DISPLAY_GRAPH) return irqsoff_display_graph(tr, set); +#endif return trace_keep_overwrite(tracer, mask, set); } diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index f5d2e65..a6c350c 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -40,15 +40,15 @@ static void wakeup_graph_return(struct ftrace_graph_ret *trace); static int save_flags; static bool function_enabled; -#define is_graph() (trace_flags & TRACE_ITER_DISPLAY_GRAPH) - #ifdef CONFIG_FUNCTION_GRAPH_TRACER static int wakeup_display_graph(struct trace_array *tr, int set); +# define is_graph() (trace_flags & TRACE_ITER_DISPLAY_GRAPH) #else static inline int wakeup_display_graph(struct trace_array *tr, int set) { return -EINVAL; } +# define is_graph() false #endif @@ -174,8 +174,10 @@ static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) if (mask & TRACE_ITER_FUNCTION) return wakeup_function_set(tr, set); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER if (mask & TRACE_ITER_DISPLAY_GRAPH) return wakeup_display_graph(tr, set); +#endif return trace_keep_overwrite(tracer, mask, set); } -- cgit v1.1 From 4ee4301c4bab22c84df20ce694cc6932dd812be5 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 29 Sep 2015 10:19:35 -0400 Subject: tracing: Only create branch tracer options when compiled in When the branch tracer is not compiled in, do not create the option files associated to it. Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 33cd097..3f1cc45 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -895,6 +895,13 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, # define FUNCTION_GRAPH_DEFAULT_FLAGS 0UL #endif +#ifdef CONFIG_BRANCH_TRACER +# define BRANCH_FLAGS \ + C(BRANCH, "branch"), +#else +# define BRANCH_FLAGS +#endif + /* * trace_iterator_flags is an enumeration that defines bit * positions into trace_flags that controls the output. @@ -913,7 +920,6 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, C(BLOCK, "block"), \ C(STACKTRACE, "stacktrace"), \ C(PRINTK, "trace_printk"), \ - C(BRANCH, "branch"), \ C(ANNOTATE, "annotate"), \ C(USERSTACKTRACE, "userstacktrace"), \ C(SYM_USEROBJ, "sym-userobj"), \ @@ -926,7 +932,8 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, C(IRQ_INFO, "irq-info"), \ C(MARKERS, "markers"), \ C(FUNCTION, "function-trace"), \ - FGRAPH_FLAGS + FGRAPH_FLAGS \ + BRANCH_FLAGS /* * By defining C, we can make TRACE_FLAGS a list of bit names -- cgit v1.1 From a1b7febd725a2cdfc8ac245b7b7437ce4b91aecb Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Fri, 25 Sep 2015 18:09:32 +0200 Subject: genirq: Fix the documentation of request_percpu_irq The documentation of request_percpu_irq is confusing and suggest that the interrupt is not enabled at all, while it is actually enabled on the local CPU. Clarify that. Signed-off-by: Maxime Ripard Signed-off-by: Gregory CLEMENT Signed-off-by: David S. Miller --- kernel/irq/manage.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index f9a59f6..a4360f0 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1790,9 +1790,10 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act) * @devname: An ascii name for the claiming device * @dev_id: A percpu cookie passed back to the handler function * - * This call allocates interrupt resources, but doesn't - * automatically enable the interrupt. It has to be done on each - * CPU using enable_percpu_irq(). + * This call allocates interrupt resources and enables the + * interrupt on the local CPU. If the interrupt is supposed to be + * enabled on other CPUs, it has to be done on each CPU using + * enable_percpu_irq(). * * Dev_id must be globally unique. It is a per-cpu variable, and * the handler gets called with the interrupted CPU's instance of -- cgit v1.1 From aec2e2ad1786eb07814ae60988e0e306cd24a6cc Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Fri, 25 Sep 2015 18:09:33 +0200 Subject: irq: Export per-cpu irq allocation and de-allocation functions Some drivers might use the per-cpu interrupts and still might be built as a module. Export request_percpu_irq an free_percpu_irq to these user, which also make it consistent with enable/disable_percpu_irq that were exported. Reported-by: Willy Tarreau Signed-off-by: Maxime Ripard Signed-off-by: Gregory CLEMENT Signed-off-by: David S. Miller --- kernel/irq/manage.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index a4360f0..4c21386 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1761,6 +1761,7 @@ void free_percpu_irq(unsigned int irq, void __percpu *dev_id) kfree(__free_percpu_irq(irq, dev_id)); chip_bus_sync_unlock(desc); } +EXPORT_SYMBOL_GPL(free_percpu_irq); /** * setup_percpu_irq - setup a per-cpu interrupt @@ -1832,6 +1833,7 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler, return retval; } +EXPORT_SYMBOL_GPL(request_percpu_irq); /** * irq_get_irqchip_state - returns the irqchip state of a interrupt. -- cgit v1.1 From 8179e8a15b76eaec1e757da7a0f96de9f0c466c6 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 29 Sep 2015 10:24:56 -0400 Subject: tracing: Do not create function tracer options when not compiled in When the function tracer is not compiled in, do not create the option files for it. Fix up both the sched_wakeup and irqsoff tracers to handle the change. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 4 ++-- kernel/trace/trace.h | 11 +++++++++- kernel/trace/trace_irqsoff.c | 28 +++++++++++++++++++++----- kernel/trace/trace_sched_wakeup.c | 42 ++++++++++++++++++++++++++------------- 4 files changed, 63 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 68fcb40f..cb223ad 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -490,11 +490,11 @@ static inline void ftrace_trace_stack(struct ring_buffer *buffer, /* trace_flags holds trace_options default values */ unsigned long trace_flags = - FUNCTION_GRAPH_DEFAULT_FLAGS | + FUNCTION_DEFAULT_FLAGS | FUNCTION_GRAPH_DEFAULT_FLAGS | TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | - TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | TRACE_ITER_FUNCTION + TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS ; static void tracer_tracing_on(struct trace_array *tr) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3f1cc45..b389d40 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -902,6 +902,15 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, # define BRANCH_FLAGS #endif +#ifdef CONFIG_FUNCTION_TRACER +# define FUNCTION_FLAGS \ + C(FUNCTION, "function-trace"), +# define FUNCTION_DEFAULT_FLAGS TRACE_ITER_FUNCTION +#else +# define FUNCTION_FLAGS +# define FUNCTION_DEFAULT_FLAGS 0UL +#endif + /* * trace_iterator_flags is an enumeration that defines bit * positions into trace_flags that controls the output. @@ -931,7 +940,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, C(STOP_ON_FREE, "disable_on_free"), \ C(IRQ_INFO, "irq-info"), \ C(MARKERS, "markers"), \ - C(FUNCTION, "function-trace"), \ + FUNCTION_FLAGS \ FGRAPH_FLAGS \ BRANCH_FLAGS diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index bd9cd0e..c834b95 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -31,7 +31,6 @@ enum { static int trace_type __read_mostly; static int save_flags; -static bool function_enabled; static void stop_irqsoff_tracer(struct trace_array *tr, int graph); static int start_irqsoff_tracer(struct trace_array *tr, int graph); @@ -249,21 +248,23 @@ __trace_function(struct trace_array *tr, #else #define __trace_function trace_function +#ifdef CONFIG_FUNCTION_TRACER static int irqsoff_graph_entry(struct ftrace_graph_ent *trace) { return -1; } +#endif static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) { return TRACE_TYPE_UNHANDLED; } -static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { } static void irqsoff_trace_open(struct trace_iterator *iter) { } static void irqsoff_trace_close(struct trace_iterator *iter) { } #ifdef CONFIG_FUNCTION_TRACER +static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { } static void irqsoff_print_header(struct seq_file *s) { trace_default_header(s); @@ -507,6 +508,9 @@ void trace_preempt_off(unsigned long a0, unsigned long a1) } #endif /* CONFIG_PREEMPT_TRACER */ +#ifdef CONFIG_FUNCTION_TRACER +static bool function_enabled; + static int register_irqsoff_function(struct trace_array *tr, int graph, int set) { int ret; @@ -540,21 +544,35 @@ static void unregister_irqsoff_function(struct trace_array *tr, int graph) function_enabled = false; } -static int irqsoff_function_set(struct trace_array *tr, int set) +static int irqsoff_function_set(struct trace_array *tr, u32 mask, int set) { + if (!(mask & TRACE_ITER_FUNCTION)) + return 0; + if (set) register_irqsoff_function(tr, is_graph(), 1); else unregister_irqsoff_function(tr, is_graph()); + return 1; +} +#else +static int register_irqsoff_function(struct trace_array *tr, int graph, int set) +{ return 0; } +static void unregister_irqsoff_function(struct trace_array *tr, int graph) { } +static inline int irqsoff_function_set(struct trace_array *tr, u32 mask, int set) +{ + return 0; +} +#endif /* CONFIG_FUNCTION_TRACER */ static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set) { struct tracer *tracer = tr->current_trace; - if (mask & TRACE_ITER_FUNCTION) - return irqsoff_function_set(tr, set); + if (irqsoff_function_set(tr, mask, set)) + return 0; #ifdef CONFIG_FUNCTION_GRAPH_TRACER if (mask & TRACE_ITER_DISPLAY_GRAPH) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index a6c350c..4a20f61 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -34,11 +34,8 @@ static arch_spinlock_t wakeup_lock = static void wakeup_reset(struct trace_array *tr); static void __wakeup_reset(struct trace_array *tr); -static int wakeup_graph_entry(struct ftrace_graph_ent *trace); -static void wakeup_graph_return(struct ftrace_graph_ret *trace); static int save_flags; -static bool function_enabled; #ifdef CONFIG_FUNCTION_GRAPH_TRACER static int wakeup_display_graph(struct trace_array *tr, int set); @@ -46,7 +43,7 @@ static int wakeup_display_graph(struct trace_array *tr, int set); #else static inline int wakeup_display_graph(struct trace_array *tr, int set) { - return -EINVAL; + return 0; } # define is_graph() false #endif @@ -54,6 +51,11 @@ static inline int wakeup_display_graph(struct trace_array *tr, int set) #ifdef CONFIG_FUNCTION_TRACER +static int wakeup_graph_entry(struct ftrace_graph_ent *trace); +static void wakeup_graph_return(struct ftrace_graph_ret *trace); + +static bool function_enabled; + /* * Prologue for the wakeup function tracers. * @@ -123,7 +125,6 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, atomic_dec(&data->disabled); preempt_enable_notrace(); } -#endif /* CONFIG_FUNCTION_TRACER */ static int register_wakeup_function(struct trace_array *tr, int graph, int set) { @@ -158,21 +159,35 @@ static void unregister_wakeup_function(struct trace_array *tr, int graph) function_enabled = false; } -static int wakeup_function_set(struct trace_array *tr, int set) +static int wakeup_function_set(struct trace_array *tr, u32 mask, int set) { + if (!(mask & TRACE_ITER_FUNCTION)) + return 0; + if (set) register_wakeup_function(tr, is_graph(), 1); else unregister_wakeup_function(tr, is_graph()); + return 1; +} +#else +static int register_wakeup_function(struct trace_array *tr, int graph, int set) +{ return 0; } +static void unregister_wakeup_function(struct trace_array *tr, int graph) { } +static int wakeup_function_set(struct trace_array *tr, u32 mask, int set) +{ + return 0; +} +#endif /* CONFIG_FUNCTION_TRACER */ static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) { struct tracer *tracer = tr->current_trace; - if (mask & TRACE_ITER_FUNCTION) - return wakeup_function_set(tr, set); + if (wakeup_function_set(tr, mask, set)) + return 0; #ifdef CONFIG_FUNCTION_GRAPH_TRACER if (mask & TRACE_ITER_DISPLAY_GRAPH) @@ -302,21 +317,20 @@ __trace_function(struct trace_array *tr, #else #define __trace_function trace_function -static int wakeup_graph_entry(struct ftrace_graph_ent *trace) -{ - return -1; -} - static enum print_line_t wakeup_print_line(struct trace_iterator *iter) { return TRACE_TYPE_UNHANDLED; } -static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } static void wakeup_trace_open(struct trace_iterator *iter) { } static void wakeup_trace_close(struct trace_iterator *iter) { } #ifdef CONFIG_FUNCTION_TRACER +static int wakeup_graph_entry(struct ftrace_graph_ent *trace) +{ + return -1; +} +static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } static void wakeup_print_header(struct seq_file *s) { trace_default_header(s); -- cgit v1.1 From 73dddbb57bb08d465dd0ecab93db0c5209e50cfe Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 29 Sep 2015 15:38:55 -0400 Subject: tracing: Only create stacktrace option when STACKTRACE is configured Only create the stacktrace trace option when CONFIG_STACKTRACE is configured. Cleaned up the ftrace_trace_stack() function call a little to allow better encapsulation of the stacktrace trace flag. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 28 +++++++++++++++------------- kernel/trace/trace.h | 9 ++++++++- 2 files changed, 23 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index cb223ad..865f3fa 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -472,8 +472,9 @@ static inline void trace_access_lock_init(void) static void __ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, int skip, int pc, struct pt_regs *regs); -static void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, - int skip, int pc); +static inline void ftrace_trace_stack(struct ring_buffer *buffer, + unsigned long flags, + int skip, int pc, struct pt_regs *regs); #else static inline void __ftrace_trace_stack(struct ring_buffer *buffer, @@ -482,7 +483,8 @@ static inline void __ftrace_trace_stack(struct ring_buffer *buffer, { } static inline void ftrace_trace_stack(struct ring_buffer *buffer, - unsigned long flags, int skip, int pc) + unsigned long flags, + int skip, int pc, struct pt_regs *regs) { } @@ -571,7 +573,7 @@ int __trace_puts(unsigned long ip, const char *str, int size) entry->buf[size] = '\0'; __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(buffer, irq_flags, 4, pc); + ftrace_trace_stack(buffer, irq_flags, 4, pc, NULL); return size; } @@ -611,7 +613,7 @@ int __trace_bputs(unsigned long ip, const char *str) entry->str = str; __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(buffer, irq_flags, 4, pc); + ftrace_trace_stack(buffer, irq_flags, 4, pc, NULL); return 1; } @@ -1685,7 +1687,7 @@ void trace_buffer_unlock_commit(struct trace_array *tr, { __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(buffer, flags, 6, pc); + ftrace_trace_stack(buffer, flags, 6, pc, NULL); ftrace_trace_userstack(buffer, flags, pc); } EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit); @@ -1737,8 +1739,7 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr, { __buffer_unlock_commit(buffer, event); - if (trace_flags & TRACE_ITER_STACKTRACE) - __ftrace_trace_stack(buffer, flags, 0, pc, regs); + ftrace_trace_stack(buffer, flags, 6, pc, regs); ftrace_trace_userstack(buffer, flags, pc); } EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs); @@ -1867,13 +1868,14 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, } -static void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, - int skip, int pc) +static inline void ftrace_trace_stack(struct ring_buffer *buffer, + unsigned long flags, + int skip, int pc, struct pt_regs *regs) { if (!(trace_flags & TRACE_ITER_STACKTRACE)) return; - __ftrace_trace_stack(buffer, flags, skip, pc, NULL); + __ftrace_trace_stack(buffer, flags, skip, pc, regs); } void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, @@ -2158,7 +2160,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) memcpy(entry->buf, tbuffer, sizeof(u32) * len); if (!call_filter_check_discard(call, entry, buffer, event)) { __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(buffer, flags, 6, pc); + ftrace_trace_stack(buffer, flags, 6, pc, NULL); } out: @@ -2210,7 +2212,7 @@ __trace_array_vprintk(struct ring_buffer *buffer, memcpy(&entry->buf, tbuffer, len + 1); if (!call_filter_check_discard(call, entry, buffer, event)) { __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(buffer, flags, 6, pc); + ftrace_trace_stack(buffer, flags, 6, pc, NULL); } out: preempt_enable_notrace(); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b389d40..af34e18 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -911,6 +911,13 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, # define FUNCTION_DEFAULT_FLAGS 0UL #endif +#ifdef CONFIG_STACKTRACE +# define STACK_FLAGS \ + C(STACKTRACE, "stacktrace"), +#else +# define STACK_FLAGS +#endif + /* * trace_iterator_flags is an enumeration that defines bit * positions into trace_flags that controls the output. @@ -927,7 +934,6 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, C(HEX, "hex"), \ C(BIN, "bin"), \ C(BLOCK, "block"), \ - C(STACKTRACE, "stacktrace"), \ C(PRINTK, "trace_printk"), \ C(ANNOTATE, "annotate"), \ C(USERSTACKTRACE, "userstacktrace"), \ @@ -942,6 +948,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, C(MARKERS, "markers"), \ FUNCTION_FLAGS \ FGRAPH_FLAGS \ + STACK_FLAGS \ BRANCH_FLAGS /* -- cgit v1.1 From 41d9c0beccbb92397bea8b04a6afd1253c064a1a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 29 Sep 2015 17:31:55 -0400 Subject: tracing: Always show all tracer options in the options directory There are options that are unique to a specific tracer (like function and function graph). Currently, these options are only visible in the options directory when the tracer is enabled. This has been a pain, especially for something like the func_stack_trace option that if used inappropriately, could bring the system to a crawl. But the only way to see it, is to enable the function tracer. For example, if one had done: # cd /sys/kernel/tracing # echo __schedule > set_ftrace_filter # echo 1 > options/func_stack_trace # echo function > current_tracer The __schedule call will be traced and a stack trace will also be recorded there. Now when you were done, you may do... # echo nop > current_tracer # echo > set_ftrace_filter But you forgot to disable the func_stack_trace. The only way to disable it is to re-enable function tracing first. If you do not add a filter to set_ftrace_filter and just do: # echo function > current_tracer Now you would be performing a stack trace on *every* function! On some systems, that causes a live lock. Others may take a few minutes to fix your mistake. Having the func_stack_trace option visible allows you to check it and disable it before enabling the funtion tracer. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 53 +++++++++++++++++++--------------------------------- kernel/trace/trace.h | 8 ++++++++ 2 files changed, 27 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 865f3fa..7446d42 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1213,6 +1213,8 @@ static inline int run_tracer_selftest(struct tracer *type) } #endif /* CONFIG_FTRACE_STARTUP_TEST */ +static void add_tracer_options(struct trace_array *tr, struct tracer *t); + /** * register_tracer - register a tracer with the ftrace system. * @type - the plugin for the tracer @@ -1262,6 +1264,7 @@ int register_tracer(struct tracer *type) type->next = trace_types; trace_types = type; + add_tracer_options(&global_trace, type); out: tracing_selftest_running = false; @@ -4287,9 +4290,6 @@ struct trace_option_dentry; static struct trace_option_dentry * create_trace_option_files(struct trace_array *tr, struct tracer *tracer); -static void -destroy_trace_option_files(struct trace_option_dentry *topts); - /* * Used to clear out the tracer before deletion of an instance. * Must have trace_types_lock held. @@ -4307,10 +4307,8 @@ static void tracing_set_nop(struct trace_array *tr) tr->current_trace = &nop_trace; } -static void update_tracer_options(struct trace_array *tr, struct tracer *t) +static void add_tracer_options(struct trace_array *tr, struct tracer *t) { - static struct trace_option_dentry *topts; - /* Only enable if the directory has been created already. */ if (!tr->dir) return; @@ -4319,8 +4317,11 @@ static void update_tracer_options(struct trace_array *tr, struct tracer *t) if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL)) return; - destroy_trace_option_files(topts); - topts = create_trace_option_files(tr, t); + /* Ignore if they were already created */ + if (t->topts) + return; + + t->topts = create_trace_option_files(tr, t); } static int tracing_set_tracer(struct trace_array *tr, const char *buf) @@ -4389,7 +4390,6 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) free_snapshot(tr); } #endif - update_tracer_options(tr, t); #ifdef CONFIG_TRACER_MAX_TRACE if (t->use_max_tr && !had_max_tr) { @@ -6119,13 +6119,6 @@ tracing_init_tracefs_percpu(struct trace_array *tr, long cpu) #include "trace_selftest.c" #endif -struct trace_option_dentry { - struct tracer_opt *opt; - struct tracer_flags *flags; - struct trace_array *tr; - struct dentry *entry; -}; - static ssize_t trace_options_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -6310,27 +6303,17 @@ create_trace_option_files(struct trace_array *tr, struct tracer *tracer) if (!topts) return NULL; - for (cnt = 0; opts[cnt].name; cnt++) + for (cnt = 0; opts[cnt].name; cnt++) { create_trace_option_file(tr, &topts[cnt], flags, &opts[cnt]); + WARN_ONCE(topts[cnt].entry == NULL, + "Failed to create trace option: %s", + opts[cnt].name); + } return topts; } -static void -destroy_trace_option_files(struct trace_option_dentry *topts) -{ - int cnt; - - if (!topts) - return; - - for (cnt = 0; topts[cnt].opt; cnt++) - tracefs_remove(topts[cnt].entry); - - kfree(topts); -} - static struct dentry * create_trace_option_core_file(struct trace_array *tr, const char *option, long index) @@ -6812,6 +6795,7 @@ static struct notifier_block trace_module_nb = { static __init int tracer_init_tracefs(void) { struct dentry *d_tracer; + struct tracer *t; trace_access_lock_init(); @@ -6850,9 +6834,10 @@ static __init int tracer_init_tracefs(void) create_trace_options_dir(&global_trace); - /* If the tracer was started via cmdline, create options for it here */ - if (global_trace.current_trace != &nop_trace) - update_tracer_options(&global_trace, global_trace.current_trace); + mutex_lock(&trace_types_lock); + for (t = trace_types; t; t = t->next) + add_tracer_options(&global_trace, t); + mutex_unlock(&trace_types_lock); return 0; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index af34e18..8ed9787 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -333,6 +333,13 @@ struct tracer_flags { #define TRACER_OPT(s, b) .name = #s, .bit = b +struct trace_option_dentry { + struct tracer_opt *opt; + struct tracer_flags *flags; + struct trace_array *tr; + struct dentry *entry; +}; + /** * struct tracer - a specific tracer and its callbacks to interact with tracefs * @name: the name chosen to select it on the available_tracers file @@ -387,6 +394,7 @@ struct tracer { u32 mask, int set); struct tracer *next; struct tracer_flags *flags; + struct trace_option_dentry *topts; int enabled; int ref; bool print_max; -- cgit v1.1 From b5e87c0581319481399b6d8e8d6972b5523c18e6 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 29 Sep 2015 18:13:33 -0400 Subject: tracing: Add build bug if we have more trace_flags than bits Add a enum that denotes the last bit of the trace_flags and have a BUILD_BUG_ON(last_bit > 32). If we add more bits than we have in trace_flags, the kernel wont build. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 6 ++++++ kernel/trace/trace.h | 6 +++++- 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7446d42..991bab9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7046,6 +7046,12 @@ __init static int tracer_alloc_buffers(void) int ring_buf_size; int ret = -ENOMEM; + /* + * Make sure we don't accidently add more trace options + * than we have bits for. + */ + BUILD_BUG_ON(TRACE_ITER_LAST_BIT > 32); + if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) goto out; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8ed9787..0715565 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -966,7 +966,11 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, #undef C #define C(a, b) TRACE_ITER_##a##_BIT -enum trace_iterator_bits { TRACE_FLAGS }; +enum trace_iterator_bits { + TRACE_FLAGS + /* Make sure we don't go more than we have bits for */ + TRACE_ITER_LAST_BIT +}; /* * By redefining C, we can make TRACE_FLAGS a list of masks that -- cgit v1.1 From b9f9108cad3998a4c8fd26051c37a451f1dff1f1 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 29 Sep 2015 18:21:35 -0400 Subject: tracing: Remove access to trace_flags in trace_printk.c In the effort to move the global trace_flags to the tracing instances, the direct access to trace_flags must be removed from trace_printk.c Instead, add a new trace_printk_enabled boolean that is set by a new access function trace_printk_control(), that will enable or disable trace_printk. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 4 +++- kernel/trace/trace.h | 1 + kernel/trace/trace_printk.c | 14 ++++++++++---- 3 files changed, 14 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 991bab9..d98789b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3555,8 +3555,10 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) #endif } - if (mask == TRACE_ITER_PRINTK) + if (mask == TRACE_ITER_PRINTK) { trace_printk_start_stop_comm(enabled); + trace_printk_control(enabled); + } return 0; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 0715565..33d1e53 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1318,6 +1318,7 @@ extern const char *__stop___trace_bprintk_fmt[]; extern const char *__start___tracepoint_str[]; extern const char *__stop___tracepoint_str[]; +void trace_printk_control(bool enabled); void trace_printk_init_buffers(void); void trace_printk_start_comm(void); int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 36c1455..1c2b285 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -178,6 +178,12 @@ static inline void format_mod_start(void) { } static inline void format_mod_stop(void) { } #endif /* CONFIG_MODULES */ +static bool __read_mostly trace_printk_enabled = true; + +void trace_printk_control(bool enabled) +{ + trace_printk_enabled = enabled; +} __initdata_or_module static struct notifier_block module_trace_bprintk_format_nb = { @@ -192,7 +198,7 @@ int __trace_bprintk(unsigned long ip, const char *fmt, ...) if (unlikely(!fmt)) return 0; - if (!(trace_flags & TRACE_ITER_PRINTK)) + if (!trace_printk_enabled) return 0; va_start(ap, fmt); @@ -207,7 +213,7 @@ int __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap) if (unlikely(!fmt)) return 0; - if (!(trace_flags & TRACE_ITER_PRINTK)) + if (!trace_printk_enabled) return 0; return trace_vbprintk(ip, fmt, ap); @@ -219,7 +225,7 @@ int __trace_printk(unsigned long ip, const char *fmt, ...) int ret; va_list ap; - if (!(trace_flags & TRACE_ITER_PRINTK)) + if (!trace_printk_enabled) return 0; va_start(ap, fmt); @@ -231,7 +237,7 @@ EXPORT_SYMBOL_GPL(__trace_printk); int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) { - if (!(trace_flags & TRACE_ITER_PRINTK)) + if (!trace_printk_enabled) return 0; return trace_vprintk(ip, fmt, ap); -- cgit v1.1 From 874bbfe600a660cba9c776b3957b1ce393151b76 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 30 Sep 2015 09:05:30 -0700 Subject: workqueue: make sure delayed work run in local cpu My system keeps crashing with below message. vmstat_update() schedules a delayed work in current cpu and expects the work runs in the cpu. schedule_delayed_work() is expected to make delayed work run in local cpu. The problem is timer can be migrated with NO_HZ. __queue_work() queues work in timer handler, which could run in a different cpu other than where the delayed work is scheduled. The end result is the delayed work runs in different cpu. The patch makes __queue_delayed_work records local cpu earlier. Where the timer runs doesn't change where the work runs with the change. [ 28.010131] ------------[ cut here ]------------ [ 28.010609] kernel BUG at ../mm/vmstat.c:1392! [ 28.011099] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC KASAN [ 28.011860] Modules linked in: [ 28.012245] CPU: 0 PID: 289 Comm: kworker/0:3 Tainted: G W4.3.0-rc3+ #634 [ 28.013065] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140709_153802- 04/01/2014 [ 28.014160] Workqueue: events vmstat_update [ 28.014571] task: ffff880117682580 ti: ffff8800ba428000 task.ti: ffff8800ba428000 [ 28.015445] RIP: 0010:[] []vmstat_update+0x31/0x80 [ 28.016282] RSP: 0018:ffff8800ba42fd80 EFLAGS: 00010297 [ 28.016812] RAX: 0000000000000000 RBX: ffff88011a858dc0 RCX:0000000000000000 [ 28.017585] RDX: ffff880117682580 RSI: ffffffff81f14d8c RDI:ffffffff81f4df8d [ 28.018366] RBP: ffff8800ba42fd90 R08: 0000000000000001 R09:0000000000000000 [ 28.019169] R10: 0000000000000000 R11: 0000000000000121 R12:ffff8800baa9f640 [ 28.019947] R13: ffff88011a81e340 R14: ffff88011a823700 R15:0000000000000000 [ 28.020071] FS: 0000000000000000(0000) GS:ffff88011a800000(0000)knlGS:0000000000000000 [ 28.020071] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 28.020071] CR2: 00007ff6144b01d0 CR3: 00000000b8e93000 CR4:00000000000006f0 [ 28.020071] Stack: [ 28.020071] ffff88011a858dc0 ffff8800baa9f640 ffff8800ba42fe00ffffffff8106bd88 [ 28.020071] ffffffff8106bd0b 0000000000000096 0000000000000000ffffffff82f9b1e8 [ 28.020071] ffffffff829f0b10 0000000000000000 ffffffff81f18460ffff88011a81e340 [ 28.020071] Call Trace: [ 28.020071] [] process_one_work+0x1c8/0x540 [ 28.020071] [] ? process_one_work+0x14b/0x540 [ 28.020071] [] worker_thread+0x114/0x460 [ 28.020071] [] ? process_one_work+0x540/0x540 [ 28.020071] [] kthread+0xf8/0x110 [ 28.020071] [] ?kthread_create_on_node+0x200/0x200 [ 28.020071] [] ret_from_fork+0x3f/0x70 [ 28.020071] [] ?kthread_create_on_node+0x200/0x200 Signed-off-by: Shaohua Li Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org # v2.6.31+ --- kernel/workqueue.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ca71582..bcb14ca 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1458,13 +1458,13 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, timer_stats_timer_set_start_info(&dwork->timer); dwork->wq = wq; + /* timer isn't guaranteed to run in this cpu, record earlier */ + if (cpu == WORK_CPU_UNBOUND) + cpu = raw_smp_processor_id(); dwork->cpu = cpu; timer->expires = jiffies + delay; - if (unlikely(cpu != WORK_CPU_UNBOUND)) - add_timer_on(timer, cpu); - else - add_timer(timer); + add_timer_on(timer, cpu); } /** -- cgit v1.1 From 55577204154c7a95c6bce4cb185366d638b238b5 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 29 Sep 2015 19:06:50 -0400 Subject: tracing: Move sleep-time and graph-time options out of the core trace_flags The sleep-time and graph-time options are only for the function graph tracer and are not used by anything else. As tracer options are now visible when the tracer is not activated, its better to move the function graph specific tracer options into the function graph tracer. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 19 +++++++++++++++++-- kernel/trace/trace.c | 2 +- kernel/trace/trace.h | 11 +++++------ kernel/trace/trace_functions_graph.c | 13 ++++++++++++- 4 files changed, 35 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b0623ac..e7638489 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -243,6 +243,11 @@ static void ftrace_sync_ipi(void *data) #ifdef CONFIG_FUNCTION_GRAPH_TRACER static void update_function_graph_func(void); + +/* Both enabled by default (can be cleared by function_graph tracer flags */ +static bool fgraph_sleep_time = true; +static bool fgraph_graph_time = true; + #else static inline void update_function_graph_func(void) { } #endif @@ -917,7 +922,7 @@ static void profile_graph_return(struct ftrace_graph_ret *trace) calltime = trace->rettime - trace->calltime; - if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) { + if (!fgraph_graph_time) { int index; index = trace->depth; @@ -5639,6 +5644,16 @@ static struct ftrace_ops graph_ops = { ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash) }; +void ftrace_graph_sleep_time_control(bool enable) +{ + fgraph_sleep_time = enable; +} + +void ftrace_graph_graph_time_control(bool enable) +{ + fgraph_graph_time = enable; +} + int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) { return 0; @@ -5707,7 +5722,7 @@ ftrace_graph_probe_sched_switch(void *ignore, * Does the user want to count the time a function was asleep. * If so, do not update the time stamps. */ - if (trace_flags & TRACE_ITER_SLEEP_TIME) + if (fgraph_sleep_time) return; timestamp = trace_clock_local(); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d98789b..e26933c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -492,7 +492,7 @@ static inline void ftrace_trace_stack(struct ring_buffer *buffer, /* trace_flags holds trace_options default values */ unsigned long trace_flags = - FUNCTION_DEFAULT_FLAGS | FUNCTION_GRAPH_DEFAULT_FLAGS | + FUNCTION_DEFAULT_FLAGS | TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 33d1e53..5219bf5 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -714,9 +714,14 @@ extern char trace_find_mark(unsigned long long duration); #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 #define TRACE_GRAPH_PRINT_IRQS 0x40 #define TRACE_GRAPH_PRINT_TAIL 0x80 +#define TRACE_GRAPH_SLEEP_TIME 0x100 +#define TRACE_GRAPH_GRAPH_TIME 0x200 #define TRACE_GRAPH_PRINT_FILL_SHIFT 28 #define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) +extern void ftrace_graph_sleep_time_control(bool enable); +extern void ftrace_graph_graph_time_control(bool enable); + extern enum print_line_t print_graph_function_flags(struct trace_iterator *iter, u32 flags); extern void print_graph_headers_flags(struct seq_file *s, u32 flags); @@ -892,15 +897,9 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER # define FGRAPH_FLAGS \ - C(SLEEP_TIME, "sleep-time"), \ - C(GRAPH_TIME, "graph-time"), \ C(DISPLAY_GRAPH, "display-graph"), -/* Initially set for trace_flags */ -# define FUNCTION_GRAPH_DEFAULT_FLAGS \ - (TRACE_ITER_SLEEP_TIME | TRACE_ITER_GRAPH_TIME) #else # define FGRAPH_FLAGS -# define FUNCTION_GRAPH_DEFAULT_FLAGS 0UL #endif #ifdef CONFIG_BRANCH_TRACER diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index ca98445..86e45c2 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -83,13 +83,18 @@ static struct tracer_opt trace_opts[] = { { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) }, /* Display function name after trailing } */ { TRACER_OPT(funcgraph-tail, TRACE_GRAPH_PRINT_TAIL) }, + /* Include sleep time (scheduled out) between entry and return */ + { TRACER_OPT(sleep-time, TRACE_GRAPH_SLEEP_TIME) }, + /* Include time within nested functions */ + { TRACER_OPT(graph-time, TRACE_GRAPH_GRAPH_TIME) }, { } /* Empty entry */ }; static struct tracer_flags tracer_flags = { /* Don't display overruns, proc, or tail by default */ .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | - TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS, + TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS | + TRACE_GRAPH_SLEEP_TIME | TRACE_GRAPH_GRAPH_TIME, .opts = trace_opts }; @@ -1362,6 +1367,12 @@ func_graph_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) if (bit == TRACE_GRAPH_PRINT_IRQS) ftrace_graph_skip_irqs = !set; + if (bit == TRACE_GRAPH_SLEEP_TIME) + ftrace_graph_sleep_time_control(set); + + if (bit == TRACE_GRAPH_GRAPH_TIME) + ftrace_graph_graph_time_control(set); + return 0; } -- cgit v1.1 From 983f938ae69585213bbb779d841b90e75f93f545 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 30 Sep 2015 09:42:05 -0400 Subject: tracing: Move trace_flags from global to a trace_array field In preparation to make trace options per instance, the global trace_flags needs to be moved from being a global variable to a field within the trace instance trace_array structure. There's still more work to do, as there's some functions that use trace_flags without passing in a way to get to the current_trace array. For those, the global_trace is used directly (from trace.c). This includes setting and clearing the trace_flags. This means that when a new instance is created, it just gets the trace_flags of the global_trace and will not be able to modify them. Depending on the functions that have access to the trace_array, the flags of an instance may not affect parts of its trace, where the global_trace is used. These will be fixed in future changes. Signed-off-by: Steven Rostedt --- kernel/trace/blktrace.c | 7 +-- kernel/trace/trace.c | 92 +++++++++++++++++++++--------------- kernel/trace/trace.h | 5 +- kernel/trace/trace_events.c | 3 +- kernel/trace/trace_functions_graph.c | 50 ++++++++++++-------- kernel/trace/trace_irqsoff.c | 28 ++++++----- kernel/trace/trace_kdb.c | 8 ++-- kernel/trace/trace_output.c | 14 ++++-- kernel/trace/trace_sched_wakeup.c | 26 +++++----- kernel/trace/trace_syscalls.c | 3 +- 10 files changed, 135 insertions(+), 101 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 973d41d..b2fcf47 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1343,6 +1343,7 @@ static const struct { static enum print_line_t print_one_line(struct trace_iterator *iter, bool classic) { + struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; const struct blk_io_trace *t; u16 what; @@ -1351,7 +1352,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter, t = te_blk_io_trace(iter->ent); what = t->action & ((1 << BLK_TC_SHIFT) - 1); - long_act = !!(trace_flags & TRACE_ITER_VERBOSE); + long_act = !!(tr->trace_flags & TRACE_ITER_VERBOSE); log_action = classic ? &blk_log_action_classic : &blk_log_action; if (t->action == BLK_TN_MESSAGE) { @@ -1413,9 +1414,9 @@ blk_tracer_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) /* don't output context-info for blk_classic output */ if (bit == TRACE_BLK_OPT_CLASSIC) { if (set) - trace_flags &= ~TRACE_ITER_CONTEXT_INFO; + tr->trace_flags &= ~TRACE_ITER_CONTEXT_INFO; else - trace_flags |= TRACE_ITER_CONTEXT_INFO; + tr->trace_flags |= TRACE_ITER_CONTEXT_INFO; } return 0; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e26933c..4e82f4a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -250,6 +250,14 @@ unsigned long long ns2usecs(cycle_t nsec) return nsec; } +/* trace_flags holds trace_options default values */ +#define TRACE_DEFAULT_FLAGS \ + (FUNCTION_DEFAULT_FLAGS | \ + TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | \ + TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | \ + TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | \ + TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS) + /* * The global_trace is the descriptor that holds the tracing * buffers for the live tracing. For each CPU, it contains @@ -262,7 +270,9 @@ unsigned long long ns2usecs(cycle_t nsec) * pages for the buffer for that CPU. Each CPU has the same number * of pages allocated for its buffer. */ -static struct trace_array global_trace; +static struct trace_array global_trace = { + .trace_flags = TRACE_DEFAULT_FLAGS, +}; LIST_HEAD(ftrace_trace_arrays); @@ -490,15 +500,6 @@ static inline void ftrace_trace_stack(struct ring_buffer *buffer, #endif -/* trace_flags holds trace_options default values */ -unsigned long trace_flags = - FUNCTION_DEFAULT_FLAGS | - TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | - TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | - TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | - TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS - ; - static void tracer_tracing_on(struct trace_array *tr) { if (tr->trace_buffer.buffer) @@ -543,7 +544,7 @@ int __trace_puts(unsigned long ip, const char *str, int size) int alloc; int pc; - if (!(trace_flags & TRACE_ITER_PRINTK)) + if (!(global_trace.trace_flags & TRACE_ITER_PRINTK)) return 0; pc = preempt_count(); @@ -593,7 +594,7 @@ int __trace_bputs(unsigned long ip, const char *str) int size = sizeof(struct bputs_entry); int pc; - if (!(trace_flags & TRACE_ITER_PRINTK)) + if (!(global_trace.trace_flags & TRACE_ITER_PRINTK)) return 0; pc = preempt_count(); @@ -1875,7 +1876,7 @@ static inline void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, int skip, int pc, struct pt_regs *regs) { - if (!(trace_flags & TRACE_ITER_STACKTRACE)) + if (!(global_trace.trace_flags & TRACE_ITER_STACKTRACE)) return; __ftrace_trace_stack(buffer, flags, skip, pc, regs); @@ -1919,7 +1920,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) struct userstack_entry *entry; struct stack_trace trace; - if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) + if (!(global_trace.trace_flags & TRACE_ITER_USERSTACKTRACE)) return; /* @@ -2236,7 +2237,7 @@ int trace_array_printk(struct trace_array *tr, int ret; va_list ap; - if (!(trace_flags & TRACE_ITER_PRINTK)) + if (!(global_trace.trace_flags & TRACE_ITER_PRINTK)) return 0; va_start(ap, fmt); @@ -2251,7 +2252,7 @@ int trace_array_printk_buf(struct ring_buffer *buffer, int ret; va_list ap; - if (!(trace_flags & TRACE_ITER_PRINTK)) + if (!(global_trace.trace_flags & TRACE_ITER_PRINTK)) return 0; va_start(ap, fmt); @@ -2592,7 +2593,7 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file void print_trace_header(struct seq_file *m, struct trace_iterator *iter) { - unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); + unsigned long sym_flags = (global_trace.trace_flags & TRACE_ITER_SYM_MASK); struct trace_buffer *buf = iter->trace_buffer; struct trace_array_cpu *data = per_cpu_ptr(buf->data, buf->cpu); struct tracer *type = iter->trace; @@ -2654,8 +2655,9 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) static void test_cpu_buff_start(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; + struct trace_array *tr = iter->tr; - if (!(trace_flags & TRACE_ITER_ANNOTATE)) + if (!(tr->trace_flags & TRACE_ITER_ANNOTATE)) return; if (!(iter->iter_flags & TRACE_FILE_ANNOTATE)) @@ -2677,8 +2679,9 @@ static void test_cpu_buff_start(struct trace_iterator *iter) static enum print_line_t print_trace_fmt(struct trace_iterator *iter) { + struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; - unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); + unsigned long sym_flags = (tr->trace_flags & TRACE_ITER_SYM_MASK); struct trace_entry *entry; struct trace_event *event; @@ -2688,7 +2691,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) event = ftrace_find_event(entry->type); - if (trace_flags & TRACE_ITER_CONTEXT_INFO) { + if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) { if (iter->iter_flags & TRACE_FILE_LAT_FMT) trace_print_lat_context(iter); else @@ -2708,13 +2711,14 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) static enum print_line_t print_raw_fmt(struct trace_iterator *iter) { + struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; struct trace_entry *entry; struct trace_event *event; entry = iter->ent; - if (trace_flags & TRACE_ITER_CONTEXT_INFO) + if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) trace_seq_printf(s, "%d %d %llu ", entry->pid, iter->cpu, iter->ts); @@ -2732,6 +2736,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter) static enum print_line_t print_hex_fmt(struct trace_iterator *iter) { + struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; unsigned char newline = '\n'; struct trace_entry *entry; @@ -2739,7 +2744,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) entry = iter->ent; - if (trace_flags & TRACE_ITER_CONTEXT_INFO) { + if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) { SEQ_PUT_HEX_FIELD(s, entry->pid); SEQ_PUT_HEX_FIELD(s, iter->cpu); SEQ_PUT_HEX_FIELD(s, iter->ts); @@ -2761,13 +2766,14 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) static enum print_line_t print_bin_fmt(struct trace_iterator *iter) { + struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; struct trace_entry *entry; struct trace_event *event; entry = iter->ent; - if (trace_flags & TRACE_ITER_CONTEXT_INFO) { + if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) { SEQ_PUT_FIELD(s, entry->pid); SEQ_PUT_FIELD(s, iter->cpu); SEQ_PUT_FIELD(s, iter->ts); @@ -2816,6 +2822,8 @@ int trace_empty(struct trace_iterator *iter) /* Called with trace_event_read_lock() held. */ enum print_line_t print_trace_line(struct trace_iterator *iter) { + struct trace_array *tr = iter->tr; + unsigned long trace_flags = tr->trace_flags; enum print_line_t ret; if (iter->lost_events) { @@ -2861,6 +2869,7 @@ enum print_line_t print_trace_line(struct trace_iterator *iter) void trace_latency_header(struct seq_file *m) { struct trace_iterator *iter = m->private; + struct trace_array *tr = iter->tr; /* print nothing if the buffers are empty */ if (trace_empty(iter)) @@ -2869,13 +2878,15 @@ void trace_latency_header(struct seq_file *m) if (iter->iter_flags & TRACE_FILE_LAT_FMT) print_trace_header(m, iter); - if (!(trace_flags & TRACE_ITER_VERBOSE)) + if (!(tr->trace_flags & TRACE_ITER_VERBOSE)) print_lat_help_header(m); } void trace_default_header(struct seq_file *m) { struct trace_iterator *iter = m->private; + struct trace_array *tr = iter->tr; + unsigned long trace_flags = tr->trace_flags; if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) return; @@ -3220,7 +3231,7 @@ static int tracing_open(struct inode *inode, struct file *file) iter = __tracing_open(inode, file, false); if (IS_ERR(iter)) ret = PTR_ERR(iter); - else if (trace_flags & TRACE_ITER_LATENCY_FMT) + else if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) iter->iter_flags |= TRACE_FILE_LAT_FMT; } @@ -3467,7 +3478,7 @@ static int tracing_trace_options_show(struct seq_file *m, void *v) trace_opts = tr->current_trace->flags->opts; for (i = 0; trace_options[i]; i++) { - if (trace_flags & (1 << i)) + if (tr->trace_flags & (1 << i)) seq_printf(m, "%s\n", trace_options[i]); else seq_printf(m, "no%s\n", trace_options[i]); @@ -3532,7 +3543,7 @@ int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set) int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) { /* do nothing if flag is already set */ - if (!!(trace_flags & mask) == !!enabled) + if (!!(tr->trace_flags & mask) == !!enabled) return 0; /* Give the tracer a chance to approve the change */ @@ -3541,9 +3552,9 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) return -EINVAL; if (enabled) - trace_flags |= mask; + tr->trace_flags |= mask; else - trace_flags &= ~mask; + tr->trace_flags &= ~mask; if (mask == TRACE_ITER_RECORD_CMD) trace_event_enable_cmd_record(enabled); @@ -4558,7 +4569,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) /* trace pipe does not show start of buffer */ cpumask_setall(iter->started); - if (trace_flags & TRACE_ITER_LATENCY_FMT) + if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) iter->iter_flags |= TRACE_FILE_LAT_FMT; /* Output in nanoseconds only if we are using a clock in nanoseconds. */ @@ -4615,11 +4626,13 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) static unsigned int trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table) { + struct trace_array *tr = iter->tr; + /* Iterators are static, they should be filled or empty */ if (trace_buffer_iter(iter, iter->cpu_file)) return POLLIN | POLLRDNORM; - if (trace_flags & TRACE_ITER_BLOCK) + if (tr->trace_flags & TRACE_ITER_BLOCK) /* * Always select as readable when in blocking mode */ @@ -5036,7 +5049,7 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp) struct trace_array *tr = inode->i_private; /* disable tracing ? */ - if (trace_flags & TRACE_ITER_STOP_ON_FREE) + if (tr->trace_flags & TRACE_ITER_STOP_ON_FREE) tracer_tracing_off(tr); /* resize the ring buffer to 0 */ tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS); @@ -5069,7 +5082,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, if (tracing_disabled) return -EINVAL; - if (!(trace_flags & TRACE_ITER_MARKERS)) + if (!(tr->trace_flags & TRACE_ITER_MARKERS)) return -EINVAL; if (cnt > TRACE_BUF_SIZE) @@ -6180,7 +6193,7 @@ trace_options_core_read(struct file *filp, char __user *ubuf, size_t cnt, long index = (long)filp->private_data; char *buf; - if (trace_flags & (1 << index)) + if (global_trace.trace_flags & (1 << index)) buf = "1\n"; else buf = "0\n"; @@ -6407,7 +6420,7 @@ allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size { enum ring_buffer_flags rb_flags; - rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; + rb_flags = tr->trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; buf->tr = tr; @@ -6502,6 +6515,8 @@ static int instance_mkdir(const char *name) if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL)) goto out_free_tr; + tr->trace_flags = global_trace.trace_flags; + cpumask_copy(tr->tracing_cpumask, cpu_all_mask); raw_spin_lock_init(&tr->start_lock); @@ -6938,6 +6953,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) /* use static because iter can be a bit big for the stack */ static struct trace_iterator iter; static atomic_t dump_running; + struct trace_array *tr = &global_trace; unsigned int old_userobj; unsigned long flags; int cnt = 0, cpu; @@ -6967,10 +6983,10 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); } - old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; + old_userobj = tr->trace_flags & TRACE_ITER_SYM_USEROBJ; /* don't look at user memory in panic mode */ - trace_flags &= ~TRACE_ITER_SYM_USEROBJ; + tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ; switch (oops_dump_mode) { case DUMP_ALL: @@ -7033,7 +7049,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) printk(KERN_TRACE "---------------------------------\n"); out_enable: - trace_flags |= old_userobj; + tr->trace_flags |= old_userobj; for_each_tracing_cpu(cpu) { atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5219bf5..eda4e6f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -217,6 +217,7 @@ struct trace_array { int stop_count; int clock_id; struct tracer *current_trace; + unsigned int trace_flags; unsigned int flags; raw_spinlock_t start_lock; struct dentry *dir; @@ -698,8 +699,6 @@ int trace_array_printk_buf(struct ring_buffer *buffer, void trace_printk_seq(struct trace_seq *s); enum print_line_t print_trace_line(struct trace_iterator *iter); -extern unsigned long trace_flags; - extern char trace_find_mark(unsigned long long duration); /* Standard output formatting function used for function return traces */ @@ -994,7 +993,7 @@ extern int enable_branch_tracing(struct trace_array *tr); extern void disable_branch_tracing(void); static inline int trace_branch_enable(struct trace_array *tr) { - if (trace_flags & TRACE_ITER_BRANCH) + if (tr->trace_flags & TRACE_ITER_BRANCH) return enable_branch_tracing(tr); return 0; } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index b2e3d8d..0f39411 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -338,6 +338,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, int enable, int soft_disable) { struct trace_event_call *call = file->event_call; + struct trace_array *tr = file->tr; int ret = 0; int disable; @@ -401,7 +402,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, if (soft_disable) set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); - if (trace_flags & TRACE_ITER_RECORD_CMD) { + if (tr->trace_flags & TRACE_ITER_RECORD_CMD) { tracing_start_cmdline_record(); set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); } diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 86e45c2..92382af 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -112,8 +112,8 @@ enum { }; static void -print_graph_duration(unsigned long long duration, struct trace_seq *s, - u32 flags); +print_graph_duration(struct trace_array *tr, unsigned long long duration, + struct trace_seq *s, u32 flags); /* Add a function return address to the trace stack on thread info.*/ int @@ -658,6 +658,7 @@ static void print_graph_irq(struct trace_iterator *iter, unsigned long addr, enum trace_type type, int cpu, pid_t pid, u32 flags) { + struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; struct trace_entry *ent = iter->ent; @@ -665,7 +666,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, addr >= (unsigned long)__irqentry_text_end) return; - if (trace_flags & TRACE_ITER_CONTEXT_INFO) { + if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) { /* Absolute time */ if (flags & TRACE_GRAPH_PRINT_ABS_TIME) print_graph_abs_time(iter->ts, s); @@ -681,19 +682,19 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, } /* Latency format */ - if (trace_flags & TRACE_ITER_LATENCY_FMT) + if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) print_graph_lat_fmt(s, ent); } /* No overhead */ - print_graph_duration(0, s, flags | FLAGS_FILL_START); + print_graph_duration(tr, 0, s, flags | FLAGS_FILL_START); if (type == TRACE_GRAPH_ENT) trace_seq_puts(s, "==========>"); else trace_seq_puts(s, "<=========="); - print_graph_duration(0, s, flags | FLAGS_FILL_END); + print_graph_duration(tr, 0, s, flags | FLAGS_FILL_END); trace_seq_putc(s, '\n'); } @@ -731,11 +732,11 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) } static void -print_graph_duration(unsigned long long duration, struct trace_seq *s, - u32 flags) +print_graph_duration(struct trace_array *tr, unsigned long long duration, + struct trace_seq *s, u32 flags) { if (!(flags & TRACE_GRAPH_PRINT_DURATION) || - !(trace_flags & TRACE_ITER_CONTEXT_INFO)) + !(tr->trace_flags & TRACE_ITER_CONTEXT_INFO)) return; /* No real adata, just filling the column with spaces */ @@ -769,6 +770,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, struct trace_seq *s, u32 flags) { struct fgraph_data *data = iter->private; + struct trace_array *tr = iter->tr; struct ftrace_graph_ret *graph_ret; struct ftrace_graph_ent *call; unsigned long long duration; @@ -797,7 +799,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, } /* Overhead and duration */ - print_graph_duration(duration, s, flags); + print_graph_duration(tr, duration, s, flags); /* Function */ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) @@ -815,6 +817,7 @@ print_graph_entry_nested(struct trace_iterator *iter, { struct ftrace_graph_ent *call = &entry->graph_ent; struct fgraph_data *data = iter->private; + struct trace_array *tr = iter->tr; int i; if (data) { @@ -830,7 +833,7 @@ print_graph_entry_nested(struct trace_iterator *iter, } /* No time */ - print_graph_duration(0, s, flags | FLAGS_FILL_FULL); + print_graph_duration(tr, 0, s, flags | FLAGS_FILL_FULL); /* Function */ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) @@ -854,6 +857,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, { struct fgraph_data *data = iter->private; struct trace_entry *ent = iter->ent; + struct trace_array *tr = iter->tr; int cpu = iter->cpu; /* Pid */ @@ -863,7 +867,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, /* Interrupt */ print_graph_irq(iter, addr, type, cpu, ent->pid, flags); - if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) + if (!(tr->trace_flags & TRACE_ITER_CONTEXT_INFO)) return; /* Absolute time */ @@ -881,7 +885,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, } /* Latency format */ - if (trace_flags & TRACE_ITER_LATENCY_FMT) + if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) print_graph_lat_fmt(s, ent); return; @@ -1032,6 +1036,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, { unsigned long long duration = trace->rettime - trace->calltime; struct fgraph_data *data = iter->private; + struct trace_array *tr = iter->tr; pid_t pid = ent->pid; int cpu = iter->cpu; int func_match = 1; @@ -1063,7 +1068,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, print_graph_prologue(iter, s, 0, 0, flags); /* Overhead and duration */ - print_graph_duration(duration, s, flags); + print_graph_duration(tr, duration, s, flags); /* Closing brace */ for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) @@ -1096,7 +1101,8 @@ static enum print_line_t print_graph_comment(struct trace_seq *s, struct trace_entry *ent, struct trace_iterator *iter, u32 flags) { - unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); + struct trace_array *tr = iter->tr; + unsigned long sym_flags = (tr->trace_flags & TRACE_ITER_SYM_MASK); struct fgraph_data *data = iter->private; struct trace_event *event; int depth = 0; @@ -1109,7 +1115,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, print_graph_prologue(iter, s, 0, 0, flags); /* No time */ - print_graph_duration(0, s, flags | FLAGS_FILL_FULL); + print_graph_duration(tr, 0, s, flags | FLAGS_FILL_FULL); /* Indentation */ if (depth > 0) @@ -1250,9 +1256,10 @@ static void print_lat_header(struct seq_file *s, u32 flags) seq_printf(s, "#%.*s||| / \n", size, spaces); } -static void __print_graph_headers_flags(struct seq_file *s, u32 flags) +static void __print_graph_headers_flags(struct trace_array *tr, + struct seq_file *s, u32 flags) { - int lat = trace_flags & TRACE_ITER_LATENCY_FMT; + int lat = tr->trace_flags & TRACE_ITER_LATENCY_FMT; if (lat) print_lat_header(s, flags); @@ -1294,11 +1301,12 @@ static void print_graph_headers(struct seq_file *s) void print_graph_headers_flags(struct seq_file *s, u32 flags) { struct trace_iterator *iter = s->private; + struct trace_array *tr = iter->tr; - if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) + if (!(tr->trace_flags & TRACE_ITER_CONTEXT_INFO)) return; - if (trace_flags & TRACE_ITER_LATENCY_FMT) { + if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) { /* print nothing if the buffers are empty */ if (trace_empty(iter)) return; @@ -1306,7 +1314,7 @@ void print_graph_headers_flags(struct seq_file *s, u32 flags) print_trace_header(s, iter); } - __print_graph_headers_flags(s, flags); + __print_graph_headers_flags(tr, s, flags); } void graph_trace_open(struct trace_iterator *iter) diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index c834b95..eaf5291 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -58,13 +58,13 @@ irq_trace(void) #ifdef CONFIG_FUNCTION_GRAPH_TRACER static int irqsoff_display_graph(struct trace_array *tr, int set); -# define is_graph() (trace_flags & TRACE_ITER_DISPLAY_GRAPH) +# define is_graph(tr) ((tr)->trace_flags & TRACE_ITER_DISPLAY_GRAPH) #else static inline int irqsoff_display_graph(struct trace_array *tr, int set) { return -EINVAL; } -# define is_graph() false +# define is_graph(tr) false #endif /* @@ -149,7 +149,7 @@ static int irqsoff_display_graph(struct trace_array *tr, int set) { int cpu; - if (!(is_graph() ^ set)) + if (!(is_graph(tr) ^ set)) return 0; stop_irqsoff_tracer(irqsoff_trace, !set); @@ -198,7 +198,7 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace) static void irqsoff_trace_open(struct trace_iterator *iter) { - if (is_graph()) + if (is_graph(iter->tr)) graph_trace_open(iter); } @@ -220,7 +220,7 @@ static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) * In graph mode call the graph tracer output function, * otherwise go with the TRACE_FN event handler */ - if (is_graph()) + if (is_graph(iter->tr)) return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS); return TRACE_TYPE_UNHANDLED; @@ -228,7 +228,9 @@ static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) static void irqsoff_print_header(struct seq_file *s) { - if (is_graph()) + struct trace_array *tr = irqsoff_trace; + + if (is_graph(tr)) print_graph_headers_flags(s, GRAPH_TRACER_FLAGS); else trace_default_header(s); @@ -239,7 +241,7 @@ __trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc) { - if (is_graph()) + if (is_graph(tr)) trace_graph_function(tr, ip, parent_ip, flags, pc); else trace_function(tr, ip, parent_ip, flags, pc); @@ -516,7 +518,7 @@ static int register_irqsoff_function(struct trace_array *tr, int graph, int set) int ret; /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */ - if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION))) + if (function_enabled || (!set && !(tr->trace_flags & TRACE_ITER_FUNCTION))) return 0; if (graph) @@ -550,9 +552,9 @@ static int irqsoff_function_set(struct trace_array *tr, u32 mask, int set) return 0; if (set) - register_irqsoff_function(tr, is_graph(), 1); + register_irqsoff_function(tr, is_graph(tr), 1); else - unregister_irqsoff_function(tr, is_graph()); + unregister_irqsoff_function(tr, is_graph(tr)); return 1; } #else @@ -610,7 +612,7 @@ static int __irqsoff_tracer_init(struct trace_array *tr) if (irqsoff_busy) return -EBUSY; - save_flags = trace_flags; + save_flags = tr->trace_flags; /* non overwrite screws up the latency tracers */ set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); @@ -626,7 +628,7 @@ static int __irqsoff_tracer_init(struct trace_array *tr) /* Only toplevel instance supports graph tracing */ if (start_irqsoff_tracer(tr, (tr->flags & TRACE_ARRAY_FL_GLOBAL && - is_graph()))) + is_graph(tr)))) printk(KERN_ERR "failed to start irqsoff tracer\n"); irqsoff_busy = true; @@ -638,7 +640,7 @@ static void irqsoff_tracer_reset(struct trace_array *tr) int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE; - stop_irqsoff_tracer(tr, is_graph()); + stop_irqsoff_tracer(tr, is_graph(tr)); set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 3ccf5c2..57149bc 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -21,20 +21,22 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file) /* use static because iter can be a bit big for the stack */ static struct trace_iterator iter; static struct ring_buffer_iter *buffer_iter[CONFIG_NR_CPUS]; + struct trace_array *tr; unsigned int old_userobj; int cnt = 0, cpu; trace_init_global_iter(&iter); iter.buffer_iter = buffer_iter; + tr = iter.tr; for_each_tracing_cpu(cpu) { atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); } - old_userobj = trace_flags; + old_userobj = tr->trace_flags; /* don't look at user memory in panic mode */ - trace_flags &= ~TRACE_ITER_SYM_USEROBJ; + tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ; kdb_printf("Dumping ftrace buffer:\n"); @@ -82,7 +84,7 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file) kdb_printf("---------------------------------\n"); out: - trace_flags = old_userobj; + tr->trace_flags = old_userobj; for_each_tracing_cpu(cpu) { atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 3b5dcdf..2829821 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -476,7 +476,8 @@ char trace_find_mark(unsigned long long d) static int lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) { - unsigned long verbose = trace_flags & TRACE_ITER_VERBOSE; + struct trace_array *tr = iter->tr; + unsigned long verbose = tr->trace_flags & TRACE_ITER_VERBOSE; unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS; unsigned long long abs_ts = iter->ts - iter->trace_buffer->time_start; unsigned long long rel_ts = next_ts - iter->ts; @@ -519,6 +520,7 @@ lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) int trace_print_context(struct trace_iterator *iter) { + struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; struct trace_entry *entry = iter->ent; unsigned long long t; @@ -530,7 +532,7 @@ int trace_print_context(struct trace_iterator *iter) trace_seq_printf(s, "%16s-%-5d [%03d] ", comm, entry->pid, iter->cpu); - if (trace_flags & TRACE_ITER_IRQ_INFO) + if (tr->trace_flags & TRACE_ITER_IRQ_INFO) trace_print_lat_fmt(s, entry); if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) { @@ -546,14 +548,15 @@ int trace_print_context(struct trace_iterator *iter) int trace_print_lat_context(struct trace_iterator *iter) { - u64 next_ts; + struct trace_array *tr = iter->tr; /* trace_find_next_entry will reset ent_size */ int ent_size = iter->ent_size; struct trace_seq *s = &iter->seq; + u64 next_ts; struct trace_entry *entry = iter->ent, *next_entry = trace_find_next_entry(iter, NULL, &next_ts); - unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); + unsigned long verbose = (tr->trace_flags & TRACE_ITER_VERBOSE); /* Restore the original ent_size */ iter->ent_size = ent_size; @@ -1035,6 +1038,7 @@ static struct trace_event trace_stack_event = { static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, int flags, struct trace_event *event) { + struct trace_array *tr = iter->tr; struct userstack_entry *field; struct trace_seq *s = &iter->seq; struct mm_struct *mm = NULL; @@ -1044,7 +1048,7 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, trace_seq_puts(s, "\n"); - if (trace_flags & TRACE_ITER_SYM_USEROBJ) { + if (tr->trace_flags & TRACE_ITER_SYM_USEROBJ) { struct task_struct *task; /* * we do the lookup on the thread group leader, diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 4a20f61..4661442 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -39,13 +39,13 @@ static int save_flags; #ifdef CONFIG_FUNCTION_GRAPH_TRACER static int wakeup_display_graph(struct trace_array *tr, int set); -# define is_graph() (trace_flags & TRACE_ITER_DISPLAY_GRAPH) +# define is_graph(tr) ((tr)->trace_flags & TRACE_ITER_DISPLAY_GRAPH) #else static inline int wakeup_display_graph(struct trace_array *tr, int set) { return 0; } -# define is_graph() false +# define is_graph(tr) false #endif @@ -131,7 +131,7 @@ static int register_wakeup_function(struct trace_array *tr, int graph, int set) int ret; /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */ - if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION))) + if (function_enabled || (!set && !(tr->trace_flags & TRACE_ITER_FUNCTION))) return 0; if (graph) @@ -165,9 +165,9 @@ static int wakeup_function_set(struct trace_array *tr, u32 mask, int set) return 0; if (set) - register_wakeup_function(tr, is_graph(), 1); + register_wakeup_function(tr, is_graph(tr), 1); else - unregister_wakeup_function(tr, is_graph()); + unregister_wakeup_function(tr, is_graph(tr)); return 1; } #else @@ -221,7 +221,7 @@ static void stop_func_tracer(struct trace_array *tr, int graph) #ifdef CONFIG_FUNCTION_GRAPH_TRACER static int wakeup_display_graph(struct trace_array *tr, int set) { - if (!(is_graph() ^ set)) + if (!(is_graph(tr) ^ set)) return 0; stop_func_tracer(tr, !set); @@ -270,7 +270,7 @@ static void wakeup_graph_return(struct ftrace_graph_ret *trace) static void wakeup_trace_open(struct trace_iterator *iter) { - if (is_graph()) + if (is_graph(iter->tr)) graph_trace_open(iter); } @@ -290,7 +290,7 @@ static enum print_line_t wakeup_print_line(struct trace_iterator *iter) * In graph mode call the graph tracer output function, * otherwise go with the TRACE_FN event handler */ - if (is_graph()) + if (is_graph(iter->tr)) return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS); return TRACE_TYPE_UNHANDLED; @@ -298,7 +298,7 @@ static enum print_line_t wakeup_print_line(struct trace_iterator *iter) static void wakeup_print_header(struct seq_file *s) { - if (is_graph()) + if (is_graph(wakeup_trace)) print_graph_headers_flags(s, GRAPH_TRACER_FLAGS); else trace_default_header(s); @@ -309,7 +309,7 @@ __trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc) { - if (is_graph()) + if (is_graph(tr)) trace_graph_function(tr, ip, parent_ip, flags, pc); else trace_function(tr, ip, parent_ip, flags, pc); @@ -639,7 +639,7 @@ static void start_wakeup_tracer(struct trace_array *tr) */ smp_wmb(); - if (start_func_tracer(tr, is_graph())) + if (start_func_tracer(tr, is_graph(tr))) printk(KERN_ERR "failed to start wakeup tracer\n"); return; @@ -652,7 +652,7 @@ fail_deprobe: static void stop_wakeup_tracer(struct trace_array *tr) { tracer_enabled = 0; - stop_func_tracer(tr, is_graph()); + stop_func_tracer(tr, is_graph(tr)); unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); unregister_trace_sched_wakeup_new(probe_wakeup, NULL); unregister_trace_sched_wakeup(probe_wakeup, NULL); @@ -663,7 +663,7 @@ static bool wakeup_busy; static int __wakeup_tracer_init(struct trace_array *tr) { - save_flags = trace_flags; + save_flags = tr->trace_flags; /* non overwrite screws up the latency tracers */ set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 7d567a4..0655afb 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -110,6 +110,7 @@ static enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags, struct trace_event *event) { + struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; struct trace_entry *ent = iter->ent; struct syscall_trace_enter *trace; @@ -136,7 +137,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags, goto end; /* parameter types */ - if (trace_flags & TRACE_ITER_VERBOSE) + if (tr->trace_flags & TRACE_ITER_VERBOSE) trace_seq_printf(s, "%s ", entry->types[i]); /* parameter values */ -- cgit v1.1 From 9a38a8856f41f90cc7e57798c544e3fe77033196 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 30 Sep 2015 11:11:15 -0400 Subject: tracing: Add a method to pass in trace_array descriptor to option files In preparation of having the multi buffer instances having their own trace option flags, the trace option files needs a way to not only pass in the flag they represent, but also the trace_array descriptor. A new field is added to the trace_array descriptor called trace_flags_index, which is a 32 byte character array representing a bit. This array is simply filled with the index of the array, where index_array[n] = n; Then the address of this array is passed to the file callbacks instead of the index of the flag index. Then to retrieve both the flag index and the trace_array descriptor: data is the passed in argument. index = *(unsigned char *)data; data -= index; /* Now data points to the address of the array in the trace_array */ tr = container_of(data, struct trace_array, trace_flags_index); Suggested-by: Johannes Berg Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++------ kernel/trace/trace.h | 3 +++ 2 files changed, 63 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4e82f4a..5f48188 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6186,14 +6186,51 @@ static const struct file_operations trace_options_fops = { .llseek = generic_file_llseek, }; +/* + * In order to pass in both the trace_array descriptor as well as the index + * to the flag that the trace option file represents, the trace_array + * has a character array of trace_flags_index[], which holds the index + * of the bit for the flag it represents. index[0] == 0, index[1] == 1, etc. + * The address of this character array is passed to the flag option file + * read/write callbacks. + * + * In order to extract both the index and the trace_array descriptor, + * get_tr_index() uses the following algorithm. + * + * idx = *ptr; + * + * As the pointer itself contains the address of the index (remember + * index[1] == 1). + * + * Then to get the trace_array descriptor, by subtracting that index + * from the ptr, we get to the start of the index itself. + * + * ptr - idx == &index[0] + * + * Then a simple container_of() from that pointer gets us to the + * trace_array descriptor. + */ +static void get_tr_index(void *data, struct trace_array **ptr, + unsigned int *pindex) +{ + *pindex = *(unsigned char *)data; + + *ptr = container_of(data - *pindex, struct trace_array, + trace_flags_index); +} + static ssize_t trace_options_core_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - long index = (long)filp->private_data; + void *tr_index = filp->private_data; + struct trace_array *tr; + unsigned int index; char *buf; - if (global_trace.trace_flags & (1 << index)) + get_tr_index(tr_index, &tr, &index); + + if (tr->trace_flags & (1 << index)) buf = "1\n"; else buf = "0\n"; @@ -6205,11 +6242,14 @@ static ssize_t trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct trace_array *tr = &global_trace; - long index = (long)filp->private_data; + void *tr_index = filp->private_data; + struct trace_array *tr; + unsigned int index; unsigned long val; int ret; + get_tr_index(tr_index, &tr, &index); + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); if (ret) return ret; @@ -6339,8 +6379,9 @@ create_trace_option_core_file(struct trace_array *tr, if (!t_options) return NULL; - return trace_create_file(option, 0644, t_options, (void *)index, - &trace_options_core_fops); + return trace_create_file(option, 0644, t_options, + (void *)&tr->trace_flags_index[index], + &trace_options_core_fops); } static __init void create_trace_options_dir(struct trace_array *tr) @@ -6490,6 +6531,15 @@ static void free_trace_buffers(struct trace_array *tr) #endif } +static void init_trace_flags_index(struct trace_array *tr) +{ + int i; + + /* Used by the trace options files */ + for (i = 0; i < TRACE_FLAGS_MAX_SIZE; i++) + tr->trace_flags_index[i] = i; +} + static int instance_mkdir(const char *name) { struct trace_array *tr; @@ -6542,6 +6592,7 @@ static int instance_mkdir(const char *name) } init_tracer_tracefs(tr, tr->dir); + init_trace_flags_index(tr); list_add(&tr->list, &ftrace_trace_arrays); @@ -7068,7 +7119,7 @@ __init static int tracer_alloc_buffers(void) * Make sure we don't accidently add more trace options * than we have bits for. */ - BUILD_BUG_ON(TRACE_ITER_LAST_BIT > 32); + BUILD_BUG_ON(TRACE_ITER_LAST_BIT > TRACE_FLAGS_MAX_SIZE); if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) goto out; @@ -7128,6 +7179,8 @@ __init static int tracer_alloc_buffers(void) ftrace_init_global_array_ops(&global_trace); + init_trace_flags_index(&global_trace); + register_tracer(&nop_trace); /* All seems OK, enable tracing */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index eda4e6f..423cb48 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -168,6 +168,8 @@ struct trace_buffer { int cpu; }; +#define TRACE_FLAGS_MAX_SIZE 32 + /* * The trace array - an array of per-CPU trace arrays. This is the * highest level data structure that individual tracers deal with. @@ -218,6 +220,7 @@ struct trace_array { int clock_id; struct tracer *current_trace; unsigned int trace_flags; + unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE]; unsigned int flags; raw_spinlock_t start_lock; struct dentry *dir; -- cgit v1.1 From 2d34f48955158cfdf18704256c84b04fe3a16c7b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 30 Sep 2015 11:45:22 -0400 Subject: tracing: Make ftrace_trace_stack() depend on general trace_array flag In preparation for the multi buffer instances to have their own trace_flags, the check in ftrace_trace_stack() needs to test the trace_array descriptor flag that is for the current event, not the global_trace descriptor. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 23 +++++++++++++---------- kernel/trace/trace_events.c | 6 +++--- 2 files changed, 16 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5f48188..51697b4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -482,7 +482,8 @@ static inline void trace_access_lock_init(void) static void __ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, int skip, int pc, struct pt_regs *regs); -static inline void ftrace_trace_stack(struct ring_buffer *buffer, +static inline void ftrace_trace_stack(struct trace_array *tr, + struct ring_buffer *buffer, unsigned long flags, int skip, int pc, struct pt_regs *regs); @@ -492,7 +493,8 @@ static inline void __ftrace_trace_stack(struct ring_buffer *buffer, int skip, int pc, struct pt_regs *regs) { } -static inline void ftrace_trace_stack(struct ring_buffer *buffer, +static inline void ftrace_trace_stack(struct trace_array *tr, + struct ring_buffer *buffer, unsigned long flags, int skip, int pc, struct pt_regs *regs) { @@ -574,7 +576,7 @@ int __trace_puts(unsigned long ip, const char *str, int size) entry->buf[size] = '\0'; __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(buffer, irq_flags, 4, pc, NULL); + ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL); return size; } @@ -614,7 +616,7 @@ int __trace_bputs(unsigned long ip, const char *str) entry->str = str; __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(buffer, irq_flags, 4, pc, NULL); + ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL); return 1; } @@ -1691,7 +1693,7 @@ void trace_buffer_unlock_commit(struct trace_array *tr, { __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(buffer, flags, 6, pc, NULL); + ftrace_trace_stack(tr, buffer, flags, 6, pc, NULL); ftrace_trace_userstack(buffer, flags, pc); } EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit); @@ -1743,7 +1745,7 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr, { __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(buffer, flags, 6, pc, regs); + ftrace_trace_stack(tr, buffer, flags, 6, pc, regs); ftrace_trace_userstack(buffer, flags, pc); } EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs); @@ -1872,11 +1874,12 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, } -static inline void ftrace_trace_stack(struct ring_buffer *buffer, +static inline void ftrace_trace_stack(struct trace_array *tr, + struct ring_buffer *buffer, unsigned long flags, int skip, int pc, struct pt_regs *regs) { - if (!(global_trace.trace_flags & TRACE_ITER_STACKTRACE)) + if (!(tr->trace_flags & TRACE_ITER_STACKTRACE)) return; __ftrace_trace_stack(buffer, flags, skip, pc, regs); @@ -2164,7 +2167,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) memcpy(entry->buf, tbuffer, sizeof(u32) * len); if (!call_filter_check_discard(call, entry, buffer, event)) { __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(buffer, flags, 6, pc, NULL); + ftrace_trace_stack(tr, buffer, flags, 6, pc, NULL); } out: @@ -2216,7 +2219,7 @@ __trace_array_vprintk(struct ring_buffer *buffer, memcpy(&entry->buf, tbuffer, len + 1); if (!call_filter_check_discard(call, entry, buffer, event)) { __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(buffer, flags, 6, pc, NULL); + ftrace_trace_stack(&global_trace, buffer, flags, 6, pc, NULL); } out: preempt_enable_notrace(); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 0f39411..57c9e70 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2941,15 +2941,15 @@ static struct ftrace_ops trace_ops __initdata = static __init void event_trace_self_test_with_function(void) { int ret; + event_tr = top_trace_array(); + if (WARN_ON(!event_tr)) + return; ret = register_ftrace_function(&trace_ops); if (WARN_ON(ret < 0)) { pr_info("Failed to enable function tracer for event tests\n"); return; } pr_info("Running tests again, along with the function tracer\n"); - event_tr = top_trace_array(); - if (WARN_ON(!event_tr)) - return; event_trace_self_tests(); unregister_ftrace_function(&trace_ops); } -- cgit v1.1 From 16270145ce6b90750bbe4f9365865f65037b2027 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 30 Sep 2015 12:30:06 -0400 Subject: tracing: Add trace options for core options to instances Allow instances to have their own options, at least for the core options (non tracer specific ones). There are a few global options that should not be added to instances, like enabling of trace_printk, and the sched comm recording, which do not have a specific trace instance associated to them. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 51697b4..7b99e36 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -258,6 +258,11 @@ unsigned long long ns2usecs(cycle_t nsec) TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | \ TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS) +/* trace_options that are only supported by global_trace */ +#define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER_PRINTK | \ + TRACE_ITER_PRINTK_MSGONLY | TRACE_ITER_RECORD_CMD) + + /* * The global_trace is the descriptor that holds the tracing * buffers for the live tracing. For each CPU, it contains @@ -6387,17 +6392,21 @@ create_trace_option_core_file(struct trace_array *tr, &trace_options_core_fops); } -static __init void create_trace_options_dir(struct trace_array *tr) +static void create_trace_options_dir(struct trace_array *tr) { struct dentry *t_options; + bool top_level = tr == &global_trace; int i; t_options = trace_options_init_dentry(tr); if (!t_options) return; - for (i = 0; trace_options[i]; i++) - create_trace_option_core_file(tr, trace_options[i], i); + for (i = 0; trace_options[i]; i++) { + if (top_level || + !((1 << i) & TOP_LEVEL_TRACE_FLAGS)) + create_trace_option_core_file(tr, trace_options[i], i); + } } static ssize_t @@ -6707,6 +6716,8 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("tracing_on", 0644, d_tracer, tr, &rb_simple_fops); + create_trace_options_dir(tr); + #ifdef CONFIG_TRACER_MAX_TRACE trace_create_file("tracing_max_latency", 0644, d_tracer, &tr->max_latency, &tracing_max_lat_fops); @@ -6903,8 +6914,6 @@ static __init int tracer_init_tracefs(void) create_trace_instances(d_tracer); - create_trace_options_dir(&global_trace); - mutex_lock(&trace_types_lock); for (t = trace_types; t; t = t->next) add_tracer_options(&global_trace, t); -- cgit v1.1 From 37aea98b84c0ce2ac638510fefeed9f8f920bd34 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 30 Sep 2015 14:27:31 -0400 Subject: tracing: Add trace options for tracer options to instances Add the tracer options to instances options directory as well. Only add the options for tracers that are allowed to be enabled by an instance. But note, that tracer options are global. That is, tracer options enabled in an instance, also take affect at the top level and in other instances. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 80 ++++++++++++++++++++++++++++++++++++++-------------- kernel/trace/trace.h | 9 +++++- 2 files changed, 67 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7b99e36..78022c1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4308,7 +4308,7 @@ int tracing_update_buffers(void) struct trace_option_dentry; -static struct trace_option_dentry * +static void create_trace_option_files(struct trace_array *tr, struct tracer *tracer); /* @@ -4334,15 +4334,7 @@ static void add_tracer_options(struct trace_array *tr, struct tracer *t) if (!tr->dir) return; - /* Currently, only the top instance has options */ - if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL)) - return; - - /* Ignore if they were already created */ - if (t->topts) - return; - - t->topts = create_trace_option_files(tr, t); + create_trace_option_files(tr, t); } static int tracing_set_tracer(struct trace_array *tr, const char *buf) @@ -6341,21 +6333,39 @@ create_trace_option_file(struct trace_array *tr, } -static struct trace_option_dentry * +static void create_trace_option_files(struct trace_array *tr, struct tracer *tracer) { struct trace_option_dentry *topts; + struct trace_options *tr_topts; struct tracer_flags *flags; struct tracer_opt *opts; int cnt; + int i; if (!tracer) - return NULL; + return; flags = tracer->flags; if (!flags || !flags->opts) - return NULL; + return; + + /* + * If this is an instance, only create flags for tracers + * the instance may have. + */ + if (!trace_ok_for_array(tracer, tr)) + return; + + for (i = 0; i < tr->nr_topts; i++) { + /* + * Check if these flags have already been added. + * Some tracers share flags. + */ + if (tr->topts[i].tracer->flags == tracer->flags) + return; + } opts = flags->opts; @@ -6364,7 +6374,19 @@ create_trace_option_files(struct trace_array *tr, struct tracer *tracer) topts = kcalloc(cnt + 1, sizeof(*topts), GFP_KERNEL); if (!topts) - return NULL; + return; + + tr_topts = krealloc(tr->topts, sizeof(*tr->topts) * (tr->nr_topts + 1), + GFP_KERNEL); + if (!tr_topts) { + kfree(topts); + return; + } + + tr->topts = tr_topts; + tr->topts[tr->nr_topts].tracer = tracer; + tr->topts[tr->nr_topts].topts = topts; + tr->nr_topts++; for (cnt = 0; opts[cnt].name; cnt++) { create_trace_option_file(tr, &topts[cnt], flags, @@ -6373,8 +6395,6 @@ create_trace_option_files(struct trace_array *tr, struct tracer *tracer) "Failed to create trace option: %s", opts[cnt].name); } - - return topts; } static struct dentry * @@ -6552,6 +6572,21 @@ static void init_trace_flags_index(struct trace_array *tr) tr->trace_flags_index[i] = i; } +static void __update_tracer_options(struct trace_array *tr) +{ + struct tracer *t; + + for (t = trace_types; t; t = t->next) + add_tracer_options(tr, t); +} + +static void update_tracer_options(struct trace_array *tr) +{ + mutex_lock(&trace_types_lock); + __update_tracer_options(tr); + mutex_unlock(&trace_types_lock); +} + static int instance_mkdir(const char *name) { struct trace_array *tr; @@ -6605,6 +6640,7 @@ static int instance_mkdir(const char *name) init_tracer_tracefs(tr, tr->dir); init_trace_flags_index(tr); + __update_tracer_options(tr); list_add(&tr->list, &ftrace_trace_arrays); @@ -6630,6 +6666,7 @@ static int instance_rmdir(const char *name) struct trace_array *tr; int found = 0; int ret; + int i; mutex_lock(&trace_types_lock); @@ -6655,6 +6692,11 @@ static int instance_rmdir(const char *name) debugfs_remove_recursive(tr->dir); free_trace_buffers(tr); + for (i = 0; i < tr->nr_topts; i++) { + kfree(tr->topts[i].topts); + } + kfree(tr->topts); + kfree(tr->name); kfree(tr); @@ -6877,7 +6919,6 @@ static struct notifier_block trace_module_nb = { static __init int tracer_init_tracefs(void) { struct dentry *d_tracer; - struct tracer *t; trace_access_lock_init(); @@ -6914,10 +6955,7 @@ static __init int tracer_init_tracefs(void) create_trace_instances(d_tracer); - mutex_lock(&trace_types_lock); - for (t = trace_types; t; t = t->next) - add_tracer_options(&global_trace, t); - mutex_unlock(&trace_types_lock); + update_tracer_options(&global_trace); return 0; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 423cb48..fb8a61c 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -159,6 +159,7 @@ struct trace_array_cpu { }; struct tracer; +struct trace_option_dentry; struct trace_buffer { struct trace_array *tr; @@ -170,6 +171,11 @@ struct trace_buffer { #define TRACE_FLAGS_MAX_SIZE 32 +struct trace_options { + struct tracer *tracer; + struct trace_option_dentry *topts; +}; + /* * The trace array - an array of per-CPU trace arrays. This is the * highest level data structure that individual tracers deal with. @@ -218,6 +224,7 @@ struct trace_array { #endif int stop_count; int clock_id; + int nr_topts; struct tracer *current_trace; unsigned int trace_flags; unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE]; @@ -227,6 +234,7 @@ struct trace_array { struct dentry *options; struct dentry *percpu_dir; struct dentry *event_dir; + struct trace_options *topts; struct list_head systems; struct list_head events; cpumask_var_t tracing_cpumask; /* only trace on set CPUs */ @@ -398,7 +406,6 @@ struct tracer { u32 mask, int set); struct tracer *next; struct tracer_flags *flags; - struct trace_option_dentry *topts; int enabled; int ref; bool print_max; -- cgit v1.1 From 95c2b17534654829db428f11bcf4297c059a2a7e Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Sat, 26 Sep 2015 12:23:56 +0100 Subject: genirq: Fix race in register_irq_proc() Per-IRQ directories in procfs are created only when a handler is first added to the irqdesc, not when the irqdesc is created. In the case of a shared IRQ, multiple tasks can race to create a directory. This race condition seems to have been present forever, but is easier to hit with async probing. Signed-off-by: Ben Hutchings Link: http://lkml.kernel.org/r/1443266636.2004.2.camel@decadent.org.uk Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org --- kernel/irq/proc.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index e3a8c95..a50ddc9 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "internals.h" @@ -323,18 +324,29 @@ void register_handler_proc(unsigned int irq, struct irqaction *action) void register_irq_proc(unsigned int irq, struct irq_desc *desc) { + static DEFINE_MUTEX(register_lock); char name [MAX_NAMELEN]; - if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip) || desc->dir) + if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip)) return; + /* + * irq directories are registered only when a handler is + * added, not when the descriptor is created, so multiple + * tasks might try to register at the same time. + */ + mutex_lock(®ister_lock); + + if (desc->dir) + goto out_unlock; + memset(name, 0, MAX_NAMELEN); sprintf(name, "%d", irq); /* create /proc/irq/1234 */ desc->dir = proc_mkdir(name, root_irq_dir); if (!desc->dir) - return; + goto out_unlock; #ifdef CONFIG_SMP /* create /proc/irq//smp_affinity */ @@ -355,6 +367,9 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) proc_create_data("spurious", 0444, desc->dir, &irq_spurious_proc_fops, (void *)(long)irq); + +out_unlock: + mutex_unlock(®ister_lock); } void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) -- cgit v1.1 From f1e0bb0ad473a32d1b7e6d285ae9f7e47710bb5e Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Thu, 24 Sep 2015 17:32:13 +0800 Subject: genirq: Introduce generic irq migration for cpu hotunplug ARM and ARM64 have almost identical code for migrating interrupts on cpu hotunplug. Provide a generic version which can be used by both. The new code addresses a shortcoming in the ARM[64] variants which fails to update the affinity change in some cases. The solution for this is to use the core function irq_do_set_affinity() instead of open coding it. [ tglx: Added copyright notice and license boilerplate. Rewrote subject and changelog. ] Signed-off-by: Yang Yingliang Acked-by: Russell King - ARM Linux Cc: Jiang Liu Cc: Marc Zyngier Cc: Mark Rutland Cc: Will Deacon Cc: Hanjun Guo Cc: Link: http://lkml.kernel.org/r/1443087135-17044-2-git-send-email-yangyingliang@huawei.com Signed-off-by: Thomas Gleixner --- kernel/irq/Kconfig | 4 +++ kernel/irq/Makefile | 1 + kernel/irq/cpuhotplug.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 87 insertions(+) create mode 100644 kernel/irq/cpuhotplug.c (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 9a76e3b..3b48dab 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -30,6 +30,10 @@ config GENERIC_IRQ_LEGACY_ALLOC_HWIRQ config GENERIC_PENDING_IRQ bool +# Support for generic irq migrating off cpu before the cpu is offline. +config GENERIC_IRQ_MIGRATION + bool + # Alpha specific irq affinity mechanism config AUTO_IRQ_AFFINITY bool diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index d121235..2fc9cbd 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -5,5 +5,6 @@ obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o +obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o obj-$(CONFIG_PM_SLEEP) += pm.o obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c new file mode 100644 index 0000000..80f4f4e --- /dev/null +++ b/kernel/irq/cpuhotplug.c @@ -0,0 +1,82 @@ +/* + * Generic cpu hotunplug interrupt migration code copied from the + * arch/arm implementation + * + * Copyright (C) Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include + +#include "internals.h" + +static bool migrate_one_irq(struct irq_desc *desc) +{ + struct irq_data *d = irq_desc_get_irq_data(desc); + const struct cpumask *affinity = d->common->affinity; + struct irq_chip *c; + bool ret = false; + + /* + * If this is a per-CPU interrupt, or the affinity does not + * include this CPU, then we have nothing to do. + */ + if (irqd_is_per_cpu(d) || + !cpumask_test_cpu(smp_processor_id(), affinity)) + return false; + + if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { + affinity = cpu_online_mask; + ret = true; + } + + c = irq_data_get_irq_chip(d); + if (!c->irq_set_affinity) { + pr_warn_ratelimited("IRQ%u: unable to set affinity\n", d->irq); + } else { + int r = irq_do_set_affinity(d, affinity, false); + if (r) + pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n", + d->irq, r); + } + + return ret; +} + +/** + * irq_migrate_all_off_this_cpu - Migrate irqs away from offline cpu + * + * The current CPU has been marked offline. Migrate IRQs off this CPU. + * If the affinity settings do not allow other CPUs, force them onto any + * available CPU. + * + * Note: we must iterate over all IRQs, whether they have an attached + * action structure or not, as we need to get chained interrupts too. + */ +void irq_migrate_all_off_this_cpu(void) +{ + unsigned int irq; + struct irq_desc *desc; + unsigned long flags; + + local_irq_save(flags); + + for_each_active_irq(irq) { + bool affinity_broken; + + desc = irq_to_desc(irq); + raw_spin_lock(&desc->lock); + affinity_broken = migrate_one_irq(desc); + raw_spin_unlock(&desc->lock); + + if (affinity_broken) + pr_warn_ratelimited("IRQ%u no longer affine to CPU%u\n", + irq, smp_processor_id()); + } + + local_irq_restore(flags); +} -- cgit v1.1 From 79ac6ef521075d0f40805df77e8890c55f538fe4 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 9 Sep 2015 23:24:01 +0200 Subject: tracing: Use kstrdup_const instead of private implementation The kernel now has kstrdup_const/kfree_const for reusing .rodata (typically string literals) when possible; there's no reason to duplicate that logic in the tracing system. Moreover, as the comment above core_kernel_data states, it may not always return true for .rodata - that is for example the case on x86_64, where we thus end up kstrdup'ing all the passed-in strings. Arguably, testing for .rodata explicitly (as kstrdup_const does) is also more correct: I don't think one is supposed to be able to change the name after creating the event_subsystem by passing the address of a static char (but non-const) array. Link: http://lkml.kernel.org/r/1441833841-12955-1-git-send-email-linux@rasmusvillemoes.dk Signed-off-by: Rasmus Villemoes Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 57c9e70..d120cfe 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -38,21 +38,19 @@ static LIST_HEAD(ftrace_common_fields); static struct kmem_cache *field_cachep; static struct kmem_cache *file_cachep; -#define SYSTEM_FL_FREE_NAME (1 << 31) - static inline int system_refcount(struct event_subsystem *system) { - return system->ref_count & ~SYSTEM_FL_FREE_NAME; + return system->ref_count; } static int system_refcount_inc(struct event_subsystem *system) { - return (system->ref_count++) & ~SYSTEM_FL_FREE_NAME; + return system->ref_count++; } static int system_refcount_dec(struct event_subsystem *system) { - return (--system->ref_count) & ~SYSTEM_FL_FREE_NAME; + return --system->ref_count; } /* Double loops, do not use break, only goto's work */ @@ -461,8 +459,7 @@ static void __put_system(struct event_subsystem *system) kfree(filter->filter_string); kfree(filter); } - if (system->ref_count & SYSTEM_FL_FREE_NAME) - kfree(system->name); + kfree_const(system->name); kfree(system); } @@ -1493,13 +1490,9 @@ create_new_subsystem(const char *name) system->ref_count = 1; /* Only allocate if dynamic (kprobes and modules) */ - if (!core_kernel_data((unsigned long)name)) { - system->ref_count |= SYSTEM_FL_FREE_NAME; - system->name = kstrdup(name, GFP_KERNEL); - if (!system->name) - goto out_free; - } else - system->name = name; + system->name = kstrdup_const(name, GFP_KERNEL); + if (!system->name) + goto out_free; system->filter = NULL; @@ -1512,8 +1505,7 @@ create_new_subsystem(const char *name) return system; out_free: - if (system->ref_count & SYSTEM_FL_FREE_NAME) - kfree(system->name); + kfree_const(system->name); kfree(system); return NULL; } -- cgit v1.1 From 9eec50b8bbe1535c440a1ee88c1958f78fc55957 Mon Sep 17 00:00:00 2001 From: Andrey Smetanin Date: Wed, 16 Sep 2015 12:29:50 +0300 Subject: kvm/x86: Hyper-V HV_X64_MSR_VP_RUNTIME support HV_X64_MSR_VP_RUNTIME msr used by guest to get "the time the virtual processor consumes running guest code, and the time the associated logical processor spends running hypervisor code on behalf of that guest." Calculation of this time is performed by task_cputime_adjusted() for vcpu task. Necessary to support loading of winhv.sys in guest, which in turn is required to support Windows VMBus. Signed-off-by: Andrey Smetanin Reviewed-by: Roman Kagan Signed-off-by: Denis V. Lunev CC: Paolo Bonzini CC: Gleb Natapov Signed-off-by: Paolo Bonzini --- kernel/sched/cputime.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 8cbc3db..26a5446 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -444,6 +444,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) *ut = p->utime; *st = p->stime; } +EXPORT_SYMBOL_GPL(task_cputime_adjusted); void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { @@ -652,6 +653,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) task_cputime(p, &cputime.utime, &cputime.stime); cputime_adjust(&cputime, &p->prev_cputime, ut, st); } +EXPORT_SYMBOL_GPL(task_cputime_adjusted); void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { -- cgit v1.1 From 6db0290322101f971d6c06ee652d9838f3f4ee92 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 9 Sep 2015 23:27:02 +0200 Subject: ftrace: Remove redundant swap function To cover the common case of sorting an array of pointers, Daniel Wagner recently modified the library sort() to use a specific swap function for size==8, in addition to the size==4 case which was already handled. Since sizeof(long) is either 4 or 8, ftrace_swap_ips() is redundant and we can just let sort() pick an appropriate and fast swap callback. Link: http://lkml.kernel.org/r/1441834023-13130-1-git-send-email-linux@rasmusvillemoes.dk Signed-off-by: Rasmus Villemoes Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index e7638489..f7b78d7 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4788,17 +4788,6 @@ static int ftrace_cmp_ips(const void *a, const void *b) return 0; } -static void ftrace_swap_ips(void *a, void *b, int size) -{ - unsigned long *ipa = a; - unsigned long *ipb = b; - unsigned long t; - - t = *ipa; - *ipa = *ipb; - *ipb = t; -} - static int ftrace_process_locs(struct module *mod, unsigned long *start, unsigned long *end) @@ -4818,7 +4807,7 @@ static int ftrace_process_locs(struct module *mod, return 0; sort(start, count, sizeof(*start), - ftrace_cmp_ips, ftrace_swap_ips); + ftrace_cmp_ips, NULL); start_pg = ftrace_allocate_pages(count); if (!start_pg) -- cgit v1.1 From 7ec88e4be461590b5a3817460c34603f76d9b3ae Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 28 Sep 2015 22:21:28 +0200 Subject: ntp/pps: use timespec64 for hardpps() There is only one user of the hardpps function in the kernel, so it makes sense to atomically change it over to using 64-bit timestamps for y2038 safety. In the hardpps implementation, we also need to change the pps_normtime structure, which is similar to struct timespec and also requires a 64-bit seconds portion. This introduces two temporary variables in pps_kc_event() to do the conversion, they will be removed again in the next step, which seemed preferable to having a larger patch changing it all at the same time. Acked-by: Richard Cochran Acked-by: David S. Miller Reviewed-by: Thomas Gleixner Signed-off-by: Arnd Bergmann Signed-off-by: John Stultz --- kernel/time/ntp.c | 12 ++++++------ kernel/time/ntp_internal.h | 2 +- kernel/time/timekeeping.c | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index df68cb8..bd4fa62 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -99,7 +99,7 @@ static time64_t ntp_next_leap_sec = TIME64_MAX; static int pps_valid; /* signal watchdog counter */ static long pps_tf[3]; /* phase median filter */ static long pps_jitter; /* current jitter (ns) */ -static struct timespec pps_fbase; /* beginning of the last freq interval */ +static struct timespec64 pps_fbase; /* beginning of the last freq interval */ static int pps_shift; /* current interval duration (s) (shift) */ static int pps_intcnt; /* interval counter */ static s64 pps_freq; /* frequency offset (scaled ns/s) */ @@ -773,13 +773,13 @@ int __do_adjtimex(struct timex *txc, struct timespec64 *ts, s32 *time_tai) * pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */ struct pps_normtime { - __kernel_time_t sec; /* seconds */ + s64 sec; /* seconds */ long nsec; /* nanoseconds */ }; /* normalize the timestamp so that nsec is in the ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */ -static inline struct pps_normtime pps_normalize_ts(struct timespec ts) +static inline struct pps_normtime pps_normalize_ts(struct timespec64 ts) { struct pps_normtime norm = { .sec = ts.tv_sec, @@ -861,7 +861,7 @@ static long hardpps_update_freq(struct pps_normtime freq_norm) pps_errcnt++; pps_dec_freq_interval(); printk_deferred(KERN_ERR - "hardpps: PPSERROR: interval too long - %ld s\n", + "hardpps: PPSERROR: interval too long - %lld s\n", freq_norm.sec); return 0; } @@ -948,7 +948,7 @@ static void hardpps_update_phase(long error) * This code is based on David Mills's reference nanokernel * implementation. It was mostly rewritten but keeps the same idea. */ -void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) +void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) { struct pps_normtime pts_norm, freq_norm; @@ -969,7 +969,7 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) } /* ok, now we have a base for frequency calculation */ - freq_norm = pps_normalize_ts(timespec_sub(*raw_ts, pps_fbase)); + freq_norm = pps_normalize_ts(timespec64_sub(*raw_ts, pps_fbase)); /* check that the signal is in the range * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */ diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index 6543050..af92447 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -9,5 +9,5 @@ extern ktime_t ntp_get_next_leap(void); extern int second_overflow(unsigned long secs); extern int ntp_validate_timex(struct timex *); extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *); -extern void __hardpps(const struct timespec *, const struct timespec *); +extern void __hardpps(const struct timespec64 *, const struct timespec64 *); #endif /* _LINUX_NTP_INTERNAL_H */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3739ac6..177188b 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2025,7 +2025,7 @@ int do_adjtimex(struct timex *txc) /** * hardpps() - Accessor function to NTP __hardpps function */ -void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) +void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) { unsigned long flags; -- cgit v1.1 From 071eee45b1650d53d21c636d344bdcebd4577ed2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 28 Sep 2015 22:21:29 +0200 Subject: ntp/pps: replace getnstime_raw_and_real with 64-bit version There is exactly one caller of getnstime_raw_and_real in the kernel, which is the pps_get_ts function. This changes the caller and the implementation to work on timespec64 types rather than timespec, to avoid the time_t overflow on 32-bit architectures. For consistency with the other new functions (ktime_get_seconds, ktime_get_real_*, ...), I'm renaming the function to ktime_get_raw_and_real_ts64. We still need to convert from the internal 64-bit type to 32 bit types in the caller, but this conversion is now pushed out from getnstime_raw_and_real to pps_get_ts. A follow-up patch changes the remaining pps code to completely avoid the conversion. Acked-by: Richard Cochran Acked-by: David S. Miller Reviewed-by: Thomas Gleixner Signed-off-by: Arnd Bergmann Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 177188b..274ed5e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -849,7 +849,7 @@ EXPORT_SYMBOL_GPL(ktime_get_real_seconds); #ifdef CONFIG_NTP_PPS /** - * getnstime_raw_and_real - get day and raw monotonic time in timespec format + * ktime_get_raw_and_real_ts64 - get day and raw monotonic time in timespec format * @ts_raw: pointer to the timespec to be set to raw monotonic time * @ts_real: pointer to the timespec to be set to the time of day * @@ -857,7 +857,7 @@ EXPORT_SYMBOL_GPL(ktime_get_real_seconds); * same time atomically and stores the resulting timestamps in timespec * format. */ -void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) +void ktime_get_raw_and_real_ts64(struct timespec64 *ts_raw, struct timespec64 *ts_real) { struct timekeeper *tk = &tk_core.timekeeper; unsigned long seq; @@ -868,7 +868,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) do { seq = read_seqcount_begin(&tk_core.seq); - *ts_raw = timespec64_to_timespec(tk->raw_time); + *ts_raw = tk->raw_time; ts_real->tv_sec = tk->xtime_sec; ts_real->tv_nsec = 0; @@ -877,10 +877,10 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) } while (read_seqcount_retry(&tk_core.seq, seq)); - timespec_add_ns(ts_raw, nsecs_raw); - timespec_add_ns(ts_real, nsecs_real); + timespec64_add_ns(ts_raw, nsecs_raw); + timespec64_add_ns(ts_real, nsecs_real); } -EXPORT_SYMBOL(getnstime_raw_and_real); +EXPORT_SYMBOL(ktime_get_raw_and_real_ts64); #endif /* CONFIG_NTP_PPS */ -- cgit v1.1 From 5fd96c421ff2c76ec441aa4139c3b87dfea93e3a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 28 Sep 2015 22:21:30 +0200 Subject: ntp: use timespec64 in sync_cmos_clock The sync_cmos_clock has one use of struct timespec, which we want to eventually replace with timespec64 or similar in the kernel. There is no way this one can overflow, but the conversion to timespec64 is trivial and has no other dependencies. Acked-by: Richard Cochran Acked-by: David S. Miller Reviewed-by: Thomas Gleixner Signed-off-by: Arnd Bergmann Signed-off-by: John Stultz --- kernel/time/ntp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index bd4fa62..149cc80 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -509,7 +509,7 @@ static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); static void sync_cmos_clock(struct work_struct *work) { struct timespec64 now; - struct timespec next; + struct timespec64 next; int fail = 1; /* @@ -559,7 +559,7 @@ static void sync_cmos_clock(struct work_struct *work) next.tv_nsec -= NSEC_PER_SEC; } queue_delayed_work(system_power_efficient_wq, - &sync_cmos_work, timespec_to_jiffies(&next)); + &sync_cmos_work, timespec64_to_jiffies(&next)); } void ntp_notify_cmos_timer(void) -- cgit v1.1 From 67dfae0cd72fec5cd158b6e5fb1647b7dbe0834c Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 14 Sep 2015 18:05:20 -0700 Subject: clocksource: Fix abs() usage w/ 64bit values This patch fixes one cases where abs() was being used with 64-bit nanosecond values, where the result may be capped at 32-bits. This potentially could cause watchdog false negatives on 32-bit systems, so this patch addresses the issue by using abs64(). Signed-off-by: John Stultz Cc: Prarit Bhargava Cc: Richard Cochran Cc: Ingo Molnar Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1442279124-7309-2-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 841b72f..3a38775 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -217,7 +217,7 @@ static void clocksource_watchdog(unsigned long data) continue; /* Check the deviation from the watchdog clocksource. */ - if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) { + if (abs64(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable because the skew is too large:\n", cs->name); pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n", -- cgit v1.1 From a91263d520246b63c63e75ddfb072ee6a853fe15 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 30 Sep 2015 01:41:50 +0200 Subject: ebpf: migrate bpf_prog's flags to bitfield As we need to add further flags to the bpf_prog structure, lets migrate both bools to a bitfield representation. The size of the base structure (excluding insns) remains unchanged at 40 bytes. Add also tags for the kmemchecker, so that it doesn't throw false positives. Even in case gcc would generate suboptimal code, it's not being accessed in performance critical paths. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/core.c | 4 ++++ kernel/bpf/syscall.c | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 67c380c..c8855c2 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -82,6 +82,8 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) if (fp == NULL) return NULL; + kmemcheck_annotate_bitfield(fp, meta); + aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags); if (aux == NULL) { vfree(fp); @@ -110,6 +112,8 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); if (fp != NULL) { + kmemcheck_annotate_bitfield(fp, meta); + memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); fp->pages = size / PAGE_SIZE; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 35bac8e..2190ab1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -553,10 +553,10 @@ static int bpf_prog_load(union bpf_attr *attr) goto free_prog; prog->orig_prog = NULL; - prog->jited = false; + prog->jited = 0; atomic_set(&prog->aux->refcnt, 1); - prog->gpl_compatible = is_gpl; + prog->gpl_compatible = is_gpl ? 1 : 0; /* find program type: socket_filter vs tracing_filter */ err = find_prog_type(type, prog); -- cgit v1.1 From c46646d0484f5d08e2bede9b45034ba5b8b489cc Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 30 Sep 2015 01:41:51 +0200 Subject: sched, bpf: add helper for retrieving routing realms Using routing realms as part of the classifier is quite useful, it can be viewed as a tag for one or multiple routing entries (think of an analogy to net_cls cgroup for processes), set by user space routing daemons or via iproute2 as an indicator for traffic classifiers and later on processed in the eBPF program. Unlike actions, the classifier can inspect device flags and enable netif_keep_dst() if necessary. tc actions don't have that possibility, but in case people know what they are doing, it can be used from there as well (e.g. via devs that must keep dsts by design anyway). If a realm is set, the handler returns the non-zero realm. User space can set the full 32bit realm for the dst. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2190ab1..5f35f42 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -402,6 +402,8 @@ static void fixup_bpf_calls(struct bpf_prog *prog) */ BUG_ON(!prog->aux->ops->get_func_proto); + if (insn->imm == BPF_FUNC_get_route_realm) + prog->dst_needed = 1; if (insn->imm == BPF_FUNC_tail_call) { /* mark bpf_tail_call as different opcode * to avoid conditional branch in -- cgit v1.1 From 621a5f7ad9cd1ce7933f1d302067cbd58354173c Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Sat, 26 Sep 2015 15:04:07 -0700 Subject: debugfs: Pass bool pointer to debugfs_create_bool() Its a bit odd that debugfs_create_bool() takes 'u32 *' as an argument, when all it needs is a boolean pointer. It would be better to update this API to make it accept 'bool *' instead, as that will make it more consistent and often more convenient. Over that bool takes just a byte. That required updates to all user sites as well, in the same commit updating the API. regmap core was also using debugfs_{read|write}_file_bool(), directly and variable types were updated for that to be bool as well. Signed-off-by: Viresh Kumar Acked-by: Mark Brown Acked-by: Charles Keepax Signed-off-by: Greg Kroah-Hartman --- kernel/futex.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 6e443ef..395b967 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -267,10 +267,10 @@ static struct futex_hash_bucket *futex_queues; static struct { struct fault_attr attr; - u32 ignore_private; + bool ignore_private; } fail_futex = { .attr = FAULT_ATTR_INITIALIZER, - .ignore_private = 0, + .ignore_private = false, }; static int __init setup_fail_futex(char *str) -- cgit v1.1 From bab18991871545dfbd10c931eb0fe8f7637156a9 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 2 Oct 2015 15:17:33 +0200 Subject: bpf, seccomp: prepare for upcoming criu support The current ongoing effort to dump existing cBPF seccomp filters back to user space requires to hold the pre-transformed instructions like we do in case of socket filters from sk_attach_filter() side, so they can be reloaded in original form at a later point in time by utilities such as criu. To prepare for this, simply extend the bpf_prog_create_from_user() API to hold a flag that tells whether we should store the original or not. Also, fanout filters could make use of that in future for things like diag. While fanout filters already use bpf_prog_destroy(), move seccomp over to them as well to handle original programs when present. Signed-off-by: Daniel Borkmann Cc: Tycho Andersen Cc: Pavel Emelyanov Cc: Kees Cook Cc: Andy Lutomirski Cc: Alexei Starovoitov Tested-by: Tycho Andersen Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/seccomp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 5bd4779..06858a7 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -370,7 +370,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) return ERR_PTR(-ENOMEM); ret = bpf_prog_create_from_user(&sfilter->prog, fprog, - seccomp_check_filter); + seccomp_check_filter, false); if (ret < 0) { kfree(sfilter); return ERR_PTR(ret); @@ -469,7 +469,7 @@ void get_seccomp_filter(struct task_struct *tsk) static inline void seccomp_filter_free(struct seccomp_filter *filter) { if (filter) { - bpf_prog_free(filter->prog); + bpf_prog_destroy(filter->prog); kfree(filter); } } -- cgit v1.1 From 0cdf5640e4f6940bdbbefee4bb0adb7dffb185ec Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 2 Oct 2015 18:42:00 +0200 Subject: ebpf: include perf_event only where really needed Commit ea317b267e9d ("bpf: Add new bpf map type to store the pointer to struct perf_event") added perf_event.h to the main eBPF header, so it gets included for all users. perf_event.h is actually only needed from array map side, so lets sanitize this a bit. Signed-off-by: Daniel Borkmann Cc: Kaixu Xia Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/arraymap.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 29ace10..2fecc4a 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -15,6 +15,7 @@ #include #include #include +#include /* Called from syscall */ static struct bpf_map *array_map_alloc(union bpf_attr *attr) -- cgit v1.1 From 95913d97914f44db2b81271c2e2ebd4d2ac2df83 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 29 Sep 2015 14:45:09 +0200 Subject: sched/core: Fix TASK_DEAD race in finish_task_switch() So the problem this patch is trying to address is as follows: CPU0 CPU1 context_switch(A, B) ttwu(A) LOCK A->pi_lock A->on_cpu == 0 finish_task_switch(A) prev_state = A->state <-. WMB | A->on_cpu = 0; | UNLOCK rq0->lock | | context_switch(C, A) `-- A->state = TASK_DEAD prev_state == TASK_DEAD put_task_struct(A) context_switch(A, C) finish_task_switch(A) A->state == TASK_DEAD put_task_struct(A) The argument being that the WMB will allow the load of A->state on CPU0 to cross over and observe CPU1's store of A->state, which will then result in a double-drop and use-after-free. Now the comment states (and this was true once upon a long time ago) that we need to observe A->state while holding rq->lock because that will order us against the wakeup; however the wakeup will not in fact acquire (that) rq->lock; it takes A->pi_lock these days. We can obviously fix this by upgrading the WMB to an MB, but that is expensive, so we'd rather avoid that. The alternative this patch takes is: smp_store_release(&A->on_cpu, 0), which avoids the MB on some archs, but not important ones like ARM. Reported-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Acked-by: Linus Torvalds Cc: # v3.1+ Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Cc: manfred@colorfullife.com Cc: will.deacon@arm.com Fixes: e4a52bcb9a18 ("sched: Remove rq->lock from the first half of ttwu()") Link: http://lkml.kernel.org/r/20150929124509.GG3816@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 10 +++++----- kernel/sched/sched.h | 5 +++-- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6159531..10a8faa 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2517,11 +2517,11 @@ static struct rq *finish_task_switch(struct task_struct *prev) * If a task dies, then it sets TASK_DEAD in tsk->state and calls * schedule one last time. The schedule call will never return, and * the scheduled task must drop that reference. - * The test for TASK_DEAD must occur while the runqueue locks are - * still held, otherwise prev could be scheduled on another cpu, die - * there before we look at prev->state, and then the reference would - * be dropped twice. - * Manfred Spraul + * + * We must observe prev->state before clearing prev->on_cpu (in + * finish_lock_switch), otherwise a concurrent wakeup can get prev + * running on another CPU and we could rave with its RUNNING -> DEAD + * transition, resulting in a double drop. */ prev_state = prev->state; vtime_task_switch(prev); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 68cda11..6d2a119 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1078,9 +1078,10 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) * After ->on_cpu is cleared, the task can be moved to a different CPU. * We must ensure this doesn't happen until the switch is completely * finished. + * + * Pairs with the control dependency and rmb in try_to_wake_up(). */ - smp_wmb(); - prev->on_cpu = 0; + smp_store_release(&prev->on_cpu, 0); #endif #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ -- cgit v1.1 From b99def8b961448f5b9a550dddeeb718e3975e7a6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 28 Sep 2015 18:02:03 +0200 Subject: sched/core: Rework TASK_DEAD preemption exception TASK_DEAD is special in that the final schedule call from do_exit() must be done with preemption disabled. This means we end up scheduling with a preempt_count() higher than usual (3 instead of the 'expected' 2). Since future patches will want to rely on an invariant preempt_count() value during schedule, fix this up. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Reviewed-by: Steven Rostedt Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 88a4254..530fe8ba 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2949,12 +2949,8 @@ static inline void schedule_debug(struct task_struct *prev) #ifdef CONFIG_SCHED_STACK_END_CHECK BUG_ON(unlikely(task_stack_end_corrupted(prev))); #endif - /* - * Test if we are atomic. Since do_exit() needs to call into - * schedule() atomically, we ignore that path. Otherwise whine - * if we are scheduling when we should not. - */ - if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD)) + + if (unlikely(in_atomic_preempt_off())) __schedule_bug(prev); rcu_sleep_check(); @@ -3053,6 +3049,17 @@ static void __sched __schedule(void) rcu_note_context_switch(); prev = rq->curr; + /* + * do_exit() calls schedule() with preemption disabled as an exception; + * however we must fix that up, otherwise the next task will see an + * inconsistent (higher) preempt count. + * + * It also avoids the below schedule_debug() test from complaining + * about this. + */ + if (unlikely(prev->state == TASK_DEAD)) + preempt_enable_no_resched_notrace(); + schedule_debug(prev); if (sched_feat(HRTICK)) -- cgit v1.1 From 609ca066386b2e64d4c0b0f55da327654962a0c9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 28 Sep 2015 17:52:18 +0200 Subject: sched/core: Create preempt_count invariant Assuming units of PREEMPT_DISABLE_OFFSET for preempt_count() numbers. Now that TASK_DEAD no longer results in preempt_count() == 3 during scheduling, we will always call context_switch() with preempt_count() == 2. However, we don't always end up with preempt_count() == 2 in finish_task_switch() because new tasks get created with preempt_count() == 1. Create FORK_PREEMPT_COUNT and set it to 2 and use that in the right places. Note that we cannot use INIT_PREEMPT_COUNT as that serves another purpose (boot). After this, preempt_count() is invariant across the context switch, with exception of PREEMPT_ACTIVE. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 530fe8ba..8d8722b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2504,6 +2504,18 @@ static struct rq *finish_task_switch(struct task_struct *prev) struct mm_struct *mm = rq->prev_mm; long prev_state; + /* + * The previous task will have left us with a preempt_count of 2 + * because it left us after: + * + * schedule() + * preempt_disable(); // 1 + * __schedule() + * raw_spin_lock_irq(&rq->lock) // 2 + * + * Also, see FORK_PREEMPT_COUNT. + */ + rq->prev_mm = NULL; /* @@ -2588,8 +2600,15 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) { struct rq *rq; - /* finish_task_switch() drops rq->lock and enables preemtion */ - preempt_disable(); + /* + * New tasks start with FORK_PREEMPT_COUNT, see there and + * finish_task_switch() for details. + * + * finish_task_switch() will drop rq->lock() and lower preempt_count + * and the preempt_enable() will end up enabling preemption (on + * PREEMPT_COUNT kernels). + */ + rq = finish_task_switch(prev); balance_callback(rq); preempt_enable(); -- cgit v1.1 From fc13aebab7d8f0d19d557c721a0f25cdf7ae905c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 28 Sep 2015 18:05:34 +0200 Subject: sched/core: Add preempt argument to __schedule() There is only a single PREEMPT_ACTIVE use in the regular __schedule() path and that is to circumvent the task->state check. Since the code setting PREEMPT_ACTIVE is the immediate caller of __schedule() we can replace this with a function argument. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Reviewed-by: Steven Rostedt Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8d8722b..0a71f89 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3056,7 +3056,7 @@ again: * * WARNING: must be called with preemption disabled! */ -static void __sched __schedule(void) +static void __sched __schedule(bool preempt) { struct task_struct *prev, *next; unsigned long *switch_count; @@ -3096,7 +3096,7 @@ static void __sched __schedule(void) rq->clock_skip_update <<= 1; /* promote REQ to ACT */ switch_count = &prev->nivcsw; - if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { + if (!preempt && prev->state) { if (unlikely(signal_pending_state(prev->state, prev))) { prev->state = TASK_RUNNING; } else { @@ -3161,7 +3161,7 @@ asmlinkage __visible void __sched schedule(void) sched_submit_work(tsk); do { preempt_disable(); - __schedule(); + __schedule(false); sched_preempt_enable_no_resched(); } while (need_resched()); } @@ -3202,7 +3202,7 @@ static void __sched notrace preempt_schedule_common(void) { do { preempt_active_enter(); - __schedule(); + __schedule(true); preempt_active_exit(); /* @@ -3267,7 +3267,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) * an infinite recursion. */ prev_ctx = exception_enter(); - __schedule(); + __schedule(true); exception_exit(prev_ctx); barrier(); @@ -3296,7 +3296,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) do { preempt_active_enter(); local_irq_enable(); - __schedule(); + __schedule(true); local_irq_disable(); preempt_active_exit(); } while (need_resched()); -- cgit v1.1 From c73464b1c8434ad4cbfd5369c3e724f3e8ffe5a4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 28 Sep 2015 18:06:56 +0200 Subject: sched/core: Fix trace_sched_switch() __trace_sched_switch_state() is the last remaining PREEMPT_ACTIVE user, move trace_sched_switch() from prepare_task_switch() to __schedule() and propagate the @preempt argument. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Reviewed-by: Steven Rostedt Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- kernel/trace/ftrace.c | 2 +- kernel/trace/trace_sched_switch.c | 3 ++- kernel/trace/trace_sched_wakeup.c | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0a71f89..cfad7f5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2470,7 +2470,6 @@ static inline void prepare_task_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { - trace_sched_switch(prev, next); sched_info_switch(rq, prev, next); perf_event_task_sched_out(prev, next); fire_sched_out_preempt_notifiers(prev, next); @@ -3132,6 +3131,7 @@ static void __sched __schedule(bool preempt) rq->curr = next; ++*switch_count; + trace_sched_switch(preempt, prev, next); rq = context_switch(rq, prev, next); /* unlocks the rq */ cpu = cpu_of(rq); } else { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b0623ac..00611e9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5697,7 +5697,7 @@ free: } static void -ftrace_graph_probe_sched_switch(void *ignore, +ftrace_graph_probe_sched_switch(void *ignore, bool preempt, struct task_struct *prev, struct task_struct *next) { unsigned long long timestamp; diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index f270088..4c896a0 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -16,7 +16,8 @@ static int sched_ref; static DEFINE_MUTEX(sched_register_mutex); static void -probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next) +probe_sched_switch(void *ignore, bool preempt, + struct task_struct *prev, struct task_struct *next) { if (unlikely(!sched_ref)) return; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 12cbe77..4bcfbac 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -420,7 +420,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, } static void notrace -probe_wakeup_sched_switch(void *ignore, +probe_wakeup_sched_switch(void *ignore, bool preempt, struct task_struct *prev, struct task_struct *next) { struct trace_array_cpu *data; -- cgit v1.1 From 3d8f74dd4ca1da8a1a464bbafcf679e40c2fc10f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 28 Sep 2015 18:09:19 +0200 Subject: sched/core: Stop setting PREEMPT_ACTIVE Now that nothing tests for PREEMPT_ACTIVE anymore, stop setting it. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Reviewed-by: Steven Rostedt Reviewed-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cfad7f5..6344d82 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3201,9 +3201,9 @@ void __sched schedule_preempt_disabled(void) static void __sched notrace preempt_schedule_common(void) { do { - preempt_active_enter(); + preempt_disable(); __schedule(true); - preempt_active_exit(); + sched_preempt_enable_no_resched(); /* * Check again in case we missed a preemption opportunity @@ -3254,13 +3254,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) return; do { - /* - * Use raw __prempt_count() ops that don't call function. - * We can't call functions before disabling preemption which - * disarm preemption tracing recursions. - */ - __preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); - barrier(); + preempt_disable_notrace(); /* * Needs preempt disabled in case user_exit() is traced * and the tracer calls preempt_enable_notrace() causing @@ -3270,8 +3264,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) __schedule(true); exception_exit(prev_ctx); - barrier(); - __preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); + preempt_enable_no_resched_notrace(); } while (need_resched()); } EXPORT_SYMBOL_GPL(preempt_schedule_notrace); @@ -3294,11 +3287,11 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) prev_state = exception_enter(); do { - preempt_active_enter(); + preempt_disable(); local_irq_enable(); __schedule(true); local_irq_disable(); - preempt_active_exit(); + sched_preempt_enable_no_resched(); } while (need_resched()); exception_exit(prev_state); -- cgit v1.1 From 1dc0fffc48af94513e621f95dff730ed4f7317ec Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 28 Sep 2015 17:57:39 +0200 Subject: sched/core: Robustify preemption leak checks When we warn about a preempt_count leak; reset the preempt_count to the known good value such that the problem does not ripple forward. This is most important on x86 which has a per cpu preempt_count that is not saved/restored (after this series). So if you schedule with an invalid (!2*PREEMPT_DISABLE_OFFSET) preempt_count the next task is messed up too. Enforcing this invariant limits the borkage to just the one task. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Frederic Weisbecker Reviewed-by: Thomas Gleixner Reviewed-by: Steven Rostedt Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/exit.c | 4 +++- kernel/sched/core.c | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index ea95ee1..443677c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -706,10 +706,12 @@ void do_exit(long code) smp_mb(); raw_spin_unlock_wait(&tsk->pi_lock); - if (unlikely(in_atomic())) + if (unlikely(in_atomic())) { pr_info("note: %s[%d] exited with preempt_count %d\n", current->comm, task_pid_nr(current), preempt_count()); + preempt_count_set(PREEMPT_ENABLED); + } /* sync mm's RSS info before statistics gathering */ if (tsk->mm) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6344d82..d6989f8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2968,8 +2968,10 @@ static inline void schedule_debug(struct task_struct *prev) BUG_ON(unlikely(task_stack_end_corrupted(prev))); #endif - if (unlikely(in_atomic_preempt_off())) + if (unlikely(in_atomic_preempt_off())) { __schedule_bug(prev); + preempt_count_set(PREEMPT_DISABLED); + } rcu_sleep_check(); profile_hit(SCHED_PROFILING, __builtin_return_address(0)); -- cgit v1.1 From da7142e2ed735e1c1bef5a757dc55de35c65fbd6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 28 Sep 2015 18:11:45 +0200 Subject: sched/core: Simplify preempt_count tests Since we stopped setting PREEMPT_ACTIVE, there is no need to mask it out of preempt_count() tests. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Reviewed-by: Steven Rostedt Reviewed-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d6989f8..ca260cc 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7486,7 +7486,7 @@ void __init sched_init(void) #ifdef CONFIG_DEBUG_ATOMIC_SLEEP static inline int preempt_count_equals(int preempt_offset) { - int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); + int nested = preempt_count() + rcu_preempt_depth(); return (nested == preempt_offset); } -- cgit v1.1 From 499d79559ffe4b9c0c3031752f6a40abd532fb75 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 28 Sep 2015 18:52:36 +0200 Subject: sched/core: More notrace annotations preempt_schedule_common() is marked notrace, but it does not use _notrace() preempt_count functions and __schedule() is also not marked notrace, which means that its perfectly possible to end up in the tracer from preempt_schedule_common(). Steve says: | Yep, there's some history to this. This was originally the issue that | caused function tracing to go into infinite recursion. But now we have | preempt_schedule_notrace(), which is used by the function tracer, and | that function must not be traced till preemption is disabled. | | Now if function tracing is running and we take an interrupt when | NEED_RESCHED is set, it calls | | preempt_schedule_common() (not traced) | | But then that calls preempt_disable() (traced) | | function tracer calls preempt_disable_notrace() followed by | preempt_enable_notrace() which will see NEED_RESCHED set, and it will | call preempt_schedule_notrace(), which stops the recursion, but | still calls __schedule() here, and that means when we return, we call | the __schedule() from preempt_schedule_common(). | | That said, I prefer this patch. Preemption is disabled before calling | __schedule(), and we get rid of a one round recursion with the | scheduler. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Acked-by: Steven Rostedt Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ca260cc..98c4cf8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3057,7 +3057,7 @@ again: * * WARNING: must be called with preemption disabled! */ -static void __sched __schedule(bool preempt) +static void __sched notrace __schedule(bool preempt) { struct task_struct *prev, *next; unsigned long *switch_count; @@ -3203,9 +3203,9 @@ void __sched schedule_preempt_disabled(void) static void __sched notrace preempt_schedule_common(void) { do { - preempt_disable(); + preempt_disable_notrace(); __schedule(true); - sched_preempt_enable_no_resched(); + preempt_enable_no_resched_notrace(); /* * Check again in case we missed a preemption opportunity -- cgit v1.1 From e2bf1c4b17aff25f07e0d2952d8c1c66643f33fe Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 29 Sep 2015 12:18:46 +0200 Subject: sched/core: Add preempt_count invariant check Ingo requested I keep my debug check for the preempt_count invariant. Requested-by: Ingo Molnar Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 98c4cf8..4554cde 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2514,6 +2514,10 @@ static struct rq *finish_task_switch(struct task_struct *prev) * * Also, see FORK_PREEMPT_COUNT. */ + if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, + "corrupted preempt_count: %s/%d/0x%x\n", + current->comm, current->pid, preempt_count())) + preempt_count_set(FORK_PREEMPT_COUNT); rq->prev_mm = NULL; -- cgit v1.1 From b52da86e0ad58f096710977fcda856fd84da9233 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Fri, 2 Oct 2015 07:48:25 +0530 Subject: sched/numa: Fix task_tick_fair() from disabling numa_balancing If static branch 'sched_numa_balancing' is enabled, it should kickstart NUMA balancing through task_tick_numa(). However the following commit: 2a595721a1fa ("sched/numa: Convert sched_numa_balancing to a static_branch") erroneously disables this. Fix this anomaly by enabling task_tick_numa() when the static branch 'sched_numa_balancing' is enabled. Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1443752305-27413-1-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4df37a4..3bdc3da 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7881,7 +7881,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) entity_tick(cfs_rq, se, queued); } - if (!static_branch_unlikely(&sched_numa_balancing)) + if (static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr); } -- cgit v1.1 From 1de64443d755f83af8ba8b558fded0c61afaef47 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 30 Sep 2015 17:44:13 +0200 Subject: sched/core: Fix task and run queue sched_info::run_delay inconsistencies Mike Meyer reported the following bug: > During evaluation of some performance data, it was discovered thread > and run queue run_delay accounting data was inconsistent with the other > accounting data that was collected. Further investigation found under > certain circumstances execution time was leaking into the task and > run queue accounting of run_delay. > > Consider the following sequence: > > a. thread is running. > b. thread moves beween cgroups, changes scheduling class or priority. > c. thread sleeps OR > d. thread involuntarily gives up cpu. > > a. implies: > > thread->sched_info.last_queued = 0 > > a. and b. results in the following: > > 1. dequeue_task(rq, thread) > > sched_info_dequeued(rq, thread) > delta = 0 > > sched_info_reset_dequeued(thread) > thread->sched_info.last_queued = 0 > > thread->sched_info.run_delay += delta > > 2. enqueue_task(rq, thread) > > sched_info_queued(rq, thread) > > /* thread is still on cpu at this point. */ > thread->sched_info.last_queued = task_rq(thread)->clock; > > c. results in: > > dequeue_task(rq, thread) > > sched_info_dequeued(rq, thread) > > /* delta is execution time not run_delay. */ > delta = task_rq(thread)->clock - thread->sched_info.last_queued > > sched_info_reset_dequeued(thread) > thread->sched_info.last_queued = 0 > > thread->sched_info.run_delay += delta > > Since thread was running between enqueue_task(rq, thread) and > dequeue_task(rq, thread), the delta above is really execution > time and not run_delay. > > d. results in: > > __sched_info_switch(thread, next_thread) > > sched_info_depart(rq, thread) > > sched_info_queued(rq, thread) > > /* last_queued not updated due to being non-zero */ > return > > Since thread was running between enqueue_task(rq, thread) and > __sched_info_switch(thread, next_thread), the execution time > between enqueue_task(rq, thread) and > __sched_info_switch(thread, next_thread) now will become > associated with run_delay due to when last_queued was last updated. > This alternative patch solves the problem by not calling sched_info_{de,}queued() in {de,en}queue_task(). Therefore the sched_info state is preserved and things work as expected. By inlining the {de,en}queue_task() functions the new condition becomes (mostly) a compile-time constant and we'll not emit any new branch instructions. It even shrinks the code (due to inlining {en,de}queue_task()): $ size defconfig-build/kernel/sched/core.o defconfig-build/kernel/sched/core.o.orig text data bss dec hex filename 64019 23378 2344 89741 15e8d defconfig-build/kernel/sched/core.o 64149 23378 2344 89871 15f0f defconfig-build/kernel/sched/core.o.orig Reported-by: Mike Meyer Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20150930154413.GO3604@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 44 +++++++++++++++++++++++++------------------- kernel/sched/sched.h | 14 ++++++++------ 2 files changed, 33 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4554cde..fb14a01 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -827,17 +827,19 @@ static void set_load_weight(struct task_struct *p) load->inv_weight = prio_to_wmult[prio]; } -static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) +static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) { update_rq_clock(rq); - sched_info_queued(rq, p); + if (!(flags & ENQUEUE_RESTORE)) + sched_info_queued(rq, p); p->sched_class->enqueue_task(rq, p, flags); } -static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) +static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) { update_rq_clock(rq); - sched_info_dequeued(rq, p); + if (!(flags & DEQUEUE_SAVE)) + sched_info_dequeued(rq, p); p->sched_class->dequeue_task(rq, p, flags); } @@ -1178,7 +1180,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) * holding rq->lock. */ lockdep_assert_held(&rq->lock); - dequeue_task(rq, p, 0); + dequeue_task(rq, p, DEQUEUE_SAVE); } if (running) put_prev_task(rq, p); @@ -1188,7 +1190,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) if (running) p->sched_class->set_curr_task(rq); if (queued) - enqueue_task(rq, p, 0); + enqueue_task(rq, p, ENQUEUE_RESTORE); } /* @@ -1692,7 +1694,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) #endif /* CONFIG_SCHEDSTATS */ } -static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) +static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) { activate_task(rq, p, en_flags); p->on_rq = TASK_ON_RQ_QUEUED; @@ -3325,7 +3327,7 @@ EXPORT_SYMBOL(default_wake_function); */ void rt_mutex_setprio(struct task_struct *p, int prio) { - int oldprio, queued, running, enqueue_flag = 0; + int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE; struct rq *rq; const struct sched_class *prev_class; @@ -3357,7 +3359,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) - dequeue_task(rq, p, 0); + dequeue_task(rq, p, DEQUEUE_SAVE); if (running) put_prev_task(rq, p); @@ -3375,7 +3377,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (!dl_prio(p->normal_prio) || (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { p->dl.dl_boosted = 1; - enqueue_flag = ENQUEUE_REPLENISH; + enqueue_flag |= ENQUEUE_REPLENISH; } else p->dl.dl_boosted = 0; p->sched_class = &dl_sched_class; @@ -3383,7 +3385,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (dl_prio(oldprio)) p->dl.dl_boosted = 0; if (oldprio < prio) - enqueue_flag = ENQUEUE_HEAD; + enqueue_flag |= ENQUEUE_HEAD; p->sched_class = &rt_sched_class; } else { if (dl_prio(oldprio)) @@ -3435,7 +3437,7 @@ void set_user_nice(struct task_struct *p, long nice) } queued = task_on_rq_queued(p); if (queued) - dequeue_task(rq, p, 0); + dequeue_task(rq, p, DEQUEUE_SAVE); p->static_prio = NICE_TO_PRIO(nice); set_load_weight(p); @@ -3444,7 +3446,7 @@ void set_user_nice(struct task_struct *p, long nice) delta = p->prio - old_prio; if (queued) { - enqueue_task(rq, p, 0); + enqueue_task(rq, p, ENQUEUE_RESTORE); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -3946,7 +3948,7 @@ change: queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) - dequeue_task(rq, p, 0); + dequeue_task(rq, p, DEQUEUE_SAVE); if (running) put_prev_task(rq, p); @@ -3956,11 +3958,15 @@ change: if (running) p->sched_class->set_curr_task(rq); if (queued) { + int enqueue_flags = ENQUEUE_RESTORE; /* * We enqueue to tail when the priority of a task is * increased (user space view). */ - enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0); + if (oldprio <= p->prio) + enqueue_flags |= ENQUEUE_HEAD; + + enqueue_task(rq, p, enqueue_flags); } check_class_changed(rq, p, prev_class, oldprio); @@ -5109,7 +5115,7 @@ void sched_setnuma(struct task_struct *p, int nid) running = task_current(rq, p); if (queued) - dequeue_task(rq, p, 0); + dequeue_task(rq, p, DEQUEUE_SAVE); if (running) put_prev_task(rq, p); @@ -5118,7 +5124,7 @@ void sched_setnuma(struct task_struct *p, int nid) if (running) p->sched_class->set_curr_task(rq); if (queued) - enqueue_task(rq, p, 0); + enqueue_task(rq, p, ENQUEUE_RESTORE); task_rq_unlock(rq, p, &flags); } #endif /* CONFIG_NUMA_BALANCING */ @@ -7737,7 +7743,7 @@ void sched_move_task(struct task_struct *tsk) queued = task_on_rq_queued(tsk); if (queued) - dequeue_task(rq, tsk, 0); + dequeue_task(rq, tsk, DEQUEUE_SAVE); if (unlikely(running)) put_prev_task(rq, tsk); @@ -7761,7 +7767,7 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->set_curr_task(rq); if (queued) - enqueue_task(rq, tsk, 0); + enqueue_task(rq, tsk, ENQUEUE_RESTORE); task_rq_unlock(rq, tsk, &flags); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 046242f..e08cc4c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1151,16 +1151,18 @@ static const u32 prio_to_wmult[40] = { /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, }; -#define ENQUEUE_WAKEUP 1 -#define ENQUEUE_HEAD 2 +#define ENQUEUE_WAKEUP 0x01 +#define ENQUEUE_HEAD 0x02 #ifdef CONFIG_SMP -#define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */ +#define ENQUEUE_WAKING 0x04 /* sched_class::task_waking was called */ #else -#define ENQUEUE_WAKING 0 +#define ENQUEUE_WAKING 0x00 #endif -#define ENQUEUE_REPLENISH 8 +#define ENQUEUE_REPLENISH 0x08 +#define ENQUEUE_RESTORE 0x10 -#define DEQUEUE_SLEEP 1 +#define DEQUEUE_SLEEP 0x01 +#define DEQUEUE_SAVE 0x02 #define RETRY_TASK ((void *)-1UL) -- cgit v1.1 From ce03e4137bb22fc560ad7a07cf4138ae2cd59f65 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Mon, 5 Oct 2015 21:26:05 +0800 Subject: sched/core: Drop unlikely behind BUG_ON() (1) For !CONFIG_BUG cases, the bug call is a no-op, so we couldn't care less and the change is ok. (2) PPC and MIPS, which HAVE_ARCH_BUG_ON, do not rely on branch predictions as it seems to be pointless [1] and thus callers should not be trying to push an optimization in the first place. (3) For CONFIG_BUG and !HAVE_ARCH_BUG_ON cases, BUG_ON() contains an unlikely compiler flag already. Hence, we can drop unlikely behind BUG_ON(). [1] http://lkml.iu.edu/hypermail/linux/kernel/1101.3/02289.html Signed-off-by: Geliang Tang Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/6fa7125979f98bbeac26e268271769b6ca935c8d.1444051018.git.geliangtang@163.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fb14a01..a395db1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2971,7 +2971,7 @@ static noinline void __schedule_bug(struct task_struct *prev) static inline void schedule_debug(struct task_struct *prev) { #ifdef CONFIG_SCHED_STACK_END_CHECK - BUG_ON(unlikely(task_stack_end_corrupted(prev))); + BUG_ON(task_stack_end_corrupted(prev)); #endif if (unlikely(in_atomic_preempt_off())) { -- cgit v1.1 From 5a4fd0368517bc5b5399ef958f6d30cbff492918 Mon Sep 17 00:00:00 2001 From: "xiaofeng.yan" Date: Wed, 23 Sep 2015 14:55:59 +0800 Subject: sched/core: Remove a parameter in the migrate_task_rq() function The parameter "int next_cpu" in the following function is unused: migrate_task_rq(struct task_struct *p, int next_cpu) Remove it. Signed-off-by: xiaofeng.yan Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1442991360-31945-1-git-send-email-yanxiaofeng@inspur.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- kernel/sched/fair.c | 2 +- kernel/sched/sched.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a395db1..1764a0f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1294,7 +1294,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (task_cpu(p) != new_cpu) { if (p->sched_class->migrate_task_rq) - p->sched_class->migrate_task_rq(p, new_cpu); + p->sched_class->migrate_task_rq(p); p->se.nr_migrations++; perf_event_task_migrate(p); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3bdc3da..700eb54 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5009,7 +5009,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f * previous cpu. However, the caller only guarantees p->pi_lock is held; no * other assumptions, including the state of rq->lock, should be made. */ -static void migrate_task_rq_fair(struct task_struct *p, int next_cpu) +static void migrate_task_rq_fair(struct task_struct *p) { /* * We are supposed to update the task to "current" time, then its up to date diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e08cc4c..efd3bfc 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1190,7 +1190,7 @@ struct sched_class { #ifdef CONFIG_SMP int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); - void (*migrate_task_rq)(struct task_struct *p, int next_cpu); + void (*migrate_task_rq)(struct task_struct *p); void (*task_waking) (struct task_struct *task); void (*task_woken) (struct rq *this_rq, struct task_struct *task); -- cgit v1.1 From 81a43adae3b943193fb3afd20a36a7482332f964 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 30 Sep 2015 13:03:12 -0700 Subject: locking/mutex: Use acquire/release semantics As of 654672d4ba1 (locking/atomics: Add _{acquire|release|relaxed}() variants of some atomic operations) and 6d79ef2d30e (locking, asm-generic: Add _{relaxed|acquire|release}() variants for 'atomic_long_t'), weakly ordered archs can benefit from more relaxed use of barriers when locking and unlocking, instead of regular full barrier semantics. While currently only arm64 supports such optimizations, updating corresponding locking primitives serves for other archs to immediately benefit as well, once the necessary machinery is implemented of course. Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Paul E.McKenney Cc: Peter Zijlstra Cc: Will Deacon Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1443643395-17016-3-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 4cccea6..0551c21 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -277,7 +277,7 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) static inline bool mutex_try_to_acquire(struct mutex *lock) { return !mutex_is_locked(lock) && - (atomic_cmpxchg(&lock->count, 1, 0) == 1); + (atomic_cmpxchg_acquire(&lock->count, 1, 0) == 1); } /* @@ -529,7 +529,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * Once more, try to acquire the lock. Only try-lock the mutex if * it is unlocked to reduce unnecessary xchg() operations. */ - if (!mutex_is_locked(lock) && (atomic_xchg(&lock->count, 0) == 1)) + if (!mutex_is_locked(lock) && + (atomic_xchg_acquire(&lock->count, 0) == 1)) goto skip_wait; debug_mutex_lock_common(lock, &waiter); @@ -553,7 +554,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * non-negative in order to avoid unnecessary xchg operations: */ if (atomic_read(&lock->count) >= 0 && - (atomic_xchg(&lock->count, -1) == 1)) + (atomic_xchg_acquire(&lock->count, -1) == 1)) break; /* @@ -867,7 +868,7 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) spin_lock_mutex(&lock->wait_lock, flags); - prev = atomic_xchg(&lock->count, -1); + prev = atomic_xchg_acquire(&lock->count, -1); if (likely(prev == 1)) { mutex_set_owner(lock); mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); -- cgit v1.1 From 700318d1d7b38bbfe86813d9c5c18364dd941526 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 30 Sep 2015 13:03:13 -0700 Subject: locking/rtmutex: Use acquire/release semantics As of 654672d4ba1 (locking/atomics: Add _{acquire|release|relaxed}() variants of some atomic operations) and 6d79ef2d30e (locking, asm-generic: Add _{relaxed|acquire|release}() variants for 'atomic_long_t'), weakly ordered archs can benefit from more relaxed use of barriers when locking and unlocking, instead of regular full barrier semantics. While currently only arm64 supports such optimizations, updating corresponding locking primitives serves for other archs to immediately benefit as well, once the necessary machinery is implemented of course. Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Paul E.McKenney Cc: Peter Zijlstra Cc: Will Deacon Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1443643395-17016-4-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/locking/rtmutex.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 7781d80..bbb72b4 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -74,14 +74,23 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock) * set up. */ #ifndef CONFIG_DEBUG_RT_MUTEXES -# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c) +# define rt_mutex_cmpxchg_relaxed(l,c,n) (cmpxchg_relaxed(&l->owner, c, n) == c) +# define rt_mutex_cmpxchg_acquire(l,c,n) (cmpxchg_acquire(&l->owner, c, n) == c) +# define rt_mutex_cmpxchg_release(l,c,n) (cmpxchg_release(&l->owner, c, n) == c) + +/* + * Callers must hold the ->wait_lock -- which is the whole purpose as we force + * all future threads that attempt to [Rmw] the lock to the slowpath. As such + * relaxed semantics suffice. + */ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) { unsigned long owner, *p = (unsigned long *) &lock->owner; do { owner = *p; - } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); + } while (cmpxchg_relaxed(p, owner, + owner | RT_MUTEX_HAS_WAITERS) != owner); } /* @@ -121,11 +130,14 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock) * lock(wait_lock); * acquire(lock); */ - return rt_mutex_cmpxchg(lock, owner, NULL); + return rt_mutex_cmpxchg_release(lock, owner, NULL); } #else -# define rt_mutex_cmpxchg(l,c,n) (0) +# define rt_mutex_cmpxchg_relaxed(l,c,n) (0) +# define rt_mutex_cmpxchg_acquire(l,c,n) (0) +# define rt_mutex_cmpxchg_release(l,c,n) (0) + static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) { lock->owner = (struct task_struct *) @@ -1321,7 +1333,7 @@ rt_mutex_fastlock(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, enum rtmutex_chainwalk chwalk)) { - if (likely(rt_mutex_cmpxchg(lock, NULL, current))) { + if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) { rt_mutex_deadlock_account_lock(lock, current); return 0; } else @@ -1337,7 +1349,7 @@ rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, enum rtmutex_chainwalk chwalk)) { if (chwalk == RT_MUTEX_MIN_CHAINWALK && - likely(rt_mutex_cmpxchg(lock, NULL, current))) { + likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) { rt_mutex_deadlock_account_lock(lock, current); return 0; } else @@ -1348,7 +1360,7 @@ static inline int rt_mutex_fasttrylock(struct rt_mutex *lock, int (*slowfn)(struct rt_mutex *lock)) { - if (likely(rt_mutex_cmpxchg(lock, NULL, current))) { + if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) { rt_mutex_deadlock_account_lock(lock, current); return 1; } @@ -1362,7 +1374,7 @@ rt_mutex_fastunlock(struct rt_mutex *lock, { WAKE_Q(wake_q); - if (likely(rt_mutex_cmpxchg(lock, current, NULL))) { + if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) { rt_mutex_deadlock_account_unlock(current); } else { @@ -1484,7 +1496,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_unlock); bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock, struct wake_q_head *wqh) { - if (likely(rt_mutex_cmpxchg(lock, current, NULL))) { + if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) { rt_mutex_deadlock_account_unlock(current); return false; } -- cgit v1.1 From 3552a07a9c4aea32cc092fadf10a186c84ed8a61 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 30 Sep 2015 13:03:14 -0700 Subject: locking/mcs: Use acquire/release semantics As of 654672d4ba1 (locking/atomics: Add _{acquire|release|relaxed}() variants of some atomic operations) and 6d79ef2d30e (locking, asm-generic: Add _{relaxed|acquire|release}() variants for 'atomic_long_t'), weakly ordered archs can benefit from more relaxed use of barriers when locking and unlocking, instead of regular full barrier semantics. While currently only arm64 supports such optimizations, updating corresponding locking primitives serves for other archs to immediately benefit as well, once the necessary machinery is implemented of course. Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Paul E.McKenney Cc: Peter Zijlstra Cc: Will Deacon Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1443643395-17016-5-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/locking/mcs_spinlock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index fd91aaa..5b9102a 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -67,7 +67,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) node->locked = 0; node->next = NULL; - prev = xchg(lock, node); + prev = xchg_acquire(lock, node); if (likely(prev == NULL)) { /* * Lock acquired, don't need to set node->locked to 1. Threads @@ -98,7 +98,7 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) /* * Release the lock by setting it to NULL */ - if (likely(cmpxchg(lock, node, NULL) == node)) + if (likely(cmpxchg_release(lock, node, NULL) == node)) return; /* Wait until the next pointer is set */ while (!(next = READ_ONCE(node->next))) -- cgit v1.1 From 00eb4bab69db349c3bdc7e0b0f7e9070dafea58c Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 30 Sep 2015 13:03:15 -0700 Subject: locking/rwsem: Use acquire/release semantics As of 654672d4ba1 (locking/atomics: Add _{acquire|release|relaxed}() variants of some atomic operations) and 6d79ef2d30e (locking, asm-generic: Add _{relaxed|acquire|release}() variants for 'atomic_long_t'), weakly ordered archs can benefit from more relaxed use of barriers when locking and unlocking, instead of regular full barrier semantics. While currently only arm64 supports such optimizations, updating corresponding locking primitives serves for other archs to immediately benefit as well, once the necessary machinery is implemented of course. Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Paul E.McKenney Cc: Peter Zijlstra Cc: Will Deacon Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1443643395-17016-6-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 0f18971..a4d4de0 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -262,7 +262,7 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) * to reduce unnecessary expensive cmpxchg() operations. */ if (count == RWSEM_WAITING_BIAS && - cmpxchg(&sem->count, RWSEM_WAITING_BIAS, + cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { if (!list_is_singular(&sem->wait_list)) rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); @@ -285,7 +285,8 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) if (!(count == 0 || count == RWSEM_WAITING_BIAS)) return false; - old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS); + old = cmpxchg_acquire(&sem->count, count, + count + RWSEM_ACTIVE_WRITE_BIAS); if (old == count) { rwsem_set_owner(sem); return true; -- cgit v1.1 From 84778472e1b6a27a8931712c40e8cf31143c8f6c Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 2 Sep 2015 01:28:44 -0700 Subject: sched: Export sched_setscheduler_nocheck The new locktorture rtmutex_lock tests exercise priority boosting, which means that they need to set some tasks to real-time priority. To do this, they use sched_setscheduler_nocheck(). However, this is not exported to modules, which results in the following error when building locktorture as a module: ERROR: "sched_setscheduler_nocheck" [kernel/locking/locktorture.ko] undefined! This commit therefore adds an EXPORT_SYMBOL_GPL() to allow this function to be invoked from locktorture when built as a module. Reported-by: Stephen Rothwell Signed-off-by: Davidlohr Bueso Signed-off-by: Paul E. McKenney Acked-by: Ingo Molnar Reviewed-by: Josh Triplett --- kernel/sched/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2f9c928..c4e6078 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4022,6 +4022,7 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy, { return _sched_setscheduler(p, policy, param, false); } +EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) -- cgit v1.1 From b6a4ae766e3133a4db73fabc81e522d1601cb623 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Wed, 29 Jul 2015 13:29:38 +0800 Subject: rcu: Use rcu_callback_t in call_rcu*() and friends As we now have rcu_callback_t typedefs as the type of rcu callbacks, we should use it in call_rcu*() and friends as the type of parameters. This could save us a few lines of code and make it clear which function requires an rcu callbacks rather than other callbacks as its argument. Besides, this can also help cscope to generate a better database for code reading. Signed-off-by: Boqun Feng Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 4 ++-- kernel/rcu/srcu.c | 2 +- kernel/rcu/tiny.c | 8 ++++---- kernel/rcu/tree.c | 8 ++++---- kernel/rcu/tree.h | 2 +- kernel/rcu/tree_plugin.h | 2 +- kernel/rcu/update.c | 2 +- 7 files changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 7719295..51c8e7f 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -448,7 +448,7 @@ static void synchronize_rcu_busted(void) } static void -call_rcu_busted(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +call_rcu_busted(struct rcu_head *head, rcu_callback_t func) { /* This is a deliberate bug for testing purposes only! */ func(head); @@ -523,7 +523,7 @@ static void srcu_torture_synchronize(void) } static void srcu_torture_call(struct rcu_head *head, - void (*func)(struct rcu_head *head)) + rcu_callback_t func) { call_srcu(srcu_ctlp, head, func); } diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index d3fcb2e..9e61225 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -387,7 +387,7 @@ static void srcu_flip(struct srcu_struct *sp) * srcu_struct structure. */ void call_srcu(struct srcu_struct *sp, struct rcu_head *head, - void (*func)(struct rcu_head *head)) + rcu_callback_t func) { unsigned long flags; diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index d047105..944b1b4 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -44,7 +44,7 @@ struct rcu_ctrlblk; static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); static void rcu_process_callbacks(struct softirq_action *unused); static void __call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *rcu), + rcu_callback_t func, struct rcu_ctrlblk *rcp); #include "tiny_plugin.h" @@ -203,7 +203,7 @@ EXPORT_SYMBOL_GPL(synchronize_sched); * Helper function for call_rcu() and call_rcu_bh(). */ static void __call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *rcu), + rcu_callback_t func, struct rcu_ctrlblk *rcp) { unsigned long flags; @@ -229,7 +229,7 @@ static void __call_rcu(struct rcu_head *head, * period. But since we have but one CPU, that would be after any * quiescent state. */ -void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +void call_rcu_sched(struct rcu_head *head, rcu_callback_t func) { __call_rcu(head, func, &rcu_sched_ctrlblk); } @@ -239,7 +239,7 @@ EXPORT_SYMBOL_GPL(call_rcu_sched); * Post an RCU bottom-half callback to be invoked after any subsequent * quiescent state. */ -void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +void call_rcu_bh(struct rcu_head *head, rcu_callback_t func) { __call_rcu(head, func, &rcu_bh_ctrlblk); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 775d36c..b9d9e02 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3017,7 +3017,7 @@ static void rcu_leak_callback(struct rcu_head *rhp) * is expected to specify a CPU. */ static void -__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), +__call_rcu(struct rcu_head *head, rcu_callback_t func, struct rcu_state *rsp, int cpu, bool lazy) { unsigned long flags; @@ -3088,7 +3088,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), /* * Queue an RCU-sched callback for invocation after a grace period. */ -void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +void call_rcu_sched(struct rcu_head *head, rcu_callback_t func) { __call_rcu(head, func, &rcu_sched_state, -1, 0); } @@ -3097,7 +3097,7 @@ EXPORT_SYMBOL_GPL(call_rcu_sched); /* * Queue an RCU callback for invocation after a quicker grace period. */ -void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +void call_rcu_bh(struct rcu_head *head, rcu_callback_t func) { __call_rcu(head, func, &rcu_bh_state, -1, 0); } @@ -3111,7 +3111,7 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); * function may only be called from __kfree_rcu(). */ void kfree_call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *rcu)) + rcu_callback_t func) { __call_rcu(head, func, rcu_state_p, -1, 1); } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 2e991f8..ad11529 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -584,7 +584,7 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp); static int rcu_print_task_stall(struct rcu_node *rnp); static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); static void rcu_preempt_check_callbacks(void); -void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); +void call_rcu(struct rcu_head *head, rcu_callback_t func); static void __init __rcu_init_preempt(void); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index b2bf396..06116ae 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -500,7 +500,7 @@ static void rcu_preempt_do_callbacks(void) /* * Queue a preemptible-RCU callback for invocation after a grace period. */ -void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +void call_rcu(struct rcu_head *head, rcu_callback_t func) { __call_rcu(head, func, rcu_state_p, -1, 0); } diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 7a0b3bc..5f748c5 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -534,7 +534,7 @@ static void rcu_spawn_tasks_kthread(void); * Post an RCU-tasks callback. First call must be from process context * after the scheduler if fully operational. */ -void call_rcu_tasks(struct rcu_head *rhp, void (*func)(struct rcu_head *rhp)) +void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) { unsigned long flags; bool needwake; -- cgit v1.1 From db3e8db45e1cbf8cc22f1083a559d78eb91ccd63 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Wed, 29 Jul 2015 13:29:39 +0800 Subject: rcu: Use call_rcu_func_t to replace explicit type equivalents We have had the call_rcu_func_t typedef for a quite awhile, but we still use explicit function pointer types in some places. These types can confuse cscope and can be hard to read. This patch therefore replaces these types with the call_rcu_func_t typedef. Signed-off-by: Boqun Feng Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 2 +- kernel/rcu/tree.h | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 51c8e7f..f9ec6cb 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -252,7 +252,7 @@ struct rcu_torture_ops { void (*exp_sync)(void); unsigned long (*get_state)(void); void (*cond_sync)(unsigned long oldstate); - void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); + call_rcu_func_t call; void (*cb_barrier)(void); void (*fqs)(void); void (*stats)(void); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index ad11529..0c33c82 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -464,8 +464,7 @@ struct rcu_state { /* shut bogus gcc warning) */ u8 flavor_mask; /* bit in flavor mask. */ struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ - void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ - void (*func)(struct rcu_head *head)); + call_rcu_func_t call; /* call_rcu() flavor. */ /* The following fields are guarded by the root rcu_node's lock. */ -- cgit v1.1 From bb73c52bad3666997ed2ec83c0c80c3f8ef55008 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Thu, 30 Jul 2015 16:55:38 -0700 Subject: rcu: Don't disable preemption for Tiny and Tree RCU readers Because preempt_disable() maps to barrier() for non-debug builds, it forces the compiler to spill and reload registers. Because Tree RCU and Tiny RCU now only appear in CONFIG_PREEMPT=n builds, these barrier() instances generate needless extra code for each instance of rcu_read_lock() and rcu_read_unlock(). This extra code slows down Tree RCU and bloats Tiny RCU. This commit therefore removes the preempt_disable() and preempt_enable() from the non-preemptible implementations of __rcu_read_lock() and __rcu_read_unlock(), respectively. However, for debug purposes, preempt_disable() and preempt_enable() are still invoked if CONFIG_PREEMPT_COUNT=y, because this allows detection of sleeping inside atomic sections in non-preemptible kernels. However, Tiny and Tree RCU operates by coalescing all RCU read-side critical sections on a given CPU that lie between successive quiescent states. It is therefore necessary to compensate for removing barriers from __rcu_read_lock() and __rcu_read_unlock() by adding them to a couple of the RCU functions invoked during quiescent states, namely to rcu_all_qs() and rcu_note_context_switch(). However, note that the latter is more paranoia than necessity, at least until link-time optimizations become more aggressive. This is based on an earlier patch by Paul E. McKenney, fixing a bug encountered in kernels built with CONFIG_PREEMPT=n and CONFIG_PREEMPT_COUNT=y. Signed-off-by: Boqun Feng Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b9d9e02..93c0f23 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -337,12 +337,14 @@ static void rcu_momentary_dyntick_idle(void) */ void rcu_note_context_switch(void) { + barrier(); /* Avoid RCU read-side critical sections leaking down. */ trace_rcu_utilization(TPS("Start context switch")); rcu_sched_qs(); rcu_preempt_note_context_switch(); if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) rcu_momentary_dyntick_idle(); trace_rcu_utilization(TPS("End context switch")); + barrier(); /* Avoid RCU read-side critical sections leaking up. */ } EXPORT_SYMBOL_GPL(rcu_note_context_switch); @@ -353,12 +355,19 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch); * RCU flavors in desperate need of a quiescent state, which will normally * be none of them). Either way, do a lightweight quiescent state for * all RCU flavors. + * + * The barrier() calls are redundant in the common case when this is + * called externally, but just in case this is called from within this + * file. + * */ void rcu_all_qs(void) { + barrier(); /* Avoid RCU read-side critical sections leaking down. */ if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) rcu_momentary_dyntick_idle(); this_cpu_inc(rcu_qs_ctr); + barrier(); /* Avoid RCU read-side critical sections leaking up. */ } EXPORT_SYMBOL_GPL(rcu_all_qs); -- cgit v1.1 From ee968ac61d5a3440b931375d81113e0eedfcb249 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 31 Jul 2015 08:28:35 -0700 Subject: rcu: Eliminate panic when silly boot-time fanout specified This commit loosens rcutree.rcu_fanout_leaf range checks and replaces a panic() with a fallback to compile-time values. This fallback is accompanied by a WARN_ON(), and both occur when the rcutree.rcu_fanout_leaf value is too small to accommodate the number of CPUs. For example, given the current four-level limit for the rcu_node tree, a system with more than 16 CPUs built with CONFIG_FANOUT=2 must have rcutree.rcu_fanout_leaf larger than 2. Reported-by: Peter Zijlstra Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 93c0f23..713eb92 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4230,13 +4230,12 @@ static void __init rcu_init_geometry(void) rcu_fanout_leaf, nr_cpu_ids); /* - * The boot-time rcu_fanout_leaf parameter is only permitted - * to increase the leaf-level fanout, not decrease it. Of course, - * the leaf-level fanout cannot exceed the number of bits in - * the rcu_node masks. Complain and fall back to the compile- - * time values if these limits are exceeded. + * The boot-time rcu_fanout_leaf parameter must be at least two + * and cannot exceed the number of bits in the rcu_node masks. + * Complain and fall back to the compile-time values if this + * limit is exceeded. */ - if (rcu_fanout_leaf < RCU_FANOUT_LEAF || + if (rcu_fanout_leaf < 2 || rcu_fanout_leaf > sizeof(unsigned long) * 8) { rcu_fanout_leaf = RCU_FANOUT_LEAF; WARN_ON(1); @@ -4253,10 +4252,13 @@ static void __init rcu_init_geometry(void) /* * The tree must be able to accommodate the configured number of CPUs. - * If this limit is exceeded than we have a serious problem elsewhere. + * If this limit is exceeded, fall back to the compile-time values. */ - if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - 1]) - panic("rcu_init_geometry: rcu_capacity[] is too small"); + if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - 1]) { + rcu_fanout_leaf = RCU_FANOUT_LEAF; + WARN_ON(1); + return; + } /* Calculate the number of levels in the tree. */ for (i = 0; nr_cpu_ids > rcu_capacity[i]; i++) { -- cgit v1.1 From 7f21aeef722d598ba350d1834f6e4134c7b5e8de Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 15 Aug 2015 14:49:50 -0700 Subject: rcu: Add online/offline info to stall warning message This commit makes the RCU CPU stall warning message print online/offline indications immediately after a hyphen following the CPU number. A "O" indicates that the global CPU-hotplug system believes that the CPU is online, a "o" that RCU perceived the CPU to be online at the beginning of the current expedited grace period, and an "N" that RCU currently believes that it will perceive the CPU as being online at the beginning of the next expedited grace period, with "." otherwise for all three indications. So for CPU 10, you would normally see "10-OoN:" indicating that everything believes that the CPU is online. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 06116ae..57ed9c1 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1702,8 +1702,12 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) ticks_value = rsp->gpnum - rdp->gpnum; } print_cpu_stall_fast_no_hz(fast_no_hz, cpu); - pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n", - cpu, ticks_value, ticks_title, + pr_err("\t%d-%c%c%c: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n", + cpu, + "O."[!!cpu_online(cpu)], + "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], + "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)], + ticks_value, ticks_title, atomic_read(&rdtp->dynticks) & 0xfff, rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), -- cgit v1.1 From 49f5903b473c5f63f3b57856d1bd4593db0a2eef Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 Sep 2015 00:42:57 -0700 Subject: rcu: Move preemption disabling out of __srcu_read_lock() Currently, __srcu_read_lock() cannot be invoked from restricted environments because it contains calls to preempt_disable() and preempt_enable(), both of which can invoke lockdep, which is a bad idea in some restricted execution modes. This commit therefore moves the preempt_disable() and preempt_enable() from __srcu_read_lock() to srcu_read_lock(). It also inserts the preempt_disable() and preempt_enable() around the call to __srcu_read_lock() in do_exit(). Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/exit.c | 2 ++ kernel/rcu/srcu.c | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index ea95ee1..0e93b63 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -761,7 +761,9 @@ void do_exit(long code) */ flush_ptrace_hw_breakpoint(tsk); + TASKS_RCU(preempt_disable()); TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu)); + TASKS_RCU(preempt_enable()); exit_notify(tsk, group_dead); proc_exit_connector(tsk); #ifdef CONFIG_NUMA diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index 9e61225..a63a1ea 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -298,11 +298,9 @@ int __srcu_read_lock(struct srcu_struct *sp) int idx; idx = READ_ONCE(sp->completed) & 0x1; - preempt_disable(); __this_cpu_inc(sp->per_cpu_ref->c[idx]); smp_mb(); /* B */ /* Avoid leaking the critical section. */ __this_cpu_inc(sp->per_cpu_ref->seq[idx]); - preempt_enable(); return idx; } EXPORT_SYMBOL_GPL(__srcu_read_lock); -- cgit v1.1 From 77f81fe08ebd99d7e0eefde42ddac06a675bc4ad Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 9 Sep 2015 12:09:49 -0700 Subject: rcu: Finish folding ->fqs_state into ->gp_state Commit commit 4cdfc175c25c89ee ("rcu: Move quiescent-state forcing into kthread") started the process of folding the old ->fqs_state into ->gp_state, but did not complete it. This situation does not cause any malfunction, but can result in extremely confusing trace output. This commit completes this task of eliminating ->fqs_state in favor of ->gp_state. The old ->fqs_state was also used to decide when to collect dyntick-idle snapshots. For this purpose, we add a boolean variable into the kthread, which is set on the first call to rcu_gp_fqs() for a given grace period and clear otherwise. Signed-off-by: Petr Mladek Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 18 ++++++++---------- kernel/rcu/tree.h | 14 +++----------- kernel/rcu/tree_trace.c | 2 +- 3 files changed, 12 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 713eb92..4d296b0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -98,7 +98,7 @@ struct rcu_state sname##_state = { \ .level = { &sname##_state.node[0] }, \ .rda = &sname##_data, \ .call = cr, \ - .fqs_state = RCU_GP_IDLE, \ + .gp_state = RCU_GP_IDLE, \ .gpnum = 0UL - 300UL, \ .completed = 0UL - 300UL, \ .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \ @@ -1936,16 +1936,15 @@ static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp) /* * Do one round of quiescent-state forcing. */ -static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) +static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time) { - int fqs_state = fqs_state_in; bool isidle = false; unsigned long maxj; struct rcu_node *rnp = rcu_get_root(rsp); WRITE_ONCE(rsp->gp_activity, jiffies); rsp->n_force_qs++; - if (fqs_state == RCU_SAVE_DYNTICK) { + if (first_time) { /* Collect dyntick-idle snapshots. */ if (is_sysidle_rcu_state(rsp)) { isidle = true; @@ -1954,7 +1953,6 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) force_qs_rnp(rsp, dyntick_save_progress_counter, &isidle, &maxj); rcu_sysidle_report_gp(rsp, isidle, maxj); - fqs_state = RCU_FORCE_QS; } else { /* Handle dyntick-idle and offline CPUs. */ isidle = true; @@ -1968,7 +1966,6 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS); raw_spin_unlock_irq(&rnp->lock); } - return fqs_state; } /* @@ -2032,7 +2029,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) /* Declare grace period done. */ WRITE_ONCE(rsp->completed, rsp->gpnum); trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); - rsp->fqs_state = RCU_GP_IDLE; + rsp->gp_state = RCU_GP_IDLE; rdp = this_cpu_ptr(rsp->rda); /* Advance CBs to reduce false positives below. */ needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp; @@ -2050,7 +2047,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) */ static int __noreturn rcu_gp_kthread(void *arg) { - int fqs_state; + bool first_gp_fqs; int gf; unsigned long j; int ret; @@ -2082,7 +2079,7 @@ static int __noreturn rcu_gp_kthread(void *arg) } /* Handle quiescent-state forcing. */ - fqs_state = RCU_SAVE_DYNTICK; + first_gp_fqs = true; j = jiffies_till_first_fqs; if (j > HZ) { j = HZ; @@ -2110,7 +2107,8 @@ static int __noreturn rcu_gp_kthread(void *arg) trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("fqsstart")); - fqs_state = rcu_gp_fqs(rsp, fqs_state); + rcu_gp_fqs(rsp, first_gp_fqs); + first_gp_fqs = false; trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("fqsend")); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 0c33c82..be6d1e8 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -412,13 +412,6 @@ struct rcu_data { struct rcu_state *rsp; }; -/* Values for fqs_state field in struct rcu_state. */ -#define RCU_GP_IDLE 0 /* No grace period in progress. */ -#define RCU_GP_INIT 1 /* Grace period being initialized. */ -#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ -#define RCU_FORCE_QS 3 /* Need to force quiescent state. */ -#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK - /* Values for nocb_defer_wakeup field in struct rcu_data. */ #define RCU_NOGP_WAKE_NOT 0 #define RCU_NOGP_WAKE 1 @@ -468,9 +461,8 @@ struct rcu_state { /* The following fields are guarded by the root rcu_node's lock. */ - u8 fqs_state ____cacheline_internodealigned_in_smp; - /* Force QS state. */ - u8 boost; /* Subject to priority boost. */ + u8 boost ____cacheline_internodealigned_in_smp; + /* Subject to priority boost. */ unsigned long gpnum; /* Current gp number. */ unsigned long completed; /* # of last completed gp. */ struct task_struct *gp_kthread; /* Task for grace periods. */ @@ -538,7 +530,7 @@ struct rcu_state { #define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ /* Values for rcu_state structure's gp_flags field. */ -#define RCU_GP_WAIT_INIT 0 /* Initial state. */ +#define RCU_GP_IDLE 0 /* Initial state and no GP in progress. */ #define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */ #define RCU_GP_DONE_GPS 2 /* Wait done for grace-period start. */ #define RCU_GP_WAIT_FQS 3 /* Wait for force-quiescent-state time. */ diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 6fc4c5f..1d61f5b 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -268,7 +268,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) gpnum = rsp->gpnum; seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ", ulong2long(rsp->completed), ulong2long(gpnum), - rsp->fqs_state, + rsp->gp_state, (long)(rsp->jiffies_force_qs - jiffies), (int)(jiffies & 0xffff)); seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", -- cgit v1.1 From c34d2f418485a5d710bc002e490148b8ea53f456 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 10 Sep 2015 11:21:28 -0700 Subject: rcu: Correct comment for values of ->gp_state field This commit corrects the comment for the values of the ->gp_state field, which previously incorrectly said that these were for the ->gp_flags field. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index be6d1e8..674ebbc 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -529,7 +529,7 @@ struct rcu_state { #define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */ #define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ -/* Values for rcu_state structure's gp_flags field. */ +/* Values for rcu_state structure's gp_state field. */ #define RCU_GP_IDLE 0 /* Initial state and no GP in progress. */ #define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */ #define RCU_GP_DONE_GPS 2 /* Wait done for grace-period start. */ -- cgit v1.1 From 095777c417db142970adeb776fa0cb10810b8122 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 22 Jul 2015 14:07:27 -0700 Subject: locktorture: Support rtmutex torturing Real time mutexes is one of the few general primitives that we do not have in locktorture. Address this -- a few considerations: o To spice things up, enable competing thread(s) to become rt, such that we can stress different prio boosting paths in the rtmutex code. Introduce a ->task_boost callback, only used by rtmutex-torturer. Tasks will boost/deboost around every 50k (arbitrarily) lock/unlock operations. o Hold times are similar to what we have for other locks: only occasionally having longer hold times (per ~200k ops). So we roughly do two full rt boost+deboosting ops with short hold times. Signed-off-by: Davidlohr Bueso Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/locking/locktorture.c | 114 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 112 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 3224418..e1ca7a2 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -17,12 +17,14 @@ * * Copyright (C) IBM Corporation, 2014 * - * Author: Paul E. McKenney + * Authors: Paul E. McKenney + * Davidlohr Bueso * Based on kernel/rcu/torture.c. */ #include #include #include +#include #include #include #include @@ -91,11 +93,13 @@ struct lock_torture_ops { void (*init)(void); int (*writelock)(void); void (*write_delay)(struct torture_random_state *trsp); + void (*task_boost)(struct torture_random_state *trsp); void (*writeunlock)(void); int (*readlock)(void); void (*read_delay)(struct torture_random_state *trsp); void (*readunlock)(void); - unsigned long flags; + + unsigned long flags; /* for irq spinlocks */ const char *name; }; @@ -139,9 +143,15 @@ static void torture_lock_busted_write_unlock(void) /* BUGGY, do not use in real life!!! */ } +static void torture_boost_dummy(struct torture_random_state *trsp) +{ + /* Only rtmutexes care about priority */ +} + static struct lock_torture_ops lock_busted_ops = { .writelock = torture_lock_busted_write_lock, .write_delay = torture_lock_busted_write_delay, + .task_boost = torture_boost_dummy, .writeunlock = torture_lock_busted_write_unlock, .readlock = NULL, .read_delay = NULL, @@ -185,6 +195,7 @@ static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock) static struct lock_torture_ops spin_lock_ops = { .writelock = torture_spin_lock_write_lock, .write_delay = torture_spin_lock_write_delay, + .task_boost = torture_boost_dummy, .writeunlock = torture_spin_lock_write_unlock, .readlock = NULL, .read_delay = NULL, @@ -211,6 +222,7 @@ __releases(torture_spinlock) static struct lock_torture_ops spin_lock_irq_ops = { .writelock = torture_spin_lock_write_lock_irq, .write_delay = torture_spin_lock_write_delay, + .task_boost = torture_boost_dummy, .writeunlock = torture_lock_spin_write_unlock_irq, .readlock = NULL, .read_delay = NULL, @@ -275,6 +287,7 @@ static void torture_rwlock_read_unlock(void) __releases(torture_rwlock) static struct lock_torture_ops rw_lock_ops = { .writelock = torture_rwlock_write_lock, .write_delay = torture_rwlock_write_delay, + .task_boost = torture_boost_dummy, .writeunlock = torture_rwlock_write_unlock, .readlock = torture_rwlock_read_lock, .read_delay = torture_rwlock_read_delay, @@ -315,6 +328,7 @@ __releases(torture_rwlock) static struct lock_torture_ops rw_lock_irq_ops = { .writelock = torture_rwlock_write_lock_irq, .write_delay = torture_rwlock_write_delay, + .task_boost = torture_boost_dummy, .writeunlock = torture_rwlock_write_unlock_irq, .readlock = torture_rwlock_read_lock_irq, .read_delay = torture_rwlock_read_delay, @@ -354,6 +368,7 @@ static void torture_mutex_unlock(void) __releases(torture_mutex) static struct lock_torture_ops mutex_lock_ops = { .writelock = torture_mutex_lock, .write_delay = torture_mutex_delay, + .task_boost = torture_boost_dummy, .writeunlock = torture_mutex_unlock, .readlock = NULL, .read_delay = NULL, @@ -361,6 +376,90 @@ static struct lock_torture_ops mutex_lock_ops = { .name = "mutex_lock" }; +#ifdef CONFIG_RT_MUTEXES +static DEFINE_RT_MUTEX(torture_rtmutex); + +static int torture_rtmutex_lock(void) __acquires(torture_rtmutex) +{ + rt_mutex_lock(&torture_rtmutex); + return 0; +} + +static void torture_rtmutex_boost(struct torture_random_state *trsp) +{ + int policy; + struct sched_param param; + const unsigned int factor = 50000; /* yes, quite arbitrary */ + + if (!rt_task(current)) { + /* + * (1) Boost priority once every ~50k operations. When the + * task tries to take the lock, the rtmutex it will account + * for the new priority, and do any corresponding pi-dance. + */ + if (!(torture_random(trsp) % + (cxt.nrealwriters_stress * factor))) { + policy = SCHED_FIFO; + param.sched_priority = MAX_RT_PRIO - 1; + } else /* common case, do nothing */ + return; + } else { + /* + * The task will remain boosted for another ~500k operations, + * then restored back to its original prio, and so forth. + * + * When @trsp is nil, we want to force-reset the task for + * stopping the kthread. + */ + if (!trsp || !(torture_random(trsp) % + (cxt.nrealwriters_stress * factor * 2))) { + policy = SCHED_NORMAL; + param.sched_priority = 0; + } else /* common case, do nothing */ + return; + } + + sched_setscheduler_nocheck(current, policy, ¶m); +} + +static void torture_rtmutex_delay(struct torture_random_state *trsp) +{ + const unsigned long shortdelay_us = 2; + const unsigned long longdelay_ms = 100; + + /* + * We want a short delay mostly to emulate likely code, and + * we want a long delay occasionally to force massive contention. + */ + if (!(torture_random(trsp) % + (cxt.nrealwriters_stress * 2000 * longdelay_ms))) + mdelay(longdelay_ms); + if (!(torture_random(trsp) % + (cxt.nrealwriters_stress * 2 * shortdelay_us))) + udelay(shortdelay_us); +#ifdef CONFIG_PREEMPT + if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) + preempt_schedule(); /* Allow test to be preempted. */ +#endif +} + +static void torture_rtmutex_unlock(void) __releases(torture_rtmutex) +{ + rt_mutex_unlock(&torture_rtmutex); +} + +static struct lock_torture_ops rtmutex_lock_ops = { + .writelock = torture_rtmutex_lock, + .write_delay = torture_rtmutex_delay, + .task_boost = torture_rtmutex_boost, + .writeunlock = torture_rtmutex_unlock, + .readlock = NULL, + .read_delay = NULL, + .readunlock = NULL, + .name = "rtmutex_lock" +}; +#endif + static DECLARE_RWSEM(torture_rwsem); static int torture_rwsem_down_write(void) __acquires(torture_rwsem) { @@ -419,6 +518,7 @@ static void torture_rwsem_up_read(void) __releases(torture_rwsem) static struct lock_torture_ops rwsem_lock_ops = { .writelock = torture_rwsem_down_write, .write_delay = torture_rwsem_write_delay, + .task_boost = torture_boost_dummy, .writeunlock = torture_rwsem_up_write, .readlock = torture_rwsem_down_read, .read_delay = torture_rwsem_read_delay, @@ -442,6 +542,7 @@ static int lock_torture_writer(void *arg) if ((torture_random(&rand) & 0xfffff) == 0) schedule_timeout_uninterruptible(1); + cxt.cur_ops->task_boost(&rand); cxt.cur_ops->writelock(); if (WARN_ON_ONCE(lock_is_write_held)) lwsp->n_lock_fail++; @@ -456,6 +557,8 @@ static int lock_torture_writer(void *arg) stutter_wait("lock_torture_writer"); } while (!torture_must_stop()); + + cxt.cur_ops->task_boost(NULL); /* reset prio */ torture_kthread_stopping("lock_torture_writer"); return 0; } @@ -642,6 +745,9 @@ static int __init lock_torture_init(void) &spin_lock_ops, &spin_lock_irq_ops, &rw_lock_ops, &rw_lock_irq_ops, &mutex_lock_ops, +#ifdef CONFIG_RT_MUTEXES + &rtmutex_lock_ops, +#endif &rwsem_lock_ops, }; @@ -676,6 +782,10 @@ static int __init lock_torture_init(void) if (strncmp(torture_type, "mutex", 5) == 0) cxt.debug_lock = true; #endif +#ifdef CONFIG_DEBUG_RT_MUTEXES + if (strncmp(torture_type, "rtmutex", 7) == 0) + cxt.debug_lock = true; +#endif #ifdef CONFIG_DEBUG_SPINLOCK if ((strncmp(torture_type, "spin", 4) == 0) || (strncmp(torture_type, "rw_lock", 7) == 0)) -- cgit v1.1 From 302707fd7cd341a23dbc448a335d432ad0069c20 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 31 Aug 2015 20:21:59 -0700 Subject: locking/percpu-rwsem: Export symbols for locktorture This commit exports percpu_down_read(), percpu_down_write(), __percpu_init_rwsem(), percpu_up_read(), and percpu_up_write() to allow locktorture to test them when built as a module. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/locking/percpu-rwsem.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index f325672..e2621fb 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -22,6 +22,7 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *brw, init_waitqueue_head(&brw->write_waitq); return 0; } +EXPORT_SYMBOL_GPL(__percpu_init_rwsem); void percpu_free_rwsem(struct percpu_rw_semaphore *brw) { @@ -87,6 +88,7 @@ void percpu_down_read(struct percpu_rw_semaphore *brw) /* avoid up_read()->rwsem_release() */ __up_read(&brw->rw_sem); } +EXPORT_SYMBOL_GPL(percpu_down_read); int percpu_down_read_trylock(struct percpu_rw_semaphore *brw) { @@ -112,6 +114,7 @@ void percpu_up_read(struct percpu_rw_semaphore *brw) if (atomic_dec_and_test(&brw->slow_read_ctr)) wake_up_all(&brw->write_waitq); } +EXPORT_SYMBOL_GPL(percpu_up_read); static int clear_fast_ctr(struct percpu_rw_semaphore *brw) { @@ -163,6 +166,7 @@ void percpu_down_write(struct percpu_rw_semaphore *brw) /* wait for all readers to complete their percpu_up_read() */ wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr)); } +EXPORT_SYMBOL_GPL(percpu_down_write); void percpu_up_write(struct percpu_rw_semaphore *brw) { @@ -176,3 +180,4 @@ void percpu_up_write(struct percpu_rw_semaphore *brw) /* the last writer unblocks update_fast_ctr() */ atomic_dec(&brw->write_ctr); } +EXPORT_SYMBOL_GPL(percpu_up_write); -- cgit v1.1 From 617783dd99704331e22636388c932450e02ee636 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 29 Aug 2015 14:46:29 -0700 Subject: locktorture: Add torture tests for percpu_rwsem This commit adds percpu_rwsem tests based on the earlier rwsem tests. Signed-off-by: Paul E. McKenney Cc: Oleg Nesterov Cc: Davidlohr Bueso Reviewed-by: Josh Triplett --- kernel/locking/locktorture.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index e1ca7a2..8545e12 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -36,6 +36,7 @@ #include #include #include +#include #include MODULE_LICENSE("GPL"); @@ -526,6 +527,48 @@ static struct lock_torture_ops rwsem_lock_ops = { .name = "rwsem_lock" }; +#include +static struct percpu_rw_semaphore pcpu_rwsem; + +void torture_percpu_rwsem_init(void) +{ + BUG_ON(percpu_init_rwsem(&pcpu_rwsem)); +} + +static int torture_percpu_rwsem_down_write(void) __acquires(pcpu_rwsem) +{ + percpu_down_write(&pcpu_rwsem); + return 0; +} + +static void torture_percpu_rwsem_up_write(void) __releases(pcpu_rwsem) +{ + percpu_up_write(&pcpu_rwsem); +} + +static int torture_percpu_rwsem_down_read(void) __acquires(pcpu_rwsem) +{ + percpu_down_read(&pcpu_rwsem); + return 0; +} + +static void torture_percpu_rwsem_up_read(void) __releases(pcpu_rwsem) +{ + percpu_up_read(&pcpu_rwsem); +} + +static struct lock_torture_ops percpu_rwsem_lock_ops = { + .init = torture_percpu_rwsem_init, + .writelock = torture_percpu_rwsem_down_write, + .write_delay = torture_rwsem_write_delay, + .task_boost = torture_boost_dummy, + .writeunlock = torture_percpu_rwsem_up_write, + .readlock = torture_percpu_rwsem_down_read, + .read_delay = torture_rwsem_read_delay, + .readunlock = torture_percpu_rwsem_up_read, + .name = "percpu_rwsem_lock" +}; + /* * Lock torture writer kthread. Repeatedly acquires and releases * the lock, checking for duplicate acquisitions. @@ -749,6 +792,7 @@ static int __init lock_torture_init(void) &rtmutex_lock_ops, #endif &rwsem_lock_ops, + &percpu_rwsem_lock_ops, }; if (!torture_init_begin(torture_type, verbose, &torture_runnable)) -- cgit v1.1 From 3836f5337f74fedc15981688c3c31dbf4293ae84 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 30 Aug 2015 03:29:58 -0700 Subject: torture: Consolidate cond_resched_rcu_qs() into stutter_wait() This commit moves cond_resched_rcu_qs() into stutter_wait(), saving a line and also avoiding RCU CPU stall warnings from all torture loops containing a stutter_wait(). Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 2 -- kernel/torture.c | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 7719295..8a65b7d 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -768,7 +768,6 @@ static int rcu_torture_boost(void *arg) } call_rcu_time = jiffies; } - cond_resched_rcu_qs(); stutter_wait("rcu_torture_boost"); if (torture_must_stop()) goto checkwait; @@ -1208,7 +1207,6 @@ rcu_torture_reader(void *arg) __this_cpu_inc(rcu_torture_batch[completed]); preempt_enable(); cur_ops->readunlock(idx); - cond_resched_rcu_qs(); stutter_wait("rcu_torture_reader"); } while (!torture_must_stop()); if (irqreader && cur_ops->irq_capable) { diff --git a/kernel/torture.c b/kernel/torture.c index 3e48406..44aa462 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -523,6 +523,7 @@ static int stutter; */ void stutter_wait(const char *title) { + cond_resched_rcu_qs(); while (READ_ONCE(stutter_pause_test) || (torture_runnable && !READ_ONCE(*torture_runnable))) { if (stutter_pause_test) -- cgit v1.1 From cc44ca848f5e517aeca9f5eabbe13609a3f71450 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 21 Aug 2015 19:42:44 +0200 Subject: rcu: Create rcu_sync infrastructure The rcu_sync infrastructure can be thought of as infrastructure to be used to implement reader-writer primitives having extremely lightweight readers during times when there are no writers. The first use is in the percpu_rwsem used by the VFS subsystem. This infrastructure is functionally equivalent to struct rcu_sync_struct { atomic_t counter; }; /* Check possibility of fast-path read-side operations. */ static inline bool rcu_sync_is_idle(struct rcu_sync_struct *rss) { return atomic_read(&rss->counter) == 0; } /* Tell readers to use slowpaths. */ static inline void rcu_sync_enter(struct rcu_sync_struct *rss) { atomic_inc(&rss->counter); synchronize_sched(); } /* Allow readers to once again use fastpaths. */ static inline void rcu_sync_exit(struct rcu_sync_struct *rss) { synchronize_sched(); atomic_dec(&rss->counter); } The main difference is that it records the state and only calls synchronize_sched() if required. At least some of the calls to synchronize_sched() will be optimized away when rcu_sync_enter() and rcu_sync_exit() are invoked repeatedly in quick succession. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/Makefile | 2 +- kernel/rcu/sync.c | 175 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 176 insertions(+), 1 deletion(-) create mode 100644 kernel/rcu/sync.c (limited to 'kernel') diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 50a8084..61a1656 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -1,4 +1,4 @@ -obj-y += update.o +obj-y += update.o sync.o obj-$(CONFIG_SRCU) += srcu.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_TREE_RCU) += tree.o diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c new file mode 100644 index 0000000..0a11df4 --- /dev/null +++ b/kernel/rcu/sync.c @@ -0,0 +1,175 @@ +/* + * RCU-based infrastructure for lightweight reader-writer locking + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (c) 2015, Red Hat, Inc. + * + * Author: Oleg Nesterov + */ + +#include +#include + +enum { GP_IDLE = 0, GP_PENDING, GP_PASSED }; +enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY }; + +#define rss_lock gp_wait.lock + +/** + * rcu_sync_init() - Initialize an rcu_sync structure + * @rsp: Pointer to rcu_sync structure to be initialized + * @type: Flavor of RCU with which to synchronize rcu_sync structure + */ +void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type) +{ + memset(rsp, 0, sizeof(*rsp)); + init_waitqueue_head(&rsp->gp_wait); + + switch (type) { + case RCU_SYNC: + rsp->sync = synchronize_rcu; + rsp->call = call_rcu; + break; + + case RCU_SCHED_SYNC: + rsp->sync = synchronize_sched; + rsp->call = call_rcu_sched; + break; + + case RCU_BH_SYNC: + rsp->sync = synchronize_rcu_bh; + rsp->call = call_rcu_bh; + break; + } +} + +/** + * rcu_sync_enter() - Force readers onto slowpath + * @rsp: Pointer to rcu_sync structure to use for synchronization + * + * This function is used by updaters who need readers to make use of + * a slowpath during the update. After this function returns, all + * subsequent calls to rcu_sync_is_idle() will return false, which + * tells readers to stay off their fastpaths. A later call to + * rcu_sync_exit() re-enables reader slowpaths. + * + * When called in isolation, rcu_sync_enter() must wait for a grace + * period, however, closely spaced calls to rcu_sync_enter() can + * optimize away the grace-period wait via a state machine implemented + * by rcu_sync_enter(), rcu_sync_exit(), and rcu_sync_func(). + */ +void rcu_sync_enter(struct rcu_sync *rsp) +{ + bool need_wait, need_sync; + + spin_lock_irq(&rsp->rss_lock); + need_wait = rsp->gp_count++; + need_sync = rsp->gp_state == GP_IDLE; + if (need_sync) + rsp->gp_state = GP_PENDING; + spin_unlock_irq(&rsp->rss_lock); + + BUG_ON(need_wait && need_sync); + + if (need_sync) { + rsp->sync(); + rsp->gp_state = GP_PASSED; + wake_up_all(&rsp->gp_wait); + } else if (need_wait) { + wait_event(rsp->gp_wait, rsp->gp_state == GP_PASSED); + } else { + /* + * Possible when there's a pending CB from a rcu_sync_exit(). + * Nobody has yet been allowed the 'fast' path and thus we can + * avoid doing any sync(). The callback will get 'dropped'. + */ + BUG_ON(rsp->gp_state != GP_PASSED); + } +} + +/** + * rcu_sync_func() - Callback function managing reader access to fastpath + * @rsp: Pointer to rcu_sync structure to use for synchronization + * + * This function is passed to one of the call_rcu() functions by + * rcu_sync_exit(), so that it is invoked after a grace period following the + * that invocation of rcu_sync_exit(). It takes action based on events that + * have taken place in the meantime, so that closely spaced rcu_sync_enter() + * and rcu_sync_exit() pairs need not wait for a grace period. + * + * If another rcu_sync_enter() is invoked before the grace period + * ended, reset state to allow the next rcu_sync_exit() to let the + * readers back onto their fastpaths (after a grace period). If both + * another rcu_sync_enter() and its matching rcu_sync_exit() are invoked + * before the grace period ended, re-invoke call_rcu() on behalf of that + * rcu_sync_exit(). Otherwise, set all state back to idle so that readers + * can again use their fastpaths. + */ +static void rcu_sync_func(struct rcu_head *rcu) +{ + struct rcu_sync *rsp = container_of(rcu, struct rcu_sync, cb_head); + unsigned long flags; + + BUG_ON(rsp->gp_state != GP_PASSED); + BUG_ON(rsp->cb_state == CB_IDLE); + + spin_lock_irqsave(&rsp->rss_lock, flags); + if (rsp->gp_count) { + /* + * A new rcu_sync_begin() has happened; drop the callback. + */ + rsp->cb_state = CB_IDLE; + } else if (rsp->cb_state == CB_REPLAY) { + /* + * A new rcu_sync_exit() has happened; requeue the callback + * to catch a later GP. + */ + rsp->cb_state = CB_PENDING; + rsp->call(&rsp->cb_head, rcu_sync_func); + } else { + /* + * We're at least a GP after rcu_sync_exit(); eveybody will now + * have observed the write side critical section. Let 'em rip!. + */ + rsp->cb_state = CB_IDLE; + rsp->gp_state = GP_IDLE; + } + spin_unlock_irqrestore(&rsp->rss_lock, flags); +} + +/** + * rcu_sync_exit() - Allow readers back onto fast patch after grace period + * @rsp: Pointer to rcu_sync structure to use for synchronization + * + * This function is used by updaters who have completed, and can therefore + * now allow readers to make use of their fastpaths after a grace period + * has elapsed. After this grace period has completed, all subsequent + * calls to rcu_sync_is_idle() will return true, which tells readers that + * they can once again use their fastpaths. + */ +void rcu_sync_exit(struct rcu_sync *rsp) +{ + spin_lock_irq(&rsp->rss_lock); + if (!--rsp->gp_count) { + if (rsp->cb_state == CB_IDLE) { + rsp->cb_state = CB_PENDING; + rsp->call(&rsp->cb_head, rcu_sync_func); + } else if (rsp->cb_state == CB_PENDING) { + rsp->cb_state = CB_REPLAY; + } + } + spin_unlock_irq(&rsp->rss_lock); +} -- cgit v1.1 From 82e8c565be8a72957570d7da8dd9b441db7bb648 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 21 Aug 2015 19:42:47 +0200 Subject: rcu_sync: Simplify rcu_sync using new rcu_sync_ops structure This commit adds the new struct rcu_sync_ops which holds sync/call methods, and turns the function pointers in rcu_sync_struct into an array of struct rcu_sync_ops. This simplifies the "init" helpers by collapsing a switch statement and explicit multiple definitions into a simple assignment and a helper macro, respectively. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/sync.c | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index 0a11df4..5a9aa4c 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -23,6 +23,24 @@ #include #include +static const struct { + void (*sync)(void); + void (*call)(struct rcu_head *, void (*)(struct rcu_head *)); +} gp_ops[] = { + [RCU_SYNC] = { + .sync = synchronize_rcu, + .call = call_rcu, + }, + [RCU_SCHED_SYNC] = { + .sync = synchronize_sched, + .call = call_rcu_sched, + }, + [RCU_BH_SYNC] = { + .sync = synchronize_rcu_bh, + .call = call_rcu_bh, + }, +}; + enum { GP_IDLE = 0, GP_PENDING, GP_PASSED }; enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY }; @@ -37,23 +55,7 @@ void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type) { memset(rsp, 0, sizeof(*rsp)); init_waitqueue_head(&rsp->gp_wait); - - switch (type) { - case RCU_SYNC: - rsp->sync = synchronize_rcu; - rsp->call = call_rcu; - break; - - case RCU_SCHED_SYNC: - rsp->sync = synchronize_sched; - rsp->call = call_rcu_sched; - break; - - case RCU_BH_SYNC: - rsp->sync = synchronize_rcu_bh; - rsp->call = call_rcu_bh; - break; - } + rsp->gp_type = type; } /** @@ -85,7 +87,7 @@ void rcu_sync_enter(struct rcu_sync *rsp) BUG_ON(need_wait && need_sync); if (need_sync) { - rsp->sync(); + gp_ops[rsp->gp_type].sync(); rsp->gp_state = GP_PASSED; wake_up_all(&rsp->gp_wait); } else if (need_wait) { @@ -138,7 +140,7 @@ static void rcu_sync_func(struct rcu_head *rcu) * to catch a later GP. */ rsp->cb_state = CB_PENDING; - rsp->call(&rsp->cb_head, rcu_sync_func); + gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func); } else { /* * We're at least a GP after rcu_sync_exit(); eveybody will now @@ -166,7 +168,7 @@ void rcu_sync_exit(struct rcu_sync *rsp) if (!--rsp->gp_count) { if (rsp->cb_state == CB_IDLE) { rsp->cb_state = CB_PENDING; - rsp->call(&rsp->cb_head, rcu_sync_func); + gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func); } else if (rsp->cb_state == CB_PENDING) { rsp->cb_state = CB_REPLAY; } -- cgit v1.1 From 3a518b76af7bb411efe6dd090fbf098e29accb2e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 21 Aug 2015 19:42:50 +0200 Subject: rcu_sync: Add CONFIG_PROVE_RCU checks This commit validates that the caller of rcu_sync_is_idle() holds the corresponding type of RCU read-side lock, but only in kernels built with CONFIG_PROVE_RCU=y. This validation is carried out via a new rcu_sync_ops->held() method that is checked within rcu_sync_is_idle(). Note that although this does add code to the fast path, it only does so in kernels built with CONFIG_PROVE_RCU=y. Suggested-by: "Paul E. McKenney" Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/sync.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index 5a9aa4c..01c9807 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -23,21 +23,33 @@ #include #include +#ifdef CONFIG_PROVE_RCU +#define __INIT_HELD(func) .held = func, +#else +#define __INIT_HELD(func) +#endif + static const struct { void (*sync)(void); void (*call)(struct rcu_head *, void (*)(struct rcu_head *)); +#ifdef CONFIG_PROVE_RCU + int (*held)(void); +#endif } gp_ops[] = { [RCU_SYNC] = { .sync = synchronize_rcu, .call = call_rcu, + __INIT_HELD(rcu_read_lock_held) }, [RCU_SCHED_SYNC] = { .sync = synchronize_sched, .call = call_rcu_sched, + __INIT_HELD(rcu_read_lock_sched_held) }, [RCU_BH_SYNC] = { .sync = synchronize_rcu_bh, .call = call_rcu_bh, + __INIT_HELD(rcu_read_lock_bh_held) }, }; @@ -46,6 +58,14 @@ enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY }; #define rss_lock gp_wait.lock +#ifdef CONFIG_PROVE_RCU +bool __rcu_sync_is_idle(struct rcu_sync *rsp) +{ + WARN_ON(!gp_ops[rsp->gp_type].held()); + return rsp->gp_state == GP_IDLE; +} +#endif + /** * rcu_sync_init() - Initialize an rcu_sync structure * @rsp: Pointer to rcu_sync structure to be initialized -- cgit v1.1 From 07899a6e5f56136028c44a57ad0451e797365ac3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 21 Aug 2015 19:42:52 +0200 Subject: rcu_sync: Introduce rcu_sync_dtor() This commit allows rcu_sync structures to be safely deallocated, The trick is to add a new ->wait field to the gp_ops array. This field is a pointer to the rcu_barrier() function corresponding to the flavor of RCU in question. This allows a new rcu_sync_dtor() to wait for any outstanding callbacks before freeing the rcu_sync structure. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/sync.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index 01c9807..1e353f0 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -32,6 +32,7 @@ static const struct { void (*sync)(void); void (*call)(struct rcu_head *, void (*)(struct rcu_head *)); + void (*wait)(void); #ifdef CONFIG_PROVE_RCU int (*held)(void); #endif @@ -39,16 +40,19 @@ static const struct { [RCU_SYNC] = { .sync = synchronize_rcu, .call = call_rcu, + .wait = rcu_barrier, __INIT_HELD(rcu_read_lock_held) }, [RCU_SCHED_SYNC] = { .sync = synchronize_sched, .call = call_rcu_sched, + .wait = rcu_barrier_sched, __INIT_HELD(rcu_read_lock_sched_held) }, [RCU_BH_SYNC] = { .sync = synchronize_rcu_bh, .call = call_rcu_bh, + .wait = rcu_barrier_bh, __INIT_HELD(rcu_read_lock_bh_held) }, }; @@ -195,3 +199,25 @@ void rcu_sync_exit(struct rcu_sync *rsp) } spin_unlock_irq(&rsp->rss_lock); } + +/** + * rcu_sync_dtor() - Clean up an rcu_sync structure + * @rsp: Pointer to rcu_sync structure to be cleaned up + */ +void rcu_sync_dtor(struct rcu_sync *rsp) +{ + int cb_state; + + BUG_ON(rsp->gp_count); + + spin_lock_irq(&rsp->rss_lock); + if (rsp->cb_state == CB_REPLAY) + rsp->cb_state = CB_PENDING; + cb_state = rsp->cb_state; + spin_unlock_irq(&rsp->rss_lock); + + if (cb_state != CB_IDLE) { + gp_ops[rsp->gp_type].wait(); + BUG_ON(rsp->cb_state != CB_IDLE); + } +} -- cgit v1.1 From 95b19f684c61ffc9b039e02c5d1113c2d8cd7105 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 21 Aug 2015 19:42:55 +0200 Subject: locking/percpu-rwsem: Make percpu_free_rwsem() after kzalloc() safe This is the temporary ugly hack which will be reverted later. We only need it to ensure that the next patch will not break "change sb_writers to use percpu_rw_semaphore" patches routed via the VFS tree. The alloc_super()->destroy_super() error path assumes that it is safe to call percpu_free_rwsem() after kzalloc() without percpu_init_rwsem(), so let's not disappoint it. Signed-off-by: Oleg Nesterov Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/locking/percpu-rwsem.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index e2621fb..9529a30 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -26,6 +26,13 @@ EXPORT_SYMBOL_GPL(__percpu_init_rwsem); void percpu_free_rwsem(struct percpu_rw_semaphore *brw) { + /* + * XXX: temporary kludge. The error path in alloc_super() + * assumes that percpu_free_rwsem() is safe after kzalloc(). + */ + if (!brw->fast_read_ctr) + return; + free_percpu(brw->fast_read_ctr); brw->fast_read_ctr = NULL; /* catch use after free bugs */ } -- cgit v1.1 From 001dac627ff37433d5528ffb0d897cd19c2b1e43 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 21 Aug 2015 19:42:57 +0200 Subject: locking/percpu-rwsem: Make use of the rcu_sync infrastructure Currently down_write/up_write calls synchronize_sched_expedited() twice, which is evil. Change this code to rely on rcu-sync primitives. This avoids the _expedited "big hammer", and this can be faster in the contended case or even in the case when a single thread does down_write/up_write in a loop. Of course, a single down_write() will take more time, but otoh it will be much more friendly to the whole system. To simplify the review this patch doesn't update the comments, fixed by the next change. Signed-off-by: Oleg Nesterov Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/locking/percpu-rwsem.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 9529a30..183a711 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -17,7 +17,7 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *brw, /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ __init_rwsem(&brw->rw_sem, name, rwsem_key); - atomic_set(&brw->write_ctr, 0); + rcu_sync_init(&brw->rss, RCU_SCHED_SYNC); atomic_set(&brw->slow_read_ctr, 0); init_waitqueue_head(&brw->write_waitq); return 0; @@ -33,6 +33,7 @@ void percpu_free_rwsem(struct percpu_rw_semaphore *brw) if (!brw->fast_read_ctr) return; + rcu_sync_dtor(&brw->rss); free_percpu(brw->fast_read_ctr); brw->fast_read_ctr = NULL; /* catch use after free bugs */ } @@ -62,13 +63,12 @@ void percpu_free_rwsem(struct percpu_rw_semaphore *brw) */ static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val) { - bool success = false; + bool success; preempt_disable(); - if (likely(!atomic_read(&brw->write_ctr))) { + success = rcu_sync_is_idle(&brw->rss); + if (likely(success)) __this_cpu_add(*brw->fast_read_ctr, val); - success = true; - } preempt_enable(); return success; @@ -149,8 +149,6 @@ static int clear_fast_ctr(struct percpu_rw_semaphore *brw) */ void percpu_down_write(struct percpu_rw_semaphore *brw) { - /* tell update_fast_ctr() there is a pending writer */ - atomic_inc(&brw->write_ctr); /* * 1. Ensures that write_ctr != 0 is visible to any down_read/up_read * so that update_fast_ctr() can't succeed. @@ -162,7 +160,7 @@ void percpu_down_write(struct percpu_rw_semaphore *brw) * fast-path, it executes a full memory barrier before we return. * See R_W case in the comment above update_fast_ctr(). */ - synchronize_sched_expedited(); + rcu_sync_enter(&brw->rss); /* exclude other writers, and block the new readers completely */ down_write(&brw->rw_sem); @@ -183,8 +181,6 @@ void percpu_up_write(struct percpu_rw_semaphore *brw) * Insert the barrier before the next fast-path in down_read, * see W_R case in the comment above update_fast_ctr(). */ - synchronize_sched_expedited(); - /* the last writer unblocks update_fast_ctr() */ - atomic_dec(&brw->write_ctr); + rcu_sync_exit(&brw->rss); } EXPORT_SYMBOL_GPL(percpu_up_write); -- cgit v1.1 From f324a76324c97e81a6ba66a8efac20cdbffd759e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 21 Aug 2015 19:43:00 +0200 Subject: locking/percpu-rwsem: Fix the comments outdated by rcu_sync Update the comments broken by the previous change. Signed-off-by: Oleg Nesterov Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/locking/percpu-rwsem.c | 50 ++++++++++--------------------------------- 1 file changed, 11 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 183a711..02a726d 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -39,27 +39,12 @@ void percpu_free_rwsem(struct percpu_rw_semaphore *brw) } /* - * This is the fast-path for down_read/up_read, it only needs to ensure - * there is no pending writer (atomic_read(write_ctr) == 0) and inc/dec the - * fast per-cpu counter. The writer uses synchronize_sched_expedited() to - * serialize with the preempt-disabled section below. - * - * The nontrivial part is that we should guarantee acquire/release semantics - * in case when - * - * R_W: down_write() comes after up_read(), the writer should see all - * changes done by the reader - * or - * W_R: down_read() comes after up_write(), the reader should see all - * changes done by the writer + * This is the fast-path for down_read/up_read. If it succeeds we rely + * on the barriers provided by rcu_sync_enter/exit; see the comments in + * percpu_down_write() and percpu_up_write(). * * If this helper fails the callers rely on the normal rw_semaphore and * atomic_dec_and_test(), so in this case we have the necessary barriers. - * - * But if it succeeds we do not have any barriers, atomic_read(write_ctr) or - * __this_cpu_add() below can be reordered with any LOAD/STORE done by the - * reader inside the critical section. See the comments in down_write and - * up_write below. */ static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val) { @@ -136,29 +121,15 @@ static int clear_fast_ctr(struct percpu_rw_semaphore *brw) return sum; } -/* - * A writer increments ->write_ctr to force the readers to switch to the - * slow mode, note the atomic_read() check in update_fast_ctr(). - * - * After that the readers can only inc/dec the slow ->slow_read_ctr counter, - * ->fast_read_ctr is stable. Once the writer moves its sum into the slow - * counter it represents the number of active readers. - * - * Finally the writer takes ->rw_sem for writing and blocks the new readers, - * then waits until the slow counter becomes zero. - */ void percpu_down_write(struct percpu_rw_semaphore *brw) { /* - * 1. Ensures that write_ctr != 0 is visible to any down_read/up_read - * so that update_fast_ctr() can't succeed. - * - * 2. Ensures we see the result of every previous this_cpu_add() in - * update_fast_ctr(). + * Make rcu_sync_is_idle() == F and thus disable the fast-path in + * percpu_down_read() and percpu_up_read(), and wait for gp pass. * - * 3. Ensures that if any reader has exited its critical section via - * fast-path, it executes a full memory barrier before we return. - * See R_W case in the comment above update_fast_ctr(). + * The latter synchronises us with the preceding readers which used + * the fast-past, so we can not miss the result of __this_cpu_add() + * or anything else inside their criticial sections. */ rcu_sync_enter(&brw->rss); @@ -178,8 +149,9 @@ void percpu_up_write(struct percpu_rw_semaphore *brw) /* release the lock, but the readers can't use the fast-path */ up_write(&brw->rw_sem); /* - * Insert the barrier before the next fast-path in down_read, - * see W_R case in the comment above update_fast_ctr(). + * Enable the fast-path in percpu_down_read() and percpu_up_read() + * but only after another gp pass; this adds the necessary barrier + * to ensure the reader can't miss the changes done by us. */ rcu_sync_exit(&brw->rss); } -- cgit v1.1 From cc5f730b41506d37a5c2826b2e801d0a59853d11 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 21 Aug 2015 19:43:03 +0200 Subject: locking/percpu-rwsem: Clean up the lockdep annotations in percpu_down_read() Based on Peter Zijlstra's earlier patch. Change percpu_down_read() to use __down_read(), this way we can do rwsem_acquire_read() unconditionally at the start to make this code more symmetric and clean. Originally-From: Peter Zijlstra Signed-off-by: Oleg Nesterov Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/locking/percpu-rwsem.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 02a726d..f231e0b 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -70,14 +70,14 @@ static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val) void percpu_down_read(struct percpu_rw_semaphore *brw) { might_sleep(); - if (likely(update_fast_ctr(brw, +1))) { - rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_); + rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_); + + if (likely(update_fast_ctr(brw, +1))) return; - } - down_read(&brw->rw_sem); + /* Avoid rwsem_acquire_read() and rwsem_release() */ + __down_read(&brw->rw_sem); atomic_inc(&brw->slow_read_ctr); - /* avoid up_read()->rwsem_release() */ __up_read(&brw->rw_sem); } EXPORT_SYMBOL_GPL(percpu_down_read); -- cgit v1.1 From 4bace7344d6dbd7a1b0b801abf24ea9878064317 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 11 Sep 2015 17:59:18 +0200 Subject: rcu_sync: Cleanup the CONFIG_PROVE_RCU checks 1. Rename __rcu_sync_is_idle() to rcu_sync_lockdep_assert() and change it to use rcu_lockdep_assert(). 2. Change rcu_sync_is_idle() to return rsp->gp_state == GP_IDLE unconditonally, this way we can remove the same check from rcu_sync_lockdep_assert() and clearly isolate the debugging code. Note: rcu_sync_enter()->wait_event(gp_state == GP_PASSED) needs another CONFIG_PROVE_RCU check, the same as is done in ->sync(); but this needs some simple preparations in the core RCU code to avoid the code duplication. Signed-off-by: Oleg Nesterov Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/sync.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index 1e353f0..be922c9 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -63,10 +63,10 @@ enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY }; #define rss_lock gp_wait.lock #ifdef CONFIG_PROVE_RCU -bool __rcu_sync_is_idle(struct rcu_sync *rsp) +void rcu_sync_lockdep_assert(struct rcu_sync *rsp) { - WARN_ON(!gp_ops[rsp->gp_type].held()); - return rsp->gp_state == GP_IDLE; + RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(), + "suspicious rcu_sync_is_idle() usage"); } #endif -- cgit v1.1 From 889d487a26de4bcd1a0a668754bcbce893969edf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 24 Aug 2015 11:37:58 -0700 Subject: rcutorture: Fix module unwind when bad torture_type specified The rcutorture module has a list of torture types, and specifying a type not on this list is supposed to cleanly fail the module load. Unfortunately, the "fail" happens without the "cleanly". This commit therefore adds the needed clean-up after an incorrect torture_type. Reported-by: David Miller Signed-off-by: Paul E. McKenney Acked-by: David Miller Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 7719295..b74b564 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1742,15 +1742,15 @@ rcu_torture_init(void) for (i = 0; i < ARRAY_SIZE(torture_ops); i++) pr_alert(" %s", torture_ops[i]->name); pr_alert("\n"); - torture_init_end(); - return -EINVAL; + firsterr = -EINVAL; + goto unwind; } if (cur_ops->fqs == NULL && fqs_duration != 0) { pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); fqs_duration = 0; } if (cur_ops->init) - cur_ops->init(); /* no "goto unwind" prior to this point!!! */ + cur_ops->init(); if (nreaders >= 0) { nrealreaders = nreaders; -- cgit v1.1 From 4f441a258f7badf752b3d9b04b675869ca4e751c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 24 Aug 2015 13:13:51 -0700 Subject: rcutorture: Fix unused-function warning for torturing_tasks() The torturing_tasks() function is used only in kernels built with CONFIG_PROVE_RCU=y, so the second definition can result in unused-function compiler warnings. This commit adds __maybe_unused to suppress these warnings. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index b74b564..009b62c 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -695,7 +695,7 @@ static bool __maybe_unused torturing_tasks(void) #define RCUTORTURE_TASKS_OPS -static bool torturing_tasks(void) +static bool __maybe_unused torturing_tasks(void) { return false; } -- cgit v1.1 From a36a99618b1adb2d6ca0b7e08e3a656a04e477fe Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 30 Aug 2015 20:01:48 -0700 Subject: locktorture: Fix module unwind when bad torture_type specified The locktorture module has a list of torture types, and specifying a type not on this list is supposed to cleanly fail the module load. Unfortunately, the "fail" happens without the "cleanly". This commit therefore adds the needed clean-up after an incorrect torture_type. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/locking/locktorture.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 3224418..820852f 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -661,11 +661,11 @@ static int __init lock_torture_init(void) for (i = 0; i < ARRAY_SIZE(torture_ops); i++) pr_alert(" %s", torture_ops[i]->name); pr_alert("\n"); - torture_init_end(); - return -EINVAL; + firsterr = -EINVAL; + goto unwind; } if (cxt.cur_ops->init) - cxt.cur_ops->init(); /* no "goto unwind" prior to this point!!! */ + cxt.cur_ops->init(); if (nwriters_stress >= 0) cxt.nrealwriters_stress = nwriters_stress; -- cgit v1.1 From 6587a23b6b9bdb47205ec96c703e5bf8a2d39701 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 6 Aug 2015 16:50:39 -0700 Subject: rcu: Switch synchronize_sched_expedited() to IPI This commit switches synchronize_sched_expedited() from stop_one_cpu_nowait() to smp_call_function_single(), thus moving from an IPI and a pair of context switches to an IPI and a single pass through the scheduler. Of course, if the scheduler actually does decide to switch to a different task, there will still be a pair of context switches, but there would likely have been a pair of context switches anyway, just a bit later. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 32 ++++++++++++++++++++------------ kernel/rcu/tree.h | 3 --- 2 files changed, 20 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3e2875b..a421689 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -161,6 +161,8 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf); static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); static void invoke_rcu_core(void); static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); +static void rcu_report_exp_rdp(struct rcu_state *rsp, + struct rcu_data *rdp, bool wake); /* rcuc/rcub kthread realtime priority */ #ifdef CONFIG_RCU_KTHREAD_PRIO @@ -250,6 +252,12 @@ void rcu_sched_qs(void) __this_cpu_read(rcu_sched_data.gpnum), TPS("cpuqs")); __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false); + if (__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) { + __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false); + rcu_report_exp_rdp(&rcu_sched_state, + this_cpu_ptr(&rcu_sched_data), + true); + } } } @@ -3555,8 +3563,8 @@ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, * Report expedited quiescent state for specified rcu_data (CPU). * Caller must hold the root rcu_node's exp_funnel_mutex. */ -static void __maybe_unused rcu_report_exp_rdp(struct rcu_state *rsp, - struct rcu_data *rdp, bool wake) +static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, + bool wake) { rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake); } @@ -3637,14 +3645,10 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) } /* Invoked on each online non-idle CPU for expedited quiescent state. */ -static int synchronize_sched_expedited_cpu_stop(void *data) +static void synchronize_sched_expedited_cpu_stop(void *data) { - struct rcu_data *rdp = data; - struct rcu_state *rsp = rdp->rsp; - - /* Report the quiescent state. */ - rcu_report_exp_rdp(rsp, rdp, true); - return 0; + __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); + resched_cpu(smp_processor_id()); } /* @@ -3659,6 +3663,7 @@ static void sync_sched_exp_select_cpus(struct rcu_state *rsp) unsigned long mask_ofl_test; unsigned long mask_ofl_ipi; struct rcu_data *rdp; + int ret; struct rcu_node *rnp; sync_exp_reset_tree(rsp); @@ -3694,9 +3699,9 @@ static void sync_sched_exp_select_cpus(struct rcu_state *rsp) if (!(mask_ofl_ipi & mask)) continue; rdp = per_cpu_ptr(rsp->rda, cpu); - stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop, - rdp, &rdp->exp_stop_work); - mask_ofl_ipi &= ~mask; + ret = smp_call_function_single(cpu, synchronize_sched_expedited_cpu_stop, NULL, 0); + if (!ret) + mask_ofl_ipi &= ~mask; } /* Report quiescent states for those that went offline. */ mask_ofl_test |= mask_ofl_ipi; @@ -4201,6 +4206,9 @@ int rcu_cpu_notify(struct notifier_block *self, rcu_cleanup_dying_cpu(rsp); break; case CPU_DYING_IDLE: + /* QS for any half-done expedited RCU-sched GP. */ + rcu_sched_qs(); + for_each_rcu_flavor(rsp) { rcu_cleanup_dying_idle_cpu(cpu, rsp); } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 3eee48b..1b969ce 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -324,9 +324,6 @@ struct rcu_data { /* ticks this CPU has handled */ /* during and after the last grace */ /* period it is aware of. */ - struct cpu_stop_work exp_stop_work; - /* Expedited grace-period control */ - /* for CPU stopping. */ /* 2) batch handling */ /* -- cgit v1.1 From 83c2c735e78da1a0d994911f730f6e1d36c88d7a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 6 Aug 2015 20:43:02 -0700 Subject: rcu: Stop silencing lockdep false positive for expedited grace periods This reverts commit af859beaaba4 (rcu: Silence lockdep false positive for expedited grace periods). Because synchronize_rcu_expedited() no longer invokes synchronize_sched_expedited(), ->exp_funnel_mutex acquisition is no longer nested, so the false positive no longer happens. This commit therefore removes the extra lockdep data structures, as they are no longer needed. --- kernel/rcu/tree.c | 17 ++--------------- kernel/rcu/tree.h | 8 -------- 2 files changed, 2 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a421689..15b19ba 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -71,7 +71,6 @@ MODULE_ALIAS("rcutree"); static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS]; -static struct lock_class_key rcu_exp_sched_class[RCU_NUM_LVLS]; /* * In order to export the rcu_state name to the tracing tools, it @@ -4095,7 +4094,6 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) static void __init rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) { - static struct lock_class_key rcu_exp_sched_rdp_class; unsigned long flags; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rcu_get_root(rsp); @@ -4111,10 +4109,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) mutex_init(&rdp->exp_funnel_mutex); rcu_boot_init_nocb_percpu_data(rdp); raw_spin_unlock_irqrestore(&rnp->lock, flags); - if (rsp == &rcu_sched_state) - lockdep_set_class_and_name(&rdp->exp_funnel_mutex, - &rcu_exp_sched_rdp_class, - "rcu_data_exp_sched"); } /* @@ -4340,7 +4334,6 @@ static void __init rcu_init_one(struct rcu_state *rsp, static const char * const buf[] = RCU_NODE_NAME_INIT; static const char * const fqs[] = RCU_FQS_NAME_INIT; static const char * const exp[] = RCU_EXP_NAME_INIT; - static const char * const exp_sched[] = RCU_EXP_SCHED_NAME_INIT; static u8 fl_mask = 0x1; int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ @@ -4400,14 +4393,8 @@ static void __init rcu_init_one(struct rcu_state *rsp, INIT_LIST_HEAD(&rnp->blkd_tasks); rcu_init_one_nocb(rnp); mutex_init(&rnp->exp_funnel_mutex); - if (rsp == &rcu_sched_state) - lockdep_set_class_and_name( - &rnp->exp_funnel_mutex, - &rcu_exp_sched_class[i], exp_sched[i]); - else - lockdep_set_class_and_name( - &rnp->exp_funnel_mutex, - &rcu_exp_class[i], exp[i]); + lockdep_set_class_and_name(&rnp->exp_funnel_mutex, + &rcu_exp_class[i], exp[i]); } } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 1b969ce..6f3b63b 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -70,8 +70,6 @@ # define RCU_NODE_NAME_INIT { "rcu_node_0" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } # define RCU_EXP_NAME_INIT { "rcu_node_exp_0" } -# define RCU_EXP_SCHED_NAME_INIT \ - { "rcu_node_exp_sched_0" } #elif NR_CPUS <= RCU_FANOUT_2 # define RCU_NUM_LVLS 2 # define NUM_RCU_LVL_0 1 @@ -81,8 +79,6 @@ # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } # define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1" } -# define RCU_EXP_SCHED_NAME_INIT \ - { "rcu_node_exp_sched_0", "rcu_node_exp_sched_1" } #elif NR_CPUS <= RCU_FANOUT_3 # define RCU_NUM_LVLS 3 # define NUM_RCU_LVL_0 1 @@ -93,8 +89,6 @@ # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } # define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" } -# define RCU_EXP_SCHED_NAME_INIT \ - { "rcu_node_exp_sched_0", "rcu_node_exp_sched_1", "rcu_node_exp_sched_2" } #elif NR_CPUS <= RCU_FANOUT_4 # define RCU_NUM_LVLS 4 # define NUM_RCU_LVL_0 1 @@ -106,8 +100,6 @@ # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } # define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" } -# define RCU_EXP_SCHED_NAME_INIT \ - { "rcu_node_exp_sched_0", "rcu_node_exp_sched_1", "rcu_node_exp_sched_2", "rcu_node_exp_sched_3" } #else # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ -- cgit v1.1 From 807226e2fbb504d82cd504b7b6114896db41ef63 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 7 Aug 2015 12:03:45 -0700 Subject: rcu: Stop excluding CPU hotplug in synchronize_sched_expedited() Now that synchronize_sched_expedited() uses IPIs, a hook in rcu_sched_qs(), and the ->expmask field in the rcu_node combining tree, it is no longer necessary to exclude CPU hotplug. Any races with CPU hotplug will be detected when attempting to send the IPI. This commit therefore removes the code excluding CPU hotplug operations. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 15b19ba..7e7df5d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3785,19 +3785,9 @@ void synchronize_sched_expedited(void) /* Take a snapshot of the sequence number. */ s = rcu_exp_gp_seq_snap(rsp); - if (!try_get_online_cpus()) { - /* CPU hotplug operation in flight, fall back to normal GP. */ - wait_rcu_gp(call_rcu_sched); - atomic_long_inc(&rsp->expedited_normal); - return; - } - WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); - rnp = exp_funnel_lock(rsp, s); - if (rnp == NULL) { - put_online_cpus(); + if (rnp == NULL) return; /* Someone else did our work for us. */ - } rcu_exp_gp_seq_start(rsp); sync_sched_exp_select_cpus(rsp); @@ -3805,8 +3795,6 @@ void synchronize_sched_expedited(void) rcu_exp_gp_seq_end(rsp); mutex_unlock(&rnp->exp_funnel_mutex); - - put_online_cpus(); } EXPORT_SYMBOL_GPL(synchronize_sched_expedited); -- cgit v1.1 From 02ef3c4a2aae65a1632b27770bfea3f83ca06772 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 7 Aug 2015 15:14:30 -0700 Subject: cpu: Remove try_get_online_cpus() Now that synchronize_sched_expedited() no longer uses it, there are no users of try_get_online_cpus() in mainline. This commit therefore removes it. Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner --- kernel/cpu.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 82cf9df..14a9cdf 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -102,19 +102,6 @@ void get_online_cpus(void) } EXPORT_SYMBOL_GPL(get_online_cpus); -bool try_get_online_cpus(void) -{ - if (cpu_hotplug.active_writer == current) - return true; - if (!mutex_trylock(&cpu_hotplug.lock)) - return false; - cpuhp_lock_acquire_tryread(); - atomic_inc(&cpu_hotplug.refcount); - mutex_unlock(&cpu_hotplug.lock); - return true; -} -EXPORT_SYMBOL_GPL(try_get_online_cpus); - void put_online_cpus(void) { int refcount; -- cgit v1.1 From 66fe6cbee44e3f78d05882195a6a38e30b97b936 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 15 Aug 2015 17:20:58 -0700 Subject: rcu: Prepare for consolidating expedited CPU selection This commit brings sync_sched_exp_select_cpus() into alignment with sync_rcu_exp_select_cpus(), as a first step towards consolidating them into one function. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7e7df5d..ae582e3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3661,7 +3661,6 @@ static void sync_sched_exp_select_cpus(struct rcu_state *rsp) unsigned long mask; unsigned long mask_ofl_test; unsigned long mask_ofl_ipi; - struct rcu_data *rdp; int ret; struct rcu_node *rnp; @@ -3697,7 +3696,6 @@ static void sync_sched_exp_select_cpus(struct rcu_state *rsp) for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { if (!(mask_ofl_ipi & mask)) continue; - rdp = per_cpu_ptr(rsp->rda, cpu); ret = smp_call_function_single(cpu, synchronize_sched_expedited_cpu_stop, NULL, 0); if (!ret) mask_ofl_ipi &= ~mask; -- cgit v1.1 From dcdb8807ba0f5127d4dc67daeeb154edf50a93d1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 15 Aug 2015 19:00:31 -0700 Subject: rcu: Consolidate expedited CPU selection Now that sync_sched_exp_select_cpus() and sync_rcu_exp_select_cpus() are identical aside from the the argument to smp_call_function_single(), this commit consolidates them with a functional argument. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 7 +++--- kernel/rcu/tree_plugin.h | 61 +----------------------------------------------- 2 files changed, 5 insertions(+), 63 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ae582e3..f44f4b3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3654,7 +3654,8 @@ static void synchronize_sched_expedited_cpu_stop(void *data) * Select the nodes that the upcoming expedited grace period needs * to wait for. */ -static void sync_sched_exp_select_cpus(struct rcu_state *rsp) +static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, + smp_call_func_t func) { int cpu; unsigned long flags; @@ -3696,7 +3697,7 @@ static void sync_sched_exp_select_cpus(struct rcu_state *rsp) for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { if (!(mask_ofl_ipi & mask)) continue; - ret = smp_call_function_single(cpu, synchronize_sched_expedited_cpu_stop, NULL, 0); + ret = smp_call_function_single(cpu, func, rsp, 0); if (!ret) mask_ofl_ipi &= ~mask; } @@ -3788,7 +3789,7 @@ void synchronize_sched_expedited(void) return; /* Someone else did our work for us. */ rcu_exp_gp_seq_start(rsp); - sync_sched_exp_select_cpus(rsp); + sync_rcu_exp_select_cpus(rsp, synchronize_sched_expedited_cpu_stop); synchronize_sched_expedited_wait(rsp); rcu_exp_gp_seq_end(rsp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 7880202..6cbfbfc 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -708,65 +708,6 @@ static void sync_rcu_exp_handler(void *info) rcu_report_exp_rdp(rsp, rdp, true); } -/* - * Select the nodes that the upcoming expedited grace period needs - * to wait for. - */ -static void sync_rcu_exp_select_cpus(struct rcu_state *rsp) -{ - int cpu; - unsigned long flags; - unsigned long mask; - unsigned long mask_ofl_test; - unsigned long mask_ofl_ipi; - int ret; - struct rcu_node *rnp; - - sync_exp_reset_tree(rsp); - rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); - - /* Each pass checks a CPU for identity, offline, and idle. */ - mask_ofl_test = 0; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) { - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - - if (raw_smp_processor_id() == cpu || - cpu_is_offline(cpu) || - !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) - mask_ofl_test |= rdp->grpmask; - } - mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; - - /* - * Need to wait for any blocked tasks as well. Note that - * additional blocking tasks will also block the expedited - * GP until such time as the ->expmask bits are cleared. - */ - if (rcu_preempt_has_tasks(rnp)) - rnp->exp_tasks = rnp->blkd_tasks.next; - raw_spin_unlock_irqrestore(&rnp->lock, flags); - - /* IPI the remaining CPUs for expedited quiescent state. */ - mask = 1; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { - if (!(mask_ofl_ipi & mask)) - continue; - ret = smp_call_function_single(cpu, - sync_rcu_exp_handler, - rsp, 0); - if (!ret) - mask_ofl_ipi &= ~mask; - } - /* Report quiescent states for those that went offline. */ - mask_ofl_test |= mask_ofl_ipi; - if (mask_ofl_test) - rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false); - } -} - /** * synchronize_rcu_expedited - Brute-force RCU grace period * @@ -795,7 +736,7 @@ void synchronize_rcu_expedited(void) rcu_exp_gp_seq_start(rsp); /* Initialize the rcu_node tree in preparation for the wait. */ - sync_rcu_exp_select_cpus(rsp); + sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler); /* Wait for snapshotted ->blkd_tasks lists to drain. */ rnp = rcu_get_root(rsp); -- cgit v1.1 From 74611ecb0fc4c850a8f89a744ce99cbf0dd43cb2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 18 Aug 2015 10:20:43 -0700 Subject: rcu: Add online/offline info to expedited stall warning message This commit makes the RCU CPU stall warning message print online/offline indications immediately after the CPU number. A "O" indicates global offline, a "." global online, and a "o" indicates RCU believes that the CPU is offline for the current grace period and "." otherwise, and an "N" indicates that RCU believes that the CPU will be offline for the next grace period, and "." otherwise, all right after the CPU number. So for CPU 10, you would normally see "10-...:" indicating that everything believes that the CPU is online. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 9 ++++++++- kernel/rcu/tree.h | 1 + kernel/rcu/tree_plugin.h | 31 +++++++++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f44f4b3..3d033fe 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3737,11 +3737,18 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) pr_err("INFO: %s detected expedited stalls on CPUs: {", rsp->name); rcu_for_each_leaf_node(rsp, rnp) { + (void)rcu_print_task_exp_stall(rnp); mask = 1; for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { + struct rcu_data *rdp; + if (!(rnp->expmask & mask)) continue; - pr_cont(" %d", cpu); + rdp = per_cpu_ptr(rsp->rda, cpu); + pr_cont(" %d-%c%c%c", cpu, + "O."[cpu_online(cpu)], + "o."[!!(rdp->grpmask & rnp->expmaskinit)], + "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]); } mask <<= 1; } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 6f3b63b..191aa36 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -589,6 +589,7 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ static void rcu_print_detail_task_stall(struct rcu_state *rsp); static int rcu_print_task_stall(struct rcu_node *rnp); +static int rcu_print_task_exp_stall(struct rcu_node *rnp); static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); static void rcu_preempt_check_callbacks(void); void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 6cbfbfc..7b61cec 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -586,6 +586,27 @@ static int rcu_print_task_stall(struct rcu_node *rnp) } /* + * Scan the current list of tasks blocked within RCU read-side critical + * sections, printing out the tid of each that is blocking the current + * expedited grace period. + */ +static int rcu_print_task_exp_stall(struct rcu_node *rnp) +{ + struct task_struct *t; + int ndetected = 0; + + if (!rnp->exp_tasks) + return 0; + t = list_entry(rnp->exp_tasks->prev, + struct task_struct, rcu_node_entry); + list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { + pr_cont(" P%d", t->pid); + ndetected++; + } + return ndetected; +} + +/* * Check that the list of blocked tasks for the newly completed grace * period is in fact empty. It is a serious bug to complete a grace * period that still has RCU readers blocked! This function must be @@ -846,6 +867,16 @@ static int rcu_print_task_stall(struct rcu_node *rnp) } /* + * Because preemptible RCU does not exist, we never have to check for + * tasks blocked within RCU read-side critical sections that are + * blocking the current expedited grace period. + */ +static int rcu_print_task_exp_stall(struct rcu_node *rnp) +{ + return 0; +} + +/* * Because there is no preemptible RCU, there can be no readers blocked, * so there is no need to check for blocked tasks. So check only for * bogus qsmask values. -- cgit v1.1 From c58656382e5f2919b05913584f2c54b4f841bc9f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 18 Aug 2015 11:25:48 -0700 Subject: rcu: Add tasks to expedited stall-warning messages This commit adds task-print ability to the expedited RCU CPU stall warning messages in preparation for adding stall warnings to synchornize_rcu_expedited(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3d033fe..b4f71ff 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3734,7 +3734,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) sync_rcu_preempt_exp_done(rnp_root)); return; } - pr_err("INFO: %s detected expedited stalls on CPUs: {", + pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", rsp->name); rcu_for_each_leaf_node(rsp, rnp) { (void)rcu_print_task_exp_stall(rnp); -- cgit v1.1 From b08517c76d764c373c232cd309ed058c98705219 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 18 Aug 2015 12:17:29 -0700 Subject: rcu: Enable stall warnings for synchronize_rcu_expedited() This commit redirects synchronize_rcu_expedited()'s wait to synchronize_sched_expedited_wait(), thus enabling RCU CPU stall warnings. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 7b61cec..ffeb99e 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -761,8 +761,7 @@ void synchronize_rcu_expedited(void) /* Wait for snapshotted ->blkd_tasks lists to drain. */ rnp = rcu_get_root(rsp); - wait_event(rsp->expedited_wq, - sync_rcu_preempt_exp_done(rnp)); + synchronize_sched_expedited_wait(rsp); /* Clean up and exit. */ rcu_exp_gp_seq_end(rsp); -- cgit v1.1 From 338b0f760e84676130c6e4d8268cb8c923b38c8c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 3 Sep 2015 00:45:02 -0700 Subject: rcu: Better hotplug handling for synchronize_sched_expedited() Earlier versions of synchronize_sched_expedited() can prematurely end grace periods due to the fact that a CPU marked as cpu_is_offline() can still be using RCU read-side critical sections during the time that CPU makes its last pass through the scheduler and into the idle loop and during the time that a given CPU is in the process of coming online. This commit therefore eliminates this window by adding additional interaction with the CPU-hotplug operations. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 62 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b4f71ff..42b317e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -246,17 +246,23 @@ static int rcu_gp_in_progress(struct rcu_state *rsp) */ void rcu_sched_qs(void) { + unsigned long flags; + if (__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) { trace_rcu_grace_period(TPS("rcu_sched"), __this_cpu_read(rcu_sched_data.gpnum), TPS("cpuqs")); __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false); + if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) + return; + local_irq_save(flags); if (__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) { __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false); rcu_report_exp_rdp(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data), true); } + local_irq_restore(flags); } } @@ -3553,7 +3559,10 @@ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); - WARN_ON_ONCE((rnp->expmask & mask) != mask); + if (!(rnp->expmask & mask)) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; + } rnp->expmask &= ~mask; __rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */ } @@ -3644,12 +3653,37 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) } /* Invoked on each online non-idle CPU for expedited quiescent state. */ -static void synchronize_sched_expedited_cpu_stop(void *data) +static void sync_sched_exp_handler(void *data) { + struct rcu_data *rdp; + struct rcu_node *rnp; + struct rcu_state *rsp = data; + + rdp = this_cpu_ptr(rsp->rda); + rnp = rdp->mynode; + if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || + __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) + return; __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); resched_cpu(smp_processor_id()); } +/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */ +static void sync_sched_exp_online_cleanup(int cpu) +{ + struct rcu_data *rdp; + int ret; + struct rcu_node *rnp; + struct rcu_state *rsp = &rcu_sched_state; + + rdp = per_cpu_ptr(rsp->rda, cpu); + rnp = rdp->mynode; + if (!(READ_ONCE(rnp->expmask) & rdp->grpmask)) + return; + ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0); + WARN_ON_ONCE(ret); +} + /* * Select the nodes that the upcoming expedited grace period needs * to wait for. @@ -3677,7 +3711,6 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); if (raw_smp_processor_id() == cpu || - cpu_is_offline(cpu) || !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) mask_ofl_test |= rdp->grpmask; } @@ -3697,9 +3730,28 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { if (!(mask_ofl_ipi & mask)) continue; +retry_ipi: ret = smp_call_function_single(cpu, func, rsp, 0); - if (!ret) + if (!ret) { mask_ofl_ipi &= ~mask; + } else { + /* Failed, raced with offline. */ + raw_spin_lock_irqsave(&rnp->lock, flags); + if (cpu_online(cpu) && + (rnp->expmask & mask)) { + raw_spin_unlock_irqrestore(&rnp->lock, + flags); + schedule_timeout_uninterruptible(1); + if (cpu_online(cpu) && + (rnp->expmask & mask)) + goto retry_ipi; + raw_spin_lock_irqsave(&rnp->lock, + flags); + } + if (!(rnp->expmask & mask)) + mask_ofl_ipi &= ~mask; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + } } /* Report quiescent states for those that went offline. */ mask_ofl_test |= mask_ofl_ipi; @@ -3796,7 +3848,7 @@ void synchronize_sched_expedited(void) return; /* Someone else did our work for us. */ rcu_exp_gp_seq_start(rsp); - sync_rcu_exp_select_cpus(rsp, synchronize_sched_expedited_cpu_stop); + sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); synchronize_sched_expedited_wait(rsp); rcu_exp_gp_seq_end(rsp); @@ -4183,6 +4235,7 @@ int rcu_cpu_notify(struct notifier_block *self, break; case CPU_ONLINE: case CPU_DOWN_FAILED: + sync_sched_exp_online_cleanup(cpu); rcu_boost_kthread_setaffinity(rnp, -1); break; case CPU_DOWN_PREPARE: @@ -4195,7 +4248,10 @@ int rcu_cpu_notify(struct notifier_block *self, break; case CPU_DYING_IDLE: /* QS for any half-done expedited RCU-sched GP. */ - rcu_sched_qs(); + preempt_disable(); + rcu_report_exp_rdp(&rcu_sched_state, + this_cpu_ptr(rcu_sched_state.rda), true); + preempt_enable(); for_each_rcu_flavor(rsp) { rcu_cleanup_dying_idle_cpu(cpu, rsp); -- cgit v1.1 From 3ad0040573b0c00f88488bc31958acd07a55ee2e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 8 Oct 2015 01:20:39 +0200 Subject: bpf: split state from prandom_u32() and consolidate {c, e}BPF prngs While recently arguing on a seccomp discussion that raw prandom_u32() access shouldn't be exposed to unpriviledged user space, I forgot the fact that SKF_AD_RANDOM extension actually already does it for some time in cBPF via commit 4cd3675ebf74 ("filter: added BPF random opcode"). Since prandom_u32() is being used in a lot of critical networking code, lets be more conservative and split their states. Furthermore, consolidate eBPF and cBPF prandom handlers to use the new internal PRNG. For eBPF, bpf_get_prandom_u32() was only accessible for priviledged users, but should that change one day, we also don't want to leak raw sequences through things like eBPF maps. One thought was also to have own per bpf_prog states, but due to ABI reasons this is not easily possible, i.e. the program code currently cannot access bpf_prog itself, and copying the rnd_state to/from the stack scratch space whenever a program uses the prng seems not really worth the trouble and seems too hacky. If needed, taus113 could in such cases be implemented within eBPF using a map entry to keep the state space, or get_random_bytes() could become a second helper in cases where performance would not be critical. Both sides can trigger a one-time late init via prandom_init_once() on the shared state. Performance-wise, there should even be a tiny gain as bpf_user_rnd_u32() saves one function call. The PRNG needs to live inside the BPF core since kernels could have a NET-less config as well. Signed-off-by: Daniel Borkmann Acked-by: Hannes Frederic Sowa Acked-by: Alexei Starovoitov Cc: Chema Gonzalez Signed-off-by: David S. Miller --- kernel/bpf/core.c | 26 ++++++++++++++++++++++++++ kernel/bpf/helpers.c | 7 +------ kernel/bpf/syscall.c | 2 ++ 3 files changed, 29 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c8855c2..8086471 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -731,6 +731,32 @@ void bpf_prog_free(struct bpf_prog *fp) } EXPORT_SYMBOL_GPL(bpf_prog_free); +/* RNG for unpriviledged user space with separated state from prandom_u32(). */ +static DEFINE_PER_CPU(struct rnd_state, bpf_user_rnd_state); + +void bpf_user_rnd_init_once(void) +{ + prandom_init_once(&bpf_user_rnd_state); +} + +u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + /* Should someone ever have the rather unwise idea to use some + * of the registers passed into this function, then note that + * this function is called from native eBPF and classic-to-eBPF + * transformations. Register assignments from both sides are + * different, f.e. classic always sets fn(ctx, A, X) here. + */ + struct rnd_state *state; + u32 res; + + state = &get_cpu_var(bpf_user_rnd_state); + res = prandom_u32_state(state); + put_cpu_var(state); + + return res; +} + /* Weak definitions of helper functions in case we don't have bpf syscall. */ const struct bpf_func_proto bpf_map_lookup_elem_proto __weak; const struct bpf_func_proto bpf_map_update_elem_proto __weak; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 1447ec0..4504ca6 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -93,13 +93,8 @@ const struct bpf_func_proto bpf_map_delete_elem_proto = { .arg2_type = ARG_PTR_TO_MAP_KEY, }; -static u64 bpf_get_prandom_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) -{ - return prandom_u32(); -} - const struct bpf_func_proto bpf_get_prandom_u32_proto = { - .func = bpf_get_prandom_u32, + .func = bpf_user_rnd_u32, .gpl_only = false, .ret_type = RET_INTEGER, }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5f35f42..c868caf 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -404,6 +404,8 @@ static void fixup_bpf_calls(struct bpf_prog *prog) if (insn->imm == BPF_FUNC_get_route_realm) prog->dst_needed = 1; + if (insn->imm == BPF_FUNC_get_prandom_u32) + bpf_user_rnd_init_once(); if (insn->imm == BPF_FUNC_tail_call) { /* mark bpf_tail_call as different opcode * to avoid conditional branch in -- cgit v1.1 From 9d67dc5da59d63f746aad8f6ec4fbb86d6486f76 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 6 Oct 2015 22:24:50 +0200 Subject: genirq: Export handle_bad_irq A cleanup of the omap gpio driver introduced a use of the handle_bad_irq() function in a device driver that can be a loadable module. This broke the ARM allmodconfig build: ERROR: "handle_bad_irq" [drivers/gpio/gpio-omap.ko] undefined! This patch exports the handle_bad_irq symbol in order to allow the use in modules. Signed-off-by: Arnd Bergmann Cc: Grygorii Strashko Cc: Santosh Shilimkar Cc: Linus Walleij Cc: Austin Schuh Cc: Tony Lindgren Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/5847725.4IBopItaOr@wuerfel Signed-off-by: Thomas Gleixner --- kernel/irq/handle.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index de41a68..77983fc 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -35,6 +35,7 @@ void handle_bad_irq(struct irq_desc *desc) kstat_incr_irqs_this_cpu(desc); ack_bad_irq(irq); } +EXPORT_SYMBOL_GPL(handle_bad_irq); /* * Special, empty irq handler: -- cgit v1.1 From e3096c9c7c645279808a6bf7ac2031b1895ddffb Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 6 Oct 2015 22:59:40 +0200 Subject: genirq: Fix handle_bad_irq kerneldoc comment A recent cleanup removed the 'irq' parameter from many functions, but left the documentation for this in place for at least one function. This removes it. Fixes: bd0b9ac405e1 ("genirq: Remove irq argument from irq flow handlers") Reported-by: kbuild test robot Signed-off-by: Arnd Bergmann Cc: Grygorii Strashko Cc: Tony Lindgren Cc: Linus Walleij Cc: kbuild-all@01.org Cc: Austin Schuh Cc: Santosh Shilimkar Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/5400000.cD19rmgWjV@wuerfel Signed-off-by: Thomas Gleixner --- kernel/irq/handle.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 77983fc..e25a83b 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -22,7 +22,6 @@ /** * handle_bad_irq - handle spurious and unhandled irqs - * @irq: the interrupt number * @desc: description of the interrupt * * Handles spurious and unhandled IRQ's. It also prints a debugmessage. -- cgit v1.1 From e509bd7da149dc34916037484cd7545b2d48a2b0 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 5 Oct 2015 13:12:15 +0300 Subject: genirq: Allow migration of chained interrupts by installing default action When a CPU is offlined all interrupts that have an action are migrated to other still online CPUs. However, if the interrupt has chained handler installed this is not done. Chained handlers are used by GPIO drivers which support interrupts, for instance. When the affinity is not corrected properly we end up in situation where most interrupts are not arriving to the online CPUs anymore. For example on Intel Braswell system which has SD-card card detection signal connected to a GPIO the IO-APIC routing entries look like below after CPU1 is offlined: pin30, enabled , level, low , V(52), IRR(0), S(0), logical , D(03), M(1) pin31, enabled , level, low , V(42), IRR(0), S(0), logical , D(03), M(1) pin32, enabled , level, low , V(62), IRR(0), S(0), logical , D(03), M(1) pin5b, enabled , level, low , V(72), IRR(0), S(0), logical , D(03), M(1) The problem here is that the destination mask still contains both CPUs even if CPU1 is already offline. This means that the IO-APIC still routes interrupts to the other CPU as well. We solve the problem by providing a default action for chained interrupts. This action allows the migration code to correct affinity (as it finds desc->action != NULL). Also make the default action handler to emit a warning if for some reason a chained handler ends up calling it. Signed-off-by: Mika Westerberg Cc: Jiang Liu Link: http://lkml.kernel.org/r/1444039935-30475-1-git-send-email-mika.westerberg@linux.intel.com Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 17 +++++++++++++++++ kernel/irq/internals.h | 2 ++ kernel/irq/proc.c | 2 +- 3 files changed, 20 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 46f1fb5..4aa00d3 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -21,6 +21,20 @@ #include "internals.h" +static irqreturn_t bad_chained_irq(int irq, void *dev_id) +{ + WARN_ONCE(1, "Chained irq %d should not call an action\n", irq); + return IRQ_NONE; +} + +/* + * Chained handlers should never call action on their IRQ. This default + * action will emit warning if such thing happens. + */ +struct irqaction chained_action = { + .handler = bad_chained_irq, +}; + /** * irq_set_chip - set the irq chip for an irq * @irq: irq number @@ -746,6 +760,8 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, if (desc->irq_data.chip != &no_irq_chip) mask_ack_irq(desc); irq_state_set_disabled(desc); + if (is_chained) + desc->action = NULL; desc->depth = 1; } desc->handle_irq = handle; @@ -755,6 +771,7 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, irq_settings_set_noprobe(desc); irq_settings_set_norequest(desc); irq_settings_set_nothread(desc); + desc->action = &chained_action; irq_startup(desc, true); } } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index cd60bb4..05c2188 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -18,6 +18,8 @@ extern bool noirqdebug; +extern struct irqaction chained_action; + /* * Bits used by threaded handlers: * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index e3a8c95..7d60905 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -460,7 +460,7 @@ int show_interrupts(struct seq_file *p, void *v) for_each_online_cpu(j) any_count |= kstat_irqs_cpu(i, j); action = desc->action; - if (!action && !any_count) + if ((!action || action == &chained_action) && !any_count) goto out; seq_printf(p, "%*d: ", prec, i); -- cgit v1.1 From fcf1ae2f7a044cda9956ec7afb487296afff058e Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Sat, 3 Oct 2015 16:20:38 +0800 Subject: genirq: Make irq_set_vcpu_affinity available for CONFIG_SMP=n irq_set_vcpu_affinity() is needed when CONFIG_SMP=n, so move the definition out of "#ifdef CONFIG_SMP" Suggested-by: Paolo Bonzini Signed-off-by: Feng Wu Cc: jiang.liu@linux.intel.com Cc: pbonzini@redhat.com Link: http://lkml.kernel.org/r/1443860438-144926-1-git-send-email-feng.wu@intel.com Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 62 ++++++++++++++++++++++++++--------------------------- 1 file changed, 31 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0a63c2b..312f9cb 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -258,37 +258,6 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) } EXPORT_SYMBOL_GPL(irq_set_affinity_hint); -/** - * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt - * @irq: interrupt number to set affinity - * @vcpu_info: vCPU specific data - * - * This function uses the vCPU specific data to set the vCPU - * affinity for an irq. The vCPU specific data is passed from - * outside, such as KVM. One example code path is as below: - * KVM -> IOMMU -> irq_set_vcpu_affinity(). - */ -int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info) -{ - unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); - struct irq_data *data; - struct irq_chip *chip; - int ret = -ENOSYS; - - if (!desc) - return -EINVAL; - - data = irq_desc_get_irq_data(desc); - chip = irq_data_get_irq_chip(data); - if (chip && chip->irq_set_vcpu_affinity) - ret = chip->irq_set_vcpu_affinity(data, vcpu_info); - irq_put_desc_unlock(desc, flags); - - return ret; -} -EXPORT_SYMBOL_GPL(irq_set_vcpu_affinity); - static void irq_affinity_notify(struct work_struct *work) { struct irq_affinity_notify *notify = @@ -424,6 +393,37 @@ setup_affinity(struct irq_desc *desc, struct cpumask *mask) } #endif +/** + * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt + * @irq: interrupt number to set affinity + * @vcpu_info: vCPU specific data + * + * This function uses the vCPU specific data to set the vCPU + * affinity for an irq. The vCPU specific data is passed from + * outside, such as KVM. One example code path is as below: + * KVM -> IOMMU -> irq_set_vcpu_affinity(). + */ +int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info) +{ + unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); + struct irq_data *data; + struct irq_chip *chip; + int ret = -ENOSYS; + + if (!desc) + return -EINVAL; + + data = irq_desc_get_irq_data(desc); + chip = irq_data_get_irq_chip(data); + if (chip && chip->irq_set_vcpu_affinity) + ret = chip->irq_set_vcpu_affinity(data, vcpu_info); + irq_put_desc_unlock(desc, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(irq_set_vcpu_affinity); + void __disable_irq(struct irq_desc *desc) { if (!desc->depth++) -- cgit v1.1 From d741314fe8318c1af497b8cf04ea3976b9509eca Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 15 Sep 2015 02:37:48 -0400 Subject: devm_memunmap: use devres_release() Remove open coded call to memunmap. Cc: Christoph Hellwig Cc: Ross Zwisler Reviewed-by: Christoph Hellwig Signed-off-by: Dan Williams --- kernel/memremap.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 72b0c66..0756273 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -131,9 +131,8 @@ EXPORT_SYMBOL(devm_memremap); void devm_memunmap(struct device *dev, void *addr) { - WARN_ON(devres_destroy(dev, devm_memremap_release, devm_memremap_match, - addr)); - memunmap(addr); + WARN_ON(devres_release(dev, devm_memremap_release, + devm_memremap_match, addr)); } EXPORT_SYMBOL(devm_memunmap); -- cgit v1.1 From b36f47617f6ce7c5e8e7c264b9d9ea0654d9f20a Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 15 Sep 2015 02:42:20 -0400 Subject: devm_memremap: convert to return ERR_PTR Make devm_memremap consistent with the error return scheme of devm_memremap_pages to remove special casing in the pmem driver. Cc: Christoph Hellwig Cc: Ross Zwisler Reviewed-by: Christoph Hellwig Signed-off-by: Dan Williams --- kernel/memremap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 0756273..0d818ce 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -116,7 +116,7 @@ void *devm_memremap(struct device *dev, resource_size_t offset, ptr = devres_alloc(devm_memremap_release, sizeof(*ptr), GFP_KERNEL); if (!ptr) - return NULL; + return ERR_PTR(-ENOMEM); addr = memremap(offset, size, flags); if (addr) { -- cgit v1.1 From 7eff93b7c99f5d0024aee677c6c92e32af22e1d2 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 5 Oct 2015 20:35:55 -0400 Subject: devm_memremap_pages: use numa_mem_id Hint to closest numa node for the placement of newly allocated pages. As that is where the device's other allocations will originate by default when it does not specify a NUMA node. Cc: Ross Zwisler Reviewed-by: Tejun Heo Reviewed-by: Christoph Hellwig Signed-off-by: Dan Williams --- kernel/memremap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 0d818ce..56fc478 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -174,7 +174,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res) nid = dev_to_node(dev); if (nid < 0) - nid = 0; + nid = numa_mem_id(); error = arch_add_memory(nid, res->start, resource_size(res), true); if (error) { -- cgit v1.1 From 538ea4aa44737127ce2b5c8511c7349d2abdcf9c Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 5 Oct 2015 20:35:56 -0400 Subject: pmem, memremap: convert to numa aware allocations Given that pmem ranges come with numa-locality hints, arrange for the resulting driver objects to be obtained from node-local memory. Reviewed-by: Tejun Heo Reviewed-by: Christoph Hellwig Signed-off-by: Dan Williams --- kernel/memremap.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 56fc478..3218e8b 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -114,7 +114,8 @@ void *devm_memremap(struct device *dev, resource_size_t offset, { void **ptr, *addr; - ptr = devres_alloc(devm_memremap_release, sizeof(*ptr), GFP_KERNEL); + ptr = devres_alloc_node(devm_memremap_release, sizeof(*ptr), GFP_KERNEL, + dev_to_node(dev)); if (!ptr) return ERR_PTR(-ENOMEM); @@ -165,8 +166,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res) if (is_ram == REGION_INTERSECTS) return __va(res->start); - page_map = devres_alloc(devm_memremap_pages_release, - sizeof(*page_map), GFP_KERNEL); + page_map = devres_alloc_node(devm_memremap_pages_release, + sizeof(*page_map), GFP_KERNEL, dev_to_node(dev)); if (!page_map) return ERR_PTR(-ENOMEM); -- cgit v1.1 From e9849777d0e27cdd2902805be51da73e7c79578c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 9 Oct 2015 23:28:58 +0200 Subject: genirq: Add flag to force mask in disable_irq[_nosync]() If an irq chip does not implement the irq_disable callback, then we use a lazy approach for disabling the interrupt. That means that the interrupt is marked disabled, but the interrupt line is not immediately masked in the interrupt chip. It only becomes masked if the interrupt is raised while it's marked disabled. We use this to avoid possibly expensive mask/unmask operations for common case operations. Unfortunately there are devices which do not allow the interrupt to be disabled easily at the device level. They are forced to use disable_irq_nosync(). This can result in taking each interrupt twice. Instead of enforcing the non lazy mode on all interrupts of a irq chip, provide a settings flag, which can be set by the driver for that particular interrupt line. Reported-and-tested-by: Duc Dang Signed-off-by: Thomas Gleixner Cc: Marc Zyngier Cc: Jason Cooper Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1510092348370.6097@nanos --- kernel/irq/chip.c | 9 +++++++++ kernel/irq/manage.c | 1 + kernel/irq/settings.h | 12 ++++++++++++ 3 files changed, 22 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 4aa00d3..1520645 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -241,6 +241,13 @@ void irq_enable(struct irq_desc *desc) * disabled. If an interrupt happens, then the interrupt flow * handler masks the line at the hardware level and marks it * pending. + * + * If the interrupt chip does not implement the irq_disable callback, + * a driver can disable the lazy approach for a particular irq line by + * calling 'irq_set_status_flags(irq, IRQ_DISABLE_UNLAZY)'. This can + * be used for devices which cannot disable the interrupt at the + * device level under certain circumstances and have to use + * disable_irq[_nosync] instead. */ void irq_disable(struct irq_desc *desc) { @@ -248,6 +255,8 @@ void irq_disable(struct irq_desc *desc) if (desc->irq_data.chip->irq_disable) { desc->irq_data.chip->irq_disable(&desc->irq_data); irq_state_set_masked(desc); + } else if (irq_settings_disable_unlazy(desc)) { + mask_irq(desc); } } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 312f9cb..a71175f 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1463,6 +1463,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) /* If this was the last handler, shut down the IRQ line: */ if (!desc->action) { + irq_settings_clr_disable_unlazy(desc); irq_shutdown(desc); irq_release_resources(desc); } diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 3320b84..320579d 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -15,6 +15,7 @@ enum { _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD, _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID, _IRQ_IS_POLLED = IRQ_IS_POLLED, + _IRQ_DISABLE_UNLAZY = IRQ_DISABLE_UNLAZY, _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, }; @@ -28,6 +29,7 @@ enum { #define IRQ_NESTED_THREAD GOT_YOU_MORON #define IRQ_PER_CPU_DEVID GOT_YOU_MORON #define IRQ_IS_POLLED GOT_YOU_MORON +#define IRQ_DISABLE_UNLAZY GOT_YOU_MORON #undef IRQF_MODIFY_MASK #define IRQF_MODIFY_MASK GOT_YOU_MORON @@ -154,3 +156,13 @@ static inline bool irq_settings_is_polled(struct irq_desc *desc) { return desc->status_use_accessors & _IRQ_IS_POLLED; } + +static inline bool irq_settings_disable_unlazy(struct irq_desc *desc) +{ + return desc->status_use_accessors & _IRQ_DISABLE_UNLAZY; +} + +static inline void irq_settings_clr_disable_unlazy(struct irq_desc *desc) +{ + desc->status_use_accessors &= ~_IRQ_DISABLE_UNLAZY; +} -- cgit v1.1 From ff936a04e5f28b7e0455be0e7fa91334f89e4b44 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 7 Oct 2015 10:55:41 -0700 Subject: bpf: fix cb access in socket filter programs eBPF socket filter programs may see junk in 'u32 cb[5]' area, since it could have been used by protocol layers earlier. For socket filter programs used in af_packet we need to clean 20 bytes of skb->cb area if it could be used by the program. For programs attached to TCP/UDP sockets we need to save/restore these 20 bytes, since it's used by protocol layers. Remove SK_RUN_FILTER macro, since it's no longer used. Long term we may move this bpf cb area to per-cpu scratch, but that requires addition of new 'per-cpu load/store' instructions, so not suitable as a short term fix. Fixes: d691f9e8d440 ("bpf: allow programs to write to certain skb fields") Reported-by: Eric Dumazet Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b074b23..f8da034 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2024,7 +2024,7 @@ static int convert_ctx_accesses(struct verifier_env *env) cnt = env->prog->aux->ops-> convert_ctx_access(type, insn->dst_reg, insn->src_reg, - insn->off, insn_buf); + insn->off, insn_buf, env->prog); if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { verbose("bpf verifier is misconfigured\n"); return -EINVAL; -- cgit v1.1 From cfed432d7f4114e16e0163bcfe65e96f0c304493 Mon Sep 17 00:00:00 2001 From: Guillaume Gomez Date: Wed, 23 Sep 2015 13:19:19 +0200 Subject: clocksource: Remove return statement from void functions Signed-off-by: Guillaume Gomez Cc: John Stultz Link: http://lkml.kernel.org/r/CAAOQCfSDgmqSWDBsetau%2ByF8x0%2BDagCF_pfFw0p5xH_BKkKEog@mail.gmail.com Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 174c594..de0ad66 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -595,16 +595,15 @@ static void __clocksource_select(bool skipcur) */ static void clocksource_select(void) { - return __clocksource_select(false); + __clocksource_select(false); } static void clocksource_select_fallback(void) { - return __clocksource_select(true); + __clocksource_select(true); } #else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ - static inline void clocksource_select(void) { } static inline void clocksource_select_fallback(void) { } -- cgit v1.1 From 9fc4468d546b6eb55b0aa5b04b0c36238ebf57e7 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 2 Oct 2015 09:45:30 +0200 Subject: timers: Use __fls in apply_slack() In apply_slack(), find_last_bit() is applied to a bitmask consisting of precisely BITS_PER_LONG bits. Since mask is non-zero, we might as well eliminate the function call and use __fls() directly. On x86_64, this shaves 23 bytes of the only caller, mod_timer(). This also gets rid of Coverity CID 1192106, but that is a false positive: Coverity is not aware that mask != 0 implies that find_last_bit will not return BITS_PER_LONG. Signed-off-by: Rasmus Villemoes Cc: John Stultz Link: http://lkml.kernel.org/r/1443771931-6284-1-git-send-email-linux@rasmusvillemoes.dk Signed-off-by: Thomas Gleixner --- kernel/time/timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index d3f5e92..74591ba 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -874,7 +874,7 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires) if (mask == 0) return expires; - bit = find_last_bit(&mask, BITS_PER_LONG); + bit = __fls(mask); mask = (1UL << bit) - 1; -- cgit v1.1 From 9babcd7929bc8967ae3bb6093f603b93c2f9958f Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Thu, 8 Oct 2015 15:36:06 -0300 Subject: sched, tracing: Stop/start critical timings around the idle=poll idle loop When using idle=poll, the preemptoff tracer is always showing the idle task as the culprit for long latencies. That happens because critical timings are not stopped before idle loop. This patch stops critical timings before entering the idle loop, starting it again after the idle loop. This problem does not affect the irqsoff tracer because interruptions are enabled before entering the idle loop. Signed-off-by: Daniel Bristot de Oliveira Reviewed-by: Luis Claudio R. Goncalves Acked-by: Steven Rostedt Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/10fc3705874aef11dbe152a068b591a7be1899b4.1444314899.git.bristot@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/idle.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 8f177c7..4a2ef5a 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -57,9 +57,11 @@ static inline int cpu_idle_poll(void) rcu_idle_enter(); trace_cpu_idle_rcuidle(0, smp_processor_id()); local_irq_enable(); + stop_critical_timings(); while (!tif_need_resched() && (cpu_idle_force_poll || tick_check_broadcast_expired())) cpu_relax(); + start_critical_timings(); trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); rcu_idle_exit(); return 1; -- cgit v1.1 From e2273584d3f33f7f2cfe6d7aaade0fa2f1cb3db5 Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Fri, 9 Oct 2015 11:53:12 +0800 Subject: workqueue: Allocate the unbound pool using local node memory Currently, get_unbound_pool() uses kzalloc() to allocate the worker pool. Actually, we can use the right node to do the allocation, achieving local memory access. This patch selects target node first, and uses kzalloc_node() instead. Signed-off-by: Xunlei Pang Signed-off-by: Tejun Heo --- kernel/workqueue.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ca71582..96d3747 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3199,6 +3199,7 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) u32 hash = wqattrs_hash(attrs); struct worker_pool *pool; int node; + int target_node = NUMA_NO_NODE; lockdep_assert_held(&wq_pool_mutex); @@ -3210,13 +3211,25 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) } } + /* if cpumask is contained inside a NUMA node, we belong to that node */ + if (wq_numa_enabled) { + for_each_node(node) { + if (cpumask_subset(attrs->cpumask, + wq_numa_possible_cpumask[node])) { + target_node = node; + break; + } + } + } + /* nope, create a new one */ - pool = kzalloc(sizeof(*pool), GFP_KERNEL); + pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, target_node); if (!pool || init_worker_pool(pool) < 0) goto fail; lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ copy_workqueue_attrs(pool->attrs, attrs); + pool->node = target_node; /* * no_numa isn't a worker_pool attribute, always clear it. See @@ -3224,17 +3237,6 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) */ pool->attrs->no_numa = false; - /* if cpumask is contained inside a NUMA node, we belong to that node */ - if (wq_numa_enabled) { - for_each_node(node) { - if (cpumask_subset(pool->attrs->cpumask, - wq_numa_possible_cpumask[node])) { - pool->node = node; - break; - } - } - } - if (worker_pool_assign_id(pool) < 0) goto fail; -- cgit v1.1 From 1be7f75d1668d6296b80bf35dcf6762393530afc Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 7 Oct 2015 22:23:21 -0700 Subject: bpf: enable non-root eBPF programs In order to let unprivileged users load and execute eBPF programs teach verifier to prevent pointer leaks. Verifier will prevent - any arithmetic on pointers (except R10+Imm which is used to compute stack addresses) - comparison of pointers (except if (map_value_ptr == 0) ... ) - passing pointers to helper functions - indirectly passing pointers in stack to helper functions - returning pointer from bpf program - storing pointers into ctx or maps Spill/fill of pointers into stack is allowed, but mangling of pointers stored in the stack or reading them byte by byte is not. Within bpf programs the pointers do exist, since programs need to be able to access maps, pass skb pointer to LD_ABS insns, etc but programs cannot pass such pointer values to the outside or obfuscate them. Only allow BPF_PROG_TYPE_SOCKET_FILTER unprivileged programs, so that socket filters (tcpdump), af_packet (quic acceleration) and future kcm can use it. tracing and tc cls/act program types still require root permissions, since tracing actually needs to be able to see all kernel pointers and tc is for root only. For example, the following unprivileged socket filter program is allowed: int bpf_prog1(struct __sk_buff *skb) { u32 index = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol)); u64 *value = bpf_map_lookup_elem(&my_map, &index); if (value) *value += skb->len; return 0; } but the following program is not: int bpf_prog1(struct __sk_buff *skb) { u32 index = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol)); u64 *value = bpf_map_lookup_elem(&my_map, &index); if (value) *value += (u64) skb; return 0; } since it would leak the kernel address into the map. Unprivileged socket filter bpf programs have access to the following helper functions: - map lookup/update/delete (but they cannot store kernel pointers into them) - get_random (it's already exposed to unprivileged user space) - get_smp_processor_id - tail_call into another socket filter program - ktime_get_ns The feature is controlled by sysctl kernel.unprivileged_bpf_disabled. This toggle defaults to off (0), but can be set true (1). Once true, bpf programs and maps cannot be accessed from unprivileged process, and the toggle cannot be set back to false. Signed-off-by: Alexei Starovoitov Reviewed-by: Kees Cook Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 11 +++--- kernel/bpf/verifier.c | 106 +++++++++++++++++++++++++++++++++++++++++++++----- kernel/sysctl.c | 13 +++++++ 3 files changed, 116 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c868caf..83697bc 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -18,6 +18,8 @@ #include #include +int sysctl_unprivileged_bpf_disabled __read_mostly; + static LIST_HEAD(bpf_map_types); static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) @@ -544,6 +546,9 @@ static int bpf_prog_load(union bpf_attr *attr) attr->kern_version != LINUX_VERSION_CODE) return -EINVAL; + if (type != BPF_PROG_TYPE_SOCKET_FILTER && !capable(CAP_SYS_ADMIN)) + return -EPERM; + /* plain bpf_prog allocation */ prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); if (!prog) @@ -599,11 +604,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz union bpf_attr attr = {}; int err; - /* the syscall is limited to root temporarily. This restriction will be - * lifted when security audit is clean. Note that eBPF+tracing must have - * this restriction, since it may pass kernel data to user space - */ - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_SYS_ADMIN) && sysctl_unprivileged_bpf_disabled) return -EPERM; if (!access_ok(VERIFY_READ, uattr, 1)) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f8da034..1d6b97b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -199,6 +199,7 @@ struct verifier_env { struct verifier_state_list **explored_states; /* search pruning optimization */ struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ u32 used_map_cnt; /* number of used maps */ + bool allow_ptr_leaks; }; /* verbose verifier prints what it's seeing @@ -538,6 +539,21 @@ static int bpf_size_to_bytes(int bpf_size) return -EINVAL; } +static bool is_spillable_regtype(enum bpf_reg_type type) +{ + switch (type) { + case PTR_TO_MAP_VALUE: + case PTR_TO_MAP_VALUE_OR_NULL: + case PTR_TO_STACK: + case PTR_TO_CTX: + case FRAME_PTR: + case CONST_PTR_TO_MAP: + return true; + default: + return false; + } +} + /* check_stack_read/write functions track spill/fill of registers, * stack boundary and alignment are checked in check_mem_access() */ @@ -550,9 +566,7 @@ static int check_stack_write(struct verifier_state *state, int off, int size, */ if (value_regno >= 0 && - (state->regs[value_regno].type == PTR_TO_MAP_VALUE || - state->regs[value_regno].type == PTR_TO_STACK || - state->regs[value_regno].type == PTR_TO_CTX)) { + is_spillable_regtype(state->regs[value_regno].type)) { /* register containing pointer is being spilled into stack */ if (size != BPF_REG_SIZE) { @@ -643,6 +657,20 @@ static int check_ctx_access(struct verifier_env *env, int off, int size, return -EACCES; } +static bool is_pointer_value(struct verifier_env *env, int regno) +{ + if (env->allow_ptr_leaks) + return false; + + switch (env->cur_state.regs[regno].type) { + case UNKNOWN_VALUE: + case CONST_IMM: + return false; + default: + return true; + } +} + /* check whether memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register which value is stored into memory * if t==read, value_regno is a register which will receive the value from memory @@ -669,11 +697,21 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, } if (state->regs[regno].type == PTR_TO_MAP_VALUE) { + if (t == BPF_WRITE && value_regno >= 0 && + is_pointer_value(env, value_regno)) { + verbose("R%d leaks addr into map\n", value_regno); + return -EACCES; + } err = check_map_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown_value(state->regs, value_regno); } else if (state->regs[regno].type == PTR_TO_CTX) { + if (t == BPF_WRITE && value_regno >= 0 && + is_pointer_value(env, value_regno)) { + verbose("R%d leaks addr into ctx\n", value_regno); + return -EACCES; + } err = check_ctx_access(env, off, size, t); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown_value(state->regs, value_regno); @@ -684,10 +722,17 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, verbose("invalid stack off=%d size=%d\n", off, size); return -EACCES; } - if (t == BPF_WRITE) + if (t == BPF_WRITE) { + if (!env->allow_ptr_leaks && + state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL && + size != BPF_REG_SIZE) { + verbose("attempt to corrupt spilled pointer on stack\n"); + return -EACCES; + } err = check_stack_write(state, off, size, value_regno); - else + } else { err = check_stack_read(state, off, size, value_regno); + } } else { verbose("R%d invalid mem access '%s'\n", regno, reg_type_str[state->regs[regno].type]); @@ -775,8 +820,13 @@ static int check_func_arg(struct verifier_env *env, u32 regno, return -EACCES; } - if (arg_type == ARG_ANYTHING) + if (arg_type == ARG_ANYTHING) { + if (is_pointer_value(env, regno)) { + verbose("R%d leaks addr into helper function\n", regno); + return -EACCES; + } return 0; + } if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY || arg_type == ARG_PTR_TO_MAP_VALUE) { @@ -950,8 +1000,9 @@ static int check_call(struct verifier_env *env, int func_id) } /* check validity of 32-bit and 64-bit arithmetic operations */ -static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn) +static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) { + struct reg_state *regs = env->cur_state.regs; u8 opcode = BPF_OP(insn->code); int err; @@ -976,6 +1027,12 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn) if (err) return err; + if (is_pointer_value(env, insn->dst_reg)) { + verbose("R%d pointer arithmetic prohibited\n", + insn->dst_reg); + return -EACCES; + } + /* check dest operand */ err = check_reg_arg(regs, insn->dst_reg, DST_OP); if (err) @@ -1012,6 +1069,11 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn) */ regs[insn->dst_reg] = regs[insn->src_reg]; } else { + if (is_pointer_value(env, insn->src_reg)) { + verbose("R%d partial copy of pointer\n", + insn->src_reg); + return -EACCES; + } regs[insn->dst_reg].type = UNKNOWN_VALUE; regs[insn->dst_reg].map_ptr = NULL; } @@ -1061,8 +1123,18 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn) /* pattern match 'bpf_add Rx, imm' instruction */ if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && regs[insn->dst_reg].type == FRAME_PTR && - BPF_SRC(insn->code) == BPF_K) + BPF_SRC(insn->code) == BPF_K) { stack_relative = true; + } else if (is_pointer_value(env, insn->dst_reg)) { + verbose("R%d pointer arithmetic prohibited\n", + insn->dst_reg); + return -EACCES; + } else if (BPF_SRC(insn->code) == BPF_X && + is_pointer_value(env, insn->src_reg)) { + verbose("R%d pointer arithmetic prohibited\n", + insn->src_reg); + return -EACCES; + } /* check dest operand */ err = check_reg_arg(regs, insn->dst_reg, DST_OP); @@ -1101,6 +1173,12 @@ static int check_cond_jmp_op(struct verifier_env *env, err = check_reg_arg(regs, insn->src_reg, SRC_OP); if (err) return err; + + if (is_pointer_value(env, insn->src_reg)) { + verbose("R%d pointer comparison prohibited\n", + insn->src_reg); + return -EACCES; + } } else { if (insn->src_reg != BPF_REG_0) { verbose("BPF_JMP uses reserved fields\n"); @@ -1155,6 +1233,9 @@ static int check_cond_jmp_op(struct verifier_env *env, regs[insn->dst_reg].type = CONST_IMM; regs[insn->dst_reg].imm = 0; } + } else if (is_pointer_value(env, insn->dst_reg)) { + verbose("R%d pointer comparison prohibited\n", insn->dst_reg); + return -EACCES; } else if (BPF_SRC(insn->code) == BPF_K && (opcode == BPF_JEQ || opcode == BPF_JNE)) { @@ -1658,7 +1739,7 @@ static int do_check(struct verifier_env *env) } if (class == BPF_ALU || class == BPF_ALU64) { - err = check_alu_op(regs, insn); + err = check_alu_op(env, insn); if (err) return err; @@ -1816,6 +1897,11 @@ static int do_check(struct verifier_env *env) if (err) return err; + if (is_pointer_value(env, BPF_REG_0)) { + verbose("R0 leaks addr as return value\n"); + return -EACCES; + } + process_bpf_exit: insn_idx = pop_stack(env, &prev_insn_idx); if (insn_idx < 0) { @@ -2144,6 +2230,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) if (ret < 0) goto skip_full_check; + env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); + ret = do_check(env); skip_full_check: diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e69201d..96c856b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -64,6 +64,7 @@ #include #include #include +#include #include #include @@ -1139,6 +1140,18 @@ static struct ctl_table kern_table[] = { .proc_handler = timer_migration_handler, }, #endif +#ifdef CONFIG_BPF_SYSCALL + { + .procname = "unprivileged_bpf_disabled", + .data = &sysctl_unprivileged_bpf_disabled, + .maxlen = sizeof(sysctl_unprivileged_bpf_disabled), + .mode = 0644, + /* only handle a transition from default "0" to "1" */ + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + .extra2 = &one, + }, +#endif { } }; -- cgit v1.1 From aaac3ba95e4c8b496d22f68bd1bc01cfbf525eca Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 7 Oct 2015 22:23:22 -0700 Subject: bpf: charge user for creation of BPF maps and programs since eBPF programs and maps use kernel memory consider it 'locked' memory from user accounting point of view and charge it against RLIMIT_MEMLOCK limit. This limit is typically set to 64Kbytes by distros, so almost all bpf+tracing programs would need to increase it, since they use maps, but kernel charges maximum map size upfront. For example the hash map of 1024 elements will be charged as 64Kbyte. It's inconvenient for current users and changes current behavior for root, but probably worth doing to be consistent root vs non-root. Similar accounting logic is done by mmap of perf_event. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/arraymap.c | 2 +- kernel/bpf/hashtab.c | 4 ++++ kernel/bpf/syscall.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 68 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 2fecc4a..f2d9e69 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -49,7 +49,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) array->map.key_size = attr->key_size; array->map.value_size = attr->value_size; array->map.max_entries = attr->max_entries; - + array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT; array->elem_size = elem_size; return &array->map; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 83c209d..28592d7 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -88,6 +88,10 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) htab->elem_size = sizeof(struct htab_elem) + round_up(htab->map.key_size, 8) + htab->map.value_size; + + htab->map.pages = round_up(htab->n_buckets * sizeof(struct hlist_head) + + htab->elem_size * htab->map.max_entries, + PAGE_SIZE) >> PAGE_SHIFT; return &htab->map; free_htab: diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 83697bc..f640e5f 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -46,11 +46,38 @@ void bpf_register_map_type(struct bpf_map_type_list *tl) list_add(&tl->list_node, &bpf_map_types); } +static int bpf_map_charge_memlock(struct bpf_map *map) +{ + struct user_struct *user = get_current_user(); + unsigned long memlock_limit; + + memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + atomic_long_add(map->pages, &user->locked_vm); + + if (atomic_long_read(&user->locked_vm) > memlock_limit) { + atomic_long_sub(map->pages, &user->locked_vm); + free_uid(user); + return -EPERM; + } + map->user = user; + return 0; +} + +static void bpf_map_uncharge_memlock(struct bpf_map *map) +{ + struct user_struct *user = map->user; + + atomic_long_sub(map->pages, &user->locked_vm); + free_uid(user); +} + /* called from workqueue */ static void bpf_map_free_deferred(struct work_struct *work) { struct bpf_map *map = container_of(work, struct bpf_map, work); + bpf_map_uncharge_memlock(map); /* implementation dependent freeing */ map->ops->map_free(map); } @@ -110,6 +137,10 @@ static int map_create(union bpf_attr *attr) atomic_set(&map->refcnt, 1); + err = bpf_map_charge_memlock(map); + if (err) + goto free_map; + err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC); if (err < 0) @@ -442,11 +473,37 @@ static void free_used_maps(struct bpf_prog_aux *aux) kfree(aux->used_maps); } +static int bpf_prog_charge_memlock(struct bpf_prog *prog) +{ + struct user_struct *user = get_current_user(); + unsigned long memlock_limit; + + memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + atomic_long_add(prog->pages, &user->locked_vm); + if (atomic_long_read(&user->locked_vm) > memlock_limit) { + atomic_long_sub(prog->pages, &user->locked_vm); + free_uid(user); + return -EPERM; + } + prog->aux->user = user; + return 0; +} + +static void bpf_prog_uncharge_memlock(struct bpf_prog *prog) +{ + struct user_struct *user = prog->aux->user; + + atomic_long_sub(prog->pages, &user->locked_vm); + free_uid(user); +} + static void __prog_put_rcu(struct rcu_head *rcu) { struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); free_used_maps(aux); + bpf_prog_uncharge_memlock(aux->prog); bpf_prog_free(aux->prog); } @@ -554,6 +611,10 @@ static int bpf_prog_load(union bpf_attr *attr) if (!prog) return -ENOMEM; + err = bpf_prog_charge_memlock(prog); + if (err) + goto free_prog_nouncharge; + prog->len = attr->insn_cnt; err = -EFAULT; @@ -595,6 +656,8 @@ static int bpf_prog_load(union bpf_attr *attr) free_used_maps: free_used_maps(prog->aux); free_prog: + bpf_prog_uncharge_memlock(prog); +free_prog_nouncharge: bpf_prog_free(prog); return err; } -- cgit v1.1 From 5d4c9bc7767bc86eb9a0e66df783e3fbada7dc97 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 13 Oct 2015 12:51:29 +0100 Subject: irqdomain: Use irq_domain_get_of_node() instead of direct field access The struct irq_domain contains a "struct device_node *" field (of_node) that is almost the only link between the irqdomain and the device tree infrastructure. In order to prepare for the removal of that field, convert all users to use irq_domain_get_of_node() instead. Signed-off-by: Marc Zyngier Reviewed-and-tested-by: Hanjun Guo Tested-by: Lorenzo Pieralisi Cc: Cc: Tomasz Nowicki Cc: Suravee Suthikulpanit Cc: Graeme Gregory Cc: Jake Oshins Cc: Jiang Liu Cc: Jason Cooper Cc: Rafael J. Wysocki Link: http://lkml.kernel.org/r/1444737105-31573-2-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner --- kernel/irq/irqdomain.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index dc9d27c..8f8b538 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -102,7 +102,7 @@ void irq_domain_remove(struct irq_domain *domain) pr_debug("Removed domain %s\n", domain->name); - of_node_put(domain->of_node); + of_node_put(irq_domain_get_of_node(domain)); kfree(domain); } EXPORT_SYMBOL_GPL(irq_domain_remove); @@ -208,10 +208,12 @@ struct irq_domain *irq_find_matching_host(struct device_node *node, */ mutex_lock(&irq_domain_mutex); list_for_each_entry(h, &irq_domain_list, link) { + struct device_node *of_node; + of_node = irq_domain_get_of_node(h); if (h->ops->match) rc = h->ops->match(h, node, bus_token); else - rc = ((h->of_node != NULL) && (h->of_node == node) && + rc = ((of_node != NULL) && (of_node == node) && ((bus_token == DOMAIN_BUS_ANY) || (h->bus_token == bus_token))); @@ -336,10 +338,12 @@ EXPORT_SYMBOL_GPL(irq_domain_associate); void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, irq_hw_number_t hwirq_base, int count) { + struct device_node *of_node; int i; + of_node = irq_domain_get_of_node(domain); pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__, - of_node_full_name(domain->of_node), irq_base, (int)hwirq_base, count); + of_node_full_name(of_node), irq_base, (int)hwirq_base, count); for (i = 0; i < count; i++) { irq_domain_associate(domain, irq_base + i, hwirq_base + i); @@ -359,12 +363,14 @@ EXPORT_SYMBOL_GPL(irq_domain_associate_many); */ unsigned int irq_create_direct_mapping(struct irq_domain *domain) { + struct device_node *of_node; unsigned int virq; if (domain == NULL) domain = irq_default_domain; - virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node)); + of_node = irq_domain_get_of_node(domain); + virq = irq_alloc_desc_from(1, of_node_to_nid(of_node)); if (!virq) { pr_debug("create_direct virq allocation failed\n"); return 0; @@ -399,6 +405,7 @@ EXPORT_SYMBOL_GPL(irq_create_direct_mapping); unsigned int irq_create_mapping(struct irq_domain *domain, irq_hw_number_t hwirq) { + struct device_node *of_node; int virq; pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); @@ -412,6 +419,8 @@ unsigned int irq_create_mapping(struct irq_domain *domain, } pr_debug("-> using domain @%p\n", domain); + of_node = irq_domain_get_of_node(domain); + /* Check if mapping already exists */ virq = irq_find_mapping(domain, hwirq); if (virq) { @@ -420,8 +429,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain, } /* Allocate a virtual interrupt number */ - virq = irq_domain_alloc_descs(-1, 1, hwirq, - of_node_to_nid(domain->of_node)); + virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node)); if (virq <= 0) { pr_debug("-> virq allocation failed\n"); return 0; @@ -433,7 +441,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain, } pr_debug("irq %lu on domain %s mapped to virtual irq %u\n", - hwirq, of_node_full_name(domain->of_node), virq); + hwirq, of_node_full_name(of_node), virq); return virq; } @@ -460,10 +468,12 @@ EXPORT_SYMBOL_GPL(irq_create_mapping); int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base, irq_hw_number_t hwirq_base, int count) { + struct device_node *of_node; int ret; + of_node = irq_domain_get_of_node(domain); ret = irq_alloc_descs(irq_base, irq_base, count, - of_node_to_nid(domain->of_node)); + of_node_to_nid(of_node)); if (unlikely(ret < 0)) return ret; @@ -590,14 +600,16 @@ static int virq_debug_show(struct seq_file *m, void *private) "name", "mapped", "linear-max", "direct-max", "devtree-node"); mutex_lock(&irq_domain_mutex); list_for_each_entry(domain, &irq_domain_list, link) { + struct device_node *of_node; int count = 0; + of_node = irq_domain_get_of_node(domain); radix_tree_for_each_slot(slot, &domain->revmap_tree, &iter, 0) count++; seq_printf(m, "%c%-16s %6u %10u %10u %s\n", domain == irq_default_domain ? '*' : ' ', domain->name, domain->revmap_size + count, domain->revmap_size, domain->revmap_direct_max_irq, - domain->of_node ? of_node_full_name(domain->of_node) : ""); + of_node ? of_node_full_name(of_node) : ""); } mutex_unlock(&irq_domain_mutex); -- cgit v1.1 From f110711a6053f08731858aa91420104094188973 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 13 Oct 2015 12:51:30 +0100 Subject: irqdomain: Convert irqdomain-%3Eof_node to fwnode Now that we have everyone accessing the of_node field via the irq_domain_get_of_node accessor, it is pretty easy to swap it for a pointer to a fwnode_handle. This translates into a few limited changes in __irq_domain_add, and an updated irq_domain_get_of_node. Signed-off-by: Marc Zyngier Reviewed-and-tested-by: Hanjun Guo Tested-by: Lorenzo Pieralisi Cc: Cc: Tomasz Nowicki Cc: Suravee Suthikulpanit Cc: Graeme Gregory Cc: Jake Oshins Cc: Jiang Liu Cc: Jason Cooper Cc: Rafael J. Wysocki Link: http://lkml.kernel.org/r/1444737105-31573-3-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner --- kernel/irq/irqdomain.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 8f8b538..1aee5c1 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -46,17 +46,21 @@ struct irq_domain *__irq_domain_add(struct device_node *of_node, int size, void *host_data) { struct irq_domain *domain; + struct fwnode_handle *fwnode; domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size), GFP_KERNEL, of_node_to_nid(of_node)); if (WARN_ON(!domain)) return NULL; + of_node_get(of_node); + fwnode = of_node ? &of_node->fwnode : NULL; + /* Fill structure */ INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL); domain->ops = ops; domain->host_data = host_data; - domain->of_node = of_node_get(of_node); + domain->fwnode = fwnode; domain->hwirq_max = hwirq_max; domain->revmap_size = size; domain->revmap_direct_max_irq = direct_max; -- cgit v1.1 From 130b8c6c8d86075304952241bf2365cea6489df1 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 13 Oct 2015 12:51:31 +0100 Subject: irqdomain: Allow irq domain lookup by fwnode So far, our irq domains are still looked up by device node. Let's change this and allow a domain to be looked up using a fwnode_handle pointer. The existing interfaces are preserved with a couple of helpers. Signed-off-by: Marc Zyngier Reviewed-and-tested-by: Hanjun Guo Tested-by: Lorenzo Pieralisi Cc: Cc: Tomasz Nowicki Cc: Suravee Suthikulpanit Cc: Graeme Gregory Cc: Jake Oshins Cc: Jiang Liu Cc: Jason Cooper Cc: Rafael J. Wysocki Link: http://lkml.kernel.org/r/1444737105-31573-4-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner --- kernel/irq/irqdomain.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 1aee5c1..023ab6d 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -191,12 +191,12 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, EXPORT_SYMBOL_GPL(irq_domain_add_legacy); /** - * irq_find_matching_host() - Locates a domain for a given device node - * @node: device-tree node of the interrupt controller + * irq_find_matching_fwnode() - Locates a domain for a given fwnode + * @fwnode: FW descriptor of the interrupt controller * @bus_token: domain-specific data */ -struct irq_domain *irq_find_matching_host(struct device_node *node, - enum irq_domain_bus_token bus_token) +struct irq_domain *irq_find_matching_fwnode(struct fwnode_handle *fwnode, + enum irq_domain_bus_token bus_token) { struct irq_domain *h, *found = NULL; int rc; @@ -212,12 +212,10 @@ struct irq_domain *irq_find_matching_host(struct device_node *node, */ mutex_lock(&irq_domain_mutex); list_for_each_entry(h, &irq_domain_list, link) { - struct device_node *of_node; - of_node = irq_domain_get_of_node(h); if (h->ops->match) - rc = h->ops->match(h, node, bus_token); + rc = h->ops->match(h, to_of_node(fwnode), bus_token); else - rc = ((of_node != NULL) && (of_node == node) && + rc = ((fwnode != NULL) && (h->fwnode == fwnode) && ((bus_token == DOMAIN_BUS_ANY) || (h->bus_token == bus_token))); @@ -229,7 +227,7 @@ struct irq_domain *irq_find_matching_host(struct device_node *node, mutex_unlock(&irq_domain_mutex); return found; } -EXPORT_SYMBOL_GPL(irq_find_matching_host); +EXPORT_SYMBOL_GPL(irq_find_matching_fwnode); /** * irq_set_default_host() - Set a "default" irq domain -- cgit v1.1 From 11e4438ee330fab0f216ee7cc1b651cb2ddceb5d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 13 Oct 2015 12:51:32 +0100 Subject: irqdomain: Introduce a firmware-specific IRQ specifier structure So far the closest thing to a generic IRQ specifier structure is of_phandle_args, which happens to be pretty OF specific (the of_node pointer in there is quite annoying). Let's introduce 'struct irq_fwspec' that can be used in place of of_phandle_args for OF, but also for other firmware implementations (that'd be ACPI). This is used together with a new 'translate' method that is the pendent of 'xlate'. We convert irq_create_of_mapping to use this new structure (with a small hack that will be removed later). Signed-off-by: Marc Zyngier Reviewed-and-tested-by: Hanjun Guo Tested-by: Lorenzo Pieralisi Cc: Cc: Tomasz Nowicki Cc: Suravee Suthikulpanit Cc: Graeme Gregory Cc: Jake Oshins Cc: Jiang Liu Cc: Jason Cooper Cc: Rafael J. Wysocki Link: http://lkml.kernel.org/r/1444737105-31573-5-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner --- kernel/irq/irqdomain.c | 59 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 48 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 023ab6d..86dfd40 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -484,30 +484,67 @@ int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base, } EXPORT_SYMBOL_GPL(irq_create_strict_mappings); +static int irq_domain_translate(struct irq_domain *d, + struct irq_fwspec *fwspec, + irq_hw_number_t *hwirq, unsigned int *type) +{ +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + if (d->ops->translate) + return d->ops->translate(d, fwspec, hwirq, type); +#endif + if (d->ops->xlate) + return d->ops->xlate(d, to_of_node(fwspec->fwnode), + fwspec->param, fwspec->param_count, + hwirq, type); + + /* If domain has no translation, then we assume interrupt line */ + *hwirq = fwspec->param[0]; + return 0; +} + +static void of_phandle_args_to_fwspec(struct of_phandle_args *irq_data, + struct irq_fwspec *fwspec) +{ + int i; + + fwspec->fwnode = irq_data->np ? &irq_data->np->fwnode : NULL; + fwspec->param_count = irq_data->args_count; + + for (i = 0; i < irq_data->args_count; i++) + fwspec->param[i] = irq_data->args[i]; +} + unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data) { + struct irq_fwspec fwspec; struct irq_domain *domain; irq_hw_number_t hwirq; unsigned int type = IRQ_TYPE_NONE; int virq; - domain = irq_data->np ? irq_find_host(irq_data->np) : irq_default_domain; + of_phandle_args_to_fwspec(irq_data, &fwspec); + + if (fwspec.fwnode) + domain = irq_find_matching_fwnode(fwspec.fwnode, DOMAIN_BUS_ANY); + else + domain = irq_default_domain; + if (!domain) { pr_warn("no irq domain found for %s !\n", - of_node_full_name(irq_data->np)); + of_node_full_name(to_of_node(fwspec.fwnode))); return 0; } - /* If domain has no translation, then we assume interrupt line */ - if (domain->ops->xlate == NULL) - hwirq = irq_data->args[0]; - else { - if (domain->ops->xlate(domain, irq_data->np, irq_data->args, - irq_data->args_count, &hwirq, &type)) - return 0; - } + if (irq_domain_translate(domain, &fwspec, &hwirq, &type)) + return 0; if (irq_domain_is_hierarchy(domain)) { + /* Temporary hack */ + void *desc = &fwspec; +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + if (!domain->ops->translate) + desc = irq_data; +#endif /* * If we've already configured this interrupt, * don't do it again, or hell will break loose. @@ -516,7 +553,7 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data) if (virq) return virq; - virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, irq_data); + virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, desc); if (virq <= 0) return 0; } else { -- cgit v1.1 From c0131f09de8c2d301814cac86d78f643b8ee0574 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 13 Oct 2015 12:51:34 +0100 Subject: irqdomain: Introduce irq_create_fwspec_mapping Just like we have irq_create_of_mapping, irq_create_fwspec_mapping creates a IRQ domain mapping for an interrupt described in a struct irq_fwspec. irq_create_of_mapping gets rewritten in terms of the new function, and the hack we introduced before gets removed (now that no stacked irqchip uses of_phandle_args anymore). Signed-off-by: Marc Zyngier Reviewed-and-tested-by: Hanjun Guo Tested-by: Lorenzo Pieralisi Cc: Cc: Tomasz Nowicki Cc: Suravee Suthikulpanit Cc: Graeme Gregory Cc: Jake Oshins Cc: Jiang Liu Cc: Jason Cooper Cc: Rafael J. Wysocki Link: http://lkml.kernel.org/r/1444737105-31573-7-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner --- kernel/irq/irqdomain.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 86dfd40..b1fda6d 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -514,37 +514,28 @@ static void of_phandle_args_to_fwspec(struct of_phandle_args *irq_data, fwspec->param[i] = irq_data->args[i]; } -unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data) +unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) { - struct irq_fwspec fwspec; struct irq_domain *domain; irq_hw_number_t hwirq; unsigned int type = IRQ_TYPE_NONE; int virq; - of_phandle_args_to_fwspec(irq_data, &fwspec); - - if (fwspec.fwnode) - domain = irq_find_matching_fwnode(fwspec.fwnode, DOMAIN_BUS_ANY); + if (fwspec->fwnode) + domain = irq_find_matching_fwnode(fwspec->fwnode, DOMAIN_BUS_ANY); else domain = irq_default_domain; if (!domain) { pr_warn("no irq domain found for %s !\n", - of_node_full_name(to_of_node(fwspec.fwnode))); + of_node_full_name(to_of_node(fwspec->fwnode))); return 0; } - if (irq_domain_translate(domain, &fwspec, &hwirq, &type)) + if (irq_domain_translate(domain, fwspec, &hwirq, &type)) return 0; if (irq_domain_is_hierarchy(domain)) { - /* Temporary hack */ - void *desc = &fwspec; -#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY - if (!domain->ops->translate) - desc = irq_data; -#endif /* * If we've already configured this interrupt, * don't do it again, or hell will break loose. @@ -553,7 +544,7 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data) if (virq) return virq; - virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, desc); + virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, fwspec); if (virq <= 0) return 0; } else { @@ -569,6 +560,15 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data) irq_set_irq_type(virq, type); return virq; } +EXPORT_SYMBOL_GPL(irq_create_fwspec_mapping); + +unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data) +{ + struct irq_fwspec fwspec; + + of_phandle_args_to_fwspec(irq_data, &fwspec); + return irq_create_fwspec_mapping(&fwspec); +} EXPORT_SYMBOL_GPL(irq_create_of_mapping); /** -- cgit v1.1 From 1bf4ddc46c5d6123897a54cea4ffe3e90f30600b Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 13 Oct 2015 12:51:35 +0100 Subject: irqdomain: Introduce irq_domain_create_{linear, tree} Just like we have irq_domain_add_{linear,tree} to create a irq domain identified by an of_node, introduce irq_domain_create_{linear,tree} that do the same thing, except that they take a struct fwnode_handle. Existing functions get rewritten in terms of the new ones so that everything keeps working as before (and __irq_domain_add is now fwnode_handle based as well). Signed-off-by: Marc Zyngier Reviewed-and-tested-by: Hanjun Guo Tested-by: Lorenzo Pieralisi Cc: Cc: Tomasz Nowicki Cc: Suravee Suthikulpanit Cc: Graeme Gregory Cc: Jake Oshins Cc: Jiang Liu Cc: Jason Cooper Cc: Rafael J. Wysocki Link: http://lkml.kernel.org/r/1444737105-31573-8-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner --- kernel/irq/irqdomain.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index b1fda6d..de6d749 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -40,13 +40,15 @@ static void irq_domain_check_hierarchy(struct irq_domain *domain); * Allocates and initialize and irq_domain structure. * Returns pointer to IRQ domain, or NULL on failure. */ -struct irq_domain *__irq_domain_add(struct device_node *of_node, int size, +struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, irq_hw_number_t hwirq_max, int direct_max, const struct irq_domain_ops *ops, void *host_data) { struct irq_domain *domain; - struct fwnode_handle *fwnode; + struct device_node *of_node; + + of_node = to_of_node(fwnode); domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size), GFP_KERNEL, of_node_to_nid(of_node)); @@ -54,7 +56,6 @@ struct irq_domain *__irq_domain_add(struct device_node *of_node, int size, return NULL; of_node_get(of_node); - fwnode = of_node ? &of_node->fwnode : NULL; /* Fill structure */ INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL); @@ -137,7 +138,7 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node, { struct irq_domain *domain; - domain = __irq_domain_add(of_node, size, size, 0, ops, host_data); + domain = __irq_domain_add(of_node_to_fwnode(of_node), size, size, 0, ops, host_data); if (!domain) return NULL; @@ -181,7 +182,7 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, { struct irq_domain *domain; - domain = __irq_domain_add(of_node, first_hwirq + size, + domain = __irq_domain_add(of_node_to_fwnode(of_node), first_hwirq + size, first_hwirq + size, 0, ops, host_data); if (domain) irq_domain_associate_many(domain, first_irq, first_hwirq, size); -- cgit v1.1 From b145dcc45a6af0abfcf9b4de8006d40559c50fc6 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 13 Oct 2015 12:51:36 +0100 Subject: irqdomain: Add a fwnode_handle allocator In order to be able to reference an irqdomain from ACPI, we need to be able to create an identifier, which is usually a struct device_node. This device node does't really fit the ACPI infrastructure, so we cunningly allocate a new structure containing a fwnode_handle, and return that. This structure doesn't really point to a device (interrupt controllers are not "real" devices in Linux), but as we cannot really deny that they exist, we create them with a new fwnode_type (FWNODE_IRQCHIP). Signed-off-by: Marc Zyngier Acked-by: Rafael J. Wysocki Reviewed-and-tested-by: Hanjun Guo Tested-by: Lorenzo Pieralisi Cc: Cc: Tomasz Nowicki Cc: Suravee Suthikulpanit Cc: Graeme Gregory Cc: Jake Oshins Cc: Jiang Liu Cc: Jason Cooper Cc: Rafael J. Wysocki Link: http://lkml.kernel.org/r/1444737105-31573-9-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner --- kernel/irq/irqdomain.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index de6d749..41151d3 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -27,6 +27,57 @@ static int irq_domain_alloc_descs(int virq, unsigned int nr_irqs, irq_hw_number_t hwirq, int node); static void irq_domain_check_hierarchy(struct irq_domain *domain); +struct irqchip_fwid { + struct fwnode_handle fwnode; + char *name; + void *data; +}; + +/** + * irq_domain_alloc_fwnode - Allocate a fwnode_handle suitable for + * identifying an irq domain + * @data: optional user-provided data + * + * Allocate a struct device_node, and return a poiner to the embedded + * fwnode_handle (or NULL on failure). + */ +struct fwnode_handle *irq_domain_alloc_fwnode(void *data) +{ + struct irqchip_fwid *fwid; + char *name; + + fwid = kzalloc(sizeof(*fwid), GFP_KERNEL); + name = kasprintf(GFP_KERNEL, "irqchip@%p", data); + + if (!fwid || !name) { + kfree(fwid); + kfree(name); + return NULL; + } + + fwid->name = name; + fwid->data = data; + fwid->fwnode.type = FWNODE_IRQCHIP; + return &fwid->fwnode; +} + +/** + * irq_domain_free_fwnode - Free a non-OF-backed fwnode_handle + * + * Free a fwnode_handle allocated with irq_domain_alloc_fwnode. + */ +void irq_domain_free_fwnode(struct fwnode_handle *fwnode) +{ + struct irqchip_fwid *fwid; + + if (WARN_ON(fwnode->type != FWNODE_IRQCHIP)) + return; + + fwid = container_of(fwnode, struct irqchip_fwid, fwnode); + kfree(fwid->name); + kfree(fwid); +} + /** * __irq_domain_add() - Allocate a new irq_domain data structure * @of_node: optional device-tree node of the interrupt controller -- cgit v1.1 From 2a5e9a072da6469a37d1f0b1577416f51223c280 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 13 Oct 2015 12:51:43 +0100 Subject: irqdomain: Introduce irq_domain_create_hierarchy As we're about to start converting the various MSI layers to use fwnode_handle instead of device_node, add irq_domain_create_hierarchy as a directly equivalent of irq_domain_add_hierarchy (which still exists as a compatibility interface). Signed-off-by: Marc Zyngier Tested-by: Hanjun Guo Tested-by: Lorenzo Pieralisi Cc: Cc: Tomasz Nowicki Cc: Suravee Suthikulpanit Cc: Graeme Gregory Cc: Jake Oshins Cc: Jiang Liu Cc: Jason Cooper Cc: Rafael J. Wysocki Link: http://lkml.kernel.org/r/1444737105-31573-16-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner --- kernel/irq/irqdomain.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 41151d3..22aa961 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -854,11 +854,11 @@ static int irq_domain_alloc_descs(int virq, unsigned int cnt, #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY /** - * irq_domain_add_hierarchy - Add a irqdomain into the hierarchy + * irq_domain_create_hierarchy - Add a irqdomain into the hierarchy * @parent: Parent irq domain to associate with the new domain * @flags: Irq domain flags associated to the domain * @size: Size of the domain. See below - * @node: Optional device-tree node of the interrupt controller + * @fwnode: Optional fwnode of the interrupt controller * @ops: Pointer to the interrupt domain callbacks * @host_data: Controller private data pointer * @@ -868,19 +868,19 @@ static int irq_domain_alloc_descs(int virq, unsigned int cnt, * domain flags are set. * Returns pointer to IRQ domain, or NULL on failure. */ -struct irq_domain *irq_domain_add_hierarchy(struct irq_domain *parent, +struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent, unsigned int flags, unsigned int size, - struct device_node *node, + struct fwnode_handle *fwnode, const struct irq_domain_ops *ops, void *host_data) { struct irq_domain *domain; if (size) - domain = irq_domain_add_linear(node, size, ops, host_data); + domain = irq_domain_create_linear(fwnode, size, ops, host_data); else - domain = irq_domain_add_tree(node, ops, host_data); + domain = irq_domain_create_tree(fwnode, ops, host_data); if (domain) { domain->parent = parent; domain->flags |= flags; -- cgit v1.1 From be5436c83ac8921f33fe07323fab03c6644ce52e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 13 Oct 2015 12:51:44 +0100 Subject: irqdomain/msi: Use fwnode instead of of_node As we continue to push of_node towards the outskirts of irq domains, let's start tackling the case of msi_create_irq_domain and its little friends. This has limited impact in both PCI/MSI, platform MSI, and a few drivers. Signed-off-by: Marc Zyngier Tested-by: Hanjun Guo Tested-by: Lorenzo Pieralisi Cc: Cc: Tomasz Nowicki Cc: Suravee Suthikulpanit Cc: Graeme Gregory Cc: Jake Oshins Cc: Jiang Liu Cc: Jason Cooper Cc: Rafael J. Wysocki Link: http://lkml.kernel.org/r/1444737105-31573-17-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner --- kernel/irq/msi.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 7e6512b..95354bb 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -239,11 +239,11 @@ static void msi_domain_update_chip_ops(struct msi_domain_info *info) /** * msi_create_irq_domain - Create a MSI interrupt domain - * @of_node: Optional device-tree node of the interrupt controller + * @fwnode: Optional fwnode of the interrupt controller * @info: MSI domain info * @parent: Parent irq domain */ -struct irq_domain *msi_create_irq_domain(struct device_node *node, +struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode, struct msi_domain_info *info, struct irq_domain *parent) { @@ -252,8 +252,8 @@ struct irq_domain *msi_create_irq_domain(struct device_node *node, if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS) msi_domain_update_chip_ops(info); - return irq_domain_add_hierarchy(parent, 0, 0, node, &msi_domain_ops, - info); + return irq_domain_create_hierarchy(parent, 0, 0, fwnode, + &msi_domain_ops, info); } /** -- cgit v1.1 From ef25ba0476015908ef5960f9faac149ddf34ede0 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 7 Oct 2015 00:49:34 +0200 Subject: PM / sleep: Add flags to indicate platform firmware involvement There are quite a few cases in which device drivers, bus types or even the PM core itself may benefit from knowing whether or not the platform firmware will be involved in the upcoming system power transition (during system suspend) or whether or not it was involved in it (during system resume). For this reason, introduce global system suspend flags that can be used by the platform code to expose that information for the benefit of the other parts of the kernel and make the ACPI core set them as appropriate. Users of the new flags will be added later. Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 7e4cda4..f9fe133 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -35,6 +35,9 @@ const char *pm_labels[] = { "mem", "standby", "freeze", NULL }; const char *pm_states[PM_SUSPEND_MAX]; +unsigned int pm_suspend_global_flags; +EXPORT_SYMBOL_GPL(pm_suspend_global_flags); + static const struct platform_suspend_ops *suspend_ops; static const struct platform_freeze_ops *freeze_ops; static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); @@ -493,6 +496,7 @@ static int enter_state(suspend_state_t state) #endif pr_debug("PM: Preparing system for sleep (%s)\n", pm_states[state]); + pm_suspend_clear_flags(); error = suspend_prepare(state); if (error) goto Unlock; -- cgit v1.1 From d439e64f22ce0eea681ae90c71f584d3a0145ded Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 29 Sep 2015 20:36:58 -0700 Subject: PM / hibernate: fix a comment typo Just fix a typo in a function name in kerneldoc comments. Signed-off-by: Geliang Tang Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 690f78f..b7342a2 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -733,7 +733,7 @@ int hibernate(void) * contents of memory is restored from the saved image. * * If this is successful, control reappears in the restored target kernel in - * hibernation_snaphot() which returns to hibernate(). Otherwise, the routine + * hibernation_snapshot() which returns to hibernate(). Otherwise, the routine * attempts to recover gracefully and make the kernel return to the normal mode * of operation. */ -- cgit v1.1 From 5e3949f0ac5a81a1b06a5d972085cbf1aaf17508 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Tue, 29 Sep 2015 19:46:12 +0300 Subject: ftrace: Remove redundant strsep in mod_callback By now there isn't any subcommand for mod. Before: sh$ echo '*:mod:ipv6:a' > set_ftrace_filter sh$ echo '*:mod:ipv6' > set_ftrace_filter had the same results, but now first will result in: sh$ echo '*:mod:ipv6:a' > set_ftrace_filter -bash: echo: write error: Invalid argument Also, I clarified ftrace_mod_callback code a little. Link: http://lkml.kernel.org/r/1443545176-3215-1-git-send-email-0x7f454c46@gmail.com Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> [ converted 'if (ret == 0)' to 'if (!ret)' ] Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f7b78d7..8892b45 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3569,8 +3569,7 @@ static int ftrace_mod_callback(struct ftrace_hash *hash, char *func, char *cmd, char *param, int enable) { - char *mod; - int ret = -EINVAL; + int ret; /* * cmd == 'mod' because we only registered this func @@ -3581,16 +3580,12 @@ ftrace_mod_callback(struct ftrace_hash *hash, */ /* we must have a module name */ - if (!param) - return ret; - - mod = strsep(¶m, ":"); - if (!strlen(mod)) - return ret; + if (!param || !strlen(param)) + return -EINVAL; - ret = ftrace_match_module_records(hash, func, mod); + ret = ftrace_match_module_records(hash, func, param); if (!ret) - ret = -EINVAL; + return -EINVAL; if (ret < 0) return ret; -- cgit v1.1 From 7c177d994eb9637302b79e80d331f48dfbe26368 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Wed, 14 Oct 2015 12:07:53 -0700 Subject: posix_cpu_timer: Optimize fastpath_timer_check() In fastpath_timer_check(), the task_cputime() function is always called to compute the utime and stime values. However, this is not necessary if there are no per-thread timers to check for. This patch modifies the code such that we compute the task_cputime values only when there are per-thread timers set. Signed-off-by: Jason Low Reviewed-by: Oleg Nesterov Reviewed-by: Frederic Weisbecker Reviewed-by: Davidlohr Bueso Reviewed-by: George Spelvin Cc: Paul E. McKenney Cc: Steven Rostedt Cc: hideaki.kimura@hpe.com Cc: terry.rudd@hpe.com Cc: scott.norton@hpe.com Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1444849677-29330-2-git-send-email-jason.low2@hp.com Signed-off-by: Thomas Gleixner --- kernel/time/posix-cpu-timers.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 892e3da..aa4b6f4 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1117,17 +1117,12 @@ static inline int task_cputime_expired(const struct task_cputime *sample, static inline int fastpath_timer_check(struct task_struct *tsk) { struct signal_struct *sig; - cputime_t utime, stime; - - task_cputime(tsk, &utime, &stime); if (!task_cputime_zero(&tsk->cputime_expires)) { - struct task_cputime task_sample = { - .utime = utime, - .stime = stime, - .sum_exec_runtime = tsk->se.sum_exec_runtime - }; + struct task_cputime task_sample; + task_cputime(tsk, &task_sample.utime, &task_sample.stime); + task_sample.sum_exec_runtime = tsk->se.sum_exec_runtime; if (task_cputime_expired(&task_sample, &tsk->cputime_expires)) return 1; } -- cgit v1.1 From 934715a191e4be0c602d39455a7a74316f274d60 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Wed, 14 Oct 2015 12:07:54 -0700 Subject: posix_cpu_timer: Check thread timers only when there are active thread timers The fastpath_timer_check() contains logic to check for if any timers are set by checking if !task_cputime_zero(). Similarly, we can do this before calling check_thread_timers(). In the case where there are only process-wide timers, this will skip all of the computations for per-thread timers when there are no per-thread timers. As suggested by George, we can put the task_cputime_zero() check in check_thread_timers(), since that is more of an optization to the function. Similarly, we move the existing check of cputimer->running to check_process_timers(). Signed-off-by: Jason Low Reviewed-by: Oleg Nesterov Reviewed-by: George Spelvin Cc: Paul E. McKenney Cc: Frederic Weisbecker Cc: Davidlohr Bueso Cc: Steven Rostedt Cc: hideaki.kimura@hpe.com Cc: terry.rudd@hpe.com Cc: scott.norton@hpe.com Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1444849677-29330-3-git-send-email-jason.low2@hp.com Signed-off-by: Thomas Gleixner --- kernel/time/posix-cpu-timers.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index aa4b6f4..6f6e252 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -864,6 +864,13 @@ static void check_thread_timers(struct task_struct *tsk, unsigned long long expires; unsigned long soft; + /* + * If cputime_expires is zero, then there are no active + * per thread CPU timers. + */ + if (task_cputime_zero(&tsk->cputime_expires)) + return; + expires = check_timers_list(timers, firing, prof_ticks(tsk)); tsk_expires->prof_exp = expires_to_cputime(expires); @@ -962,6 +969,13 @@ static void check_process_timers(struct task_struct *tsk, unsigned long soft; /* + * If cputimer is not running, then there are no active + * process wide timers (POSIX 1.b, itimers, RLIMIT_CPU). + */ + if (!READ_ONCE(tsk->signal->cputimer.running)) + return; + + /* * Collect the current process totals. */ thread_group_cputimer(tsk, &cputime); @@ -1169,12 +1183,8 @@ void run_posix_cpu_timers(struct task_struct *tsk) * put them on the firing list. */ check_thread_timers(tsk, &firing); - /* - * If there are any active process wide timers (POSIX 1.b, itimers, - * RLIMIT_CPU) cputimer must be running. - */ - if (READ_ONCE(tsk->signal->cputimer.running)) - check_process_timers(tsk, &firing); + + check_process_timers(tsk, &firing); /* * We must release these locks before taking any timer's lock. -- cgit v1.1 From d5c373eb5610686162ff50429f63f4c00c554799 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Wed, 14 Oct 2015 12:07:55 -0700 Subject: posix_cpu_timer: Convert cputimer->running to bool In the next patch in this series, a new field 'checking_timer' will be added to 'struct thread_group_cputimer'. Both this and the existing 'running' integer field are just used as boolean values. To save space in the structure, we can make both of these fields booleans. This is a preparatory patch to convert the existing running integer field to a boolean. Suggested-by: George Spelvin Signed-off-by: Jason Low Reviewed: George Spelvin Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Frederic Weisbecker Cc: Davidlohr Bueso Cc: Steven Rostedt Cc: hideaki.kimura@hpe.com Cc: terry.rudd@hpe.com Cc: scott.norton@hpe.com Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1444849677-29330-4-git-send-email-jason.low2@hp.com Signed-off-by: Thomas Gleixner --- kernel/fork.c | 2 +- kernel/time/posix-cpu-timers.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 2845623..6ac8942 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1101,7 +1101,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); if (cpu_limit != RLIM_INFINITY) { sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit); - sig->cputimer.running = 1; + sig->cputimer.running = true; } /* The timer lists. */ diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 6f6e252..2d58153 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -249,7 +249,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) * but barriers are not required because update_gt_cputime() * can handle concurrent updates. */ - WRITE_ONCE(cputimer->running, 1); + WRITE_ONCE(cputimer->running, true); } sample_cputime_atomic(times, &cputimer->cputime_atomic); } @@ -918,7 +918,7 @@ static inline void stop_process_timers(struct signal_struct *sig) struct thread_group_cputimer *cputimer = &sig->cputimer; /* Turn off cputimer->running. This is done without locking. */ - WRITE_ONCE(cputimer->running, 0); + WRITE_ONCE(cputimer->running, false); } static u32 onecputick; -- cgit v1.1 From c8d75aa47dd585c9538a8205e9bb9847e12cfb84 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Wed, 14 Oct 2015 12:07:56 -0700 Subject: posix_cpu_timer: Reduce unnecessary sighand lock contention It was found while running a database workload on large systems that significant time was spent trying to acquire the sighand lock. The issue was that whenever an itimer expired, many threads ended up simultaneously trying to send the signal. Most of the time, nothing happened after acquiring the sighand lock because another thread had just already sent the signal and updated the "next expire" time. The fastpath_timer_check() didn't help much since the "next expire" time was updated after the threads exit fastpath_timer_check(). This patch addresses this by having the thread_group_cputimer structure maintain a boolean to signify when a thread in the group is already checking for process wide timers, and adds extra logic in the fastpath to check the boolean. Signed-off-by: Jason Low Reviewed-by: Oleg Nesterov Reviewed-by: George Spelvin Cc: Paul E. McKenney Cc: Frederic Weisbecker Cc: Davidlohr Bueso Cc: Steven Rostedt Cc: hideaki.kimura@hpe.com Cc: terry.rudd@hpe.com Cc: scott.norton@hpe.com Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1444849677-29330-5-git-send-email-jason.low2@hp.com Signed-off-by: Thomas Gleixner --- kernel/time/posix-cpu-timers.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 2d58153..f5e86d2 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -975,6 +975,12 @@ static void check_process_timers(struct task_struct *tsk, if (!READ_ONCE(tsk->signal->cputimer.running)) return; + /* + * Signify that a thread is checking for process timers. + * Write access to this field is protected by the sighand lock. + */ + sig->cputimer.checking_timer = true; + /* * Collect the current process totals. */ @@ -1029,6 +1035,8 @@ static void check_process_timers(struct task_struct *tsk, sig->cputime_expires.sched_exp = sched_expires; if (task_cputime_zero(&sig->cputime_expires)) stop_process_timers(sig); + + sig->cputimer.checking_timer = false; } /* @@ -1142,8 +1150,22 @@ static inline int fastpath_timer_check(struct task_struct *tsk) } sig = tsk->signal; - /* Check if cputimer is running. This is accessed without locking. */ - if (READ_ONCE(sig->cputimer.running)) { + /* + * Check if thread group timers expired when the cputimer is + * running and no other thread in the group is already checking + * for thread group cputimers. These fields are read without the + * sighand lock. However, this is fine because this is meant to + * be a fastpath heuristic to determine whether we should try to + * acquire the sighand lock to check/handle timers. + * + * In the worst case scenario, if 'running' or 'checking_timer' gets + * set but the current thread doesn't see the change yet, we'll wait + * until the next thread in the group gets a scheduler interrupt to + * handle the timer. This isn't an issue in practice because these + * types of delays with signals actually getting sent are expected. + */ + if (READ_ONCE(sig->cputimer.running) && + !READ_ONCE(sig->cputimer.checking_timer)) { struct task_cputime group_sample; sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic); -- cgit v1.1 From b309e5b743a999cdb34a3989d1800c608e656c2b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 15 Oct 2015 16:41:49 -0400 Subject: cgroup: remove an unused parameter from cgroup_task_migrate() cgroup_task_migrate() no longer uses @old_cgrp. Remove it. Signed-off-by: Tejun Heo --- kernel/cgroup.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ae23814..49f30f1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2235,14 +2235,12 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) /** * cgroup_task_migrate - move a task from one cgroup to another. - * @old_cgrp: the cgroup @tsk is being migrated from * @tsk: the task being migrated * @new_cset: the new css_set @tsk is being attached to * * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked. */ -static void cgroup_task_migrate(struct cgroup *old_cgrp, - struct task_struct *tsk, +static void cgroup_task_migrate(struct task_struct *tsk, struct css_set *new_cset) { struct css_set *old_cset; @@ -2311,8 +2309,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset, down_write(&css_set_rwsem); list_for_each_entry(cset, &tset->src_csets, mg_node) { list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) - cgroup_task_migrate(cset->mg_src_cgrp, task, - cset->mg_dst_cset); + cgroup_task_migrate(task, cset->mg_dst_cset); } up_write(&css_set_rwsem); -- cgit v1.1 From 0de0942db2b36dd91c088a7950398d2e87f23b23 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 15 Oct 2015 16:41:49 -0400 Subject: cgroup: make cgroup->nr_populated count the number of populated css_sets Currently, cgroup->nr_populated counts whether the cgroup has any css_sets linked to it and the number of children which has non-zero ->nr_populated. This works because a css_set's refcnt converges with the number of tasks linked to it and thus there's no css_set linked to a cgroup if it doesn't have any live tasks. To help tracking resource usage of zombie tasks, putting the ref of css_set will be separated from disassociating the task from the css_set which means that a cgroup may have css_sets linked to it even when it doesn't have any live tasks. This patch updates cgroup->nr_populated so that for the cgroup itself it counts the number of css_sets which have tasks associated with them so that empty css_sets don't skew the populated test. Signed-off-by: Tejun Heo --- kernel/cgroup.c | 65 +++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 52 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 49f30f1..e5231d0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -582,14 +582,25 @@ struct css_set init_css_set = { static int css_set_count = 1; /* 1 for init_css_set */ /** + * css_set_populated - does a css_set contain any tasks? + * @cset: target css_set + */ +static bool css_set_populated(struct css_set *cset) +{ + lockdep_assert_held(&css_set_rwsem); + + return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks); +} + +/** * cgroup_update_populated - updated populated count of a cgroup * @cgrp: the target cgroup * @populated: inc or dec populated count * - * @cgrp is either getting the first task (css_set) or losing the last. - * Update @cgrp->populated_cnt accordingly. The count is propagated - * towards root so that a given cgroup's populated_cnt is zero iff the - * cgroup and all its descendants are empty. + * One of the css_sets associated with @cgrp is either getting its first + * task or losing the last. Update @cgrp->populated_cnt accordingly. The + * count is propagated towards root so that a given cgroup's populated_cnt + * is zero iff the cgroup and all its descendants don't contain any tasks. * * @cgrp's interface file "cgroup.populated" is zero if * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt @@ -618,6 +629,24 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) } while (cgrp); } +/** + * css_set_update_populated - update populated state of a css_set + * @cset: target css_set + * @populated: whether @cset is populated or depopulated + * + * @cset is either getting the first task or losing the last. Update the + * ->populated_cnt of all associated cgroups accordingly. + */ +static void css_set_update_populated(struct css_set *cset, bool populated) +{ + struct cgrp_cset_link *link; + + lockdep_assert_held(&css_set_rwsem); + + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) + cgroup_update_populated(link->cgrp, populated); +} + /* * hash table for cgroup groups. This improves the performance to find * an existing css_set. This hash doesn't (currently) take into @@ -663,10 +692,8 @@ static void put_css_set_locked(struct css_set *cset) list_del(&link->cgrp_link); /* @cgrp can't go away while we're holding css_set_rwsem */ - if (list_empty(&cgrp->cset_links)) { - cgroup_update_populated(cgrp, false); + if (list_empty(&cgrp->cset_links)) check_for_release(cgrp); - } kfree(link); } @@ -875,8 +902,6 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset, link->cset = cset; link->cgrp = cgrp; - if (list_empty(&cgrp->cset_links)) - cgroup_update_populated(cgrp, true); list_move(&link->cset_link, &cgrp->cset_links); /* @@ -1754,6 +1779,8 @@ static void cgroup_enable_task_cg_lists(void) if (!(p->flags & PF_EXITING)) { struct css_set *cset = task_css_set(p); + if (!css_set_populated(cset)) + css_set_update_populated(cset, true); list_add(&p->cg_list, &cset->tasks); get_css_set(cset); } @@ -1868,8 +1895,11 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) * objects. */ down_write(&css_set_rwsem); - hash_for_each(css_set_table, i, cset, hlist) + hash_for_each(css_set_table, i, cset, hlist) { link_css_set(&tmp_links, cset, root_cgrp); + if (css_set_populated(cset)) + cgroup_update_populated(root_cgrp, true); + } up_write(&css_set_rwsem); BUG_ON(!list_empty(&root_cgrp->self.children)); @@ -2256,10 +2286,16 @@ static void cgroup_task_migrate(struct task_struct *tsk, WARN_ON_ONCE(tsk->flags & PF_EXITING); old_cset = task_css_set(tsk); + if (!css_set_populated(new_cset)) + css_set_update_populated(new_cset, true); + get_css_set(new_cset); rcu_assign_pointer(tsk->cgroups, new_cset); list_move_tail(&tsk->cg_list, &new_cset->mg_tasks); + if (!css_set_populated(old_cset)) + css_set_update_populated(old_cset, false); + /* * We just gained a reference on old_cset by taking it from the * task. As trading it for new_cset is protected by cgroup_mutex, @@ -3774,7 +3810,7 @@ static void css_advance_task_iter(struct css_task_iter *it) link = list_entry(l, struct cgrp_cset_link, cset_link); cset = link->cset; } - } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks)); + } while (!css_set_populated(cset)); it->cset_pos = l; @@ -5492,17 +5528,20 @@ void cgroup_exit(struct task_struct *tsk) /* * Unlink from @tsk from its css_set. As migration path can't race - * with us, we can check cg_list without grabbing css_set_rwsem. + * with us, we can check css_set and cg_list without synchronization. */ + cset = task_css_set(tsk); + if (!list_empty(&tsk->cg_list)) { down_write(&css_set_rwsem); list_del_init(&tsk->cg_list); + if (!css_set_populated(cset)) + css_set_update_populated(cset, false); up_write(&css_set_rwsem); put_cset = true; } /* Reassign the task to the init_css_set. */ - cset = task_css_set(tsk); RCU_INIT_POINTER(tsk->cgroups, &init_css_set); /* see cgroup_post_fork() for details */ -- cgit v1.1 From 27bd4dbb8d51c476298e62bd088225317b7853de Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 15 Oct 2015 16:41:50 -0400 Subject: cgroup: replace cgroup_has_tasks() with cgroup_is_populated() Currently, cgroup_has_tasks() tests whether the target cgroup has any css_set linked to it. This works because a css_set's refcnt converges with the number of tasks linked to it and thus there's no css_set linked to a cgroup if it doesn't have any live tasks. To help tracking resource usage of zombie tasks, putting the ref of css_set will be separated from disassociating the task from the css_set which means that a cgroup may have css_sets linked to it even when it doesn't have any live tasks. This patch replaces cgroup_has_tasks() with cgroup_is_populated() which tests cgroup->nr_populated instead which locally counts the number of populated css_sets. Unlike cgroup_has_tasks(), cgroup_is_populated() is recursive - if any of the descendants is populated, the cgroup is populated too. While this changes the meaning of the test, all the existing users are okay with the change. While at it, replace the open-coded ->populated_cnt test in cgroup_events_show() with cgroup_is_populated(). Signed-off-by: Tejun Heo Cc: Li Zefan Cc: Johannes Weiner Cc: Michal Hocko --- kernel/cgroup.c | 6 +++--- kernel/cpuset.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e5231d0..435aa68 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3121,7 +3121,7 @@ err_undo_css: static int cgroup_events_show(struct seq_file *seq, void *v) { seq_printf(seq, "populated %d\n", - (bool)seq_css(seq)->cgroup->populated_cnt); + cgroup_is_populated(seq_css(seq)->cgroup)); return 0; } @@ -5558,7 +5558,7 @@ void cgroup_exit(struct task_struct *tsk) static void check_for_release(struct cgroup *cgrp) { - if (notify_on_release(cgrp) && !cgroup_has_tasks(cgrp) && + if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) && !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) schedule_work(&cgrp->release_agent_work); } @@ -5806,7 +5806,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return (!cgroup_has_tasks(css->cgroup) && + return (!cgroup_is_populated(css->cgroup) && !css_has_online_children(&css->cgroup->self)); } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index e4d9999..d7ccb87 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -498,7 +498,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) * be changed to have empty cpus_allowed or mems_allowed. */ ret = -ENOSPC; - if ((cgroup_has_tasks(cur->css.cgroup) || cur->attach_in_progress)) { + if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) { if (!cpumask_empty(cur->cpus_allowed) && cpumask_empty(trial->cpus_allowed)) goto out; -- cgit v1.1 From ad2ed2b35b76f01a876230a3a632efbc81d3fcd6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 15 Oct 2015 16:41:50 -0400 Subject: cgroup: move check_for_release() invocation To trigger release agent when the last task leaves the cgroup, check_for_release() is called from put_css_set_locked(); however, css_set being unlinked is being decoupled from task leaving the cgroup and the correct condition to test is cgroup->nr_populated dropping to zero which check_for_release() is already updated to test. This patch moves check_for_release() invocation from put_css_set_locked() to cgroup_update_populated(). Signed-off-by: Tejun Heo --- kernel/cgroup.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 435aa68..855313d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -623,6 +623,7 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) if (!trigger) break; + check_for_release(cgrp); cgroup_file_notify(&cgrp->events_file); cgrp = cgroup_parent(cgrp); @@ -686,15 +687,8 @@ static void put_css_set_locked(struct css_set *cset) css_set_count--; list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) { - struct cgroup *cgrp = link->cgrp; - list_del(&link->cset_link); list_del(&link->cgrp_link); - - /* @cgrp can't go away while we're holding css_set_rwsem */ - if (list_empty(&cgrp->cset_links)) - check_for_release(cgrp); - kfree(link); } -- cgit v1.1 From 052c3f3a0b03651d94050d917ebc1df46a31c2f0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 15 Oct 2015 16:41:50 -0400 Subject: cgroup: relocate cgroup_[try]get/put() Relocate cgroup_get(), cgroup_tryget() and cgroup_put() upwards. This is pure code reorganization to prepare for future changes. Signed-off-by: Tejun Heo --- kernel/cgroup.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 855313d..ab5c9a5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -428,6 +428,22 @@ static inline bool cgroup_is_dead(const struct cgroup *cgrp) return !(cgrp->self.flags & CSS_ONLINE); } +static void cgroup_get(struct cgroup *cgrp) +{ + WARN_ON_ONCE(cgroup_is_dead(cgrp)); + css_get(&cgrp->self); +} + +static bool cgroup_tryget(struct cgroup *cgrp) +{ + return css_tryget(&cgrp->self); +} + +static void cgroup_put(struct cgroup *cgrp) +{ + css_put(&cgrp->self); +} + struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) { struct cgroup *cgrp = of->kn->parent->priv; @@ -1177,22 +1193,6 @@ static umode_t cgroup_file_mode(const struct cftype *cft) return mode; } -static void cgroup_get(struct cgroup *cgrp) -{ - WARN_ON_ONCE(cgroup_is_dead(cgrp)); - css_get(&cgrp->self); -} - -static bool cgroup_tryget(struct cgroup *cgrp) -{ - return css_tryget(&cgrp->self); -} - -static void cgroup_put(struct cgroup *cgrp) -{ - css_put(&cgrp->self); -} - /** * cgroup_calc_child_subsys_mask - calculate child_subsys_mask * @cgrp: the target cgroup -- cgit v1.1 From 2ceb231b0ab0e3d700c5f7c839273bfeecbefe3b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 15 Oct 2015 16:41:51 -0400 Subject: cgroup: make css_sets pin the associated cgroups Currently, css_sets don't pin the associated cgroups. This is okay as a cgroup with css_sets associated are not allowed to be removed; however, to help resource tracking for zombie tasks, this is scheduled to change such that a cgroup can be removed even when it has css_sets associated as long as none of them are populated. To ensure that a cgroup doesn't go away while css_sets are still associated with it, make each associated css_set hold a reference on the cgroup if non-root. v2: Root cgroups are special and shouldn't be ref'd by css_sets. Signed-off-by: Tejun Heo --- kernel/cgroup.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ab5c9a5..ea87340 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -705,6 +705,8 @@ static void put_css_set_locked(struct css_set *cset) list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) { list_del(&link->cset_link); list_del(&link->cgrp_link); + if (cgroup_parent(link->cgrp)) + cgroup_put(link->cgrp); kfree(link); } @@ -919,6 +921,9 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset, * is sorted by order of hierarchy creation */ list_add_tail(&link->cgrp_link, &cset->cgrp_links); + + if (cgroup_parent(cgrp)) + cgroup_get(cgrp); } /** @@ -4998,10 +5003,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) lockdep_assert_held(&cgroup_mutex); - /* - * css_set_rwsem synchronizes access to ->cset_links and prevents - * @cgrp from being removed while put_css_set() is in progress. - */ + /* css_set_rwsem synchronizes access to ->cset_links */ down_read(&css_set_rwsem); empty = list_empty(&cgrp->cset_links); up_read(&css_set_rwsem); -- cgit v1.1 From 91486f61f486662c27ef86dd910f875832e3a5de Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 15 Oct 2015 16:41:51 -0400 Subject: cgroup: make cgroup_destroy_locked() test cgroup_is_populated() cgroup_destroy_locked() currently tests whether any css_sets are associated to reject removal if the cgroup contains tasks. This works because a css_set's refcnt converges with the number of tasks linked to it and thus there's no css_set linked to a cgroup if it doesn't have any live tasks. To help tracking resource usage of zombie tasks, putting the ref of css_set will be separated from disassociating the task from the css_set which means that a cgroup may have css_sets linked to it even when it doesn't have any live tasks. This patch updates cgroup_destroy_locked() so that it tests cgroup_is_populated(), which counts the number of populated css_sets, instead of whether cgrp->cset_links is empty to determine whether the cgroup is populated or not. This ensures that rmdirs won't be incorrectly rejected for cgroups which only contain zombie tasks. Signed-off-by: Tejun Heo --- kernel/cgroup.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ea87340..61ee85d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4998,16 +4998,15 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { struct cgroup_subsys_state *css; - bool empty; int ssid; lockdep_assert_held(&cgroup_mutex); - /* css_set_rwsem synchronizes access to ->cset_links */ - down_read(&css_set_rwsem); - empty = list_empty(&cgrp->cset_links); - up_read(&css_set_rwsem); - if (!empty) + /* + * Only migration can raise populated from zero and we're already + * holding cgroup_mutex. + */ + if (cgroup_is_populated(cgrp)) return -EBUSY; /* -- cgit v1.1 From 389b9c1bc927c8194a49f5f0c7e069ed0ec79b9e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 15 Oct 2015 16:41:51 -0400 Subject: cgroup: keep css_set and task lists in chronological order css task iteration will be updated to not leak cgroup internal locking to iterator users. In preparation, update css_set and task lists to be in chronological order. For tasks, as migration path is already using list_splice_tail_init(), only cgroup_enable_task_cg_lists() and cgroup_post_fork() need updating. For css_sets, link_css_set() is the only place which needs to be updated. Signed-off-by: Tejun Heo --- kernel/cgroup.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 61ee85d..5244467 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -914,12 +914,11 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset, link->cset = cset; link->cgrp = cgrp; - list_move(&link->cset_link, &cgrp->cset_links); - /* - * Always add links to the tail of the list so that the list - * is sorted by order of hierarchy creation + * Always add links to the tail of the lists so that the lists are + * in choronological order. */ + list_move_tail(&link->cset_link, &cgrp->cset_links); list_add_tail(&link->cgrp_link, &cset->cgrp_links); if (cgroup_parent(cgrp)) @@ -1780,7 +1779,7 @@ static void cgroup_enable_task_cg_lists(void) if (!css_set_populated(cset)) css_set_update_populated(cset, true); - list_add(&p->cg_list, &cset->tasks); + list_add_tail(&p->cg_list, &cset->tasks); get_css_set(cset); } spin_unlock_irq(&p->sighand->siglock); @@ -5480,7 +5479,7 @@ void cgroup_post_fork(struct task_struct *child, cset = task_css_set(current); if (list_empty(&child->cg_list)) { rcu_assign_pointer(child->cgroups, cset); - list_add(&child->cg_list, &cset->tasks); + list_add_tail(&child->cg_list, &cset->tasks); get_css_set(cset); } up_write(&css_set_rwsem); -- cgit v1.1 From f6d7d049c17a29fbc4c2723899a242d6889554aa Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 15 Oct 2015 16:41:52 -0400 Subject: cgroup: factor out css_set_move_task() A task is associated and disassociated with its css_set in three places - during migration, after a new task is created and when a task exits. The first is handled by cgroup_task_migrate() and the latter two are open-coded. These are similar operations and spreading them over multiple places makes it harder to follow and update. This patch collects all task css_set [dis]association operations into css_set_move_task(). While css_set_move_task() may check whether populated state needs to be updated when not strictly necessary, the behavior is essentially equivalent before and after this patch. Signed-off-by: Tejun Heo --- kernel/cgroup.c | 104 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 56 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5244467..0b6e50c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -664,6 +664,52 @@ static void css_set_update_populated(struct css_set *cset, bool populated) cgroup_update_populated(link->cgrp, populated); } +/** + * css_set_move_task - move a task from one css_set to another + * @task: task being moved + * @from_cset: css_set @task currently belongs to (may be NULL) + * @to_cset: new css_set @task is being moved to (may be NULL) + * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks + * + * Move @task from @from_cset to @to_cset. If @task didn't belong to any + * css_set, @from_cset can be NULL. If @task is being disassociated + * instead of moved, @to_cset can be NULL. + * + * This function automatically handles populated_cnt updates but the caller + * is responsible for managing @from_cset and @to_cset's reference counts. + */ +static void css_set_move_task(struct task_struct *task, + struct css_set *from_cset, struct css_set *to_cset, + bool use_mg_tasks) +{ + lockdep_assert_held(&css_set_rwsem); + + if (from_cset) { + WARN_ON_ONCE(list_empty(&task->cg_list)); + list_del_init(&task->cg_list); + if (!css_set_populated(from_cset)) + css_set_update_populated(from_cset, false); + } else { + WARN_ON_ONCE(!list_empty(&task->cg_list)); + } + + if (to_cset) { + /* + * We are synchronized through cgroup_threadgroup_rwsem + * against PF_EXITING setting such that we can't race + * against cgroup_exit() changing the css_set to + * init_css_set and dropping the old one. + */ + WARN_ON_ONCE(task->flags & PF_EXITING); + + if (!css_set_populated(to_cset)) + css_set_update_populated(to_cset, true); + rcu_assign_pointer(task->cgroups, to_cset); + list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks : + &to_cset->tasks); + } +} + /* * hash table for cgroup groups. This improves the performance to find * an existing css_set. This hash doesn't (currently) take into @@ -2262,47 +2308,6 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) } /** - * cgroup_task_migrate - move a task from one cgroup to another. - * @tsk: the task being migrated - * @new_cset: the new css_set @tsk is being attached to - * - * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked. - */ -static void cgroup_task_migrate(struct task_struct *tsk, - struct css_set *new_cset) -{ - struct css_set *old_cset; - - lockdep_assert_held(&cgroup_mutex); - lockdep_assert_held(&css_set_rwsem); - - /* - * We are synchronized through cgroup_threadgroup_rwsem against - * PF_EXITING setting such that we can't race against cgroup_exit() - * changing the css_set to init_css_set and dropping the old one. - */ - WARN_ON_ONCE(tsk->flags & PF_EXITING); - old_cset = task_css_set(tsk); - - if (!css_set_populated(new_cset)) - css_set_update_populated(new_cset, true); - - get_css_set(new_cset); - rcu_assign_pointer(tsk->cgroups, new_cset); - list_move_tail(&tsk->cg_list, &new_cset->mg_tasks); - - if (!css_set_populated(old_cset)) - css_set_update_populated(old_cset, false); - - /* - * We just gained a reference on old_cset by taking it from the - * task. As trading it for new_cset is protected by cgroup_mutex, - * we're safe to drop it here; it will be freed under RCU. - */ - put_css_set_locked(old_cset); -} - -/** * cgroup_taskset_migrate - migrate a taskset to a cgroup * @tset: taget taskset * @dst_cgrp: destination cgroup @@ -2342,8 +2347,14 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset, */ down_write(&css_set_rwsem); list_for_each_entry(cset, &tset->src_csets, mg_node) { - list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) - cgroup_task_migrate(task, cset->mg_dst_cset); + list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) { + struct css_set *from_cset = task_css_set(task); + struct css_set *to_cset = cset->mg_dst_cset; + + get_css_set(to_cset); + css_set_move_task(task, from_cset, to_cset, true); + put_css_set_locked(from_cset); + } } up_write(&css_set_rwsem); @@ -5478,9 +5489,8 @@ void cgroup_post_fork(struct task_struct *child, down_write(&css_set_rwsem); cset = task_css_set(current); if (list_empty(&child->cg_list)) { - rcu_assign_pointer(child->cgroups, cset); - list_add_tail(&child->cg_list, &cset->tasks); get_css_set(cset); + css_set_move_task(child, NULL, cset, false); } up_write(&css_set_rwsem); } @@ -5528,9 +5538,7 @@ void cgroup_exit(struct task_struct *tsk) if (!list_empty(&tsk->cg_list)) { down_write(&css_set_rwsem); - list_del_init(&tsk->cg_list); - if (!css_set_populated(cset)) - css_set_update_populated(cset, false); + css_set_move_task(tsk, cset, NULL, false); up_write(&css_set_rwsem); put_cset = true; } -- cgit v1.1 From ecb9d535df967b3ca565535e14456f612373bf5e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 15 Oct 2015 16:41:52 -0400 Subject: cgroup: reorganize css_task_iter functions * Rename css_advance_task_iter() to css_task_iter_advance_css_set() and make it clear it->task_pos too at the end of the iteration. * Factor out css_task_iter_advance() from css_task_iter_next(). The new function whines if called on a terminated iterator. Except for the termination check, this is pure reorganization and doesn't introduce any behavior changes. This will help the planned locking update for css_task_iter. Signed-off-by: Tejun Heo --- kernel/cgroup.c | 49 ++++++++++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0b6e50c..56e2b77 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3793,12 +3793,12 @@ bool css_has_online_children(struct cgroup_subsys_state *css) } /** - * css_advance_task_iter - advance a task itererator to the next css_set + * css_task_iter_advance_css_set - advance a task itererator to the next css_set * @it: the iterator to advance * * Advance @it to the next css_set to walk. */ -static void css_advance_task_iter(struct css_task_iter *it) +static void css_task_iter_advance_css_set(struct css_task_iter *it) { struct list_head *l = it->cset_pos; struct cgrp_cset_link *link; @@ -3809,6 +3809,7 @@ static void css_advance_task_iter(struct css_task_iter *it) l = l->next; if (l == it->cset_head) { it->cset_pos = NULL; + it->task_pos = NULL; return; } @@ -3832,6 +3833,28 @@ static void css_advance_task_iter(struct css_task_iter *it) it->mg_tasks_head = &cset->mg_tasks; } +static void css_task_iter_advance(struct css_task_iter *it) +{ + struct list_head *l = it->task_pos; + + WARN_ON_ONCE(!l); + + /* + * Advance iterator to find next entry. cset->tasks is consumed + * first and then ->mg_tasks. After ->mg_tasks, we move onto the + * next cset. + */ + l = l->next; + + if (l == it->tasks_head) + l = it->mg_tasks_head->next; + + if (l == it->mg_tasks_head) + css_task_iter_advance_css_set(it); + else + it->task_pos = l; +} + /** * css_task_iter_start - initiate task iteration * @css: the css to walk tasks of @@ -3864,7 +3887,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, it->cset_head = it->cset_pos; - css_advance_task_iter(it); + css_task_iter_advance_css_set(it); } /** @@ -3878,28 +3901,12 @@ void css_task_iter_start(struct cgroup_subsys_state *css, struct task_struct *css_task_iter_next(struct css_task_iter *it) { struct task_struct *res; - struct list_head *l = it->task_pos; - /* If the iterator cg is NULL, we have no tasks */ if (!it->cset_pos) return NULL; - res = list_entry(l, struct task_struct, cg_list); - - /* - * Advance iterator to find next entry. cset->tasks is consumed - * first and then ->mg_tasks. After ->mg_tasks, we move onto the - * next cset. - */ - l = l->next; - - if (l == it->tasks_head) - l = it->mg_tasks_head->next; - - if (l == it->mg_tasks_head) - css_advance_task_iter(it); - else - it->task_pos = l; + res = list_entry(it->task_pos, struct task_struct, cg_list); + css_task_iter_advance(it); return res; } -- cgit v1.1 From ed27b9f7a17ddfbc007e16d4d11f33dff4fc2de7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 15 Oct 2015 16:41:52 -0400 Subject: cgroup: don't hold css_set_rwsem across css task iteration css_sets are synchronized through css_set_rwsem but the locking scheme is kinda bizarre. The hot paths - fork and exit - have to write lock the rwsem making the rw part pointless; furthermore, many readers already hold cgroup_mutex. One of the readers is css task iteration. It read locks the rwsem over the entire duration of iteration. This leads to silly locking behavior. When cpuset tries to migrate processes of a cgroup to a different NUMA node, css_set_rwsem is held across the entire migration attempt which can take a long time locking out forking, exiting and other cgroup operations. This patch updates css task iteration so that it locks css_set_rwsem only while the iterator is being advanced. css task iteration involves two levels - css_set and task iteration. As css_sets in use are practically immutable, simply pinning the current one is enough for resuming iteration afterwards. Task iteration is tricky as tasks may leave their css_set while iteration is in progress. This is solved by keeping track of active iterators and advancing them if their next task leaves its css_set. v2: put_task_struct() in css_task_iter_next() moved outside css_set_rwsem. A later patch will add cgroup operations to task_struct free path which may grab the same lock and this avoids deadlock possibilities. css_set_move_task() updated to use list_for_each_entry_safe() when walking task_iters and advancing them. This is necessary as advancing an iter may remove it from the list. Signed-off-by: Tejun Heo --- kernel/cgroup.c | 87 +++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 73 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 56e2b77..0c5b0e6 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -216,6 +216,7 @@ static struct cftype cgroup_legacy_base_files[]; static int rebind_subsystems(struct cgroup_root *dst_root, unsigned long ss_mask); +static void css_task_iter_advance(struct css_task_iter *it); static int cgroup_destroy_locked(struct cgroup *cgrp); static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, bool visible); @@ -593,6 +594,7 @@ struct css_set init_css_set = { .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), + .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), }; static int css_set_count = 1; /* 1 for init_css_set */ @@ -675,8 +677,9 @@ static void css_set_update_populated(struct css_set *cset, bool populated) * css_set, @from_cset can be NULL. If @task is being disassociated * instead of moved, @to_cset can be NULL. * - * This function automatically handles populated_cnt updates but the caller - * is responsible for managing @from_cset and @to_cset's reference counts. + * This function automatically handles populated_cnt updates and + * css_task_iter adjustments but the caller is responsible for managing + * @from_cset and @to_cset's reference counts. */ static void css_set_move_task(struct task_struct *task, struct css_set *from_cset, struct css_set *to_cset, @@ -685,7 +688,22 @@ static void css_set_move_task(struct task_struct *task, lockdep_assert_held(&css_set_rwsem); if (from_cset) { + struct css_task_iter *it, *pos; + WARN_ON_ONCE(list_empty(&task->cg_list)); + + /* + * @task is leaving, advance task iterators which are + * pointing to it so that they can resume at the next + * position. Advancing an iterator might remove it from + * the list, use safe walk. See css_task_iter_advance*() + * for details. + */ + list_for_each_entry_safe(it, pos, &from_cset->task_iters, + iters_node) + if (it->task_pos == &task->cg_list) + css_task_iter_advance(it); + list_del_init(&task->cg_list); if (!css_set_populated(from_cset)) css_set_update_populated(from_cset, false); @@ -1019,6 +1037,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, INIT_LIST_HEAD(&cset->mg_tasks); INIT_LIST_HEAD(&cset->mg_preload_node); INIT_LIST_HEAD(&cset->mg_node); + INIT_LIST_HEAD(&cset->task_iters); INIT_HLIST_NODE(&cset->hlist); /* Copy the set of subsystem state objects generated in @@ -3804,6 +3823,8 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it) struct cgrp_cset_link *link; struct css_set *cset; + lockdep_assert_held(&css_set_rwsem); + /* Advance to the next non-empty css_set */ do { l = l->next; @@ -3831,12 +3852,36 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it) it->tasks_head = &cset->tasks; it->mg_tasks_head = &cset->mg_tasks; + + /* + * We don't keep css_sets locked across iteration steps and thus + * need to take steps to ensure that iteration can be resumed after + * the lock is re-acquired. Iteration is performed at two levels - + * css_sets and tasks in them. + * + * Once created, a css_set never leaves its cgroup lists, so a + * pinned css_set is guaranteed to stay put and we can resume + * iteration afterwards. + * + * Tasks may leave @cset across iteration steps. This is resolved + * by registering each iterator with the css_set currently being + * walked and making css_set_move_task() advance iterators whose + * next task is leaving. + */ + if (it->cur_cset) { + list_del(&it->iters_node); + put_css_set_locked(it->cur_cset); + } + get_css_set(cset); + it->cur_cset = cset; + list_add(&it->iters_node, &cset->task_iters); } static void css_task_iter_advance(struct css_task_iter *it) { struct list_head *l = it->task_pos; + lockdep_assert_held(&css_set_rwsem); WARN_ON_ONCE(!l); /* @@ -3864,19 +3909,16 @@ static void css_task_iter_advance(struct css_task_iter *it) * css_task_iter_next() to walk through the tasks until the function * returns NULL. On completion of iteration, css_task_iter_end() must be * called. - * - * Note that this function acquires a lock which is released when the - * iteration finishes. The caller can't sleep while iteration is in - * progress. */ void css_task_iter_start(struct cgroup_subsys_state *css, struct css_task_iter *it) - __acquires(css_set_rwsem) { /* no one should try to iterate before mounting cgroups */ WARN_ON_ONCE(!use_task_css_set_links); - down_read(&css_set_rwsem); + memset(it, 0, sizeof(*it)); + + down_write(&css_set_rwsem); it->ss = css->ss; @@ -3888,6 +3930,8 @@ void css_task_iter_start(struct cgroup_subsys_state *css, it->cset_head = it->cset_pos; css_task_iter_advance_css_set(it); + + up_write(&css_set_rwsem); } /** @@ -3900,14 +3944,22 @@ void css_task_iter_start(struct cgroup_subsys_state *css, */ struct task_struct *css_task_iter_next(struct css_task_iter *it) { - struct task_struct *res; - if (!it->cset_pos) return NULL; - res = list_entry(it->task_pos, struct task_struct, cg_list); + if (it->cur_task) + put_task_struct(it->cur_task); + + down_write(&css_set_rwsem); + + it->cur_task = list_entry(it->task_pos, struct task_struct, cg_list); + get_task_struct(it->cur_task); + css_task_iter_advance(it); - return res; + + up_write(&css_set_rwsem); + + return it->cur_task; } /** @@ -3917,9 +3969,16 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) * Finish task iteration started by css_task_iter_start(). */ void css_task_iter_end(struct css_task_iter *it) - __releases(css_set_rwsem) { - up_read(&css_set_rwsem); + if (it->cur_cset) { + down_write(&css_set_rwsem); + list_del(&it->iters_node); + put_css_set_locked(it->cur_cset); + up_write(&css_set_rwsem); + } + + if (it->cur_task) + put_task_struct(it->cur_task); } /** -- cgit v1.1 From f0d9a5f175753a371bc7fdff0d584a8d9cd72bb0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 15 Oct 2015 16:41:53 -0400 Subject: cgroup: make css_set_rwsem a spinlock and rename it to css_set_lock css_set_rwsem is the inner lock protecting css_sets and is accessed from hot paths such as fork and exit. Internally, it has no reason to be a rwsem or even mutex. There are no internal blocking operations while holding it. This was rwsem because css task iteration used to expose it to external iterator users. As the previous patch updated css task iteration such that the locking is not leaked to its users, there's no reason to keep it a rwsem. This patch converts css_set_rwsem to a spinlock and rename it to css_set_lock. It uses bh-safe operations as a planned usage needs to access it from RCU callback context. Signed-off-by: Tejun Heo --- kernel/cgroup.c | 144 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 72 insertions(+), 72 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0c5b0e6..ba7b328 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -45,7 +45,6 @@ #include #include #include -#include #include #include #include @@ -76,7 +75,7 @@ * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. * - * css_set_rwsem protects task->cgroups pointer, the list of css_set + * css_set_lock protects task->cgroups pointer, the list of css_set * objects, and the chain of tasks off each css_set. * * These locks are exported if CONFIG_PROVE_RCU so that accessors in @@ -84,12 +83,12 @@ */ #ifdef CONFIG_PROVE_RCU DEFINE_MUTEX(cgroup_mutex); -DECLARE_RWSEM(css_set_rwsem); +DEFINE_SPINLOCK(css_set_lock); EXPORT_SYMBOL_GPL(cgroup_mutex); -EXPORT_SYMBOL_GPL(css_set_rwsem); +EXPORT_SYMBOL_GPL(css_set_lock); #else static DEFINE_MUTEX(cgroup_mutex); -static DECLARE_RWSEM(css_set_rwsem); +static DEFINE_SPINLOCK(css_set_lock); #endif /* @@ -605,7 +604,7 @@ static int css_set_count = 1; /* 1 for init_css_set */ */ static bool css_set_populated(struct css_set *cset) { - lockdep_assert_held(&css_set_rwsem); + lockdep_assert_held(&css_set_lock); return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks); } @@ -628,7 +627,7 @@ static bool css_set_populated(struct css_set *cset) */ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) { - lockdep_assert_held(&css_set_rwsem); + lockdep_assert_held(&css_set_lock); do { bool trigger; @@ -660,7 +659,7 @@ static void css_set_update_populated(struct css_set *cset, bool populated) { struct cgrp_cset_link *link; - lockdep_assert_held(&css_set_rwsem); + lockdep_assert_held(&css_set_lock); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) cgroup_update_populated(link->cgrp, populated); @@ -685,7 +684,7 @@ static void css_set_move_task(struct task_struct *task, struct css_set *from_cset, struct css_set *to_cset, bool use_mg_tasks) { - lockdep_assert_held(&css_set_rwsem); + lockdep_assert_held(&css_set_lock); if (from_cset) { struct css_task_iter *it, *pos; @@ -755,7 +754,7 @@ static void put_css_set_locked(struct css_set *cset) struct cgroup_subsys *ss; int ssid; - lockdep_assert_held(&css_set_rwsem); + lockdep_assert_held(&css_set_lock); if (!atomic_dec_and_test(&cset->refcount)) return; @@ -787,9 +786,9 @@ static void put_css_set(struct css_set *cset) if (atomic_add_unless(&cset->refcount, -1, 1)) return; - down_write(&css_set_rwsem); + spin_lock_bh(&css_set_lock); put_css_set_locked(cset); - up_write(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); } /* @@ -1012,11 +1011,11 @@ static struct css_set *find_css_set(struct css_set *old_cset, /* First see if we already have a cgroup group that matches * the desired set */ - down_read(&css_set_rwsem); + spin_lock_bh(&css_set_lock); cset = find_existing_css_set(old_cset, cgrp, template); if (cset) get_css_set(cset); - up_read(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); if (cset) return cset; @@ -1044,7 +1043,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, * find_existing_css_set() */ memcpy(cset->subsys, template, sizeof(cset->subsys)); - down_write(&css_set_rwsem); + spin_lock_bh(&css_set_lock); /* Add reference counts and links from the new css_set. */ list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; @@ -1066,7 +1065,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, list_add_tail(&cset->e_cset_node[ssid], &cset->subsys[ssid]->cgroup->e_csets[ssid]); - up_write(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); return cset; } @@ -1130,14 +1129,15 @@ static void cgroup_destroy_root(struct cgroup_root *root) * Release all the links from cset_links to this hierarchy's * root cgroup */ - down_write(&css_set_rwsem); + spin_lock_bh(&css_set_lock); list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { list_del(&link->cset_link); list_del(&link->cgrp_link); kfree(link); } - up_write(&css_set_rwsem); + + spin_unlock_bh(&css_set_lock); if (!list_empty(&root->root_list)) { list_del(&root->root_list); @@ -1159,7 +1159,7 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, struct cgroup *res = NULL; lockdep_assert_held(&cgroup_mutex); - lockdep_assert_held(&css_set_rwsem); + lockdep_assert_held(&css_set_lock); if (cset == &init_css_set) { res = &root->cgrp; @@ -1182,7 +1182,7 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, /* * Return the cgroup for "task" from the given hierarchy. Must be - * called with cgroup_mutex and css_set_rwsem held. + * called with cgroup_mutex and css_set_lock held. */ static struct cgroup *task_cgroup_from_root(struct task_struct *task, struct cgroup_root *root) @@ -1531,11 +1531,11 @@ static int rebind_subsystems(struct cgroup_root *dst_root, ss->root = dst_root; css->cgroup = dcgrp; - down_write(&css_set_rwsem); + spin_lock_bh(&css_set_lock); hash_for_each(css_set_table, i, cset, hlist) list_move_tail(&cset->e_cset_node[ss->id], &dcgrp->e_csets[ss->id]); - up_write(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); src_root->subsys_mask &= ~(1 << ssid); scgrp->subtree_control &= ~(1 << ssid); @@ -1812,7 +1812,7 @@ static void cgroup_enable_task_cg_lists(void) { struct task_struct *p, *g; - down_write(&css_set_rwsem); + spin_lock_bh(&css_set_lock); if (use_task_css_set_links) goto out_unlock; @@ -1851,7 +1851,7 @@ static void cgroup_enable_task_cg_lists(void) } while_each_thread(g, p); read_unlock(&tasklist_lock); out_unlock: - up_write(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); } static void init_cgroup_housekeeping(struct cgroup *cgrp) @@ -1915,7 +1915,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) goto out; /* - * We're accessing css_set_count without locking css_set_rwsem here, + * We're accessing css_set_count without locking css_set_lock here, * but that's OK - it can only be increased by someone holding * cgroup_lock, and that's us. The worst that can happen is that we * have some link structures left over @@ -1957,13 +1957,13 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) * Link the root cgroup in this hierarchy into all the css_set * objects. */ - down_write(&css_set_rwsem); + spin_lock_bh(&css_set_lock); hash_for_each(css_set_table, i, cset, hlist) { link_css_set(&tmp_links, cset, root_cgrp); if (css_set_populated(cset)) cgroup_update_populated(root_cgrp, true); } - up_write(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); BUG_ON(!list_empty(&root_cgrp->self.children)); BUG_ON(atomic_read(&root->nr_cgrps) != 1); @@ -2196,7 +2196,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) char *path = NULL; mutex_lock(&cgroup_mutex); - down_read(&css_set_rwsem); + spin_lock_bh(&css_set_lock); root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); @@ -2209,7 +2209,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) path = buf; } - up_read(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); mutex_unlock(&cgroup_mutex); return path; } @@ -2258,7 +2258,7 @@ static void cgroup_taskset_add(struct task_struct *task, { struct css_set *cset; - lockdep_assert_held(&css_set_rwsem); + lockdep_assert_held(&css_set_lock); /* @task either already exited or can't exit until the end */ if (task->flags & PF_EXITING) @@ -2364,7 +2364,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset, * the new cgroup. There are no failure cases after here, so this * is the commit point. */ - down_write(&css_set_rwsem); + spin_lock_bh(&css_set_lock); list_for_each_entry(cset, &tset->src_csets, mg_node) { list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) { struct css_set *from_cset = task_css_set(task); @@ -2375,7 +2375,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset, put_css_set_locked(from_cset); } } - up_write(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); /* * Migration is committed, all target tasks are now on dst_csets. @@ -2399,13 +2399,13 @@ out_cancel_attach: css->ss->cancel_attach(css, tset); } out_release_tset: - down_write(&css_set_rwsem); + spin_lock_bh(&css_set_lock); list_splice_init(&tset->dst_csets, &tset->src_csets); list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) { list_splice_tail_init(&cset->mg_tasks, &cset->tasks); list_del_init(&cset->mg_node); } - up_write(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); return ret; } @@ -2422,14 +2422,14 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets) lockdep_assert_held(&cgroup_mutex); - down_write(&css_set_rwsem); + spin_lock_bh(&css_set_lock); list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) { cset->mg_src_cgrp = NULL; cset->mg_dst_cset = NULL; list_del_init(&cset->mg_preload_node); put_css_set_locked(cset); } - up_write(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); } /** @@ -2455,7 +2455,7 @@ static void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *src_cgrp; lockdep_assert_held(&cgroup_mutex); - lockdep_assert_held(&css_set_rwsem); + lockdep_assert_held(&css_set_lock); src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); @@ -2571,7 +2571,7 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup, * already PF_EXITING could be freed from underneath us unless we * take an rcu_read_lock. */ - down_write(&css_set_rwsem); + spin_lock_bh(&css_set_lock); rcu_read_lock(); task = leader; do { @@ -2580,7 +2580,7 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup, break; } while_each_thread(leader, task); rcu_read_unlock(); - up_write(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); return cgroup_taskset_migrate(&tset, cgrp); } @@ -2601,7 +2601,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, int ret; /* look up all src csets */ - down_read(&css_set_rwsem); + spin_lock_bh(&css_set_lock); rcu_read_lock(); task = leader; do { @@ -2611,7 +2611,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, break; } while_each_thread(leader, task); rcu_read_unlock(); - up_read(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); /* prepare dst csets and commit */ ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets); @@ -2644,9 +2644,9 @@ static int cgroup_procs_write_permission(struct task_struct *task, struct cgroup *cgrp; struct inode *inode; - down_read(&css_set_rwsem); + spin_lock_bh(&css_set_lock); cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); - up_read(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); while (!cgroup_is_descendant(dst_cgrp, cgrp)) cgrp = cgroup_parent(cgrp); @@ -2743,9 +2743,9 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) if (root == &cgrp_dfl_root) continue; - down_read(&css_set_rwsem); + spin_lock_bh(&css_set_lock); from_cgrp = task_cgroup_from_root(from, root); - up_read(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); retval = cgroup_attach_task(from_cgrp, tsk, false); if (retval) @@ -2870,7 +2870,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) percpu_down_write(&cgroup_threadgroup_rwsem); /* look up all csses currently attached to @cgrp's subtree */ - down_read(&css_set_rwsem); + spin_lock_bh(&css_set_lock); css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) { struct cgrp_cset_link *link; @@ -2882,14 +2882,14 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) cgroup_migrate_add_src(link->cset, cgrp, &preloaded_csets); } - up_read(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); /* NULL dst indicates self on default hierarchy */ ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets); if (ret) goto out_finish; - down_write(&css_set_rwsem); + spin_lock_bh(&css_set_lock); list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) { struct task_struct *task, *ntask; @@ -2901,7 +2901,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list) cgroup_taskset_add(task, &tset); } - up_write(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); ret = cgroup_taskset_migrate(&tset, cgrp); out_finish: @@ -3577,10 +3577,10 @@ static int cgroup_task_count(const struct cgroup *cgrp) int count = 0; struct cgrp_cset_link *link; - down_read(&css_set_rwsem); + spin_lock_bh(&css_set_lock); list_for_each_entry(link, &cgrp->cset_links, cset_link) count += atomic_read(&link->cset->refcount); - up_read(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); return count; } @@ -3823,7 +3823,7 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it) struct cgrp_cset_link *link; struct css_set *cset; - lockdep_assert_held(&css_set_rwsem); + lockdep_assert_held(&css_set_lock); /* Advance to the next non-empty css_set */ do { @@ -3881,7 +3881,7 @@ static void css_task_iter_advance(struct css_task_iter *it) { struct list_head *l = it->task_pos; - lockdep_assert_held(&css_set_rwsem); + lockdep_assert_held(&css_set_lock); WARN_ON_ONCE(!l); /* @@ -3918,7 +3918,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, memset(it, 0, sizeof(*it)); - down_write(&css_set_rwsem); + spin_lock_bh(&css_set_lock); it->ss = css->ss; @@ -3931,7 +3931,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, css_task_iter_advance_css_set(it); - up_write(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); } /** @@ -3950,14 +3950,14 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) if (it->cur_task) put_task_struct(it->cur_task); - down_write(&css_set_rwsem); + spin_lock_bh(&css_set_lock); it->cur_task = list_entry(it->task_pos, struct task_struct, cg_list); get_task_struct(it->cur_task); css_task_iter_advance(it); - up_write(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); return it->cur_task; } @@ -3971,10 +3971,10 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) void css_task_iter_end(struct css_task_iter *it) { if (it->cur_cset) { - down_write(&css_set_rwsem); + spin_lock_bh(&css_set_lock); list_del(&it->iters_node); put_css_set_locked(it->cur_cset); - up_write(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); } if (it->cur_task) @@ -4003,10 +4003,10 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) mutex_lock(&cgroup_mutex); /* all tasks in @from are being moved, all csets are source */ - down_read(&css_set_rwsem); + spin_lock_bh(&css_set_lock); list_for_each_entry(link, &from->cset_links, cset_link) cgroup_migrate_add_src(link->cset, to, &preloaded_csets); - up_read(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); ret = cgroup_migrate_prepare_dst(to, &preloaded_csets); if (ret) @@ -5359,7 +5359,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, goto out; mutex_lock(&cgroup_mutex); - down_read(&css_set_rwsem); + spin_lock_bh(&css_set_lock); for_each_root(root) { struct cgroup_subsys *ss; @@ -5391,7 +5391,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, retval = 0; out_unlock: - up_read(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); mutex_unlock(&cgroup_mutex); kfree(buf); out: @@ -5537,7 +5537,7 @@ void cgroup_post_fork(struct task_struct *child, * @child during its iteration. * * If we won the race, @child is associated with %current's - * css_set. Grabbing css_set_rwsem guarantees both that the + * css_set. Grabbing css_set_lock guarantees both that the * association is stable, and, on completion of the parent's * migration, @child is visible in the source of migration or * already in the destination cgroup. This guarantee is necessary @@ -5552,13 +5552,13 @@ void cgroup_post_fork(struct task_struct *child, if (use_task_css_set_links) { struct css_set *cset; - down_write(&css_set_rwsem); + spin_lock_bh(&css_set_lock); cset = task_css_set(current); if (list_empty(&child->cg_list)) { get_css_set(cset); css_set_move_task(child, NULL, cset, false); } - up_write(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); } /* @@ -5603,9 +5603,9 @@ void cgroup_exit(struct task_struct *tsk) cset = task_css_set(tsk); if (!list_empty(&tsk->cg_list)) { - down_write(&css_set_rwsem); + spin_lock_bh(&css_set_lock); css_set_move_task(tsk, cset, NULL, false); - up_write(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); put_cset = true; } @@ -5823,7 +5823,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) if (!name_buf) return -ENOMEM; - down_read(&css_set_rwsem); + spin_lock_bh(&css_set_lock); rcu_read_lock(); cset = rcu_dereference(current->cgroups); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { @@ -5834,7 +5834,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) c->root->hierarchy_id, name_buf); } rcu_read_unlock(); - up_read(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); kfree(name_buf); return 0; } @@ -5845,7 +5845,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) struct cgroup_subsys_state *css = seq_css(seq); struct cgrp_cset_link *link; - down_read(&css_set_rwsem); + spin_lock_bh(&css_set_lock); list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { struct css_set *cset = link->cset; struct task_struct *task; @@ -5868,7 +5868,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) overflow: seq_puts(seq, " ...\n"); } - up_read(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); return 0; } -- cgit v1.1 From 2e91fa7f6d451e3ea9fec999065d2fd199691f9d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 15 Oct 2015 16:41:53 -0400 Subject: cgroup: keep zombies associated with their original cgroups cgroup_exit() is called when a task exits and disassociates the exiting task from its cgroups and half-attach it to the root cgroup. This is unnecessary and undesirable. No controller actually needs an exiting task to be disassociated with non-root cgroups. Both cpu and perf_event controllers update the association to the root cgroup from their exit callbacks just to keep consistent with the cgroup core behavior. Also, this disassociation makes it difficult to track resources held by zombies or determine where the zombies came from. Currently, pids controller is completely broken as it uncharges on exit and zombies always escape the resource restriction. With cgroup association being reset on exit, fixing it is pretty painful. There's no reason to reset cgroup membership on exit. The zombie can be removed from its css_set so that it doesn't show up on "cgroup.procs" and thus can't be migrated or interfere with cgroup removal. It can still pin and point to the css_set so that its cgroup membership is maintained. This patch makes cgroup core keep zombies associated with their cgroups at the time of exit. * Previous patches decoupled populated_cnt tracking from css_set lifetime, so a dying task can be simply unlinked from its css_set while pinning and pointing to the css_set. This keeps css_set association from task side alive while hiding it from "cgroup.procs" and populated_cnt tracking. The css_set reference is dropped when the task_struct is freed. * ->exit() callback no longer needs the css arguments as the associated css never changes once PF_EXITING is set. Removed. * cpu and perf_events controllers no longer need ->exit() callbacks. There's no reason to explicitly switch away on exit. The final schedule out is enough. The callbacks are removed. * On traditional hierarchies, nothing changes. "/proc/PID/cgroup" still reports "/" for all zombies. On the default hierarchy, "/proc/PID/cgroup" keeps reporting the cgroup that the task belonged to at the time of exit. If the cgroup gets removed before the task is reaped, " (deleted)" is appended. v2: Build brekage due to missing dummy cgroup_free() when !CONFIG_CGROUP fixed. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo --- kernel/cgroup.c | 51 ++++++++++++++++++++++++++++++++++----------------- kernel/cgroup_pids.c | 6 ++---- kernel/events/core.c | 16 ---------------- kernel/fork.c | 1 + kernel/sched/core.c | 16 ---------------- 5 files changed, 37 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ba7b328..9186584 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5379,14 +5379,34 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, seq_printf(m, "%sname=%s", count ? "," : "", root->name); seq_putc(m, ':'); + cgrp = task_cgroup_from_root(tsk, root); - path = cgroup_path(cgrp, buf, PATH_MAX); - if (!path) { - retval = -ENAMETOOLONG; - goto out_unlock; + + /* + * On traditional hierarchies, all zombie tasks show up as + * belonging to the root cgroup. On the default hierarchy, + * while a zombie doesn't show up in "cgroup.procs" and + * thus can't be migrated, its /proc/PID/cgroup keeps + * reporting the cgroup it belonged to before exiting. If + * the cgroup is removed before the zombie is reaped, + * " (deleted)" is appended to the cgroup path. + */ + if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) { + path = cgroup_path(cgrp, buf, PATH_MAX); + if (!path) { + retval = -ENAMETOOLONG; + goto out_unlock; + } + } else { + path = "/"; } + seq_puts(m, path); - seq_putc(m, '\n'); + + if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp)) + seq_puts(m, " (deleted)\n"); + else + seq_putc(m, '\n'); } retval = 0; @@ -5593,7 +5613,6 @@ void cgroup_exit(struct task_struct *tsk) { struct cgroup_subsys *ss; struct css_set *cset; - bool put_cset = false; int i; /* @@ -5606,22 +5625,20 @@ void cgroup_exit(struct task_struct *tsk) spin_lock_bh(&css_set_lock); css_set_move_task(tsk, cset, NULL, false); spin_unlock_bh(&css_set_lock); - put_cset = true; + } else { + get_css_set(cset); } - /* Reassign the task to the init_css_set. */ - RCU_INIT_POINTER(tsk->cgroups, &init_css_set); - /* see cgroup_post_fork() for details */ - for_each_subsys_which(ss, i, &have_exit_callback) { - struct cgroup_subsys_state *old_css = cset->subsys[i]; - struct cgroup_subsys_state *css = task_css(tsk, i); + for_each_subsys_which(ss, i, &have_exit_callback) + ss->exit(tsk); +} - ss->exit(css, old_css, tsk); - } +void cgroup_free(struct task_struct *task) +{ + struct css_set *cset = task_css_set(task); - if (put_cset) - put_css_set(cset); + put_css_set(cset); } static void check_for_release(struct cgroup *cgrp) diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c index 806cd76..45f0856 100644 --- a/kernel/cgroup_pids.c +++ b/kernel/cgroup_pids.c @@ -266,11 +266,9 @@ static void pids_fork(struct task_struct *task, void *priv) css_put(old_css); } -static void pids_exit(struct cgroup_subsys_state *css, - struct cgroup_subsys_state *old_css, - struct task_struct *task) +static void pids_exit(struct task_struct *task) { - struct pids_cgroup *pids = css_pids(old_css); + struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id)); pids_uncharge(pids, 1); } diff --git a/kernel/events/core.c b/kernel/events/core.c index f548f69..e987494 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9293,25 +9293,9 @@ static void perf_cgroup_attach(struct cgroup_subsys_state *css, task_function_call(task, __perf_cgroup_move, task); } -static void perf_cgroup_exit(struct cgroup_subsys_state *css, - struct cgroup_subsys_state *old_css, - struct task_struct *task) -{ - /* - * cgroup_exit() is called in the copy_process() failure path. - * Ignore this case since the task hasn't ran yet, this avoids - * trying to poke a half freed task state from generic code. - */ - if (!(task->flags & PF_EXITING)) - return; - - task_function_call(task, __perf_cgroup_move, task); -} - struct cgroup_subsys perf_event_cgrp_subsys = { .css_alloc = perf_cgroup_css_alloc, .css_free = perf_cgroup_css_free, - .exit = perf_cgroup_exit, .attach = perf_cgroup_attach, }; #endif /* CONFIG_CGROUP_PERF */ diff --git a/kernel/fork.c b/kernel/fork.c index 7d5f0f1..118743bb 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -251,6 +251,7 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); + cgroup_free(tsk); task_numa_free(tsk); security_task_free(tsk); exit_creds(tsk); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3595403..2cad9ba 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8163,21 +8163,6 @@ static void cpu_cgroup_attach(struct cgroup_subsys_state *css, sched_move_task(task); } -static void cpu_cgroup_exit(struct cgroup_subsys_state *css, - struct cgroup_subsys_state *old_css, - struct task_struct *task) -{ - /* - * cgroup_exit() is called in the copy_process() failure path. - * Ignore this case since the task hasn't ran yet, this avoids - * trying to poke a half freed task state from generic code. - */ - if (!(task->flags & PF_EXITING)) - return; - - sched_move_task(task); -} - #ifdef CONFIG_FAIR_GROUP_SCHED static int cpu_shares_write_u64(struct cgroup_subsys_state *css, struct cftype *cftype, u64 shareval) @@ -8509,7 +8494,6 @@ struct cgroup_subsys cpu_cgrp_subsys = { .fork = cpu_cgroup_fork, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, - .exit = cpu_cgroup_exit, .legacy_cftypes = cpu_files, .early_init = 1, }; -- cgit v1.1 From afcf6c8b75444382e0f9996157207ebae34a8848 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 15 Oct 2015 16:41:53 -0400 Subject: cgroup: add cgroup_subsys->free() method and use it to fix pids controller pids controller is completely broken in that it uncharges when a task exits allowing zombies to escape resource control. With the recent updates, cgroup core now maintains cgroup association till task free and pids controller can be fixed by uncharging on free instead of exit. This patch adds cgroup_subsys->free() method and update pids controller to use it instead of ->exit() for uncharging. Signed-off-by: Tejun Heo Cc: Aleksa Sarai --- kernel/cgroup.c | 7 +++++++ kernel/cgroup_pids.c | 4 ++-- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 9186584..8673843 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -206,6 +206,7 @@ static u64 css_serial_nr_next = 1; */ static unsigned long have_fork_callback __read_mostly; static unsigned long have_exit_callback __read_mostly; +static unsigned long have_free_callback __read_mostly; /* Ditto for the can_fork callback. */ static unsigned long have_canfork_callback __read_mostly; @@ -5180,6 +5181,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) have_fork_callback |= (bool)ss->fork << ss->id; have_exit_callback |= (bool)ss->exit << ss->id; + have_free_callback |= (bool)ss->free << ss->id; have_canfork_callback |= (bool)ss->can_fork << ss->id; /* At system boot, before all subsystems have been @@ -5637,6 +5639,11 @@ void cgroup_exit(struct task_struct *tsk) void cgroup_free(struct task_struct *task) { struct css_set *cset = task_css_set(task); + struct cgroup_subsys *ss; + int ssid; + + for_each_subsys_which(ss, ssid, &have_free_callback) + ss->free(task); put_css_set(cset); } diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c index 45f0856..cdd8df4 100644 --- a/kernel/cgroup_pids.c +++ b/kernel/cgroup_pids.c @@ -266,7 +266,7 @@ static void pids_fork(struct task_struct *task, void *priv) css_put(old_css); } -static void pids_exit(struct task_struct *task) +static void pids_free(struct task_struct *task) { struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id)); @@ -347,7 +347,7 @@ struct cgroup_subsys pids_cgrp_subsys = { .can_fork = pids_can_fork, .cancel_fork = pids_cancel_fork, .fork = pids_fork, - .exit = pids_exit, + .free = pids_free, .legacy_cftypes = pids_files, .dfl_cftypes = pids_files, }; -- cgit v1.1 From 035f4f510583193949168c77dc957293df24bd77 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 15 Oct 2015 17:00:43 -0400 Subject: cgroup: replace error handling in cgroup_init() with WARN_ON()s The init sequence shouldn't fail short of bugs and even when it does it's better to continue with the rest of initialization and we were silently ignoring /proc/cgroups creation failure. Drop the explicit error handling and wrap sysfs_create_mount_point(), register_filesystem() and proc_create() with WARN_ON()s. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: Johannes Weiner --- kernel/cgroup.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8673843..74daeef 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5242,7 +5242,7 @@ int __init cgroup_init(void) { struct cgroup_subsys *ss; unsigned long key; - int ssid, err; + int ssid; BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); @@ -5304,17 +5304,10 @@ int __init cgroup_init(void) ss->bind(init_css_set.subsys[ssid]); } - err = sysfs_create_mount_point(fs_kobj, "cgroup"); - if (err) - return err; - - err = register_filesystem(&cgroup_fs_type); - if (err < 0) { - sysfs_remove_mount_point(fs_kobj, "cgroup"); - return err; - } + WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup")); + WARN_ON(register_filesystem(&cgroup_fs_type)); + WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations)); - proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); return 0; } -- cgit v1.1 From e4b7037c8613da41fb3f7b029414fe25370f53c0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 15 Oct 2015 17:00:43 -0400 Subject: cgroup: drop cgroup__DEVEL__legacy_files_on_dfl Now that interfaces for the major three controllers - cpu, memory, io - are shaping up, there's no reason to have an option to force legacy files to show up on the unified hierarchy for testing. Drop it. Signed-off-by: Tejun Heo Cc: Li Zefan Cc: Johannes Weiner --- kernel/cgroup.c | 30 ++---------------------------- 1 file changed, 2 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 74daeef..4f4fc53 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -173,12 +173,6 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root); */ static bool cgrp_dfl_root_visible; -/* - * Set by the boot param of the same name and makes subsystems with NULL - * ->dfl_files to use ->legacy_files on the default hierarchy. - */ -static bool cgroup_legacy_files_on_dfl; - /* some controllers are not supported in the default hierarchy */ static unsigned long cgrp_dfl_root_inhibit_ss_mask; @@ -3553,17 +3547,8 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype *cft; - /* - * If legacy_flies_on_dfl, we want to show the legacy files on the - * dfl hierarchy but iff the target subsystem hasn't been updated - * for the dfl hierarchy yet. - */ - if (!cgroup_legacy_files_on_dfl || - ss->dfl_cftypes != ss->legacy_cftypes) { - for (cft = cfts; cft && cft->name[0] != '\0'; cft++) - cft->flags |= __CFTYPE_NOT_ON_DFL; - } - + for (cft = cfts; cft && cft->name[0] != '\0'; cft++) + cft->flags |= __CFTYPE_NOT_ON_DFL; return cgroup_add_cftypes(ss, cfts); } @@ -5287,9 +5272,6 @@ int __init cgroup_init(void) cgrp_dfl_root.subsys_mask |= 1 << ss->id; - if (cgroup_legacy_files_on_dfl && !ss->dfl_cftypes) - ss->dfl_cftypes = ss->legacy_cftypes; - if (!ss->dfl_cftypes) cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id; @@ -5729,14 +5711,6 @@ static int __init cgroup_disable(char *str) } __setup("cgroup_disable=", cgroup_disable); -static int __init cgroup_set_legacy_files_on_dfl(char *str) -{ - printk("cgroup: using legacy files on the default hierarchy\n"); - cgroup_legacy_files_on_dfl = true; - return 0; -} -__setup("cgroup__DEVEL__legacy_files_on_dfl", cgroup_set_legacy_files_on_dfl); - /** * css_tryget_online_from_dir - get corresponding css from a cgroup dentry * @dentry: directory dentry of interest -- cgit v1.1 From ac00737f4e8198f8ff5007c70af4dfe4fd47ea94 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 14 Oct 2015 14:40:44 -0700 Subject: bpf: Need to call bpf_prog_uncharge_memlock from bpf_prog_put Currently, is only called from __prog_put_rcu in the bpf_prog_release path. Need this to call this from bpf_prog_put also to get correct accounting. Fixes: aaac3ba95e4c8b49 ("bpf: charge user for creation of BPF maps and programs") Signed-off-by: Tom Herbert Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f640e5f..687dd6c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -520,6 +520,7 @@ void bpf_prog_put(struct bpf_prog *prog) { if (atomic_dec_and_test(&prog->aux->refcnt)) { free_used_maps(prog->aux); + bpf_prog_uncharge_memlock(prog); bpf_prog_free(prog); } } -- cgit v1.1 From 0701c53e460ea64daf0ee789d0b08fef57800016 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 13 Oct 2015 19:14:45 +0100 Subject: genirq/msi: Do not use pci_msi_[un]mask_irq as default methods When we create a generic MSI domain, that MSI_FLAG_USE_DEF_CHIP_OPS is set, and that any of .mask or .unmask are NULL in the irq_chip structure, we set them to pci_msi_[un]mask_irq. This is a bad idea for at least two reasons: - PCI_MSI might not be selected, kernel fails to build (yes, this is legitimate, at least on arm64!) - This may not be a PCI/MSI domain at all (platform MSI, for example) Either way, this looks wrong. Move the overriding of mask/unmask to the PCI counterpart, and panic is any of these two methods is not set in the core code (they really should be present). Signed-off-by: Marc Zyngier Cc: Jiang Liu Cc: Bjorn Helgaas Link: http://lkml.kernel.org/r/1444760085-27857-1-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner --- kernel/irq/msi.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 7e6512b..be9149f 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -228,11 +228,7 @@ static void msi_domain_update_chip_ops(struct msi_domain_info *info) { struct irq_chip *chip = info->chip; - BUG_ON(!chip); - if (!chip->irq_mask) - chip->irq_mask = pci_msi_mask_irq; - if (!chip->irq_unmask) - chip->irq_unmask = pci_msi_unmask_irq; + BUG_ON(!chip || !chip->irq_mask || !chip->irq_unmask); if (!chip->irq_set_affinity) chip->irq_set_affinity = msi_domain_set_affinity; } -- cgit v1.1 From 56fd16cabac9cd8f15e2902898a9d0cc96e2fa70 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 16 Oct 2015 15:50:22 +0200 Subject: timekeeping: Increment clock_was_set_seq in timekeeping_init() timekeeping_init() can set the wall time offset, so we need to increment the clock_was_set_seq counter. That way hrtimers will pick up the early offset immediately. Otherwise on a machine which does not set wall time later in the boot process the hrtimer offset is stale at 0 and wall time timers are going to expire with a delay of 45 years. Fixes: 868a3e915f7f "hrtimer: Make offset update smarter" Reported-and-tested-by: Heiko Carstens Signed-off-by: Thomas Gleixner Cc: Stefan Liebler Cc: Peter Zijlstra Cc: John Stultz --- kernel/time/timekeeping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3739ac6..44d2cc0 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1251,7 +1251,7 @@ void __init timekeeping_init(void) set_normalized_timespec64(&tmp, -boot.tv_sec, -boot.tv_nsec); tk_set_wall_to_mono(tk, tmp); - timekeeping_update(tk, TK_MIRROR); + timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); -- cgit v1.1 From f0a3b154bd7d969feaac1f4645e4177433e5f46a Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Tue, 29 Sep 2015 19:46:13 +0300 Subject: ftrace: Clarify code for mod command "Not" is too abstract variable name - changed to clear_filter. Removed ftrace_match_module_records function: comparison with !* or * not does the general code in filter_parse_regex() as it works without mod command for sh# echo '!*' > /sys/kernel/debug/tracing/set_ftrace_filter Link: http://lkml.kernel.org/r/1443545176-3215-2-git-send-email-0x7f454c46@gmail.com Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 37 ++++++++++--------------------------- 1 file changed, 10 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8892b45..fc755a4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3454,13 +3454,13 @@ static int ftrace_match(char *str, char *regex, int len, int type) } static int -enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int not) +enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int clear_filter) { struct ftrace_func_entry *entry; int ret = 0; entry = ftrace_lookup_ip(hash, rec->ip); - if (not) { + if (clear_filter) { /* Do nothing if it doesn't exist */ if (!entry) return 0; @@ -3499,8 +3499,7 @@ ftrace_match_record(struct dyn_ftrace *rec, char *mod, } static int -match_records(struct ftrace_hash *hash, char *buff, - int len, char *mod, int not) +match_records(struct ftrace_hash *hash, char *buff, int len, char *mod) { unsigned search_len = 0; struct ftrace_page *pg; @@ -3509,9 +3508,10 @@ match_records(struct ftrace_hash *hash, char *buff, char *search = buff; int found = 0; int ret; + int clear_filter; if (len) { - type = filter_parse_regex(buff, len, &search, ¬); + type = filter_parse_regex(buff, len, &search, &clear_filter); search_len = strlen(search); } @@ -3522,7 +3522,7 @@ match_records(struct ftrace_hash *hash, char *buff, do_for_each_ftrace_rec(pg, rec) { if (ftrace_match_record(rec, mod, search, search_len, type)) { - ret = enter_record(hash, rec, not); + ret = enter_record(hash, rec, clear_filter); if (ret < 0) { found = ret; goto out_unlock; @@ -3539,26 +3539,9 @@ match_records(struct ftrace_hash *hash, char *buff, static int ftrace_match_records(struct ftrace_hash *hash, char *buff, int len) { - return match_records(hash, buff, len, NULL, 0); + return match_records(hash, buff, len, NULL); } -static int -ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod) -{ - int not = 0; - - /* blank or '*' mean the same */ - if (strcmp(buff, "*") == 0) - buff[0] = 0; - - /* handle the case of 'dont filter this module' */ - if (strcmp(buff, "!") == 0 || strcmp(buff, "!*") == 0) { - buff[0] = 0; - not = 1; - } - - return match_records(hash, buff, strlen(buff), mod, not); -} /* * We register the module command as a template to show others how @@ -3567,7 +3550,7 @@ ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod) static int ftrace_mod_callback(struct ftrace_hash *hash, - char *func, char *cmd, char *param, int enable) + char *func, char *cmd, char *module, int enable) { int ret; @@ -3580,10 +3563,10 @@ ftrace_mod_callback(struct ftrace_hash *hash, */ /* we must have a module name */ - if (!param || !strlen(param)) + if (!module || !strlen(module)) return -EINVAL; - ret = ftrace_match_module_records(hash, func, param); + ret = match_records(hash, func, strlen(func), module); if (!ret) return -EINVAL; if (ret < 0) -- cgit v1.1 From fde7d22e01aa0d252fc5c95fa11f0dac35a4dd59 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Tue, 13 Oct 2015 09:18:22 +0800 Subject: sched/fair: Fix overly small weight for interactive group entities Commit: 9d89c257dfb9 ("sched/fair: Rewrite runnable load and utilization average tracking") led to an overly small weight for interactive group entities. The bad case can be easily reproduced when a number of CPU hogs compete for the CPUs at the same time (thanks to Mike). This is largly because the task group's load average tracking cross CPUs lags behind the real changes. To fix this we accelerate the group share distribution process by using the load.weight of the cfs_rq. This may increase the entire group's share, but we have to do so to protect the (fragile) interactive tasks, especially from CPU hogs. Reported-by: Mike Galbraith Tested-by: Dietmar Eggemann Tested-by: Mike Galbraith Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Acked-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1444699103-20272-1-git-send-email-yuyang.du@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6e2e348..bc62c50 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2363,7 +2363,7 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) */ tg_weight = atomic_long_read(&tg->load_avg); tg_weight -= cfs_rq->tg_load_avg_contrib; - tg_weight += cfs_rq_load_avg(cfs_rq); + tg_weight += cfs_rq->load.weight; return tg_weight; } @@ -2373,7 +2373,7 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) long tg_weight, load, shares; tg_weight = calc_tg_weight(tg, cfs_rq); - load = cfs_rq_load_avg(cfs_rq); + load = cfs_rq->load.weight; shares = (tg->shares * load); if (tg_weight) -- cgit v1.1 From 3e386d56bafbb6d2540b49367444997fc671ea69 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Tue, 13 Oct 2015 09:18:23 +0800 Subject: sched/fair: Update task group's load_avg after task migration When cfs_rq has cfs_rq->removed_load_avg set (when a task migrates from this cfs_rq), we need to update its contribution to the group's load_avg. This should not increase tg's update too much, because in most cases, the cfs_rq has already decayed its load_avg. Tested-by: Dietmar Eggemann Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Acked-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1444699103-20272-2-git-send-email-yuyang.du@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bc62c50..9a5e60f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2664,13 +2664,14 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) { - int decayed; struct sched_avg *sa = &cfs_rq->avg; + int decayed, removed = 0; if (atomic_long_read(&cfs_rq->removed_load_avg)) { long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); sa->load_avg = max_t(long, sa->load_avg - r, 0); sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0); + removed = 1; } if (atomic_long_read(&cfs_rq->removed_util_avg)) { @@ -2688,7 +2689,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) cfs_rq->load_last_update_time_copy = sa->last_update_time; #endif - return decayed; + return decayed || removed; } /* Update task and its cfs_rq load average */ -- cgit v1.1 From 0baabb385eb4bce699ddab0db015112be6cf1e6a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 12 Oct 2015 17:21:23 +0200 Subject: nohz: Revert "nohz: Set isolcpus when nohz_full is set" This reverts: 8cb9764fc88b ("nohz: Set isolcpus when nohz_full is set") We assumed that full-nohz users always want scheduler isolation on full dynticks CPUs, therefore we included full-nohz CPUs on cpu_isolated_map. This means that tasks run by default on CPUs outside the nohz_full range unless their affinity is explicity overwritten. This suits pure isolation workloads but when the machine is needed to run common workloads, the available sets of CPUs to run common tasks becomes reduced. We reach an extreme case when CONFIG_NO_HZ_FULL_ALL is enabled as it leaves only CPU 0 for non-isolation tasks, which makes people think that their supercomputer regressed to 90's UP - which is true in a sense. Some full-nohz users appear to be interested in running normal workloads either before or after an isolation workload. Full-nohz isn't optimized toward normal workloads but it's still better than UP performance. We are reaching a limitation in kernel presets here. Lets revert this cpu_isolated_map inclusion and let userspace do its own scheduler isolation using cpusets or explicit affinity settings. Reported-by: Ingo Molnar Reported-by: Mike Galbraith Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Acked-by: Thomas Gleixner Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Chris Metcalf Cc: Christoph Lameter Cc: Dave Jones Cc: Linus Torvalds Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E . McKenney Cc: Peter Zijlstra Cc: Rik van Riel Link: http://lkml.kernel.org/r/1444663283-30068-1-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 10a8faa..5bd7d60 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7238,9 +7238,6 @@ void __init sched_init_smp(void) alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); alloc_cpumask_var(&fallback_doms, GFP_KERNEL); - /* nohz_full won't take effect without isolating the cpus. */ - tick_nohz_full_add_cpus_to(cpu_isolated_map); - sched_init_numa(); /* -- cgit v1.1 From 5aa5050787f449e7eaef2c5ec93c7b357aa7dcdc Mon Sep 17 00:00:00 2001 From: Luca Abeni Date: Fri, 16 Oct 2015 10:06:21 +0200 Subject: sched/deadline: Fix migration of SCHED_DEADLINE tasks Commit: 9d5142624256 ("sched/deadline: Reduce rq lock contention by eliminating locking of non-feasible target") broke select_task_rq_dl() and find_lock_later_rq(), because it introduced a comparison between the local task's deadline and dl.earliest_dl.curr of the remote queue. However, if the remote runqueue does not contain any SCHED_DEADLINE task its earliest_dl.curr is 0 (always smaller than the deadline of the local task) and the remote runqueue is not selected for pushing. As a result, if an application creates multiple SCHED_DEADLINE threads, they will never be pushed to runqueues that do not already contain SCHED_DEADLINE tasks. This patch fixes the issue by checking if dl.dl_nr_running == 0. Signed-off-by: Luca Abeni Signed-off-by: Peter Zijlstra (Intel) Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Wanpeng Li Fixes: 9d5142624256 ("sched/deadline: Reduce rq lock contention by eliminating locking of non-feasible target") Link: http://lkml.kernel.org/r/1444982781-15608-1-git-send-email-luca.abeni@unitn.it Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index fc8f010..142df26 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1066,8 +1066,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) int target = find_later_rq(p); if (target != -1 && - dl_time_before(p->dl.deadline, - cpu_rq(target)->dl.earliest_dl.curr)) + (dl_time_before(p->dl.deadline, + cpu_rq(target)->dl.earliest_dl.curr) || + (cpu_rq(target)->dl.dl_nr_running == 0))) cpu = target; } rcu_read_unlock(); @@ -1417,7 +1418,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) later_rq = cpu_rq(cpu); - if (!dl_time_before(task->dl.deadline, + if (later_rq->dl.dl_nr_running && + !dl_time_before(task->dl.deadline, later_rq->dl.earliest_dl.curr)) { /* * Target rq has tasks of equal or earlier deadline, -- cgit v1.1 From 233e7f267e580fefdeb36628b7efe8bfe056d27c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 8 Oct 2015 16:51:31 +0200 Subject: stop_machine: Ensure that a queued callback will be called before cpu_stop_park() cpu_stop_queue_work() checks stopper->enabled before it queues the work, but ->enabled == T can only guarantee cpu_stop_signal_done() if we race with cpu_down(). This is not enough for stop_two_cpus() or stop_machine(), they will deadlock if multi_cpu_stop() won't be called by one of the target CPU's. stop_machine/stop_cpus are fine, they rely on stop_cpus_mutex. But stop_two_cpus() has to check cpu_active() to avoid the same race with hotplug, and this check is very unobvious and probably not even correct if we race with cpu_up(). Change cpu_down() pass to clear ->enabled before cpu_stopper_thread() flushes the pending ->works and returns with KTHREAD_SHOULD_PARK set. Note also that smpboot_thread_call() calls cpu_stop_unpark() which sets enabled == T at CPU_ONLINE stage, so this CPU can't go away until cpu_stopper_thread() is called at least once. This all means that if cpu_stop_queue_work() succeeds, we know that work->fn() will be called. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Tejun Heo Cc: Thomas Gleixner Cc: heiko.carstens@de.ibm.com Link: http://lkml.kernel.org/r/20151008145131.GA18139@redhat.com Signed-off-by: Ingo Molnar --- kernel/cpu.c | 2 +- kernel/stop_machine.c | 23 +++++++++++++---------- 2 files changed, 14 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 050c634..c85df27 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -344,7 +344,7 @@ static int take_cpu_down(void *_param) /* Give up timekeeping duties */ tick_handover_do_timer(); /* Park the stopper thread */ - kthread_park(current); + stop_machine_park((long)param->hcpu); return 0; } diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 12484e5..6a40209 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -452,6 +452,18 @@ repeat: } } +void stop_machine_park(int cpu) +{ + struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); + /* + * Lockless. cpu_stopper_thread() will take stopper->lock and flush + * the pending works before it parks, until then it is fine to queue + * the new works. + */ + stopper->enabled = false; + kthread_park(stopper->thread); +} + extern void sched_set_stop_task(int cpu, struct task_struct *stop); static void cpu_stop_create(unsigned int cpu) @@ -462,17 +474,8 @@ static void cpu_stop_create(unsigned int cpu) static void cpu_stop_park(unsigned int cpu) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); - struct cpu_stop_work *work, *tmp; - unsigned long flags; - /* drain remaining works */ - spin_lock_irqsave(&stopper->lock, flags); - list_for_each_entry_safe(work, tmp, &stopper->works, list) { - list_del_init(&work->list); - cpu_stop_signal_done(work->done, false); - } - stopper->enabled = false; - spin_unlock_irqrestore(&stopper->lock, flags); + WARN_ON(!list_empty(&stopper->works)); } static void cpu_stop_unpark(unsigned int cpu) -- cgit v1.1 From 5caa1c089aebcb83ccd5b79a3b88b0aa58288d05 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 8 Oct 2015 16:51:34 +0200 Subject: stop_machine: Introduce __cpu_stop_queue_work() and cpu_stop_queue_two_works() Preparation to simplify the review of the next change. Add two simple helpers, __cpu_stop_queue_work() and cpu_stop_queue_two_works() which simply take a bit of code from their callers. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Tejun Heo Cc: Thomas Gleixner Cc: heiko.carstens@de.ibm.com Link: http://lkml.kernel.org/r/20151008145134.GA18146@redhat.com Signed-off-by: Ingo Molnar --- kernel/stop_machine.c | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 6a40209..688d6b3 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -73,21 +73,24 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed) } } +static void __cpu_stop_queue_work(struct cpu_stopper *stopper, + struct cpu_stop_work *work) +{ + list_add_tail(&work->list, &stopper->works); + wake_up_process(stopper->thread); +} + /* queue @work to @stopper. if offline, @work is completed immediately */ static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); - unsigned long flags; spin_lock_irqsave(&stopper->lock, flags); - - if (stopper->enabled) { - list_add_tail(&work->list, &stopper->works); - wake_up_process(stopper->thread); - } else + if (stopper->enabled) + __cpu_stop_queue_work(stopper, work); + else cpu_stop_signal_done(work->done, false); - spin_unlock_irqrestore(&stopper->lock, flags); } @@ -213,6 +216,16 @@ static int multi_cpu_stop(void *data) return err; } +static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1, + int cpu2, struct cpu_stop_work *work2) +{ + lg_double_lock(&stop_cpus_lock, cpu1, cpu2); + cpu_stop_queue_work(cpu1, work1); + cpu_stop_queue_work(cpu2, work2); + lg_double_unlock(&stop_cpus_lock, cpu1, cpu2); + + return 0; +} /** * stop_two_cpus - stops two cpus * @cpu1: the cpu to stop @@ -260,10 +273,12 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * return -ENOENT; } - lg_double_lock(&stop_cpus_lock, cpu1, cpu2); - cpu_stop_queue_work(cpu1, &work1); - cpu_stop_queue_work(cpu2, &work2); - lg_double_unlock(&stop_cpus_lock, cpu1, cpu2); + if (cpu1 > cpu2) + swap(cpu1, cpu2); + if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2)) { + preempt_enable(); + return -ENOENT; + } preempt_enable(); -- cgit v1.1 From d8bc853582bfd81a9c08ca6922aeb01570080ccc Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 8 Oct 2015 19:01:41 +0200 Subject: stop_machine: Change cpu_stop_queue_two_works() to rely on stopper->enabled Change cpu_stop_queue_two_works() to ensure that both CPU's have stopper->enabled == T or fail otherwise. This way stop_two_cpus() no longer needs to check cpu_active() to avoid the deadlock. This patch doesn't remove these checks, we will do this later. Note: we need to take both stopper->lock's at the same time, but this will also help to remove lglock from stop_machine.c, so I hope this is fine. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Tejun Heo Cc: Thomas Gleixner Cc: heiko.carstens@de.ibm.com Link: http://lkml.kernel.org/r/20151008170141.GA25537@redhat.com Signed-off-by: Ingo Molnar --- kernel/stop_machine.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 688d6b3..91fbb10 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -219,12 +219,27 @@ static int multi_cpu_stop(void *data) static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1, int cpu2, struct cpu_stop_work *work2) { + struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1); + struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2); + int err; + lg_double_lock(&stop_cpus_lock, cpu1, cpu2); - cpu_stop_queue_work(cpu1, work1); - cpu_stop_queue_work(cpu2, work2); + spin_lock_irq(&stopper1->lock); + spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING); + + err = -ENOENT; + if (!stopper1->enabled || !stopper2->enabled) + goto unlock; + + err = 0; + __cpu_stop_queue_work(stopper1, work1); + __cpu_stop_queue_work(stopper2, work2); +unlock: + spin_unlock(&stopper2->lock); + spin_unlock_irq(&stopper1->lock); lg_double_unlock(&stop_cpus_lock, cpu1, cpu2); - return 0; + return err; } /** * stop_two_cpus - stops two cpus @@ -261,12 +276,8 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * set_state(&msdata, MULTI_STOP_PREPARE); /* - * If we observe both CPUs active we know _cpu_down() cannot yet have - * queued its stop_machine works and therefore ours will get executed - * first. Or its not either one of our CPUs that's getting unplugged, - * in which case we don't care. - * - * This relies on the stopper workqueues to be FIFO. + * We do not want to migrate to inactive CPU. FIXME: move this + * into migrate_swap_stop() callback. */ if (!cpu_active(cpu1) || !cpu_active(cpu2)) { preempt_enable(); -- cgit v1.1 From c00166d87e730088d919814020e96ffed129d0d1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 9 Oct 2015 18:00:49 +0200 Subject: stop_machine: Kill smp_hotplug_thread->pre_unpark, introduce stop_machine_unpark() 1. Change smpboot_unpark_thread() to check ->selfparking, just like smpboot_park_thread() does. 2. Introduce stop_machine_unpark() which sets ->enabled and calls kthread_unpark(). 3. Change smpboot_thread_call() and cpu_stop_init() to call stop_machine_unpark() by hand. This way: - IMO the ->selfparking logic becomes more consistent. - We can kill the smp_hotplug_thread->pre_unpark() method. - We can easily unpark the stopper thread earlier. Say, we can move stop_machine_unpark() from smpboot_thread_call() to sched_cpu_active() as Peter suggests. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Tejun Heo Cc: Thomas Gleixner Cc: heiko.carstens@de.ibm.com Link: http://lkml.kernel.org/r/20151009160049.GA10166@redhat.com Signed-off-by: Ingo Molnar --- kernel/cpu.c | 1 + kernel/smpboot.c | 5 ++--- kernel/stop_machine.c | 10 +++++++++- 3 files changed, 12 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index c85df27..6467521 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -475,6 +475,7 @@ static int smpboot_thread_call(struct notifier_block *nfb, case CPU_DOWN_FAILED: case CPU_ONLINE: + stop_machine_unpark(cpu); smpboot_unpark_threads(cpu); break; diff --git a/kernel/smpboot.c b/kernel/smpboot.c index a818cbc..d264f59 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -222,9 +222,8 @@ static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cp { struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); - if (ht->pre_unpark) - ht->pre_unpark(cpu); - kthread_unpark(tsk); + if (!ht->selfparking) + kthread_unpark(tsk); } void smpboot_unpark_threads(unsigned int cpu) diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 91fbb10..59096a5 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -513,6 +513,14 @@ static void cpu_stop_unpark(unsigned int cpu) spin_unlock_irq(&stopper->lock); } +void stop_machine_unpark(int cpu) +{ + struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); + + cpu_stop_unpark(cpu); + kthread_unpark(stopper->thread); +} + static struct smp_hotplug_thread cpu_stop_threads = { .store = &cpu_stopper.thread, .thread_should_run = cpu_stop_should_run, @@ -521,7 +529,6 @@ static struct smp_hotplug_thread cpu_stop_threads = { .create = cpu_stop_create, .setup = cpu_stop_unpark, .park = cpu_stop_park, - .pre_unpark = cpu_stop_unpark, .selfparking = true, }; @@ -537,6 +544,7 @@ static int __init cpu_stop_init(void) } BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads)); + stop_machine_unpark(raw_smp_processor_id()); stop_machine_initialized = true; return 0; } -- cgit v1.1 From f0cf16cbd0659d2dd21352da9f06f3fab7a51596 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 9 Oct 2015 18:00:51 +0200 Subject: stop_machine: Kill cpu_stop_threads->setup() and cpu_stop_unpark() Now that we always use stop_machine_unpark() to wake the stopper threas up, we can kill ->setup() and fold cpu_stop_unpark() into stop_machine_unpark(). And we do not need stopper->lock to set stopper->enabled = true. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Tejun Heo Cc: Thomas Gleixner Cc: heiko.carstens@de.ibm.com Link: http://lkml.kernel.org/r/20151009160051.GA10169@redhat.com Signed-off-by: Ingo Molnar --- kernel/stop_machine.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 59096a5..e5a09d2 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -504,20 +504,11 @@ static void cpu_stop_park(unsigned int cpu) WARN_ON(!list_empty(&stopper->works)); } -static void cpu_stop_unpark(unsigned int cpu) -{ - struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); - - spin_lock_irq(&stopper->lock); - stopper->enabled = true; - spin_unlock_irq(&stopper->lock); -} - void stop_machine_unpark(int cpu) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); - cpu_stop_unpark(cpu); + stopper->enabled = true; kthread_unpark(stopper->thread); } @@ -527,7 +518,6 @@ static struct smp_hotplug_thread cpu_stop_threads = { .thread_fn = cpu_stopper_thread, .thread_comm = "migration/%u", .create = cpu_stop_create, - .setup = cpu_stop_unpark, .park = cpu_stop_park, .selfparking = true, }; -- cgit v1.1 From 07f06cb3b5f6bd21374a48dbefdb431d71d53974 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 9 Oct 2015 18:00:54 +0200 Subject: sched: Start stopper early Ensure the stopper thread is active 'early', because the load balancer pretty much assumes that its available. And when 'online && active' the load-balancer is fully available. Not only the numa balancing stop_two_cpus() caller relies on it, but also the self migration stuff does, and at CPU_ONLINE time the cpu really is 'free' to run anything. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Tejun Heo Cc: Thomas Gleixner Cc: heiko.carstens@de.ibm.com Link: http://lkml.kernel.org/r/20151009160054.GA10176@redhat.com Signed-off-by: Ingo Molnar --- kernel/cpu.c | 1 - kernel/sched/core.c | 12 +++++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 6467521..c85df27 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -475,7 +475,6 @@ static int smpboot_thread_call(struct notifier_block *nfb, case CPU_DOWN_FAILED: case CPU_ONLINE: - stop_machine_unpark(cpu); smpboot_unpark_threads(cpu); break; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f45a7c7..7ee8cae 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5545,21 +5545,27 @@ static void set_cpu_rq_start_time(void) static int sched_cpu_active(struct notifier_block *nfb, unsigned long action, void *hcpu) { + int cpu = (long)hcpu; + switch (action & ~CPU_TASKS_FROZEN) { case CPU_STARTING: set_cpu_rq_start_time(); return NOTIFY_OK; + case CPU_ONLINE: /* * At this point a starting CPU has marked itself as online via * set_cpu_online(). But it might not yet have marked itself * as active, which is essential from here on. - * - * Thus, fall-through and help the starting CPU along. */ + set_cpu_active(cpu, true); + stop_machine_unpark(cpu); + return NOTIFY_OK; + case CPU_DOWN_FAILED: - set_cpu_active((long)hcpu, true); + set_cpu_active(cpu, true); return NOTIFY_OK; + default: return NOTIFY_DONE; } -- cgit v1.1 From 62694cd51322262a9142e946915fc4783113ccff Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 9 Oct 2015 18:36:29 +0200 Subject: sched: Move cpu_active() tests from stop_two_cpus() into migrate_swap_stop() The cpu_active() tests are not fundamentally part of stop_two_cpus(), move then into the scheduler where they belong. Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 ++++ kernel/stop_machine.c | 9 --------- 2 files changed, 4 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7ee8cae..a7b368e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1335,12 +1335,16 @@ static int migrate_swap_stop(void *data) struct rq *src_rq, *dst_rq; int ret = -EAGAIN; + if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu)) + return -EAGAIN; + src_rq = cpu_rq(arg->src_cpu); dst_rq = cpu_rq(arg->dst_cpu); double_raw_lock(&arg->src_task->pi_lock, &arg->dst_task->pi_lock); double_rq_lock(src_rq, dst_rq); + if (task_cpu(arg->dst_task) != arg->dst_cpu) goto unlock; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index e5a09d2..867bc20 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -275,15 +275,6 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * cpu_stop_init_done(&done, 2); set_state(&msdata, MULTI_STOP_PREPARE); - /* - * We do not want to migrate to inactive CPU. FIXME: move this - * into migrate_swap_stop() callback. - */ - if (!cpu_active(cpu1) || !cpu_active(cpu2)) { - preempt_enable(); - return -ENOENT; - } - if (cpu1 > cpu2) swap(cpu1, cpu2); if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2)) { -- cgit v1.1 From e73e85f0593832aa583b252f9a16cf90ed6d30fa Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 10 Oct 2015 20:53:15 +0200 Subject: sched: Don't scan all-offline ->cpus_allowed twice if !CONFIG_CPUSETS If CONFIG_CPUSETS=n then "case cpuset" changes the state and runs the already failed for_each_cpu() loop again for no reason. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Tejun Heo Cc: Thomas Gleixner Cc: Vitaly Kuznetsov Cc: heiko.carstens@de.ibm.com Link: http://lkml.kernel.org/r/20151010185315.GA24100@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a7b368e..b4d263d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1580,13 +1580,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p) goto out; } + /* No more Mr. Nice Guy. */ switch (state) { case cpuset: - /* No more Mr. Nice Guy. */ - cpuset_cpus_allowed_fallback(p); - state = possible; - break; - + if (IS_ENABLED(CONFIG_CPUSETS)) { + cpuset_cpus_allowed_fallback(p); + state = possible; + break; + } + /* fall-through */ case possible: do_set_cpus_allowed(p, cpu_possible_mask); state = fail; -- cgit v1.1 From a2d7629048322ae62bff57f34f5f995e25ed234c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 20 Oct 2015 11:38:08 -0400 Subject: tracing: Have stack tracer force RCU to be watching The stack tracer was triggering the WARN_ON() in module.c: static void module_assert_mutex_or_preempt(void) { #ifdef CONFIG_LOCKDEP if (unlikely(!debug_locks)) return; WARN_ON(!rcu_read_lock_sched_held() && !lockdep_is_held(&module_mutex)); #endif } The reason is that the stack tracer traces all function calls, and some of those calls happen while exiting or entering user space and idle. Some of these functions are called after RCU had already stopped watching, as RCU does not watch userspace or idle CPUs. If a max stack is hit, then the save_stack_trace() is called, which will check module addresses and call module_assert_mutex_or_preempt(), and then trigger the warning. Sad part is, the warning itself will also do a stack trace and tigger the same warning. That probably should be fixed. The warning was added by 0be964be0d45 "module: Sanitize RCU usage and locking" but this bug has probably been around longer. But it's unlikely to cause much harm, but the new warning causes the system to lock up. Cc: stable@vger.kernel.org # 4.2+ Cc: Peter Zijlstra Cc:"Paul E. McKenney" Signed-off-by: Steven Rostedt --- kernel/trace/trace_stack.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index b746399..5f29402 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -88,6 +88,12 @@ check_stack(unsigned long ip, unsigned long *stack) local_irq_save(flags); arch_spin_lock(&max_stack_lock); + /* + * RCU may not be watching, make it see us. + * The stack trace code uses rcu_sched. + */ + rcu_irq_enter(); + /* In case another CPU set the tracer_frame on us */ if (unlikely(!frame_size)) this_size -= tracer_frame; @@ -169,6 +175,7 @@ check_stack(unsigned long ip, unsigned long *stack) } out: + rcu_irq_exit(); arch_spin_unlock(&max_stack_lock); local_irq_restore(flags); } -- cgit v1.1 From 3ba009297149fa45956c33ab5de7c5f4da1f28b8 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Tue, 29 Sep 2015 19:46:14 +0300 Subject: ftrace: Introduce ftrace_glob structure ftrace_match parameters are very related and I reduce the number of local variables & parameters with it. This is also preparation for module globbing as it would introduce more realated variables & parameters. Link: http://lkml.kernel.org/r/1443545176-3215-3-git-send-email-0x7f454c46@gmail.com Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> [ Made some formatting changes ] Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 84 ++++++++++++++++++++++++++++----------------------- 1 file changed, 46 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index fc755a4..450a5f5 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3425,27 +3425,35 @@ ftrace_notrace_open(struct inode *inode, struct file *file) inode, file); } -static int ftrace_match(char *str, char *regex, int len, int type) +/* Type for quick search ftrace basic regexes (globs) from filter_parse_regex */ +struct ftrace_glob { + char *search; + unsigned len; + int type; +}; + +static int ftrace_match(char *str, struct ftrace_glob *g) { int matched = 0; int slen; - switch (type) { + switch (g->type) { case MATCH_FULL: - if (strcmp(str, regex) == 0) + if (strcmp(str, g->search) == 0) matched = 1; break; case MATCH_FRONT_ONLY: - if (strncmp(str, regex, len) == 0) + if (strncmp(str, g->search, g->len) == 0) matched = 1; break; case MATCH_MIDDLE_ONLY: - if (strstr(str, regex)) + if (strstr(str, g->search)) matched = 1; break; case MATCH_END_ONLY: slen = strlen(str); - if (slen >= len && memcmp(str + slen - len, regex, len) == 0) + if (slen >= g->len && + memcmp(str + slen - g->len, g->search, g->len) == 0) matched = 1; break; } @@ -3477,8 +3485,8 @@ enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int clear_filter) } static int -ftrace_match_record(struct dyn_ftrace *rec, char *mod, - char *regex, int len, int type) +ftrace_match_record(struct dyn_ftrace *rec, + char *mod, struct ftrace_glob *func_g) { char str[KSYM_SYMBOL_LEN]; char *modname; @@ -3491,28 +3499,27 @@ ftrace_match_record(struct dyn_ftrace *rec, char *mod, return 0; /* blank search means to match all funcs in the mod */ - if (!len) + if (!func_g->len) return 1; } - return ftrace_match(str, regex, len, type); + return ftrace_match(str, func_g); } static int -match_records(struct ftrace_hash *hash, char *buff, int len, char *mod) +match_records(struct ftrace_hash *hash, char *func, int len, char *mod) { - unsigned search_len = 0; struct ftrace_page *pg; struct dyn_ftrace *rec; - int type = MATCH_FULL; - char *search = buff; + struct ftrace_glob func_g = { .type = MATCH_FULL }; int found = 0; int ret; int clear_filter; if (len) { - type = filter_parse_regex(buff, len, &search, &clear_filter); - search_len = strlen(search); + func_g.type = filter_parse_regex(func, len, &func_g.search, + &clear_filter); + func_g.len = strlen(func_g.search); } mutex_lock(&ftrace_lock); @@ -3521,7 +3528,7 @@ match_records(struct ftrace_hash *hash, char *buff, int len, char *mod) goto out_unlock; do_for_each_ftrace_rec(pg, rec) { - if (ftrace_match_record(rec, mod, search, search_len, type)) { + if (ftrace_match_record(rec, mod, &func_g)) { ret = enter_record(hash, rec, clear_filter); if (ret < 0) { found = ret; @@ -3682,19 +3689,20 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, { struct ftrace_ops_hash old_hash_ops; struct ftrace_func_probe *entry; + struct ftrace_glob func_g; struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; struct ftrace_hash *old_hash = *orig_hash; struct ftrace_hash *hash; struct ftrace_page *pg; struct dyn_ftrace *rec; - int type, len, not; + int not; unsigned long key; int count = 0; - char *search; int ret; - type = filter_parse_regex(glob, strlen(glob), &search, ¬); - len = strlen(search); + func_g.type = filter_parse_regex(glob, strlen(glob), + &func_g.search, ¬); + func_g.len = strlen(func_g.search); /* we do not support '!' for function probes */ if (WARN_ON(not)) @@ -3721,7 +3729,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, do_for_each_ftrace_rec(pg, rec) { - if (!ftrace_match_record(rec, NULL, search, len, type)) + if (!ftrace_match_record(rec, NULL, &func_g)) continue; entry = kmalloc(sizeof(*entry), GFP_KERNEL); @@ -3794,24 +3802,24 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, struct ftrace_func_entry *rec_entry; struct ftrace_func_probe *entry; struct ftrace_func_probe *p; + struct ftrace_glob func_g; struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; struct ftrace_hash *old_hash = *orig_hash; struct list_head free_list; struct ftrace_hash *hash; struct hlist_node *tmp; char str[KSYM_SYMBOL_LEN]; - int type = MATCH_FULL; - int i, len = 0; - char *search; - int ret; + int i, ret; if (glob && (strcmp(glob, "*") == 0 || !strlen(glob))) - glob = NULL; + func_g.search = NULL; else if (glob) { int not; - type = filter_parse_regex(glob, strlen(glob), &search, ¬); - len = strlen(search); + func_g.type = filter_parse_regex(glob, strlen(glob), + &func_g.search, ¬); + func_g.len = strlen(func_g.search); + func_g.search = glob; /* we do not support '!' for function probes */ if (WARN_ON(not)) @@ -3840,10 +3848,10 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, continue; /* do this last, since it is the most expensive */ - if (glob) { + if (func_g.search) { kallsyms_lookup(entry->ip, NULL, NULL, NULL, str); - if (!ftrace_match(str, glob, len, type)) + if (!ftrace_match(str, &func_g)) continue; } @@ -3872,7 +3880,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, ftrace_free_entry(entry); } mutex_unlock(&ftrace_lock); - + out_unlock: mutex_unlock(&trace_probe_ops.func_hash->regex_lock); free_ftrace_hash(hash); @@ -4588,21 +4596,21 @@ ftrace_graph_release(struct inode *inode, struct file *file) static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer) { + struct ftrace_glob func_g; struct dyn_ftrace *rec; struct ftrace_page *pg; - int search_len; int fail = 1; - int type, not; - char *search; + int not; bool exists; int i; /* decode regex */ - type = filter_parse_regex(buffer, strlen(buffer), &search, ¬); + func_g.type = filter_parse_regex(buffer, strlen(buffer), + &func_g.search, ¬); if (!not && *idx >= size) return -EBUSY; - search_len = strlen(search); + func_g.len = strlen(func_g.search); mutex_lock(&ftrace_lock); @@ -4613,7 +4621,7 @@ ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer) do_for_each_ftrace_rec(pg, rec) { - if (ftrace_match_record(rec, NULL, search, search_len, type)) { + if (ftrace_match_record(rec, NULL, &func_g)) { /* if it is in the array */ exists = false; for (i = 0; i < *idx; i++) { -- cgit v1.1 From 0b507e1ed1b7364def464cfb348ea7c9e87e6e18 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Tue, 29 Sep 2015 19:46:15 +0300 Subject: ftrace: add module globbing Extend module command for function filter selection with globbing. It uses the same globbing as function filter. sh# echo '*alloc*:mod:*' > set_ftrace_filter Will trace any function with the letters 'alloc' in the name in any module but not in kernel. sh# echo '!*alloc*:mod:ipv6' >> set_ftrace_filter Will prevent from tracing functions with 'alloc' in the name from module ipv6 (do not forget to append to set_ftrace_filter file). sh# echo '*alloc*:mod:!ipv6' > set_ftrace_filter Will trace functions with 'alloc' in the name from kernel and any module except ipv6. sh# echo '*alloc*:mod:!*' > set_ftrace_filter Will trace any function with the letters 'alloc' in the name only from kernel, but not from any module. sh# echo '*:mod:!*' > set_ftrace_filter or sh# echo ':mod:!' > set_ftrace_filter Will trace every function in the kernel, but will not trace functions from any module. sh# echo '*:mod:*' > set_ftrace_filter or sh# echo ':mod:' > set_ftrace_filter As the opposite will trace all functions from all modules, but not from kernel. sh# echo '*:mod:*snd*' > set_ftrace_filter Will trace your sound drivers only (if any). Link: http://lkml.kernel.org/r/1443545176-3215-4-git-send-email-0x7f454c46@gmail.com Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> [ Made format changes ] Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 51 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 450a5f5..ea27250 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3485,19 +3485,37 @@ enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int clear_filter) } static int -ftrace_match_record(struct dyn_ftrace *rec, - char *mod, struct ftrace_glob *func_g) +ftrace_match_record(struct dyn_ftrace *rec, struct ftrace_glob *func_g, + struct ftrace_glob *mod_g, int exclude_mod) { char str[KSYM_SYMBOL_LEN]; char *modname; kallsyms_lookup(rec->ip, NULL, NULL, &modname, str); - if (mod) { - /* module lookup requires matching the module */ - if (!modname || strcmp(modname, mod)) + if (mod_g) { + int mod_matches = (modname) ? ftrace_match(modname, mod_g) : 0; + + /* blank module name to match all modules */ + if (!mod_g->len) { + /* blank module globbing: modname xor exclude_mod */ + if ((!exclude_mod) != (!modname)) + goto func_match; + return 0; + } + + /* not matching the module */ + if (!modname || !mod_matches) { + if (exclude_mod) + goto func_match; + else + return 0; + } + + if (mod_matches && exclude_mod) return 0; +func_match: /* blank search means to match all funcs in the mod */ if (!func_g->len) return 1; @@ -3512,23 +3530,32 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod) struct ftrace_page *pg; struct dyn_ftrace *rec; struct ftrace_glob func_g = { .type = MATCH_FULL }; + struct ftrace_glob mod_g = { .type = MATCH_FULL }; + struct ftrace_glob *mod_match = (mod) ? &mod_g : NULL; + int exclude_mod = 0; int found = 0; int ret; int clear_filter; - if (len) { + if (func) { func_g.type = filter_parse_regex(func, len, &func_g.search, &clear_filter); func_g.len = strlen(func_g.search); } + if (mod) { + mod_g.type = filter_parse_regex(mod, strlen(mod), + &mod_g.search, &exclude_mod); + mod_g.len = strlen(mod_g.search); + } + mutex_lock(&ftrace_lock); if (unlikely(ftrace_disabled)) goto out_unlock; do_for_each_ftrace_rec(pg, rec) { - if (ftrace_match_record(rec, mod, &func_g)) { + if (ftrace_match_record(rec, &func_g, mod_match, exclude_mod)) { ret = enter_record(hash, rec, clear_filter); if (ret < 0) { found = ret; @@ -3568,17 +3595,11 @@ ftrace_mod_callback(struct ftrace_hash *hash, * you can tell which command was used by the cmd * parameter. */ - - /* we must have a module name */ - if (!module || !strlen(module)) - return -EINVAL; - ret = match_records(hash, func, strlen(func), module); if (!ret) return -EINVAL; if (ret < 0) return ret; - return 0; } @@ -3729,7 +3750,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, do_for_each_ftrace_rec(pg, rec) { - if (!ftrace_match_record(rec, NULL, &func_g)) + if (!ftrace_match_record(rec, &func_g, NULL, 0)) continue; entry = kmalloc(sizeof(*entry), GFP_KERNEL); @@ -4621,7 +4642,7 @@ ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer) do_for_each_ftrace_rec(pg, rec) { - if (ftrace_match_record(rec, NULL, &func_g)) { + if (ftrace_match_record(rec, &func_g, NULL, 0)) { /* if it is in the array */ exists = false; for (i = 0; i < *idx; i++) { -- cgit v1.1 From 1904be1b6bb92058c8e00063dd59df2df294e258 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 20 Oct 2015 21:48:02 -0400 Subject: tracing: Do not allow stack_tracer to record stack in NMI The code in stack tracer should not be executed within an NMI as it grabs spinlocks and stack tracing an NMI gives the possibility of causing a deadlock. Although this is safe on x86_64, because it does not perform stack traces when the task struct stack is not in use (interrupts and NMIs), it may be an issue for NMIs on i386 and other archs that use the same stack as the NMI. Signed-off-by: Steven Rostedt --- kernel/trace/trace_stack.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 5f29402..8abf1ba 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -85,6 +85,10 @@ check_stack(unsigned long ip, unsigned long *stack) if (!object_is_on_stack(stack)) return; + /* Can't do this from NMI context (can cause deadlocks) */ + if (in_nmi()) + return; + local_irq_save(flags); arch_spin_lock(&max_stack_lock); -- cgit v1.1 From ddd70280bf0e92ad81a9526971409603fba21679 Mon Sep 17 00:00:00 2001 From: Tal Shorer Date: Sat, 1 Aug 2015 15:27:58 +0300 Subject: tracing: gpio: Add Kconfig option for enabling/disabling trace events Add a new options to trace Kconfig, CONFIG_TRACING_EVENTS_GPIO, that is used for enabling/disabling compilation of gpio function trace events. Link: http://lkml.kernel.org/r/1438432079-11704-4-git-send-email-tal.shorer@gmail.com Acked-by: Linus Walleij Signed-off-by: Tal Shorer Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 1153c43..8d6363f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -635,6 +635,13 @@ config TRACE_ENUM_MAP_FILE If unsure, say N +config TRACING_EVENTS_GPIO + bool "Trace gpio events" + depends on GPIOLIB + default y + help + Enable tracing events for gpio subsystem + endif # FTRACE endif # TRACING_SUPPORT -- cgit v1.1 From 53b90c0c56b502056da83d768047dcf765bac9fb Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 19 Oct 2015 11:17:47 +0200 Subject: kexec/crash: Say which char is the unrecognized It is helpful when the crashkernel cmdline parsing routines actually say which character is the unrecognized one. Make them do so. Signed-off-by: Borislav Petkov Reviewed-by: Dave Young Reviewed-by: Joerg Roedel Cc: Andrew Morton Cc: Andy Lutomirski Cc: Baoquan He Cc: H. Peter Anvin Cc: Jiri Kosina Cc: Juergen Gross Cc: Linus Torvalds Cc: Mark Salter Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vivek Goyal Cc: WANG Chao Cc: jerry_hoemann@hp.com Link: http://lkml.kernel.org/r/1445246268-26285-8-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- kernel/kexec_core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 201b453..bd9f8a0 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1149,7 +1149,7 @@ static int __init parse_crashkernel_simple(char *cmdline, if (*cur == '@') *crash_base = memparse(cur+1, &cur); else if (*cur != ' ' && *cur != '\0') { - pr_warn("crashkernel: unrecognized char\n"); + pr_warn("crashkernel: unrecognized char: %c\n", *cur); return -EINVAL; } @@ -1186,12 +1186,12 @@ static int __init parse_crashkernel_suffix(char *cmdline, /* check with suffix */ if (strncmp(cur, suffix, strlen(suffix))) { - pr_warn("crashkernel: unrecognized char\n"); + pr_warn("crashkernel: unrecognized char: %c\n", *cur); return -EINVAL; } cur += strlen(suffix); if (*cur != ' ' && *cur != '\0') { - pr_warn("crashkernel: unrecognized char\n"); + pr_warn("crashkernel: unrecognized char: %c\n", *cur); return -EINVAL; } -- cgit v1.1 From 3061692921f2d701bb09699d16ed780903dd54e2 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Fri, 16 Oct 2015 16:04:49 +0300 Subject: tracing: Remove {start,stop}_branch_trace Both start_branch_trace() and stop_branch_trace() are used in only one location, and are both static. As they are small functions there is no need to keep them separated out. Link: http://lkml.kernel.org/r/1445000689-32596-1-git-send-email-0x7f454c46@gmail.com Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_branch.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index e2e12ad..3a2a737 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -125,25 +125,14 @@ void disable_branch_tracing(void) mutex_unlock(&branch_tracing_mutex); } -static void start_branch_trace(struct trace_array *tr) -{ - enable_branch_tracing(tr); -} - -static void stop_branch_trace(struct trace_array *tr) -{ - disable_branch_tracing(); -} - static int branch_trace_init(struct trace_array *tr) { - start_branch_trace(tr); - return 0; + return enable_branch_tracing(tr); } static void branch_trace_reset(struct trace_array *tr) { - stop_branch_trace(tr); + disable_branch_tracing(); } static enum print_line_t trace_branch_print(struct trace_iterator *iter, -- cgit v1.1 From 48dbc164b40dd9195dea8cd966e394819e420b64 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Wed, 21 Oct 2015 14:04:47 +0100 Subject: certs: add .gitignore to stop git nagging about x509_certificate_list Currently we see this in "git status" if we build in the source dir: Untracked files: (use "git add ..." to include in what will be committed) certs/x509_certificate_list It looks like it used to live in kernel/ so we squash that .gitignore entry at the same time. I didn't bother to dig through git history to see when it moved, since it is just a minor annoyance at most. Cc: David Woodhouse Cc: keyrings@linux-nfs.org Signed-off-by: Paul Gortmaker Signed-off-by: David Howells --- kernel/.gitignore | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/.gitignore b/kernel/.gitignore index 790d83c..b3097bd 100644 --- a/kernel/.gitignore +++ b/kernel/.gitignore @@ -5,4 +5,3 @@ config_data.h config_data.gz timeconst.h hz.bc -x509_certificate_list -- cgit v1.1 From 146aa8b1453bd8f1ff2304ffb71b4ee0eb9acdcc Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 21 Oct 2015 14:04:48 +0100 Subject: KEYS: Merge the type-specific data with the payload data Merge the type-specific data with the payload data into one four-word chunk as it seems pointless to keep them separate. Use user_key_payload() for accessing the payloads of overloaded user-defined keys. Signed-off-by: David Howells cc: linux-cifs@vger.kernel.org cc: ecryptfs@vger.kernel.org cc: linux-ext4@vger.kernel.org cc: linux-f2fs-devel@lists.sourceforge.net cc: linux-nfs@vger.kernel.org cc: ceph-devel@vger.kernel.org cc: linux-ima-devel@lists.sourceforge.net --- kernel/module_signing.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/module_signing.c b/kernel/module_signing.c index bd62f5c..6528a79 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -10,6 +10,7 @@ */ #include +#include #include #include #include "module-internal.h" -- cgit v1.1 From 58c9c87ca9dca99f269267f5aadcd051fef1637b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 22 Oct 2015 14:34:57 +0200 Subject: genirq: Make the cpuhotplug migration code less noisy The original arm code has a pr_debug() statement for the case where the irq chip has no set_affinity() callback. That's sufficient for debugging and we really don't want to spam dmesg with useless warnings for the normal case. Fixes: f1e0bb0ad473: "genirq: Introduce generic irq migration for cpu hotunplug" Reported-by: Geert Uytterhoeven Requested-by: Russell King Signed-off-by: Thomas Gleixner Cc: Yang Yingliang Cc: Mark Rutland Cc: Marc Zyngier Cc: Will Deacon Cc: Hanjun Guo Cc: Jiang Liu --- kernel/irq/cpuhotplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 80f4f4e..011f8c4 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -36,7 +36,7 @@ static bool migrate_one_irq(struct irq_desc *desc) c = irq_data_get_irq_chip(d); if (!c->irq_set_affinity) { - pr_warn_ratelimited("IRQ%u: unable to set affinity\n", d->irq); + pr_debug("IRQ%u: unable to set affinity\n", d->irq); } else { int r = irq_do_set_affinity(d, affinity, false); if (r) -- cgit v1.1 From fa128e6a148a0a58355bd6814c6283515bbd028a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 20 Oct 2015 20:02:33 -0700 Subject: perf: pad raw data samples automatically Instead of WARN_ON in perf_event_output() on unpaded raw samples, pad them automatically. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/events/core.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index b11756f..64754bf 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5286,9 +5286,15 @@ void perf_output_sample(struct perf_output_handle *handle, if (sample_type & PERF_SAMPLE_RAW) { if (data->raw) { - perf_output_put(handle, data->raw->size); - __output_copy(handle, data->raw->data, - data->raw->size); + u32 raw_size = data->raw->size; + u32 real_size = round_up(raw_size + sizeof(u32), + sizeof(u64)) - sizeof(u32); + u64 zero = 0; + + perf_output_put(handle, real_size); + __output_copy(handle, data->raw->data, raw_size); + if (real_size - raw_size) + __output_copy(handle, &zero, real_size - raw_size); } else { struct { u32 size; @@ -5420,8 +5426,7 @@ void perf_prepare_sample(struct perf_event_header *header, else size += sizeof(u32); - WARN_ON_ONCE(size & (sizeof(u64)-1)); - header->size += size; + header->size += round_up(size, sizeof(u64)); } if (sample_type & PERF_SAMPLE_BRANCH_STACK) { -- cgit v1.1 From a43eec304259a6c637f4014a6d4767159b6a3aa3 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 20 Oct 2015 20:02:34 -0700 Subject: bpf: introduce bpf_perf_event_output() helper This helper is used to send raw data from eBPF program into special PERF_TYPE_SOFTWARE/PERF_COUNT_SW_BPF_OUTPUT perf_event. User space needs to perf_event_open() it (either for one or all cpus) and store FD into perf_event_array (similar to bpf_perf_event_read() helper) before eBPF program can send data into it. Today the programs triggered by kprobe collect the data and either store it into the maps or print it via bpf_trace_printk() where latter is the debug facility and not suitable to stream the data. This new helper replaces such bpf_trace_printk() usage and allows programs to have dedicated channel into user space for post-processing of the raw data collected. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/arraymap.c | 2 ++ kernel/bpf/verifier.c | 3 ++- kernel/trace/bpf_trace.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 50 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index f2d9e69..e3cfe46 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -295,6 +295,8 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd) return (void *)attr; if (attr->type != PERF_TYPE_RAW && + !(attr->type == PERF_TYPE_SOFTWARE && + attr->config == PERF_COUNT_SW_BPF_OUTPUT) && attr->type != PERF_TYPE_HARDWARE) { perf_event_release_kernel(event); return ERR_PTR(-EINVAL); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1d6b97b..b56cf51 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -245,6 +245,7 @@ static const struct { } func_limit[] = { {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call}, {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read}, + {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_output}, }; static void print_verifier_state(struct verifier_env *env) @@ -910,7 +911,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) * don't allow any other map type to be passed into * the special func; */ - if (bool_map != bool_func) + if (bool_func && bool_map != bool_func) return -EINVAL; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0fe96c7..47febbe 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -215,6 +215,50 @@ const struct bpf_func_proto bpf_perf_event_read_proto = { .arg2_type = ARG_ANYTHING, }; +static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size) +{ + struct pt_regs *regs = (struct pt_regs *) (long) r1; + struct bpf_map *map = (struct bpf_map *) (long) r2; + struct bpf_array *array = container_of(map, struct bpf_array, map); + void *data = (void *) (long) r4; + struct perf_sample_data sample_data; + struct perf_event *event; + struct perf_raw_record raw = { + .size = size, + .data = data, + }; + + if (unlikely(index >= array->map.max_entries)) + return -E2BIG; + + event = (struct perf_event *)array->ptrs[index]; + if (unlikely(!event)) + return -ENOENT; + + if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE || + event->attr.config != PERF_COUNT_SW_BPF_OUTPUT)) + return -EINVAL; + + if (unlikely(event->oncpu != smp_processor_id())) + return -EOPNOTSUPP; + + perf_sample_data_init(&sample_data, 0, 0); + sample_data.raw = &raw; + perf_event_output(event, &sample_data, regs); + return 0; +} + +static const struct bpf_func_proto bpf_perf_event_output_proto = { + .func = bpf_perf_event_output, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_STACK, + .arg5_type = ARG_CONST_STACK_SIZE, +}; + static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -242,6 +286,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_get_smp_processor_id_proto; case BPF_FUNC_perf_event_read: return &bpf_perf_event_read_proto; + case BPF_FUNC_perf_event_output: + return &bpf_perf_event_output_proto; default: return NULL; } -- cgit v1.1 From ee1d267423a1f8041e2b1a33fc23e4393c67677e Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 20 Oct 2015 00:39:03 -0700 Subject: pstore: add pstore unregister pstore doesn't support unregistering yet. It was marked as TODO. This patch adds some code to fix it: 1) Add functions to unregister kmsg/console/ftrace/pmsg. 2) Add a function to free compression buffer. 3) Unmap the memory and free it. 4) Add a function to unregister pstore filesystem. Signed-off-by: Geliang Tang Acked-by: Kees Cook [Removed __exit annotation from ramoops_remove(). Reported by Arnd Bergmann] Signed-off-by: Tony Luck --- kernel/printk/printk.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 8f0324e..b16f354 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -517,6 +517,7 @@ int check_syslog_permissions(int type, int source) ok: return security_syslog(type); } +EXPORT_SYMBOL_GPL(check_syslog_permissions); static void append_char(char **pp, char *e, char c) { -- cgit v1.1 From 5211613978cb7353a3237e4372958c0e7514683f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 22 Oct 2015 13:32:08 -0700 Subject: kmod: don't run async usermode helper as a child of kworker thread call_usermodehelper_exec_sync() does fork() + wait() with "unignored" SIGCHLD. What we have missed is that this worker thread can have other children previously forked by call_usermodehelper_exec_work() without UMH_WAIT_PROC. If such a child exits in between it becomes a zombie because auto-reaping only works if SIGCHLD is ignored, and nobody can reap it (unless/until this worker thread exits too). Change the !UMH_WAIT_PROC case to use CLONE_PARENT. Note: this is only first step. All PF_KTHREAD tasks, even created by kernel_thread() should have ->parent == kthreadd by default. Fixes: bb304a5c6fc63d8506c ("kmod: handle UMH_WAIT_PROC from system unbound workqueue") Signed-off-by: Oleg Nesterov Acked-by: Frederic Weisbecker Cc: Rik van Riel Cc: Christoph Lameter Cc: Tejun Heo Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index da98d05..0277d12 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -327,9 +327,13 @@ static void call_usermodehelper_exec_work(struct work_struct *work) call_usermodehelper_exec_sync(sub_info); } else { pid_t pid; - + /* + * Use CLONE_PARENT to reparent it to kthreadd; we do not + * want to pollute current->children, and we need a parent + * that always ignores SIGCHLD to ensure auto-reaping. + */ pid = kernel_thread(call_usermodehelper_exec_async, sub_info, - SIGCHLD); + CLONE_PARENT | SIGCHLD); if (pid < 0) { sub_info->retval = pid; umh_complete(sub_info); -- cgit v1.1 From 0aaafaabfcba8aa991913cd3280a5dbf7f111a2a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 23 Oct 2015 11:50:08 +0200 Subject: sched/core: Add missing lockdep_unpin() annotations Luca and Wanpeng reported two missing annotations that led to false lockdep complaints. Add the missing annotations. Reported-by: Luca Abeni Reported-by: Wanpeng Li Signed-off-by: Peter Zijlstra (Intel) Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: cbce1a686700 ("sched,lockdep: Employ lock pinning") Link: http://lkml.kernel.org/r/20151023095008.GY17308@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 9 ++++++++- kernel/sched/deadline.c | 9 ++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5bd7d60..bcd214e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2366,8 +2366,15 @@ void wake_up_new_task(struct task_struct *p) trace_sched_wakeup_new(p); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP - if (p->sched_class->task_woken) + if (p->sched_class->task_woken) { + /* + * Nothing relies on rq->lock after this, so its fine to + * drop it. + */ + lockdep_unpin_lock(&rq->lock); p->sched_class->task_woken(rq, p); + lockdep_pin_lock(&rq->lock); + } #endif task_rq_unlock(rq, p, &flags); } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 142df26..8b0a15e 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -668,8 +668,15 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * Queueing this task back might have overloaded rq, check if we need * to kick someone away. */ - if (has_pushable_dl_tasks(rq)) + if (has_pushable_dl_tasks(rq)) { + /* + * Nothing relies on rq->lock after this, so its safe to drop + * rq->lock. + */ + lockdep_unpin_lock(&rq->lock); push_dl_task(rq); + lockdep_pin_lock(&rq->lock); + } #endif unlock: -- cgit v1.1 From 03f136a2074b2b8890da4a24df7104558ad0da48 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 14 Jul 2015 19:24:45 +0200 Subject: timeconst: Update path in comment Signed-off-by: Jason A. Donenfeld Cc: hofrat@osadl.org Link: http://lkml.kernel.org/r/1436894685-5868-1-git-send-email-Jason@zx2c4.com Signed-off-by: Thomas Gleixner --- kernel/time/timeconst.bc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timeconst.bc b/kernel/time/timeconst.bc index c7388de..c486889 100644 --- a/kernel/time/timeconst.bc +++ b/kernel/time/timeconst.bc @@ -39,7 +39,7 @@ define fmuls(b,n,d) { } define timeconst(hz) { - print "/* Automatically generated by kernel/timeconst.bc */\n" + print "/* Automatically generated by kernel/time/timeconst.bc */\n" print "/* Time conversion constants for HZ == ", hz, " */\n" print "\n" -- cgit v1.1 From 7904b5c4988e18b50056b5e71a3ffca752a8a451 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 22 Sep 2015 17:13:19 -0400 Subject: tracepoint: Give priority to probes of tracepoints In order to guarantee that a probe will be called before other probes that are attached to a tracepoint, there needs to be a mechanism to provide priority of one probe over the others. Adding a prio field to the struct tracepoint_func, which lets the probes be sorted by the priority set in the structure. If no priority is specified, then a priority of 10 is given (this is a macro, and perhaps may be changed in the future). Now probes may be added to affect other probes that are attached to a tracepoint with a guaranteed order. One use case would be to allow tracing of tracepoints be able to filter by pid. A special (higher priority probe) may be added to the sched_switch tracepoint and set the necessary flags of the other tracepoints to notify them if they should be traced or not. In case a tracepoint is enabled at the sched_switch tracepoint too, the order of the two are not random. Cc: Mathieu Desnoyers Signed-off-by: Steven Rostedt --- kernel/tracepoint.c | 61 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 50 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 3490407..ecd536d 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -91,11 +91,13 @@ static void debug_print_probes(struct tracepoint_func *funcs) printk(KERN_DEBUG "Probe %d : %p\n", i, funcs[i].func); } -static struct tracepoint_func *func_add(struct tracepoint_func **funcs, - struct tracepoint_func *tp_func) +static struct tracepoint_func * +func_add(struct tracepoint_func **funcs, struct tracepoint_func *tp_func, + int prio) { - int nr_probes = 0; struct tracepoint_func *old, *new; + int nr_probes = 0; + int pos = -1; if (WARN_ON(!tp_func->func)) return ERR_PTR(-EINVAL); @@ -104,18 +106,33 @@ static struct tracepoint_func *func_add(struct tracepoint_func **funcs, old = *funcs; if (old) { /* (N -> N+1), (N != 0, 1) probes */ - for (nr_probes = 0; old[nr_probes].func; nr_probes++) + for (nr_probes = 0; old[nr_probes].func; nr_probes++) { + /* Insert before probes of lower priority */ + if (pos < 0 && old[nr_probes].prio < prio) + pos = nr_probes; if (old[nr_probes].func == tp_func->func && old[nr_probes].data == tp_func->data) return ERR_PTR(-EEXIST); + } } /* + 2 : one for new probe, one for NULL func */ new = allocate_probes(nr_probes + 2); if (new == NULL) return ERR_PTR(-ENOMEM); - if (old) - memcpy(new, old, nr_probes * sizeof(struct tracepoint_func)); - new[nr_probes] = *tp_func; + if (old) { + if (pos < 0) { + pos = nr_probes; + memcpy(new, old, nr_probes * sizeof(struct tracepoint_func)); + } else { + /* Copy higher priority probes ahead of the new probe */ + memcpy(new, old, pos * sizeof(struct tracepoint_func)); + /* Copy the rest after it. */ + memcpy(new + pos + 1, old + pos, + (nr_probes - pos) * sizeof(struct tracepoint_func)); + } + } else + pos = 0; + new[pos] = *tp_func; new[nr_probes + 1].func = NULL; *funcs = new; debug_print_probes(*funcs); @@ -174,7 +191,7 @@ static void *func_remove(struct tracepoint_func **funcs, * Add the probe function to a tracepoint. */ static int tracepoint_add_func(struct tracepoint *tp, - struct tracepoint_func *func) + struct tracepoint_func *func, int prio) { struct tracepoint_func *old, *tp_funcs; @@ -183,7 +200,7 @@ static int tracepoint_add_func(struct tracepoint *tp, tp_funcs = rcu_dereference_protected(tp->funcs, lockdep_is_held(&tracepoints_mutex)); - old = func_add(&tp_funcs, func); + old = func_add(&tp_funcs, func, prio); if (IS_ERR(old)) { WARN_ON_ONCE(1); return PTR_ERR(old); @@ -240,6 +257,7 @@ static int tracepoint_remove_func(struct tracepoint *tp, * @tp: tracepoint * @probe: probe handler * @data: tracepoint data + * @prio: priority of this function over other registered functions * * Returns 0 if ok, error value on error. * Note: if @tp is within a module, the caller is responsible for @@ -247,7 +265,8 @@ static int tracepoint_remove_func(struct tracepoint *tp, * performed either with a tracepoint module going notifier, or from * within module exit functions. */ -int tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data) +int tracepoint_probe_register_prio(struct tracepoint *tp, void *probe, + void *data, int prio) { struct tracepoint_func tp_func; int ret; @@ -255,10 +274,30 @@ int tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data) mutex_lock(&tracepoints_mutex); tp_func.func = probe; tp_func.data = data; - ret = tracepoint_add_func(tp, &tp_func); + tp_func.prio = prio; + ret = tracepoint_add_func(tp, &tp_func, prio); mutex_unlock(&tracepoints_mutex); return ret; } +EXPORT_SYMBOL_GPL(tracepoint_probe_register_prio); + +/** + * tracepoint_probe_register - Connect a probe to a tracepoint + * @tp: tracepoint + * @probe: probe handler + * @data: tracepoint data + * @prio: priority of this function over other registered functions + * + * Returns 0 if ok, error value on error. + * Note: if @tp is within a module, the caller is responsible for + * unregistering the probe before the module is gone. This can be + * performed either with a tracepoint module going notifier, or from + * within module exit functions. + */ +int tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data) +{ + return tracepoint_probe_register_prio(tp, probe, data, TRACEPOINT_DEFAULT_PRIO); +} EXPORT_SYMBOL_GPL(tracepoint_probe_register); /** -- cgit v1.1 From 4909010788640b7101bf50cddb7c5e60172b4433 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 24 Sep 2015 11:33:26 -0400 Subject: tracing: Add set_event_pid directory for future use Create a tracing directory called set_event_pid, which currently has no function, but will be used to filter all events for the tracing instance or the pids that are added to the file. The reason no functionality is added with this commit is that this commit focuses on the creation and removal of the pids in a safe manner. And tests can be made against this change to make sure things are correct before hooking features to the list of pids. Cc: "Paul E. McKenney" Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 7 ++ kernel/trace/trace_events.c | 287 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 294 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index fb8a61c..2504810 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -176,6 +176,12 @@ struct trace_options { struct trace_option_dentry *topts; }; +struct trace_pid_list { + unsigned int nr_pids; + int order; + pid_t *pids; +}; + /* * The trace array - an array of per-CPU trace arrays. This is the * highest level data structure that individual tracers deal with. @@ -201,6 +207,7 @@ struct trace_array { bool allocated_snapshot; unsigned long max_latency; #endif + struct trace_pid_list __rcu *filtered_pids; /* * max_lock is used to protect the swapping of buffers * when taking a max snapshot. The buffers themselves are diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index d120cfe..2ad7014 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -15,8 +15,10 @@ #include #include #include +#include #include #include +#include #include #include @@ -445,6 +447,43 @@ static void ftrace_clear_events(struct trace_array *tr) mutex_unlock(&event_mutex); } +static int cmp_pid(const void *key, const void *elt) +{ + const pid_t *search_pid = key; + const pid_t *pid = elt; + + if (*search_pid == *pid) + return 0; + if (*search_pid < *pid) + return -1; + return 1; +} + +static void __ftrace_clear_event_pids(struct trace_array *tr) +{ + struct trace_pid_list *pid_list; + + pid_list = rcu_dereference_protected(tr->filtered_pids, + lockdep_is_held(&event_mutex)); + if (!pid_list) + return; + + rcu_assign_pointer(tr->filtered_pids, NULL); + + /* Wait till all users are no longer using pid filtering */ + synchronize_sched(); + + free_pages((unsigned long)pid_list->pids, pid_list->order); + kfree(pid_list); +} + +static void ftrace_clear_event_pids(struct trace_array *tr) +{ + mutex_lock(&event_mutex); + __ftrace_clear_event_pids(tr); + mutex_unlock(&event_mutex); +} + static void __put_system(struct event_subsystem *system) { struct event_filter *filter = system->filter; @@ -777,6 +816,56 @@ static void t_stop(struct seq_file *m, void *p) mutex_unlock(&event_mutex); } +static void *p_start(struct seq_file *m, loff_t *pos) +{ + struct trace_pid_list *pid_list; + struct trace_array *tr = m->private; + + /* + * Grab the mutex, to keep calls to p_next() having the same + * tr->filtered_pids as p_start() has. + * If we just passed the tr->filtered_pids around, then RCU would + * have been enough, but doing that makes things more complex. + */ + mutex_lock(&event_mutex); + rcu_read_lock_sched(); + + pid_list = rcu_dereference_sched(tr->filtered_pids); + + if (!pid_list || *pos >= pid_list->nr_pids) + return NULL; + + return (void *)&pid_list->pids[*pos]; +} + +static void p_stop(struct seq_file *m, void *p) +{ + rcu_read_unlock_sched(); + mutex_unlock(&event_mutex); +} + +static void * +p_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct trace_array *tr = m->private; + struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids); + + (*pos)++; + + if (*pos >= pid_list->nr_pids) + return NULL; + + return (void *)&pid_list->pids[*pos]; +} + +static int p_show(struct seq_file *m, void *v) +{ + pid_t *pid = v; + + seq_printf(m, "%d\n", *pid); + return 0; +} + static ssize_t event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -1334,8 +1423,165 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) return r; } +static int max_pids(struct trace_pid_list *pid_list) +{ + return (PAGE_SIZE << pid_list->order) / sizeof(pid_t); +} + +static ssize_t +ftrace_event_pid_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct seq_file *m = file->private_data; + struct trace_array *tr = m->private; + struct trace_pid_list *filtered_pids = NULL; + struct trace_pid_list *pid_list = NULL; + struct trace_parser parser; + unsigned long val; + loff_t this_pos; + ssize_t read = 0; + ssize_t ret = 0; + pid_t pid; + int i; + + if (!cnt) + return 0; + + ret = tracing_update_buffers(); + if (ret < 0) + return ret; + + if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1)) + return -ENOMEM; + + mutex_lock(&event_mutex); + /* + * Load as many pids into the array before doing a + * swap from the tr->filtered_pids to the new list. + */ + while (cnt > 0) { + + this_pos = 0; + + ret = trace_get_user(&parser, ubuf, cnt, &this_pos); + if (ret < 0 || !trace_parser_loaded(&parser)) + break; + + read += ret; + ubuf += ret; + cnt -= ret; + + parser.buffer[parser.idx] = 0; + + ret = -EINVAL; + if (kstrtoul(parser.buffer, 0, &val)) + break; + if (val > INT_MAX) + break; + + pid = (pid_t)val; + + ret = -ENOMEM; + if (!pid_list) { + pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); + if (!pid_list) + break; + + filtered_pids = rcu_dereference_protected(tr->filtered_pids, + lockdep_is_held(&event_mutex)); + if (filtered_pids) + pid_list->order = filtered_pids->order; + else + pid_list->order = 0; + + pid_list->pids = (void *)__get_free_pages(GFP_KERNEL, + pid_list->order); + if (!pid_list->pids) + break; + + if (filtered_pids) { + pid_list->nr_pids = filtered_pids->nr_pids; + memcpy(pid_list->pids, filtered_pids->pids, + pid_list->nr_pids * sizeof(pid_t)); + } else + pid_list->nr_pids = 0; + } + + if (pid_list->nr_pids >= max_pids(pid_list)) { + pid_t *pid_page; + + pid_page = (void *)__get_free_pages(GFP_KERNEL, + pid_list->order + 1); + if (!pid_page) + break; + memcpy(pid_page, pid_list->pids, + pid_list->nr_pids * sizeof(pid_t)); + free_pages((unsigned long)pid_list->pids, pid_list->order); + + pid_list->order++; + pid_list->pids = pid_page; + } + + pid_list->pids[pid_list->nr_pids++] = pid; + trace_parser_clear(&parser); + ret = 0; + } + trace_parser_put(&parser); + + if (ret < 0) { + if (pid_list) + free_pages((unsigned long)pid_list->pids, pid_list->order); + kfree(pid_list); + mutex_unlock(&event_mutex); + return ret; + } + + if (!pid_list) { + mutex_unlock(&event_mutex); + return ret; + } + + sort(pid_list->pids, pid_list->nr_pids, sizeof(pid_t), cmp_pid, NULL); + + /* Remove duplicates */ + for (i = 1; i < pid_list->nr_pids; i++) { + int start = i; + + while (i < pid_list->nr_pids && + pid_list->pids[i - 1] == pid_list->pids[i]) + i++; + + if (start != i) { + if (i < pid_list->nr_pids) { + memmove(&pid_list->pids[start], &pid_list->pids[i], + (pid_list->nr_pids - i) * sizeof(pid_t)); + pid_list->nr_pids -= i - start; + i = start; + } else + pid_list->nr_pids = start; + } + } + + rcu_assign_pointer(tr->filtered_pids, pid_list); + + mutex_unlock(&event_mutex); + + if (filtered_pids) { + synchronize_sched(); + + free_pages((unsigned long)filtered_pids->pids, filtered_pids->order); + kfree(filtered_pids); + } + + ret = read; + *ppos += read; + + return ret; +} + static int ftrace_event_avail_open(struct inode *inode, struct file *file); static int ftrace_event_set_open(struct inode *inode, struct file *file); +static int ftrace_event_set_pid_open(struct inode *inode, struct file *file); static int ftrace_event_release(struct inode *inode, struct file *file); static const struct seq_operations show_event_seq_ops = { @@ -1352,6 +1598,13 @@ static const struct seq_operations show_set_event_seq_ops = { .stop = t_stop, }; +static const struct seq_operations show_set_pid_seq_ops = { + .start = p_start, + .next = p_next, + .show = p_show, + .stop = p_stop, +}; + static const struct file_operations ftrace_avail_fops = { .open = ftrace_event_avail_open, .read = seq_read, @@ -1367,6 +1620,14 @@ static const struct file_operations ftrace_set_event_fops = { .release = ftrace_event_release, }; +static const struct file_operations ftrace_set_event_pid_fops = { + .open = ftrace_event_set_pid_open, + .read = seq_read, + .write = ftrace_event_pid_write, + .llseek = seq_lseek, + .release = ftrace_event_release, +}; + static const struct file_operations ftrace_enable_fops = { .open = tracing_open_generic, .read = event_enable_read, @@ -1477,6 +1738,26 @@ ftrace_event_set_open(struct inode *inode, struct file *file) return ret; } +static int +ftrace_event_set_pid_open(struct inode *inode, struct file *file) +{ + const struct seq_operations *seq_ops = &show_set_pid_seq_ops; + struct trace_array *tr = inode->i_private; + int ret; + + if (trace_array_get(tr) < 0) + return -ENODEV; + + if ((file->f_mode & FMODE_WRITE) && + (file->f_flags & O_TRUNC)) + ftrace_clear_event_pids(tr); + + ret = ftrace_event_open(inode, file, seq_ops); + if (ret < 0) + trace_array_put(tr); + return ret; +} + static struct event_subsystem * create_new_subsystem(const char *name) { @@ -2471,6 +2752,9 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) return -ENOMEM; } + entry = tracefs_create_file("set_event_pid", 0644, parent, + tr, &ftrace_set_event_pid_fops); + /* ring buffer internal formats */ trace_create_file("header_page", 0444, d_events, ring_buffer_print_page_header, @@ -2551,6 +2835,9 @@ int event_trace_del_tracer(struct trace_array *tr) /* Disable any event triggers and associated soft-disabled events */ clear_event_triggers(tr); + /* Clear the pid list */ + __ftrace_clear_event_pids(tr); + /* Disable any running events */ __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0); -- cgit v1.1 From 3fdaf80f4a836911c0eda1cee92f8aa625f90197 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 25 Sep 2015 12:58:44 -0400 Subject: tracing: Implement event pid filtering Add the necessary hooks to use the pids loaded in set_event_pid to filter all the events enabled in the tracing instance that match the pids listed. Two probes are added to both sched_switch and sched_wakeup tracepoints to be called before other probes are called and after the other probes are called. The first is used to set the necessary flags to let the probes know to test if they should be traced or not. The sched_switch pre probe will set the "ignore_pid" flag if neither the previous or next task has a matching pid. The sched_switch probe will set the "ignore_pid" flag if the next task does not match the matching pid. The pre probe allows for probes tracing sched_switch to be traced if necessary. The sched_wakeup pre probe will set the "ignore_pid" flag if neither the current task nor the wakee task has a matching pid. The sched_wakeup post probe will set the "ignore_pid" flag if the current task does not have a matching pid. Cc: "Paul E. McKenney" Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 2 + kernel/trace/trace_events.c | 148 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 147 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2504810..89ffdaf 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -156,6 +156,8 @@ struct trace_array_cpu { pid_t pid; kuid_t uid; char comm[TASK_COMM_LEN]; + + bool ignore_pid; }; struct tracer; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 2ad7014..ab07058 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -22,6 +22,8 @@ #include #include +#include + #include #include "trace_output.h" @@ -212,12 +214,32 @@ int trace_event_raw_init(struct trace_event_call *call) } EXPORT_SYMBOL_GPL(trace_event_raw_init); +bool trace_event_ignore_this_pid(struct trace_event_file *trace_file) +{ + struct trace_array *tr = trace_file->tr; + struct trace_array_cpu *data; + struct trace_pid_list *pid_list; + + pid_list = rcu_dereference_sched(tr->filtered_pids); + if (!pid_list) + return false; + + data = this_cpu_ptr(tr->trace_buffer.data); + + return data->ignore_pid; +} +EXPORT_SYMBOL_GPL(trace_event_ignore_this_pid); + void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer, struct trace_event_file *trace_file, unsigned long len) { struct trace_event_call *event_call = trace_file->event_call; + if ((trace_file->flags & EVENT_FILE_FL_PID_FILTER) && + trace_event_ignore_this_pid(trace_file)) + return NULL; + local_save_flags(fbuffer->flags); fbuffer->pc = preempt_count(); fbuffer->trace_file = trace_file; @@ -459,15 +481,114 @@ static int cmp_pid(const void *key, const void *elt) return 1; } +static bool +check_ignore_pid(struct trace_pid_list *filtered_pids, struct task_struct *task) +{ + pid_t search_pid; + pid_t *pid; + + /* + * Return false, because if filtered_pids does not exist, + * all pids are good to trace. + */ + if (!filtered_pids) + return false; + + search_pid = task->pid; + + pid = bsearch(&search_pid, filtered_pids->pids, + filtered_pids->nr_pids, sizeof(pid_t), + cmp_pid); + if (!pid) + return true; + + return false; +} + +static void +event_filter_pid_sched_switch_probe_pre(void *data, + struct task_struct *prev, struct task_struct *next) +{ + struct trace_array *tr = data; + struct trace_pid_list *pid_list; + + pid_list = rcu_dereference_sched(tr->filtered_pids); + + this_cpu_write(tr->trace_buffer.data->ignore_pid, + check_ignore_pid(pid_list, prev) && + check_ignore_pid(pid_list, next)); +} + +static void +event_filter_pid_sched_switch_probe_post(void *data, + struct task_struct *prev, struct task_struct *next) +{ + struct trace_array *tr = data; + struct trace_pid_list *pid_list; + + pid_list = rcu_dereference_sched(tr->filtered_pids); + + this_cpu_write(tr->trace_buffer.data->ignore_pid, + check_ignore_pid(pid_list, next)); +} + +static void +event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task) +{ + struct trace_array *tr = data; + struct trace_pid_list *pid_list; + + /* Nothing to do if we are already tracing */ + if (!this_cpu_read(tr->trace_buffer.data->ignore_pid)) + return; + + pid_list = rcu_dereference_sched(tr->filtered_pids); + + this_cpu_write(tr->trace_buffer.data->ignore_pid, + check_ignore_pid(pid_list, task)); +} + +static void +event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task) +{ + struct trace_array *tr = data; + struct trace_pid_list *pid_list; + + /* Nothing to do if we are not tracing */ + if (this_cpu_read(tr->trace_buffer.data->ignore_pid)) + return; + + pid_list = rcu_dereference_sched(tr->filtered_pids); + + /* Set tracing if current is enabled */ + this_cpu_write(tr->trace_buffer.data->ignore_pid, + check_ignore_pid(pid_list, current)); +} + static void __ftrace_clear_event_pids(struct trace_array *tr) { struct trace_pid_list *pid_list; + struct trace_event_file *file; + int cpu; pid_list = rcu_dereference_protected(tr->filtered_pids, lockdep_is_held(&event_mutex)); if (!pid_list) return; + unregister_trace_sched_switch(event_filter_pid_sched_switch_probe_pre, tr); + unregister_trace_sched_switch(event_filter_pid_sched_switch_probe_post, tr); + + unregister_trace_sched_wakeup(event_filter_pid_sched_wakeup_probe_pre, tr); + unregister_trace_sched_wakeup(event_filter_pid_sched_wakeup_probe_post, tr); + + list_for_each_entry(file, &tr->events, list) { + clear_bit(EVENT_FILE_FL_PID_FILTER_BIT, &file->flags); + } + + for_each_possible_cpu(cpu) + per_cpu_ptr(tr->trace_buffer.data, cpu)->ignore_pid = false; + rcu_assign_pointer(tr->filtered_pids, NULL); /* Wait till all users are no longer using pid filtering */ @@ -1429,13 +1550,14 @@ static int max_pids(struct trace_pid_list *pid_list) } static ssize_t -ftrace_event_pid_write(struct file *file, const char __user *ubuf, +ftrace_event_pid_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct seq_file *m = file->private_data; + struct seq_file *m = filp->private_data; struct trace_array *tr = m->private; struct trace_pid_list *filtered_pids = NULL; struct trace_pid_list *pid_list = NULL; + struct trace_event_file *file; struct trace_parser parser; unsigned long val; loff_t this_pos; @@ -1564,15 +1686,35 @@ ftrace_event_pid_write(struct file *file, const char __user *ubuf, rcu_assign_pointer(tr->filtered_pids, pid_list); - mutex_unlock(&event_mutex); + list_for_each_entry(file, &tr->events, list) { + set_bit(EVENT_FILE_FL_PID_FILTER_BIT, &file->flags); + } if (filtered_pids) { synchronize_sched(); free_pages((unsigned long)filtered_pids->pids, filtered_pids->order); kfree(filtered_pids); + } else { + /* + * Register a probe that is called before all other probes + * to set ignore_pid if next or prev do not match. + * Register a probe this is called after all other probes + * to only keep ignore_pid set if next pid matches. + */ + register_trace_prio_sched_switch(event_filter_pid_sched_switch_probe_pre, + tr, INT_MAX); + register_trace_prio_sched_switch(event_filter_pid_sched_switch_probe_post, + tr, 0); + + register_trace_prio_sched_wakeup(event_filter_pid_sched_wakeup_probe_pre, + tr, INT_MAX); + register_trace_prio_sched_wakeup(event_filter_pid_sched_wakeup_probe_post, + tr, 0); } + mutex_unlock(&event_mutex); + ret = read; *ppos += read; -- cgit v1.1 From 8ca532ad2b050da0d0db3544d9ab8b40675e4ca1 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 21 Oct 2015 15:27:36 -0400 Subject: tracing: Check all tasks on each CPU when filtering pids My tests found that if a task is running but not filtered when set_event_pid is modified, then it can still be traced. Call on_each_cpu() to check if the current running task should be filtered and update the per cpu flags of tr->data appropriately. Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index ab07058..2b7fccd 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1549,6 +1549,22 @@ static int max_pids(struct trace_pid_list *pid_list) return (PAGE_SIZE << pid_list->order) / sizeof(pid_t); } +static void ignore_task_cpu(void *data) +{ + struct trace_array *tr = data; + struct trace_pid_list *pid_list; + + /* + * This function is called by on_each_cpu() while the + * event_mutex is held. + */ + pid_list = rcu_dereference_protected(tr->filtered_pids, + mutex_is_locked(&event_mutex)); + + this_cpu_write(tr->trace_buffer.data->ignore_pid, + check_ignore_pid(pid_list, current)); +} + static ssize_t ftrace_event_pid_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) @@ -1711,6 +1727,12 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, tr, INT_MAX); register_trace_prio_sched_wakeup(event_filter_pid_sched_wakeup_probe_post, tr, 0); + + /* + * Ignoring of pids is done at task switch. But we have to + * check for those tasks that are currently running. + */ + on_each_cpu(ignore_task_cpu, tr, 1); } mutex_unlock(&event_mutex); -- cgit v1.1 From fb662288284e8f2ec26f13d50a6b0d5781771648 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 26 Oct 2015 03:45:22 -0400 Subject: tracing: Fix sparse RCU warning p_start() and p_stop() are seq_file functions that match. Teach sparse to know that rcu_read_lock_sched() that is taken by p_start() is released by p_stop. Reported-by: kbuild test robot Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 2b7fccd..fb0261e 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -938,6 +938,7 @@ static void t_stop(struct seq_file *m, void *p) } static void *p_start(struct seq_file *m, loff_t *pos) + __acquires(RCU) { struct trace_pid_list *pid_list; struct trace_array *tr = m->private; @@ -960,6 +961,7 @@ static void *p_start(struct seq_file *m, loff_t *pos) } static void p_stop(struct seq_file *m, void *p) + __releases(RCU) { rcu_read_unlock_sched(); mutex_unlock(&event_mutex); -- cgit v1.1 From 182475b7a2831abf7e6ca83b2aced0bef5dcdfd3 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 26 Oct 2015 16:55:56 -0400 Subject: memremap: fix highmem support Currently memremap checks if the range is "System RAM" and returns the kernel linear address. This is broken for highmem platforms where a range may be "System RAM", but is not part of the kernel linear mapping. Fallback to ioremap_cache() in these cases, to let the arch code attempt to handle it. Note that ARM ioremap will WARN when attempting to remap ram, and in that case the caller needs to be fixed. For this reason, existing ioremap_cache() usages for ARM are already trained to avoid attempts to remap ram. The impact of this bug is low for now since the pmem driver is the only user of memremap(), but this is important to fix before more conversions to memremap arrive in 4.4. Cc: Rafael J. Wysocki Reported-by: Ard Biesheuvel Acked-by: Ard Biesheuvel Signed-off-by: Dan Williams --- kernel/memremap.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 72b0c66..9d6b555 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -24,6 +24,16 @@ __weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size) } #endif +static void *try_ram_remap(resource_size_t offset, size_t size) +{ + struct page *page = pfn_to_page(offset >> PAGE_SHIFT); + + /* In the simple case just return the existing linear address */ + if (!PageHighMem(page)) + return __va(offset); + return NULL; /* fallback to ioremap_cache */ +} + /** * memremap() - remap an iomem_resource as cacheable memory * @offset: iomem resource start address @@ -66,8 +76,8 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags) * the requested range is potentially in "System RAM" */ if (is_ram == REGION_INTERSECTS) - addr = __va(offset); - else + addr = try_ram_remap(offset, size); + if (!addr) addr = ioremap_cache(offset, size); } -- cgit v1.1 From 62544ce8e01c1879d420ba309f7f319d24c0f4e6 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 22 Oct 2015 17:10:14 -0700 Subject: bpf: fix bpf_perf_event_read() helper Fix safety checks for bpf_perf_event_read(): - only non-inherited events can be added to perf_event_array map (do this check statically at map insertion time) - dynamically check that event is local and !pmu->count Otherwise buggy bpf program can cause kernel splat. Also fix error path after perf_event_attrs() and remove redundant 'extern'. Fixes: 35578d798400 ("bpf: Implement function bpf_perf_event_read() that get the selected hardware PMU conuter") Signed-off-by: Alexei Starovoitov Tested-by: Wang Nan Signed-off-by: David S. Miller --- kernel/bpf/arraymap.c | 25 ++++++++++++++++--------- kernel/trace/bpf_trace.c | 7 ++++++- 2 files changed, 22 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index e3cfe46..3f4c99e 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -292,16 +292,23 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd) attr = perf_event_attrs(event); if (IS_ERR(attr)) - return (void *)attr; + goto err; - if (attr->type != PERF_TYPE_RAW && - !(attr->type == PERF_TYPE_SOFTWARE && - attr->config == PERF_COUNT_SW_BPF_OUTPUT) && - attr->type != PERF_TYPE_HARDWARE) { - perf_event_release_kernel(event); - return ERR_PTR(-EINVAL); - } - return event; + if (attr->inherit) + goto err; + + if (attr->type == PERF_TYPE_RAW) + return event; + + if (attr->type == PERF_TYPE_HARDWARE) + return event; + + if (attr->type == PERF_TYPE_SOFTWARE && + attr->config == PERF_COUNT_SW_BPF_OUTPUT) + return event; +err: + perf_event_release_kernel(event); + return ERR_PTR(-EINVAL); } static void perf_event_fd_array_put_ptr(void *ptr) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 47febbe..003df38 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -199,6 +199,11 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) if (!event) return -ENOENT; + /* make sure event is local and doesn't have pmu::count */ + if (event->oncpu != smp_processor_id() || + event->pmu->count) + return -EINVAL; + /* * we don't know if the function is run successfully by the * return value. It can be judged in other places, such as @@ -207,7 +212,7 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) return perf_event_read_local(event); } -const struct bpf_func_proto bpf_perf_event_read_proto = { +static const struct bpf_func_proto bpf_perf_event_read_proto = { .func = bpf_perf_event_read, .gpl_only = false, .ret_type = RET_INTEGER, -- cgit v1.1 From 1075ef5950da97927ae1b3ef76d03e211c4fdb55 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 23 Oct 2015 14:58:19 -0700 Subject: bpf: make tracing helpers gpl only exported perf symbols are GPL only, mark eBPF helper functions used in tracing as GPL only as well. Suggested-by: Peter Zijlstra Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/trace/bpf_trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 003df38..4228fd3 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -214,7 +214,7 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) static const struct bpf_func_proto bpf_perf_event_read_proto = { .func = bpf_perf_event_read, - .gpl_only = false, + .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_ANYTHING, @@ -255,7 +255,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size) static const struct bpf_func_proto bpf_perf_event_output_proto = { .func = bpf_perf_event_output, - .gpl_only = false, + .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, -- cgit v1.1 From f8e529ed941ba2bbcbf310b575d968159ce7e895 Mon Sep 17 00:00:00 2001 From: Tycho Andersen Date: Tue, 27 Oct 2015 09:23:59 +0900 Subject: seccomp, ptrace: add support for dumping seccomp filters This patch adds support for dumping a process' (classic BPF) seccomp filters via ptrace. PTRACE_SECCOMP_GET_FILTER allows the tracer to dump the user's classic BPF seccomp filters. addr should be an integer which represents the ith seccomp filter (0 is the most recently installed filter). data should be a struct sock_filter * with enough room for the ith filter, or NULL, in which case the filter is not saved. The return value for this command is the number of BPF instructions the program represents, or negative in the case of errors. Command specific errors are ENOENT: which indicates that there is no ith filter in this seccomp tree, and EMEDIUMTYPE, which indicates that the ith filter was not installed as a classic BPF filter. A caveat with this approach is that there is no way to get explicitly at the heirarchy of seccomp filters, and users need to memcmp() filters to decide which are inherited. This means that a task which installs two of the same filter can potentially confuse users of this interface. v2: * make save_orig const * check that the orig_prog exists (not necessary right now, but when grows eBPF support it will be) * s/n/filter_off and make it an unsigned long to match ptrace * count "down" the tree instead of "up" when passing a filter offset v3: * don't take the current task's lock for inspecting its seccomp mode * use a 0x42** constant for the ptrace command value v4: * don't copy to userspace while holding spinlocks v5: * add another condition to WARN_ON v6: * rebase on net-next Signed-off-by: Tycho Andersen Acked-by: Kees Cook CC: Will Drewry Reviewed-by: Oleg Nesterov CC: Andy Lutomirski CC: Pavel Emelyanov CC: Serge E. Hallyn CC: Alexei Starovoitov CC: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/ptrace.c | 5 ++++ kernel/seccomp.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 80 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 787320d..b760bae 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -1016,6 +1016,11 @@ int ptrace_request(struct task_struct *child, long request, break; } #endif + + case PTRACE_SECCOMP_GET_FILTER: + ret = seccomp_get_filter(child, addr, datavp); + break; + default: break; } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 06858a7..580ac2d 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -347,6 +347,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) { struct seccomp_filter *sfilter; int ret; + const bool save_orig = config_enabled(CONFIG_CHECKPOINT_RESTORE); if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) return ERR_PTR(-EINVAL); @@ -370,7 +371,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) return ERR_PTR(-ENOMEM); ret = bpf_prog_create_from_user(&sfilter->prog, fprog, - seccomp_check_filter, false); + seccomp_check_filter, save_orig); if (ret < 0) { kfree(sfilter); return ERR_PTR(ret); @@ -867,3 +868,76 @@ long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter) /* prctl interface doesn't have flags, so they are always zero. */ return do_seccomp(op, 0, uargs); } + +#if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE) +long seccomp_get_filter(struct task_struct *task, unsigned long filter_off, + void __user *data) +{ + struct seccomp_filter *filter; + struct sock_fprog_kern *fprog; + long ret; + unsigned long count = 0; + + if (!capable(CAP_SYS_ADMIN) || + current->seccomp.mode != SECCOMP_MODE_DISABLED) { + return -EACCES; + } + + spin_lock_irq(&task->sighand->siglock); + if (task->seccomp.mode != SECCOMP_MODE_FILTER) { + ret = -EINVAL; + goto out; + } + + filter = task->seccomp.filter; + while (filter) { + filter = filter->prev; + count++; + } + + if (filter_off >= count) { + ret = -ENOENT; + goto out; + } + count -= filter_off; + + filter = task->seccomp.filter; + while (filter && count > 1) { + filter = filter->prev; + count--; + } + + if (WARN_ON(count != 1 || !filter)) { + /* The filter tree shouldn't shrink while we're using it. */ + ret = -ENOENT; + goto out; + } + + fprog = filter->prog->orig_prog; + if (!fprog) { + /* This must be a new non-cBPF filter, since we save every + * every cBPF filter's orig_prog above when + * CONFIG_CHECKPOINT_RESTORE is enabled. + */ + ret = -EMEDIUMTYPE; + goto out; + } + + ret = fprog->len; + if (!data) + goto out; + + get_seccomp_filter(task); + spin_unlock_irq(&task->sighand->siglock); + + if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog))) + ret = -EFAULT; + + put_seccomp_filter(task); + return ret; + +out: + spin_unlock_irq(&task->sighand->siglock); + return ret; +} +#endif -- cgit v1.1 From d57456753787ab158f906f1f8eb58d54a2ccd9f4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 29 Oct 2015 11:43:05 +0900 Subject: cgroup: fix race condition around termination check in css_task_iter_next() css_task_iter_next() checked @it->cur_task before grabbing css_set_lock and assumed that the result won't change afterwards; however, tasks could leave the cgroup being iterated terminating the iterator before css_task_lock is acquired. If this happens, css_task_iter_next() tries to calculate the current task from NULL cg_list pointer leading to the following oops. BUG: unable to handle kernel paging request at fffffffffffff7d0 IP: [] css_task_iter_next+0x42/0x80 ... CPU: 4 PID: 6391 Comm: JobQDisp2 Not tainted 4.0.9-22_fbk4_rc3_81616_ge8d9cb6 #1 Hardware name: Quanta Freedom/Winterfell, BIOS F03_3B08 03/04/2014 task: ffff880868e46400 ti: ffff88083404c000 task.ti: ffff88083404c000 RIP: 0010:[] [] css_task_iter_next+0x42/0x80 RSP: 0018:ffff88083404fd28 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff88083404fd68 RCX: ffff8804697fb8b0 RDX: fffffffffffff7c0 RSI: ffff8803b7dff800 RDI: ffffffff822c0278 RBP: ffff88083404fd38 R08: 0000000000017160 R09: ffff88046f4070c0 R10: ffffffff810d61f7 R11: 0000000000000293 R12: ffff880863bf8400 R13: ffff88046b87fd80 R14: 0000000000000000 R15: ffff88083404fe58 FS: 00007fa0567e2700(0000) GS:ffff88046f900000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: fffffffffffff7d0 CR3: 0000000469568000 CR4: 00000000001406e0 Stack: 0000000000000246 0000000000000000 ffff88083404fde8 ffffffff810d6248 ffff88083404fd68 0000000000000000 ffff8803b7dff800 000001ef000001ee 0000000000000000 0000000000000000 ffff880863bf8568 0000000000000000 Call Trace: [] cgroup_pidlist_start+0x258/0x550 [] cgroup_seqfile_start+0x1d/0x20 [] kernfs_seq_start+0x5f/0xa0 [] seq_read+0x166/0x380 [] kernfs_fop_read+0x11d/0x180 [] __vfs_read+0x18/0x50 [] vfs_read+0x8d/0x150 [] SyS_read+0x4f/0xb0 [] system_call_fastpath+0x12/0x17 Fix it by moving the termination condition check inside css_set_lock. @it->cur_task is now cleared after being put and @it->task_pos is tested for termination instead of @it->cset_pos as they indicate the same condition and @it->task_pos is what's being dereferenced. Signed-off-by: Tejun Heo Reported-by: Calvin Owens Fixes: ed27b9f7a17d ("cgroup: don't hold css_set_rwsem across css task iteration") Acked-by: Zefan Li --- kernel/cgroup.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4f4fc53..b9d0cce 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3930,18 +3930,19 @@ void css_task_iter_start(struct cgroup_subsys_state *css, */ struct task_struct *css_task_iter_next(struct css_task_iter *it) { - if (!it->cset_pos) - return NULL; - - if (it->cur_task) + if (it->cur_task) { put_task_struct(it->cur_task); + it->cur_task = NULL; + } spin_lock_bh(&css_set_lock); - it->cur_task = list_entry(it->task_pos, struct task_struct, cg_list); - get_task_struct(it->cur_task); - - css_task_iter_advance(it); + if (it->task_pos) { + it->cur_task = list_entry(it->task_pos, struct task_struct, + cg_list); + get_task_struct(it->cur_task); + css_task_iter_advance(it); + } spin_unlock_bh(&css_set_lock); -- cgit v1.1 From cdea01b2bf98affb7e9c44530108a4a28535eee8 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 30 Oct 2015 05:25:59 +0900 Subject: blktrace: re-write setting q->blk_trace This is really about simplifying the double xchg patterns into a single cmpxchg, with the same logic. Other than the immediate cleanup, there are some subtleties this change deals with: (i) While the load of the old bt is fully ordered wrt everything, ie: old_bt = xchg(&q->blk_trace, bt); [barrier] if (old_bt) (void) xchg(&q->blk_trace, old_bt); [barrier] blk_trace could still be changed between the xchg and the old_bt load. Note that this description is merely theoretical and afaict very small, but doing everything in a single context with cmpxchg closes this potential race. (ii) Ordering guarantees are obviously kept with cmpxchg. (iii) Gets rid of the hacky-by-nature (void)xchg pattern. Signed-off-by: Davidlohr Bueso eviewed-by: Jeff Moyer Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 90e72a0..e3a2618 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -437,7 +437,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct block_device *bdev, struct blk_user_trace_setup *buts) { - struct blk_trace *old_bt, *bt = NULL; + struct blk_trace *bt = NULL; struct dentry *dir = NULL; int ret; @@ -519,11 +519,8 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, bt->trace_state = Blktrace_setup; ret = -EBUSY; - old_bt = xchg(&q->blk_trace, bt); - if (old_bt) { - (void) xchg(&q->blk_trace, old_bt); + if (cmpxchg(&q->blk_trace, NULL, bt)) goto err; - } if (atomic_inc_return(&blk_probes_ref) == 1) blk_register_tracepoints(); @@ -1481,7 +1478,7 @@ static int blk_trace_remove_queue(struct request_queue *q) static int blk_trace_setup_queue(struct request_queue *q, struct block_device *bdev) { - struct blk_trace *old_bt, *bt = NULL; + struct blk_trace *bt = NULL; int ret = -ENOMEM; bt = kzalloc(sizeof(*bt), GFP_KERNEL); @@ -1497,12 +1494,9 @@ static int blk_trace_setup_queue(struct request_queue *q, blk_trace_setup_lba(bt, bdev); - old_bt = xchg(&q->blk_trace, bt); - if (old_bt != NULL) { - (void)xchg(&q->blk_trace, old_bt); - ret = -EBUSY; + ret = -EBUSY; + if (cmpxchg(&q->blk_trace, NULL, bt)) goto free_bt; - } if (atomic_inc_return(&blk_probes_ref) == 1) blk_register_tracepoints(); -- cgit v1.1 From 799fd44cf5bbcc51c46b674035bfc49cbf6907ba Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 2 Nov 2015 13:08:26 -0500 Subject: tracing: Call on_each_cpu() when adding or removing single pids from set_event_pid For the case where pids are already in set_event_pid, and one is added or removed then each CPU should be checked to make sure that the new or old pid is on or not on a CPU. For example: # echo 123 >> set_event_pid or # echo '!123' >> set_event_pid Link: http://lkml.kernel.org/r/20151030061643.GA19480@cac Suggested-by: Jiaxing Wang Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index fb0261e..292bccf 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1729,14 +1729,15 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, tr, INT_MAX); register_trace_prio_sched_wakeup(event_filter_pid_sched_wakeup_probe_post, tr, 0); - - /* - * Ignoring of pids is done at task switch. But we have to - * check for those tasks that are currently running. - */ - on_each_cpu(ignore_task_cpu, tr, 1); } + /* + * Ignoring of pids is done at task switch. But we have to + * check for those tasks that are currently running. + * Always do this in case a pid was appended or removed. + */ + on_each_cpu(ignore_task_cpu, tr, 1); + mutex_unlock(&event_mutex); ret = read; -- cgit v1.1 From bdb5d0f9045ed88811b6253682dff6b576dd0064 Mon Sep 17 00:00:00 2001 From: Chunyan Zhang Date: Tue, 27 Oct 2015 20:12:13 +0800 Subject: tracing: Only benchmark the time tracepoints take if tracing is on There's no need to record the time tracepoints take when tracing is off. This is because: 1) We cannot see these records since ring_buffer record is off at that moment. 2) If tracing is off and benchmark tracepoint is enabled, the time tracepoint takes is fewer than the same situation when tracing is on, since the tracepoints need to be wrote into ring_buffer, it would take more time. If turn on tracing at this moment, the average and standard deviation cannot exactly present the time that tracepoints take to write data into ring_buffer. Link: http://lkml.kernel.org/r/1445947933-27955-1-git-send-email-zhang.chunyan@linaro.org Signed-off-by: Chunyan Zhang Signed-off-by: Steven Rostedt --- kernel/trace/trace_benchmark.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index 40a14cb..0f109c4 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -43,7 +43,7 @@ static void trace_do_benchmark(void) unsigned int std = 0; /* Only run if the tracepoint is actually active */ - if (!trace_benchmark_event_enabled()) + if (!trace_benchmark_event_enabled() || !tracing_is_on()) return; local_irq_disable(); -- cgit v1.1 From 681a4a2f4529517422835b7395df07404dfe2278 Mon Sep 17 00:00:00 2001 From: Jiaxing Wang Date: Sun, 18 Oct 2015 19:58:08 +0800 Subject: tracing: Update instance_rmdir() to use tracefs_remove_recursive Update instancd_rmdir to use tracefs_remove_recursive instead of debugfs_remove_recursive.This was left in the transition from debugfs to tracefs. Link: http://lkml.kernel.org/r/1445169490-18315-2-git-send-email-hello.wjx@gmail.com Cc: stable@vger.kernel.org # 4.1+ Fixes: 8434dc9340cd2 ("tracing: Convert the tracing facility over to use tracefs") Signed-off-by: Jiaxing Wang Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 78022c1..67873c6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6689,7 +6689,7 @@ static int instance_rmdir(const char *name) tracing_set_nop(tr); event_trace_del_tracer(tr); ftrace_destroy_function_files(tr); - debugfs_remove_recursive(tr->dir); + tracefs_remove_recursive(tr->dir); free_trace_buffers(tr); for (i = 0; i < tr->nr_topts; i++) { -- cgit v1.1 From 26ab2ef4516f5c9579b46188809f387406063262 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Tue, 29 Sep 2015 22:43:29 +0800 Subject: tracing: report_latency() in trace_sched_wakeup.c can return boolean This patch makes report_latency return bool to improve readability, indicating whether this new latency should be reported/recorded. No functional change. Link: http://lkml.kernel.org/r/1443537816-5788-2-git-send-email-bywxiaobai@163.com Signed-off-by: Yaowei Bai Signed-off-by: Steven Rostedt --- kernel/trace/trace_sched_wakeup.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 4661442..855c2c7 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -346,16 +346,16 @@ static void wakeup_print_header(struct seq_file *s) /* * Should this new latency be reported/recorded? */ -static int report_latency(struct trace_array *tr, cycle_t delta) +static bool report_latency(struct trace_array *tr, cycle_t delta) { if (tracing_thresh) { if (delta < tracing_thresh) - return 0; + return false; } else { if (delta <= tr->max_latency) - return 0; + return false; } - return 1; + return true; } static void -- cgit v1.1 From 79851821b2c94fc66cddb80b8b12dcfa09f6e7cb Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Tue, 29 Sep 2015 22:43:30 +0800 Subject: tracing: report_latency() in trace_irqsoff.c can return boolean This patch makes report_latency return bool due to this particular function only using either one or zero as its return value. No functional change. Link: http://lkml.kernel.org/r/1443537816-5788-3-git-send-email-bywxiaobai@163.com Signed-off-by: Yaowei Bai Signed-off-by: Steven Rostedt --- kernel/trace/trace_irqsoff.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index eaf5291..e4e5658 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -282,16 +282,16 @@ static void irqsoff_print_header(struct seq_file *s) /* * Should this new latency be reported/recorded? */ -static int report_latency(struct trace_array *tr, cycle_t delta) +static bool report_latency(struct trace_array *tr, cycle_t delta) { if (tracing_thresh) { if (delta < tracing_thresh) - return 0; + return false; } else { if (delta <= tr->max_latency) - return 0; + return false; } - return 1; + return true; } static void -- cgit v1.1 From 06ca320952dc21c537055d2aa36a2c2e96a1b94d Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Tue, 29 Sep 2015 22:43:31 +0800 Subject: ring-buffer: rb_is_reader_page() can return boolean Make rb_is_reader_page() return bool to improve readability due to this particular function only using either true or false as its return value. No functional change. Link: http://lkml.kernel.org/r/1443537816-5788-4-git-send-email-bywxiaobai@163.com Signed-off-by: Yaowei Bai Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index fc347f8..26a948f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -829,7 +829,7 @@ rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer, * writer is ever on it, the previous pointer never points * back to the reader page. */ -static int rb_is_reader_page(struct buffer_page *page) +static bool rb_is_reader_page(struct buffer_page *page) { struct list_head *list = page->list.prev; -- cgit v1.1 From 3d4e204d81eec30abffe55d01912e07ce81eef12 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Tue, 29 Sep 2015 22:43:32 +0800 Subject: ring_buffer: ring_buffer_empty{cpu}() can return boolean Make ring_buffer_empty() and ring_buffer_empty_cpu() return bool. No functional change. Link: http://lkml.kernel.org/r/1443537816-5788-5-git-send-email-bywxiaobai@163.com Signed-off-by: Yaowei Bai Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 26a948f..fc9ce12 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4267,7 +4267,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset); * rind_buffer_empty - is the ring buffer empty? * @buffer: The ring buffer to test */ -int ring_buffer_empty(struct ring_buffer *buffer) +bool ring_buffer_empty(struct ring_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; @@ -4285,10 +4285,10 @@ int ring_buffer_empty(struct ring_buffer *buffer) local_irq_restore(flags); if (!ret) - return 0; + return false; } - return 1; + return true; } EXPORT_SYMBOL_GPL(ring_buffer_empty); @@ -4297,7 +4297,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_empty); * @buffer: The ring buffer * @cpu: The CPU buffer to test */ -int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) +bool ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; @@ -4305,7 +4305,7 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) int ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return 1; + return true; cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); -- cgit v1.1 From da58834cf2fa83fe3885753009fecaa49a85f246 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Tue, 29 Sep 2015 22:43:33 +0800 Subject: ring-buffer: rb_per_cpu_empty() can return boolean Makes rb_per_cpu_empty() return bool to improve readability. No functional change. Link: http://lkml.kernel.org/r/1443537816-5788-6-git-send-email-bywxiaobai@163.com Signed-off-by: Yaowei Bai Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index fc9ce12..a228789 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3039,7 +3039,7 @@ int ring_buffer_write(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_write); -static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) +static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *reader = cpu_buffer->reader_page; struct buffer_page *head = rb_set_head_page(cpu_buffer); @@ -3047,7 +3047,7 @@ static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) /* In case of error, head will be NULL */ if (unlikely(!head)) - return 1; + return true; return reader->read == rb_page_commit(reader) && (commit == reader || -- cgit v1.1 From cdb2a0a91566d413a6e4e2c57c5d341a2e1173f3 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Tue, 29 Sep 2015 22:43:34 +0800 Subject: ring-buffer: rb_event_is_commit() can return boolean Make rb_event_is_commit() return bool to improve readability due to this particular function only using either one or zero as its return value. No functional change. Link: http://lkml.kernel.org/r/1443537816-5788-7-git-send-email-bywxiaobai@163.com Signed-off-by: Yaowei Bai Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a228789..75f1d05 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2270,7 +2270,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) return skip_time_extend(event); } -static inline int rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, +static inline bool rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event); /** @@ -2498,7 +2498,7 @@ static inline void rb_event_discard(struct ring_buffer_event *event) event->time_delta = 1; } -static inline int +static inline bool rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { -- cgit v1.1 From 907bff917a659a8e50e285dc42ef51d7eaba6e62 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Tue, 29 Sep 2015 22:43:35 +0800 Subject: tracing: is_legal_op() can return boolean Make is_legal_op() return bool to improve readability due to this particular function only using either one or zero as its return value. No functional change. Link: http://lkml.kernel.org/r/1443537816-5788-8-git-send-email-bywxiaobai@163.com Signed-off-by: Yaowei Bai Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index bd1bf18..f93a219 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -973,15 +973,15 @@ static bool is_string_field(struct ftrace_event_field *field) field->filter_type == FILTER_PTR_STRING; } -static int is_legal_op(struct ftrace_event_field *field, int op) +static bool is_legal_op(struct ftrace_event_field *field, int op) { if (is_string_field(field) && (op != OP_EQ && op != OP_NE && op != OP_GLOB)) - return 0; + return false; if (!is_string_field(field) && op == OP_GLOB) - return 0; + return false; - return 1; + return true; } static filter_pred_fn_t select_comparison_fn(int op, int field_size, -- cgit v1.1 From c6650b2e57725abaa2e36e620d06fa576d26c21c Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Tue, 29 Sep 2015 22:43:36 +0800 Subject: tracing: ftrace_event_is_function() can return boolean Make ftrace_event_is_function() return bool to improve readability due to this particular function only using either one or zero as its return value. No functional change. Link: http://lkml.kernel.org/r/1443537816-5788-9-git-send-email-bywxiaobai@163.com Signed-off-by: Yaowei Bai Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 2 +- kernel/trace/trace_export.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 89ffdaf..be2126f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -873,7 +873,7 @@ void ftrace_destroy_filter_files(struct ftrace_ops *ops); #define ftrace_destroy_filter_files(ops) do { } while (0) #endif /* CONFIG_FUNCTION_TRACER && CONFIG_DYNAMIC_FTRACE */ -int ftrace_event_is_function(struct trace_event_call *call); +bool ftrace_event_is_function(struct trace_event_call *call); /* * struct trace_parser - servers for reading the user input separated by spaces diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index adabf7d..39aa7aa 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -187,7 +187,7 @@ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; FTRACE_ENTRY_REG(call, struct_name, etype, \ PARAMS(tstruct), PARAMS(print), filter, NULL) -int ftrace_event_is_function(struct trace_event_call *call) +bool ftrace_event_is_function(struct trace_event_call *call) { return call == &event_function; } -- cgit v1.1 From ac00881f922106b45e934dc6988fa9c1ec542d71 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 30 Oct 2015 15:16:26 -0700 Subject: bpf: convert hashtab lock to raw lock When running bpf samples on rt kernel, it reports the below warning: BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:917 in_atomic(): 1, irqs_disabled(): 128, pid: 477, name: ping Preemption disabled at:[] kprobe_perf_func+0x30/0x228 CPU: 3 PID: 477 Comm: ping Not tainted 4.1.10-rt8 #4 Hardware name: Freescale Layerscape 2085a RDB Board (DT) Call trace: [] dump_backtrace+0x0/0x128 [] show_stack+0x20/0x30 [] dump_stack+0x7c/0xa0 [] ___might_sleep+0x188/0x1a0 [] rt_spin_lock+0x28/0x40 [] htab_map_update_elem+0x124/0x320 [] bpf_map_update_elem+0x40/0x58 [] __bpf_prog_run+0xd48/0x1640 [] trace_call_bpf+0x8c/0x100 [] kprobe_perf_func+0x30/0x228 [] kprobe_dispatcher+0x34/0x58 [] kprobe_handler+0x114/0x250 [] kprobe_breakpoint_handler+0x1c/0x30 [] brk_handler+0x88/0x98 [] do_debug_exception+0x50/0xb8 Exception stack(0xffff808349687460 to 0xffff808349687580) 7460: 4ca2b600 ffff8083 4a3a7000 ffff8083 49687620 ffff8083 0069c5f8 ffff8000 7480: 00000001 00000000 007e0628 ffff8000 496874b0 ffff8083 007e1de8 ffff8000 74a0: 496874d0 ffff8083 0008e04c ffff8000 00000001 00000000 4ca2b600 ffff8083 74c0: 00ba2e80 ffff8000 49687528 ffff8083 49687510 ffff8083 000e5c70 ffff8000 74e0: 00c22348 ffff8000 00000000 ffff8083 49687510 ffff8083 000e5c74 ffff8000 7500: 4ca2b600 ffff8083 49401800 ffff8083 00000001 00000000 00000000 00000000 7520: 496874d0 ffff8083 00000000 00000000 00000000 00000000 00000000 00000000 7540: 2f2e2d2c 33323130 00000000 00000000 4c944500 ffff8083 00000000 00000000 7560: 00000000 00000000 008751e0 ffff8000 00000001 00000000 124e2d1d 00107b77 Convert hashtab lock to raw lock to avoid such warning. Signed-off-by: Yang Shi Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/hashtab.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 28592d7..19909b2 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -17,7 +17,7 @@ struct bpf_htab { struct bpf_map map; struct hlist_head *buckets; - spinlock_t lock; + raw_spinlock_t lock; u32 count; /* number of elements in this hashtable */ u32 n_buckets; /* number of hash buckets */ u32 elem_size; /* size of each element in bytes */ @@ -82,7 +82,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) for (i = 0; i < htab->n_buckets; i++) INIT_HLIST_HEAD(&htab->buckets[i]); - spin_lock_init(&htab->lock); + raw_spin_lock_init(&htab->lock); htab->count = 0; htab->elem_size = sizeof(struct htab_elem) + @@ -234,7 +234,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, l_new->hash = htab_map_hash(l_new->key, key_size); /* bpf_map_update_elem() can be called in_irq() */ - spin_lock_irqsave(&htab->lock, flags); + raw_spin_lock_irqsave(&htab->lock, flags); head = select_bucket(htab, l_new->hash); @@ -270,11 +270,11 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, } else { htab->count++; } - spin_unlock_irqrestore(&htab->lock, flags); + raw_spin_unlock_irqrestore(&htab->lock, flags); return 0; err: - spin_unlock_irqrestore(&htab->lock, flags); + raw_spin_unlock_irqrestore(&htab->lock, flags); kfree(l_new); return ret; } @@ -295,7 +295,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) hash = htab_map_hash(key, key_size); - spin_lock_irqsave(&htab->lock, flags); + raw_spin_lock_irqsave(&htab->lock, flags); head = select_bucket(htab, hash); @@ -308,7 +308,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) ret = 0; } - spin_unlock_irqrestore(&htab->lock, flags); + raw_spin_unlock_irqrestore(&htab->lock, flags); return ret; } -- cgit v1.1 From aa79781b65b9cf79807ade78f2703f5e9402c336 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 29 Oct 2015 14:58:06 +0100 Subject: bpf: abstract anon_inode_getfd invocations Since we're going to use anon_inode_getfd() invocations in more than just the current places, make a helper function for both, so that we only need to pass a map/prog pointer to the helper itself in order to get a fd. The new helpers are called bpf_map_new_fd() and bpf_prog_new_fd(). Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 687dd6c..2b89ef0 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -111,6 +111,12 @@ static const struct file_operations bpf_map_fops = { .release = bpf_map_release, }; +static int bpf_map_new_fd(struct bpf_map *map) +{ + return anon_inode_getfd("bpf-map", &bpf_map_fops, map, + O_RDWR | O_CLOEXEC); +} + /* helper macro to check that unused fields 'union bpf_attr' are zero */ #define CHECK_ATTR(CMD) \ memchr_inv((void *) &attr->CMD##_LAST_FIELD + \ @@ -141,8 +147,7 @@ static int map_create(union bpf_attr *attr) if (err) goto free_map; - err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC); - + err = bpf_map_new_fd(map); if (err < 0) /* failed to allocate fd */ goto free_map; @@ -538,6 +543,12 @@ static const struct file_operations bpf_prog_fops = { .release = bpf_prog_release, }; +static int bpf_prog_new_fd(struct bpf_prog *prog) +{ + return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, + O_RDWR | O_CLOEXEC); +} + static struct bpf_prog *get_prog(struct fd f) { struct bpf_prog *prog; @@ -647,7 +658,7 @@ static int bpf_prog_load(union bpf_attr *attr) if (err < 0) goto free_used_maps; - err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC); + err = bpf_prog_new_fd(prog); if (err < 0) /* failed to allocate fd */ goto free_used_maps; -- cgit v1.1 From c210129760a010b555372ef74f4e1a46d4eb8a22 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 29 Oct 2015 14:58:07 +0100 Subject: bpf: align and clean bpf_{map,prog}_get helpers Add a bpf_map_get() function that we're going to use later on and align/clean the remaining helpers a bit so that we have them a bit more consistent: - __bpf_map_get() and __bpf_prog_get() that both work on the fd struct, check whether the descriptor is eBPF and return the pointer to the map/prog stored in the private data. Also, we can return f.file->private_data directly, the function signature is enough of a documentation already. - bpf_map_get() and bpf_prog_get() that both work on u32 user fd, call their respective __bpf_map_get()/__bpf_prog_get() variants, and take a reference. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 41 +++++++++++++++++++++++------------------ kernel/bpf/verifier.c | 3 +-- 2 files changed, 24 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2b89ef0..3fff82c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -162,19 +162,29 @@ free_map: /* if error is returned, fd is released. * On success caller should complete fd access with matching fdput() */ -struct bpf_map *bpf_map_get(struct fd f) +struct bpf_map *__bpf_map_get(struct fd f) { - struct bpf_map *map; - if (!f.file) return ERR_PTR(-EBADF); - if (f.file->f_op != &bpf_map_fops) { fdput(f); return ERR_PTR(-EINVAL); } - map = f.file->private_data; + return f.file->private_data; +} + +static struct bpf_map *bpf_map_get(u32 ufd) +{ + struct fd f = fdget(ufd); + struct bpf_map *map; + + map = __bpf_map_get(f); + if (IS_ERR(map)) + return map; + + atomic_inc(&map->refcnt); + fdput(f); return map; } @@ -202,7 +212,7 @@ static int map_lookup_elem(union bpf_attr *attr) return -EINVAL; f = fdget(ufd); - map = bpf_map_get(f); + map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); @@ -261,7 +271,7 @@ static int map_update_elem(union bpf_attr *attr) return -EINVAL; f = fdget(ufd); - map = bpf_map_get(f); + map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); @@ -314,7 +324,7 @@ static int map_delete_elem(union bpf_attr *attr) return -EINVAL; f = fdget(ufd); - map = bpf_map_get(f); + map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); @@ -355,7 +365,7 @@ static int map_get_next_key(union bpf_attr *attr) return -EINVAL; f = fdget(ufd); - map = bpf_map_get(f); + map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); @@ -549,21 +559,16 @@ static int bpf_prog_new_fd(struct bpf_prog *prog) O_RDWR | O_CLOEXEC); } -static struct bpf_prog *get_prog(struct fd f) +static struct bpf_prog *__bpf_prog_get(struct fd f) { - struct bpf_prog *prog; - if (!f.file) return ERR_PTR(-EBADF); - if (f.file->f_op != &bpf_prog_fops) { fdput(f); return ERR_PTR(-EINVAL); } - prog = f.file->private_data; - - return prog; + return f.file->private_data; } /* called by sockets/tracing/seccomp before attaching program to an event @@ -574,13 +579,13 @@ struct bpf_prog *bpf_prog_get(u32 ufd) struct fd f = fdget(ufd); struct bpf_prog *prog; - prog = get_prog(f); - + prog = __bpf_prog_get(f); if (IS_ERR(prog)) return prog; atomic_inc(&prog->aux->refcnt); fdput(f); + return prog; } EXPORT_SYMBOL_GPL(bpf_prog_get); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b56cf51..fdc88c5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1989,8 +1989,7 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) } f = fdget(insn->imm); - - map = bpf_map_get(f); + map = __bpf_map_get(f); if (IS_ERR(map)) { verbose("fd %d is not pointing to valid bpf_map\n", insn->imm); -- cgit v1.1 From e9d8afa90b789b07d414637ab557d169d6b2b84e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 29 Oct 2015 14:58:08 +0100 Subject: bpf: consolidate bpf_prog_put{, _rcu} dismantle paths We currently have duplicated cleanup code in bpf_prog_put() and bpf_prog_put_rcu() cleanup paths. Back then we decided that it was not worth it to make it a common helper called by both, but with the recent addition of resource charging, we could have avoided the fix in commit ac00737f4e81 ("bpf: Need to call bpf_prog_uncharge_memlock from bpf_prog_put") if we would have had only a single, common path. We can simplify it further by assigning aux->prog only once during allocation time. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/core.c | 3 ++- kernel/bpf/syscall.c | 15 +++++---------- 2 files changed, 7 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8086471..334b1bd 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -92,6 +92,7 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) fp->pages = size / PAGE_SIZE; fp->aux = aux; + fp->aux->prog = fp; return fp; } @@ -116,6 +117,7 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); fp->pages = size / PAGE_SIZE; + fp->aux->prog = fp; /* We keep fp->aux from fp_old around in the new * reallocated structure. @@ -726,7 +728,6 @@ void bpf_prog_free(struct bpf_prog *fp) struct bpf_prog_aux *aux = fp->aux; INIT_WORK(&aux->work, bpf_prog_free_deferred); - aux->prog = fp; schedule_work(&aux->work); } EXPORT_SYMBOL_GPL(bpf_prog_free); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3fff82c..d7783cb 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -513,7 +513,7 @@ static void bpf_prog_uncharge_memlock(struct bpf_prog *prog) free_uid(user); } -static void __prog_put_rcu(struct rcu_head *rcu) +static void __prog_put_common(struct rcu_head *rcu) { struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); @@ -525,19 +525,14 @@ static void __prog_put_rcu(struct rcu_head *rcu) /* version of bpf_prog_put() that is called after a grace period */ void bpf_prog_put_rcu(struct bpf_prog *prog) { - if (atomic_dec_and_test(&prog->aux->refcnt)) { - prog->aux->prog = prog; - call_rcu(&prog->aux->rcu, __prog_put_rcu); - } + if (atomic_dec_and_test(&prog->aux->refcnt)) + call_rcu(&prog->aux->rcu, __prog_put_common); } void bpf_prog_put(struct bpf_prog *prog) { - if (atomic_dec_and_test(&prog->aux->refcnt)) { - free_used_maps(prog->aux); - bpf_prog_uncharge_memlock(prog); - bpf_prog_free(prog); - } + if (atomic_dec_and_test(&prog->aux->refcnt)) + __prog_put_common(&prog->aux->rcu); } EXPORT_SYMBOL_GPL(bpf_prog_put); -- cgit v1.1 From b2197755b2633e164a439682fb05a9b5ea48f706 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 29 Oct 2015 14:58:09 +0100 Subject: bpf: add support for persistent maps/progs This work adds support for "persistent" eBPF maps/programs. The term "persistent" is to be understood that maps/programs have a facility that lets them survive process termination. This is desired by various eBPF subsystem users. Just to name one example: tc classifier/action. Whenever tc parses the ELF object, extracts and loads maps/progs into the kernel, these file descriptors will be out of reach after the tc instance exits. So a subsequent tc invocation won't be able to access/relocate on this resource, and therefore maps cannot easily be shared, f.e. between the ingress and egress networking data path. The current workaround is that Unix domain sockets (UDS) need to be instrumented in order to pass the created eBPF map/program file descriptors to a third party management daemon through UDS' socket passing facility. This makes it a bit complicated to deploy shared eBPF maps or programs (programs f.e. for tail calls) among various processes. We've been brainstorming on how we could tackle this issue and various approches have been tried out so far, which can be read up further in the below reference. The architecture we eventually ended up with is a minimal file system that can hold map/prog objects. The file system is a per mount namespace singleton, and the default mount point is /sys/fs/bpf/. Any subsequent mounts within a given namespace will point to the same instance. The file system allows for creating a user-defined directory structure. The objects for maps/progs are created/fetched through bpf(2) with two new commands (BPF_OBJ_PIN/BPF_OBJ_GET). I.e. a bpf file descriptor along with a pathname is being passed to bpf(2) that in turn creates (we call it eBPF object pinning) the file system nodes. Only the pathname is being passed to bpf(2) for getting a new BPF file descriptor to an existing node. The user can use that to access maps and progs later on, through bpf(2). Removal of file system nodes is being managed through normal VFS functions such as unlink(2), etc. The file system code is kept to a very minimum and can be further extended later on. The next step I'm working on is to add dump eBPF map/prog commands to bpf(2), so that a specification from a given file descriptor can be retrieved. This can be used by things like CRIU but also applications can inspect the meta data after calling BPF_OBJ_GET. Big thanks also to Alexei and Hannes who significantly contributed in the design discussion that eventually let us end up with this architecture here. Reference: https://lkml.org/lkml/2015/10/15/925 Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- kernel/bpf/Makefile | 4 +- kernel/bpf/inode.c | 387 +++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 30 +++- 3 files changed, 417 insertions(+), 4 deletions(-) create mode 100644 kernel/bpf/inode.c (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index e6983be..1327258 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,2 +1,4 @@ obj-y := core.o -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o + +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o +obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c new file mode 100644 index 0000000..be6d726 --- /dev/null +++ b/kernel/bpf/inode.c @@ -0,0 +1,387 @@ +/* + * Minimal file system backend for holding eBPF maps and programs, + * used by bpf(2) object pinning. + * + * Authors: + * + * Daniel Borkmann + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +enum bpf_type { + BPF_TYPE_UNSPEC = 0, + BPF_TYPE_PROG, + BPF_TYPE_MAP, +}; + +static void *bpf_any_get(void *raw, enum bpf_type type) +{ + switch (type) { + case BPF_TYPE_PROG: + atomic_inc(&((struct bpf_prog *)raw)->aux->refcnt); + break; + case BPF_TYPE_MAP: + atomic_inc(&((struct bpf_map *)raw)->refcnt); + break; + default: + WARN_ON_ONCE(1); + break; + } + + return raw; +} + +static void bpf_any_put(void *raw, enum bpf_type type) +{ + switch (type) { + case BPF_TYPE_PROG: + bpf_prog_put(raw); + break; + case BPF_TYPE_MAP: + bpf_map_put(raw); + break; + default: + WARN_ON_ONCE(1); + break; + } +} + +static void *bpf_fd_probe_obj(u32 ufd, enum bpf_type *type) +{ + void *raw; + + *type = BPF_TYPE_MAP; + raw = bpf_map_get(ufd); + if (IS_ERR(raw)) { + *type = BPF_TYPE_PROG; + raw = bpf_prog_get(ufd); + } + + return raw; +} + +static const struct inode_operations bpf_dir_iops; + +static const struct inode_operations bpf_prog_iops = { }; +static const struct inode_operations bpf_map_iops = { }; + +static struct inode *bpf_get_inode(struct super_block *sb, + const struct inode *dir, + umode_t mode) +{ + struct inode *inode; + + switch (mode & S_IFMT) { + case S_IFDIR: + case S_IFREG: + break; + default: + return ERR_PTR(-EINVAL); + } + + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOSPC); + + inode->i_ino = get_next_ino(); + inode->i_atime = CURRENT_TIME; + inode->i_mtime = inode->i_atime; + inode->i_ctime = inode->i_atime; + + inode_init_owner(inode, dir, mode); + + return inode; +} + +static int bpf_inode_type(const struct inode *inode, enum bpf_type *type) +{ + *type = BPF_TYPE_UNSPEC; + if (inode->i_op == &bpf_prog_iops) + *type = BPF_TYPE_PROG; + else if (inode->i_op == &bpf_map_iops) + *type = BPF_TYPE_MAP; + else + return -EACCES; + + return 0; +} + +static bool bpf_dname_reserved(const struct dentry *dentry) +{ + return strchr(dentry->d_name.name, '.'); +} + +static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct inode *inode; + + if (bpf_dname_reserved(dentry)) + return -EPERM; + + inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &bpf_dir_iops; + inode->i_fop = &simple_dir_operations; + + inc_nlink(inode); + inc_nlink(dir); + + d_instantiate(dentry, inode); + dget(dentry); + + return 0; +} + +static int bpf_mkobj_ops(struct inode *dir, struct dentry *dentry, + umode_t mode, const struct inode_operations *iops) +{ + struct inode *inode; + + if (bpf_dname_reserved(dentry)) + return -EPERM; + + inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFREG); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = iops; + inode->i_private = dentry->d_fsdata; + + d_instantiate(dentry, inode); + dget(dentry); + + return 0; +} + +static int bpf_mkobj(struct inode *dir, struct dentry *dentry, umode_t mode, + dev_t devt) +{ + enum bpf_type type = MINOR(devt); + + if (MAJOR(devt) != UNNAMED_MAJOR || !S_ISREG(mode) || + dentry->d_fsdata == NULL) + return -EPERM; + + switch (type) { + case BPF_TYPE_PROG: + return bpf_mkobj_ops(dir, dentry, mode, &bpf_prog_iops); + case BPF_TYPE_MAP: + return bpf_mkobj_ops(dir, dentry, mode, &bpf_map_iops); + default: + return -EPERM; + } +} + +static const struct inode_operations bpf_dir_iops = { + .lookup = simple_lookup, + .mknod = bpf_mkobj, + .mkdir = bpf_mkdir, + .rmdir = simple_rmdir, + .unlink = simple_unlink, +}; + +static int bpf_obj_do_pin(const struct filename *pathname, void *raw, + enum bpf_type type) +{ + struct dentry *dentry; + struct inode *dir; + struct path path; + umode_t mode; + dev_t devt; + int ret; + + dentry = kern_path_create(AT_FDCWD, pathname->name, &path, 0); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask()); + devt = MKDEV(UNNAMED_MAJOR, type); + + ret = security_path_mknod(&path, dentry, mode, devt); + if (ret) + goto out; + + dir = d_inode(path.dentry); + if (dir->i_op != &bpf_dir_iops) { + ret = -EPERM; + goto out; + } + + dentry->d_fsdata = raw; + ret = vfs_mknod(dir, dentry, mode, devt); + dentry->d_fsdata = NULL; +out: + done_path_create(&path, dentry); + return ret; +} + +int bpf_obj_pin_user(u32 ufd, const char __user *pathname) +{ + struct filename *pname; + enum bpf_type type; + void *raw; + int ret; + + pname = getname(pathname); + if (IS_ERR(pname)) + return PTR_ERR(pname); + + raw = bpf_fd_probe_obj(ufd, &type); + if (IS_ERR(raw)) { + ret = PTR_ERR(raw); + goto out; + } + + ret = bpf_obj_do_pin(pname, raw, type); + if (ret != 0) + bpf_any_put(raw, type); +out: + putname(pname); + return ret; +} + +static void *bpf_obj_do_get(const struct filename *pathname, + enum bpf_type *type) +{ + struct inode *inode; + struct path path; + void *raw; + int ret; + + ret = kern_path(pathname->name, LOOKUP_FOLLOW, &path); + if (ret) + return ERR_PTR(ret); + + inode = d_backing_inode(path.dentry); + ret = inode_permission(inode, MAY_WRITE); + if (ret) + goto out; + + ret = bpf_inode_type(inode, type); + if (ret) + goto out; + + raw = bpf_any_get(inode->i_private, *type); + touch_atime(&path); + + path_put(&path); + return raw; +out: + path_put(&path); + return ERR_PTR(ret); +} + +int bpf_obj_get_user(const char __user *pathname) +{ + enum bpf_type type = BPF_TYPE_UNSPEC; + struct filename *pname; + int ret = -ENOENT; + void *raw; + + pname = getname(pathname); + if (IS_ERR(pname)) + return PTR_ERR(pname); + + raw = bpf_obj_do_get(pname, &type); + if (IS_ERR(raw)) { + ret = PTR_ERR(raw); + goto out; + } + + if (type == BPF_TYPE_PROG) + ret = bpf_prog_new_fd(raw); + else if (type == BPF_TYPE_MAP) + ret = bpf_map_new_fd(raw); + else + goto out; + + if (ret < 0) + bpf_any_put(raw, type); +out: + putname(pname); + return ret; +} + +static void bpf_evict_inode(struct inode *inode) +{ + enum bpf_type type; + + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); + + if (!bpf_inode_type(inode, &type)) + bpf_any_put(inode->i_private, type); +} + +static const struct super_operations bpf_super_ops = { + .statfs = simple_statfs, + .drop_inode = generic_delete_inode, + .evict_inode = bpf_evict_inode, +}; + +static int bpf_fill_super(struct super_block *sb, void *data, int silent) +{ + static struct tree_descr bpf_rfiles[] = { { "" } }; + struct inode *inode; + int ret; + + ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles); + if (ret) + return ret; + + sb->s_op = &bpf_super_ops; + + inode = sb->s_root->d_inode; + inode->i_op = &bpf_dir_iops; + inode->i_mode &= ~S_IALLUGO; + inode->i_mode |= S_ISVTX | S_IRWXUGO; + + return 0; +} + +static struct dentry *bpf_mount(struct file_system_type *type, int flags, + const char *dev_name, void *data) +{ + return mount_ns(type, flags, current->nsproxy->mnt_ns, bpf_fill_super); +} + +static struct file_system_type bpf_fs_type = { + .owner = THIS_MODULE, + .name = "bpf", + .mount = bpf_mount, + .kill_sb = kill_litter_super, + .fs_flags = FS_USERNS_MOUNT, +}; + +MODULE_ALIAS_FS("bpf"); + +static int __init bpf_init(void) +{ + int ret; + + ret = sysfs_create_mount_point(fs_kobj, "bpf"); + if (ret) + return ret; + + ret = register_filesystem(&bpf_fs_type); + if (ret) + sysfs_remove_mount_point(fs_kobj, "bpf"); + + return ret; +} +fs_initcall(bpf_init); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d7783cb..0d3313d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -111,7 +111,7 @@ static const struct file_operations bpf_map_fops = { .release = bpf_map_release, }; -static int bpf_map_new_fd(struct bpf_map *map) +int bpf_map_new_fd(struct bpf_map *map) { return anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC); @@ -174,7 +174,7 @@ struct bpf_map *__bpf_map_get(struct fd f) return f.file->private_data; } -static struct bpf_map *bpf_map_get(u32 ufd) +struct bpf_map *bpf_map_get(u32 ufd) { struct fd f = fdget(ufd); struct bpf_map *map; @@ -548,7 +548,7 @@ static const struct file_operations bpf_prog_fops = { .release = bpf_prog_release, }; -static int bpf_prog_new_fd(struct bpf_prog *prog) +int bpf_prog_new_fd(struct bpf_prog *prog) { return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC); @@ -674,6 +674,24 @@ free_prog_nouncharge: return err; } +#define BPF_OBJ_LAST_FIELD bpf_fd + +static int bpf_obj_pin(const union bpf_attr *attr) +{ + if (CHECK_ATTR(BPF_OBJ)) + return -EINVAL; + + return bpf_obj_pin_user(attr->bpf_fd, u64_to_ptr(attr->pathname)); +} + +static int bpf_obj_get(const union bpf_attr *attr) +{ + if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0) + return -EINVAL; + + return bpf_obj_get_user(u64_to_ptr(attr->pathname)); +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -734,6 +752,12 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_PROG_LOAD: err = bpf_prog_load(&attr); break; + case BPF_OBJ_PIN: + err = bpf_obj_pin(&attr); + break; + case BPF_OBJ_GET: + err = bpf_obj_get(&attr); + break; default: err = -EINVAL; break; -- cgit v1.1 From 1d056d9c95be87725c07e514930b41c2c7174e75 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 3 Nov 2015 11:39:20 +0100 Subject: bpf, verifier: annotate verbose printer with __printf The verbose() printer dumps the verifier state to user space, so let gcc take care to check calls to verbose() for (future) errors. make with W=1 correctly suggests: function might be possible candidate for 'gnu_printf' format attribute [-Wsuggest-attribute=format]. Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fdc88c5..c607305 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -214,7 +214,7 @@ static DEFINE_MUTEX(bpf_verifier_lock); * verbose() is used to dump the verification trace to the log, so the user * can figure out what's wrong with the program */ -static void verbose(const char *fmt, ...) +static __printf(1, 2) void verbose(const char *fmt, ...) { va_list args; -- cgit v1.1 From bb99d8ccec7f83a2730a29d1ae7eee5ffa446a9e Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Fri, 30 Oct 2015 14:25:39 +0900 Subject: tracing: Allow arch-specific stack tracer A stack frame may be used in a different way depending on cpu architecture. Thus it is not always appropriate to slurp the stack contents, as current check_stack() does, in order to calcurate a stack index (height) at a given function call. At least not on arm64. In addition, there is a possibility that we will mistakenly detect a stale stack frame which has not been overwritten. This patch makes check_stack() a weak function so as to later implement arch-specific version. Link: http://lkml.kernel.org/r/1446182741-31019-5-git-send-email-takahiro.akashi@linaro.org Signed-off-by: AKASHI Takahiro Signed-off-by: Steven Rostedt --- kernel/trace/trace_stack.c | 80 +++++++++++++++++++++++++--------------------- 1 file changed, 43 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index b746399..50945a7 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -16,24 +16,22 @@ #include "trace.h" -#define STACK_TRACE_ENTRIES 500 - static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] = { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX }; -static unsigned stack_dump_index[STACK_TRACE_ENTRIES]; +unsigned stack_trace_index[STACK_TRACE_ENTRIES]; /* * Reserve one entry for the passed in ip. This will allow * us to remove most or all of the stack size overhead * added by the stack tracer itself. */ -static struct stack_trace max_stack_trace = { +struct stack_trace stack_trace_max = { .max_entries = STACK_TRACE_ENTRIES - 1, .entries = &stack_dump_trace[0], }; -static unsigned long max_stack_size; -static arch_spinlock_t max_stack_lock = +unsigned long stack_trace_max_size; +arch_spinlock_t max_stack_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; static DEFINE_PER_CPU(int, trace_active); @@ -42,30 +40,38 @@ static DEFINE_MUTEX(stack_sysctl_mutex); int stack_tracer_enabled; static int last_stack_tracer_enabled; -static inline void print_max_stack(void) +void stack_trace_print(void) { long i; int size; pr_emerg(" Depth Size Location (%d entries)\n" " ----- ---- --------\n", - max_stack_trace.nr_entries); + stack_trace_max.nr_entries); - for (i = 0; i < max_stack_trace.nr_entries; i++) { + for (i = 0; i < stack_trace_max.nr_entries; i++) { if (stack_dump_trace[i] == ULONG_MAX) break; - if (i+1 == max_stack_trace.nr_entries || + if (i+1 == stack_trace_max.nr_entries || stack_dump_trace[i+1] == ULONG_MAX) - size = stack_dump_index[i]; + size = stack_trace_index[i]; else - size = stack_dump_index[i] - stack_dump_index[i+1]; + size = stack_trace_index[i] - stack_trace_index[i+1]; - pr_emerg("%3ld) %8d %5d %pS\n", i, stack_dump_index[i], + pr_emerg("%3ld) %8d %5d %pS\n", i, stack_trace_index[i], size, (void *)stack_dump_trace[i]); } } -static inline void +/* + * When arch-specific code overides this function, the following + * data should be filled up, assuming max_stack_lock is held to + * prevent concurrent updates. + * stack_trace_index[] + * stack_trace_max + * stack_trace_max_size + */ +void __weak check_stack(unsigned long ip, unsigned long *stack) { unsigned long this_size, flags; unsigned long *p, *top, *start; @@ -78,7 +84,7 @@ check_stack(unsigned long ip, unsigned long *stack) /* Remove the frame of the tracer */ this_size -= frame_size; - if (this_size <= max_stack_size) + if (this_size <= stack_trace_max_size) return; /* we do not handle interrupt stacks yet */ @@ -93,18 +99,18 @@ check_stack(unsigned long ip, unsigned long *stack) this_size -= tracer_frame; /* a race could have already updated it */ - if (this_size <= max_stack_size) + if (this_size <= stack_trace_max_size) goto out; - max_stack_size = this_size; + stack_trace_max_size = this_size; - max_stack_trace.nr_entries = 0; - max_stack_trace.skip = 3; + stack_trace_max.nr_entries = 0; + stack_trace_max.skip = 3; - save_stack_trace(&max_stack_trace); + save_stack_trace(&stack_trace_max); /* Skip over the overhead of the stack tracer itself */ - for (i = 0; i < max_stack_trace.nr_entries; i++) { + for (i = 0; i < stack_trace_max.nr_entries; i++) { if (stack_dump_trace[i] == ip) break; } @@ -124,18 +130,18 @@ check_stack(unsigned long ip, unsigned long *stack) * loop will only happen once. This code only takes place * on a new max, so it is far from a fast path. */ - while (i < max_stack_trace.nr_entries) { + while (i < stack_trace_max.nr_entries) { int found = 0; - stack_dump_index[x] = this_size; + stack_trace_index[x] = this_size; p = start; - for (; p < top && i < max_stack_trace.nr_entries; p++) { + for (; p < top && i < stack_trace_max.nr_entries; p++) { if (stack_dump_trace[i] == ULONG_MAX) break; if (*p == stack_dump_trace[i]) { stack_dump_trace[x] = stack_dump_trace[i++]; - this_size = stack_dump_index[x++] = + this_size = stack_trace_index[x++] = (top - p) * sizeof(unsigned long); found = 1; /* Start the search from here */ @@ -150,7 +156,7 @@ check_stack(unsigned long ip, unsigned long *stack) if (unlikely(!tracer_frame)) { tracer_frame = (p - stack) * sizeof(unsigned long); - max_stack_size -= tracer_frame; + stack_trace_max_size -= tracer_frame; } } } @@ -159,12 +165,12 @@ check_stack(unsigned long ip, unsigned long *stack) i++; } - max_stack_trace.nr_entries = x; + stack_trace_max.nr_entries = x; for (; x < i; x++) stack_dump_trace[x] = ULONG_MAX; if (task_stack_end_corrupted(current)) { - print_max_stack(); + stack_trace_print(); BUG(); } @@ -262,7 +268,7 @@ __next(struct seq_file *m, loff_t *pos) { long n = *pos - 1; - if (n > max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX) + if (n > stack_trace_max.nr_entries || stack_dump_trace[n] == ULONG_MAX) return NULL; m->private = (void *)n; @@ -332,9 +338,9 @@ static int t_show(struct seq_file *m, void *v) seq_printf(m, " Depth Size Location" " (%d entries)\n" " ----- ---- --------\n", - max_stack_trace.nr_entries); + stack_trace_max.nr_entries); - if (!stack_tracer_enabled && !max_stack_size) + if (!stack_tracer_enabled && !stack_trace_max_size) print_disabled(m); return 0; @@ -342,17 +348,17 @@ static int t_show(struct seq_file *m, void *v) i = *(long *)v; - if (i >= max_stack_trace.nr_entries || + if (i >= stack_trace_max.nr_entries || stack_dump_trace[i] == ULONG_MAX) return 0; - if (i+1 == max_stack_trace.nr_entries || + if (i+1 == stack_trace_max.nr_entries || stack_dump_trace[i+1] == ULONG_MAX) - size = stack_dump_index[i]; + size = stack_trace_index[i]; else - size = stack_dump_index[i] - stack_dump_index[i+1]; + size = stack_trace_index[i] - stack_trace_index[i+1]; - seq_printf(m, "%3ld) %8d %5d ", i, stack_dump_index[i], size); + seq_printf(m, "%3ld) %8d %5d ", i, stack_trace_index[i], size); trace_lookup_stack(m, i); @@ -442,7 +448,7 @@ static __init int stack_trace_init(void) return 0; trace_create_file("stack_max_size", 0644, d_tracer, - &max_stack_size, &stack_max_size_fops); + &stack_trace_max_size, &stack_max_size_fops); trace_create_file("stack_trace", 0444, d_tracer, NULL, &stack_trace_fops); -- cgit v1.1 From d332736df0c277905de06311ae084e2c76580a3f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 3 Nov 2015 14:50:15 -0500 Subject: tracing: Rename max_stack_lock to stack_trace_max_lock Now that max_stack_lock is a global variable, it requires a naming convention that is unlikely to collide. Rename it to the same naming convention that the other stack_trace variables have. Signed-off-by: Steven Rostedt --- kernel/trace/trace_stack.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 50945a7..0bd212a 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -31,7 +31,7 @@ struct stack_trace stack_trace_max = { }; unsigned long stack_trace_max_size; -arch_spinlock_t max_stack_lock = +arch_spinlock_t stack_trace_max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; static DEFINE_PER_CPU(int, trace_active); @@ -65,7 +65,7 @@ void stack_trace_print(void) /* * When arch-specific code overides this function, the following - * data should be filled up, assuming max_stack_lock is held to + * data should be filled up, assuming stack_trace_max_lock is held to * prevent concurrent updates. * stack_trace_index[] * stack_trace_max @@ -92,7 +92,7 @@ check_stack(unsigned long ip, unsigned long *stack) return; local_irq_save(flags); - arch_spin_lock(&max_stack_lock); + arch_spin_lock(&stack_trace_max_lock); /* In case another CPU set the tracer_frame on us */ if (unlikely(!frame_size)) @@ -175,7 +175,7 @@ check_stack(unsigned long ip, unsigned long *stack) } out: - arch_spin_unlock(&max_stack_lock); + arch_spin_unlock(&stack_trace_max_lock); local_irq_restore(flags); } @@ -246,9 +246,9 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, cpu = smp_processor_id(); per_cpu(trace_active, cpu)++; - arch_spin_lock(&max_stack_lock); + arch_spin_lock(&stack_trace_max_lock); *ptr = val; - arch_spin_unlock(&max_stack_lock); + arch_spin_unlock(&stack_trace_max_lock); per_cpu(trace_active, cpu)--; local_irq_restore(flags); @@ -291,7 +291,7 @@ static void *t_start(struct seq_file *m, loff_t *pos) cpu = smp_processor_id(); per_cpu(trace_active, cpu)++; - arch_spin_lock(&max_stack_lock); + arch_spin_lock(&stack_trace_max_lock); if (*pos == 0) return SEQ_START_TOKEN; @@ -303,7 +303,7 @@ static void t_stop(struct seq_file *m, void *p) { int cpu; - arch_spin_unlock(&max_stack_lock); + arch_spin_unlock(&stack_trace_max_lock); cpu = smp_processor_id(); per_cpu(trace_active, cpu)--; -- cgit v1.1 From fb8c2293e1a3c4a35a571b82cc2efae0c9e59b2b Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Tue, 3 Nov 2015 21:49:20 +0300 Subject: tracing: Remove redundant TP_ARGS redefining TP_ARGS is not used anywhere in trace.h nor trace_entries.h Firstly, I left just #undef TP_ARGS and had no errors - remove it. Link: http://lkml.kernel.org/r/1446576560-14085-1-git-send-email-0x7f454c46@gmail.com Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index be2126f..dd76208 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -71,9 +71,6 @@ enum trace_type { tstruct \ } -#undef TP_ARGS -#define TP_ARGS(args...) args - #undef FTRACE_ENTRY_DUP #define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk, filter) -- cgit v1.1 From 8b46ff6938d2c78a16520a37086944ecbaa8ab8b Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Mon, 7 Sep 2015 14:38:37 +0200 Subject: ring_buffer: Do no not complete benchmark reader too early It seems that complete(&read_done) might be called too early in some situations. 1st scenario: ------------- CPU0 CPU1 ring_buffer_producer_thread() wake_up_process(consumer); wait_for_completion(&read_start); ring_buffer_consumer_thread() complete(&read_start); ring_buffer_producer() # producing data in # the do-while cycle ring_buffer_consumer(); # reading data # got error # set kill_test = 1; set_current_state( TASK_INTERRUPTIBLE); if (reader_finish) # false schedule(); # producer still in the middle of # do-while cycle if (consumer && !(cnt % wakeup_interval)) wake_up_process(consumer); # spurious wakeup while (!reader_finish && !kill_test) # leaving because # kill_test == 1 reader_finish = 0; complete(&read_done); 1st BANG: We might access uninitialized "read_done" if this is the the first round. # producer finally leaving # the do-while cycle because kill_test == 1; if (consumer) { reader_finish = 1; wake_up_process(consumer); wait_for_completion(&read_done); 2nd BANG: This will never complete because consumer already did the completion. 2nd scenario: ------------- CPU0 CPU1 ring_buffer_producer_thread() wake_up_process(consumer); wait_for_completion(&read_start); ring_buffer_consumer_thread() complete(&read_start); ring_buffer_producer() # CPU3 removes the module <--- difference from # and stops producer <--- the 1st scenario if (kthread_should_stop()) kill_test = 1; ring_buffer_consumer(); while (!reader_finish && !kill_test) # kill_test == 1 => we never go # into the top level while() reader_finish = 0; complete(&read_done); # producer still in the middle of # do-while cycle if (consumer && !(cnt % wakeup_interval)) wake_up_process(consumer); # spurious wakeup while (!reader_finish && !kill_test) # leaving because kill_test == 1 reader_finish = 0; complete(&read_done); BANG: We are in the same "bang" situations as in the 1st scenario. Root of the problem: -------------------- ring_buffer_consumer() must complete "read_done" only when "reader_finish" variable is set. It must not be skipped due to other conditions. Note that we still must keep the check for "reader_finish" in a loop because there might be spurious wakeups as described in the above scenarios. Solution: ---------- The top level cycle in ring_buffer_consumer() will finish only when "reader_finish" is set. The data will be read in "while-do" cycle so that they are not read after an error (kill_test == 1) or a spurious wake up. In addition, "reader_finish" is manipulated by the producer thread. Therefore we add READ_ONCE() to make sure that the fresh value is read in each cycle. Also we add the corresponding barrier to synchronize the sleep check. Next we set the state back to TASK_RUNNING for the situation where we did not sleep. Just from paranoid reasons, we initialize both completions statically. This is safer, in case there are other races that we are unaware of. As a side effect we could remove the memory barrier from ring_buffer_producer_thread(). IMHO, this was the reason for the barrier. ring_buffer_reset() uses spin locks that should provide the needed memory barrier for using the buffer. Link: http://lkml.kernel.org/r/1441629518-32712-2-git-send-email-pmladek@suse.com Signed-off-by: Petr Mladek Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer_benchmark.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index a1503a0..9ea7949 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -24,8 +24,8 @@ struct rb_page { static int wakeup_interval = 100; static int reader_finish; -static struct completion read_start; -static struct completion read_done; +static DECLARE_COMPLETION(read_start); +static DECLARE_COMPLETION(read_done); static struct ring_buffer *buffer; static struct task_struct *producer; @@ -178,10 +178,14 @@ static void ring_buffer_consumer(void) read_events ^= 1; read = 0; - while (!reader_finish && !kill_test) { - int found; + /* + * Continue running until the producer specifically asks to stop + * and is ready for the completion. + */ + while (!READ_ONCE(reader_finish)) { + int found = 1; - do { + while (found && !kill_test) { int cpu; found = 0; @@ -195,17 +199,23 @@ static void ring_buffer_consumer(void) if (kill_test) break; + if (stat == EVENT_FOUND) found = 1; + } - } while (found && !kill_test); + } + /* Wait till the producer wakes us up when there is more data + * available or when the producer wants us to finish reading. + */ set_current_state(TASK_INTERRUPTIBLE); if (reader_finish) break; schedule(); } + __set_current_state(TASK_RUNNING); reader_finish = 0; complete(&read_done); } @@ -389,13 +399,10 @@ static int ring_buffer_consumer_thread(void *arg) static int ring_buffer_producer_thread(void *arg) { - init_completion(&read_start); - while (!kthread_should_stop() && !kill_test) { ring_buffer_reset(buffer); if (consumer) { - smp_wmb(); wake_up_process(consumer); wait_for_completion(&read_start); } -- cgit v1.1 From f47cb66df2f89dd1f796742c64f9ead77e548a6a Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Mon, 7 Sep 2015 14:38:38 +0200 Subject: ring_buffer: Fix more races when terminating the producer in the benchmark The commit b44754d8262d3aab8 ("ring_buffer: Allow to exit the ring buffer benchmark immediately") added a hack into ring_buffer_producer() that set @kill_test when kthread_should_stop() returned true. It improved the situation a lot. It stopped the kthread in most cases because the producer spent most of the time in the patched while cycle. But there are still few possible races when kthread_should_stop() is set outside of the cycle. Then we do not set @kill_test and some other checks pass. This patch adds a better fix. It renames @test_kill/TEST_KILL() into a better descriptive @test_error/TEST_ERROR(). Also it introduces break_test() function that checks for both @test_error and kthread_should_stop(). The new function is used in the producer when the check for @test_error is not enough. It is not used in the consumer because its state is manipulated by the producer via the "reader_finish" variable. Also we add a missing check into ring_buffer_producer_thread() between setting TASK_INTERRUPTIBLE and calling schedule_timeout(). Otherwise, we might miss a wakeup from kthread_stop(). Link: http://lkml.kernel.org/r/1441629518-32712-3-git-send-email-pmladek@suse.com Signed-off-by: Petr Mladek Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer_benchmark.c | 54 +++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 9ea7949..9e00fd1 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -60,12 +60,12 @@ MODULE_PARM_DESC(consumer_fifo, "fifo prio for consumer"); static int read_events; -static int kill_test; +static int test_error; -#define KILL_TEST() \ +#define TEST_ERROR() \ do { \ - if (!kill_test) { \ - kill_test = 1; \ + if (!test_error) { \ + test_error = 1; \ WARN_ON(1); \ } \ } while (0) @@ -75,6 +75,11 @@ enum event_status { EVENT_DROPPED, }; +static bool break_test(void) +{ + return test_error || kthread_should_stop(); +} + static enum event_status read_event(int cpu) { struct ring_buffer_event *event; @@ -87,7 +92,7 @@ static enum event_status read_event(int cpu) entry = ring_buffer_event_data(event); if (*entry != cpu) { - KILL_TEST(); + TEST_ERROR(); return EVENT_DROPPED; } @@ -115,10 +120,10 @@ static enum event_status read_page(int cpu) rpage = bpage; /* The commit may have missed event flags set, clear them */ commit = local_read(&rpage->commit) & 0xfffff; - for (i = 0; i < commit && !kill_test; i += inc) { + for (i = 0; i < commit && !test_error ; i += inc) { if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) { - KILL_TEST(); + TEST_ERROR(); break; } @@ -128,7 +133,7 @@ static enum event_status read_page(int cpu) case RINGBUF_TYPE_PADDING: /* failed writes may be discarded events */ if (!event->time_delta) - KILL_TEST(); + TEST_ERROR(); inc = event->array[0] + 4; break; case RINGBUF_TYPE_TIME_EXTEND: @@ -137,12 +142,12 @@ static enum event_status read_page(int cpu) case 0: entry = ring_buffer_event_data(event); if (*entry != cpu) { - KILL_TEST(); + TEST_ERROR(); break; } read++; if (!event->array[0]) { - KILL_TEST(); + TEST_ERROR(); break; } inc = event->array[0] + 4; @@ -150,17 +155,17 @@ static enum event_status read_page(int cpu) default: entry = ring_buffer_event_data(event); if (*entry != cpu) { - KILL_TEST(); + TEST_ERROR(); break; } read++; inc = ((event->type_len + 1) * 4); } - if (kill_test) + if (test_error) break; if (inc <= 0) { - KILL_TEST(); + TEST_ERROR(); break; } } @@ -185,7 +190,7 @@ static void ring_buffer_consumer(void) while (!READ_ONCE(reader_finish)) { int found = 1; - while (found && !kill_test) { + while (found && !test_error) { int cpu; found = 0; @@ -197,7 +202,7 @@ static void ring_buffer_consumer(void) else stat = read_page(cpu); - if (kill_test) + if (test_error) break; if (stat == EVENT_FOUND) @@ -273,10 +278,7 @@ static void ring_buffer_producer(void) if (cnt % wakeup_interval) cond_resched(); #endif - if (kthread_should_stop()) - kill_test = 1; - - } while (ktime_before(end_time, timeout) && !kill_test); + } while (ktime_before(end_time, timeout) && !break_test()); trace_printk("End ring buffer hammer\n"); if (consumer) { @@ -297,7 +299,7 @@ static void ring_buffer_producer(void) entries = ring_buffer_entries(buffer); overruns = ring_buffer_overruns(buffer); - if (kill_test && !kthread_should_stop()) + if (test_error) trace_printk("ERROR!\n"); if (!disable_reader) { @@ -378,15 +380,14 @@ static void wait_to_die(void) static int ring_buffer_consumer_thread(void *arg) { - while (!kthread_should_stop() && !kill_test) { + while (!break_test()) { complete(&read_start); ring_buffer_consumer(); set_current_state(TASK_INTERRUPTIBLE); - if (kthread_should_stop() || kill_test) + if (break_test()) break; - schedule(); } __set_current_state(TASK_RUNNING); @@ -399,7 +400,7 @@ static int ring_buffer_consumer_thread(void *arg) static int ring_buffer_producer_thread(void *arg) { - while (!kthread_should_stop() && !kill_test) { + while (!break_test()) { ring_buffer_reset(buffer); if (consumer) { @@ -408,15 +409,18 @@ static int ring_buffer_producer_thread(void *arg) } ring_buffer_producer(); - if (kill_test) + if (break_test()) goto out_kill; trace_printk("Sleeping for 10 secs\n"); set_current_state(TASK_INTERRUPTIBLE); + if (break_test()) + goto out_kill; schedule_timeout(HZ * SLEEP_TIME); } out_kill: + __set_current_state(TASK_RUNNING); if (!kthread_should_stop()) wait_to_die(); -- cgit v1.1 From 919cd9799936843d0af4f0904a3e39e70294c4d8 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Fri, 4 Sep 2015 12:45:56 -0400 Subject: tracing: Allow dumping traces without tracking trace started cpus We don't init iter->started when dumping the ftrace buffer, and there's no real need to do so - so allow skipping that check if the iter doesn't have an initialized ->started cpumask. Link: http://lkml.kernel.org/r/1441385156-27279-1-git-send-email-sasha.levin@oracle.com Signed-off-by: Sasha Levin Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 67873c6..6459e14 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2671,13 +2671,14 @@ static void test_cpu_buff_start(struct trace_iterator *iter) if (!(iter->iter_flags & TRACE_FILE_ANNOTATE)) return; - if (cpumask_test_cpu(iter->cpu, iter->started)) + if (iter->started && cpumask_test_cpu(iter->cpu, iter->started)) return; if (per_cpu_ptr(iter->trace_buffer->data, iter->cpu)->skipped_entries) return; - cpumask_set_cpu(iter->cpu, iter->started); + if (iter->started) + cpumask_set_cpu(iter->cpu, iter->started); /* Don't print started cpu buffer for the first entry of the trace */ if (iter->idx > 1) -- cgit v1.1 From 54ed1444052467044e9e01334ac8123dd6345211 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 3 Nov 2015 16:19:02 -0500 Subject: ring_buffer: Remove unneeded smp_wmb() before wakeup of reader benchmark wake_up_process() has a memory barrier before doing anything, thus adding a memory barrier before calling it is redundant. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer_benchmark.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 9e00fd1..6df9a83 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -288,8 +288,6 @@ static void ring_buffer_producer(void) /* the completions must be visible before the finish var */ smp_wmb(); reader_finish = 1; - /* finish var visible before waking up the consumer */ - smp_wmb(); wake_up_process(consumer); wait_for_completion(&read_done); } -- cgit v1.1 From 105ff3cbf225036b75a6a46c96d1ddce8e7bdc66 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 3 Nov 2015 17:22:17 -0800 Subject: atomic: remove all traces of READ_ONCE_CTRL() and atomic*_read_ctrl() This seems to be a mis-reading of how alpha memory ordering works, and is not backed up by the alpha architecture manual. The helper functions don't do anything special on any other architectures, and the arguments that support them being safe on other architectures also argue that they are safe on alpha. Basically, the "control dependency" is between a previous read and a subsequent write that is dependent on the value read. Even if the subsequent write is actually done speculatively, there is no way that such a speculative write could be made visible to other cpu's until it has been committed, which requires validating the speculation. Note that most weakely ordered architectures (very much including alpha) do not guarantee any ordering relationship between two loads that depend on each other on a control dependency: read A if (val == 1) read B because the conditional may be predicted, and the "read B" may be speculatively moved up to before reading the value A. So we require the user to insert a smp_rmb() between the two accesses to be correct: read A; if (A == 1) smp_rmb() read B Alpha is further special in that it can break that ordering even if the *address* of B depends on the read of A, because the cacheline that is read later may be stale unless you have a memory barrier in between the pointer read and the read of the value behind a pointer: read ptr read offset(ptr) whereas all other weakly ordered architectures guarantee that the data dependency (as opposed to just a control dependency) will order the two accesses. As a result, alpha needs a "smp_read_barrier_depends()" in between those two reads for them to be ordered. The coontrol dependency that "READ_ONCE_CTRL()" and "atomic_read_ctrl()" had was a control dependency to a subsequent *write*, however, and nobody can finalize such a subsequent write without having actually done the read. And were you to write such a value to a "stale" cacheline (the way the unordered reads came to be), that would seem to lose the write entirely. So the things that make alpha able to re-order reads even more aggressively than other weak architectures do not seem to be relevant for a subsequent write. Alpha memory ordering may be strange, but there's no real indication that it is *that* strange. Also, the alpha architecture reference manual very explicitly talks about the definition of "Dependence Constraints" in section 5.6.1.7, where a preceding read dominates a subsequent write. Such a dependence constraint admittedly does not impose a BEFORE (alpha architecture term for globally visible ordering), but it does guarantee that there can be no "causal loop". I don't see how you could avoid such a loop if another cpu could see the stored value and then impact the value of the first read. Put another way: the read and the write could not be seen as being out of order wrt other cpus. So I do not see how these "x_ctrl()" functions can currently be necessary. I may have to eat my words at some point, but in the absense of clear proof that alpha actually needs this, or indeed even an explanation of how alpha could _possibly_ need it, I do not believe these functions are called for. And if it turns out that alpha really _does_ need a barrier for this case, that barrier still should not be "smp_read_barrier_depends()". We'd have to make up some new speciality barrier just for alpha, along with the documentation for why it really is necessary. Cc: Peter Zijlstra Cc: Paul E McKenney Cc: Dmitry Vyukov Cc: Will Deacon Cc: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/events/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 182bc30..b5d1ea7 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -141,7 +141,7 @@ int perf_output_begin(struct perf_output_handle *handle, perf_output_get_handle(handle); do { - tail = READ_ONCE_CTRL(rb->user_page->data_tail); + tail = READ_ONCE(rb->user_page->data_tail); offset = head = local_read(&rb->head); if (!rb->overwrite && unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size)) -- cgit v1.1 From a4d1e68823033905de4f927e2e392e21a1c507e3 Mon Sep 17 00:00:00 2001 From: Jiaxing Wang Date: Wed, 4 Nov 2015 09:14:29 +0800 Subject: tracing: Apply tracer specific options from kernel command line. Currently, the trace_options parameter is only applied in tracer_alloc_buffers() when global_trace.current_trace is nop_trace, so a tracer specific option will not be applied even when the specific tracer is also enabled from kernel command line. For example, the 'func_stack_trace' option can't be enabled with the following kernel parameter: ftrace=function ftrace_filter=kfree trace_options=func_stack_trace We can enable tracer specific options by simply apply the options again if the specific tracer is also supplied from command line and started in register_tracer(). To make trace_boot_options_buf can be parsed again, a comma and a space is put back if they were replaced by strsep and strstrip respectively. Also make register_tracer() be __init to access the __init data, and in fact register_tracer is only called from __init code. Link: http://lkml.kernel.org/r/1446599669-9294-1-git-send-email-hello.wjx@gmail.com Signed-off-by: Jiaxing Wang Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 45 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6459e14..7fe7cc9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -214,12 +214,10 @@ __setup("alloc_snapshot", boot_alloc_snapshot); static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata; -static char *trace_boot_options __initdata; static int __init set_trace_boot_options(char *str) { strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); - trace_boot_options = trace_boot_options_buf; return 0; } __setup("trace_options=", set_trace_boot_options); @@ -1223,13 +1221,15 @@ static inline int run_tracer_selftest(struct tracer *type) static void add_tracer_options(struct trace_array *tr, struct tracer *t); +static void __init apply_trace_boot_options(void); + /** * register_tracer - register a tracer with the ftrace system. * @type - the plugin for the tracer * * Register a new plugin tracer. */ -int register_tracer(struct tracer *type) +int __init register_tracer(struct tracer *type) { struct tracer *t; int ret = 0; @@ -1288,6 +1288,9 @@ int register_tracer(struct tracer *type) /* Do we want this tracer to start on bootup? */ tracing_set_tracer(&global_trace, type->name); default_bootup_tracer = NULL; + + apply_trace_boot_options(); + /* disable other selftests, since this will break it. */ tracing_selftest_disabled = true; #ifdef CONFIG_FTRACE_STARTUP_TEST @@ -3589,6 +3592,7 @@ static int trace_set_options(struct trace_array *tr, char *option) int neg = 0; int ret = -ENODEV; int i; + size_t orig_len = strlen(option); cmp = strstrip(option); @@ -3612,9 +3616,37 @@ static int trace_set_options(struct trace_array *tr, char *option) mutex_unlock(&trace_types_lock); + /* + * If the first trailing whitespace is replaced with '\0' by strstrip, + * turn it back into a space. + */ + if (orig_len > strlen(option)) + option[strlen(option)] = ' '; + return ret; } +static void __init apply_trace_boot_options(void) +{ + char *buf = trace_boot_options_buf; + char *option; + + while (true) { + option = strsep(&buf, ","); + + if (!option) + break; + if (!*option) + continue; + + trace_set_options(&global_trace, option); + + /* Put back the comma to allow this to be called again */ + if (buf) + *(buf - 1) = ','; + } +} + static ssize_t tracing_trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) @@ -7248,12 +7280,7 @@ __init static int tracer_alloc_buffers(void) INIT_LIST_HEAD(&global_trace.events); list_add(&global_trace.list, &ftrace_trace_arrays); - while (trace_boot_options) { - char *option; - - option = strsep(&trace_boot_options, ","); - trace_set_options(&global_trace, option); - } + apply_trace_boot_options(); register_snapshot_cmd(); -- cgit v1.1 From 43ed384339ae67a74a8ba4851268b23216ef7a44 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 3 Nov 2015 22:15:14 -0500 Subject: tracing: Put back comma for empty fields in boot string parsing Both early_enable_events() and apply_trace_boot_options() parse a boot string that may get parsed later on. They both use strsep() which converts a comma into a nul character. To still allow the boot string to be parsed again the same way, the nul character gets converted back to a comma after the token is processed. The problem is that these two functions check for an empty parameter (two commas in a row ",,"), and continue the loop if the parameter is empty, but fails to place the comma back. In this case, the second parsing will end at this blank field, and not process fields afterward. In most cases, users should not have an empty field, but if its going to be checked, the code might as well be correct. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 5 ++--- kernel/trace/trace_events.c | 16 ++++++++-------- 2 files changed, 10 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7fe7cc9..2198a63 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3636,10 +3636,9 @@ static void __init apply_trace_boot_options(void) if (!option) break; - if (!*option) - continue; - trace_set_options(&global_trace, option); + if (*option) + trace_set_options(&global_trace, option); /* Put back the comma to allow this to be called again */ if (buf) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 292bccf..bee1e15 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3042,16 +3042,16 @@ early_enable_events(struct trace_array *tr, bool disable_first) if (!token) break; - if (!*token) - continue; - /* Restarting syscalls requires that we stop them first */ - if (disable_first) - ftrace_set_clr_event(tr, token, 0); + if (*token) { + /* Restarting syscalls requires that we stop them first */ + if (disable_first) + ftrace_set_clr_event(tr, token, 0); - ret = ftrace_set_clr_event(tr, token, 1); - if (ret) - pr_warn("Failed to enable trace event: %s\n", token); + ret = ftrace_set_clr_event(tr, token, 1); + if (ret) + pr_warn("Failed to enable trace event: %s\n", token); + } /* Put back the comma to allow this to be called again */ if (buf) -- cgit v1.1 From 32a1dbaece7e37cea415e03cd426172249aa859e Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 4 Nov 2015 08:23:50 -0500 Subject: audit: try harder to send to auditd upon netlink failure There are several reports of the kernel losing contact with auditd when it is, in fact, still running. When this happens, kernel syslogs show: "audit: *NO* daemon at audit_pid=" although auditd is still running, and is apparently happy, listening on the netlink socket. The pid in the "*NO* daemon" message matches the pid of the running auditd process. Restarting auditd solves this. The problem appears to happen randomly, and doesn't seem to be strongly correlated to the rate of audit events being logged. The problem happens fairly regularly (every few days), but not yet reproduced to order. On production kernels, BUG_ON() is a no-op, so any error will trigger this. Commit 34eab0a7cd45 ("audit: prevent an older auditd shutdown from orphaning a newer auditd startup") eliminates one possible cause. This isn't the case here, since the PID in the error message and the PID of the running auditd match. The primary expected cause of error here is -ECONNREFUSED when the audit daemon goes away, when netlink_getsockbyportid() can't find the auditd portid entry in the netlink audit table (or there is no receive function). If -EPERM is returned, that situation isn't likely to be resolved in a timely fashion without administrator intervention. In both cases, reset the audit_pid. This does not rule out a race condition. SELinux is expected to return zero since this isn't an INET or INET6 socket. Other LSMs may have other return codes. Log the error code for better diagnosis in the future. In the case of -ENOMEM, the situation could be temporary, based on local or general availability of buffers. -EAGAIN should never happen since the netlink audit (kernel) socket is set to MAX_SCHEDULE_TIMEOUT. -ERESTARTSYS and -EINTR are not expected since this kernel thread is not expected to receive signals. In these cases (or any other unexpected ones for now), report the error and re-schedule the thread, retrying up to 5 times. v2: Removed BUG_ON(). Moved comma in pr_*() statements. Removed audit_strerror() text. Reported-by: Vipin Rathor Reported-by: Signed-off-by: Richard Guy Briggs [PM: applied rgb's fixup patch to correct audit_log_lost() format issues] Signed-off-by: Paul Moore --- kernel/audit.c | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 662c007..c4c98d8 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -407,16 +407,33 @@ static void audit_printk_skb(struct sk_buff *skb) static void kauditd_send_skb(struct sk_buff *skb) { int err; + int attempts = 0; +#define AUDITD_RETRIES 5 + +restart: /* take a reference in case we can't send it and we want to hold it */ skb_get(skb); err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0); if (err < 0) { - BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ + pr_err("netlink_unicast sending to audit_pid=%d returned error: %d\n", + audit_pid, err); if (audit_pid) { - pr_err("*NO* daemon at audit_pid=%d\n", audit_pid); - audit_log_lost("auditd disappeared"); - audit_pid = 0; - audit_sock = NULL; + if (err == -ECONNREFUSED || err == -EPERM + || ++attempts >= AUDITD_RETRIES) { + char s[32]; + + snprintf(s, sizeof(s), "audit_pid=%d reset", audit_pid); + audit_log_lost(s); + audit_pid = 0; + audit_sock = NULL; + } else { + pr_warn("re-scheduling(#%d) write to audit_pid=%d\n", + attempts, audit_pid); + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + __set_current_state(TASK_RUNNING); + goto restart; + } } /* we might get lucky and get this in the next auditd */ audit_hold_skb(skb); -- cgit v1.1 From 9fcf836b215ca5685030ecab3e35ecc14ee3bcfb Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Wed, 4 Nov 2015 08:23:51 -0500 Subject: audit: audit_string_contains_control can be boolean This patch makes audit_string_contains_control return bool to improve readability due to this particular function only using either one or zero as its return value. Signed-off-by: Yaowei Bai [PM: tweaked subject line] Signed-off-by: Paul Moore --- kernel/audit.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index c4c98d8..648036f 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1583,14 +1583,14 @@ void audit_log_n_string(struct audit_buffer *ab, const char *string, * @string: string to be checked * @len: max length of the string to check */ -int audit_string_contains_control(const char *string, size_t len) +bool audit_string_contains_control(const char *string, size_t len) { const unsigned char *p; for (p = string; p < (const unsigned char *)string + len; p++) { if (*p == '"' || *p < 0x21 || *p > 0x7e) - return 1; + return true; } - return 0; + return false; } /** -- cgit v1.1 From 6f1b5d7afe1d737b7ca726e08e26f2e0367876d2 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Wed, 4 Nov 2015 08:23:51 -0500 Subject: audit: audit_tree_match can be boolean This patch makes audit_tree_match return bool to improve readability due to this particular function only using either one or zero as its return value. No functional change. Signed-off-by: Yaowei Bai [PM: tweaked the subject line] Signed-off-by: Paul Moore --- kernel/audit.h | 2 +- kernel/audit_tree.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.h b/kernel/audit.h index dadf86a..de6cbb7 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -301,7 +301,7 @@ extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark #ifdef CONFIG_AUDIT_TREE extern struct audit_chunk *audit_tree_lookup(const struct inode *); extern void audit_put_chunk(struct audit_chunk *); -extern int audit_tree_match(struct audit_chunk *, struct audit_tree *); +extern bool audit_tree_match(struct audit_chunk *, struct audit_tree *); extern int audit_make_tree(struct audit_krule *, char *, u32); extern int audit_add_tree_rule(struct audit_krule *); extern int audit_remove_tree_rule(struct audit_krule *); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 94ecdab..5efe9b29 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -197,13 +197,13 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode) return NULL; } -int audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree) +bool audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree) { int n; for (n = 0; n < chunk->count; n++) if (chunk->owners[n].owner == tree) - return 1; - return 0; + return true; + return false; } /* tagging and untagging inodes with trees */ -- cgit v1.1 From 725131efa52812973afda6ff3fbeec6cc22882a5 Mon Sep 17 00:00:00 2001 From: Scott Matheina Date: Wed, 4 Nov 2015 08:23:51 -0500 Subject: audit: fix comment block whitespace Signed-off-by: Scott Matheina [PM: fixed subject line] Signed-off-by: Paul Moore --- kernel/auditfilter.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 7714d93..b8ff9e1 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -39,13 +39,13 @@ * Locking model: * * audit_filter_mutex: - * Synchronizes writes and blocking reads of audit's filterlist - * data. Rcu is used to traverse the filterlist and access - * contents of structs audit_entry, audit_watch and opaque - * LSM rules during filtering. If modified, these structures - * must be copied and replace their counterparts in the filterlist. - * An audit_parent struct is not accessed during filtering, so may - * be written directly provided audit_filter_mutex is held. + * Synchronizes writes and blocking reads of audit's filterlist + * data. Rcu is used to traverse the filterlist and access + * contents of structs audit_entry, audit_watch and opaque + * LSM rules during filtering. If modified, these structures + * must be copied and replace their counterparts in the filterlist. + * An audit_parent struct is not accessed during filtering, so may + * be written directly provided audit_filter_mutex is held. */ /* Audit filter lists, defined in */ -- cgit v1.1 From c5ea6efda6ff0fd591d6b7a2e1ba086b196dd864 Mon Sep 17 00:00:00 2001 From: Saurabh Sengar Date: Wed, 4 Nov 2015 08:23:52 -0500 Subject: audit: removing unused variable Variable rc in not required as it is just used for unchanged for return, and return is always 0 in the function. Signed-off-by: Saurabh Sengar [PM: fixed spelling errors in description] Signed-off-by: Paul Moore --- kernel/audit.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 648036f..85570f3 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -703,23 +703,22 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) { - int rc = 0; uid_t uid = from_kuid(&init_user_ns, current_uid()); pid_t pid = task_tgid_nr(current); if (!audit_enabled && msg_type != AUDIT_USER_AVC) { *ab = NULL; - return rc; + return 0; } *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); if (unlikely(!*ab)) - return rc; + return 0; audit_log_format(*ab, "pid=%d uid=%u", pid, uid); audit_log_session_info(*ab); audit_log_task_context(*ab); - return rc; + return 0; } int is_audit_feature_set(int i) -- cgit v1.1 From 233a68667cf4c134d07ef7e22bdd77786b5c7360 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Wed, 4 Nov 2015 08:23:52 -0500 Subject: audit: make audit_log_common_recv_msg() a void function It always returns zero and no one is checking the return value. Signed-off-by: Paul Moore --- kernel/audit.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 85570f3..8a056a3 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -701,24 +701,22 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) return err; } -static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) +static void audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) { uid_t uid = from_kuid(&init_user_ns, current_uid()); pid_t pid = task_tgid_nr(current); if (!audit_enabled && msg_type != AUDIT_USER_AVC) { *ab = NULL; - return 0; + return; } *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); if (unlikely(!*ab)) - return 0; + return; audit_log_format(*ab, "pid=%d uid=%u", pid, uid); audit_log_session_info(*ab); audit_log_task_context(*ab); - - return 0; } int is_audit_feature_set(int i) -- cgit v1.1 From 22b886dd1018093920c4250dee2a9a3cb7cff7b8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 4 Nov 2015 12:15:33 -0500 Subject: timers: Use proper base migration in add_timer_on() Regardless of the previous CPU a timer was on, add_timer_on() currently simply sets timer->flags to the new CPU. As the caller must be seeing the timer as idle, this is locally fine, but the timer leaving the old base while unlocked can lead to race conditions as follows. Let's say timer was on cpu 0. cpu 0 cpu 1 ----------------------------------------------------------------------------- del_timer(timer) succeeds del_timer(timer) lock_timer_base(timer) locks cpu_0_base add_timer_on(timer, 1) spin_lock(&cpu_1_base->lock) timer->flags set to cpu_1_base operates on @timer operates on @timer This triggered with mod_delayed_work_on() which contains "if (del_timer()) add_timer_on()" sequence eventually leading to the following oops. BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] detach_if_pending+0x69/0x1a0 ... Workqueue: wqthrash wqthrash_workfunc [wqthrash] task: ffff8800172ca680 ti: ffff8800172d0000 task.ti: ffff8800172d0000 RIP: 0010:[] [] detach_if_pending+0x69/0x1a0 ... Call Trace: [] del_timer+0x44/0x60 [] try_to_grab_pending+0xb6/0x160 [] mod_delayed_work_on+0x33/0x80 [] wqthrash_workfunc+0x61/0x90 [wqthrash] [] process_one_work+0x1e8/0x650 [] worker_thread+0x4e/0x450 [] kthread+0xef/0x110 [] ret_from_fork+0x3f/0x70 Fix it by updating add_timer_on() to perform proper migration as __mod_timer() does. Reported-and-tested-by: Jeff Layton Signed-off-by: Tejun Heo Cc: Chris Worley Cc: bfields@fieldses.org Cc: Michael Skralivetsky Cc: Trond Myklebust Cc: Shaohua Li Cc: Jeff Layton Cc: kernel-team@fb.com Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20151029103113.2f893924@tlielax.poochiereds.net Link: http://lkml.kernel.org/r/20151104171533.GI5749@mtj.duckdns.org Signed-off-by: Thomas Gleixner --- kernel/time/timer.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 74591ba..bbc5d11 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -977,13 +977,29 @@ EXPORT_SYMBOL(add_timer); */ void add_timer_on(struct timer_list *timer, int cpu) { - struct tvec_base *base = per_cpu_ptr(&tvec_bases, cpu); + struct tvec_base *new_base = per_cpu_ptr(&tvec_bases, cpu); + struct tvec_base *base; unsigned long flags; timer_stats_timer_set_start_info(timer); BUG_ON(timer_pending(timer) || !timer->function); - spin_lock_irqsave(&base->lock, flags); - timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu; + + /* + * If @timer was on a different CPU, it should be migrated with the + * old base locked to prevent other operations proceeding with the + * wrong base locked. See lock_timer_base(). + */ + base = lock_timer_base(timer, &flags); + if (base != new_base) { + timer->flags |= TIMER_MIGRATING; + + spin_unlock(&base->lock); + base = new_base; + spin_lock(&base->lock); + WRITE_ONCE(timer->flags, + (timer->flags & ~TIMER_BASEMASK) | cpu); + } + debug_activate(timer, timer->expires); internal_add_timer(base, timer); spin_unlock_irqrestore(&base->lock, flags); -- cgit v1.1 From 451637e454f0b41689cd07cdc3fa53388c22890d Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Thu, 5 Nov 2015 18:44:24 -0800 Subject: kernel/watchdog.c: is_hardlockup can be boolean Make is_hardlockup return bool to improve readability due to this particular function only using either one or zero as its return value. No functional change. Signed-off-by: Yaowei Bai Reviewed-by: Aaron Tomlin Acked-by: Don Zickus Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 64ed1c3..568ba64 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -263,15 +263,15 @@ void touch_softlockup_watchdog_sync(void) #ifdef CONFIG_HARDLOCKUP_DETECTOR /* watchdog detector functions */ -static int is_hardlockup(void) +static bool is_hardlockup(void) { unsigned long hrint = __this_cpu_read(hrtimer_interrupts); if (__this_cpu_read(hrtimer_interrupts_saved) == hrint) - return 1; + return true; __this_cpu_write(hrtimer_interrupts_saved, hrint); - return 0; + return false; } #endif -- cgit v1.1 From d283c640cee6472852b95036ddd512c2ba0c1139 Mon Sep 17 00:00:00 2001 From: Ulrich Obergfell Date: Thu, 5 Nov 2015 18:44:27 -0800 Subject: watchdog: fix error handling in proc_watchdog_thresh() The original watchdog_park_threads() function that was introduced by commit 81a4beef91ba ("watchdog: introduce watchdog_park_threads() and watchdog_unpark_threads()") takes a very simple approach to handle errors returned by kthread_park(): It attempts to roll back all watchdog threads to the unparked state. However, this may be undesired behaviour from the perspective of the caller which may want to handle errors as appropriate in its specific context. Currently, there are two possible call chains: - watchdog suspend/resume interface lockup_detector_suspend watchdog_park_threads - write to parameters in /proc/sys/kernel proc_watchdog_update watchdog_enable_all_cpus update_watchdog_all_cpus watchdog_park_threads Instead of 'blindly' attempting to unpark the watchdog threads if a kthread_park() call fails, the new approach is to disable the lockup detectors in the above call chains. Failure becomes visible to the user as follows: - error messages from lockup_detector_suspend() or watchdog_enable_all_cpus() - the state that can be read from /proc/sys/kernel/watchdog_enabled - the 'write' system call in the latter call chain returns an error I did not experience kthread_park() failures in practice, I used some instrumentation to fake error returns from kthread_park() in order to test the patches. This patch (of 5): Restore the previous value of watchdog_thresh _and_ sample_period if proc_watchdog_update() returns an error. The variables must be consistent to avoid false positives of the lockup detectors. Signed-off-by: Ulrich Obergfell Reviewed-by: Aaron Tomlin Acked-by: Don Zickus Cc: Ulrich Obergfell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 568ba64..284f0e6 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -914,13 +914,14 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, goto out; /* - * Update the sample period. - * Restore 'watchdog_thresh' on failure. + * Update the sample period. Restore on failure. */ set_sample_period(); err = proc_watchdog_update(); - if (err) + if (err) { watchdog_thresh = old; + set_sample_period(); + } out: mutex_unlock(&watchdog_proc_mutex); return err; -- cgit v1.1 From 58cf690a09987c9a56933df05c0369d691d6224d Mon Sep 17 00:00:00 2001 From: Ulrich Obergfell Date: Thu, 5 Nov 2015 18:44:30 -0800 Subject: watchdog: move watchdog_disable_all_cpus() outside of ifdef Move watchdog_disable_all_cpus() outside of the ifdef so that it is available if CONFIG_SYSCTL is not defined. This is preparation for "watchdog: implement error handling in update_watchdog_all_cpus() and callers". Signed-off-by: Ulrich Obergfell Reviewed-by: Aaron Tomlin Acked-by: Don Zickus Cc: Ulrich Obergfell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 284f0e6..f0f8a78 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -347,6 +347,9 @@ static void watchdog_interrupt_count(void) static int watchdog_nmi_enable(unsigned int cpu); static void watchdog_nmi_disable(unsigned int cpu); +static int watchdog_enable_all_cpus(void); +static void watchdog_disable_all_cpus(void); + /* watchdog kicker functions */ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) { @@ -756,9 +759,6 @@ static int watchdog_enable_all_cpus(void) return err; } -/* prepare/enable/disable routines */ -/* sysctl functions */ -#ifdef CONFIG_SYSCTL static void watchdog_disable_all_cpus(void) { if (watchdog_running) { @@ -767,6 +767,8 @@ static void watchdog_disable_all_cpus(void) } } +#ifdef CONFIG_SYSCTL + /* * Update the run state of the lockup detectors. */ -- cgit v1.1 From b43cb43cb85b91d79d9f0719ff581e8cb6dfbb8f Mon Sep 17 00:00:00 2001 From: Ulrich Obergfell Date: Thu, 5 Nov 2015 18:44:33 -0800 Subject: watchdog: implement error handling in update_watchdog_all_cpus() and callers update_watchdog_all_cpus() now passes errors from watchdog_park_threads() up to functions in the call chain. This allows watchdog_enable_all_cpus() and proc_watchdog_update() to handle such errors too. Signed-off-by: Ulrich Obergfell Reviewed-by: Aaron Tomlin Acked-by: Don Zickus Cc: Ulrich Obergfell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index f0f8a78..704f933 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -731,10 +731,17 @@ void lockup_detector_resume(void) mutex_unlock(&watchdog_proc_mutex); } -static void update_watchdog_all_cpus(void) +static int update_watchdog_all_cpus(void) { - watchdog_park_threads(); + int ret; + + ret = watchdog_park_threads(); + if (ret) + return ret; + watchdog_unpark_threads(); + + return 0; } static int watchdog_enable_all_cpus(void) @@ -753,9 +760,17 @@ static int watchdog_enable_all_cpus(void) * Enable/disable the lockup detectors or * change the sample period 'on the fly'. */ - update_watchdog_all_cpus(); + err = update_watchdog_all_cpus(); + + if (err) { + watchdog_disable_all_cpus(); + pr_err("Failed to update lockup detectors, disabled\n"); + } } + if (err) + watchdog_enabled = 0; + return err; } @@ -851,12 +866,13 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, } while (cmpxchg(&watchdog_enabled, old, new) != old); /* - * Update the run state of the lockup detectors. - * Restore 'watchdog_enabled' on failure. + * Update the run state of the lockup detectors. There is _no_ + * need to check the value returned by proc_watchdog_update() + * and to restore the previous value of 'watchdog_enabled' as + * both lockup detectors are disabled if proc_watchdog_update() + * returns an error. */ err = proc_watchdog_update(); - if (err) - watchdog_enabled = old; } out: mutex_unlock(&watchdog_proc_mutex); -- cgit v1.1 From c993590c6ae6273681d9fb2a8d26dce03bf9d96c Mon Sep 17 00:00:00 2001 From: Ulrich Obergfell Date: Thu, 5 Nov 2015 18:44:36 -0800 Subject: watchdog: implement error handling in lockup_detector_suspend() lockup_detector_suspend() now handles errors from watchdog_park_threads(). Signed-off-by: Ulrich Obergfell Reviewed-by: Aaron Tomlin Acked-by: Don Zickus Cc: Ulrich Obergfell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 704f933..e8b19db 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -707,6 +707,11 @@ int lockup_detector_suspend(void) if (ret == 0) watchdog_suspended++; + else { + watchdog_disable_all_cpus(); + pr_err("Failed to suspend lockup detectors, disabled\n"); + watchdog_enabled = 0; + } mutex_unlock(&watchdog_proc_mutex); -- cgit v1.1 From ee7fed540563b27e1028bec0b509921496c91bf9 Mon Sep 17 00:00:00 2001 From: Ulrich Obergfell Date: Thu, 5 Nov 2015 18:44:39 -0800 Subject: watchdog: do not unpark threads in watchdog_park_threads() on error If kthread_park() returns an error, watchdog_park_threads() should not blindly 'roll back' the already parked threads to the unparked state. Instead leave it up to the callers to handle such errors appropriately in their context. For example, it is redundant to unpark the threads if the lockup detectors will soon be disabled by the callers anyway. Signed-off-by: Ulrich Obergfell Reviewed-by: Aaron Tomlin Acked-by: Don Zickus Cc: Ulrich Obergfell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index e8b19db..452e4ed 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -654,6 +654,12 @@ static struct smp_hotplug_thread watchdog_threads = { /* * park all watchdog threads that are specified in 'watchdog_cpumask' + * + * This function returns an error if kthread_park() of a watchdog thread + * fails. In this situation, the watchdog threads of some CPUs can already + * be parked and the watchdog threads of other CPUs can still be runnable. + * Callers are expected to handle this special condition as appropriate in + * their context. */ static int watchdog_park_threads(void) { @@ -665,10 +671,6 @@ static int watchdog_park_threads(void) if (ret) break; } - if (ret) { - for_each_watchdog_cpu(cpu) - kthread_unpark(per_cpu(softlockup_watchdog, cpu)); - } put_online_cpus(); return ret; -- cgit v1.1 From 55537871ef666b4153fd1ef8782e4a13fee142cc Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Thu, 5 Nov 2015 18:44:41 -0800 Subject: kernel/watchdog.c: perform all-CPU backtrace in case of hard lockup In many cases of hardlockup reports, it's actually not possible to know why it triggered, because the CPU that got stuck is usually waiting on a resource (with IRQs disabled) in posession of some other CPU is holding. IOW, we are often looking at the stacktrace of the victim and not the actual offender. Introduce sysctl / cmdline parameter that makes it possible to have hardlockup detector perform all-CPU backtrace. Signed-off-by: Jiri Kosina Reviewed-by: Aaron Tomlin Cc: Ulrich Obergfell Acked-by: Don Zickus Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 9 +++++++++ kernel/watchdog.c | 33 ++++++++++++++++++++++++++++----- 2 files changed, 37 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 96c856b..1a5faa3e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -898,6 +898,15 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, + { + .procname = "hardlockup_all_cpu_backtrace", + .data = &sysctl_hardlockup_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, #endif /* CONFIG_SMP */ #endif #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 452e4ed..f6b32b8 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -57,8 +57,10 @@ int __read_mostly watchdog_thresh = 10; #ifdef CONFIG_SMP int __read_mostly sysctl_softlockup_all_cpu_backtrace; +int __read_mostly sysctl_hardlockup_all_cpu_backtrace; #else #define sysctl_softlockup_all_cpu_backtrace 0 +#define sysctl_hardlockup_all_cpu_backtrace 0 #endif static struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); @@ -112,6 +114,7 @@ static unsigned long soft_lockup_nmi_warn; #ifdef CONFIG_HARDLOCKUP_DETECTOR static int hardlockup_panic = CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; +static unsigned long hardlockup_allcpu_dumped; /* * We may not want to enable hard lockup detection by default in all cases, * for example when running the kernel as a guest on a hypervisor. In these @@ -173,6 +176,13 @@ static int __init softlockup_all_cpu_backtrace_setup(char *str) return 1; } __setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); +static int __init hardlockup_all_cpu_backtrace_setup(char *str) +{ + sysctl_hardlockup_all_cpu_backtrace = + !!simple_strtol(str, NULL, 0); + return 1; +} +__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); #endif /* @@ -318,17 +328,30 @@ static void watchdog_overflow_callback(struct perf_event *event, */ if (is_hardlockup()) { int this_cpu = smp_processor_id(); + struct pt_regs *regs = get_irq_regs(); /* only print hardlockups once */ if (__this_cpu_read(hard_watchdog_warn) == true) return; - if (hardlockup_panic) - panic("Watchdog detected hard LOCKUP on cpu %d", - this_cpu); + pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu); + print_modules(); + print_irqtrace_events(current); + if (regs) + show_regs(regs); else - WARN(1, "Watchdog detected hard LOCKUP on cpu %d", - this_cpu); + dump_stack(); + + /* + * Perform all-CPU dump only once to avoid multiple hardlockups + * generating interleaving traces + */ + if (sysctl_hardlockup_all_cpu_backtrace && + !test_and_set_bit(0, &hardlockup_allcpu_dumped)) + trigger_allbutself_cpu_backtrace(); + + if (hardlockup_panic) + panic("Hard LOCKUP"); __this_cpu_write(hard_watchdog_warn, true); return; -- cgit v1.1 From ac1f591249d95372f3a5ab3828d4af5dfbf5efd3 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Thu, 5 Nov 2015 18:44:44 -0800 Subject: kernel/watchdog.c: add sysctl knob hardlockup_panic The only way to enable a hardlockup to panic the machine is to set 'nmi_watchdog=panic' on the kernel command line. This makes it awkward for end users and folks who want to run automate tests (like myself). Mimic the softlockup_panic knob and create a /proc/sys/kernel/hardlockup_panic knob. Signed-off-by: Don Zickus Cc: Ulrich Obergfell Acked-by: Jiri Kosina Reviewed-by: Aaron Tomlin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 11 +++++++++++ kernel/watchdog.c | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 1a5faa3e..dc6858d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -888,6 +888,17 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, +#ifdef CONFIG_HARDLOCKUP_DETECTOR + { + .procname = "hardlockup_panic", + .data = &hardlockup_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, +#endif #ifdef CONFIG_SMP { .procname = "softlockup_all_cpu_backtrace", diff --git a/kernel/watchdog.c b/kernel/watchdog.c index f6b32b8..0a23125 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -112,7 +112,7 @@ static unsigned long soft_lockup_nmi_warn; * Should we panic when a soft-lockup or hard-lockup occurs: */ #ifdef CONFIG_HARDLOCKUP_DETECTOR -static int hardlockup_panic = +unsigned int __read_mostly hardlockup_panic = CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; static unsigned long hardlockup_allcpu_dumped; /* -- cgit v1.1 From ee89e71eb091d3ef8ca2be8bd4ec77ccfa91334c Mon Sep 17 00:00:00 2001 From: Ulrich Obergfell Date: Thu, 5 Nov 2015 18:44:47 -0800 Subject: kernel/watchdog.c: avoid race between lockup detector suspend/resume and CPU hotplug The lockup detector suspend/resume interface that was introduced by commit 8c073d27d7ad ("watchdog: introduce watchdog_suspend() and watchdog_resume()") does not protect itself against races with CPU hotplug. Hence, theoretically it is possible that a new watchdog thread is started on a hotplugged CPU while the lockup detector is suspended, and the thread could thus interfere unexpectedly with the code that requested to suspend the lockup detector. Avoid the race by calling get_online_cpus() in lockup_detector_suspend() put_online_cpus() in lockup_detector_resume() Signed-off-by: Ulrich Obergfell Acked-by: Don Zickus Reviewed-by: Aaron Tomlin Cc: Ulrich Obergfell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 0a23125..7357842d 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -719,6 +719,7 @@ int lockup_detector_suspend(void) { int ret = 0; + get_online_cpus(); mutex_lock(&watchdog_proc_mutex); /* * Multiple suspend requests can be active in parallel (counted by @@ -759,6 +760,7 @@ void lockup_detector_resume(void) watchdog_unpark_threads(); mutex_unlock(&watchdog_proc_mutex); + put_online_cpus(); } static int update_watchdog_all_cpus(void) -- cgit v1.1 From 8614ddef82139d08234dbf681188f9bcddae9f03 Mon Sep 17 00:00:00 2001 From: Ulrich Obergfell Date: Thu, 5 Nov 2015 18:44:50 -0800 Subject: kernel/watchdog.c: avoid races between /proc handlers and CPU hotplug The handler functions for watchdog parameters in /proc/sys/kernel do not protect themselves against races with CPU hotplug. Hence, theoretically it is possible that a new watchdog thread is started on a hotplugged CPU while a parameter is being modified, and the thread could thus use a parameter value that is 'in transition'. For example, if 'watchdog_thresh' is being set to zero (note: this disables the lockup detectors) the thread would erroneously use the value zero as the sample period. To avoid such races and to keep the /proc handler code consistent, call {get|put}_online_cpus() in proc_watchdog_common() {get|put}_online_cpus() in proc_watchdog_thresh() {get|put}_online_cpus() in proc_watchdog_cpumask() Signed-off-by: Ulrich Obergfell Acked-by: Don Zickus Reviewed-by: Aaron Tomlin Cc: Ulrich Obergfell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 7357842d..13fdda1 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -857,6 +857,7 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, int err, old, new; int *watchdog_param = (int *)table->data; + get_online_cpus(); mutex_lock(&watchdog_proc_mutex); if (watchdog_suspended) { @@ -908,6 +909,7 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, } out: mutex_unlock(&watchdog_proc_mutex); + put_online_cpus(); return err; } @@ -949,6 +951,7 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, { int err, old; + get_online_cpus(); mutex_lock(&watchdog_proc_mutex); if (watchdog_suspended) { @@ -974,6 +977,7 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, } out: mutex_unlock(&watchdog_proc_mutex); + put_online_cpus(); return err; } @@ -988,6 +992,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, { int err; + get_online_cpus(); mutex_lock(&watchdog_proc_mutex); if (watchdog_suspended) { @@ -1015,6 +1020,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, } out: mutex_unlock(&watchdog_proc_mutex); + put_online_cpus(); return err; } -- cgit v1.1 From a2a45b85ec45db4b041ea5d93b21033dbc3cc0fc Mon Sep 17 00:00:00 2001 From: Ulrich Obergfell Date: Thu, 5 Nov 2015 18:44:53 -0800 Subject: kernel/watchdog.c: remove {get|put}_online_cpus() from watchdog_{park|unpark}_threads() watchdog_{park|unpark}_threads() are now called in code paths that protect themselves against CPU hotplug, so {get|put}_online_cpus() calls are redundant and can be removed. Signed-off-by: Ulrich Obergfell Acked-by: Don Zickus Reviewed-by: Aaron Tomlin Cc: Ulrich Obergfell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 13fdda1..84c4744 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -683,33 +683,35 @@ static struct smp_hotplug_thread watchdog_threads = { * be parked and the watchdog threads of other CPUs can still be runnable. * Callers are expected to handle this special condition as appropriate in * their context. + * + * This function may only be called in a context that is protected against + * races with CPU hotplug - for example, via get_online_cpus(). */ static int watchdog_park_threads(void) { int cpu, ret = 0; - get_online_cpus(); for_each_watchdog_cpu(cpu) { ret = kthread_park(per_cpu(softlockup_watchdog, cpu)); if (ret) break; } - put_online_cpus(); return ret; } /* * unpark all watchdog threads that are specified in 'watchdog_cpumask' + * + * This function may only be called in a context that is protected against + * races with CPU hotplug - for example, via get_online_cpus(). */ static void watchdog_unpark_threads(void) { int cpu; - get_online_cpus(); for_each_watchdog_cpu(cpu) kthread_unpark(per_cpu(softlockup_watchdog, cpu)); - put_online_cpus(); } /* -- cgit v1.1 From 39d2da2161d35de301ec5397ce9103c68b883054 Mon Sep 17 00:00:00 2001 From: Ulrich Obergfell Date: Thu, 5 Nov 2015 18:44:56 -0800 Subject: kernel/watchdog.c: fix race between proc_watchdog_thresh() and watchdog_timer_fn() Theoretically it is possible that the watchdog timer expires right at the time when a user sets 'watchdog_thresh' to zero (note: this disables the lockup detectors). In this scenario, the is_softlockup() function - which is called by the timer - could produce a false positive. Fix this by checking the current value of 'watchdog_thresh'. Signed-off-by: Ulrich Obergfell Acked-by: Don Zickus Reviewed-by: Aaron Tomlin Cc: Ulrich Obergfell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 84c4744..18f34cf 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -289,7 +289,7 @@ static int is_softlockup(unsigned long touch_ts) { unsigned long now = get_timestamp(); - if (watchdog_enabled & SOFT_WATCHDOG_ENABLED) { + if ((watchdog_enabled & SOFT_WATCHDOG_ENABLED) && watchdog_thresh){ /* Warn about unreasonable delays. */ if (time_after(now, touch_ts + get_softlockup_thresh())) return now - touch_ts; -- cgit v1.1 From da39da3a54fed88e29024f2f1f6cd7357cd03a44 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 5 Nov 2015 18:48:05 -0800 Subject: mm, oom: remove task_lock protecting comm printing The oom killer takes task_lock() in a couple of places solely to protect printing the task's comm. A process's comm, including current's comm, may change due to /proc/pid/comm or PR_SET_NAME. The comm will always be NULL-terminated, so the worst race scenario would only be during update. We can tolerate a comm being printed that is in the middle of an update to avoid taking the lock. Other locations in the kernel have already dropped task_lock() when printing comm, so this is consistent. Signed-off-by: David Rientjes Suggested-by: Oleg Nesterov Cc: Michal Hocko Cc: Vladimir Davydov Cc: Sergey Senozhatsky Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index f0acff0..9ef59a3 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2598,22 +2598,22 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, } /** - * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed - * @tsk: pointer to task_struct of some task. + * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed * - * Description: Prints @task's name, cpuset name, and cached copy of its + * Description: Prints current's name, cpuset name, and cached copy of its * mems_allowed to the kernel log. */ -void cpuset_print_task_mems_allowed(struct task_struct *tsk) +void cpuset_print_current_mems_allowed(void) { struct cgroup *cgrp; rcu_read_lock(); - cgrp = task_cs(tsk)->css.cgroup; - pr_info("%s cpuset=", tsk->comm); + cgrp = task_cs(current)->css.cgroup; + pr_info("%s cpuset=", current->comm); pr_cont_cgroup_name(cgrp); - pr_cont(" mems_allowed=%*pbl\n", nodemask_pr_args(&tsk->mems_allowed)); + pr_cont(" mems_allowed=%*pbl\n", + nodemask_pr_args(¤t->mems_allowed)); rcu_read_unlock(); } -- cgit v1.1 From a8ca5d0ecbdde5cc3d7accacbd69968b0c98764e Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Thu, 5 Nov 2015 18:51:33 -0800 Subject: mm: mlock: add new mlock system call With the refactored mlock code, introduce a new system call for mlock. The new call will allow the user to specify what lock states are being added. mlock2 is trivial at the moment, but a follow on patch will add a new mlock state making it useful. Signed-off-by: Eric B Munson Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Heiko Carstens Cc: Geert Uytterhoeven Cc: Catalin Marinas Cc: Stephen Rothwell Cc: Guenter Roeck Cc: Jonathan Corbet Cc: Kirill A. Shutemov Cc: Michael Kerrisk Cc: Ralf Baechle Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys_ni.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index a02decf..0623787 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -194,6 +194,7 @@ cond_syscall(sys_mlock); cond_syscall(sys_munlock); cond_syscall(sys_mlockall); cond_syscall(sys_munlockall); +cond_syscall(sys_mlock2); cond_syscall(sys_mincore); cond_syscall(sys_madvise); cond_syscall(sys_mremap); -- cgit v1.1 From de60f5f10c58d4f34b68622442c0e04180367f3f Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Thu, 5 Nov 2015 18:51:36 -0800 Subject: mm: introduce VM_LOCKONFAULT The cost of faulting in all memory to be locked can be very high when working with large mappings. If only portions of the mapping will be used this can incur a high penalty for locking. For the example of a large file, this is the usage pattern for a large statical language model (probably applies to other statical or graphical models as well). For the security example, any application transacting in data that cannot be swapped out (credit card data, medical records, etc). This patch introduces the ability to request that pages are not pre-faulted, but are placed on the unevictable LRU when they are finally faulted in. The VM_LOCKONFAULT flag will be used together with VM_LOCKED and has no effect when set without VM_LOCKED. Setting the VM_LOCKONFAULT flag for a VMA will cause pages faulted into that VMA to be added to the unevictable LRU when they are faulted or if they are already present, but will not cause any missing pages to be faulted in. Exposing this new lock state means that we cannot overload the meaning of the FOLL_POPULATE flag any longer. Prior to this patch it was used to mean that the VMA for a fault was locked. This means we need the new FOLL_MLOCK flag to communicate the locked state of a VMA. FOLL_POPULATE will now only control if the VMA should be populated and in the case of VM_LOCKONFAULT, it will not be set. Signed-off-by: Eric B Munson Acked-by: Kirill A. Shutemov Acked-by: Vlastimil Babka Cc: Michal Hocko Cc: Jonathan Corbet Cc: Catalin Marinas Cc: Geert Uytterhoeven Cc: Guenter Roeck Cc: Heiko Carstens Cc: Michael Kerrisk Cc: Ralf Baechle Cc: Shuah Khan Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 6ac8942..a30fae4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -454,7 +454,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) tmp->vm_mm = mm; if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; - tmp->vm_flags &= ~(VM_LOCKED|VM_UFFD_MISSING|VM_UFFD_WP); + tmp->vm_flags &= + ~(VM_LOCKED|VM_LOCKONFAULT|VM_UFFD_MISSING|VM_UFFD_WP); tmp->vm_next = tmp->vm_prev = NULL; tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; file = tmp->vm_file; -- cgit v1.1 From 8b1291994d8e5e621a8af7e165b106e50d04bbf1 Mon Sep 17 00:00:00 2001 From: Jiaxing Wang Date: Fri, 6 Nov 2015 16:04:16 +0800 Subject: tracing: Make tracing work when debugfs is not configured in Currently tracing_init_dentry() returns -ENODEV when debugfs is not configured in, which causes tracefs not populated with tracing files and directories, so we will get an empty directory even after we manually mount tracefs. We can make tracing_init_dentry() return NULL if debugfs is not configured in and can manually mount tracefs. But return -ENODEV if debugfs is configured in but not initialized or failed to create automount point as that would break backward compatibility with older tools. Link: http://lkml.kernel.org/r/1446797056-11683-1-git-send-email-hello.wjx@gmail.com Signed-off-by: Jiaxing Wang Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2198a63..08af79c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6847,7 +6847,9 @@ struct dentry *tracing_init_dentry(void) if (tr->dir) return NULL; - if (WARN_ON(!debugfs_initialized())) + if (WARN_ON(!tracefs_initialized()) || + (IS_ENABLED(CONFIG_DEBUG_FS) && + WARN_ON(!debugfs_initialized()))) return ERR_PTR(-ENODEV); /* -- cgit v1.1 From d0164adc89f6bb374d304ffcc375c6d2652fe67d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 6 Nov 2015 16:28:21 -0800 Subject: mm, page_alloc: distinguish between being unable to sleep, unwilling to sleep and avoiding waking kswapd __GFP_WAIT has been used to identify atomic context in callers that hold spinlocks or are in interrupts. They are expected to be high priority and have access one of two watermarks lower than "min" which can be referred to as the "atomic reserve". __GFP_HIGH users get access to the first lower watermark and can be called the "high priority reserve". Over time, callers had a requirement to not block when fallback options were available. Some have abused __GFP_WAIT leading to a situation where an optimisitic allocation with a fallback option can access atomic reserves. This patch uses __GFP_ATOMIC to identify callers that are truely atomic, cannot sleep and have no alternative. High priority users continue to use __GFP_HIGH. __GFP_DIRECT_RECLAIM identifies callers that can sleep and are willing to enter direct reclaim. __GFP_KSWAPD_RECLAIM to identify callers that want to wake kswapd for background reclaim. __GFP_WAIT is redefined as a caller that is willing to enter direct reclaim and wake kswapd for background reclaim. This patch then converts a number of sites o __GFP_ATOMIC is used by callers that are high priority and have memory pools for those requests. GFP_ATOMIC uses this flag. o Callers that have a limited mempool to guarantee forward progress clear __GFP_DIRECT_RECLAIM but keep __GFP_KSWAPD_RECLAIM. bio allocations fall into this category where kswapd will still be woken but atomic reserves are not used as there is a one-entry mempool to guarantee progress. o Callers that are checking if they are non-blocking should use the helper gfpflags_allow_blocking() where possible. This is because checking for __GFP_WAIT as was done historically now can trigger false positives. Some exceptions like dm-crypt.c exist where the code intent is clearer if __GFP_DIRECT_RECLAIM is used instead of the helper due to flag manipulations. o Callers that built their own GFP flags instead of starting with GFP_KERNEL and friends now also need to specify __GFP_KSWAPD_RECLAIM. The first key hazard to watch out for is callers that removed __GFP_WAIT and was depending on access to atomic reserves for inconspicuous reasons. In some cases it may be appropriate for them to use __GFP_HIGH. The second key hazard is callers that assembled their own combination of GFP flags instead of starting with something like GFP_KERNEL. They may now wish to specify __GFP_KSWAPD_RECLAIM. It's almost certainly harmless if it's missed in most cases as other activity will wake kswapd. Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Christoph Lameter Cc: David Rientjes Cc: Vitaly Wool Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/audit.c | 6 +++--- kernel/cgroup.c | 2 +- kernel/locking/lockdep.c | 2 +- kernel/power/snapshot.c | 2 +- kernel/smp.c | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 8a056a3..5ffcbd3 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1371,16 +1371,16 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, if (unlikely(audit_filter_type(type))) return NULL; - if (gfp_mask & __GFP_WAIT) { + if (gfp_mask & __GFP_DIRECT_RECLAIM) { if (audit_pid && audit_pid == current->pid) - gfp_mask &= ~__GFP_WAIT; + gfp_mask &= ~__GFP_DIRECT_RECLAIM; else reserve = 0; } while (audit_backlog_limit && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) { - if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time) { + if (gfp_mask & __GFP_DIRECT_RECLAIM && audit_backlog_wait_time) { long sleep_time; sleep_time = timeout_start + audit_backlog_wait_time - jiffies; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b9d0cce..f1603c1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -299,7 +299,7 @@ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, idr_preload(gfp_mask); spin_lock_bh(&cgroup_idr_lock); - ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_WAIT); + ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM); spin_unlock_bh(&cgroup_idr_lock); idr_preload_end(); return ret; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 4e49cc4..deae390 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2738,7 +2738,7 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) return; /* no reclaim without waiting on it */ - if (!(gfp_mask & __GFP_WAIT)) + if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) return; /* this guy won't enter reclaim */ diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 5235dd4..3a97060 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1779,7 +1779,7 @@ alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem) while (to_alloc-- > 0) { struct page *page; - page = alloc_image_page(__GFP_HIGHMEM); + page = alloc_image_page(__GFP_HIGHMEM|__GFP_KSWAPD_RECLAIM); memory_bm_set_bit(bm, page_to_pfn(page)); } return nr_highmem; diff --git a/kernel/smp.c b/kernel/smp.c index 0785447..d903c02 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -669,7 +669,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), cpumask_var_t cpus; int cpu, ret; - might_sleep_if(gfp_flags & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(gfp_flags)); if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) { preempt_disable(); -- cgit v1.1 From 71baba4b92dc1fa1bc461742c6ab1942ec6034e9 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 6 Nov 2015 16:28:28 -0800 Subject: mm, page_alloc: rename __GFP_WAIT to __GFP_RECLAIM __GFP_WAIT was used to signal that the caller was in atomic context and could not sleep. Now it is possible to distinguish between true atomic context and callers that are not willing to sleep. The latter should clear __GFP_DIRECT_RECLAIM so kswapd will still wake. As clearing __GFP_WAIT behaves differently, there is a risk that people will clear the wrong flags. This patch renames __GFP_WAIT to __GFP_RECLAIM to clearly indicate what it does -- setting it allows all reclaim activity, clearing them prevents it. [akpm@linux-foundation.org: fix build] [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Mel Gorman Acked-by: Michal Hocko Acked-by: Vlastimil Babka Acked-by: Johannes Weiner Cc: Christoph Lameter Acked-by: David Rientjes Cc: Vitaly Wool Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/swap.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index b2066fb5..12cd989 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -257,7 +257,7 @@ static int hib_submit_io(int rw, pgoff_t page_off, void *addr, struct bio *bio; int error = 0; - bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); + bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1); bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); bio->bi_bdev = hib_resume_bdev; @@ -356,7 +356,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) return -ENOSPC; if (hb) { - src = (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN | + src = (void *)__get_free_page(__GFP_RECLAIM | __GFP_NOWARN | __GFP_NORETRY); if (src) { copy_page(src, buf); @@ -364,7 +364,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) ret = hib_wait_io(hb); /* Free pages */ if (ret) return ret; - src = (void *)__get_free_page(__GFP_WAIT | + src = (void *)__get_free_page(__GFP_RECLAIM | __GFP_NOWARN | __GFP_NORETRY); if (src) { @@ -672,7 +672,7 @@ static int save_image_lzo(struct swap_map_handle *handle, nr_threads = num_online_cpus() - 1; nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); - page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); + page = (void *)__get_free_page(__GFP_RECLAIM | __GFP_HIGH); if (!page) { printk(KERN_ERR "PM: Failed to allocate LZO page\n"); ret = -ENOMEM; @@ -975,7 +975,7 @@ static int get_swap_reader(struct swap_map_handle *handle, last = tmp; tmp->map = (struct swap_map_page *) - __get_free_page(__GFP_WAIT | __GFP_HIGH); + __get_free_page(__GFP_RECLAIM | __GFP_HIGH); if (!tmp->map) { release_swap_reader(handle); return -ENOMEM; @@ -1242,9 +1242,9 @@ static int load_image_lzo(struct swap_map_handle *handle, for (i = 0; i < read_pages; i++) { page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ? - __GFP_WAIT | __GFP_HIGH : - __GFP_WAIT | __GFP_NOWARN | - __GFP_NORETRY); + __GFP_RECLAIM | __GFP_HIGH : + __GFP_RECLAIM | __GFP_NOWARN | + __GFP_NORETRY); if (!page[i]) { if (i < LZO_CMP_PAGES) { -- cgit v1.1 From 3d9c637f4ae74b45d95bb6cbd793fbffad0a709c Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Fri, 6 Nov 2015 16:29:12 -0800 Subject: module: export param_free_charp() Change the param_free_charp() function from static to exported. It is used by zswap in the next patch ("zswap: use charp for zswap param strings"). Signed-off-by: Dan Streetman Acked-by: Rusty Russell Cc: Seth Jennings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/params.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index b6554aa..93a380a 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -325,10 +325,11 @@ int param_get_charp(char *buffer, const struct kernel_param *kp) } EXPORT_SYMBOL(param_get_charp); -static void param_free_charp(void *arg) +void param_free_charp(void *arg) { maybe_kfree_parameter(*((char **)arg)); } +EXPORT_SYMBOL(param_free_charp); const struct kernel_param_ops param_ops_charp = { .set = param_set_charp, -- cgit v1.1 From 3824657c522f19f85a76bd932821174a5557a382 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Fri, 6 Nov 2015 16:30:38 -0800 Subject: printk: prevent userland from spoofing kernel messages The following statement of ABI/testing/dev-kmsg is not quite right: It is not possible to inject messages from userspace with the facility number LOG_KERN (0), to make sure that the origin of the messages can always be reliably determined. Userland actually can inject messages with a facility of 0 by abusing the fact that the facility is stored in a u8 data type. By using a facility which is a multiple of 256 the assignment of msg->facility in log_store() implicitly truncates it to 0, i.e. LOG_KERN, allowing users of /dev/kmsg to spoof kernel messages as shown below: The following call... # printf '<%d>Kernel panic - not syncing: beer empty\n' 0 >/dev/kmsg ...leads to the following log entry (dmesg -x | tail -n 1): user :emerg : [ 66.137758] Kernel panic - not syncing: beer empty However, this call... # printf '<%d>Kernel panic - not syncing: beer empty\n' 0x800 >/dev/kmsg ...leads to the slightly different log entry (note the kernel facility): kern :emerg : [ 74.177343] Kernel panic - not syncing: beer empty Fix that by limiting the user provided facility to 8 bit right from the beginning and catch the truncation early. Fixes: 7ff9554bb578 ("printk: convert byte-buffer to variable-length...") Signed-off-by: Mathias Krause Cc: Greg Kroah-Hartman Cc: Petr Mladek Cc: Alex Elder Cc: Joe Perches Cc: Kay Sievers Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index b16f354..2ce8826 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -269,6 +269,9 @@ static u32 clear_idx; #define PREFIX_MAX 32 #define LOG_LINE_MAX (1024 - PREFIX_MAX) +#define LOG_LEVEL(v) ((v) & 0x07) +#define LOG_FACILITY(v) ((v) >> 3 & 0xff) + /* record buffer */ #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) #define LOG_ALIGN 4 @@ -612,7 +615,6 @@ struct devkmsg_user { static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) { char *buf, *line; - int i; int level = default_message_loglevel; int facility = 1; /* LOG_USER */ size_t len = iov_iter_count(from); @@ -642,12 +644,13 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) line = buf; if (line[0] == '<') { char *endp = NULL; + unsigned int u; - i = simple_strtoul(line+1, &endp, 10); + u = simple_strtoul(line + 1, &endp, 10); if (endp && endp[0] == '>') { - level = i & 7; - if (i >> 3) - facility = i >> 3; + level = LOG_LEVEL(u); + if (LOG_FACILITY(u) != 0) + facility = LOG_FACILITY(u); endp++; len -= endp - line; line = endp; -- cgit v1.1 From 2e01fabe67ccaff1d59bda01e60a61f5fb0aa7b6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 6 Nov 2015 16:32:19 -0800 Subject: signals: kill block_all_signals() and unblock_all_signals() It is hardly possible to enumerate all problems with block_all_signals() and unblock_all_signals(). Just for example, 1. block_all_signals(SIGSTOP/etc) simply can't help if the caller is multithreaded. Another thread can dequeue the signal and force the group stop. 2. Even is the caller is single-threaded, it will "stop" anyway. It will not sleep, but it will spin in kernel space until SIGCONT or SIGKILL. And a lot more. In short, this interface doesn't work at all, at least the last 10+ years. Daniel said: Yeah the only times I played around with the DRM_LOCK stuff was when old drivers accidentally deadlocked - my impression is that the entire DRM_LOCK thing was never really tested properly ;-) Hence I'm all for purging where this leaks out of the drm subsystem. Signed-off-by: Oleg Nesterov Acked-by: Daniel Vetter Acked-by: Dave Airlie Cc: Richard Weinberger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 51 +-------------------------------------------------- 1 file changed, 1 insertion(+), 50 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 0f6bbbe..f2cbd4e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -503,41 +503,6 @@ int unhandled_signal(struct task_struct *tsk, int sig) return !tsk->ptrace; } -/* - * Notify the system that a driver wants to block all signals for this - * process, and wants to be notified if any signals at all were to be - * sent/acted upon. If the notifier routine returns non-zero, then the - * signal will be acted upon after all. If the notifier routine returns 0, - * then then signal will be blocked. Only one block per process is - * allowed. priv is a pointer to private data that the notifier routine - * can use to determine if the signal should be blocked or not. - */ -void -block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask) -{ - unsigned long flags; - - spin_lock_irqsave(¤t->sighand->siglock, flags); - current->notifier_mask = mask; - current->notifier_data = priv; - current->notifier = notifier; - spin_unlock_irqrestore(¤t->sighand->siglock, flags); -} - -/* Notify the system that blocking has ended. */ - -void -unblock_all_signals(void) -{ - unsigned long flags; - - spin_lock_irqsave(¤t->sighand->siglock, flags); - current->notifier = NULL; - current->notifier_data = NULL; - recalc_sigpending(); - spin_unlock_irqrestore(¤t->sighand->siglock, flags); -} - static void collect_signal(int sig, struct sigpending *list, siginfo_t *info) { struct sigqueue *q, *first = NULL; @@ -580,19 +545,8 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, { int sig = next_signal(pending, mask); - if (sig) { - if (current->notifier) { - if (sigismember(current->notifier_mask, sig)) { - if (!(current->notifier)(current->notifier_data)) { - clear_thread_flag(TIF_SIGPENDING); - return 0; - } - } - } - + if (sig) collect_signal(sig, pending, info); - } - return sig; } @@ -2483,9 +2437,6 @@ EXPORT_SYMBOL(force_sig); EXPORT_SYMBOL(send_sig); EXPORT_SYMBOL(send_sig_info); EXPORT_SYMBOL(sigprocmask); -EXPORT_SYMBOL(block_all_signals); -EXPORT_SYMBOL(unblock_all_signals); - /* * System call entry points. -- cgit v1.1 From 5fa534c987784c4811757a34c425aff3ce3b5037 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 6 Nov 2015 16:32:31 -0800 Subject: coredump: ensure all coredumping tasks have SIGNAL_GROUP_COREDUMP task_will_free_mem() is wrong in many ways, and in particular the SIGNAL_GROUP_COREDUMP check is not reliable: a task can participate in the coredumping without SIGNAL_GROUP_COREDUMP bit set. change zap_threads() paths to always set SIGNAL_GROUP_COREDUMP even if other CLONE_VM processes can't react to SIGKILL. Fortunately, at least oom-kill case if fine; it kills all tasks sharing the same mm, so it should also kill the process which actually dumps the core. The change in prepare_signal() is not strictly necessary, it just ensures that the patch does not bring another subtle behavioural change. But it reminds us that this SIGNAL_GROUP_EXIT/COREDUMP case needs more changes. Signed-off-by: Oleg Nesterov Cc: David Rientjes Cc: Kyle Walker Acked-by: Michal Hocko Cc: Stanislav Kozina Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index f2cbd4e..c0b01fe 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -788,7 +788,7 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) sigset_t flush; if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) { - if (signal->flags & SIGNAL_GROUP_COREDUMP) + if (!(signal->flags & SIGNAL_GROUP_EXIT)) return sig == SIGKILL; /* * The process is in the middle of dying, nothing to do. -- cgit v1.1 From de90a6bcaede81f35e8caf4566d1006267230377 Mon Sep 17 00:00:00 2001 From: Minfei Huang Date: Fri, 6 Nov 2015 16:32:45 -0800 Subject: kexec: use file name as the output message prefix kexec output message misses the prefix "kexec", when Dave Young split the kexec code. Now, we use file name as the output message prefix. Currently, the format of output message: [ 140.290795] SYSC_kexec_load: hello, world [ 140.291534] kexec: sanity_check_segment_list: hello, world Ideally, the format of output message: [ 30.791503] kexec: SYSC_kexec_load, Hello, world [ 79.182752] kexec_core: sanity_check_segment_list, Hello, world Remove the custom prefix "kexec" in output message. Signed-off-by: Minfei Huang Acked-by: Dave Young Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 2 ++ kernel/kexec_core.c | 4 ++-- kernel/kexec_file.c | 2 ++ 3 files changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 4c5edc3..d873b64 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -6,6 +6,8 @@ * Version 2. See the file COPYING for more details. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index bd9f8a0..11b64a6 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -6,7 +6,7 @@ * Version 2. See the file COPYING for more details. */ -#define pr_fmt(fmt) "kexec: " fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include @@ -1027,7 +1027,7 @@ static int __init crash_notes_memory_init(void) crash_notes = __alloc_percpu(size, align); if (!crash_notes) { - pr_warn("Kexec: Memory allocation for saving cpu register states failed\n"); + pr_warn("Memory allocation for saving cpu register states failed\n"); return -ENOMEM; } return 0; diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 6a9a3f2..b70ada0 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -9,6 +9,8 @@ * Version 2. See the file COPYING for more details. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include -- cgit v1.1 From 8639b46139b0e4ea3b1ab1c274e410ee327f1d89 Mon Sep 17 00:00:00 2001 From: Ben Segall Date: Fri, 6 Nov 2015 16:32:48 -0800 Subject: pidns: fix set/getpriority and ioprio_set/get in PRIO_USER mode setpriority(PRIO_USER, 0, x) will change the priority of tasks outside of the current pid namespace. This is in contrast to both the other modes of setpriority and the example of kill(-1). Fix this. getpriority and ioprio have the same failure mode, fix them too. Eric said: : After some more thinking about it this patch sounds justifiable. : : My goal with namespaces is not to build perfect isolation mechanisms : as that can get into ill defined territory, but to build well defined : mechanisms. And to handle the corner cases so you can use only : a single namespace with well defined results. : : In this case you have found the two interfaces I am aware of that : identify processes by uid instead of by pid. Which quite frankly is : weird. Unfortunately the weird unexpected cases are hard to handle : in the usual way. : : I was hoping for a little more information. Changes like this one we : have to be careful of because someone might be depending on the current : behavior. I don't think they are and I do think this make sense as part : of the pid namespace. Signed-off-by: Ben Segall Cc: Oleg Nesterov Cc: Al Viro Cc: Ambrose Feinstein Acked-by: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index fa2f2f6..6af9212 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -222,7 +222,7 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) goto out_unlock; /* No processes for this user */ } do_each_thread(g, p) { - if (uid_eq(task_uid(p), uid)) + if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) error = set_one_prio(p, niceval, error); } while_each_thread(g, p); if (!uid_eq(uid, cred->uid)) @@ -290,7 +290,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) goto out_unlock; /* No processes for this user */ } do_each_thread(g, p) { - if (uid_eq(task_uid(p), uid)) { + if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) { niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; -- cgit v1.1 From 08d78658f393fefaa2e6507ea052c6f8ef4002a2 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 6 Nov 2015 16:32:58 -0800 Subject: panic: release stale console lock to always get the logbuf printed out In some cases we may end up killing the CPU holding the console lock while still having valuable data in logbuf. E.g. I'm observing the following: - A crash is happening on one CPU and console_unlock() is being called on some other. - console_unlock() tries to print out the buffer before releasing the lock and on slow console it takes time. - in the meanwhile crashing CPU does lots of printk()-s with valuable data (which go to the logbuf) and sends IPIs to all other CPUs. - console_unlock() finishes printing previous chunk and enables interrupts before trying to print out the rest, the CPU catches the IPI and never releases console lock. This is not the only possible case: in VT/fb subsystems we have many other console_lock()/console_unlock() users. Non-masked interrupts (or receiving NMI in case of extreme slowness) will have the same result. Getting the whole console buffer printed out on crash should be top priority. [akpm@linux-foundation.org: tweak comment text] Signed-off-by: Vitaly Kuznetsov Cc: HATAYAMA Daisuke Cc: Masami Hiramatsu Cc: Jiri Kosina Cc: Baoquan He Cc: Prarit Bhargava Cc: Xie XiuQi Cc: Seth Jennings Cc: "K. Y. Srinivasan" Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 04e91ff..4579dbb 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -23,6 +23,7 @@ #include #include #include +#include #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 @@ -147,6 +148,15 @@ void panic(const char *fmt, ...) bust_spinlocks(0); + /* + * We may have ended up stopping the CPU holding the lock (in + * smp_send_stop()) while still having some valuable data in the console + * buffer. Try to acquire the lock then release it regardless of the + * result. The release will also print the buffers out. + */ + console_trylock(); + console_unlock(); + if (!panic_blink) panic_blink = no_blink; -- cgit v1.1 From 03e88ae6b369da2a26a6e09ad165e57d210789cd Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Fri, 6 Nov 2015 22:07:26 +0300 Subject: tracing: Remove unused ftrace_cpu_disabled per cpu variable Since the ring buffer is lockless, there is no need to disable ftrace on CPU. And no one doing so: after commit 68179686ac67cb ("tracing: Remove ftrace_disable/enable_cpu()") ftrace_cpu_disabled stays the same after initialization, nothing changes it. ftrace_cpu_disabled shouldn't be used by any external module since it disables only function and graph_function tracers but not any other tracer. Link: http://lkml.kernel.org/r/1446836846-22239-1-git-send-email-0x7f454c46@gmail.com Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 6 ------ kernel/trace/trace.h | 1 - kernel/trace/trace_functions_graph.c | 6 ------ 3 files changed, 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 08af79c..b115826 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -100,8 +100,6 @@ static DEFINE_PER_CPU(bool, trace_cmdline_save); */ static int tracing_disabled = 1; -DEFINE_PER_CPU(int, ftrace_cpu_disabled); - cpumask_var_t __read_mostly tracing_buffer_mask; /* @@ -1775,10 +1773,6 @@ trace_function(struct trace_array *tr, struct ring_buffer_event *event; struct ftrace_entry *entry; - /* If we are reading the ring buffer, don't trace */ - if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) - return; - event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), flags, pc); if (!event) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index dd76208..919d9d0 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -667,7 +667,6 @@ extern int DYN_FTRACE_TEST_NAME2(void); extern bool ring_buffer_expanded; extern bool tracing_selftest_disabled; -DECLARE_PER_CPU(int, ftrace_cpu_disabled); #ifdef CONFIG_FTRACE_STARTUP_TEST extern int trace_selftest_startup_function(struct tracer *trace, diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 92382af..a663cbb 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -288,9 +288,6 @@ int __trace_graph_entry(struct trace_array *tr, struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ftrace_graph_ent_entry *entry; - if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) - return 0; - event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, sizeof(*entry), flags, pc); if (!event) @@ -403,9 +400,6 @@ void __trace_graph_return(struct trace_array *tr, struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ftrace_graph_ret_entry *entry; - if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) - return; - event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, sizeof(*entry), flags, pc); if (!event) -- cgit v1.1 From 2fd59077755c44dbbd9b2fa89cf988235a3a6a2b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 4 Nov 2015 05:48:38 -0800 Subject: perf: Disable IRQs across RCU RS CS that acquires scheduler lock The perf_lock_task_context() function disables preemption across its RCU read-side critical section because that critical section acquires a scheduler lock. If there was a preemption during that RCU read-side critical section, the rcu_read_unlock() could attempt to acquire scheduler locks, resulting in deadlock. However, recent optimizations to expedited grace periods mean that IPI handlers that execute during preemptible RCU read-side critical sections can now cause the subsequent rcu_read_unlock() to acquire scheduler locks. Disabling preemption does nothiing to prevent these IPI handlers from executing, so these optimizations introduced a deadlock. In theory, this deadlock could be avoided by pulling all wakeups and printk()s out from rnp->lock critical sections, but in practice this would re-introduce some RCU CPU stall warning bugs. Given that acquiring scheduler locks entails disabling interrupts, these deadlocks can be avoided by disabling interrupts (instead of disabling preemption) across any RCU read-side critical that acquires scheduler locks and holds them across the rcu_read_unlock(). This commit therefore makes this change for perf_lock_task_context(). Reported-by: Dave Jones Reported-by: Peter Zijlstra Signed-off-by: Paul E. McKenney Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Stephane Eranian Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20151104134838.GR29027@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index ea02109..f8e5c44 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1050,13 +1050,13 @@ retry: /* * One of the few rules of preemptible RCU is that one cannot do * rcu_read_unlock() while holding a scheduler (or nested) lock when - * part of the read side critical section was preemptible -- see + * part of the read side critical section was irqs-enabled -- see * rcu_read_unlock_special(). * * Since ctx->lock nests under rq->lock we must ensure the entire read - * side critical section is non-preemptible. + * side critical section has interrupts disabled. */ - preempt_disable(); + local_irq_save(*flags); rcu_read_lock(); ctx = rcu_dereference(task->perf_event_ctxp[ctxn]); if (ctx) { @@ -1070,21 +1070,22 @@ retry: * if so. If we locked the right context, then it * can't get swapped on us any more. */ - raw_spin_lock_irqsave(&ctx->lock, *flags); + raw_spin_lock(&ctx->lock); if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) { - raw_spin_unlock_irqrestore(&ctx->lock, *flags); + raw_spin_unlock(&ctx->lock); rcu_read_unlock(); - preempt_enable(); + local_irq_restore(*flags); goto retry; } if (!atomic_inc_not_zero(&ctx->refcount)) { - raw_spin_unlock_irqrestore(&ctx->lock, *flags); + raw_spin_unlock(&ctx->lock); ctx = NULL; } } rcu_read_unlock(); - preempt_enable(); + if (!ctx) + local_irq_restore(*flags); return ctx; } -- cgit v1.1 From b71b437eedaed985062492565d9d421d975ae845 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 2 Nov 2015 10:50:51 +0100 Subject: perf: Fix inherited events vs. tracepoint filters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Arnaldo reported that tracepoint filters seem to misbehave (ie. not apply) on inherited events. The fix is obvious; filters are only set on the actual (parent) event, use the normal pattern of using this parent event for filters. This is safe because each child event has a reference to it. Reported-by: Arnaldo Carvalho de Melo Tested-by: Arnaldo Carvalho de Melo Signed-off-by: Peter Zijlstra (Intel) Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Frédéric Weisbecker Cc: Jiri Olsa Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Wang Nan Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20151102095051.GN17308@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/events/core.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index f8e5c44..98a4b9d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6909,6 +6909,10 @@ static int perf_tp_filter_match(struct perf_event *event, { void *record = data->raw->data; + /* only top level events have filters set */ + if (event->parent) + event = event->parent; + if (likely(!event->filter) || filter_match_preds(event->filter, record)) return 1; return 0; -- cgit v1.1 From 25b3e5a3344e1f700c1efec5b6f0199f04707fb1 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 5 Nov 2015 15:56:22 -0500 Subject: sched/numa: Fix math underflow in task_tick_numa() The NUMA balancing code implements delays in scanning by advancing curr->node_stamp beyond curr->se.sum_exec_runtime. With unsigned math, that creates an underflow, which results in task_numa_work being queued all the time, even when we don't want to. Avoiding the math underflow makes it possible to reduce CPU overhead in the NUMA balancing code. Reported-and-tested-by: Jan Stancek Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: mgorman@suse.de Link: http://lkml.kernel.org/r/1446756983-28173-2-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 824aa9f..f04fda8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2302,7 +2302,7 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) now = curr->se.sum_exec_runtime; period = (u64)curr->numa_scan_period * NSEC_PER_MSEC; - if (now - curr->node_stamp > period) { + if (now > curr->node_stamp + period) { if (!curr->node_stamp) curr->numa_scan_period = task_scan_min(curr); curr->node_stamp += period; -- cgit v1.1 From 79211c8ed19c055ca105502c8733800d442a0ae6 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 9 Nov 2015 14:58:13 -0800 Subject: remove abs64() Switch everything to the new and more capable implementation of abs(). Mainly to give the new abs() a bit of a workout. Cc: Michal Nazarewicz Cc: John Stultz Cc: Ingo Molnar Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Masami Hiramatsu Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/clocksource.c | 2 +- kernel/time/timekeeping.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 0d8fe8b..1347882 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -217,7 +217,7 @@ static void clocksource_watchdog(unsigned long data) continue; /* Check the deviation from the watchdog clocksource. */ - if (abs64(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { + if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable because the skew is too large:\n", cs->name); pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n", diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index b1356b7..d563c19 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1614,7 +1614,7 @@ static __always_inline void timekeeping_freqadjust(struct timekeeper *tk, negative = (tick_error < 0); /* Sort out the magnitude of the correction */ - tick_error = abs64(tick_error); + tick_error = abs(tick_error); for (adj = 0; tick_error > interval; adj++) tick_error >>= 1; -- cgit v1.1 From f70cd6b07e629f367bb9b1ac9d0e3e669eb325c0 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 28 Oct 2015 02:39:55 +0100 Subject: context_tracking: remove duplicate enabled check All calls to context_tracking_enter and context_tracking_exit are already checking context_tracking_is_enabled, except the context_tracking_user_enter and context_tracking_user_exit functions left in for the benefit of assembly calls. Pull the check up to those functions, by making them simple wrappers around the user_enter and user_exit inline functions. Cc: Frederic Weisbecker Cc: Paul McKenney Reviewed-by: Rik van Riel Tested-by: Rik van Riel Acked-by: Andy Lutomirski Signed-off-by: Paolo Bonzini --- kernel/context_tracking.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 0a495ab..6d4c6ce 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -63,15 +63,6 @@ void context_tracking_enter(enum ctx_state state) unsigned long flags; /* - * Repeat the user_enter() check here because some archs may be calling - * this from asm and if no CPU needs context tracking, they shouldn't - * go further. Repeat the check here until they support the inline static - * key check. - */ - if (!context_tracking_is_enabled()) - return; - - /* * Some contexts may involve an exception occuring in an irq, * leading to that nesting: * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() @@ -128,7 +119,7 @@ EXPORT_SYMBOL_GPL(context_tracking_enter); void context_tracking_user_enter(void) { - context_tracking_enter(CONTEXT_USER); + user_enter(); } NOKPROBE_SYMBOL(context_tracking_user_enter); @@ -148,9 +139,6 @@ void context_tracking_exit(enum ctx_state state) { unsigned long flags; - if (!context_tracking_is_enabled()) - return; - if (in_interrupt()) return; @@ -181,7 +169,7 @@ EXPORT_SYMBOL_GPL(context_tracking_exit); void context_tracking_user_exit(void) { - context_tracking_exit(CONTEXT_USER); + user_exit(); } NOKPROBE_SYMBOL(context_tracking_user_exit); -- cgit v1.1 From d0e536d89395ecd8ab78fe999dc4d6f5d140ce46 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 28 Oct 2015 02:39:56 +0100 Subject: context_tracking: avoid irq_save/irq_restore on guest entry and exit guest_enter and guest_exit must be called with interrupts disabled, since they take the vtime_seqlock with write_seq{lock,unlock}. Therefore, it is not necessary to check for exceptions, nor to save/restore the IRQ state, when context tracking functions are called by guest_enter and guest_exit. Split the body of context_tracking_entry and context_tracking_exit out to __-prefixed functions, and use them from KVM. Rik van Riel has measured this to speed up a tight vmentry/vmexit loop by about 2%. Cc: Andy Lutomirski Cc: Frederic Weisbecker Cc: Paul McKenney Reviewed-by: Rik van Riel Tested-by: Rik van Riel Signed-off-by: Paolo Bonzini --- kernel/context_tracking.c | 64 ++++++++++++++++++++++++++++------------------- 1 file changed, 38 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 6d4c6ce..d8560ee 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -58,27 +58,13 @@ static void context_tracking_recursion_exit(void) * instructions to execute won't use any RCU read side critical section * because this function sets RCU in extended quiescent state. */ -void context_tracking_enter(enum ctx_state state) +void __context_tracking_enter(enum ctx_state state) { - unsigned long flags; - - /* - * Some contexts may involve an exception occuring in an irq, - * leading to that nesting: - * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() - * This would mess up the dyntick_nesting count though. And rcu_irq_*() - * helpers are enough to protect RCU uses inside the exception. So - * just return immediately if we detect we are in an IRQ. - */ - if (in_interrupt()) - return; - /* Kernel threads aren't supposed to go to userspace */ WARN_ON_ONCE(!current->mm); - local_irq_save(flags); if (!context_tracking_recursion_enter()) - goto out_irq_restore; + return; if ( __this_cpu_read(context_tracking.state) != state) { if (__this_cpu_read(context_tracking.active)) { @@ -111,7 +97,27 @@ void context_tracking_enter(enum ctx_state state) __this_cpu_write(context_tracking.state, state); } context_tracking_recursion_exit(); -out_irq_restore: +} +NOKPROBE_SYMBOL(__context_tracking_enter); +EXPORT_SYMBOL_GPL(__context_tracking_enter); + +void context_tracking_enter(enum ctx_state state) +{ + unsigned long flags; + + /* + * Some contexts may involve an exception occuring in an irq, + * leading to that nesting: + * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() + * This would mess up the dyntick_nesting count though. And rcu_irq_*() + * helpers are enough to protect RCU uses inside the exception. So + * just return immediately if we detect we are in an IRQ. + */ + if (in_interrupt()) + return; + + local_irq_save(flags); + __context_tracking_enter(state); local_irq_restore(flags); } NOKPROBE_SYMBOL(context_tracking_enter); @@ -135,16 +141,10 @@ NOKPROBE_SYMBOL(context_tracking_user_enter); * This call supports re-entrancy. This way it can be called from any exception * handler without needing to know if we came from userspace or not. */ -void context_tracking_exit(enum ctx_state state) +void __context_tracking_exit(enum ctx_state state) { - unsigned long flags; - - if (in_interrupt()) - return; - - local_irq_save(flags); if (!context_tracking_recursion_enter()) - goto out_irq_restore; + return; if (__this_cpu_read(context_tracking.state) == state) { if (__this_cpu_read(context_tracking.active)) { @@ -161,7 +161,19 @@ void context_tracking_exit(enum ctx_state state) __this_cpu_write(context_tracking.state, CONTEXT_KERNEL); } context_tracking_recursion_exit(); -out_irq_restore: +} +NOKPROBE_SYMBOL(__context_tracking_exit); +EXPORT_SYMBOL_GPL(__context_tracking_exit); + +void context_tracking_exit(enum ctx_state state) +{ + unsigned long flags; + + if (in_interrupt()) + return; + + local_irq_save(flags); + __context_tracking_exit(state); local_irq_restore(flags); } NOKPROBE_SYMBOL(context_tracking_exit); -- cgit v1.1 From 4717f133736dec10605da9e29e707144c8d486df Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Tue, 10 Nov 2015 11:58:12 +0200 Subject: genirq/PM: Restore system wake up from chained interrupts Commit e509bd7da149 ("genirq: Allow migration of chained interrupts by installing default action") breaks PCS wake up IRQ behaviour on TI OMAP based platforms (dra7-evm). TI OMAP IRQ wake up configuration: GIC-irqchip->PCM_IRQ |- omap_prcm_register_chain_handler |- PRCM-irqchip -> PRCM_IO_IRQ |- pcs_irq_chain_handler |- pinctrl-irqchip -> PCS_uart1_wakeup_irq This happens because IRQ PM code (irq/pm.c) is expected to ignore chained interrupts by default: static bool suspend_device_irq(struct irq_desc *desc) { if (!desc->action || desc->no_suspend_depth) return false; - it's expected !desc->action = true for chained interrupts; but, after above change, all chained interrupt descriptors will have default action handler installed - chained_action. As result, chained interrupts will be silently disabled during system suspend. Hence, fix it by introducing helper function irq_desc_is_chained() and use it in suspend_device_irq() for chained interrupts identification and skip them, once detected. Fixes: e509bd7da149 ("genirq: Allow migration of chained interrupts..") Signed-off-by: Grygorii Strashko Reviewed-by: Mika Westerberg Cc: Tony Lindgren Cc: Cc: Cc: Tony Lindgren Link: http://lkml.kernel.org/r/1447149492-20699-1-git-send-email-grygorii.strashko@ti.com Signed-off-by: Thomas Gleixner --- kernel/irq/internals.h | 5 +++++ kernel/irq/pm.c | 3 ++- kernel/irq/proc.c | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 05c2188..fcab63c 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -199,6 +199,11 @@ static inline int irq_desc_get_node(struct irq_desc *desc) return irq_common_data_get_node(&desc->irq_common_data); } +static inline int irq_desc_is_chained(struct irq_desc *desc) +{ + return (desc->action && desc->action == &chained_action); +} + #ifdef CONFIG_PM_SLEEP bool irq_pm_check_wakeup(struct irq_desc *desc); void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action); diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 21c6261..84ab239 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -70,7 +70,8 @@ void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) static bool suspend_device_irq(struct irq_desc *desc) { - if (!desc->action || desc->no_suspend_depth) + if (!desc->action || irq_desc_is_chained(desc) || + desc->no_suspend_depth) return false; if (irqd_is_wakeup_set(&desc->irq_data)) { diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index a916cf1..a2c02fd 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -475,7 +475,7 @@ int show_interrupts(struct seq_file *p, void *v) for_each_online_cpu(j) any_count |= kstat_irqs_cpu(i, j); action = desc->action; - if ((!action || action == &chained_action) && !any_count) + if ((!action || irq_desc_is_chained(desc)) && !any_count) goto out; seq_printf(p, "%*d: ", prec, i); -- cgit v1.1 From e428abbbf616cd8fdd1162e4a624ad1d47b47544 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Tue, 10 Nov 2015 05:15:15 +0800 Subject: tracing: #ifdef out uses of max trace when CONFIG_TRACER_MAX_TRACE is not set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit tracing_max_lat_fops is used only when TRACER_MAX_TRACE enabled, so also swith the related code. The related warning with defconfig under x86_64: CC kernel/trace/trace.o kernel/trace/trace.c:5466:37: warning: ‘tracing_max_lat_fops’ defined but not used [-Wunused-const-variable] static const struct file_operations tracing_max_lat_fops = { Signed-off-by: Chen Gang Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b115826..87fb980 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4548,6 +4548,8 @@ out: return ret; } +#ifdef CONFIG_TRACER_MAX_TRACE + static ssize_t tracing_max_lat_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -4562,6 +4564,8 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf, return tracing_nsecs_write(filp->private_data, ubuf, cnt, ppos); } +#endif + static int tracing_open_pipe(struct inode *inode, struct file *filp) { struct trace_array *tr = inode->i_private; @@ -5463,12 +5467,14 @@ static const struct file_operations tracing_thresh_fops = { .llseek = generic_file_llseek, }; +#ifdef CONFIG_TRACER_MAX_TRACE static const struct file_operations tracing_max_lat_fops = { .open = tracing_open_generic, .read = tracing_max_lat_read, .write = tracing_max_lat_write, .llseek = generic_file_llseek, }; +#endif static const struct file_operations set_tracer_fops = { .open = tracing_open_generic, -- cgit v1.1 From a31d82d85afdcbdb8c4128dfd6146992dc6b3576 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 10 Nov 2015 15:28:17 -0500 Subject: bpf_trace: Make dependent on PERF_EVENTS Arnd Bergmann reported: In my ARM randconfig tests, I'm getting a build error for newly added code in bpf_perf_event_read and bpf_perf_event_output whenever CONFIG_PERF_EVENTS is disabled: kernel/trace/bpf_trace.c: In function 'bpf_perf_event_read': kernel/trace/bpf_trace.c:203:11: error: 'struct perf_event' has no member named 'oncpu' if (event->oncpu != smp_processor_id() || ^ kernel/trace/bpf_trace.c:204:11: error: 'struct perf_event' has no member named 'pmu' event->pmu->count) This can happen when UPROBE_EVENT is enabled but KPROBE_EVENT is disabled. I'm not sure if that is a configuration we care about, otherwise we could prevent this case from occuring by adding Kconfig dependencies. Looking at this further, it's really that UPROBE_EVENT enables PERF_EVENTS. By just having BPF_EVENTS depend on PERF_EVENTS, then all is fine. Link: http://lkml.kernel.org/r/4525348.Aq9YoXkChv@wuerfel Reported-by: Arnd Bergmann Signed-off-by: Steven Rostedt Signed-off-by: David S. Miller --- kernel/trace/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 1153c43..f5727b5 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -434,7 +434,7 @@ config UPROBE_EVENT config BPF_EVENTS depends on BPF_SYSCALL - depends on KPROBE_EVENT || UPROBE_EVENT + depends on (KPROBE_EVENT || UPROBE_EVENT) && PERF_EVENTS bool default y help -- cgit v1.1 From e41b104c7dba92443e594e6bc86e4b0bf1cdf573 Mon Sep 17 00:00:00 2001 From: Zhou Chengming Date: Fri, 6 Nov 2015 14:25:00 +0800 Subject: livepatch: x86: fix relocation computation with kASLR With kASLR enabled, old_addr provided by patch module is being shifted accrodingly so that the symbol lookups work. To have module relocations handled properly as well, the same transformation needs to be perfomed on relocation address information. [jkosina@suse.cz: extended / reworded changelog a bit] Reported-by: Cyril B. Signed-off-by: Zhou Chengming Acked-by: Josh Poimboeuf Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 6e53441..db545cb 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -294,6 +294,12 @@ static int klp_write_object_relocations(struct module *pmod, for (reloc = obj->relocs; reloc->name; reloc++) { if (!klp_is_module(obj)) { + +#if defined(CONFIG_RANDOMIZE_BASE) + /* If KASLR has been enabled, adjust old value accordingly */ + if (kaslr_enabled()) + reloc->val += kaslr_offset(); +#endif ret = klp_verify_vmlinux_symbol(reloc->name, reloc->val); if (ret) -- cgit v1.1 From 9d8a765211335cfdad464b90fb19f546af5706ae Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Fri, 20 Nov 2015 15:57:21 -0800 Subject: kernel/signal.c: unexport sigsuspend() sigsuspend() is nowhere used except in signal.c itself, so we can mark it static do not pollute the global namespace. But this patch is more than a boring cleanup patch, it fixes a real issue on UserModeLinux. UML has a special console driver to display ttys using xterm, or other terminal emulators, on the host side. Vegard reported that sometimes UML is unable to spawn a xterm and he's facing the following warning: WARNING: CPU: 0 PID: 908 at include/linux/thread_info.h:128 sigsuspend+0xab/0xc0() It turned out that this warning makes absolutely no sense as the UML xterm code calls sigsuspend() on the host side, at least it tries. But as the kernel itself offers a sigsuspend() symbol the linker choose this one instead of the glibc wrapper. Interestingly this code used to work since ever but always blocked signals on the wrong side. Some recent kernel change made the WARN_ON() trigger and uncovered the bug. It is a wonderful example of how much works by chance on computers. :-) Fixes: 68f3f16d9ad0f1 ("new helper: sigsuspend()") Signed-off-by: Richard Weinberger Reported-by: Vegard Nossum Tested-by: Vegard Nossum Acked-by: Oleg Nesterov Cc: [3.5+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index c0b01fe..f3f1f7a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3503,7 +3503,7 @@ SYSCALL_DEFINE0(pause) #endif -int sigsuspend(sigset_t *set) +static int sigsuspend(sigset_t *set) { current->saved_sigmask = current->blocked; set_current_blocked(set); -- cgit v1.1 From 7625b3a0007decf2b135cb47ca67abc78a7b1bc1 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 20 Nov 2015 15:57:24 -0800 Subject: kernel/panic.c: turn off locks debug before releasing console lock Commit 08d78658f393 ("panic: release stale console lock to always get the logbuf printed out") introduced an unwanted bad unlock balance report when panic() is called directly and not from OOPS (e.g. from out_of_memory()). The difference is that in case of OOPS we disable locks debug in oops_enter() and on direct panic call nobody does that. Fixes: 08d78658f393 ("panic: release stale console lock to always get the logbuf printed out") Reported-by: kernel test robot Signed-off-by: Vitaly Kuznetsov Cc: HATAYAMA Daisuke Cc: Masami Hiramatsu Cc: Jiri Kosina Cc: Baoquan He Cc: Prarit Bhargava Cc: Xie XiuQi Cc: Seth Jennings Cc: "K. Y. Srinivasan" Cc: Jan Kara Cc: Petr Mladek Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 4579dbb..4b150bc 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -152,8 +152,11 @@ void panic(const char *fmt, ...) * We may have ended up stopping the CPU holding the lock (in * smp_send_stop()) while still having some valuable data in the console * buffer. Try to acquire the lock then release it regardless of the - * result. The release will also print the buffers out. + * result. The release will also print the buffers out. Locks debug + * should be disabled to avoid reporting bad unlock balance when + * panic() is not being callled from OOPS. */ + debug_locks_off(); console_trylock(); console_unlock(); -- cgit v1.1