author     Ingo Molnar <mingo@kernel.org>    2016-02-29 09:42:07 +0100
committer  Ingo Molnar <mingo@kernel.org>    2016-02-29 09:42:07 +0100
commit     6aa447bcbb444cd1b738613a20627f288d631665 (patch)
tree       62030bb14be0006248181305ac1fe377a99e5b65 /kernel
parent     abedf8e2419fb873d919dd74de2e84b510259339 (diff)
parent     48be3a67da7413d62e5efbcf2c73a9dddf61fb96 (diff)
Merge branch 'sched/urgent' into sched/core, to pick up fixes before applying new changes
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/bpf/verifier.c        |   2
-rw-r--r--  kernel/cgroup.c              |  31
-rw-r--r--  kernel/cpuset.c              |  71
-rw-r--r--  kernel/events/core.c         | 372
-rw-r--r--  kernel/locking/lockdep.c     |  58
-rw-r--r--  kernel/memremap.c            |   8
-rw-r--r--  kernel/module.c              | 124
-rw-r--r--  kernel/resource.c            |   5
-rw-r--r--  kernel/sched/core.c          |   4
-rw-r--r--  kernel/sched/deadline.c      |  24
-rw-r--r--  kernel/sched/fair.c          |  37
-rw-r--r--  kernel/sched/sched.h         |   2
-rw-r--r--  kernel/signal.c              |   6
-rw-r--r--  kernel/trace/ftrace.c        |  36
-rw-r--r--  kernel/trace/trace_events.c  |   3
-rw-r--r--  kernel/trace/trace_stack.c   |   6
-rw-r--r--  kernel/workqueue.c           |  74
17 files changed, 546 insertions, 317 deletions
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d1d3e8f..2e7f7ab 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2082,7 +2082,7 @@ static void adjust_branches(struct bpf_prog *prog, int pos, int delta) /* adjust offset of jmps if necessary */ if (i < pos && i + insn->off + 1 > pos) insn->off += delta; - else if (i > pos && i + insn->off + 1 < pos) + else if (i > pos + delta && i + insn->off + 1 <= pos + delta) insn->off -= delta; } } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c03a640..d27904c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -58,6 +58,7 @@ #include <linux/kthread.h> #include <linux/delay.h> #include <linux/atomic.h> +#include <linux/cpuset.h> #include <net/sock.h> /* @@ -2739,6 +2740,7 @@ out_unlock_rcu: out_unlock_threadgroup: percpu_up_write(&cgroup_threadgroup_rwsem); cgroup_kn_unlock(of->kn); + cpuset_post_attach_flush(); return ret ?: nbytes; } @@ -4655,14 +4657,15 @@ static void css_free_work_fn(struct work_struct *work) if (ss) { /* css free path */ + struct cgroup_subsys_state *parent = css->parent; int id = css->id; - if (css->parent) - css_put(css->parent); - ss->css_free(css); cgroup_idr_remove(&ss->css_idr, id); cgroup_put(cgrp); + + if (parent) + css_put(parent); } else { /* cgroup free path */ atomic_dec(&cgrp->root->nr_cgrps); @@ -4758,6 +4761,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css, INIT_LIST_HEAD(&css->sibling); INIT_LIST_HEAD(&css->children); css->serial_nr = css_serial_nr_next++; + atomic_set(&css->online_cnt, 0); if (cgroup_parent(cgrp)) { css->parent = cgroup_css(cgroup_parent(cgrp), ss); @@ -4780,6 +4784,10 @@ static int online_css(struct cgroup_subsys_state *css) if (!ret) { css->flags |= CSS_ONLINE; rcu_assign_pointer(css->cgroup->subsys[ss->id], css); + + atomic_inc(&css->online_cnt); + if (css->parent) + atomic_inc(&css->parent->online_cnt); } return ret; } @@ -5017,10 +5025,15 @@ static void css_killed_work_fn(struct work_struct *work) container_of(work, struct cgroup_subsys_state, destroy_work); mutex_lock(&cgroup_mutex); - offline_css(css); - mutex_unlock(&cgroup_mutex); - css_put(css); + do { + offline_css(css); + css_put(css); + /* @css can't go away while we're holding cgroup_mutex */ + css = css->parent; + } while (css && atomic_dec_and_test(&css->online_cnt)); + + mutex_unlock(&cgroup_mutex); } /* css kill confirmation processing requires process context, bounce */ @@ -5029,8 +5042,10 @@ static void css_killed_ref_fn(struct percpu_ref *ref) struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); - INIT_WORK(&css->destroy_work, css_killed_work_fn); - queue_work(cgroup_destroy_wq, &css->destroy_work); + if (atomic_dec_and_test(&css->online_cnt)) { + INIT_WORK(&css->destroy_work, css_killed_work_fn); + queue_work(cgroup_destroy_wq, &css->destroy_work); + } } /** diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 3e945fc..41989ab 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -287,6 +287,8 @@ static struct cpuset top_cpuset = { static DEFINE_MUTEX(cpuset_mutex); static DEFINE_SPINLOCK(callback_lock); +static struct workqueue_struct *cpuset_migrate_mm_wq; + /* * CPU / memory hotplug is handled asynchronously. */ @@ -972,31 +974,51 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, } /* - * cpuset_migrate_mm - * - * Migrate memory region from one set of nodes to another. - * - * Temporarilly set tasks mems_allowed to target nodes of migration, - * so that the migration code can allocate pages on these nodes. 
- * - * While the mm_struct we are migrating is typically from some - * other task, the task_struct mems_allowed that we are hacking - * is for our current task, which must allocate new pages for that - * migrating memory region. + * Migrate memory region from one set of nodes to another. This is + * performed asynchronously as it can be called from process migration path + * holding locks involved in process management. All mm migrations are + * performed in the queued order and can be waited for by flushing + * cpuset_migrate_mm_wq. */ +struct cpuset_migrate_mm_work { + struct work_struct work; + struct mm_struct *mm; + nodemask_t from; + nodemask_t to; +}; + +static void cpuset_migrate_mm_workfn(struct work_struct *work) +{ + struct cpuset_migrate_mm_work *mwork = + container_of(work, struct cpuset_migrate_mm_work, work); + + /* on a wq worker, no need to worry about %current's mems_allowed */ + do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL); + mmput(mwork->mm); + kfree(mwork); +} + static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to) { - struct task_struct *tsk = current; - - tsk->mems_allowed = *to; + struct cpuset_migrate_mm_work *mwork; - do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); + mwork = kzalloc(sizeof(*mwork), GFP_KERNEL); + if (mwork) { + mwork->mm = mm; + mwork->from = *from; + mwork->to = *to; + INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn); + queue_work(cpuset_migrate_mm_wq, &mwork->work); + } else { + mmput(mm); + } +} - rcu_read_lock(); - guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed); - rcu_read_unlock(); +void cpuset_post_attach_flush(void) +{ + flush_workqueue(cpuset_migrate_mm_wq); } /* @@ -1097,7 +1119,8 @@ static void update_tasks_nodemask(struct cpuset *cs) mpol_rebind_mm(mm, &cs->mems_allowed); if (migrate) cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems); - mmput(mm); + else + mmput(mm); } css_task_iter_end(&it); @@ -1545,11 +1568,11 @@ static void cpuset_attach(struct cgroup_taskset *tset) * @old_mems_allowed is the right nodesets that we * migrate mm from. */ - if (is_memory_migrate(cs)) { + if (is_memory_migrate(cs)) cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, &cpuset_attach_nodemask_to); - } - mmput(mm); + else + mmput(mm); } } @@ -1714,6 +1737,7 @@ out_unlock: mutex_unlock(&cpuset_mutex); kernfs_unbreak_active_protection(of->kn); css_put(&cs->css); + flush_workqueue(cpuset_migrate_mm_wq); return retval ?: nbytes; } @@ -2359,6 +2383,9 @@ void __init cpuset_init_smp(void) top_cpuset.effective_mems = node_states[N_MEMORY]; register_hotmemory_notifier(&cpuset_track_online_nodes_nb); + + cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0); + BUG_ON(!cpuset_migrate_mm_wq); } /** diff --git a/kernel/events/core.c b/kernel/events/core.c index 5946460..6146148 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -64,8 +64,17 @@ static void remote_function(void *data) struct task_struct *p = tfc->p; if (p) { - tfc->ret = -EAGAIN; - if (task_cpu(p) != smp_processor_id() || !task_curr(p)) + /* -EAGAIN */ + if (task_cpu(p) != smp_processor_id()) + return; + + /* + * Now that we're on right CPU with IRQs disabled, we can test + * if we hit the right task without races. 
+ */ + + tfc->ret = -ESRCH; /* No such (running) process */ + if (p != current) return; } @@ -92,13 +101,17 @@ task_function_call(struct task_struct *p, remote_function_f func, void *info) .p = p, .func = func, .info = info, - .ret = -ESRCH, /* No such (running) process */ + .ret = -EAGAIN, }; + int ret; - if (task_curr(p)) - smp_call_function_single(task_cpu(p), remote_function, &data, 1); + do { + ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1); + if (!ret) + ret = data.ret; + } while (ret == -EAGAIN); - return data.ret; + return ret; } /** @@ -169,19 +182,6 @@ static bool is_kernel_event(struct perf_event *event) * rely on ctx->is_active and therefore cannot use event_function_call(). * See perf_install_in_context(). * - * This is because we need a ctx->lock serialized variable (ctx->is_active) - * to reliably determine if a particular task/context is scheduled in. The - * task_curr() use in task_function_call() is racy in that a remote context - * switch is not a single atomic operation. - * - * As is, the situation is 'safe' because we set rq->curr before we do the - * actual context switch. This means that task_curr() will fail early, but - * we'll continue spinning on ctx->is_active until we've passed - * perf_event_task_sched_out(). - * - * Without this ctx->lock serialized variable we could have race where we find - * the task (and hence the context) would not be active while in fact they are. - * * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set. */ @@ -212,7 +212,7 @@ static int event_function(void *info) */ if (ctx->task) { if (ctx->task != current) { - ret = -EAGAIN; + ret = -ESRCH; goto unlock; } @@ -276,10 +276,10 @@ static void event_function_call(struct perf_event *event, event_f func, void *da return; } -again: if (task == TASK_TOMBSTONE) return; +again: if (!task_function_call(task, event_function, &efs)) return; @@ -289,13 +289,15 @@ again: * a concurrent perf_event_context_sched_out(). */ task = ctx->task; - if (task != TASK_TOMBSTONE) { - if (ctx->is_active) { - raw_spin_unlock_irq(&ctx->lock); - goto again; - } - func(event, NULL, ctx, data); + if (task == TASK_TOMBSTONE) { + raw_spin_unlock_irq(&ctx->lock); + return; } + if (ctx->is_active) { + raw_spin_unlock_irq(&ctx->lock); + goto again; + } + func(event, NULL, ctx, data); raw_spin_unlock_irq(&ctx->lock); } @@ -314,6 +316,7 @@ again: enum event_type_t { EVENT_FLEXIBLE = 0x1, EVENT_PINNED = 0x2, + EVENT_TIME = 0x4, EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, }; @@ -321,7 +324,13 @@ enum event_type_t { * perf_sched_events : >0 events exist * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu */ -struct static_key_deferred perf_sched_events __read_mostly; + +static void perf_sched_delayed(struct work_struct *work); +DEFINE_STATIC_KEY_FALSE(perf_sched_events); +static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed); +static DEFINE_MUTEX(perf_sched_mutex); +static atomic_t perf_sched_count; + static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); static DEFINE_PER_CPU(int, perf_sched_cb_usages); @@ -1288,16 +1297,18 @@ static u64 perf_event_time(struct perf_event *event) /* * Update the total_time_enabled and total_time_running fields for a event. - * The caller of this function needs to hold the ctx->lock. 
*/ static void update_event_times(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; u64 run_end; + lockdep_assert_held(&ctx->lock); + if (event->state < PERF_EVENT_STATE_INACTIVE || event->group_leader->state < PERF_EVENT_STATE_INACTIVE) return; + /* * in cgroup mode, time_enabled represents * the time the event was enabled AND active @@ -1645,7 +1656,7 @@ out: static bool is_orphaned_event(struct perf_event *event) { - return event->state == PERF_EVENT_STATE_EXIT; + return event->state == PERF_EVENT_STATE_DEAD; } static inline int pmu_filter_match(struct perf_event *event) @@ -1690,14 +1701,14 @@ event_sched_out(struct perf_event *event, perf_pmu_disable(event->pmu); + event->tstamp_stopped = tstamp; + event->pmu->del(event, 0); + event->oncpu = -1; event->state = PERF_EVENT_STATE_INACTIVE; if (event->pending_disable) { event->pending_disable = 0; event->state = PERF_EVENT_STATE_OFF; } - event->tstamp_stopped = tstamp; - event->pmu->del(event, 0); - event->oncpu = -1; if (!is_software_event(event)) cpuctx->active_oncpu--; @@ -1732,7 +1743,6 @@ group_sched_out(struct perf_event *group_event, } #define DETACH_GROUP 0x01UL -#define DETACH_STATE 0x02UL /* * Cross CPU call to remove a performance event @@ -1752,8 +1762,6 @@ __perf_remove_from_context(struct perf_event *event, if (flags & DETACH_GROUP) perf_group_detach(event); list_del_event(event, ctx); - if (flags & DETACH_STATE) - event->state = PERF_EVENT_STATE_EXIT; if (!ctx->nr_events && ctx->is_active) { ctx->is_active = 0; @@ -2063,14 +2071,27 @@ static void add_event_to_ctx(struct perf_event *event, event->tstamp_stopped = tstamp; } -static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx); +static void ctx_sched_out(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx, + enum event_type_t event_type); static void ctx_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, enum event_type_t event_type, struct task_struct *task); +static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + if (!cpuctx->task_ctx) + return; + + if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) + return; + + ctx_sched_out(ctx, cpuctx, EVENT_ALL); +} + static void perf_event_sched_in(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, struct task_struct *task) @@ -2097,49 +2118,68 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, /* * Cross CPU call to install and enable a performance event * - * Must be called with ctx->mutex held + * Very similar to remote_function() + event_function() but cannot assume that + * things like ctx->is_active and cpuctx->task_ctx are set. */ static int __perf_install_in_context(void *info) { - struct perf_event_context *ctx = info; + struct perf_event *event = info; + struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); struct perf_event_context *task_ctx = cpuctx->task_ctx; + bool activate = true; + int ret = 0; raw_spin_lock(&cpuctx->ctx.lock); if (ctx->task) { raw_spin_lock(&ctx->lock); - /* - * If we hit the 'wrong' task, we've since scheduled and - * everything should be sorted, nothing to do! - */ task_ctx = ctx; - if (ctx->task != current) + + /* If we're on the wrong CPU, try again */ + if (task_cpu(ctx->task) != smp_processor_id()) { + ret = -ESRCH; goto unlock; + } /* - * If task_ctx is set, it had better be to us. 
+ * If we're on the right CPU, see if the task we target is + * current, if not we don't have to activate the ctx, a future + * context switch will do that for us. */ - WARN_ON_ONCE(cpuctx->task_ctx != ctx && cpuctx->task_ctx); + if (ctx->task != current) + activate = false; + else + WARN_ON_ONCE(cpuctx->task_ctx && cpuctx->task_ctx != ctx); + } else if (task_ctx) { raw_spin_lock(&task_ctx->lock); } - ctx_resched(cpuctx, task_ctx); + if (activate) { + ctx_sched_out(ctx, cpuctx, EVENT_TIME); + add_event_to_ctx(event, ctx); + ctx_resched(cpuctx, task_ctx); + } else { + add_event_to_ctx(event, ctx); + } + unlock: perf_ctx_unlock(cpuctx, task_ctx); - return 0; + return ret; } /* - * Attach a performance event to a context + * Attach a performance event to a context. + * + * Very similar to event_function_call, see comment there. */ static void perf_install_in_context(struct perf_event_context *ctx, struct perf_event *event, int cpu) { - struct task_struct *task = NULL; + struct task_struct *task = READ_ONCE(ctx->task); lockdep_assert_held(&ctx->mutex); @@ -2147,40 +2187,46 @@ perf_install_in_context(struct perf_event_context *ctx, if (event->cpu != -1) event->cpu = cpu; + if (!task) { + cpu_function_call(cpu, __perf_install_in_context, event); + return; + } + + /* + * Should not happen, we validate the ctx is still alive before calling. + */ + if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) + return; + /* * Installing events is tricky because we cannot rely on ctx->is_active * to be set in case this is the nr_events 0 -> 1 transition. - * - * So what we do is we add the event to the list here, which will allow - * a future context switch to DTRT and then send a racy IPI. If the IPI - * fails to hit the right task, this means a context switch must have - * happened and that will have taken care of business. */ - raw_spin_lock_irq(&ctx->lock); - task = ctx->task; +again: /* - * Worse, we cannot even rely on the ctx actually existing anymore. If - * between find_get_context() and perf_install_in_context() the task - * went through perf_event_exit_task() its dead and we should not be - * adding new events. + * Cannot use task_function_call() because we need to run on the task's + * CPU regardless of whether its current or not. */ - if (task == TASK_TOMBSTONE) { + if (!cpu_function_call(task_cpu(task), __perf_install_in_context, event)) + return; + + raw_spin_lock_irq(&ctx->lock); + task = ctx->task; + if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) { + /* + * Cannot happen because we already checked above (which also + * cannot happen), and we hold ctx->mutex, which serializes us + * against perf_event_exit_task_context(). + */ raw_spin_unlock_irq(&ctx->lock); return; } - update_context_time(ctx); + raw_spin_unlock_irq(&ctx->lock); /* - * Update cgrp time only if current cgrp matches event->cgrp. - * Must be done before calling add_event_to_ctx(). + * Since !ctx->is_active doesn't mean anything, we must IPI + * unconditionally. 
*/ - update_cgrp_time_from_event(event); - add_event_to_ctx(event, ctx); - raw_spin_unlock_irq(&ctx->lock); - - if (task) - task_function_call(task, __perf_install_in_context, ctx); - else - cpu_function_call(cpu, __perf_install_in_context, ctx); + goto again; } /* @@ -2219,17 +2265,18 @@ static void __perf_event_enable(struct perf_event *event, event->state <= PERF_EVENT_STATE_ERROR) return; - update_context_time(ctx); + if (ctx->is_active) + ctx_sched_out(ctx, cpuctx, EVENT_TIME); + __perf_event_mark_enabled(event); if (!ctx->is_active) return; if (!event_filter_match(event)) { - if (is_cgroup_event(event)) { - perf_cgroup_set_timestamp(current, ctx); // XXX ? + if (is_cgroup_event(event)) perf_cgroup_defer_enabled(event); - } + ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); return; } @@ -2237,8 +2284,10 @@ static void __perf_event_enable(struct perf_event *event, * If the event is in a group and isn't the group leader, * then don't put it on unless the group is on. */ - if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) + if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) { + ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); return; + } task_ctx = cpuctx->task_ctx; if (ctx->task) @@ -2344,24 +2393,33 @@ static void ctx_sched_out(struct perf_event_context *ctx, } ctx->is_active &= ~event_type; + if (!(ctx->is_active & EVENT_ALL)) + ctx->is_active = 0; + if (ctx->task) { WARN_ON_ONCE(cpuctx->task_ctx != ctx); if (!ctx->is_active) cpuctx->task_ctx = NULL; } - update_context_time(ctx); - update_cgrp_time_from_cpuctx(cpuctx); - if (!ctx->nr_active) + is_active ^= ctx->is_active; /* changed bits */ + + if (is_active & EVENT_TIME) { + /* update (and stop) ctx time */ + update_context_time(ctx); + update_cgrp_time_from_cpuctx(cpuctx); + } + + if (!ctx->nr_active || !(is_active & EVENT_ALL)) return; perf_pmu_disable(ctx->pmu); - if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) { + if (is_active & EVENT_PINNED) { list_for_each_entry(event, &ctx->pinned_groups, group_entry) group_sched_out(event, cpuctx, ctx); } - if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) { + if (is_active & EVENT_FLEXIBLE) { list_for_each_entry(event, &ctx->flexible_groups, group_entry) group_sched_out(event, cpuctx, ctx); } @@ -2641,18 +2699,6 @@ void __perf_event_task_sched_out(struct task_struct *task, perf_cgroup_sched_out(task, next); } -static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - if (!cpuctx->task_ctx) - return; - - if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) - return; - - ctx_sched_out(ctx, cpuctx, EVENT_ALL); -} - /* * Called with IRQs disabled */ @@ -2735,7 +2781,7 @@ ctx_sched_in(struct perf_event_context *ctx, if (likely(!ctx->nr_events)) return; - ctx->is_active |= event_type; + ctx->is_active |= (event_type | EVENT_TIME); if (ctx->task) { if (!is_active) cpuctx->task_ctx = ctx; @@ -2743,18 +2789,24 @@ ctx_sched_in(struct perf_event_context *ctx, WARN_ON_ONCE(cpuctx->task_ctx != ctx); } - now = perf_clock(); - ctx->timestamp = now; - perf_cgroup_set_timestamp(task, ctx); + is_active ^= ctx->is_active; /* changed bits */ + + if (is_active & EVENT_TIME) { + /* start ctx time */ + now = perf_clock(); + ctx->timestamp = now; + perf_cgroup_set_timestamp(task, ctx); + } + /* * First go through the list and put on any pinned groups * in order to give them the best chance of going on. 
*/ - if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) + if (is_active & EVENT_PINNED) ctx_pinned_sched_in(ctx, cpuctx); /* Then walk through the lower prio flexible groups */ - if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) + if (is_active & EVENT_FLEXIBLE) ctx_flexible_sched_in(ctx, cpuctx); } @@ -3120,6 +3172,7 @@ static void perf_event_enable_on_exec(int ctxn) cpuctx = __get_cpu_context(ctx); perf_ctx_lock(cpuctx, ctx); + ctx_sched_out(ctx, cpuctx, EVENT_TIME); list_for_each_entry(event, &ctx->event_list, event_entry) enabled |= event_enable_on_exec(event, ctx); @@ -3537,12 +3590,22 @@ static void unaccount_event(struct perf_event *event) if (has_branch_stack(event)) dec = true; - if (dec) - static_key_slow_dec_deferred(&perf_sched_events); + if (dec) { + if (!atomic_add_unless(&perf_sched_count, -1, 1)) + schedule_delayed_work(&perf_sched_work, HZ); + } unaccount_event_cpu(event, event->cpu); } +static void perf_sched_delayed(struct work_struct *work) +{ + mutex_lock(&perf_sched_mutex); + if (atomic_dec_and_test(&perf_sched_count)) + static_branch_disable(&perf_sched_events); + mutex_unlock(&perf_sched_mutex); +} + /* * The following implement mutual exclusion of events on "exclusive" pmus * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled @@ -3752,30 +3815,42 @@ static void put_event(struct perf_event *event) */ int perf_event_release_kernel(struct perf_event *event) { - struct perf_event_context *ctx; + struct perf_event_context *ctx = event->ctx; struct perf_event *child, *tmp; + /* + * If we got here through err_file: fput(event_file); we will not have + * attached to a context yet. + */ + if (!ctx) { + WARN_ON_ONCE(event->attach_state & + (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP)); + goto no_ctx; + } + if (!is_kernel_event(event)) perf_remove_from_owner(event); ctx = perf_event_ctx_lock(event); WARN_ON_ONCE(ctx->parent_ctx); - perf_remove_from_context(event, DETACH_GROUP | DETACH_STATE); - perf_event_ctx_unlock(event, ctx); + perf_remove_from_context(event, DETACH_GROUP); + raw_spin_lock_irq(&ctx->lock); /* - * At this point we must have event->state == PERF_EVENT_STATE_EXIT, - * either from the above perf_remove_from_context() or through - * perf_event_exit_event(). + * Mark this even as STATE_DEAD, there is no external reference to it + * anymore. * - * Therefore, anybody acquiring event->child_mutex after the below - * loop _must_ also see this, most importantly inherit_event() which - * will avoid placing more children on the list. + * Anybody acquiring event->child_mutex after the below loop _must_ + * also see this, most importantly inherit_event() which will avoid + * placing more children on the list. * * Thus this guarantees that we will in fact observe and kill _ALL_ * child events. 
*/ - WARN_ON_ONCE(event->state != PERF_EVENT_STATE_EXIT); + event->state = PERF_EVENT_STATE_DEAD; + raw_spin_unlock_irq(&ctx->lock); + + perf_event_ctx_unlock(event, ctx); again: mutex_lock(&event->child_mutex); @@ -3830,8 +3905,8 @@ again: } mutex_unlock(&event->child_mutex); - /* Must be the last reference */ - put_event(event); +no_ctx: + put_event(event); /* Must be the 'last' reference */ return 0; } EXPORT_SYMBOL_GPL(perf_event_release_kernel); @@ -3988,7 +4063,7 @@ static bool is_event_hup(struct perf_event *event) { bool no_children; - if (event->state != PERF_EVENT_STATE_EXIT) + if (event->state > PERF_EVENT_STATE_EXIT) return false; mutex_lock(&event->child_mutex); @@ -7769,8 +7844,28 @@ static void account_event(struct perf_event *event) if (is_cgroup_event(event)) inc = true; - if (inc) - static_key_slow_inc(&perf_sched_events.key); + if (inc) { + if (atomic_inc_not_zero(&perf_sched_count)) + goto enabled; + + mutex_lock(&perf_sched_mutex); + if (!atomic_read(&perf_sched_count)) { + static_branch_enable(&perf_sched_events); + /* + * Guarantee that all CPUs observe they key change and + * call the perf scheduling hooks before proceeding to + * install events that need them. + */ + synchronize_sched(); + } + /* + * Now that we have waited for the sync_sched(), allow further + * increments to by-pass the mutex. + */ + atomic_inc(&perf_sched_count); + mutex_unlock(&perf_sched_mutex); + } +enabled: account_event_cpu(event, event->cpu); } @@ -8389,10 +8484,19 @@ SYSCALL_DEFINE5(perf_event_open, if (move_group) { gctx = group_leader->ctx; mutex_lock_double(&gctx->mutex, &ctx->mutex); + if (gctx->task == TASK_TOMBSTONE) { + err = -ESRCH; + goto err_locked; + } } else { mutex_lock(&ctx->mutex); } + if (ctx->task == TASK_TOMBSTONE) { + err = -ESRCH; + goto err_locked; + } + if (!perf_event_validate_size(event)) { err = -E2BIG; goto err_locked; @@ -8509,7 +8613,12 @@ err_context: perf_unpin_context(ctx); put_ctx(ctx); err_alloc: - free_event(event); + /* + * If event_file is set, the fput() above will have called ->release() + * and that will take care of freeing the event. 
+ */ + if (!event_file) + free_event(event); err_cpus: put_online_cpus(); err_task: @@ -8563,12 +8672,14 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); + if (ctx->task == TASK_TOMBSTONE) { + err = -ESRCH; + goto err_unlock; + } + if (!exclusive_event_installable(event, ctx)) { - mutex_unlock(&ctx->mutex); - perf_unpin_context(ctx); - put_ctx(ctx); err = -EBUSY; - goto err_free; + goto err_unlock; } perf_install_in_context(ctx, event, cpu); @@ -8577,6 +8688,10 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, return event; +err_unlock: + mutex_unlock(&ctx->mutex); + perf_unpin_context(ctx); + put_ctx(ctx); err_free: free_event(event); err: @@ -8695,7 +8810,7 @@ perf_event_exit_event(struct perf_event *child_event, if (parent_event) perf_group_detach(child_event); list_del_event(child_event, child_ctx); - child_event->state = PERF_EVENT_STATE_EXIT; /* see perf_event_release_kernel() */ + child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */ raw_spin_unlock_irq(&child_ctx->lock); /* @@ -9206,7 +9321,7 @@ static void perf_event_init_cpu(int cpu) struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); mutex_lock(&swhash->hlist_mutex); - if (swhash->hlist_refcount > 0) { + if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) { struct swevent_hlist *hlist; hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu)); @@ -9282,11 +9397,9 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: - case CPU_DOWN_FAILED: perf_event_init_cpu(cpu); break; - case CPU_UP_CANCELED: case CPU_DOWN_PREPARE: perf_event_exit_cpu(cpu); break; @@ -9315,9 +9428,6 @@ void __init perf_event_init(void) ret = init_hw_breakpoint(); WARN(ret, "hw_breakpoint initialization failed with: %d", ret); - /* do not patch jump label more than once per second */ - jump_label_rate_limit(&perf_sched_events, HZ); - /* * Build time assertion that we keep the data_head at the intended * location. IOW, validation we got the __reserved[] size right. 
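The account_event()/unaccount_event() hunks above replace the deferred static key with a manually managed user count: the first user enables the static branch under perf_sched_mutex (waiting for synchronize_sched() before publishing the count), later users take an atomic fast path, and the last user schedules delayed work that disables the branch again. A minimal userspace C analogue of that enable/disable pattern is sketched below; the names and the simplified, synchronous disable are illustrative only, not part of the patch.

/*
 * Userspace analogue (not kernel code) of the refcounted enable/disable
 * pattern adopted for perf_sched_events above: lock-free fast path once
 * enabled, mutex-serialized slow path for the first enable and last disable.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int sched_count;                    /* number of active users */
static pthread_mutex_t sched_mutex = PTHREAD_MUTEX_INITIALIZER;
static atomic_bool sched_enabled;                 /* stand-in for the static branch */

static void account(void)
{
	int old = atomic_load(&sched_count);

	/* Fast path: like atomic_inc_not_zero(), bump the count if already enabled. */
	while (old > 0) {
		if (atomic_compare_exchange_weak(&sched_count, &old, old + 1))
			return;
	}

	/* Slow path: first user enables the feature before publishing the count.
	 * The kernel additionally waits for synchronize_sched() here. */
	pthread_mutex_lock(&sched_mutex);
	if (atomic_load(&sched_count) == 0)
		atomic_store(&sched_enabled, true);   /* static_branch_enable() analogue */
	atomic_fetch_add(&sched_count, 1);
	pthread_mutex_unlock(&sched_mutex);
}

static void unaccount(void)
{
	/* The patch defers the final disable via delayed work; this sketch does it
	 * immediately under the mutex for brevity. */
	pthread_mutex_lock(&sched_mutex);
	if (atomic_fetch_sub(&sched_count, 1) == 1)
		atomic_store(&sched_enabled, false);  /* static_branch_disable() analogue */
	pthread_mutex_unlock(&sched_mutex);
}

int main(void)
{
	account();
	printf("enabled: %d\n", atomic_load(&sched_enabled));  /* prints 1 */
	unaccount();
	printf("enabled: %d\n", atomic_load(&sched_enabled));  /* prints 0 */
	return 0;
}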
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 60ace56..716547f 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -292,7 +292,7 @@ LIST_HEAD(all_lock_classes); #define __classhashfn(key) hash_long((unsigned long)key, CLASSHASH_BITS) #define classhashentry(key) (classhash_table + __classhashfn((key))) -static struct list_head classhash_table[CLASSHASH_SIZE]; +static struct hlist_head classhash_table[CLASSHASH_SIZE]; /* * We put the lock dependency chains into a hash-table as well, to cache @@ -303,7 +303,7 @@ static struct list_head classhash_table[CLASSHASH_SIZE]; #define __chainhashfn(chain) hash_long(chain, CHAINHASH_BITS) #define chainhashentry(chain) (chainhash_table + __chainhashfn((chain))) -static struct list_head chainhash_table[CHAINHASH_SIZE]; +static struct hlist_head chainhash_table[CHAINHASH_SIZE]; /* * The hash key of the lock dependency chains is a hash itself too: @@ -666,7 +666,7 @@ static inline struct lock_class * look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) { struct lockdep_subclass_key *key; - struct list_head *hash_head; + struct hlist_head *hash_head; struct lock_class *class; #ifdef CONFIG_DEBUG_LOCKDEP @@ -719,7 +719,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return NULL; - list_for_each_entry_rcu(class, hash_head, hash_entry) { + hlist_for_each_entry_rcu(class, hash_head, hash_entry) { if (class->key == key) { /* * Huh! same key, different name? Did someone trample @@ -742,7 +742,7 @@ static inline struct lock_class * register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) { struct lockdep_subclass_key *key; - struct list_head *hash_head; + struct hlist_head *hash_head; struct lock_class *class; DEBUG_LOCKS_WARN_ON(!irqs_disabled()); @@ -774,7 +774,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) * We have to do the hash-walk again, to avoid races * with another CPU: */ - list_for_each_entry_rcu(class, hash_head, hash_entry) { + hlist_for_each_entry_rcu(class, hash_head, hash_entry) { if (class->key == key) goto out_unlock_set; } @@ -805,7 +805,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) * We use RCU's safe list-add method to make * parallel walking of the hash-list safe: */ - list_add_tail_rcu(&class->hash_entry, hash_head); + hlist_add_head_rcu(&class->hash_entry, hash_head); /* * Add it to the global list of classes: */ @@ -1822,7 +1822,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, */ static int check_prev_add(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, int distance, int trylock_loop) + struct held_lock *next, int distance, int *stack_saved) { struct lock_list *entry; int ret; @@ -1883,8 +1883,11 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, } } - if (!trylock_loop && !save_trace(&trace)) - return 0; + if (!*stack_saved) { + if (!save_trace(&trace)) + return 0; + *stack_saved = 1; + } /* * Ok, all validations passed, add the new lock @@ -1907,6 +1910,8 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * Debugging printouts: */ if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) { + /* We drop graph lock, so another thread can overwrite trace. 
*/ + *stack_saved = 0; graph_unlock(); printk("\n new dependency: "); print_lock_name(hlock_class(prev)); @@ -1929,7 +1934,7 @@ static int check_prevs_add(struct task_struct *curr, struct held_lock *next) { int depth = curr->lockdep_depth; - int trylock_loop = 0; + int stack_saved = 0; struct held_lock *hlock; /* @@ -1956,7 +1961,7 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) */ if (hlock->read != 2 && hlock->check) { if (!check_prev_add(curr, hlock, next, - distance, trylock_loop)) + distance, &stack_saved)) return 0; /* * Stop after the first non-trylock entry, @@ -1979,7 +1984,6 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) if (curr->held_locks[depth].irq_context != curr->held_locks[depth-1].irq_context) break; - trylock_loop = 1; } return 1; out_bug: @@ -2017,7 +2021,7 @@ static inline int lookup_chain_cache(struct task_struct *curr, u64 chain_key) { struct lock_class *class = hlock_class(hlock); - struct list_head *hash_head = chainhashentry(chain_key); + struct hlist_head *hash_head = chainhashentry(chain_key); struct lock_chain *chain; struct held_lock *hlock_curr; int i, j; @@ -2033,7 +2037,7 @@ static inline int lookup_chain_cache(struct task_struct *curr, * We can walk it lock-free, because entries only get added * to the hash: */ - list_for_each_entry_rcu(chain, hash_head, entry) { + hlist_for_each_entry_rcu(chain, hash_head, entry) { if (chain->chain_key == chain_key) { cache_hit: debug_atomic_inc(chain_lookup_hits); @@ -2057,7 +2061,7 @@ cache_hit: /* * We have to walk the chain again locked - to avoid duplicates: */ - list_for_each_entry(chain, hash_head, entry) { + hlist_for_each_entry(chain, hash_head, entry) { if (chain->chain_key == chain_key) { graph_unlock(); goto cache_hit; @@ -2091,7 +2095,7 @@ cache_hit: } chain_hlocks[chain->base + j] = class - lock_classes; } - list_add_tail_rcu(&chain->entry, hash_head); + hlist_add_head_rcu(&chain->entry, hash_head); debug_atomic_inc(chain_lookup_misses); inc_chains(); @@ -3875,7 +3879,7 @@ void lockdep_reset(void) nr_process_chains = 0; debug_locks = 1; for (i = 0; i < CHAINHASH_SIZE; i++) - INIT_LIST_HEAD(chainhash_table + i); + INIT_HLIST_HEAD(chainhash_table + i); raw_local_irq_restore(flags); } @@ -3894,7 +3898,7 @@ static void zap_class(struct lock_class *class) /* * Unhash the class and remove it from the all_lock_classes list: */ - list_del_rcu(&class->hash_entry); + hlist_del_rcu(&class->hash_entry); list_del_rcu(&class->lock_entry); RCU_INIT_POINTER(class->key, NULL); @@ -3917,7 +3921,7 @@ static inline int within(const void *addr, void *start, unsigned long size) void lockdep_free_key_range(void *start, unsigned long size) { struct lock_class *class; - struct list_head *head; + struct hlist_head *head; unsigned long flags; int i; int locked; @@ -3930,9 +3934,7 @@ void lockdep_free_key_range(void *start, unsigned long size) */ for (i = 0; i < CLASSHASH_SIZE; i++) { head = classhash_table + i; - if (list_empty(head)) - continue; - list_for_each_entry_rcu(class, head, hash_entry) { + hlist_for_each_entry_rcu(class, head, hash_entry) { if (within(class->key, start, size)) zap_class(class); else if (within(class->name, start, size)) @@ -3962,7 +3964,7 @@ void lockdep_free_key_range(void *start, unsigned long size) void lockdep_reset_lock(struct lockdep_map *lock) { struct lock_class *class; - struct list_head *head; + struct hlist_head *head; unsigned long flags; int i, j; int locked; @@ -3987,9 +3989,7 @@ void lockdep_reset_lock(struct lockdep_map *lock) locked = graph_lock(); 
for (i = 0; i < CLASSHASH_SIZE; i++) { head = classhash_table + i; - if (list_empty(head)) - continue; - list_for_each_entry_rcu(class, head, hash_entry) { + hlist_for_each_entry_rcu(class, head, hash_entry) { int match = 0; for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++) @@ -4027,10 +4027,10 @@ void lockdep_init(void) return; for (i = 0; i < CLASSHASH_SIZE; i++) - INIT_LIST_HEAD(classhash_table + i); + INIT_HLIST_HEAD(classhash_table + i); for (i = 0; i < CHAINHASH_SIZE; i++) - INIT_LIST_HEAD(chainhash_table + i); + INIT_HLIST_HEAD(chainhash_table + i); lockdep_initialized = 1; } diff --git a/kernel/memremap.c b/kernel/memremap.c index 70ee377..b981a7b 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -114,7 +114,7 @@ EXPORT_SYMBOL(memunmap); static void devm_memremap_release(struct device *dev, void *res) { - memunmap(res); + memunmap(*(void **)res); } static int devm_memremap_match(struct device *dev, void *res, void *match_data) @@ -136,8 +136,10 @@ void *devm_memremap(struct device *dev, resource_size_t offset, if (addr) { *ptr = addr; devres_add(dev, ptr); - } else + } else { devres_free(ptr); + return ERR_PTR(-ENXIO); + } return addr; } @@ -150,7 +152,7 @@ void devm_memunmap(struct device *dev, void *addr) } EXPORT_SYMBOL(devm_memunmap); -pfn_t phys_to_pfn_t(phys_addr_t addr, unsigned long flags) +pfn_t phys_to_pfn_t(phys_addr_t addr, u64 flags) { return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags); } diff --git a/kernel/module.c b/kernel/module.c index 8358f46..794ebe8 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -303,6 +303,9 @@ struct load_info { struct _ddebug *debug; unsigned int num_debug; bool sig_ok; +#ifdef CONFIG_KALLSYMS + unsigned long mod_kallsyms_init_off; +#endif struct { unsigned int sym, str, mod, vers, info, pcpu; } index; @@ -981,6 +984,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, mod->exit(); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); + ftrace_release_mod(mod); + async_synchronize_full(); /* Store the name of the last unloaded module for diagnostic purposes */ @@ -2480,10 +2485,21 @@ static void layout_symtab(struct module *mod, struct load_info *info) strsect->sh_flags |= SHF_ALLOC; strsect->sh_entsize = get_offset(mod, &mod->init_layout.size, strsect, info->index.str) | INIT_OFFSET_MASK; - mod->init_layout.size = debug_align(mod->init_layout.size); pr_debug("\t%s\n", info->secstrings + strsect->sh_name); + + /* We'll tack temporary mod_kallsyms on the end. */ + mod->init_layout.size = ALIGN(mod->init_layout.size, + __alignof__(struct mod_kallsyms)); + info->mod_kallsyms_init_off = mod->init_layout.size; + mod->init_layout.size += sizeof(struct mod_kallsyms); + mod->init_layout.size = debug_align(mod->init_layout.size); } +/* + * We use the full symtab and strtab which layout_symtab arranged to + * be appended to the init section. Later we switch to the cut-down + * core-only ones. + */ static void add_kallsyms(struct module *mod, const struct load_info *info) { unsigned int i, ndst; @@ -2492,29 +2508,34 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) char *s; Elf_Shdr *symsec = &info->sechdrs[info->index.sym]; - mod->symtab = (void *)symsec->sh_addr; - mod->num_symtab = symsec->sh_size / sizeof(Elf_Sym); + /* Set up to point into init section. 
*/ + mod->kallsyms = mod->init_layout.base + info->mod_kallsyms_init_off; + + mod->kallsyms->symtab = (void *)symsec->sh_addr; + mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym); /* Make sure we get permanent strtab: don't use info->strtab. */ - mod->strtab = (void *)info->sechdrs[info->index.str].sh_addr; + mod->kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr; /* Set types up while we still have access to sections. */ - for (i = 0; i < mod->num_symtab; i++) - mod->symtab[i].st_info = elf_type(&mod->symtab[i], info); - - mod->core_symtab = dst = mod->core_layout.base + info->symoffs; - mod->core_strtab = s = mod->core_layout.base + info->stroffs; - src = mod->symtab; - for (ndst = i = 0; i < mod->num_symtab; i++) { + for (i = 0; i < mod->kallsyms->num_symtab; i++) + mod->kallsyms->symtab[i].st_info + = elf_type(&mod->kallsyms->symtab[i], info); + + /* Now populate the cut down core kallsyms for after init. */ + mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs; + mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs; + src = mod->kallsyms->symtab; + for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) { if (i == 0 || is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum, info->index.pcpu)) { dst[ndst] = src[i]; - dst[ndst++].st_name = s - mod->core_strtab; - s += strlcpy(s, &mod->strtab[src[i].st_name], + dst[ndst++].st_name = s - mod->core_kallsyms.strtab; + s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name], KSYM_NAME_LEN) + 1; } } - mod->core_num_syms = ndst; + mod->core_kallsyms.num_symtab = ndst; } #else static inline void layout_symtab(struct module *mod, struct load_info *info) @@ -3263,9 +3284,8 @@ static noinline int do_init_module(struct module *mod) module_put(mod); trim_init_extable(mod); #ifdef CONFIG_KALLSYMS - mod->num_symtab = mod->core_num_syms; - mod->symtab = mod->core_symtab; - mod->strtab = mod->core_strtab; + /* Switch to core kallsyms now init is done: kallsyms may be walking! */ + rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms); #endif mod_tree_remove_init(mod); disable_ro_nx(&mod->init_layout); @@ -3295,6 +3315,7 @@ fail: module_put(mod); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); + ftrace_release_mod(mod); free_module(mod); wake_up_all(&module_wq); return ret; @@ -3371,6 +3392,7 @@ static int complete_formation(struct module *mod, struct load_info *info) mod->state = MODULE_STATE_COMING; mutex_unlock(&module_mutex); + ftrace_module_enable(mod); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); return 0; @@ -3496,7 +3518,7 @@ static int load_module(struct load_info *info, const char __user *uargs, /* Module is ready to execute: parsing args may do that. 
*/ after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, - -32768, 32767, NULL, + -32768, 32767, mod, unknown_module_param_cb); if (IS_ERR(after_dashes)) { err = PTR_ERR(after_dashes); @@ -3627,6 +3649,11 @@ static inline int is_arm_mapping_symbol(const char *str) && (str[2] == '\0' || str[2] == '.'); } +static const char *symname(struct mod_kallsyms *kallsyms, unsigned int symnum) +{ + return kallsyms->strtab + kallsyms->symtab[symnum].st_name; +} + static const char *get_ksymbol(struct module *mod, unsigned long addr, unsigned long *size, @@ -3634,6 +3661,7 @@ static const char *get_ksymbol(struct module *mod, { unsigned int i, best = 0; unsigned long nextval; + struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); /* At worse, next value is at end of module */ if (within_module_init(addr, mod)) @@ -3643,32 +3671,32 @@ static const char *get_ksymbol(struct module *mod, /* Scan for closest preceding symbol, and next symbol. (ELF starts real symbols at 1). */ - for (i = 1; i < mod->num_symtab; i++) { - if (mod->symtab[i].st_shndx == SHN_UNDEF) + for (i = 1; i < kallsyms->num_symtab; i++) { + if (kallsyms->symtab[i].st_shndx == SHN_UNDEF) continue; /* We ignore unnamed symbols: they're uninformative * and inserted at a whim. */ - if (mod->symtab[i].st_value <= addr - && mod->symtab[i].st_value > mod->symtab[best].st_value - && *(mod->strtab + mod->symtab[i].st_name) != '\0' - && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name)) + if (*symname(kallsyms, i) == '\0' + || is_arm_mapping_symbol(symname(kallsyms, i))) + continue; + + if (kallsyms->symtab[i].st_value <= addr + && kallsyms->symtab[i].st_value > kallsyms->symtab[best].st_value) best = i; - if (mod->symtab[i].st_value > addr - && mod->symtab[i].st_value < nextval - && *(mod->strtab + mod->symtab[i].st_name) != '\0' - && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name)) - nextval = mod->symtab[i].st_value; + if (kallsyms->symtab[i].st_value > addr + && kallsyms->symtab[i].st_value < nextval) + nextval = kallsyms->symtab[i].st_value; } if (!best) return NULL; if (size) - *size = nextval - mod->symtab[best].st_value; + *size = nextval - kallsyms->symtab[best].st_value; if (offset) - *offset = addr - mod->symtab[best].st_value; - return mod->strtab + mod->symtab[best].st_name; + *offset = addr - kallsyms->symtab[best].st_value; + return symname(kallsyms, best); } /* For kallsyms to ask for address resolution. NULL means not found. 
Careful @@ -3758,19 +3786,21 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { + struct mod_kallsyms *kallsyms; + if (mod->state == MODULE_STATE_UNFORMED) continue; - if (symnum < mod->num_symtab) { - *value = mod->symtab[symnum].st_value; - *type = mod->symtab[symnum].st_info; - strlcpy(name, mod->strtab + mod->symtab[symnum].st_name, - KSYM_NAME_LEN); + kallsyms = rcu_dereference_sched(mod->kallsyms); + if (symnum < kallsyms->num_symtab) { + *value = kallsyms->symtab[symnum].st_value; + *type = kallsyms->symtab[symnum].st_info; + strlcpy(name, symname(kallsyms, symnum), KSYM_NAME_LEN); strlcpy(module_name, mod->name, MODULE_NAME_LEN); *exported = is_exported(name, *value, mod); preempt_enable(); return 0; } - symnum -= mod->num_symtab; + symnum -= kallsyms->num_symtab; } preempt_enable(); return -ERANGE; @@ -3779,11 +3809,12 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, static unsigned long mod_find_symname(struct module *mod, const char *name) { unsigned int i; + struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); - for (i = 0; i < mod->num_symtab; i++) - if (strcmp(name, mod->strtab+mod->symtab[i].st_name) == 0 && - mod->symtab[i].st_info != 'U') - return mod->symtab[i].st_value; + for (i = 0; i < kallsyms->num_symtab; i++) + if (strcmp(name, symname(kallsyms, i)) == 0 && + kallsyms->symtab[i].st_info != 'U') + return kallsyms->symtab[i].st_value; return 0; } @@ -3822,11 +3853,14 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, module_assert_mutex(); list_for_each_entry(mod, &modules, list) { + /* We hold module_mutex: no need for rcu_dereference_sched */ + struct mod_kallsyms *kallsyms = mod->kallsyms; + if (mod->state == MODULE_STATE_UNFORMED) continue; - for (i = 0; i < mod->num_symtab; i++) { - ret = fn(data, mod->strtab + mod->symtab[i].st_name, - mod, mod->symtab[i].st_value); + for (i = 0; i < kallsyms->num_symtab; i++) { + ret = fn(data, symname(kallsyms, i), + mod, kallsyms->symtab[i].st_value); if (ret != 0) return ret; } diff --git a/kernel/resource.c b/kernel/resource.c index 09c0597..3669d1b 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1083,9 +1083,10 @@ struct resource * __request_region(struct resource *parent, if (!conflict) break; if (conflict != parent) { - parent = conflict; - if (!(conflict->flags & IORESOURCE_BUSY)) + if (!(conflict->flags & IORESOURCE_BUSY)) { + parent = conflict; continue; + } } if (conflict->flags & flags & IORESOURCE_MUXED) { add_wait_queue(&muxed_resource_wait, &wait); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 87ca0be..291552b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7930,11 +7930,9 @@ void sched_destroy_group(struct task_group *tg) void sched_offline_group(struct task_group *tg) { unsigned long flags; - int i; /* end participation in shares distribution */ - for_each_possible_cpu(i) - unregister_fair_sched_group(tg, i); + unregister_fair_sched_group(tg); spin_lock_irqsave(&task_group_lock, flags); list_del_rcu(&tg->list); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index cd64c97..04a569c 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -399,6 +399,9 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, dl_se->runtime = pi_se->dl_runtime; } + if (dl_se->dl_yielded && dl_se->runtime > 0) + dl_se->runtime = 0; + /* * We keep moving the deadline away until we get some * 
available runtime for the entity. This ensures correct @@ -420,7 +423,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, * entity. */ if (dl_time_before(dl_se->deadline, rq_clock(rq))) { - printk_deferred_once("sched: DL replenish lagged to much\n"); + printk_deferred_once("sched: DL replenish lagged too much\n"); dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; dl_se->runtime = pi_se->dl_runtime; } @@ -735,8 +738,11 @@ static void update_curr_dl(struct rq *rq) * approach need further study. */ delta_exec = rq_clock_task(rq) - curr->se.exec_start; - if (unlikely((s64)delta_exec <= 0)) + if (unlikely((s64)delta_exec <= 0)) { + if (unlikely(dl_se->dl_yielded)) + goto throttle; return; + } schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); @@ -749,8 +755,10 @@ static void update_curr_dl(struct rq *rq) sched_rt_avg_update(rq, delta_exec); - dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec; - if (dl_runtime_exceeded(dl_se)) { + dl_se->runtime -= delta_exec; + +throttle: + if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) { dl_se->dl_throttled = 1; __dequeue_task_dl(rq, curr, 0); if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr))) @@ -994,18 +1002,14 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) */ static void yield_task_dl(struct rq *rq) { - struct task_struct *p = rq->curr; - /* * We make the task go to sleep until its current deadline by * forcing its runtime to zero. This way, update_curr_dl() stops * it and the bandwidth timer will wake it up and will give it * new scheduling parameters (thanks to dl_yielded=1). */ - if (p->dl.runtime > 0) { - rq->curr->dl.dl_yielded = 1; - p->dl.runtime = 0; - } + rq->curr->dl.dl_yielded = 1; + update_rq_clock(rq); update_curr_dl(rq); /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7ce24a4..5641042 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8284,11 +8284,8 @@ void free_fair_sched_group(struct task_group *tg) for_each_possible_cpu(i) { if (tg->cfs_rq) kfree(tg->cfs_rq[i]); - if (tg->se) { - if (tg->se[i]) - remove_entity_load_avg(tg->se[i]); + if (tg->se) kfree(tg->se[i]); - } } kfree(tg->cfs_rq); @@ -8336,21 +8333,29 @@ err: return 0; } -void unregister_fair_sched_group(struct task_group *tg, int cpu) +void unregister_fair_sched_group(struct task_group *tg) { - struct rq *rq = cpu_rq(cpu); unsigned long flags; + struct rq *rq; + int cpu; - /* - * Only empty task groups can be destroyed; so we can speculatively - * check on_list without danger of it being re-added. - */ - if (!tg->cfs_rq[cpu]->on_list) - return; + for_each_possible_cpu(cpu) { + if (tg->se[cpu]) + remove_entity_load_avg(tg->se[cpu]); - raw_spin_lock_irqsave(&rq->lock, flags); - list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); - raw_spin_unlock_irqrestore(&rq->lock, flags); + /* + * Only empty task groups can be destroyed; so we can speculatively + * check on_list without danger of it being re-added. 
+ */ + if (!tg->cfs_rq[cpu]->on_list) + continue; + + rq = cpu_rq(cpu); + + raw_spin_lock_irqsave(&rq->lock, flags); + list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); + raw_spin_unlock_irqrestore(&rq->lock, flags); + } } void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, @@ -8432,7 +8437,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) return 1; } -void unregister_fair_sched_group(struct task_group *tg, int cpu) { } +void unregister_fair_sched_group(struct task_group *tg) { } #endif /* CONFIG_FAIR_GROUP_SCHED */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1d58387..91f0a77 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -313,7 +313,7 @@ extern int tg_nop(struct task_group *tg, void *data); extern void free_fair_sched_group(struct task_group *tg); extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); -extern void unregister_fair_sched_group(struct task_group *tg, int cpu); +extern void unregister_fair_sched_group(struct task_group *tg); extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, struct sched_entity *se, int cpu, struct sched_entity *parent); diff --git a/kernel/signal.c b/kernel/signal.c index f3f1f7a..0508544 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3508,8 +3508,10 @@ static int sigsuspend(sigset_t *set) current->saved_sigmask = current->blocked; set_current_blocked(set); - __set_current_state(TASK_INTERRUPTIBLE); - schedule(); + while (!signal_pending(current)) { + __set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } set_restore_sigmask(); return -ERESTARTNOHAND; } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index eca592f..57a6eea 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4961,7 +4961,7 @@ void ftrace_release_mod(struct module *mod) mutex_unlock(&ftrace_lock); } -static void ftrace_module_enable(struct module *mod) +void ftrace_module_enable(struct module *mod) { struct dyn_ftrace *rec; struct ftrace_page *pg; @@ -5038,38 +5038,8 @@ void ftrace_module_init(struct module *mod) ftrace_process_locs(mod, mod->ftrace_callsites, mod->ftrace_callsites + mod->num_ftrace_callsites); } - -static int ftrace_module_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - struct module *mod = data; - - switch (val) { - case MODULE_STATE_COMING: - ftrace_module_enable(mod); - break; - case MODULE_STATE_GOING: - ftrace_release_mod(mod); - break; - default: - break; - } - - return 0; -} -#else -static int ftrace_module_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - return 0; -} #endif /* CONFIG_MODULES */ -struct notifier_block ftrace_module_nb = { - .notifier_call = ftrace_module_notify, - .priority = INT_MIN, /* Run after anything that can remove kprobes */ -}; - void __init ftrace_init(void) { extern unsigned long __start_mcount_loc[]; @@ -5098,10 +5068,6 @@ void __init ftrace_init(void) __start_mcount_loc, __stop_mcount_loc); - ret = register_module_notifier(&ftrace_module_nb); - if (ret) - pr_warning("Failed to register trace ftrace module exit notifier\n"); - set_ftrace_early_filters(); return; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f333e57..ab09829 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -869,7 +869,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos) * The ftrace subsystem is for showing formats only. * They can not be enabled or disabled via the event files. 
*/ - if (call->class && call->class->reg) + if (call->class && call->class->reg && + !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) return file; } diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 202df6c..2a1abba 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -156,7 +156,11 @@ check_stack(unsigned long ip, unsigned long *stack) for (; p < top && i < stack_trace_max.nr_entries; p++) { if (stack_dump_trace[i] == ULONG_MAX) break; - if (*p == stack_dump_trace[i]) { + /* + * The READ_ONCE_NOCHECK is used to let KASAN know that + * this is not a stack-out-of-bounds error. + */ + if ((READ_ONCE_NOCHECK(*p)) == stack_dump_trace[i]) { stack_dump_trace[x] = stack_dump_trace[i++]; this_size = stack_trace_index[x++] = (top - p) * sizeof(unsigned long); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 61a0264..7ff5dc7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -301,7 +301,23 @@ static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ static LIST_HEAD(workqueues); /* PR: list of all workqueues */ static bool workqueue_freezing; /* PL: have wqs started freezing? */ -static cpumask_var_t wq_unbound_cpumask; /* PL: low level cpumask for all unbound wqs */ +/* PL: allowable cpus for unbound wqs and work items */ +static cpumask_var_t wq_unbound_cpumask; + +/* CPU where unbound work was last round robin scheduled from this CPU */ +static DEFINE_PER_CPU(int, wq_rr_cpu_last); + +/* + * Local execution of unbound work items is no longer guaranteed. The + * following always forces round-robin CPU selection on unbound work items + * to uncover usages which depend on it. + */ +#ifdef CONFIG_DEBUG_WQ_FORCE_RR_CPU +static bool wq_debug_force_rr_cpu = true; +#else +static bool wq_debug_force_rr_cpu = false; +#endif +module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644); /* the per-cpu worker pools */ static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], @@ -570,6 +586,16 @@ static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq, int node) { assert_rcu_or_wq_mutex_or_pool_mutex(wq); + + /* + * XXX: @node can be NUMA_NO_NODE if CPU goes offline while a + * delayed item is pending. The plan is to keep CPU -> NODE + * mapping valid and stable across CPU on/offlines. Once that + * happens, this workaround can be removed. + */ + if (unlikely(node == NUMA_NO_NODE)) + return wq->dfl_pwq; + return rcu_dereference_raw(wq->numa_pwq_tbl[node]); } @@ -1298,6 +1324,39 @@ static bool is_chained_work(struct workqueue_struct *wq) return worker && worker->current_pwq->wq == wq; } +/* + * When queueing an unbound work item to a wq, prefer local CPU if allowed + * by wq_unbound_cpumask. Otherwise, round robin among the allowed ones to + * avoid perturbing sensitive tasks. 
+ */ +static int wq_select_unbound_cpu(int cpu) +{ + static bool printed_dbg_warning; + int new_cpu; + + if (likely(!wq_debug_force_rr_cpu)) { + if (cpumask_test_cpu(cpu, wq_unbound_cpumask)) + return cpu; + } else if (!printed_dbg_warning) { + pr_warn("workqueue: round-robin CPU selection forced, expect performance impact\n"); + printed_dbg_warning = true; + } + + if (cpumask_empty(wq_unbound_cpumask)) + return cpu; + + new_cpu = __this_cpu_read(wq_rr_cpu_last); + new_cpu = cpumask_next_and(new_cpu, wq_unbound_cpumask, cpu_online_mask); + if (unlikely(new_cpu >= nr_cpu_ids)) { + new_cpu = cpumask_first_and(wq_unbound_cpumask, cpu_online_mask); + if (unlikely(new_cpu >= nr_cpu_ids)) + return cpu; + } + __this_cpu_write(wq_rr_cpu_last, new_cpu); + + return new_cpu; +} + static void __queue_work(int cpu, struct workqueue_struct *wq, struct work_struct *work) { @@ -1323,7 +1382,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, return; retry: if (req_cpu == WORK_CPU_UNBOUND) - cpu = raw_smp_processor_id(); + cpu = wq_select_unbound_cpu(raw_smp_processor_id()); /* pwq which will be used unless @work is executing elsewhere */ if (!(wq->flags & WQ_UNBOUND)) @@ -1464,13 +1523,13 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, timer_stats_timer_set_start_info(&dwork->timer); dwork->wq = wq; - /* timer isn't guaranteed to run in this cpu, record earlier */ - if (cpu == WORK_CPU_UNBOUND) - cpu = raw_smp_processor_id(); dwork->cpu = cpu; timer->expires = jiffies + delay; - add_timer_on(timer, cpu); + if (unlikely(cpu != WORK_CPU_UNBOUND)) + add_timer_on(timer, cpu); + else + add_timer(timer); } /** @@ -2355,7 +2414,8 @@ static void check_flush_dependency(struct workqueue_struct *target_wq, WARN_ONCE(current->flags & PF_MEMALLOC, "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%pf", current->pid, current->comm, target_wq->name, target_func); - WARN_ONCE(worker && (worker->current_pwq->wq->flags & WQ_MEM_RECLAIM), + WARN_ONCE(worker && ((worker->current_pwq->wq->flags & + (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM), "workqueue: WQ_MEM_RECLAIM %s:%pf is flushing !WQ_MEM_RECLAIM %s:%pf", worker->current_pwq->wq->name, worker->current_func, target_wq->name, target_func); |
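The workqueue hunk above routes unbound work through wq_select_unbound_cpu(): the local CPU is used when wq_unbound_cpumask allows it, otherwise a per-CPU cursor (wq_rr_cpu_last) is advanced round-robin through the allowed, online CPUs. The userspace sketch below shows only that selection policy; it assumes a fixed-size boolean mask and a single cursor instead of cpumask_t and a per-CPU variable, so it is an illustration, not the kernel implementation.

/*
 * Round-robin CPU selection over an allowed mask, analogous to
 * wq_select_unbound_cpu(): prefer the local CPU, otherwise advance a
 * cursor through the allowed CPUs and wrap around at the end.
 */
#include <stdbool.h>
#include <stdio.h>

#define NCPUS 8

static int rr_last = -1;  /* analogue of the wq_rr_cpu_last cursor */

static int select_unbound_cpu(int local_cpu, const bool allowed[NCPUS])
{
	int i, cpu;

	/* Prefer the local CPU if the mask allows it. */
	if (allowed[local_cpu])
		return local_cpu;

	/* Otherwise round-robin through the allowed CPUs, starting after the last pick. */
	for (i = 1; i <= NCPUS; i++) {
		cpu = (rr_last + i) % NCPUS;
		if (allowed[cpu]) {
			rr_last = cpu;
			return cpu;
		}
	}

	/* Empty mask: fall back to the local CPU, as the kernel code does. */
	return local_cpu;
}

int main(void)
{
	bool allowed[NCPUS] = { [2] = true, [5] = true };

	/* Local CPU 0 is not allowed, so picks alternate between CPUs 2 and 5. */
	for (int i = 0; i < 4; i++)
		printf("%d ", select_unbound_cpu(0, allowed));
	printf("\n");  /* prints: 2 5 2 5 */
	return 0;
}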