diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/bpf/verifier.c | 2 | ||||
-rw-r--r-- | kernel/cgroup.c | 31 | ||||
-rw-r--r-- | kernel/cpuset.c | 71 | ||||
-rw-r--r-- | kernel/events/core.c | 372 | ||||
-rw-r--r-- | kernel/irq/Kconfig | 4 | ||||
-rw-r--r-- | kernel/irq/Makefile | 1 | ||||
-rw-r--r-- | kernel/irq/chip.c | 4 | ||||
-rw-r--r-- | kernel/irq/handle.c | 6 | ||||
-rw-r--r-- | kernel/irq/internals.h | 3 | ||||
-rw-r--r-- | kernel/irq/ipi.c | 326 | ||||
-rw-r--r-- | kernel/irq/irqdesc.c | 21 | ||||
-rw-r--r-- | kernel/irq/irqdomain.c | 11 | ||||
-rw-r--r-- | kernel/irq/manage.c | 8 | ||||
-rw-r--r-- | kernel/irq/proc.c | 2 | ||||
-rw-r--r-- | kernel/irq/spurious.c | 4 | ||||
-rw-r--r-- | kernel/locking/lockdep.c | 58 | ||||
-rw-r--r-- | kernel/memremap.c | 26 | ||||
-rw-r--r-- | kernel/module.c | 124 | ||||
-rw-r--r-- | kernel/resource.c | 5 | ||||
-rw-r--r-- | kernel/sched/deadline.c | 2 | ||||
-rw-r--r-- | kernel/signal.c | 6 | ||||
-rw-r--r-- | kernel/trace/ftrace.c | 36 | ||||
-rw-r--r-- | kernel/trace/trace_events.c | 17 | ||||
-rw-r--r-- | kernel/trace/trace_events_filter.c | 13 | ||||
-rw-r--r-- | kernel/trace/trace_stack.c | 13 | ||||
-rw-r--r-- | kernel/workqueue.c | 74 |
26 files changed, 914 insertions, 326 deletions
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d1d3e8f..2e7f7ab 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2082,7 +2082,7 @@ static void adjust_branches(struct bpf_prog *prog, int pos, int delta) /* adjust offset of jmps if necessary */ if (i < pos && i + insn->off + 1 > pos) insn->off += delta; - else if (i > pos && i + insn->off + 1 < pos) + else if (i > pos + delta && i + insn->off + 1 <= pos + delta) insn->off -= delta; } } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c03a640..d27904c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -58,6 +58,7 @@ #include <linux/kthread.h> #include <linux/delay.h> #include <linux/atomic.h> +#include <linux/cpuset.h> #include <net/sock.h> /* @@ -2739,6 +2740,7 @@ out_unlock_rcu: out_unlock_threadgroup: percpu_up_write(&cgroup_threadgroup_rwsem); cgroup_kn_unlock(of->kn); + cpuset_post_attach_flush(); return ret ?: nbytes; } @@ -4655,14 +4657,15 @@ static void css_free_work_fn(struct work_struct *work) if (ss) { /* css free path */ + struct cgroup_subsys_state *parent = css->parent; int id = css->id; - if (css->parent) - css_put(css->parent); - ss->css_free(css); cgroup_idr_remove(&ss->css_idr, id); cgroup_put(cgrp); + + if (parent) + css_put(parent); } else { /* cgroup free path */ atomic_dec(&cgrp->root->nr_cgrps); @@ -4758,6 +4761,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css, INIT_LIST_HEAD(&css->sibling); INIT_LIST_HEAD(&css->children); css->serial_nr = css_serial_nr_next++; + atomic_set(&css->online_cnt, 0); if (cgroup_parent(cgrp)) { css->parent = cgroup_css(cgroup_parent(cgrp), ss); @@ -4780,6 +4784,10 @@ static int online_css(struct cgroup_subsys_state *css) if (!ret) { css->flags |= CSS_ONLINE; rcu_assign_pointer(css->cgroup->subsys[ss->id], css); + + atomic_inc(&css->online_cnt); + if (css->parent) + atomic_inc(&css->parent->online_cnt); } return ret; } @@ -5017,10 +5025,15 @@ static void css_killed_work_fn(struct work_struct *work) container_of(work, struct cgroup_subsys_state, destroy_work); mutex_lock(&cgroup_mutex); - offline_css(css); - mutex_unlock(&cgroup_mutex); - css_put(css); + do { + offline_css(css); + css_put(css); + /* @css can't go away while we're holding cgroup_mutex */ + css = css->parent; + } while (css && atomic_dec_and_test(&css->online_cnt)); + + mutex_unlock(&cgroup_mutex); } /* css kill confirmation processing requires process context, bounce */ @@ -5029,8 +5042,10 @@ static void css_killed_ref_fn(struct percpu_ref *ref) struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); - INIT_WORK(&css->destroy_work, css_killed_work_fn); - queue_work(cgroup_destroy_wq, &css->destroy_work); + if (atomic_dec_and_test(&css->online_cnt)) { + INIT_WORK(&css->destroy_work, css_killed_work_fn); + queue_work(cgroup_destroy_wq, &css->destroy_work); + } } /** diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 3e945fc..41989ab 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -287,6 +287,8 @@ static struct cpuset top_cpuset = { static DEFINE_MUTEX(cpuset_mutex); static DEFINE_SPINLOCK(callback_lock); +static struct workqueue_struct *cpuset_migrate_mm_wq; + /* * CPU / memory hotplug is handled asynchronously. */ @@ -972,31 +974,51 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, } /* - * cpuset_migrate_mm - * - * Migrate memory region from one set of nodes to another. - * - * Temporarilly set tasks mems_allowed to target nodes of migration, - * so that the migration code can allocate pages on these nodes. - * - * While the mm_struct we are migrating is typically from some - * other task, the task_struct mems_allowed that we are hacking - * is for our current task, which must allocate new pages for that - * migrating memory region. + * Migrate memory region from one set of nodes to another. This is + * performed asynchronously as it can be called from process migration path + * holding locks involved in process management. All mm migrations are + * performed in the queued order and can be waited for by flushing + * cpuset_migrate_mm_wq. */ +struct cpuset_migrate_mm_work { + struct work_struct work; + struct mm_struct *mm; + nodemask_t from; + nodemask_t to; +}; + +static void cpuset_migrate_mm_workfn(struct work_struct *work) +{ + struct cpuset_migrate_mm_work *mwork = + container_of(work, struct cpuset_migrate_mm_work, work); + + /* on a wq worker, no need to worry about %current's mems_allowed */ + do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL); + mmput(mwork->mm); + kfree(mwork); +} + static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to) { - struct task_struct *tsk = current; - - tsk->mems_allowed = *to; + struct cpuset_migrate_mm_work *mwork; - do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); + mwork = kzalloc(sizeof(*mwork), GFP_KERNEL); + if (mwork) { + mwork->mm = mm; + mwork->from = *from; + mwork->to = *to; + INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn); + queue_work(cpuset_migrate_mm_wq, &mwork->work); + } else { + mmput(mm); + } +} - rcu_read_lock(); - guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed); - rcu_read_unlock(); +void cpuset_post_attach_flush(void) +{ + flush_workqueue(cpuset_migrate_mm_wq); } /* @@ -1097,7 +1119,8 @@ static void update_tasks_nodemask(struct cpuset *cs) mpol_rebind_mm(mm, &cs->mems_allowed); if (migrate) cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems); - mmput(mm); + else + mmput(mm); } css_task_iter_end(&it); @@ -1545,11 +1568,11 @@ static void cpuset_attach(struct cgroup_taskset *tset) * @old_mems_allowed is the right nodesets that we * migrate mm from. */ - if (is_memory_migrate(cs)) { + if (is_memory_migrate(cs)) cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, &cpuset_attach_nodemask_to); - } - mmput(mm); + else + mmput(mm); } } @@ -1714,6 +1737,7 @@ out_unlock: mutex_unlock(&cpuset_mutex); kernfs_unbreak_active_protection(of->kn); css_put(&cs->css); + flush_workqueue(cpuset_migrate_mm_wq); return retval ?: nbytes; } @@ -2359,6 +2383,9 @@ void __init cpuset_init_smp(void) top_cpuset.effective_mems = node_states[N_MEMORY]; register_hotmemory_notifier(&cpuset_track_online_nodes_nb); + + cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0); + BUG_ON(!cpuset_migrate_mm_wq); } /** diff --git a/kernel/events/core.c b/kernel/events/core.c index 5946460..6146148 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -64,8 +64,17 @@ static void remote_function(void *data) struct task_struct *p = tfc->p; if (p) { - tfc->ret = -EAGAIN; - if (task_cpu(p) != smp_processor_id() || !task_curr(p)) + /* -EAGAIN */ + if (task_cpu(p) != smp_processor_id()) + return; + + /* + * Now that we're on right CPU with IRQs disabled, we can test + * if we hit the right task without races. + */ + + tfc->ret = -ESRCH; /* No such (running) process */ + if (p != current) return; } @@ -92,13 +101,17 @@ task_function_call(struct task_struct *p, remote_function_f func, void *info) .p = p, .func = func, .info = info, - .ret = -ESRCH, /* No such (running) process */ + .ret = -EAGAIN, }; + int ret; - if (task_curr(p)) - smp_call_function_single(task_cpu(p), remote_function, &data, 1); + do { + ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1); + if (!ret) + ret = data.ret; + } while (ret == -EAGAIN); - return data.ret; + return ret; } /** @@ -169,19 +182,6 @@ static bool is_kernel_event(struct perf_event *event) * rely on ctx->is_active and therefore cannot use event_function_call(). * See perf_install_in_context(). * - * This is because we need a ctx->lock serialized variable (ctx->is_active) - * to reliably determine if a particular task/context is scheduled in. The - * task_curr() use in task_function_call() is racy in that a remote context - * switch is not a single atomic operation. - * - * As is, the situation is 'safe' because we set rq->curr before we do the - * actual context switch. This means that task_curr() will fail early, but - * we'll continue spinning on ctx->is_active until we've passed - * perf_event_task_sched_out(). - * - * Without this ctx->lock serialized variable we could have race where we find - * the task (and hence the context) would not be active while in fact they are. - * * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set. */ @@ -212,7 +212,7 @@ static int event_function(void *info) */ if (ctx->task) { if (ctx->task != current) { - ret = -EAGAIN; + ret = -ESRCH; goto unlock; } @@ -276,10 +276,10 @@ static void event_function_call(struct perf_event *event, event_f func, void *da return; } -again: if (task == TASK_TOMBSTONE) return; +again: if (!task_function_call(task, event_function, &efs)) return; @@ -289,13 +289,15 @@ again: * a concurrent perf_event_context_sched_out(). */ task = ctx->task; - if (task != TASK_TOMBSTONE) { - if (ctx->is_active) { - raw_spin_unlock_irq(&ctx->lock); - goto again; - } - func(event, NULL, ctx, data); + if (task == TASK_TOMBSTONE) { + raw_spin_unlock_irq(&ctx->lock); + return; } + if (ctx->is_active) { + raw_spin_unlock_irq(&ctx->lock); + goto again; + } + func(event, NULL, ctx, data); raw_spin_unlock_irq(&ctx->lock); } @@ -314,6 +316,7 @@ again: enum event_type_t { EVENT_FLEXIBLE = 0x1, EVENT_PINNED = 0x2, + EVENT_TIME = 0x4, EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, }; @@ -321,7 +324,13 @@ enum event_type_t { * perf_sched_events : >0 events exist * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu */ -struct static_key_deferred perf_sched_events __read_mostly; + +static void perf_sched_delayed(struct work_struct *work); +DEFINE_STATIC_KEY_FALSE(perf_sched_events); +static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed); +static DEFINE_MUTEX(perf_sched_mutex); +static atomic_t perf_sched_count; + static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); static DEFINE_PER_CPU(int, perf_sched_cb_usages); @@ -1288,16 +1297,18 @@ static u64 perf_event_time(struct perf_event *event) /* * Update the total_time_enabled and total_time_running fields for a event. - * The caller of this function needs to hold the ctx->lock. */ static void update_event_times(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; u64 run_end; + lockdep_assert_held(&ctx->lock); + if (event->state < PERF_EVENT_STATE_INACTIVE || event->group_leader->state < PERF_EVENT_STATE_INACTIVE) return; + /* * in cgroup mode, time_enabled represents * the time the event was enabled AND active @@ -1645,7 +1656,7 @@ out: static bool is_orphaned_event(struct perf_event *event) { - return event->state == PERF_EVENT_STATE_EXIT; + return event->state == PERF_EVENT_STATE_DEAD; } static inline int pmu_filter_match(struct perf_event *event) @@ -1690,14 +1701,14 @@ event_sched_out(struct perf_event *event, perf_pmu_disable(event->pmu); + event->tstamp_stopped = tstamp; + event->pmu->del(event, 0); + event->oncpu = -1; event->state = PERF_EVENT_STATE_INACTIVE; if (event->pending_disable) { event->pending_disable = 0; event->state = PERF_EVENT_STATE_OFF; } - event->tstamp_stopped = tstamp; - event->pmu->del(event, 0); - event->oncpu = -1; if (!is_software_event(event)) cpuctx->active_oncpu--; @@ -1732,7 +1743,6 @@ group_sched_out(struct perf_event *group_event, } #define DETACH_GROUP 0x01UL -#define DETACH_STATE 0x02UL /* * Cross CPU call to remove a performance event @@ -1752,8 +1762,6 @@ __perf_remove_from_context(struct perf_event *event, if (flags & DETACH_GROUP) perf_group_detach(event); list_del_event(event, ctx); - if (flags & DETACH_STATE) - event->state = PERF_EVENT_STATE_EXIT; if (!ctx->nr_events && ctx->is_active) { ctx->is_active = 0; @@ -2063,14 +2071,27 @@ static void add_event_to_ctx(struct perf_event *event, event->tstamp_stopped = tstamp; } -static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx); +static void ctx_sched_out(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx, + enum event_type_t event_type); static void ctx_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, enum event_type_t event_type, struct task_struct *task); +static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + if (!cpuctx->task_ctx) + return; + + if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) + return; + + ctx_sched_out(ctx, cpuctx, EVENT_ALL); +} + static void perf_event_sched_in(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, struct task_struct *task) @@ -2097,49 +2118,68 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, /* * Cross CPU call to install and enable a performance event * - * Must be called with ctx->mutex held + * Very similar to remote_function() + event_function() but cannot assume that + * things like ctx->is_active and cpuctx->task_ctx are set. */ static int __perf_install_in_context(void *info) { - struct perf_event_context *ctx = info; + struct perf_event *event = info; + struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); struct perf_event_context *task_ctx = cpuctx->task_ctx; + bool activate = true; + int ret = 0; raw_spin_lock(&cpuctx->ctx.lock); if (ctx->task) { raw_spin_lock(&ctx->lock); - /* - * If we hit the 'wrong' task, we've since scheduled and - * everything should be sorted, nothing to do! - */ task_ctx = ctx; - if (ctx->task != current) + + /* If we're on the wrong CPU, try again */ + if (task_cpu(ctx->task) != smp_processor_id()) { + ret = -ESRCH; goto unlock; + } /* - * If task_ctx is set, it had better be to us. + * If we're on the right CPU, see if the task we target is + * current, if not we don't have to activate the ctx, a future + * context switch will do that for us. */ - WARN_ON_ONCE(cpuctx->task_ctx != ctx && cpuctx->task_ctx); + if (ctx->task != current) + activate = false; + else + WARN_ON_ONCE(cpuctx->task_ctx && cpuctx->task_ctx != ctx); + } else if (task_ctx) { raw_spin_lock(&task_ctx->lock); } - ctx_resched(cpuctx, task_ctx); + if (activate) { + ctx_sched_out(ctx, cpuctx, EVENT_TIME); + add_event_to_ctx(event, ctx); + ctx_resched(cpuctx, task_ctx); + } else { + add_event_to_ctx(event, ctx); + } + unlock: perf_ctx_unlock(cpuctx, task_ctx); - return 0; + return ret; } /* - * Attach a performance event to a context + * Attach a performance event to a context. + * + * Very similar to event_function_call, see comment there. */ static void perf_install_in_context(struct perf_event_context *ctx, struct perf_event *event, int cpu) { - struct task_struct *task = NULL; + struct task_struct *task = READ_ONCE(ctx->task); lockdep_assert_held(&ctx->mutex); @@ -2147,40 +2187,46 @@ perf_install_in_context(struct perf_event_context *ctx, if (event->cpu != -1) event->cpu = cpu; + if (!task) { + cpu_function_call(cpu, __perf_install_in_context, event); + return; + } + + /* + * Should not happen, we validate the ctx is still alive before calling. + */ + if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) + return; + /* * Installing events is tricky because we cannot rely on ctx->is_active * to be set in case this is the nr_events 0 -> 1 transition. - * - * So what we do is we add the event to the list here, which will allow - * a future context switch to DTRT and then send a racy IPI. If the IPI - * fails to hit the right task, this means a context switch must have - * happened and that will have taken care of business. */ - raw_spin_lock_irq(&ctx->lock); - task = ctx->task; +again: /* - * Worse, we cannot even rely on the ctx actually existing anymore. If - * between find_get_context() and perf_install_in_context() the task - * went through perf_event_exit_task() its dead and we should not be - * adding new events. + * Cannot use task_function_call() because we need to run on the task's + * CPU regardless of whether its current or not. */ - if (task == TASK_TOMBSTONE) { + if (!cpu_function_call(task_cpu(task), __perf_install_in_context, event)) + return; + + raw_spin_lock_irq(&ctx->lock); + task = ctx->task; + if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) { + /* + * Cannot happen because we already checked above (which also + * cannot happen), and we hold ctx->mutex, which serializes us + * against perf_event_exit_task_context(). + */ raw_spin_unlock_irq(&ctx->lock); return; } - update_context_time(ctx); + raw_spin_unlock_irq(&ctx->lock); /* - * Update cgrp time only if current cgrp matches event->cgrp. - * Must be done before calling add_event_to_ctx(). + * Since !ctx->is_active doesn't mean anything, we must IPI + * unconditionally. */ - update_cgrp_time_from_event(event); - add_event_to_ctx(event, ctx); - raw_spin_unlock_irq(&ctx->lock); - - if (task) - task_function_call(task, __perf_install_in_context, ctx); - else - cpu_function_call(cpu, __perf_install_in_context, ctx); + goto again; } /* @@ -2219,17 +2265,18 @@ static void __perf_event_enable(struct perf_event *event, event->state <= PERF_EVENT_STATE_ERROR) return; - update_context_time(ctx); + if (ctx->is_active) + ctx_sched_out(ctx, cpuctx, EVENT_TIME); + __perf_event_mark_enabled(event); if (!ctx->is_active) return; if (!event_filter_match(event)) { - if (is_cgroup_event(event)) { - perf_cgroup_set_timestamp(current, ctx); // XXX ? + if (is_cgroup_event(event)) perf_cgroup_defer_enabled(event); - } + ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); return; } @@ -2237,8 +2284,10 @@ static void __perf_event_enable(struct perf_event *event, * If the event is in a group and isn't the group leader, * then don't put it on unless the group is on. */ - if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) + if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) { + ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); return; + } task_ctx = cpuctx->task_ctx; if (ctx->task) @@ -2344,24 +2393,33 @@ static void ctx_sched_out(struct perf_event_context *ctx, } ctx->is_active &= ~event_type; + if (!(ctx->is_active & EVENT_ALL)) + ctx->is_active = 0; + if (ctx->task) { WARN_ON_ONCE(cpuctx->task_ctx != ctx); if (!ctx->is_active) cpuctx->task_ctx = NULL; } - update_context_time(ctx); - update_cgrp_time_from_cpuctx(cpuctx); - if (!ctx->nr_active) + is_active ^= ctx->is_active; /* changed bits */ + + if (is_active & EVENT_TIME) { + /* update (and stop) ctx time */ + update_context_time(ctx); + update_cgrp_time_from_cpuctx(cpuctx); + } + + if (!ctx->nr_active || !(is_active & EVENT_ALL)) return; perf_pmu_disable(ctx->pmu); - if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) { + if (is_active & EVENT_PINNED) { list_for_each_entry(event, &ctx->pinned_groups, group_entry) group_sched_out(event, cpuctx, ctx); } - if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) { + if (is_active & EVENT_FLEXIBLE) { list_for_each_entry(event, &ctx->flexible_groups, group_entry) group_sched_out(event, cpuctx, ctx); } @@ -2641,18 +2699,6 @@ void __perf_event_task_sched_out(struct task_struct *task, perf_cgroup_sched_out(task, next); } -static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - if (!cpuctx->task_ctx) - return; - - if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) - return; - - ctx_sched_out(ctx, cpuctx, EVENT_ALL); -} - /* * Called with IRQs disabled */ @@ -2735,7 +2781,7 @@ ctx_sched_in(struct perf_event_context *ctx, if (likely(!ctx->nr_events)) return; - ctx->is_active |= event_type; + ctx->is_active |= (event_type | EVENT_TIME); if (ctx->task) { if (!is_active) cpuctx->task_ctx = ctx; @@ -2743,18 +2789,24 @@ ctx_sched_in(struct perf_event_context *ctx, WARN_ON_ONCE(cpuctx->task_ctx != ctx); } - now = perf_clock(); - ctx->timestamp = now; - perf_cgroup_set_timestamp(task, ctx); + is_active ^= ctx->is_active; /* changed bits */ + + if (is_active & EVENT_TIME) { + /* start ctx time */ + now = perf_clock(); + ctx->timestamp = now; + perf_cgroup_set_timestamp(task, ctx); + } + /* * First go through the list and put on any pinned groups * in order to give them the best chance of going on. */ - if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) + if (is_active & EVENT_PINNED) ctx_pinned_sched_in(ctx, cpuctx); /* Then walk through the lower prio flexible groups */ - if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) + if (is_active & EVENT_FLEXIBLE) ctx_flexible_sched_in(ctx, cpuctx); } @@ -3120,6 +3172,7 @@ static void perf_event_enable_on_exec(int ctxn) cpuctx = __get_cpu_context(ctx); perf_ctx_lock(cpuctx, ctx); + ctx_sched_out(ctx, cpuctx, EVENT_TIME); list_for_each_entry(event, &ctx->event_list, event_entry) enabled |= event_enable_on_exec(event, ctx); @@ -3537,12 +3590,22 @@ static void unaccount_event(struct perf_event *event) if (has_branch_stack(event)) dec = true; - if (dec) - static_key_slow_dec_deferred(&perf_sched_events); + if (dec) { + if (!atomic_add_unless(&perf_sched_count, -1, 1)) + schedule_delayed_work(&perf_sched_work, HZ); + } unaccount_event_cpu(event, event->cpu); } +static void perf_sched_delayed(struct work_struct *work) +{ + mutex_lock(&perf_sched_mutex); + if (atomic_dec_and_test(&perf_sched_count)) + static_branch_disable(&perf_sched_events); + mutex_unlock(&perf_sched_mutex); +} + /* * The following implement mutual exclusion of events on "exclusive" pmus * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled @@ -3752,30 +3815,42 @@ static void put_event(struct perf_event *event) */ int perf_event_release_kernel(struct perf_event *event) { - struct perf_event_context *ctx; + struct perf_event_context *ctx = event->ctx; struct perf_event *child, *tmp; + /* + * If we got here through err_file: fput(event_file); we will not have + * attached to a context yet. + */ + if (!ctx) { + WARN_ON_ONCE(event->attach_state & + (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP)); + goto no_ctx; + } + if (!is_kernel_event(event)) perf_remove_from_owner(event); ctx = perf_event_ctx_lock(event); WARN_ON_ONCE(ctx->parent_ctx); - perf_remove_from_context(event, DETACH_GROUP | DETACH_STATE); - perf_event_ctx_unlock(event, ctx); + perf_remove_from_context(event, DETACH_GROUP); + raw_spin_lock_irq(&ctx->lock); /* - * At this point we must have event->state == PERF_EVENT_STATE_EXIT, - * either from the above perf_remove_from_context() or through - * perf_event_exit_event(). + * Mark this even as STATE_DEAD, there is no external reference to it + * anymore. * - * Therefore, anybody acquiring event->child_mutex after the below - * loop _must_ also see this, most importantly inherit_event() which - * will avoid placing more children on the list. + * Anybody acquiring event->child_mutex after the below loop _must_ + * also see this, most importantly inherit_event() which will avoid + * placing more children on the list. * * Thus this guarantees that we will in fact observe and kill _ALL_ * child events. */ - WARN_ON_ONCE(event->state != PERF_EVENT_STATE_EXIT); + event->state = PERF_EVENT_STATE_DEAD; + raw_spin_unlock_irq(&ctx->lock); + + perf_event_ctx_unlock(event, ctx); again: mutex_lock(&event->child_mutex); @@ -3830,8 +3905,8 @@ again: } mutex_unlock(&event->child_mutex); - /* Must be the last reference */ - put_event(event); +no_ctx: + put_event(event); /* Must be the 'last' reference */ return 0; } EXPORT_SYMBOL_GPL(perf_event_release_kernel); @@ -3988,7 +4063,7 @@ static bool is_event_hup(struct perf_event *event) { bool no_children; - if (event->state != PERF_EVENT_STATE_EXIT) + if (event->state > PERF_EVENT_STATE_EXIT) return false; mutex_lock(&event->child_mutex); @@ -7769,8 +7844,28 @@ static void account_event(struct perf_event *event) if (is_cgroup_event(event)) inc = true; - if (inc) - static_key_slow_inc(&perf_sched_events.key); + if (inc) { + if (atomic_inc_not_zero(&perf_sched_count)) + goto enabled; + + mutex_lock(&perf_sched_mutex); + if (!atomic_read(&perf_sched_count)) { + static_branch_enable(&perf_sched_events); + /* + * Guarantee that all CPUs observe they key change and + * call the perf scheduling hooks before proceeding to + * install events that need them. + */ + synchronize_sched(); + } + /* + * Now that we have waited for the sync_sched(), allow further + * increments to by-pass the mutex. + */ + atomic_inc(&perf_sched_count); + mutex_unlock(&perf_sched_mutex); + } +enabled: account_event_cpu(event, event->cpu); } @@ -8389,10 +8484,19 @@ SYSCALL_DEFINE5(perf_event_open, if (move_group) { gctx = group_leader->ctx; mutex_lock_double(&gctx->mutex, &ctx->mutex); + if (gctx->task == TASK_TOMBSTONE) { + err = -ESRCH; + goto err_locked; + } } else { mutex_lock(&ctx->mutex); } + if (ctx->task == TASK_TOMBSTONE) { + err = -ESRCH; + goto err_locked; + } + if (!perf_event_validate_size(event)) { err = -E2BIG; goto err_locked; @@ -8509,7 +8613,12 @@ err_context: perf_unpin_context(ctx); put_ctx(ctx); err_alloc: - free_event(event); + /* + * If event_file is set, the fput() above will have called ->release() + * and that will take care of freeing the event. + */ + if (!event_file) + free_event(event); err_cpus: put_online_cpus(); err_task: @@ -8563,12 +8672,14 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); + if (ctx->task == TASK_TOMBSTONE) { + err = -ESRCH; + goto err_unlock; + } + if (!exclusive_event_installable(event, ctx)) { - mutex_unlock(&ctx->mutex); - perf_unpin_context(ctx); - put_ctx(ctx); err = -EBUSY; - goto err_free; + goto err_unlock; } perf_install_in_context(ctx, event, cpu); @@ -8577,6 +8688,10 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, return event; +err_unlock: + mutex_unlock(&ctx->mutex); + perf_unpin_context(ctx); + put_ctx(ctx); err_free: free_event(event); err: @@ -8695,7 +8810,7 @@ perf_event_exit_event(struct perf_event *child_event, if (parent_event) perf_group_detach(child_event); list_del_event(child_event, child_ctx); - child_event->state = PERF_EVENT_STATE_EXIT; /* see perf_event_release_kernel() */ + child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */ raw_spin_unlock_irq(&child_ctx->lock); /* @@ -9206,7 +9321,7 @@ static void perf_event_init_cpu(int cpu) struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); mutex_lock(&swhash->hlist_mutex); - if (swhash->hlist_refcount > 0) { + if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) { struct swevent_hlist *hlist; hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu)); @@ -9282,11 +9397,9 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: - case CPU_DOWN_FAILED: perf_event_init_cpu(cpu); break; - case CPU_UP_CANCELED: case CPU_DOWN_PREPARE: perf_event_exit_cpu(cpu); break; @@ -9315,9 +9428,6 @@ void __init perf_event_init(void) ret = init_hw_breakpoint(); WARN(ret, "hw_breakpoint initialization failed with: %d", ret); - /* do not patch jump label more than once per second */ - jump_label_rate_limit(&perf_sched_events, HZ); - /* * Build time assertion that we keep the data_head at the intended * location. IOW, validation we got the __reserved[] size right. diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 3b48dab..3bbfd6a 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -64,6 +64,10 @@ config IRQ_DOMAIN_HIERARCHY bool select IRQ_DOMAIN +# Generic IRQ IPI support +config GENERIC_IRQ_IPI + bool + # Generic MSI interrupt support config GENERIC_MSI_IRQ bool diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 2fc9cbd..2ee42e9 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -8,3 +8,4 @@ obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o obj-$(CONFIG_PM_SLEEP) += pm.o obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o +obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 5797909..2f9f2b0 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -961,6 +961,7 @@ void irq_chip_mask_parent(struct irq_data *data) data = data->parent_data; data->chip->irq_mask(data); } +EXPORT_SYMBOL_GPL(irq_chip_mask_parent); /** * irq_chip_unmask_parent - Unmask the parent interrupt @@ -971,6 +972,7 @@ void irq_chip_unmask_parent(struct irq_data *data) data = data->parent_data; data->chip->irq_unmask(data); } +EXPORT_SYMBOL_GPL(irq_chip_unmask_parent); /** * irq_chip_eoi_parent - Invoke EOI on the parent interrupt @@ -981,6 +983,7 @@ void irq_chip_eoi_parent(struct irq_data *data) data = data->parent_data; data->chip->irq_eoi(data); } +EXPORT_SYMBOL_GPL(irq_chip_eoi_parent); /** * irq_chip_set_affinity_parent - Set affinity on the parent interrupt @@ -1016,6 +1019,7 @@ int irq_chip_set_type_parent(struct irq_data *data, unsigned int type) return -ENOSYS; } +EXPORT_SYMBOL_GPL(irq_chip_set_type_parent); /** * irq_chip_retrigger_hierarchy - Retrigger an interrupt in hardware diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 57bff78..a15b548 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -136,10 +136,9 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) { irqreturn_t retval = IRQ_NONE; unsigned int flags = 0, irq = desc->irq_data.irq; - struct irqaction *action = desc->action; + struct irqaction *action; - /* action might have become NULL since we dropped the lock */ - while (action) { + for_each_action_of_desc(desc, action) { irqreturn_t res; trace_irq_handler_entry(irq, action); @@ -173,7 +172,6 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) } retval |= res; - action = action->next; } add_interrupt_randomness(irq, flags); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index fcab63c..eab521f 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -131,6 +131,9 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc) #define IRQ_GET_DESC_CHECK_GLOBAL (_IRQ_DESC_CHECK) #define IRQ_GET_DESC_CHECK_PERCPU (_IRQ_DESC_CHECK | _IRQ_DESC_PERCPU) +#define for_each_action_of_desc(desc, act) \ + for (act = desc->act; act; act = act->next) + struct irq_desc * __irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus, unsigned int check); diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c new file mode 100644 index 0000000..c37f34b --- /dev/null +++ b/kernel/irq/ipi.c @@ -0,0 +1,326 @@ +/* + * linux/kernel/irq/ipi.c + * + * Copyright (C) 2015 Imagination Technologies Ltd + * Author: Qais Yousef <qais.yousef@imgtec.com> + * + * This file contains driver APIs to the IPI subsystem. + */ + +#define pr_fmt(fmt) "genirq/ipi: " fmt + +#include <linux/irqdomain.h> +#include <linux/irq.h> + +/** + * irq_reserve_ipi() - Setup an IPI to destination cpumask + * @domain: IPI domain + * @dest: cpumask of cpus which can receive the IPI + * + * Allocate a virq that can be used to send IPI to any CPU in dest mask. + * + * On success it'll return linux irq number and 0 on failure + */ +unsigned int irq_reserve_ipi(struct irq_domain *domain, + const struct cpumask *dest) +{ + unsigned int nr_irqs, offset; + struct irq_data *data; + int virq, i; + + if (!domain ||!irq_domain_is_ipi(domain)) { + pr_warn("Reservation on a non IPI domain\n"); + return 0; + } + + if (!cpumask_subset(dest, cpu_possible_mask)) { + pr_warn("Reservation is not in possible_cpu_mask\n"); + return 0; + } + + nr_irqs = cpumask_weight(dest); + if (!nr_irqs) { + pr_warn("Reservation for empty destination mask\n"); + return 0; + } + + if (irq_domain_is_ipi_single(domain)) { + /* + * If the underlying implementation uses a single HW irq on + * all cpus then we only need a single Linux irq number for + * it. We have no restrictions vs. the destination mask. The + * underlying implementation can deal with holes nicely. + */ + nr_irqs = 1; + offset = 0; + } else { + unsigned int next; + + /* + * The IPI requires a seperate HW irq on each CPU. We require + * that the destination mask is consecutive. If an + * implementation needs to support holes, it can reserve + * several IPI ranges. + */ + offset = cpumask_first(dest); + /* + * Find a hole and if found look for another set bit after the + * hole. For now we don't support this scenario. + */ + next = cpumask_next_zero(offset, dest); + if (next < nr_cpu_ids) + next = cpumask_next(next, dest); + if (next < nr_cpu_ids) { + pr_warn("Destination mask has holes\n"); + return 0; + } + } + + virq = irq_domain_alloc_descs(-1, nr_irqs, 0, NUMA_NO_NODE); + if (virq <= 0) { + pr_warn("Can't reserve IPI, failed to alloc descs\n"); + return 0; + } + + virq = __irq_domain_alloc_irqs(domain, virq, nr_irqs, NUMA_NO_NODE, + (void *) dest, true); + + if (virq <= 0) { + pr_warn("Can't reserve IPI, failed to alloc hw irqs\n"); + goto free_descs; + } + + for (i = 0; i < nr_irqs; i++) { + data = irq_get_irq_data(virq + i); + cpumask_copy(data->common->affinity, dest); + data->common->ipi_offset = offset; + } + return virq; + +free_descs: + irq_free_descs(virq, nr_irqs); + return 0; +} + +/** + * irq_destroy_ipi() - unreserve an IPI that was previously allocated + * @irq: linux irq number to be destroyed + * + * Return the IPIs allocated with irq_reserve_ipi() to the system destroying + * all virqs associated with them. + */ +void irq_destroy_ipi(unsigned int irq) +{ + struct irq_data *data = irq_get_irq_data(irq); + struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL; + struct irq_domain *domain; + unsigned int nr_irqs; + + if (!irq || !data || !ipimask) + return; + + domain = data->domain; + if (WARN_ON(domain == NULL)) + return; + + if (!irq_domain_is_ipi(domain)) { + pr_warn("Trying to destroy a non IPI domain!\n"); + return; + } + + if (irq_domain_is_ipi_per_cpu(domain)) + nr_irqs = cpumask_weight(ipimask); + else + nr_irqs = 1; + + irq_domain_free_irqs(irq, nr_irqs); +} + +/** + * ipi_get_hwirq - Get the hwirq associated with an IPI to a cpu + * @irq: linux irq number + * @cpu: the target cpu + * + * When dealing with coprocessors IPI, we need to inform the coprocessor of + * the hwirq it needs to use to receive and send IPIs. + * + * Returns hwirq value on success and INVALID_HWIRQ on failure. + */ +irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu) +{ + struct irq_data *data = irq_get_irq_data(irq); + struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL; + + if (!data || !ipimask || cpu > nr_cpu_ids) + return INVALID_HWIRQ; + + if (!cpumask_test_cpu(cpu, ipimask)) + return INVALID_HWIRQ; + + /* + * Get the real hardware irq number if the underlying implementation + * uses a seperate irq per cpu. If the underlying implementation uses + * a single hardware irq for all cpus then the IPI send mechanism + * needs to take care of the cpu destinations. + */ + if (irq_domain_is_ipi_per_cpu(data->domain)) + data = irq_get_irq_data(irq + cpu - data->common->ipi_offset); + + return data ? irqd_to_hwirq(data) : INVALID_HWIRQ; +} +EXPORT_SYMBOL_GPL(ipi_get_hwirq); + +static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data, + const struct cpumask *dest, unsigned int cpu) +{ + struct cpumask *ipimask = irq_data_get_affinity_mask(data); + + if (!chip || !ipimask) + return -EINVAL; + + if (!chip->ipi_send_single && !chip->ipi_send_mask) + return -EINVAL; + + if (cpu > nr_cpu_ids) + return -EINVAL; + + if (dest) { + if (!cpumask_subset(dest, ipimask)) + return -EINVAL; + } else { + if (!cpumask_test_cpu(cpu, ipimask)) + return -EINVAL; + } + return 0; +} + +/** + * __ipi_send_single - send an IPI to a target Linux SMP CPU + * @desc: pointer to irq_desc of the IRQ + * @cpu: destination CPU, must in the destination mask passed to + * irq_reserve_ipi() + * + * This function is for architecture or core code to speed up IPI sending. Not + * usable from driver code. + * + * Returns zero on success and negative error number on failure. + */ +int __ipi_send_single(struct irq_desc *desc, unsigned int cpu) +{ + struct irq_data *data = irq_desc_get_irq_data(desc); + struct irq_chip *chip = irq_data_get_irq_chip(data); + +#ifdef DEBUG + /* + * Minimise the overhead by omitting the checks for Linux SMP IPIs. + * Since the callers should be arch or core code which is generally + * trusted, only check for errors when debugging. + */ + if (WARN_ON_ONCE(ipi_send_verify(chip, data, NULL, cpu))) + return -EINVAL; +#endif + if (!chip->ipi_send_single) { + chip->ipi_send_mask(data, cpumask_of(cpu)); + return 0; + } + + /* FIXME: Store this information in irqdata flags */ + if (irq_domain_is_ipi_per_cpu(data->domain) && + cpu != data->common->ipi_offset) { + /* use the correct data for that cpu */ + unsigned irq = data->irq + cpu - data->common->ipi_offset; + + data = irq_get_irq_data(irq); + } + chip->ipi_send_single(data, cpu); + return 0; +} + +/** + * ipi_send_mask - send an IPI to target Linux SMP CPU(s) + * @desc: pointer to irq_desc of the IRQ + * @dest: dest CPU(s), must be a subset of the mask passed to + * irq_reserve_ipi() + * + * This function is for architecture or core code to speed up IPI sending. Not + * usable from driver code. + * + * Returns zero on success and negative error number on failure. + */ +int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest) +{ + struct irq_data *data = irq_desc_get_irq_data(desc); + struct irq_chip *chip = irq_data_get_irq_chip(data); + unsigned int cpu; + +#ifdef DEBUG + /* + * Minimise the overhead by omitting the checks for Linux SMP IPIs. + * Since the callers should be arch or core code which is generally + * trusted, only check for errors when debugging. + */ + if (WARN_ON_ONCE(ipi_send_verify(chip, data, dest, 0))) + return -EINVAL; +#endif + if (chip->ipi_send_mask) { + chip->ipi_send_mask(data, dest); + return 0; + } + + if (irq_domain_is_ipi_per_cpu(data->domain)) { + unsigned int base = data->irq; + + for_each_cpu(cpu, dest) { + unsigned irq = base + cpu - data->common->ipi_offset; + + data = irq_get_irq_data(irq); + chip->ipi_send_single(data, cpu); + } + } else { + for_each_cpu(cpu, dest) + chip->ipi_send_single(data, cpu); + } + return 0; +} + +/** + * ipi_send_single - Send an IPI to a single CPU + * @virq: linux irq number from irq_reserve_ipi() + * @cpu: destination CPU, must in the destination mask passed to + * irq_reserve_ipi() + * + * Returns zero on success and negative error number on failure. + */ +int ipi_send_single(unsigned int virq, unsigned int cpu) +{ + struct irq_desc *desc = irq_to_desc(virq); + struct irq_data *data = desc ? irq_desc_get_irq_data(desc) : NULL; + struct irq_chip *chip = data ? irq_data_get_irq_chip(data) : NULL; + + if (WARN_ON_ONCE(ipi_send_verify(chip, data, NULL, cpu))) + return -EINVAL; + + return __ipi_send_single(desc, cpu); +} +EXPORT_SYMBOL_GPL(ipi_send_single); + +/** + * ipi_send_mask - Send an IPI to target CPU(s) + * @virq: linux irq number from irq_reserve_ipi() + * @dest: dest CPU(s), must be a subset of the mask passed to + * irq_reserve_ipi() + * + * Returns zero on success and negative error number on failure. + */ +int ipi_send_mask(unsigned int virq, const struct cpumask *dest) +{ + struct irq_desc *desc = irq_to_desc(virq); + struct irq_data *data = desc ? irq_desc_get_irq_data(desc) : NULL; + struct irq_chip *chip = data ? irq_data_get_irq_chip(data) : NULL; + + if (WARN_ON_ONCE(ipi_send_verify(chip, data, dest, 0))) + return -EINVAL; + + return __ipi_send_mask(desc, dest); +} +EXPORT_SYMBOL_GPL(ipi_send_mask); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 0409da0..0ccd028 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -24,10 +24,27 @@ static struct lock_class_key irq_desc_lock_class; #if defined(CONFIG_SMP) +static int __init irq_affinity_setup(char *str) +{ + zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); + cpulist_parse(str, irq_default_affinity); + /* + * Set at least the boot cpu. We don't want to end up with + * bugreports caused by random comandline masks + */ + cpumask_set_cpu(smp_processor_id(), irq_default_affinity); + return 1; +} +__setup("irqaffinity=", irq_affinity_setup); + static void __init init_irq_default_affinity(void) { - alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); - cpumask_setall(irq_default_affinity); +#ifdef CONFIG_CPUMASK_OFFSTACK + if (!irq_default_affinity) + zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); +#endif + if (cpumask_empty(irq_default_affinity)) + cpumask_setall(irq_default_affinity); } #else static void __init init_irq_default_affinity(void) diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 3e56d2f0..3a519a0 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -23,8 +23,6 @@ static DEFINE_MUTEX(irq_domain_mutex); static DEFINE_MUTEX(revmap_trees_mutex); static struct irq_domain *irq_default_domain; -static int irq_domain_alloc_descs(int virq, unsigned int nr_irqs, - irq_hw_number_t hwirq, int node); static void irq_domain_check_hierarchy(struct irq_domain *domain); struct irqchip_fwid { @@ -840,8 +838,8 @@ const struct irq_domain_ops irq_domain_simple_ops = { }; EXPORT_SYMBOL_GPL(irq_domain_simple_ops); -static int irq_domain_alloc_descs(int virq, unsigned int cnt, - irq_hw_number_t hwirq, int node) +int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq, + int node) { unsigned int hint; @@ -895,6 +893,7 @@ struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent, return domain; } +EXPORT_SYMBOL_GPL(irq_domain_create_hierarchy); static void irq_domain_insert_irq(int virq) { @@ -1045,6 +1044,7 @@ int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq, return 0; } +EXPORT_SYMBOL_GPL(irq_domain_set_hwirq_and_chip); /** * irq_domain_set_info - Set the complete data for a @virq in @domain @@ -1078,6 +1078,7 @@ void irq_domain_reset_irq_data(struct irq_data *irq_data) irq_data->chip = &no_irq_chip; irq_data->chip_data = NULL; } +EXPORT_SYMBOL_GPL(irq_domain_reset_irq_data); /** * irq_domain_free_irqs_common - Clear irq_data and free the parent @@ -1275,6 +1276,7 @@ int irq_domain_alloc_irqs_parent(struct irq_domain *domain, nr_irqs, arg); return -ENOSYS; } +EXPORT_SYMBOL_GPL(irq_domain_alloc_irqs_parent); /** * irq_domain_free_irqs_parent - Free interrupts from parent domain @@ -1292,6 +1294,7 @@ void irq_domain_free_irqs_parent(struct irq_domain *domain, irq_domain_free_irqs_recursive(domain->parent, irq_base, nr_irqs); } +EXPORT_SYMBOL_GPL(irq_domain_free_irqs_parent); /** * irq_domain_activate_irq - Call domain_ops->activate recursively to activate diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 8411872..3ddd229 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -144,13 +144,11 @@ int irq_can_set_affinity(unsigned int irq) */ void irq_set_thread_affinity(struct irq_desc *desc) { - struct irqaction *action = desc->action; + struct irqaction *action; - while (action) { + for_each_action_of_desc(desc, action) if (action->thread) set_bit(IRQTF_AFFINITY, &action->thread_flags); - action = action->next; - } } #ifdef CONFIG_GENERIC_PENDING_IRQ @@ -994,7 +992,7 @@ void irq_wake_thread(unsigned int irq, void *dev_id) return; raw_spin_lock_irqsave(&desc->lock, flags); - for (action = desc->action; action; action = action->next) { + for_each_action_of_desc(desc, action) { if (action->dev_id == dev_id) { if (action->thread) __irq_wake_thread(desc, action); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index a2c02fd..4e1b947 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -291,7 +291,7 @@ static int name_unique(unsigned int irq, struct irqaction *new_action) int ret = 1; raw_spin_lock_irqsave(&desc->lock, flags); - for (action = desc->action ; action; action = action->next) { + for_each_action_of_desc(desc, action) { if ((action != new_action) && action->name && !strcmp(new_action->name, action->name)) { ret = 0; diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 3214417..5707f97 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -211,14 +211,12 @@ static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret) * desc->lock here. See synchronize_irq(). */ raw_spin_lock_irqsave(&desc->lock, flags); - action = desc->action; - while (action) { + for_each_action_of_desc(desc, action) { printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler); if (action->thread_fn) printk(KERN_CONT " threaded [<%p>] %pf", action->thread_fn, action->thread_fn); printk(KERN_CONT "\n"); - action = action->next; } raw_spin_unlock_irqrestore(&desc->lock, flags); } diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 60ace56..716547f 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -292,7 +292,7 @@ LIST_HEAD(all_lock_classes); #define __classhashfn(key) hash_long((unsigned long)key, CLASSHASH_BITS) #define classhashentry(key) (classhash_table + __classhashfn((key))) -static struct list_head classhash_table[CLASSHASH_SIZE]; +static struct hlist_head classhash_table[CLASSHASH_SIZE]; /* * We put the lock dependency chains into a hash-table as well, to cache @@ -303,7 +303,7 @@ static struct list_head classhash_table[CLASSHASH_SIZE]; #define __chainhashfn(chain) hash_long(chain, CHAINHASH_BITS) #define chainhashentry(chain) (chainhash_table + __chainhashfn((chain))) -static struct list_head chainhash_table[CHAINHASH_SIZE]; +static struct hlist_head chainhash_table[CHAINHASH_SIZE]; /* * The hash key of the lock dependency chains is a hash itself too: @@ -666,7 +666,7 @@ static inline struct lock_class * look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) { struct lockdep_subclass_key *key; - struct list_head *hash_head; + struct hlist_head *hash_head; struct lock_class *class; #ifdef CONFIG_DEBUG_LOCKDEP @@ -719,7 +719,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return NULL; - list_for_each_entry_rcu(class, hash_head, hash_entry) { + hlist_for_each_entry_rcu(class, hash_head, hash_entry) { if (class->key == key) { /* * Huh! same key, different name? Did someone trample @@ -742,7 +742,7 @@ static inline struct lock_class * register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) { struct lockdep_subclass_key *key; - struct list_head *hash_head; + struct hlist_head *hash_head; struct lock_class *class; DEBUG_LOCKS_WARN_ON(!irqs_disabled()); @@ -774,7 +774,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) * We have to do the hash-walk again, to avoid races * with another CPU: */ - list_for_each_entry_rcu(class, hash_head, hash_entry) { + hlist_for_each_entry_rcu(class, hash_head, hash_entry) { if (class->key == key) goto out_unlock_set; } @@ -805,7 +805,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) * We use RCU's safe list-add method to make * parallel walking of the hash-list safe: */ - list_add_tail_rcu(&class->hash_entry, hash_head); + hlist_add_head_rcu(&class->hash_entry, hash_head); /* * Add it to the global list of classes: */ @@ -1822,7 +1822,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, */ static int check_prev_add(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, int distance, int trylock_loop) + struct held_lock *next, int distance, int *stack_saved) { struct lock_list *entry; int ret; @@ -1883,8 +1883,11 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, } } - if (!trylock_loop && !save_trace(&trace)) - return 0; + if (!*stack_saved) { + if (!save_trace(&trace)) + return 0; + *stack_saved = 1; + } /* * Ok, all validations passed, add the new lock @@ -1907,6 +1910,8 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * Debugging printouts: */ if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) { + /* We drop graph lock, so another thread can overwrite trace. */ + *stack_saved = 0; graph_unlock(); printk("\n new dependency: "); print_lock_name(hlock_class(prev)); @@ -1929,7 +1934,7 @@ static int check_prevs_add(struct task_struct *curr, struct held_lock *next) { int depth = curr->lockdep_depth; - int trylock_loop = 0; + int stack_saved = 0; struct held_lock *hlock; /* @@ -1956,7 +1961,7 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) */ if (hlock->read != 2 && hlock->check) { if (!check_prev_add(curr, hlock, next, - distance, trylock_loop)) + distance, &stack_saved)) return 0; /* * Stop after the first non-trylock entry, @@ -1979,7 +1984,6 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) if (curr->held_locks[depth].irq_context != curr->held_locks[depth-1].irq_context) break; - trylock_loop = 1; } return 1; out_bug: @@ -2017,7 +2021,7 @@ static inline int lookup_chain_cache(struct task_struct *curr, u64 chain_key) { struct lock_class *class = hlock_class(hlock); - struct list_head *hash_head = chainhashentry(chain_key); + struct hlist_head *hash_head = chainhashentry(chain_key); struct lock_chain *chain; struct held_lock *hlock_curr; int i, j; @@ -2033,7 +2037,7 @@ static inline int lookup_chain_cache(struct task_struct *curr, * We can walk it lock-free, because entries only get added * to the hash: */ - list_for_each_entry_rcu(chain, hash_head, entry) { + hlist_for_each_entry_rcu(chain, hash_head, entry) { if (chain->chain_key == chain_key) { cache_hit: debug_atomic_inc(chain_lookup_hits); @@ -2057,7 +2061,7 @@ cache_hit: /* * We have to walk the chain again locked - to avoid duplicates: */ - list_for_each_entry(chain, hash_head, entry) { + hlist_for_each_entry(chain, hash_head, entry) { if (chain->chain_key == chain_key) { graph_unlock(); goto cache_hit; @@ -2091,7 +2095,7 @@ cache_hit: } chain_hlocks[chain->base + j] = class - lock_classes; } - list_add_tail_rcu(&chain->entry, hash_head); + hlist_add_head_rcu(&chain->entry, hash_head); debug_atomic_inc(chain_lookup_misses); inc_chains(); @@ -3875,7 +3879,7 @@ void lockdep_reset(void) nr_process_chains = 0; debug_locks = 1; for (i = 0; i < CHAINHASH_SIZE; i++) - INIT_LIST_HEAD(chainhash_table + i); + INIT_HLIST_HEAD(chainhash_table + i); raw_local_irq_restore(flags); } @@ -3894,7 +3898,7 @@ static void zap_class(struct lock_class *class) /* * Unhash the class and remove it from the all_lock_classes list: */ - list_del_rcu(&class->hash_entry); + hlist_del_rcu(&class->hash_entry); list_del_rcu(&class->lock_entry); RCU_INIT_POINTER(class->key, NULL); @@ -3917,7 +3921,7 @@ static inline int within(const void *addr, void *start, unsigned long size) void lockdep_free_key_range(void *start, unsigned long size) { struct lock_class *class; - struct list_head *head; + struct hlist_head *head; unsigned long flags; int i; int locked; @@ -3930,9 +3934,7 @@ void lockdep_free_key_range(void *start, unsigned long size) */ for (i = 0; i < CLASSHASH_SIZE; i++) { head = classhash_table + i; - if (list_empty(head)) - continue; - list_for_each_entry_rcu(class, head, hash_entry) { + hlist_for_each_entry_rcu(class, head, hash_entry) { if (within(class->key, start, size)) zap_class(class); else if (within(class->name, start, size)) @@ -3962,7 +3964,7 @@ void lockdep_free_key_range(void *start, unsigned long size) void lockdep_reset_lock(struct lockdep_map *lock) { struct lock_class *class; - struct list_head *head; + struct hlist_head *head; unsigned long flags; int i, j; int locked; @@ -3987,9 +3989,7 @@ void lockdep_reset_lock(struct lockdep_map *lock) locked = graph_lock(); for (i = 0; i < CLASSHASH_SIZE; i++) { head = classhash_table + i; - if (list_empty(head)) - continue; - list_for_each_entry_rcu(class, head, hash_entry) { + hlist_for_each_entry_rcu(class, head, hash_entry) { int match = 0; for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++) @@ -4027,10 +4027,10 @@ void lockdep_init(void) return; for (i = 0; i < CLASSHASH_SIZE; i++) - INIT_LIST_HEAD(classhash_table + i); + INIT_HLIST_HEAD(classhash_table + i); for (i = 0; i < CHAINHASH_SIZE; i++) - INIT_LIST_HEAD(chainhash_table + i); + INIT_HLIST_HEAD(chainhash_table + i); lockdep_initialized = 1; } diff --git a/kernel/memremap.c b/kernel/memremap.c index e517a16..b981a7b 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -114,7 +114,7 @@ EXPORT_SYMBOL(memunmap); static void devm_memremap_release(struct device *dev, void *res) { - memunmap(res); + memunmap(*(void **)res); } static int devm_memremap_match(struct device *dev, void *res, void *match_data) @@ -136,8 +136,10 @@ void *devm_memremap(struct device *dev, resource_size_t offset, if (addr) { *ptr = addr; devres_add(dev, ptr); - } else + } else { devres_free(ptr); + return ERR_PTR(-ENXIO); + } return addr; } @@ -150,7 +152,7 @@ void devm_memunmap(struct device *dev, void *addr) } EXPORT_SYMBOL(devm_memunmap); -pfn_t phys_to_pfn_t(dma_addr_t addr, unsigned long flags) +pfn_t phys_to_pfn_t(phys_addr_t addr, u64 flags) { return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags); } @@ -183,7 +185,11 @@ EXPORT_SYMBOL(put_zone_device_page); static void pgmap_radix_release(struct resource *res) { - resource_size_t key; + resource_size_t key, align_start, align_size, align_end; + + align_start = res->start & ~(SECTION_SIZE - 1); + align_size = ALIGN(resource_size(res), SECTION_SIZE); + align_end = align_start + align_size - 1; mutex_lock(&pgmap_lock); for (key = res->start; key <= res->end; key += SECTION_SIZE) @@ -226,12 +232,11 @@ static void devm_memremap_pages_release(struct device *dev, void *data) percpu_ref_put(pgmap->ref); } - pgmap_radix_release(res); - /* pages are dead and unused, undo the arch mapping */ align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(resource_size(res), SECTION_SIZE); arch_remove_memory(align_start, align_size); + pgmap_radix_release(res); dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc, "%s: failed to free all reserved pages\n", __func__); } @@ -267,7 +272,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, { int is_ram = region_intersects(res->start, resource_size(res), "System RAM"); - resource_size_t key, align_start, align_size; + resource_size_t key, align_start, align_size, align_end; struct dev_pagemap *pgmap; struct page_map *page_map; unsigned long pfn; @@ -309,7 +314,10 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, mutex_lock(&pgmap_lock); error = 0; - for (key = res->start; key <= res->end; key += SECTION_SIZE) { + align_start = res->start & ~(SECTION_SIZE - 1); + align_size = ALIGN(resource_size(res), SECTION_SIZE); + align_end = align_start + align_size - 1; + for (key = align_start; key <= align_end; key += SECTION_SIZE) { struct dev_pagemap *dup; rcu_read_lock(); @@ -336,8 +344,6 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, if (nid < 0) nid = numa_mem_id(); - align_start = res->start & ~(SECTION_SIZE - 1); - align_size = ALIGN(resource_size(res), SECTION_SIZE); error = arch_add_memory(nid, align_start, align_size, true); if (error) goto err_add_memory; diff --git a/kernel/module.c b/kernel/module.c index 8358f46..794ebe8 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -303,6 +303,9 @@ struct load_info { struct _ddebug *debug; unsigned int num_debug; bool sig_ok; +#ifdef CONFIG_KALLSYMS + unsigned long mod_kallsyms_init_off; +#endif struct { unsigned int sym, str, mod, vers, info, pcpu; } index; @@ -981,6 +984,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, mod->exit(); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); + ftrace_release_mod(mod); + async_synchronize_full(); /* Store the name of the last unloaded module for diagnostic purposes */ @@ -2480,10 +2485,21 @@ static void layout_symtab(struct module *mod, struct load_info *info) strsect->sh_flags |= SHF_ALLOC; strsect->sh_entsize = get_offset(mod, &mod->init_layout.size, strsect, info->index.str) | INIT_OFFSET_MASK; - mod->init_layout.size = debug_align(mod->init_layout.size); pr_debug("\t%s\n", info->secstrings + strsect->sh_name); + + /* We'll tack temporary mod_kallsyms on the end. */ + mod->init_layout.size = ALIGN(mod->init_layout.size, + __alignof__(struct mod_kallsyms)); + info->mod_kallsyms_init_off = mod->init_layout.size; + mod->init_layout.size += sizeof(struct mod_kallsyms); + mod->init_layout.size = debug_align(mod->init_layout.size); } +/* + * We use the full symtab and strtab which layout_symtab arranged to + * be appended to the init section. Later we switch to the cut-down + * core-only ones. + */ static void add_kallsyms(struct module *mod, const struct load_info *info) { unsigned int i, ndst; @@ -2492,29 +2508,34 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) char *s; Elf_Shdr *symsec = &info->sechdrs[info->index.sym]; - mod->symtab = (void *)symsec->sh_addr; - mod->num_symtab = symsec->sh_size / sizeof(Elf_Sym); + /* Set up to point into init section. */ + mod->kallsyms = mod->init_layout.base + info->mod_kallsyms_init_off; + + mod->kallsyms->symtab = (void *)symsec->sh_addr; + mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym); /* Make sure we get permanent strtab: don't use info->strtab. */ - mod->strtab = (void *)info->sechdrs[info->index.str].sh_addr; + mod->kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr; /* Set types up while we still have access to sections. */ - for (i = 0; i < mod->num_symtab; i++) - mod->symtab[i].st_info = elf_type(&mod->symtab[i], info); - - mod->core_symtab = dst = mod->core_layout.base + info->symoffs; - mod->core_strtab = s = mod->core_layout.base + info->stroffs; - src = mod->symtab; - for (ndst = i = 0; i < mod->num_symtab; i++) { + for (i = 0; i < mod->kallsyms->num_symtab; i++) + mod->kallsyms->symtab[i].st_info + = elf_type(&mod->kallsyms->symtab[i], info); + + /* Now populate the cut down core kallsyms for after init. */ + mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs; + mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs; + src = mod->kallsyms->symtab; + for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) { if (i == 0 || is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum, info->index.pcpu)) { dst[ndst] = src[i]; - dst[ndst++].st_name = s - mod->core_strtab; - s += strlcpy(s, &mod->strtab[src[i].st_name], + dst[ndst++].st_name = s - mod->core_kallsyms.strtab; + s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name], KSYM_NAME_LEN) + 1; } } - mod->core_num_syms = ndst; + mod->core_kallsyms.num_symtab = ndst; } #else static inline void layout_symtab(struct module *mod, struct load_info *info) @@ -3263,9 +3284,8 @@ static noinline int do_init_module(struct module *mod) module_put(mod); trim_init_extable(mod); #ifdef CONFIG_KALLSYMS - mod->num_symtab = mod->core_num_syms; - mod->symtab = mod->core_symtab; - mod->strtab = mod->core_strtab; + /* Switch to core kallsyms now init is done: kallsyms may be walking! */ + rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms); #endif mod_tree_remove_init(mod); disable_ro_nx(&mod->init_layout); @@ -3295,6 +3315,7 @@ fail: module_put(mod); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); + ftrace_release_mod(mod); free_module(mod); wake_up_all(&module_wq); return ret; @@ -3371,6 +3392,7 @@ static int complete_formation(struct module *mod, struct load_info *info) mod->state = MODULE_STATE_COMING; mutex_unlock(&module_mutex); + ftrace_module_enable(mod); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); return 0; @@ -3496,7 +3518,7 @@ static int load_module(struct load_info *info, const char __user *uargs, /* Module is ready to execute: parsing args may do that. */ after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, - -32768, 32767, NULL, + -32768, 32767, mod, unknown_module_param_cb); if (IS_ERR(after_dashes)) { err = PTR_ERR(after_dashes); @@ -3627,6 +3649,11 @@ static inline int is_arm_mapping_symbol(const char *str) && (str[2] == '\0' || str[2] == '.'); } +static const char *symname(struct mod_kallsyms *kallsyms, unsigned int symnum) +{ + return kallsyms->strtab + kallsyms->symtab[symnum].st_name; +} + static const char *get_ksymbol(struct module *mod, unsigned long addr, unsigned long *size, @@ -3634,6 +3661,7 @@ static const char *get_ksymbol(struct module *mod, { unsigned int i, best = 0; unsigned long nextval; + struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); /* At worse, next value is at end of module */ if (within_module_init(addr, mod)) @@ -3643,32 +3671,32 @@ static const char *get_ksymbol(struct module *mod, /* Scan for closest preceding symbol, and next symbol. (ELF starts real symbols at 1). */ - for (i = 1; i < mod->num_symtab; i++) { - if (mod->symtab[i].st_shndx == SHN_UNDEF) + for (i = 1; i < kallsyms->num_symtab; i++) { + if (kallsyms->symtab[i].st_shndx == SHN_UNDEF) continue; /* We ignore unnamed symbols: they're uninformative * and inserted at a whim. */ - if (mod->symtab[i].st_value <= addr - && mod->symtab[i].st_value > mod->symtab[best].st_value - && *(mod->strtab + mod->symtab[i].st_name) != '\0' - && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name)) + if (*symname(kallsyms, i) == '\0' + || is_arm_mapping_symbol(symname(kallsyms, i))) + continue; + + if (kallsyms->symtab[i].st_value <= addr + && kallsyms->symtab[i].st_value > kallsyms->symtab[best].st_value) best = i; - if (mod->symtab[i].st_value > addr - && mod->symtab[i].st_value < nextval - && *(mod->strtab + mod->symtab[i].st_name) != '\0' - && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name)) - nextval = mod->symtab[i].st_value; + if (kallsyms->symtab[i].st_value > addr + && kallsyms->symtab[i].st_value < nextval) + nextval = kallsyms->symtab[i].st_value; } if (!best) return NULL; if (size) - *size = nextval - mod->symtab[best].st_value; + *size = nextval - kallsyms->symtab[best].st_value; if (offset) - *offset = addr - mod->symtab[best].st_value; - return mod->strtab + mod->symtab[best].st_name; + *offset = addr - kallsyms->symtab[best].st_value; + return symname(kallsyms, best); } /* For kallsyms to ask for address resolution. NULL means not found. Careful @@ -3758,19 +3786,21 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { + struct mod_kallsyms *kallsyms; + if (mod->state == MODULE_STATE_UNFORMED) continue; - if (symnum < mod->num_symtab) { - *value = mod->symtab[symnum].st_value; - *type = mod->symtab[symnum].st_info; - strlcpy(name, mod->strtab + mod->symtab[symnum].st_name, - KSYM_NAME_LEN); + kallsyms = rcu_dereference_sched(mod->kallsyms); + if (symnum < kallsyms->num_symtab) { + *value = kallsyms->symtab[symnum].st_value; + *type = kallsyms->symtab[symnum].st_info; + strlcpy(name, symname(kallsyms, symnum), KSYM_NAME_LEN); strlcpy(module_name, mod->name, MODULE_NAME_LEN); *exported = is_exported(name, *value, mod); preempt_enable(); return 0; } - symnum -= mod->num_symtab; + symnum -= kallsyms->num_symtab; } preempt_enable(); return -ERANGE; @@ -3779,11 +3809,12 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, static unsigned long mod_find_symname(struct module *mod, const char *name) { unsigned int i; + struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); - for (i = 0; i < mod->num_symtab; i++) - if (strcmp(name, mod->strtab+mod->symtab[i].st_name) == 0 && - mod->symtab[i].st_info != 'U') - return mod->symtab[i].st_value; + for (i = 0; i < kallsyms->num_symtab; i++) + if (strcmp(name, symname(kallsyms, i)) == 0 && + kallsyms->symtab[i].st_info != 'U') + return kallsyms->symtab[i].st_value; return 0; } @@ -3822,11 +3853,14 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, module_assert_mutex(); list_for_each_entry(mod, &modules, list) { + /* We hold module_mutex: no need for rcu_dereference_sched */ + struct mod_kallsyms *kallsyms = mod->kallsyms; + if (mod->state == MODULE_STATE_UNFORMED) continue; - for (i = 0; i < mod->num_symtab; i++) { - ret = fn(data, mod->strtab + mod->symtab[i].st_name, - mod, mod->symtab[i].st_value); + for (i = 0; i < kallsyms->num_symtab; i++) { + ret = fn(data, symname(kallsyms, i), + mod, kallsyms->symtab[i].st_value); if (ret != 0) return ret; } diff --git a/kernel/resource.c b/kernel/resource.c index 09c0597..3669d1b 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1083,9 +1083,10 @@ struct resource * __request_region(struct resource *parent, if (!conflict) break; if (conflict != parent) { - parent = conflict; - if (!(conflict->flags & IORESOURCE_BUSY)) + if (!(conflict->flags & IORESOURCE_BUSY)) { + parent = conflict; continue; + } } if (conflict->flags & flags & IORESOURCE_MUXED) { add_wait_queue(&muxed_resource_wait, &wait); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index cd64c97..57b939c 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -420,7 +420,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, * entity. */ if (dl_time_before(dl_se->deadline, rq_clock(rq))) { - printk_deferred_once("sched: DL replenish lagged to much\n"); + printk_deferred_once("sched: DL replenish lagged too much\n"); dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; dl_se->runtime = pi_se->dl_runtime; } diff --git a/kernel/signal.c b/kernel/signal.c index f3f1f7a..0508544 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3508,8 +3508,10 @@ static int sigsuspend(sigset_t *set) current->saved_sigmask = current->blocked; set_current_blocked(set); - __set_current_state(TASK_INTERRUPTIBLE); - schedule(); + while (!signal_pending(current)) { + __set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } set_restore_sigmask(); return -ERESTARTNOHAND; } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index eca592f..57a6eea 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4961,7 +4961,7 @@ void ftrace_release_mod(struct module *mod) mutex_unlock(&ftrace_lock); } -static void ftrace_module_enable(struct module *mod) +void ftrace_module_enable(struct module *mod) { struct dyn_ftrace *rec; struct ftrace_page *pg; @@ -5038,38 +5038,8 @@ void ftrace_module_init(struct module *mod) ftrace_process_locs(mod, mod->ftrace_callsites, mod->ftrace_callsites + mod->num_ftrace_callsites); } - -static int ftrace_module_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - struct module *mod = data; - - switch (val) { - case MODULE_STATE_COMING: - ftrace_module_enable(mod); - break; - case MODULE_STATE_GOING: - ftrace_release_mod(mod); - break; - default: - break; - } - - return 0; -} -#else -static int ftrace_module_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - return 0; -} #endif /* CONFIG_MODULES */ -struct notifier_block ftrace_module_nb = { - .notifier_call = ftrace_module_notify, - .priority = INT_MIN, /* Run after anything that can remove kprobes */ -}; - void __init ftrace_init(void) { extern unsigned long __start_mcount_loc[]; @@ -5098,10 +5068,6 @@ void __init ftrace_init(void) __start_mcount_loc, __stop_mcount_loc); - ret = register_module_notifier(&ftrace_module_nb); - if (ret) - pr_warning("Failed to register trace ftrace module exit notifier\n"); - set_ftrace_early_filters(); return; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f333e57..05ddc08 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -97,16 +97,16 @@ trace_find_event_field(struct trace_event_call *call, char *name) struct ftrace_event_field *field; struct list_head *head; - field = __find_event_field(&ftrace_generic_fields, name); + head = trace_get_fields(call); + field = __find_event_field(head, name); if (field) return field; - field = __find_event_field(&ftrace_common_fields, name); + field = __find_event_field(&ftrace_generic_fields, name); if (field) return field; - head = trace_get_fields(call); - return __find_event_field(head, name); + return __find_event_field(&ftrace_common_fields, name); } static int __trace_define_field(struct list_head *head, const char *type, @@ -171,8 +171,10 @@ static int trace_define_generic_fields(void) { int ret; - __generic_field(int, cpu, FILTER_OTHER); - __generic_field(char *, comm, FILTER_PTR_STRING); + __generic_field(int, CPU, FILTER_CPU); + __generic_field(int, cpu, FILTER_CPU); + __generic_field(char *, COMM, FILTER_COMM); + __generic_field(char *, comm, FILTER_COMM); return ret; } @@ -869,7 +871,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos) * The ftrace subsystem is for showing formats only. * They can not be enabled or disabled via the event files. */ - if (call->class && call->class->reg) + if (call->class && call->class->reg && + !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) return file; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index f93a219..6816302 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1043,13 +1043,14 @@ static int init_pred(struct filter_parse_state *ps, return -EINVAL; } - if (is_string_field(field)) { + if (field->filter_type == FILTER_COMM) { + filter_build_regex(pred); + fn = filter_pred_comm; + pred->regex.field_len = TASK_COMM_LEN; + } else if (is_string_field(field)) { filter_build_regex(pred); - if (!strcmp(field->name, "comm")) { - fn = filter_pred_comm; - pred->regex.field_len = TASK_COMM_LEN; - } else if (field->filter_type == FILTER_STATIC_STRING) { + if (field->filter_type == FILTER_STATIC_STRING) { fn = filter_pred_string; pred->regex.field_len = field->size; } else if (field->filter_type == FILTER_DYN_STRING) @@ -1072,7 +1073,7 @@ static int init_pred(struct filter_parse_state *ps, } pred->val = val; - if (!strcmp(field->name, "cpu")) + if (field->filter_type == FILTER_CPU) fn = filter_pred_cpu; else fn = select_comparison_fn(pred->op, field->size, diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index dda9e67..2a1abba 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -126,6 +126,13 @@ check_stack(unsigned long ip, unsigned long *stack) } /* + * Some archs may not have the passed in ip in the dump. + * If that happens, we need to show everything. + */ + if (i == stack_trace_max.nr_entries) + i = 0; + + /* * Now find where in the stack these are. */ x = 0; @@ -149,7 +156,11 @@ check_stack(unsigned long ip, unsigned long *stack) for (; p < top && i < stack_trace_max.nr_entries; p++) { if (stack_dump_trace[i] == ULONG_MAX) break; - if (*p == stack_dump_trace[i]) { + /* + * The READ_ONCE_NOCHECK is used to let KASAN know that + * this is not a stack-out-of-bounds error. + */ + if ((READ_ONCE_NOCHECK(*p)) == stack_dump_trace[i]) { stack_dump_trace[x] = stack_dump_trace[i++]; this_size = stack_trace_index[x++] = (top - p) * sizeof(unsigned long); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 61a0264..7ff5dc7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -301,7 +301,23 @@ static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ static LIST_HEAD(workqueues); /* PR: list of all workqueues */ static bool workqueue_freezing; /* PL: have wqs started freezing? */ -static cpumask_var_t wq_unbound_cpumask; /* PL: low level cpumask for all unbound wqs */ +/* PL: allowable cpus for unbound wqs and work items */ +static cpumask_var_t wq_unbound_cpumask; + +/* CPU where unbound work was last round robin scheduled from this CPU */ +static DEFINE_PER_CPU(int, wq_rr_cpu_last); + +/* + * Local execution of unbound work items is no longer guaranteed. The + * following always forces round-robin CPU selection on unbound work items + * to uncover usages which depend on it. + */ +#ifdef CONFIG_DEBUG_WQ_FORCE_RR_CPU +static bool wq_debug_force_rr_cpu = true; +#else +static bool wq_debug_force_rr_cpu = false; +#endif +module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644); /* the per-cpu worker pools */ static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], @@ -570,6 +586,16 @@ static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq, int node) { assert_rcu_or_wq_mutex_or_pool_mutex(wq); + + /* + * XXX: @node can be NUMA_NO_NODE if CPU goes offline while a + * delayed item is pending. The plan is to keep CPU -> NODE + * mapping valid and stable across CPU on/offlines. Once that + * happens, this workaround can be removed. + */ + if (unlikely(node == NUMA_NO_NODE)) + return wq->dfl_pwq; + return rcu_dereference_raw(wq->numa_pwq_tbl[node]); } @@ -1298,6 +1324,39 @@ static bool is_chained_work(struct workqueue_struct *wq) return worker && worker->current_pwq->wq == wq; } +/* + * When queueing an unbound work item to a wq, prefer local CPU if allowed + * by wq_unbound_cpumask. Otherwise, round robin among the allowed ones to + * avoid perturbing sensitive tasks. + */ +static int wq_select_unbound_cpu(int cpu) +{ + static bool printed_dbg_warning; + int new_cpu; + + if (likely(!wq_debug_force_rr_cpu)) { + if (cpumask_test_cpu(cpu, wq_unbound_cpumask)) + return cpu; + } else if (!printed_dbg_warning) { + pr_warn("workqueue: round-robin CPU selection forced, expect performance impact\n"); + printed_dbg_warning = true; + } + + if (cpumask_empty(wq_unbound_cpumask)) + return cpu; + + new_cpu = __this_cpu_read(wq_rr_cpu_last); + new_cpu = cpumask_next_and(new_cpu, wq_unbound_cpumask, cpu_online_mask); + if (unlikely(new_cpu >= nr_cpu_ids)) { + new_cpu = cpumask_first_and(wq_unbound_cpumask, cpu_online_mask); + if (unlikely(new_cpu >= nr_cpu_ids)) + return cpu; + } + __this_cpu_write(wq_rr_cpu_last, new_cpu); + + return new_cpu; +} + static void __queue_work(int cpu, struct workqueue_struct *wq, struct work_struct *work) { @@ -1323,7 +1382,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, return; retry: if (req_cpu == WORK_CPU_UNBOUND) - cpu = raw_smp_processor_id(); + cpu = wq_select_unbound_cpu(raw_smp_processor_id()); /* pwq which will be used unless @work is executing elsewhere */ if (!(wq->flags & WQ_UNBOUND)) @@ -1464,13 +1523,13 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, timer_stats_timer_set_start_info(&dwork->timer); dwork->wq = wq; - /* timer isn't guaranteed to run in this cpu, record earlier */ - if (cpu == WORK_CPU_UNBOUND) - cpu = raw_smp_processor_id(); dwork->cpu = cpu; timer->expires = jiffies + delay; - add_timer_on(timer, cpu); + if (unlikely(cpu != WORK_CPU_UNBOUND)) + add_timer_on(timer, cpu); + else + add_timer(timer); } /** @@ -2355,7 +2414,8 @@ static void check_flush_dependency(struct workqueue_struct *target_wq, WARN_ONCE(current->flags & PF_MEMALLOC, "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%pf", current->pid, current->comm, target_wq->name, target_func); - WARN_ONCE(worker && (worker->current_pwq->wq->flags & WQ_MEM_RECLAIM), + WARN_ONCE(worker && ((worker->current_pwq->wq->flags & + (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM), "workqueue: WQ_MEM_RECLAIM %s:%pf is flushing !WQ_MEM_RECLAIM %s:%pf", worker->current_pwq->wq->name, worker->current_func, target_wq->name, target_func); |